├── .editorconfig ├── .github ├── FUNDING.yml ├── dependabot.yml ├── linters │ ├── .flake8 │ ├── .markdown-lint.yml │ ├── .python-lint │ └── .yaml-lint.yml └── workflows │ └── linter.yml ├── .gitignore ├── README.md ├── config └── storage-cors.json ├── crontab ├── dataflow ├── java │ ├── .gitignore │ ├── README.md │ ├── nb-configuration.xml │ ├── nbactions.xml │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── httparchive │ │ └── dataflow │ │ ├── BigQueryImport.java │ │ └── GcsPathCoder.java └── python │ ├── .gitignore │ ├── README.md │ ├── adblock.py │ ├── bigquery_import.py │ ├── get_rules.sh │ ├── requirements.txt │ ├── run.sh │ └── setup.py ├── datalab └── histograms.ipynb ├── docs └── README.md ├── schema ├── .sqlfluffignore ├── httparchive_schema.sql ├── pages.json ├── requests.json └── schema.rb ├── sql ├── .sqlfluff ├── .sqlfluffignore ├── addDate.js ├── delete_date_from_reports.sh ├── generate_reports.sh ├── get_bigquery_dates.sh ├── histograms │ ├── bootupJs.sql │ ├── bytesCss.sql │ ├── bytesFont.sql │ ├── bytesHtml.sql │ ├── bytesImg.sql │ ├── bytesJs.sql │ ├── bytesOther.sql │ ├── bytesTotal.sql │ ├── bytesVideo.sql │ ├── compileJs.sql │ ├── cruxCls.sql │ ├── cruxDcl.sql │ ├── cruxFcp.sql │ ├── cruxFp.sql │ ├── cruxInp.sql │ ├── cruxLcp.sql │ ├── cruxOl.sql │ ├── cruxShopifyThemes.sql │ ├── cruxTtfb.sql │ ├── dcl.sql │ ├── evalJs.sql │ ├── fcp.sql │ ├── gzipSavings.sql │ ├── htmlElementPopularity.sql │ ├── imgSavings.sql │ ├── offscreenImages.sql │ ├── ol.sql │ ├── optimizedImages.sql │ ├── reqCss.sql │ ├── reqFont.sql │ ├── reqHtml.sql │ ├── reqImg.sql │ ├── reqJs.sql │ ├── reqOther.sql │ ├── reqTotal.sql │ ├── reqVideo.sql │ ├── speedIndex.sql │ ├── tcp.sql │ └── ttci.sql ├── lens │ ├── drupal │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql │ ├── magento │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql │ ├── top100k │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql │ ├── top10k │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql │ ├── top1k │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql │ ├── top1m │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql │ └── wordpress │ │ ├── crux_histograms.sql │ │ ├── crux_timeseries.sql │ │ ├── histograms.sql │ │ └── timeseries.sql ├── new_metric.sh └── timeseries │ ├── a11yButtonName.sql │ ├── a11yColorContrast.sql │ ├── a11yImageAlt.sql │ ├── a11yLabel.sql │ ├── a11yLinkName.sql │ ├── a11yScores.sql │ ├── asyncClipboardRead.sql │ ├── badgeClear.sql │ ├── badgeSet.sql │ ├── bootupJs.sql │ ├── bytesCss.sql │ ├── bytesFont.sql │ ├── bytesHtml.sql │ ├── bytesImg.sql │ ├── bytesJs.sql │ ├── bytesOther.sql │ ├── bytesTotal.sql │ ├── bytesVideo.sql │ ├── canonical.sql │ ├── contentIndex.sql │ ├── cruxFastDcl.sql │ ├── cruxFastFcp.sql │ ├── cruxFastFp.sql │ ├── cruxFastInp.sql │ ├── cruxFastLcp.sql │ ├── cruxFastOl.sql │ ├── cruxFastTtfb.sql │ ├── cruxLargeCls.sql │ ├── cruxPassesCWV.sql │ ├── cruxSlowFcp.sql │ ├── cruxSlowInp.sql │ ├── cruxSlowLcp.sql │ ├── cruxSlowTtfb.sql │ ├── cruxSmallCls.sql │ ├── dcl.sql │ ├── fcp.sql │ ├── fontDisplay.sql │ ├── getInstalledRelatedApps.sql │ ├── gzipSavings.sql │ ├── h2.sql │ ├── h3.sql │ ├── hreflang.sql │ ├── idleDetection.sql │ ├── imgLazy.sql │ ├── imgSavings.sql │ ├── 
legible.sql │ ├── linkText.sql │ ├── notificationTriggers.sql │ ├── numUrls.sql │ ├── offscreenImages.sql │ ├── ol.sql │ ├── optimizedImages.sql │ ├── pctHttps.sql │ ├── periodicBackgroundSync.sql │ ├── periodicBackgroundSyncRegister.sql │ ├── quicTransport.sql │ ├── reqCss.sql │ ├── reqFont.sql │ ├── reqHtml.sql │ ├── reqImg.sql │ ├── reqJs.sql │ ├── reqOther.sql │ ├── reqTotal.sql │ ├── reqVideo.sql │ ├── screenWakeLock.sql │ ├── speedIndex.sql │ ├── storageEstimate.sql │ ├── storagePersist.sql │ ├── swControlledPages.sql │ ├── tcp.sql │ ├── ttci.sql │ └── webSocketStream.sql ├── sync_csv.sh ├── sync_har.sh ├── urls ├── .gitignore ├── Gemfile ├── process.rb ├── run.sh └── schema.json └── util └── fixcsv.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.{html,md,js,json,css,sql}] 12 | indent_size = 2 13 | 14 | [*.py] 15 | indent_size = 4 16 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: httparchive 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/linters/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | per-file-ignores = 4 | # The __init__.py file imports the routes and errors file at bottom 5 | /github/workspace/src/server/__init__.py:E402,F401 6 | /tmp/lint/src/server/__init__.py:E402,F401 7 | /tmp/lint/server/__init__.py:E402,F401 8 | -------------------------------------------------------------------------------- /.github/linters/.markdown-lint.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ########################### 3 | ########################### 4 | ## Markdown Linter rules ## 5 | ########################### 6 | ########################### 7 | 8 | # Linter rules doc: 9 | # - https://github.com/DavidAnson/markdownlint 10 | # 11 | # Note: 12 | # To comment out a single error: 13 | # 14 | # any violations you want 15 | # 16 | # 17 | 18 | ############### 19 | # Rules by id # 20 | ############### 21 | MD004: false # Unordered list style 22 | MD007: false # Allow extra spaces for lists - don't cause issues and will just 
annoy authors 23 | MD009: false # Allow trailing spaces - don't cause issues and will just annoy authors 24 | MD013: false # Don't demand maximum line lengths 25 | MD024: 26 | siblings_only: true # Allows sub-headings to be reused under different headings 27 | MD026: 28 | punctuation: ".,;:!。,;:" # List of not allowed 29 | MD029: false # Ordered list item prefix 30 | MD033: false # Allow inline HTML 31 | MD034: false # Allow base URLs 32 | MD036: false # Emphasis used instead of a heading 33 | MD040: false # Don't demand language for all code blocks 34 | 35 | ################# 36 | # Rules by tags # 37 | ################# 38 | blank_lines: false # Error on blank lines 39 | -------------------------------------------------------------------------------- /.github/linters/.yaml-lint.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ######################################################## 3 | # HTTP Archive Overrides for YAML Lint # 4 | # https://yamllint.readthedocs.io/en/stable/rules.html # 5 | ######################################################## 6 | rules: 7 | document-start: disable 8 | line-length: 9 | max: 120 10 | -------------------------------------------------------------------------------- /.github/workflows/linter.yml: -------------------------------------------------------------------------------- 1 | ########################### 2 | ## Linter GitHub Actions ## 3 | ########################### 4 | # 5 | # Documentation: https://github.com/github/super-linter/ 6 | # 7 | # Exception config files are in the .github/linters directory 8 | # 9 | name: Lint Code Base 10 | on: 11 | - workflow_dispatch 12 | - pull_request 13 | jobs: 14 | lint: 15 | name: Lint Code Base 16 | runs-on: ubuntu-20.04 17 | steps: 18 | - name: Checkout Code 19 | uses: actions/checkout@v4 20 | with: 21 | # Full git history is needed to get a proper list of changed files within `super-linter` 22 | fetch-depth: 0 23 | - name: Set VALIDATE_ALL_CODEBASE variable to false 24 | # Only run the full workflow for manual runs or if upgrading the super linter 25 | if: | 26 | github.event_name != 'workflow_dispatch' && 27 | startsWith(github.event.pull_request.title,'Bump super-linter/super-linter') != true 28 | run: | 29 | echo "VALIDATE_ALL_CODEBASE=false" >> $GITHUB_ENV 30 | - name: Lint Code Base 31 | uses: super-linter/super-linter/slim@v7 32 | env: 33 | DEFAULT_BRANCH: master 34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}s 35 | FILTER_REGEX_EXCLUDE: .*/dataflow/ 36 | # VALIDATE_BASH: true 37 | VALIDATE_EDITORCONFIG: true 38 | VALIDATE_JAVASCRIPT_ES: true 39 | VALIDATE_JSON: true 40 | VALIDATE_MARKDOWN: true 41 | VALIDATE_PYTHON_PYLINT: true 42 | VALIDATE_PYTHON_FLAKE8: true 43 | VALIDATE_SQLFLUFF: true 44 | VALIDATE_YAML: true 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | tools 3 | .DS_Store 4 | *.out 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTTP Archive + BigQuery data import 2 | 3 | _Note: you don't need to import this data yourself, the BigQuery dataset is public! [Getting started](https://github.com/HTTPArchive/httparchive.org/blob/master/docs/gettingstarted_bigquery.md)._ 4 | 5 | However, if you do want your own private copy of the dataset... 
The following import and sync scripts will help you import the [HTTP Archive dataset](http://httparchive.org/downloads.php) into BigQuery and keep it up to date. 6 | 7 | ```bash 8 | $> sh sync.sh Jun_15_2013 9 | $> sh sync.sh mobile_Jun_15_2013 10 | ``` 11 | 12 | That's all there is to it. The sync script handles all the necessary processing: 13 | 14 | * Archives are fetched from archive.org (and cached locally) 15 | * Archived CSV is transformed to BigQuery compatible escaping 16 | * You will need +pigz+ installed for parallel compression 17 | * Request files are split into <1GB compressed CSV's 18 | * Resulting pages and request data is synced to a Google Storage bucket 19 | * BigQuery import is kicked off for each of compressed archives on Google Storage 20 | 21 | After the upload is complete, a copy of the latest tables can be made with: 22 | 23 | ```bash 24 | $> bq.py cp runs.2013_06_15_pages runs.latest_pages 25 | $> bq.py cp runs.2013_06_15_pages_mobile runs.latest_pages_mobile 26 | $> bq.py cp runs.2013_06_15_requests runs.latest_requests 27 | $> bq.py cp runs.2013_06_15_requests_mobile runs.latest_requests_mobile 28 | ``` 29 | 30 | (MIT License) - Copyright (c) 2013 Ilya Grigorik 31 | -------------------------------------------------------------------------------- /config/storage-cors.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "origin": [ 4 | "http://beta.httparchive.org", 5 | "https://beta.httparchive.org", 6 | "http://httparchive.org", 7 | "https://httparchive.org", 8 | "http://www.httparchive.org", 9 | "https://www.httparchive.org", 10 | "http://staging.httparchive.org", 11 | "https://staging.httparchive.org", 12 | "http://localhost:8080", 13 | "http://127.0.0.1:8080", 14 | "http://httparchive.appspot.com", 15 | "https://httparchive.appspot.com", 16 | "http://httparchive-staging.appspot.com", 17 | "https://httparchive-staging.appspot.com", 18 | "https://drupal.httparchive.org", 19 | "https://magento.httparchive.org", 20 | "https://wordpress.httparchive.org" 21 | ], 22 | "responseHeader": ["Content-Type"], 23 | "method": ["GET", "HEAD"], 24 | "maxAgeSeconds": 3600 25 | } 26 | ] 27 | -------------------------------------------------------------------------------- /crontab: -------------------------------------------------------------------------------- 1 | #0 15 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_csv.sh `date +\%b_1_\%Y`' >> /var/log/HAimport.log 2>&1 2 | #0 8 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_csv.sh mobile_`date +\%b_1_\%Y`' >> /var/log/HAimport.log 2>&1 3 | 4 | #0 10 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_har.sh chrome' >> /var/log/HA-import-har-chrome.log 2>&1 5 | #0 11 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_har.sh android' >> /var/log/HA-import-har-android.log 2>&1 6 | 7 | # Attempt to run the reports everyday 8 | 0 8 * * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date "+\%Y_\%m_01"` -l ALL' >> /var/log/generate_reports.log 2>&1 9 | 10 | # Run the reports on the 2nd to pick up blink table updates 11 | 0 7 2 * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date -d "-1 month" "+\%Y_\%m_01"` -l ALL' >> /var/log/generate_last_months_reports.log 2>&1 12 | 13 | # Run the CrUX reports on 15th 14 | 0 7 15 * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date -d "-1 month" "+\%Y_\%m_01"` -r "*crux*" -l ALL' >> /var/log/crux_reruns.log 2>&1 15 | 
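As a companion to the README's table-copy step above, the same "latest tables" refresh can be scripted with the BigQuery client library instead of the `bq` CLI. This is only an illustrative sketch, not part of the repo's tooling; it assumes `google-cloud-bigquery` is installed and that default application credentials have access to the target project.

```python
# Sketch: copy the dated runs tables to the "latest" aliases, mirroring the
# `bq.py cp` commands in the README. Assumes google-cloud-bigquery and default
# application credentials; table names follow the dated runs convention.
from google.cloud import bigquery

client = bigquery.Client(project="httparchive")
date = "2013_06_15"  # crawl date in YYYY_MM_DD form

job_config = bigquery.CopyJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)

for suffix in ("pages", "pages_mobile", "requests", "requests_mobile"):
    source = f"httparchive.runs.{date}_{suffix}"
    destination = f"httparchive.runs.latest_{suffix}"
    # copy_table returns a CopyJob; result() blocks until the copy completes.
    client.copy_table(source, destination, job_config=job_config).result()
    print(f"Copied {source} -> {destination}")
```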
-------------------------------------------------------------------------------- /dataflow/java/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | -------------------------------------------------------------------------------- /dataflow/java/README.md: -------------------------------------------------------------------------------- 1 | # Loading data 2 | 3 | ``` 4 | mvn compile exec:java -Dexec.mainClass=com.httparchive.dataflow.BigQueryImport -Dexec.args="--project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=BlockingDataflowPipelineRunner --input=desktop-Oct_15_2015 --workerMachineType=n1-standard-4" 5 | ``` 6 | 7 | ## Installing Java on Debian 8 | - https://www.digitalocean.com/community/tutorials/how-to-manually-install-oracle-java-on-a-debian-or-ubuntu-vps 9 | -------------------------------------------------------------------------------- /dataflow/java/nb-configuration.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 16 | JDK_1.7 17 | true 18 | 19 | 20 | -------------------------------------------------------------------------------- /dataflow/java/nbactions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | run 5 | 6 | jar 7 | 8 | 9 | process-classes 10 | org.codehaus.mojo:exec-maven-plugin:1.2.1:exec 11 | 12 | 13 | -classpath %classpath com.httparchive.dataflow.BigQueryImport --project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=DirectPipelineRunner --input=test-Apr_25_2017 14 | java 15 | /Users/igrigorik/google-cloud-sdk/bin:/usr/local/git/current/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin/g4bin:/usr/local/git/bin 16 | 17 | 18 | 19 | debug 20 | 21 | jar 22 | 23 | 24 | process-classes 25 | org.codehaus.mojo:exec-maven-plugin:1.2.1:exec 26 | 27 | 28 | -Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath com.httparchive.dataflow.BigQueryImport --project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=DirectPipelineRunner --input=test-Apr_25_2017 29 | java 30 | /Users/igrigorik/google-cloud-sdk/bin:/usr/local/git/current/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin/g4bin:/usr/local/git/bin 31 | true 32 | 33 | 34 | 35 | profile 36 | 37 | jar 38 | 39 | 40 | process-classes 41 | org.codehaus.mojo:exec-maven-plugin:1.2.1:exec 42 | 43 | 44 | -classpath %classpath com.httparchive.dataflow.BigQueryImport --project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=DirectPipelineRunner --input=test-Apr_25_2017 45 | java 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /dataflow/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.httparchive 5 | Dataflow 6 | 1.0-SNAPSHOT 7 | jar 8 | 9 | UTF-8 10 | 1.7 11 | 1.7 12 | 13 | 14 | 15 | 16 | com.google.cloud.dataflow 17 | google-cloud-dataflow-java-sdk-all 18 | 1.9.0 19 | 20 | 21 | org.slf4j 22 | slf4j-simple 23 | 1.7.21 24 | 25 | 26 | org.slf4j 27 | slf4j-api 28 | 1.7.21 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /dataflow/java/src/main/java/com/httparchive/dataflow/GcsPathCoder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this 
license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package com.httparchive.dataflow; 7 | 8 | import com.google.cloud.dataflow.sdk.coders.AtomicCoder; 9 | import com.google.cloud.dataflow.sdk.coders.Coder.Context; 10 | import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder; 11 | import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath; 12 | 13 | import java.io.IOException; 14 | import java.io.InputStream; 15 | import java.io.OutputStream; 16 | import java.nio.ByteBuffer; 17 | 18 | public class GcsPathCoder extends AtomicCoder { 19 | 20 | public static GcsPathCoder of() { 21 | return INSTANCE; 22 | } 23 | 24 | @Override 25 | public void encode(GcsPath value, OutputStream outStream, Context context) 26 | throws IOException { 27 | String strValue = value.toResourceName(); 28 | StringUtf8Coder.of().encode(strValue, outStream, context); 29 | } 30 | 31 | public GcsPath decode(ByteBuffer in) { 32 | return GcsPath.fromResourceName(in.toString()); 33 | } 34 | 35 | @Override 36 | public GcsPath decode(InputStream inStream, Context context) throws IOException { 37 | try { 38 | String strValue = StringUtf8Coder.of().decode(inStream, context); 39 | return GcsPath.fromResourceName(strValue); 40 | } catch (IOException e) { 41 | System.out.println("Failed to decode GcsPath: " + e); 42 | System.out.println(inStream); 43 | System.out.println(context); 44 | throw e; 45 | } 46 | } 47 | 48 | private static final GcsPathCoder INSTANCE = new GcsPathCoder(); 49 | 50 | /** 51 | * TableCell can hold arbitrary Object instances, which makes the encoding 52 | * non-deterministic. 53 | * 54 | * @return 55 | */ 56 | @Deprecated 57 | public boolean isDeterministic() { 58 | return false; 59 | } 60 | 61 | @Override 62 | public void verifyDeterministic() throws NonDeterministicException { 63 | throw new NonDeterministicException(this, 64 | "HAR can hold arbitrary instances which may be non-deterministic."); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /dataflow/python/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | lib 3 | adblock.egg* 4 | local 5 | credentials/auth.json 6 | __pycache__ -------------------------------------------------------------------------------- /dataflow/python/README.md: -------------------------------------------------------------------------------- 1 | # HTTP Archive Python Dataflow 2 | 3 | ## Installation 4 | 5 | Follow the [Quickstart using Python](https://cloud.google.com/dataflow/docs/quickstarts/quickstart-python#before-you-begin) guide. 6 | 7 | 1. Create and activate a Python 3 `virtualenv`: 8 | 9 | ``` 10 | python -m virtualenv --python=python3 --clear env 11 | source env/bin/activate 12 | ``` 13 | 14 | 2. Install dependencies: 15 | 16 | ``` 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | 3. Create a service account key, save it to `credentials/cert.json`, and set the environment variable: 21 | 22 | ``` 23 | export GOOGLE_APPLICATION_CREDENTIALS="./credentials/auth.json" 24 | ``` 25 | 26 | This needs to be reset during the startup of every shell. 27 | 28 | ## Running the pipeline 29 | 30 | 1. Activate the Python virtual environment: 31 | 32 | ``` 33 | source env/bin/activate 34 | ``` 35 | 36 | 2. 
Run `bigquery_import.py`: 37 | 38 | ``` 39 | python bigquery_import.py \ 40 | --runner=DataflowRunner \ 41 | --project=httparchive \ 42 | --temp_location=gs://httparchive/dataflow/temp \ 43 | --staging_location=gs://httparchive/dataflow/staging \ 44 | --region=us-west1 \ 45 | --machine_type=n1-standard-32 \ 46 | --input=android-Dec_1_2020 \ 47 | --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd 48 | ``` 49 | 50 | The `--runner=DataflowRunner` option forces the pipeline to run in the cloud using Dataflow. To run locally, omit this option. Be aware that crawls consume TB of disk space, so only run locally using subsetted input datasets. To create a subset dataset, copy a few HAR files on GCS to a new directory. 51 | 52 | 3. Decativate the virtual environment: 53 | 54 | ``` 55 | deactivate 56 | ``` 57 | -------------------------------------------------------------------------------- /dataflow/python/adblock.py: -------------------------------------------------------------------------------- 1 | """A easylist classifier.""" 2 | 3 | from __future__ import absolute_import 4 | from adblockparser import AdblockRules 5 | 6 | import argparse 7 | import logging 8 | import re2 as re 9 | 10 | import google.cloud.dataflow as df 11 | 12 | class EasylistClassifyDoFn(df.DoFn): 13 | def process(self, *args): 14 | row, classifiers = args[0].element, args[1] 15 | row['type'] = '' 16 | 17 | for (name, classifier) in classifiers.items(): 18 | # TODO: add script initiator check 19 | if classifier.should_block(row['url'], { 20 | 'domain': row['domain'], 21 | 'third-party': row['third_party'] 22 | }): 23 | row['type'] = name 24 | print row 25 | break 26 | 27 | del row['domain'] 28 | del row['third_party'] 29 | yield row 30 | 31 | def run(argv=None): 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--input', 34 | dest='input', 35 | required=True, 36 | help='BigQuery request input table.') 37 | parser.add_argument('--output', 38 | dest='output', 39 | help='BigQuery output table.') 40 | known_args, pipeline_args = parser.parse_known_args(argv) 41 | 42 | output_table = '%s' % known_args.output 43 | input_query = """ 44 | SELECT 45 | page, url, 46 | DOMAIN(page) as domain, 47 | IF (DOMAIN(page) == DOMAIN(url), false, true) AS third_party, 48 | FROM [%s] 49 | """ % known_args.input 50 | 51 | classifiers = {} 52 | for file in ['ad', 'tracker', 'social']: 53 | rules = [line.rstrip('\n') for line in open('local/'+file+'.txt')] 54 | classifier = AdblockRules(rules, 55 | supported_options=['domain', 'third-party'], 56 | skip_unsupported_rules=False, use_re2=True) 57 | del rules 58 | classifiers[file] = classifier 59 | 60 | p = df.Pipeline(argv=pipeline_args) 61 | 62 | (p 63 | | df.Read('read', df.io.BigQuerySource(query=input_query)) 64 | | df.ParDo('classify', EasylistClassifyDoFn(), classifiers) 65 | # | df.io.Write('write', df.io.TextFileSink('out'))) 66 | | df.Write('write', df.io.BigQuerySink( 67 | output_table, 68 | schema='page:STRING, url:STRING, type:STRING', 69 | create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED, 70 | write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE))) 71 | 72 | p.run() 73 | 74 | if __name__ == '__main__': 75 | logging.getLogger().setLevel(logging.INFO) 76 | run() 77 | -------------------------------------------------------------------------------- /dataflow/python/bigquery_import.py: -------------------------------------------------------------------------------- 1 | """HTTP Archive dataflow pipeline for generating HAR data on 
BigQuery.""" 2 | 3 | from __future__ import absolute_import 4 | 5 | import argparse 6 | from copy import deepcopy 7 | from datetime import datetime 8 | from hashlib import sha256 9 | import json 10 | import logging 11 | import re 12 | 13 | import apache_beam as beam 14 | import apache_beam.io.gcp.gcsio as gcsio 15 | from apache_beam.options.pipeline_options import PipelineOptions 16 | from apache_beam.options.pipeline_options import SetupOptions 17 | 18 | 19 | # BigQuery can handle rows up to 100 MB. 20 | MAX_CONTENT_SIZE = 2 * 1024 * 1024 21 | # Number of times to partition the requests tables. 22 | NUM_PARTITIONS = 4 23 | 24 | 25 | def get_page(har): 26 | """Parses the page from a HAR object.""" 27 | 28 | if not har: 29 | return 30 | 31 | page = har.get('log').get('pages')[0] 32 | url = page.get('_URL') 33 | 34 | metadata = page.get('_metadata') 35 | if metadata: 36 | # The page URL from metadata is more accurate. 37 | # See https://github.com/HTTPArchive/data-pipeline/issues/48 38 | url = metadata.get('tested_url', url) 39 | 40 | try: 41 | payload_json = to_json(page) 42 | except: 43 | logging.warning('Skipping pages payload for "%s": unable to stringify as JSON.' % url) 44 | return 45 | 46 | payload_size = len(payload_json) 47 | if payload_size > MAX_CONTENT_SIZE: 48 | logging.warning('Skipping pages payload for "%s": payload size (%s) exceeds the maximum content size of %s bytes.' % (url, payload_size, MAX_CONTENT_SIZE)) 49 | return 50 | 51 | return [{ 52 | 'url': url, 53 | 'payload': payload_json 54 | }] 55 | 56 | 57 | def get_page_url(har): 58 | """Parses the page URL from a HAR object.""" 59 | 60 | page = get_page(har) 61 | 62 | if not page: 63 | logging.warning('Unable to get URL from page (see preceding warning).') 64 | return 65 | 66 | return page[0].get('url') 67 | 68 | 69 | def partition_step(fn, har, index): 70 | """Partitions functions across multiple concurrent steps.""" 71 | 72 | logging.info(f'partitioning step {fn}, index {index}') 73 | 74 | if not har: 75 | logging.warning('Unable to partition step, null HAR.') 76 | return 77 | 78 | page = har.get('log').get('pages')[0] 79 | metadata = page.get('_metadata') 80 | if metadata.get('crawl_depth') and metadata.get('crawl_depth') != '0': 81 | # Only home pages have a crawl depth of 0. 82 | return 83 | 84 | page_url = get_page_url(har) 85 | 86 | if not page_url: 87 | logging.warning('Skipping HAR: unable to get page URL (see preceding warning).') 88 | return 89 | 90 | hash = hash_url(page_url) 91 | if hash % NUM_PARTITIONS != index: 92 | logging.info(f'Skipping partition. {hash} % {NUM_PARTITIONS} != {index}') 93 | return 94 | 95 | return fn(har) 96 | 97 | 98 | def get_requests(har): 99 | """Parses the requests from a HAR object.""" 100 | 101 | if not har: 102 | return 103 | 104 | page_url = get_page_url(har) 105 | 106 | if not page_url: 107 | # The page_url field indirectly depends on the get_page function. 108 | # If the page data is unavailable for whatever reason, skip its requests. 109 | logging.warning('Skipping requests payload: unable to get page URL (see preceding warning).') 110 | return 111 | 112 | entries = har.get('log').get('entries') 113 | 114 | requests = [] 115 | 116 | for request in entries: 117 | 118 | request_url = request.get('_full_url') 119 | 120 | try: 121 | payload = to_json(trim_request(request)) 122 | except: 123 | logging.warning('Skipping requests payload for "%s": unable to stringify as JSON.' 
% request_url) 124 | continue 125 | 126 | payload_size = len(payload) 127 | if payload_size > MAX_CONTENT_SIZE: 128 | logging.warning('Skipping requests payload for "%s": payload size (%s) exceeded maximum content size of %s bytes.' % (request_url, payload_size, MAX_CONTENT_SIZE)) 129 | continue 130 | 131 | requests.append({ 132 | 'page': page_url, 133 | 'url': request_url, 134 | 'payload': payload 135 | }) 136 | 137 | return requests 138 | 139 | 140 | def trim_request(request): 141 | """Removes redundant fields from the request object.""" 142 | 143 | # Make a copy first so the response body can be used later. 144 | request = deepcopy(request) 145 | request.get('response').get('content').pop('text', None) 146 | return request 147 | 148 | 149 | def hash_url(url): 150 | """Hashes a given URL to a process-stable integer value.""" 151 | return int(sha256(url.encode('utf-8')).hexdigest(), 16) 152 | 153 | 154 | def get_response_bodies(har): 155 | """Parses response bodies from a HAR object.""" 156 | 157 | page_url = get_page_url(har) 158 | requests = har.get('log').get('entries') 159 | 160 | response_bodies = [] 161 | 162 | for request in requests: 163 | request_url = request.get('_full_url') 164 | body = None 165 | if request.get('response') and request.get('response').get('content'): 166 | body = request.get('response').get('content').get('text', None) 167 | 168 | if body == None: 169 | continue 170 | 171 | truncated = len(body) > MAX_CONTENT_SIZE 172 | if truncated: 173 | logging.warning('Truncating response body for "%s". Response body size %s exceeds limit %s.' % (request_url, len(body), MAX_CONTENT_SIZE)) 174 | 175 | response_bodies.append({ 176 | 'page': page_url, 177 | 'url': request_url, 178 | 'body': body[:MAX_CONTENT_SIZE], 179 | 'truncated': truncated 180 | }) 181 | 182 | return response_bodies 183 | 184 | 185 | def get_technologies(har): 186 | """Parses the technologies from a HAR object.""" 187 | 188 | if not har: 189 | return 190 | 191 | page = har.get('log').get('pages')[0] 192 | page_url = page.get('_URL') 193 | app_names = page.get('_detected_apps', {}) 194 | categories = page.get('_detected', {}) 195 | 196 | # When there are no detected apps, it appears as an empty array. 197 | if isinstance(app_names, list): 198 | app_names = {} 199 | categories = {} 200 | 201 | app_map = {} 202 | app_list = [] 203 | for app, info_list in app_names.items(): 204 | if not info_list: 205 | continue 206 | # There may be multiple info values. Add each to the map. 207 | for info in info_list.split(','): 208 | app_id = '%s %s' % (app, info) if len(info) > 0 else app 209 | app_map[app_id] = app 210 | 211 | for category, apps in categories.items(): 212 | for app_id in apps.split(','): 213 | app = app_map.get(app_id) 214 | info = '' 215 | if app == None: 216 | app = app_id 217 | else: 218 | info = app_id[len(app):].strip() 219 | app_list.append({ 220 | 'url': page_url, 221 | 'category': category, 222 | 'app': app, 223 | 'info': info 224 | }) 225 | 226 | return app_list 227 | 228 | 229 | def get_lighthouse_reports(har): 230 | """Parses Lighthouse results from a HAR object.""" 231 | 232 | if not har: 233 | return 234 | 235 | report = har.get('_lighthouse') 236 | 237 | if not report: 238 | return 239 | 240 | page_url = get_page_url(har) 241 | 242 | if not page_url: 243 | logging.warning('Skipping lighthouse report: unable to get page URL (see preceding warning).') 244 | return 245 | 246 | # Omit large UGC. 
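    # The 'screenshot-thumbnails' audit embeds base64-encoded images in its
    # details items; they account for most of the report's size, so dropping
    # them helps keep the serialized report under the MAX_CONTENT_SIZE check below.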
247 | report.get('audits').get('screenshot-thumbnails', {}).get('details', {}).pop('items', None) 248 | 249 | try: 250 | report_json = to_json(report) 251 | except: 252 | logging.warning('Skipping Lighthouse report for "%s": unable to stringify as JSON.' % page_url) 253 | return 254 | 255 | report_size = len(report_json) 256 | if report_size > MAX_CONTENT_SIZE: 257 | logging.warning('Skipping Lighthouse report for "%s": Report size (%s) exceeded maximum content size of %s bytes.' % (page_url, report_size, MAX_CONTENT_SIZE)) 258 | return 259 | 260 | return [{ 261 | 'url': page_url, 262 | 'report': report_json 263 | }] 264 | 265 | 266 | def to_json(obj): 267 | """Returns a JSON representation of the object. 268 | 269 | This method attempts to mirror the output of the 270 | legacy Java Dataflow pipeline. For the most part, 271 | the default `json.dumps` config does the trick, 272 | but there are a few settings to make it more consistent: 273 | 274 | - Omit whitespace between properties 275 | - Do not escape non-ASCII characters (preserve UTF-8) 276 | 277 | One difference between this Python implementation and the 278 | Java implementation is the way long numbers are handled. 279 | A Python-serialized JSON string might look like this: 280 | 281 | "timestamp":1551686646079.9998 282 | 283 | while the Java-serialized string uses scientific notation: 284 | 285 | "timestamp":1.5516866460799998E12 286 | 287 | Out of a sample of 200 actual request objects, this was 288 | the only difference between implementations. This can be 289 | considered an improvement. 290 | """ 291 | 292 | if not obj: 293 | raise ValueError 294 | 295 | return json.dumps(obj, separators=(',', ':'), ensure_ascii=False) 296 | 297 | 298 | def from_json(str): 299 | """Returns an object from the JSON representation.""" 300 | 301 | try: 302 | return json.loads(str) 303 | except Exception as e: 304 | logging.error('Unable to parse JSON object "%s...": %s' % (str[:50], e)) 305 | return 306 | 307 | 308 | def get_gcs_dir(release): 309 | """Formats a release string into a gs:// directory.""" 310 | 311 | return 'gs://httparchive/crawls/%s/' % release 312 | 313 | 314 | def gcs_list(gcs_dir): 315 | """Lists all files in a GCS directory.""" 316 | gcs = gcsio.GcsIO() 317 | return gcs.list_prefix(gcs_dir) 318 | 319 | 320 | def get_bigquery_uri(release, dataset): 321 | """Formats a release string into a BigQuery dataset/table.""" 322 | 323 | client, date_string = release.split('-') 324 | 325 | if client == 'chrome': 326 | client = 'desktop' 327 | elif client == 'android': 328 | client = 'mobile' 329 | 330 | date_obj = datetime.strptime(date_string, '%b_%d_%Y') # Mar_01_2020 331 | date_string = date_obj.strftime('%Y_%m_%d') # 2020_03_01 332 | 333 | return 'httparchive:%s.%s_%s' % (dataset, date_string, client) 334 | 335 | 336 | def run(argv=None): 337 | """Constructs and runs the BigQuery import pipeline.""" 338 | parser = argparse.ArgumentParser() 339 | parser.add_argument( 340 | '--input', 341 | required=True, 342 | help='Input Cloud Storage directory to process.') 343 | known_args, pipeline_args = parser.parse_known_args(argv) 344 | pipeline_options = PipelineOptions(pipeline_args) 345 | pipeline_options.view_as(SetupOptions).save_main_session = True 346 | 347 | 348 | with beam.Pipeline(options=pipeline_options) as p: 349 | gcs_dir = get_gcs_dir(known_args.input) 350 | 351 | hars = (p 352 | | beam.Create([gcs_dir]) 353 | | beam.io.ReadAllFromText() 354 | | 'MapJSON' >> beam.Map(from_json)) 355 | 356 | for i in range(NUM_PARTITIONS): 357 | 
(hars 358 | | f'MapPages{i}' >> beam.FlatMap( 359 | (lambda i: lambda har: partition_step(get_page, har, i))(i)) 360 | | f'WritePages{i}' >> beam.io.WriteToBigQuery( 361 | get_bigquery_uri(known_args.input, 'pages'), 362 | schema='url:STRING, payload:STRING', 363 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 364 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 365 | 366 | (hars 367 | | f'MapTechnologies{i}' >> beam.FlatMap( 368 | (lambda i: lambda har: partition_step(get_technologies, har, i))(i)) 369 | | f'WriteTechnologies{i}' >> beam.io.WriteToBigQuery( 370 | get_bigquery_uri(known_args.input, 'technologies'), 371 | schema='url:STRING, category:STRING, app:STRING, info:STRING', 372 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 373 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 374 | 375 | (hars 376 | | f'MapLighthouseReports{i}' >> beam.FlatMap( 377 | (lambda i: lambda har: partition_step(get_lighthouse_reports, har, i))(i)) 378 | | f'WriteLighthouseReports{i}' >> beam.io.WriteToBigQuery( 379 | get_bigquery_uri(known_args.input, 'lighthouse'), 380 | schema='url:STRING, report:STRING', 381 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 382 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 383 | (hars 384 | | f'MapRequests{i}' >> beam.FlatMap( 385 | (lambda i: lambda har: partition_step(get_requests, har, i))(i)) 386 | | f'WriteRequests{i}' >> beam.io.WriteToBigQuery( 387 | get_bigquery_uri(known_args.input, 'requests'), 388 | schema='page:STRING, url:STRING, payload:STRING', 389 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 390 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 391 | 392 | (hars 393 | | f'MapResponseBodies{i}' >> beam.FlatMap( 394 | (lambda i: lambda har: partition_step(get_response_bodies, har, i))(i)) 395 | | f'WriteResponseBodies{i}' >> beam.io.WriteToBigQuery( 396 | get_bigquery_uri(known_args.input, 'response_bodies'), 397 | schema='page:STRING, url:STRING, body:STRING, truncated:BOOLEAN', 398 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 399 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 400 | 401 | 402 | if __name__ == '__main__': 403 | logging.getLogger().setLevel(logging.INFO) 404 | run() 405 | -------------------------------------------------------------------------------- /dataflow/python/get_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f easyprivacy.txt ]; then 4 | wget --no-check-certificate https://easylist-downloads.adblockplus.org/easyprivacy.txt -O local/tracker.txt 5 | fi 6 | 7 | if [ ! -f easylist_noelemhide.txt ]; then 8 | wget --no-check-certificate https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt -O local/ad.txt 9 | fi 10 | 11 | if [ ! -f fanboy-annoyance.txt ]; then 12 | wget --no-check-certificate https://easylist-downloads.adblockplus.org/fanboy-annoyance.txt -O local/social.txt 13 | fi 14 | 15 | -------------------------------------------------------------------------------- /dataflow/python/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp]==2.31 2 | -------------------------------------------------------------------------------- /dataflow/python/run.sh: -------------------------------------------------------------------------------- 1 | # Omit the runner option to run the pipeline locally. 
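# When running locally, point --input at a small subset of HAR files copied to
# a separate GCS directory; a full crawl consumes terabytes of disk space and
# is not practical to process locally (see dataflow/python/README.md).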
2 | #--runner=DataflowRunner \ 3 | python bigquery_import.py \ 4 | --runner=DataflowRunner \ 5 | --project=httparchive \ 6 | --temp_location=gs://httparchive/dataflow/temp \ 7 | --staging_location=gs://httparchive/dataflow/staging \ 8 | --region=us-west1 \ 9 | --machine_type=n1-standard-32 \ 10 | --input=android-Jul_1_2021 \ 11 | --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd 12 | -------------------------------------------------------------------------------- /dataflow/python/setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import setuptools 3 | 4 | from setuptools.command.bdist_egg import bdist_egg as _bdist_egg 5 | 6 | class bdist_egg(_bdist_egg): # pylint: disable=invalid-name 7 | def run(self): 8 | self.run_command('CustomCommands') 9 | _bdist_egg.run(self) 10 | 11 | # Some custom command to run during setup. 12 | CUSTOM_COMMANDS = [ 13 | ['apt-get', 'update'], 14 | ['apt-get', '--assume-yes', 'install', 'libre2-1', 'libre2-dev'], 15 | ] 16 | 17 | class CustomCommands(setuptools.Command): 18 | def initialize_options(self): 19 | pass 20 | 21 | def finalize_options(self): 22 | pass 23 | 24 | def RunCustomCommand(self, command_list): 25 | print 'Running command: %s' % command_list 26 | p = subprocess.Popen( 27 | command_list, 28 | stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 29 | # Can use communicate(input='y\n'.encode()) if the command run requires 30 | # some confirmation. 31 | stdout_data, _ = p.communicate() 32 | print 'Command output: %s' % stdout_data 33 | if p.returncode != 0: 34 | raise RuntimeError( 35 | 'Command %s failed: exit code: %s' % (command_list, p.returncode)) 36 | 37 | def run(self): 38 | for command in CUSTOM_COMMANDS: 39 | self.RunCustomCommand(command) 40 | 41 | 42 | # Configure the required packages and scripts to install. 43 | REQUIRED_PACKAGES = [ 44 | 'adblockparser', 45 | 're2' 46 | ] 47 | 48 | setuptools.setup( 49 | name='adblock', 50 | version='0.0.1', 51 | description='adblock pipeline', 52 | install_requires=REQUIRED_PACKAGES, 53 | packages=setuptools.find_packages(), 54 | cmdclass={ 55 | # Command class instantiated and run during easy_install scenarios. 56 | 'bdist_egg': bdist_egg, 57 | 'CustomCommands': CustomCommands, 58 | } 59 | ) 60 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # BigQuery Pipeline 2 | 3 | ## Dataflow 4 | 5 | TODO 6 | 7 | ## JSON Generation 8 | 9 | After each crawl, the [generate_reports.sh](../sql/generate_reports.sh) script is run with the date of the crawl. For example: 10 | 11 | ```sh 12 | sql/generate_reports.sh -t -h 2017_09_01 13 | ``` 14 | 15 | This will generate timeseries and histogram reports for all metrics using predefined SQL queries. The histogram queries will fill table placeholders with the crawl date provided. For example: 16 | 17 | ```sql 18 | SELECT ... FROM `httparchive.runs.${YYYY_MM_DD}_pages* ...` 19 | ``` 20 | 21 | will become 22 | 23 | ```sql 24 | SELECT ... FROM `httparchive.runs.2017_09_01_pages* ...` 25 | ``` 26 | 27 | After executing the histogram/timeseries queries for each metric on BigQuery, the results will be saved as JSON on Google Storage. For example, the `bytesJS` histogram would be saved to `gs://httparchive/reports/2017_09_01/bytesJS.json`. The timeseries for the same metric would be saved to `gs://httparchive/reports/bytesJS.json`. 
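To make the placeholder substitution and output locations concrete, here is a small Python illustration. It is not part of the pipeline; it only restates the conventions described above, with hypothetical values for the date and metric:

```python
# Illustration of the conventions above, not pipeline code: how the
# ${YYYY_MM_DD} placeholder and the Google Storage output paths relate
# for a given crawl date and metric (values here are hypothetical).
date = "2017_09_01"
metric = "bytesJS"

histogram_sql = "SELECT ... FROM `httparchive.runs.${YYYY_MM_DD}_pages*` ..."
histogram_sql = histogram_sql.replace("${YYYY_MM_DD}", date)
# -> "SELECT ... FROM `httparchive.runs.2017_09_01_pages*` ..."

histogram_json = f"gs://httparchive/reports/{date}/{metric}.json"  # one file per crawl date
timeseries_json = f"gs://httparchive/reports/{metric}.json"        # single file, appended over time

print(histogram_sql)
print(histogram_json)
print(timeseries_json)
```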
28 | 29 | ### Running Manually 30 | 31 | Sometimes it's necessary to manually run this process, for example if a new metric is added or specific dates need to be backfilled. The generate_reports.sh script can be run with a different configuration of flags depending on your needs. From the script's documentation: 32 | 33 | ```sh 34 | # Flags: 35 | # 36 | # -t: Whether to generate timeseries. 37 | # Note to run in incremental mode also need to use -h to pass date 38 | # 39 | # -h: Whether to generate histograms. Must be accompanied by the date to query. 40 | # 41 | # -f: Whether to force histogram querying and updating even if the data exists. 42 | # Timeseries are usually appended to from last date, but this flag forces a complete rerun 43 | # 44 | # -l: Which lens to run. 45 | # Can also be set to ALL to run both the base (lens-less) report and all lenses. 46 | # 47 | # -r: Optional pattern match for reports to be run. Use quotes to avoid the shell expanding names 48 | # (e.g. "*crux*") 49 | ``` 50 | 51 | You can omit one of the `-t` or `-h` flags to focus only on histogram or timeseries generation. The `-f` flag ensures that histogram data gets overwritten. Omit this flag to skip queries for dates that already exist (much faster for batch jobs, see below). 52 | 53 | ### Getting Dates Dynamically 54 | 55 | If you're adding a new metric, it would be a pain to run the generation script manually for each date. HTTP Archive has over 300 crawls' worth of dated tables in BigQuery! The [get_bigquery_dates.sh](../sql/get_bigquery_dates.sh) script can be used to get all of the dates in `YYYY_MM_DD` format for a particular table type. For example, if your new metric depends on the `pages` tables of the `runs` dataset (e.g. `httparchive.runs.2017_09_01_pages`), you could get the dates representing all of the matching tables by running this command: 56 | 57 | ```sh 58 | sql/get_bigquery_dates.sh runs pages 59 | ``` 60 | 61 | Or if you want to limit the results to a particular range, you can pass in upper and lower bounds: 62 | 63 | ```sh 64 | sql/get_bigquery_dates.sh runs pages 2015_01_01 2015_12_15 65 | ``` 66 | 67 | The output of this script is a newline-delimited list of dates. This format enables convenient piping of the output as input to the generate_reports.sh script. For example: 68 | 69 | ```sh 70 | sql/get_bigquery_dates.sh runs pages | \ 71 | xargs -I date sql/generate_reports.sh -h date 72 | ``` 73 | 74 | `xargs` handles the processing of each date and calls the other script. 75 | 76 | ### Generating Specific Metrics 77 | 78 | _TODO: document `sql/generate_report.sh`. This updates one histogram/timeseries at a time._ 79 | 80 | Running `generate_reports.sh` without the `-f` flag will cause metrics whose JSON results are already on Google Storage to be skipped rather than requeried. To regenerate results for specific metrics, the easiest thing to do may be to remove their results from Google Storage first, rather than running with the `-f` flag enabled and waiting for all other metrics to be queried and uploaded. 81 | 82 | For example, if a change is made to the `reqTotal.sql` histogram query, then you can "invalidate" all histogram results for this query by deleting all respective JSON files from Google Storage: 83 | 84 | ```sh 85 | gsutil rm gs://httparchive/reports/*/reqTotal.json 86 | ``` 87 | 88 | The wildcard in the YYYY_MM_DD position will instruct `gsutil` to delete all histogram results for this specific metric.
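If you'd rather script the same invalidation, the equivalent of the `gsutil rm` wildcard is a prefix listing with a per-date match. This is only a sketch, assuming the `google-cloud-storage` client library and credentials with write access to the bucket:

```python
# Sketch of the gsutil wildcard delete above using google-cloud-storage.
# Assumes credentials with write access to gs://httparchive.
import re
from google.cloud import storage

metric = "reqTotal"
# Match reports/YYYY_MM_DD/<metric>.json (dated histograms) but not the
# timeseries file at reports/<metric>.json.
pattern = re.compile(r"^reports/\d{4}_\d{2}_\d{2}/%s\.json$" % metric)

client = storage.Client()
for blob in client.list_blobs("httparchive", prefix="reports/"):
    if pattern.match(blob.name):
        print("Deleting", blob.name)
        blob.delete()
```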
89 | 90 | Now you can delete more metric-specific results or rerun `generate_reports.sh` without the `-f` flag and only the desired metrics will be requeried. 91 | 92 | Note that cdn.httparchive.org may still contain the old version of the JSON file for the duration of the TTL. See below for more on invalidating the cache. 93 | 94 | ## Serving the JSON Files 95 | 96 | The Google Storage bucket is behind an App Engine load balancer and CDN, which is aliased as [https://cdn.httparchive.org](https://cdn.httparchive.org). Accessing the JSON data follows the same pattern as the `gs://` URL. For example, the public URL for `gs://httparchive/reports/2017_09_01/bytesJS.json` is [https://cdn.httparchive.org/reports/2017_09_01/bytesJS.json](https://cdn.httparchive.org/reports/2017_09_01/bytesJS.json). Each file is configured to be served with `Content-Type: application/json` and `Cache-Control: public, max-age=3600` headers. 97 | 98 | The cache lifetime is set to 1 hour. If the cache needs to be invalidated for a particular file, this can be done by an administrator in the App Engine dashboard. 99 | 100 | A whitelist of origins are allowed to access the CDN. This list is maintained in [config/storage-cors.json](../config/storage-cors.json) and is configured to allow development, staging, and production servers. To save changes to this file, run: 101 | 102 | ```sh 103 | gsutil cors set config/storage-cors.json gs://httparchive` 104 | ``` 105 | 106 | This will update the CORS settings for the Google Storage bucket. 107 | -------------------------------------------------------------------------------- /schema/.sqlfluffignore: -------------------------------------------------------------------------------- 1 | httparchive_schema.sql 2 | -------------------------------------------------------------------------------- /schema/pages.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "pageid", 4 | "type": "INTEGER" 5 | }, 6 | { 7 | "name": "createDate", 8 | "type": "INTEGER" 9 | }, 10 | { 11 | "name": "archive", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "label", 16 | "type": "STRING" 17 | }, 18 | { 19 | "name": "crawlid", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "name": "wptid", 24 | "type": "STRING" 25 | }, 26 | { 27 | "name": "wptrun", 28 | "type": "INTEGER" 29 | }, 30 | { 31 | "name": "url", 32 | "type": "STRING" 33 | }, 34 | { 35 | "name": "urlShort", 36 | "type": "STRING" 37 | }, 38 | { 39 | "name": "urlhash", 40 | "type": "INTEGER" 41 | }, 42 | { 43 | "name": "cdn", 44 | "type": "STRING" 45 | }, 46 | { 47 | "name": "startedDateTime", 48 | "type": "INTEGER" 49 | }, 50 | { 51 | "name": "TTFB", 52 | "type": "INTEGER" 53 | }, 54 | { 55 | "name": "renderStart", 56 | "type": "INTEGER" 57 | }, 58 | { 59 | "name": "onContentLoaded", 60 | "type": "INTEGER" 61 | }, 62 | { 63 | "name": "onLoad", 64 | "type": "INTEGER" 65 | }, 66 | { 67 | "name": "fullyLoaded", 68 | "type": "INTEGER" 69 | }, 70 | { 71 | "name": "visualComplete", 72 | "type": "INTEGER" 73 | }, 74 | { 75 | "name": "PageSpeed", 76 | "type": "INTEGER" 77 | }, 78 | { 79 | "name": "SpeedIndex", 80 | "type": "INTEGER" 81 | }, 82 | { 83 | "name": "rank", 84 | "type": "INTEGER" 85 | }, 86 | { 87 | "name": "reqTotal", 88 | "type": "INTEGER" 89 | }, 90 | { 91 | "name": "reqHtml", 92 | "type": "INTEGER" 93 | }, 94 | { 95 | "name": "reqJS", 96 | "type": "INTEGER" 97 | }, 98 | { 99 | "name": "reqCSS", 100 | "type": "INTEGER" 101 | }, 102 | { 103 | "name": "reqImg", 104 | "type": "INTEGER" 
105 | }, 106 | { 107 | "name": "reqGif", 108 | "type": "INTEGER" 109 | }, 110 | { 111 | "name": "reqJpg", 112 | "type": "INTEGER" 113 | }, 114 | { 115 | "name": "reqPng", 116 | "type": "INTEGER" 117 | }, 118 | { 119 | "name": "reqFont", 120 | "type": "INTEGER" 121 | }, 122 | { 123 | "name": "reqFlash", 124 | "type": "INTEGER" 125 | }, 126 | { 127 | "name": "reqJson", 128 | "type": "INTEGER" 129 | }, 130 | { 131 | "name": "reqOther", 132 | "type": "INTEGER" 133 | }, 134 | { 135 | "name": "bytesTotal", 136 | "type": "INTEGER" 137 | }, 138 | { 139 | "name": "bytesHtml", 140 | "type": "INTEGER" 141 | }, 142 | { 143 | "name": "bytesJS", 144 | "type": "INTEGER" 145 | }, 146 | { 147 | "name": "bytesCSS", 148 | "type": "INTEGER" 149 | }, 150 | { 151 | "name": "bytesImg", 152 | "type": "INTEGER" 153 | }, 154 | { 155 | "name": "bytesGif", 156 | "type": "INTEGER" 157 | }, 158 | { 159 | "name": "bytesJpg", 160 | "type": "INTEGER" 161 | }, 162 | { 163 | "name": "bytesPng", 164 | "type": "INTEGER" 165 | }, 166 | { 167 | "name": "bytesFont", 168 | "type": "INTEGER" 169 | }, 170 | { 171 | "name": "bytesFlash", 172 | "type": "INTEGER" 173 | }, 174 | { 175 | "name": "bytesJson", 176 | "type": "INTEGER" 177 | }, 178 | { 179 | "name": "bytesOther", 180 | "type": "INTEGER" 181 | }, 182 | { 183 | "name": "bytesHtmlDoc", 184 | "type": "INTEGER" 185 | }, 186 | { 187 | "name": "numDomains", 188 | "type": "INTEGER" 189 | }, 190 | { 191 | "name": "maxDomainReqs", 192 | "type": "INTEGER" 193 | }, 194 | { 195 | "name": "numRedirects", 196 | "type": "INTEGER" 197 | }, 198 | { 199 | "name": "numErrors", 200 | "type": "INTEGER" 201 | }, 202 | { 203 | "name": "numGlibs", 204 | "type": "INTEGER" 205 | }, 206 | { 207 | "name": "numHttps", 208 | "type": "INTEGER" 209 | }, 210 | { 211 | "name": "numCompressed", 212 | "type": "INTEGER" 213 | }, 214 | { 215 | "name": "numDomElements", 216 | "type": "INTEGER" 217 | }, 218 | { 219 | "name": "maxageNull", 220 | "type": "INTEGER" 221 | }, 222 | { 223 | "name": "maxage0", 224 | "type": "INTEGER" 225 | }, 226 | { 227 | "name": "maxage1", 228 | "type": "INTEGER" 229 | }, 230 | { 231 | "name": "maxage30", 232 | "type": "INTEGER" 233 | }, 234 | { 235 | "name": "maxage365", 236 | "type": "INTEGER" 237 | }, 238 | { 239 | "name": "maxageMore", 240 | "type": "INTEGER" 241 | }, 242 | { 243 | "name": "gzipTotal", 244 | "type": "INTEGER" 245 | }, 246 | { 247 | "name": "gzipSavings", 248 | "type": "INTEGER" 249 | }, 250 | { 251 | "name": "_connections", 252 | "type": "INTEGER" 253 | }, 254 | { 255 | "name": "_adult_site", 256 | "type": "BOOLEAN" 257 | }, 258 | { 259 | "name": "avg_dom_depth", 260 | "type": "INTEGER" 261 | }, 262 | { 263 | "name": "document_height", 264 | "type": "INTEGER" 265 | }, 266 | { 267 | "name": "document_width", 268 | "type": "INTEGER" 269 | }, 270 | { 271 | "name": "localstorage_size", 272 | "type": "INTEGER" 273 | }, 274 | { 275 | "name": "sessionstorage_size", 276 | "type": "INTEGER" 277 | }, 278 | { 279 | "name": "num_iframes", 280 | "type": "INTEGER" 281 | }, 282 | { 283 | "name": "num_scripts", 284 | "type": "INTEGER" 285 | }, 286 | { 287 | "name": "doctype", 288 | "type": "STRING" 289 | }, 290 | { 291 | "name": "meta_viewport", 292 | "type": "STRING" 293 | }, 294 | { 295 | "name": "reqAudio", 296 | "type": "INTEGER" 297 | }, 298 | { 299 | "name": "reqVideo", 300 | "type": "INTEGER" 301 | }, 302 | { 303 | "name": "reqText", 304 | "type": "INTEGER" 305 | }, 306 | { 307 | "name": "reqXml", 308 | "type": "INTEGER" 309 | }, 310 | { 311 | "name": "reqWebp", 312 | 
"type": "INTEGER" 313 | }, 314 | { 315 | "name": "reqSvg", 316 | "type": "INTEGER" 317 | }, 318 | { 319 | "name": "bytesAudio", 320 | "type": "INTEGER" 321 | }, 322 | { 323 | "name": "bytesVideo", 324 | "type": "INTEGER" 325 | }, 326 | { 327 | "name": "bytesText", 328 | "type": "INTEGER" 329 | }, 330 | { 331 | "name": "bytesXml", 332 | "type": "INTEGER" 333 | }, 334 | { 335 | "name": "bytesWebp", 336 | "type": "INTEGER" 337 | }, 338 | { 339 | "name": "bytesSvg", 340 | "type": "INTEGER" 341 | }, 342 | { 343 | "name": "num_scripts_async", 344 | "type": "INTEGER" 345 | }, 346 | { 347 | "name": "num_scripts_sync", 348 | "type": "INTEGER" 349 | }, 350 | { 351 | "name": "usertiming", 352 | "type": "INTEGER" 353 | } 354 | ] 355 | -------------------------------------------------------------------------------- /schema/requests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "requestid", 4 | "type": "INTEGER" 5 | }, 6 | { 7 | "name": "pageid", 8 | "type": "INTEGER" 9 | }, 10 | { 11 | "name": "startedDateTime", 12 | "type": "INTEGER" 13 | }, 14 | { 15 | "name": "time", 16 | "type": "INTEGER" 17 | }, 18 | { 19 | "name": "method", 20 | "type": "STRING" 21 | }, 22 | { 23 | "name": "url", 24 | "type": "STRING" 25 | }, 26 | { 27 | "name": "urlShort", 28 | "type": "STRING" 29 | }, 30 | { 31 | "name": "redirectUrl", 32 | "type": "STRING" 33 | }, 34 | { 35 | "name": "firstReq", 36 | "type": "BOOLEAN" 37 | }, 38 | { 39 | "name": "firstHtml", 40 | "type": "BOOLEAN" 41 | }, 42 | { 43 | "name": "reqHttpVersion", 44 | "type": "STRING" 45 | }, 46 | { 47 | "name": "reqHeadersSize", 48 | "type": "INTEGER" 49 | }, 50 | { 51 | "name": "reqBodySize", 52 | "type": "INTEGER" 53 | }, 54 | { 55 | "name": "reqCookieLen", 56 | "type": "INTEGER" 57 | }, 58 | { 59 | "name": "reqOtherHeaders", 60 | "type": "STRING" 61 | }, 62 | { 63 | "name": "status", 64 | "type": "INTEGER" 65 | }, 66 | { 67 | "name": "respHttpVersion", 68 | "type": "STRING" 69 | }, 70 | { 71 | "name": "respHeadersSize", 72 | "type": "INTEGER" 73 | }, 74 | { 75 | "name": "respBodySize", 76 | "type": "INTEGER" 77 | }, 78 | { 79 | "name": "respSize", 80 | "type": "INTEGER" 81 | }, 82 | { 83 | "name": "respCookieLen", 84 | "type": "INTEGER" 85 | }, 86 | { 87 | "name": "expAge", 88 | "type": "INTEGER" 89 | }, 90 | { 91 | "name": "mimeType", 92 | "type": "STRING" 93 | }, 94 | { 95 | "name": "respOtherHeaders", 96 | "type": "STRING" 97 | }, 98 | { 99 | "name": "req_accept", 100 | "type": "STRING" 101 | }, 102 | { 103 | "name": "req_accept_charset", 104 | "type": "STRING" 105 | }, 106 | { 107 | "name": "req_accept_encoding", 108 | "type": "STRING" 109 | }, 110 | { 111 | "name": "req_accept_language", 112 | "type": "STRING" 113 | }, 114 | { 115 | "name": "req_connection", 116 | "type": "STRING" 117 | }, 118 | { 119 | "name": "req_host", 120 | "type": "STRING" 121 | }, 122 | { 123 | "name": "req_if_modified_since", 124 | "type": "STRING" 125 | }, 126 | { 127 | "name": "req_if_none_match", 128 | "type": "STRING" 129 | }, 130 | { 131 | "name": "req_referer", 132 | "type": "STRING" 133 | }, 134 | { 135 | "name": "req_user_agent", 136 | "type": "STRING" 137 | }, 138 | { 139 | "name": "resp_accept_ranges", 140 | "type": "STRING" 141 | }, 142 | { 143 | "name": "resp_age", 144 | "type": "STRING" 145 | }, 146 | { 147 | "name": "resp_cache_control", 148 | "type": "STRING" 149 | }, 150 | { 151 | "name": "resp_connection", 152 | "type": "STRING" 153 | }, 154 | { 155 | "name": "resp_content_encoding", 156 | "type": 
"STRING" 157 | }, 158 | { 159 | "name": "resp_content_language", 160 | "type": "STRING" 161 | }, 162 | { 163 | "name": "resp_content_length", 164 | "type": "STRING" 165 | }, 166 | { 167 | "name": "resp_content_location", 168 | "type": "STRING" 169 | }, 170 | { 171 | "name": "resp_content_type", 172 | "type": "STRING" 173 | }, 174 | { 175 | "name": "resp_date", 176 | "type": "STRING" 177 | }, 178 | { 179 | "name": "resp_etag", 180 | "type": "STRING" 181 | }, 182 | { 183 | "name": "resp_expires", 184 | "type": "STRING" 185 | }, 186 | { 187 | "name": "resp_keep_alive", 188 | "type": "STRING" 189 | }, 190 | { 191 | "name": "resp_last_modified", 192 | "type": "STRING" 193 | }, 194 | { 195 | "name": "resp_location", 196 | "type": "STRING" 197 | }, 198 | { 199 | "name": "resp_pragma", 200 | "type": "STRING" 201 | }, 202 | { 203 | "name": "resp_server", 204 | "type": "STRING" 205 | }, 206 | { 207 | "name": "resp_transfer_encoding", 208 | "type": "STRING" 209 | }, 210 | { 211 | "name": "resp_vary", 212 | "type": "STRING" 213 | }, 214 | { 215 | "name": "resp_via", 216 | "type": "STRING" 217 | }, 218 | { 219 | "name": "resp_x_powered_by", 220 | "type": "STRING" 221 | }, 222 | { 223 | "name": "_cdn_provider", 224 | "type": "STRING" 225 | }, 226 | { 227 | "name": "_gzip_save", 228 | "type": "INTEGER" 229 | }, 230 | { 231 | "name": "crawlid", 232 | "type": "INTEGER" 233 | }, 234 | { 235 | "name": "type", 236 | "type": "STRING" 237 | }, 238 | { 239 | "name": "ext", 240 | "type": "STRING" 241 | }, 242 | { 243 | "name": "format", 244 | "type": "STRING" 245 | } 246 | ] 247 | -------------------------------------------------------------------------------- /schema/schema.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | 3 | def type(t) 4 | case t 5 | when /tinyint/ then "BOOLEAN" 6 | when /int/ then "INTEGER" 7 | when /varchar|text/ then "STRING" 8 | end 9 | end 10 | 11 | def scan(table) 12 | schema = [] 13 | s = IO.read('httparchive_schema.sql') 14 | 15 | m = s.match(/CREATE\sTABLE\s`#{table}`\s\((.*?)PRIMARY\sKEY/m)[1] 16 | m.split("\n").compact.each do |f| 17 | next if f.strip.empty? 18 | fm = f.strip.match(/`(.*?)`\s(\w+)/m) 19 | 20 | schema << { 21 | "name" => fm[1], 22 | "type" => type(fm[2]) 23 | } 24 | end 25 | 26 | schema 27 | end 28 | 29 | jj scan(ARGV[0]) 30 | -------------------------------------------------------------------------------- /sql/.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | ## verbose is an integer (0-2) indicating the level of log output 3 | verbose = 0 4 | ## Turn off color formatting of output 5 | nocolor = False 6 | ## Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html 7 | ## Or run 'sqlfluff dialects' 8 | dialect = bigquery 9 | ## One of [raw|jinja|python|placeholder] 10 | templater = jinja 11 | ## Comma separated list of rules to check, or None for all 12 | rules = None 13 | ## Comma separated list of rules to exclude, or None 14 | exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07 15 | # AL04 - Asks for unique table aliases meaning it complains if selecting from two 2021_07_01 tables as implicit alias is table name (not fully qualified) so same. 16 | # AL07 - Avoid aliases in from and join - why? 17 | # AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all. 18 | # AM05 - INNER JOIN must be fully qualified. 
Probably should use this but not our style. 19 | # CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case 20 | # CP03 - Function names will be mixed case so don't enforce case 21 | # CV02 - Use COALESCE instead of IFNULL or NVL. We think IFNULL is clearer. 22 | # CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558 23 | # LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future? 24 | # LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future? 25 | # LT14 - Keywords on newline. We have some simple, single-line joins 26 | # RF01 - BigQuery uses STRUCTS which can look like incorrect table references 27 | # RF02 - Asks for qualified columns for ambiguous ones, but we do not qualify our columns, and they are not really ambiguous (or BigQuery would complain) 28 | # RF03 - Insists on references in column names even if not ambiguous. Bit OTT. 29 | # RF04 - Avoids keywords as identifiers but we use this a lot (e.g. AS count, AS max...etc.) 30 | # ST07 - Uses joins instead of USING - why? 31 | # ST06 - Insists on wildcards (*) in certain SELECT order - why? 32 | # ST01 - Do not use ELSE NULL as redundant. But it's clearer!? 33 | # ST05 - Use CTEs instead of subqueries. We don't use this consistently and it would be a big rewrite to do that. 34 | # ST02 - Use coalesce instead of case if you can. But it's clearer!? 35 | 36 | [sqlfluff:indentation] 37 | tab_space_size = 2 38 | indent_unit = space 39 | indented_using_on = False 40 | 41 | [sqlfluff:layout:type:binary_operator] 42 | line_position = trailing 43 | 44 | [sqlfluff:templater:jinja:context] 45 | BLINK_DATE_JOIN="AND 1=2" 46 | 47 | [sqlfluff:rules:capitalisation.keywords] 48 | capitalisation_policy = upper 49 | 50 | [sqlfluff:rules:convention.count_rows] 51 | # Consistent syntax to count all rows 52 | prefer_count_0 = True 53 | 54 | [sqlfluff:rules:references.special_chars] 55 | # Special characters in identifiers 56 | additional_allowed_characters = ".-${}" 57 | -------------------------------------------------------------------------------- /sql/.sqlfluffignore: -------------------------------------------------------------------------------- 1 | /lens/*/crux_histograms.sql 2 | /lens/*/crux_timeseries.sql 3 | /lens/*/histograms.sql 4 | /lens/*/timeseries.sql 5 | -------------------------------------------------------------------------------- /sql/addDate.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * Adds a single date to dates.json if it doesn't already exist. 4 | * 5 | * Usage: 6 | * 7 | * node sql/addDate.js 2017_09_01 8 | * 9 | */ 10 | 11 | const fs = require('fs'); 12 | 13 | 14 | const date = process.argv[2]; 15 | if (!date) { 16 | console.error(`You must pass a YYYY_MM_DD-formatted date as input. For example: 17 | sql/addDate.js 2017_09_01`); 18 | process.exit(1); 19 | } 20 | 21 | fs.readFile('config/dates.json', 'utf8', (err, data) => { 22 | if (err) { 23 | console.error(err); 24 | return; 25 | } 26 | 27 | // Use a set to dedupe. 28 | let dates = new Set(JSON.parse(data)); 29 | dates.add(date); 30 | dates = Array.from(dates).sort((a, b) => { 31 | return a > b ? -1 : 1; 32 | }); 33 | 34 | const dateStr = JSON.stringify(dates, null, 2) + '\n'; 35 | 36 | // Update the config file.
37 | fs.writeFile('config/dates.json', dateStr, 'utf8', (err) => { 38 | if (err) { 39 | console.error(err); 40 | } 41 | 42 | console.log('Updated config/dates.json'); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /sql/delete_date_from_reports.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Removes a particular date JSON from timeseries reports on Google Storage. 4 | # 5 | # Usage: 6 | # 7 | # $ sql/delete_date_from_reports.sh -d YYYY_MM_DD 8 | # $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k 9 | # $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k -r "*crux*" 10 | # 11 | # Flags: 12 | # 13 | # -l: Optional name of the report lens to generate, eg "top10k". 14 | # 15 | # -r: Optional name of the report files to generate, eg "*crux*". 16 | # 17 | 18 | set -o pipefail 19 | 20 | LENS_ARG="" 21 | REPORTS="*" 22 | VERBOSE=0 23 | NO_CHANGES=0 24 | 25 | # Read the flags. 26 | while getopts ":nvd:l:r:" opt; do 27 | case "${opt}" in 28 | d) 29 | YYYY_MM_DD=${OPTARG} 30 | ;; 31 | v) 32 | VERBOSE=1 33 | ;; 34 | n) 35 | NO_CHANGES=1 36 | ;; 37 | l) 38 | LENS_ARG=${OPTARG} 39 | ;; 40 | r) 41 | REPORTS=${OPTARG} 42 | ;; 43 | esac 44 | done 45 | 46 | if [[ "${YYYY_MM_DD}" == "" ]]; then 47 | echo "Usage $0 -d 2021_12_01" 48 | exit 1 49 | fi 50 | 51 | echo "${YYYY_MM_DD}" 52 | 53 | # Run all timeseries queries. 54 | for query in sql/timeseries/$REPORTS.sql; do 55 | 56 | if [[ ! -f $query ]]; then 57 | echo "Nothing to do" 58 | continue; 59 | fi 60 | 61 | # Extract the metric name from the file path. 62 | metric=$(echo $(basename $query) | cut -d"." -f1) 63 | 64 | if [[ "${LENS_ARG}" == "" ]]; then 65 | LENSES=("") 66 | echo "Deleting ${metric} report for base" 67 | elif [[ "${LENS_ARG}" == "ALL" ]]; then 68 | LENSES=("" $(ls sql/lens)) 69 | echo "Deleting ${metric} report for base and all lenses" 70 | else 71 | LENSES=("${LENS_ARG}") 72 | echo "Deleting ${metric} report for one lens" 73 | fi 74 | 75 | for LENS in "${LENSES[@]}" 76 | do 77 | 78 | gs_lens_dir="" 79 | if [[ $LENS != "" ]]; then 80 | gs_lens_dir="$LENS/" 81 | fi 82 | 83 | current_contents="" 84 | gs_url="gs://httparchive/reports/$gs_lens_dir${metric}.json" 85 | gsutil ls $gs_url &> /dev/null 86 | 87 | if [ $? -eq 0 ]; then 88 | 89 | echo "Updating this query: ${metric} for LENS: ${LENS}" 90 | 91 | # The file exists, so remove the requested date 92 | current_contents=$(gsutil cat $gs_url) 93 | 94 | if [ ${VERBOSE} -eq 1 ]; then 95 | echo "Current JSON:" 96 | echo "${current_contents}\n" 97 | fi 98 | 99 | new_contents=$(echo "$current_contents" | jq -c --indent 1 --arg date "${YYYY_MM_DD}" '.[] | select(.date!=$date)' | tr -d '\n' | sed 's/^/[ /' | sed 's/}$/ } ]\n/' | sed 's/}{/ }, {/g') 100 | 101 | if [ ${VERBOSE} -eq 1 ]; then 102 | echo "New JSON:" 103 | echo "${new_contents}" 104 | fi 105 | 106 | # Make sure the removal succeeded. 107 | if [ $? -eq 0 ] && [ ${NO_CHANGES} -eq 0 ]; then 108 | 109 | # Upload the response to Google Storage. 
110 | echo "Uploading new file to Google Storage" 111 | echo $new_contents \ 112 | | gsutil -h "Content-Type:application/json" cp - $gs_url 113 | else 114 | echo $new_contents >&2 115 | fi 116 | fi 117 | done 118 | done 119 | 120 | echo -e "Done" 121 | -------------------------------------------------------------------------------- /sql/generate_reports.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Updates the JSON reports on Google Storage with the latest BigQuery data. 4 | # 5 | # Usage: 6 | # 7 | # $ sql/generateReports.sh -t -h YYYY_MM_DD 8 | # 9 | # Flags: 10 | # 11 | # -t: Whether to generate timeseries. 12 | # 13 | # -h: Whether to generate histograms. Must be accompanied by the date to query. 14 | # 15 | # -f: Whether to force querying and updating even if the data exists. 16 | # 17 | # -l: Optional name of the report lens to generate, eg "top10k". 18 | # 19 | # -r: Optional name of the report files to generate, eg "*crux*". 20 | # 21 | 22 | set -o pipefail 23 | 24 | BQ_CMD="bq --format prettyjson --project_id httparchive query --max_rows 1000000" 25 | FORCE=0 26 | GENERATE_HISTOGRAM=0 27 | GENERATE_TIMESERIES=0 28 | LENS_ARG="" 29 | REPORTS="*" 30 | VERBOSE=0 31 | 32 | # Read the flags. 33 | while getopts ":ftvh:l:r:" opt; do 34 | case "${opt}" in 35 | h) 36 | GENERATE_HISTOGRAM=1 37 | YYYY_MM_DD=${OPTARG} 38 | dateParts=(`echo ${OPTARG} | tr "_" "\\n"`) 39 | YYYYMM=${dateParts[0]}${dateParts[1]} 40 | DATE=${dateParts[0]}-${dateParts[1]}-${dateParts[2]} 41 | ;; 42 | t) 43 | GENERATE_TIMESERIES=1 44 | ;; 45 | v) 46 | VERBOSE=1 47 | ;; 48 | f) 49 | FORCE=1 50 | ;; 51 | l) 52 | LENS_ARG=${OPTARG} 53 | ;; 54 | r) 55 | REPORTS=${OPTARG} 56 | ;; 57 | esac 58 | done 59 | 60 | # Exit early if there is nothing to do. 61 | if [ $GENERATE_HISTOGRAM -eq 0 -a $GENERATE_TIMESERIES -eq 0 ]; then 62 | echo -e "You must provide one or both -t or -h flags." >&2 63 | echo -e "For example: sql/generateReports.sh -t -h 2017_08_01" >&2 64 | exit 1 65 | fi 66 | 67 | # Check if all tables for the given date are available in BigQuery. 68 | # Tables representing desktop/mobile and HAR/CSV data sources must exist. 69 | DATED_TABLES_READY=0 70 | if [ -n "$YYYY_MM_DD" ]; then 71 | echo "Checking if tables are ready for ${DATE}..." 
72 | DESKTOP_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1) 73 | DESKTOP_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1) 74 | MOBILE_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1) 75 | MOBILE_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1) 76 | DESKTOP_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1) 77 | DESKTOP_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1) 78 | MOBILE_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1) 79 | MOBILE_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1) 80 | echo "Finished checking if tables are ready" 81 | if [[ "$DESKTOP_ROOT_PAGES_EXIST" == true && "$DESKTOP_NON_ROOT_PAGES_EXIST" == true && "$MOBILE_ROOT_PAGES_EXIST" == true && "$MOBILE_NON_ROOT_PAGES_EXIST" == true && "$DESKTOP_ROOT_REQUESTS_EXIST" == true && "$DESKTOP_NON_ROOT_REQUESTS_EXIST" == true && "$MOBILE_ROOT_REQUESTS_EXIST" == true && "$MOBILE_NON_ROOT_REQUESTS_EXIST" == true ]]; then 82 | DATED_TABLES_READY=1 83 | fi 84 | fi 85 | if [ $GENERATE_HISTOGRAM -ne 0 -a $DATED_TABLES_READY -ne 1 ]; then 86 | echo -e "The BigQuery tables for $DATE are not available." >&2 87 | 88 | # List table data for debugging 89 | echo $(date) 90 | echo "Desktop root pages ready: ${DESKTOP_ROOT_PAGES_EXIST}" 91 | echo "Desktop non-root pages ready: ${DESKTOP_NON_ROOT_PAGES_EXIST}" 92 | echo "Mobile root pages ready: ${MOBILE_ROOT_PAGES_EXIST}" 93 | echo "Mobile non-root pages ready: ${MOBILE_NON_ROOT_PAGES_EXIST}" 94 | echo "Desktop root requests ready: ${DESKTOP_ROOT_REQUESTS_EXIST}" 95 | echo "Desktop non-root requests ready: ${DESKTOP_NON_ROOT_REQUESTS_EXIST}" 96 | echo "Mobile root requests ready: ${MOBILE_ROOT_REQUESTS_EXIST}" 97 | echo "Mobile non-root requests ready: ${MOBILE_NON_ROOT_REQUESTS_EXIST}" 98 | exit 1 99 | fi 100 | 101 | if [ $GENERATE_HISTOGRAM -eq 0 ]; then 102 | echo -e "Skipping histograms" 103 | else 104 | echo -e "Generating histograms for date $DATE" 105 | 106 | # Run all histogram queries. 107 | for query in sql/histograms/$REPORTS.sql; do 108 | 109 | if [[ ! -f $query ]]; then 110 | echo "Nothing to do" 111 | continue; 112 | fi 113 | 114 | # Extract the metric name from the file path. 115 | # For example, `sql/histograms/foo.sql` will produce `foo`. 116 | metric=$(echo $(basename $query) | cut -d"."
-f1) 117 | 118 | echo -e "Generating $metric histogram" 119 | 120 | if [[ "${LENS_ARG}" == "" ]]; then 121 | LENSES=("") 122 | echo "Generating ${metric} report for base" 123 | elif [[ "${LENS_ARG}" == "ALL" ]]; then 124 | LENSES=("" $(ls sql/lens)) 125 | echo "Generating ${metric} report for base and all lenses" 126 | else 127 | LENSES=("${LENS_ARG}") 128 | echo "Generating ${metric} report for one lens" 129 | fi 130 | 131 | for LENS in "${LENSES[@]}" 132 | do 133 | 134 | gs_lens_dir="" 135 | if [[ $LENS != "" ]]; then 136 | if [ ! -f "sql/lens/$LENS/histograms.sql" ] || [ ! -f "sql/lens/$LENS/timeseries.sql" ]; then 137 | echo -e "Lens histogram/timeseries files not found in sql/lens/$LENS." 138 | exit 1 139 | fi 140 | gs_lens_dir="$LENS/" 141 | fi 142 | 143 | gs_url="gs://httparchive/reports/$gs_lens_dir$YYYY_MM_DD/${metric}.json" 144 | gsutil ls $gs_url &> /dev/null 145 | if [ $? -eq 0 ] && [ $FORCE -eq 0 ]; then 146 | # The file already exists, so skip the query. 147 | echo -e "Skipping $gs_lens_dir$YYYY_MM_DD/$metric histogram as already exists" 148 | continue 149 | fi 150 | 151 | # Replace the date template in the query. 152 | if [[ $LENS != "" ]]; then 153 | echo -e "Generating ${metric} report for $LENS" 154 | lens_clause="$(cat sql/lens/$LENS/histograms.sql)" 155 | lens_clause_and="$(cat sql/lens/$LENS/histograms.sql) AND" 156 | lens_join="" 157 | 158 | if [[ $metric == crux* ]]; then 159 | lens_clause="" 160 | lens_clause_and="" 161 | if [[ -f sql/lens/$LENS/crux_histograms.sql ]]; then 162 | echo "Using alternative crux lens join" 163 | lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | tr '\n' ' ')" 164 | else 165 | echo "CrUX queries do not support histograms for this lens so skipping" 166 | continue 167 | fi 168 | 169 | sql=$(sed -e "s/\(\`chrome-ux-report[^\`]*\`\)/\1 $lens_join/" $query \ 170 | | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ 171 | | sed -e "s/\${YYYYMM}/$YYYYMM/g") 172 | else 173 | 174 | if [[ $(grep -i "WHERE" $query) ]]; then 175 | # If WHERE clause already exists then add to it 176 | sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ 177 | | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ 178 | | sed -e "s/\${YYYYMM}/$YYYYMM/g") 179 | else 180 | # If WHERE clause does not exists then add it, before GROUP BY 181 | sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ 182 | | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ 183 | | sed -e "s/\${YYYYMM}/$YYYYMM/g") 184 | fi 185 | fi 186 | else 187 | echo -e "Generating ${metric} report for base (no lens)" 188 | sql=$(sed -e "s/\${YYYY-MM-DD}/$DATE/g" $query \ 189 | | sed -e "s/\${YYYYMM}/$YYYYMM/g") 190 | fi 191 | 192 | if [ ${VERBOSE} -eq 1 ]; then 193 | echo "Running this query:" 194 | echo "${sql}\n" 195 | fi 196 | 197 | # Run the histogram query on BigQuery. 198 | START_TIME=$SECONDS 199 | result=$(echo "${sql}" | $BQ_CMD) 200 | 201 | # Make sure the query succeeded. 202 | if [ $? -eq 0 ]; then 203 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 204 | if [[ $LENS != "" ]]; then 205 | echo "$metric for $LENS took $ELAPSED_TIME seconds" 206 | else 207 | echo "$metric took $ELAPSED_TIME seconds" 208 | fi 209 | # Upload the response to Google Storage. 210 | echo $result \ 211 | | gsutil -h "Content-Type:application/json" cp - $gs_url 212 | else 213 | echo $result >&2 214 | fi 215 | done 216 | done 217 | fi 218 | 219 | if [ $GENERATE_TIMESERIES -eq 0 ]; then 220 | echo -e "Skipping timeseries" 221 | else 222 | echo -e "Generating timeseries" 223 | 224 | # Run all timeseries queries. 
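# Rough sketch of what the loop below does (the dates in the example are illustrative only):
# each sql/timeseries/*.sql query is run either in full or incrementally. In incremental mode
# a date_join fragment is built from the newest date already in the report, for example
#   date > CAST(REPLACE("2024_05_01","_","-") AS DATE) AND date <= "2024-06-01"
# and spliced into the query's WHERE clause with sed (or added before GROUP BY when the query
# has no WHERE clause). The new rows are then merged into the existing JSON report via
# jq '.+= input' before being uploaded back to Google Storage.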
225 | for query in sql/timeseries/$REPORTS.sql; do 226 | 227 | if [[ ! -f $query ]]; then 228 | echo "Nothing to do" 229 | continue; 230 | fi 231 | 232 | # Extract the metric name from the file path. 233 | metric=$(echo $(basename $query) | cut -d"." -f1) 234 | 235 | if [[ "${LENS_ARG}" == "" ]]; then 236 | LENSES=("") 237 | echo "Generating ${metric} report for base" 238 | elif [[ "${LENS_ARG}" == "ALL" ]]; then 239 | LENSES=("" $(ls sql/lens)) 240 | echo "Generating ${metric} report for base and all lenses" 241 | else 242 | LENSES=("${LENS_ARG}") 243 | echo "Generating ${metric} report for one lens" 244 | fi 245 | 246 | for LENS in "${LENSES[@]}" 247 | do 248 | 249 | gs_lens_dir="" 250 | if [[ $LENS != "" ]]; then 251 | if [ ! -f "sql/lens/$LENS/histograms.sql" ] || [ ! -f "sql/lens/$LENS/timeseries.sql" ]; then 252 | echo -e "Lens histogram/timeseries files not found in sql/lens/$LENS." 253 | exit 1 254 | fi 255 | gs_lens_dir="$LENS/" 256 | fi 257 | 258 | date_join="" 259 | max_date="" 260 | current_contents="" 261 | gs_url="gs://httparchive/reports/$gs_lens_dir${metric}.json" 262 | gsutil ls $gs_url &> /dev/null 263 | if [ $? -eq 0 ]; then 264 | # The file already exists, so check max date 265 | current_contents=$(gsutil cat $gs_url) 266 | max_date=$(echo $current_contents | jq -r '[ .[] | .date ] | max') 267 | if [[ $FORCE -eq 0 && -n "${max_date}" ]]; then 268 | 269 | # Only run if new dates 270 | if [[ -z "${YYYY_MM_DD}" || "${max_date}" < "${YYYY_MM_DD}" ]]; then 271 | if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that 272 | date_join="date > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" 273 | # Skip 2022_05_12 tables 274 | date_join="${date_join}" 275 | if [[ -n "$YYYY_MM_DD" ]]; then 276 | # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) 277 | date_join="${date_join} AND date <= \"$DATE\"" 278 | fi 279 | fi 280 | 281 | echo -e "Generating $gs_lens_dir$metric timeseries in incremental mode from ${max_date} to ${YYYY_MM_DD}" 282 | 283 | else 284 | echo -e "Skipping $gs_lens_dir$metric timeseries as ${YYYY_MM_DD} already exists in the data. Run in force mode (-f) if you want to rerun." 285 | continue 286 | fi 287 | 288 | elif [[ -n "$YYYY_MM_DD" ]]; then 289 | # Even if doing a force run we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data 290 | if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that 291 | # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) 292 | date_join="date <= \"$DATE\"" 293 | # Skip 2022_05_12 tables 294 | date_join="${date_join}" 295 | fi 296 | 297 | echo -e "Force Mode=${FORCE}. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}." 298 | fi 299 | elif [[ -n "$YYYY_MM_DD" ]]; then 300 | # Even if the file does not exist we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data 301 | if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that 302 | date_join="date <= \"$DATE\"" 303 | # Skip 2022_05_12 tables 304 | date_join="${date_join}" 305 | fi 306 | 307 | echo -e "Timeseries does not exist. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}" 308 | 309 | else 310 | echo -e "Timeseries does not exist. 
Generating $gs_lens_dir$metric timeseries from start" 311 | fi 312 | 313 | if [[ $LENS != "" ]]; then 314 | 315 | if [[ $metric != crux* ]]; then 316 | lens_clause="$(cat sql/lens/$LENS/timeseries.sql)" 317 | lens_clause_and="$(cat sql/lens/$LENS/timeseries.sql) AND" 318 | lens_join="" 319 | else 320 | echo "CrUX query so using alternative lens join" 321 | lens_clause="" 322 | lens_clause_and="" 323 | lens_join="$(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')" 324 | fi 325 | 326 | if [[ -n "${date_join}" ]]; then 327 | if [[ $(grep -i "WHERE" $query) ]]; then 328 | # If WHERE clause already exists then add to it 329 | sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \ 330 | | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") 331 | else 332 | # If WHERE clause does not exists then add it, before GROUP BY 333 | sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause_and $date_join \1/" $query \ 334 | | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") 335 | fi 336 | else 337 | if [[ $(grep -i "WHERE" $query) ]]; then 338 | # If WHERE clause already exists then add to it 339 | sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ 340 | | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") 341 | else 342 | # If WHERE clause does not exists then add it, before GROUP BY 343 | sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ 344 | | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") 345 | fi 346 | fi 347 | 348 | else 349 | if [[ -n "${date_join}" ]]; then 350 | if [[ $(grep -i "WHERE" $query) ]]; then 351 | # If WHERE clause already exists then add to it 352 | sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND /" $query) 353 | else 354 | # If WHERE clause does not exists then add it, before GROUP BY 355 | sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query) 356 | fi 357 | else 358 | sql=$(cat $query) 359 | fi 360 | fi 361 | 362 | if [ ${VERBOSE} -eq 1 ]; then 363 | echo "Running this query:" 364 | echo "${sql}\n" 365 | fi 366 | 367 | # Run the timeseries query on BigQuery. 368 | START_TIME=$SECONDS 369 | result=$(echo "${sql}" | $BQ_CMD) 370 | 371 | # Make sure the query succeeded. 372 | if [ $? -eq 0 ]; then 373 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 374 | if [[ $LENS != "" ]]; then 375 | echo "$metric for $LENS took $ELAPSED_TIME seconds" 376 | else 377 | echo "$metric took $ELAPSED_TIME seconds" 378 | fi 379 | 380 | # If it is a partial run, then combine with the current results. 381 | if [[ $FORCE -eq 0 && -n "${current_contents}" && $metric != crux* ]]; then 382 | result=$(echo ${result} ${current_contents} | jq '.+= input') 383 | fi 384 | 385 | # Upload the response to Google Storage. 386 | echo $result \ 387 | | gsutil -h "Content-Type:application/json" cp - $gs_url 388 | else 389 | echo $result >&2 390 | fi 391 | done 392 | done 393 | fi 394 | 395 | echo -e "Done" 396 | -------------------------------------------------------------------------------- /sql/get_bigquery_dates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Gets a list of dates for a given BigQuery table. 4 | # 5 | # Example usage: 6 | # 7 | # sql/getBigQueryDates.sh har lighthouse 8 | # 9 | # Where the first argument is the dataset and the 10 | # second argument is the table suffix. 11 | # 12 | # Example output: 13 | # 14 | # 2017_08_15 15 | # 2017_08_01 16 | # 2017_07_15 17 | # 2017_07_01 18 | # 2017_06_15 19 | # 2017_06_01 20 | # 21 | # May be combined with the generateReports.sh script 22 | # to generate a histogram for each date. 
For example: 23 | # 24 | # sql/get_bigquery_dates.sh runs pages | \ 25 | # xargs -I date sql/generate_reports.sh -h date 26 | # 27 | 28 | set -eo pipefail 29 | 30 | DATASET=$1 31 | SUFFIX=$2 32 | MIN=$3 33 | MAX=$4 34 | 35 | if [ -z "$DATASET" ]; then 36 | echo "Dataset argument required." >&2 37 | echo "Example usage: sql/getBigQueryDates.sh har lighthouse" >&2 38 | exit 1 39 | fi 40 | 41 | having="" 42 | if [ ! -z "$MIN" ] || [ ! -z "$MAX" ]; then 43 | having="HAVING 44 | " 45 | if [ ! -z "$MIN" ]; then 46 | having+=" date >= \"$MIN\"" 47 | if [ ! -z "$MAX" ]; then 48 | having+=" AND 49 | " 50 | fi 51 | fi 52 | if [ ! -z "$MAX" ]; then 53 | having+=" date <= \"$MAX\"" 54 | fi 55 | having+=" 56 | " 57 | fi 58 | 59 | query=$(cat <= 0 25 | ) 26 | ) 27 | ORDER BY 28 | bin, 29 | client 30 | -------------------------------------------------------------------------------- /sql/histograms/cruxCls.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) 3 | RETURNS ARRAY> 4 | LANGUAGE js AS """ 5 | // Convert into 0.01 bins and spread the density around. 6 | const WIDTH = 0.01; 7 | return (bins || []).reduce((bins, bin) => { 8 | bin.start = +bin.start; 9 | bin.end = Math.min(bin.end, bin.start + 10); 10 | const binWidth = bin.end - bin.start; 11 | for (let start = bin.start; start < bin.end; start += WIDTH) { 12 | bins.push({ 13 | start, 14 | density: bin.density / (binWidth / WIDTH) 15 | }); 16 | } 17 | return bins; 18 | }, []); 19 | """; 20 | 21 | SELECT 22 | *, 23 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 24 | FROM ( 25 | SELECT 26 | *, 27 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 28 | FROM ( 29 | SELECT 30 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 31 | bin.start AS bin, 32 | SUM(bin.density) AS volume 33 | FROM ( 34 | SELECT 35 | form_factor, 36 | spreadBins(layout_instability.cumulative_layout_shift.histogram.bin) AS bins 37 | FROM 38 | `chrome-ux-report.all.${YYYYMM}` 39 | ) 40 | CROSS JOIN 41 | UNNEST(bins) AS bin 42 | GROUP BY 43 | bin, 44 | client 45 | ) 46 | ) 47 | ORDER BY 48 | bin, 49 | client 50 | -------------------------------------------------------------------------------- /sql/histograms/cruxDcl.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) 3 | RETURNS ARRAY> 4 | LANGUAGE js AS """ 5 | // Convert into 100ms bins and spread the density around. 
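// Worked example (the numbers are illustrative, not from real CrUX data): a bin
// {start: 1000, end: 3000, density: 0.12} is 2000ms wide, so it is split into
// 2000 / 100 = 20 sub-bins of 100ms, each given density 0.12 / 20 = 0.006.
// Open-ended tail bins are first capped at start + 5000ms by the Math.min below.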
6 | const WIDTH = 100; 7 | return (bins || []).reduce((bins, bin) => { 8 | bin.start = +bin.start; 9 | bin.end = Math.min(bin.end, bin.start + 5000); 10 | const binWidth = bin.end - bin.start; 11 | for (let start = bin.start; start < bin.end; start += WIDTH) { 12 | bins.push({ 13 | start, 14 | density: bin.density / (binWidth / WIDTH) 15 | }); 16 | } 17 | return bins; 18 | }, []); 19 | """; 20 | 21 | SELECT 22 | *, 23 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 24 | FROM ( 25 | SELECT 26 | *, 27 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 28 | FROM ( 29 | SELECT 30 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 31 | bin.start / 1000 AS bin, 32 | SUM(bin.density) AS volume 33 | FROM ( 34 | SELECT 35 | form_factor, 36 | spreadBins(dom_content_loaded.histogram.bin) AS bins 37 | FROM 38 | `chrome-ux-report.all.${YYYYMM}` 39 | ) 40 | CROSS JOIN 41 | UNNEST(bins) AS bin 42 | GROUP BY 43 | bin, 44 | client 45 | ) 46 | ) 47 | ORDER BY 48 | bin, 49 | client 50 | -------------------------------------------------------------------------------- /sql/histograms/cruxFcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) 3 | RETURNS ARRAY> 4 | LANGUAGE js AS """ 5 | // Convert into 100ms bins and spread the density around. 6 | const WIDTH = 100; 7 | return (bins || []).reduce((bins, bin) => { 8 | bin.start = +bin.start; 9 | bin.end = Math.min(bin.end, bin.start + 5000); 10 | const binWidth = bin.end - bin.start; 11 | for (let start = bin.start; start < bin.end; start += WIDTH) { 12 | bins.push({ 13 | start, 14 | density: bin.density / (binWidth / WIDTH) 15 | }); 16 | } 17 | return bins; 18 | }, []); 19 | """; 20 | 21 | SELECT 22 | *, 23 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 24 | FROM ( 25 | SELECT 26 | *, 27 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 28 | FROM ( 29 | SELECT 30 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 31 | bin.start / 1000 AS bin, 32 | SUM(bin.density) AS volume 33 | FROM ( 34 | SELECT 35 | form_factor, 36 | spreadBins(first_contentful_paint.histogram.bin) AS bins 37 | FROM 38 | `chrome-ux-report.all.${YYYYMM}` 39 | ) 40 | CROSS JOIN 41 | UNNEST(bins) AS bin 42 | GROUP BY 43 | bin, 44 | client 45 | ) 46 | ) 47 | ORDER BY 48 | bin, 49 | client 50 | -------------------------------------------------------------------------------- /sql/histograms/cruxFp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) 3 | RETURNS ARRAY> 4 | LANGUAGE js AS """ 5 | // Convert into 100ms bins and spread the density around. 
6 | const WIDTH = 100; 7 | return (bins || []).reduce((bins, bin) => { 8 | bin.start = +bin.start; 9 | bin.end = Math.min(bin.end, bin.start + 5000); 10 | const binWidth = bin.end - bin.start; 11 | for (let start = bin.start; start < bin.end; start += WIDTH) { 12 | bins.push({ 13 | start, 14 | density: bin.density / (binWidth / WIDTH) 15 | }); 16 | } 17 | return bins; 18 | }, []); 19 | """; 20 | 21 | SELECT 22 | *, 23 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 24 | FROM ( 25 | SELECT 26 | *, 27 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 28 | FROM ( 29 | SELECT 30 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 31 | bin.start / 1000 AS bin, 32 | SUM(bin.density) AS volume 33 | FROM ( 34 | SELECT 35 | form_factor, 36 | spreadBins(first_paint.histogram.bin) AS bins 37 | FROM 38 | `chrome-ux-report.all.${YYYYMM}` 39 | ) 40 | CROSS JOIN 41 | UNNEST(bins) AS bin 42 | GROUP BY 43 | bin, 44 | client 45 | ) 46 | ) 47 | ORDER BY 48 | bin, 49 | client 50 | -------------------------------------------------------------------------------- /sql/histograms/cruxInp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 12 | bin.start AS bin, 13 | SUM(bin.density) AS volume 14 | FROM ( 15 | SELECT 16 | form_factor, 17 | interaction_to_next_paint.histogram.bin AS bins 18 | FROM 19 | `chrome-ux-report.all.${YYYYMM}` 20 | ) 21 | CROSS JOIN 22 | UNNEST(bins) AS bin 23 | GROUP BY 24 | bin, 25 | client 26 | ) 27 | ) 28 | ORDER BY 29 | bin, 30 | client 31 | -------------------------------------------------------------------------------- /sql/histograms/cruxLcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) 3 | RETURNS ARRAY> 4 | LANGUAGE js AS """ 5 | // Convert into 100ms bins and spread the density around. 
6 | const WIDTH = 100; 7 | return (bins || []).reduce((bins, bin) => { 8 | bin.start = +bin.start; 9 | bin.end = Math.min(bin.end, bin.start + 5000); 10 | const binWidth = bin.end - bin.start; 11 | for (let start = bin.start; start < bin.end; start += WIDTH) { 12 | bins.push({ 13 | start, 14 | density: bin.density / (binWidth / WIDTH) 15 | }); 16 | } 17 | return bins; 18 | }, []); 19 | """; 20 | 21 | SELECT 22 | *, 23 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 24 | FROM ( 25 | SELECT 26 | *, 27 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 28 | FROM ( 29 | SELECT 30 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 31 | bin.start / 1000 AS bin, 32 | SUM(bin.density) AS volume 33 | FROM ( 34 | SELECT 35 | form_factor, 36 | spreadBins(largest_contentful_paint.histogram.bin) AS bins 37 | FROM 38 | `chrome-ux-report.all.${YYYYMM}` 39 | ) 40 | CROSS JOIN 41 | UNNEST(bins) AS bin 42 | GROUP BY 43 | bin, 44 | client 45 | ) 46 | ) 47 | ORDER BY 48 | bin, 49 | client 50 | -------------------------------------------------------------------------------- /sql/histograms/cruxOl.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) 3 | RETURNS ARRAY> 4 | LANGUAGE js AS """ 5 | // Convert into 100ms bins and spread the density around. 6 | const WIDTH = 100; 7 | return (bins || []).reduce((bins, bin) => { 8 | bin.start = +bin.start; 9 | bin.end = Math.min(bin.end, bin.start + 5000); 10 | const binWidth = bin.end - bin.start; 11 | for (let start = bin.start; start < bin.end; start += WIDTH) { 12 | bins.push({ 13 | start, 14 | density: bin.density / (binWidth / WIDTH) 15 | }); 16 | } 17 | return bins; 18 | }, []); 19 | """; 20 | 21 | SELECT 22 | *, 23 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 24 | FROM ( 25 | SELECT 26 | *, 27 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 28 | FROM ( 29 | SELECT 30 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 31 | bin.start / 1000 AS bin, 32 | SUM(bin.density) AS volume 33 | FROM ( 34 | SELECT 35 | form_factor, 36 | spreadBins(onload.histogram.bin) AS bins 37 | FROM 38 | `chrome-ux-report.all.${YYYYMM}` 39 | ) 40 | CROSS JOIN 41 | UNNEST(bins) AS bin 42 | GROUP BY 43 | bin, 44 | client 45 | ) 46 | ) 47 | ORDER BY 48 | bin, 49 | client 50 | -------------------------------------------------------------------------------- /sql/histograms/cruxShopifyThemes.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- Core web vitals by Shopify theme 3 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 4 | good / (good + needs_improvement + poor) >= 0.75 5 | ); 6 | 7 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 8 | poor / (good + needs_improvement + poor) > 0.25 9 | ); 10 | 11 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 12 | good + needs_improvement + poor > 0 13 | ); 14 | 15 | -- Test CrUX data exists 16 | WITH crux_test AS ( -- noqa: ST03 17 | SELECT 18 | 1 19 | FROM 20 | `chrome-ux-report.all.${YYYYMM}` 21 | ), 22 | 23 | -- All Shopify shops in HTTPArchive 24 | archive_pages AS ( 25 | SELECT 26 | client, 27 | page AS url, 28 | JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name, 29 | 
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id 30 | FROM 31 | `httparchive.crawl.pages` 32 | WHERE 33 | date = '${YYYY-MM-DD}' AND 34 | is_root_page AND 35 | JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share 36 | ) 37 | 38 | SELECT 39 | client, 40 | archive_pages.theme_store_id AS id, 41 | theme_names.theme_name AS top_theme_name, 42 | COUNT(DISTINCT origin) AS origins, 43 | -- Origins with good LCP divided by origins with any LCP. 44 | SAFE_DIVIDE( 45 | COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), 46 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)) 47 | ) AS pct_good_lcp, 48 | -- Origins with needs improvement are anything not good, nor poor. 49 | 1 - 50 | SAFE_DIVIDE( 51 | COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), 52 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)) 53 | ) 54 | - 55 | SAFE_DIVIDE( 56 | COUNT(DISTINCT IF(IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), 57 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) 58 | AS pct_ni_lcp, 59 | -- Origins with poor LCP divided by origins with any LCP. 60 | SAFE_DIVIDE( 61 | COUNT(DISTINCT IF(IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), 62 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)) 63 | ) AS pct_poor_lcp, 64 | 65 | -- Origins with good TTFB divided by origins with any TTFB. 66 | SAFE_DIVIDE( 67 | COUNT(DISTINCT IF(IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)), 68 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)) 69 | ) AS pct_good_ttfb, 70 | -- Origins with needs improvement are anything not good, nor poor. 71 | 1 - 72 | SAFE_DIVIDE( 73 | COUNT(DISTINCT IF(IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)), 74 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)) 75 | ) 76 | - 77 | SAFE_DIVIDE( 78 | COUNT(DISTINCT IF(IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)), 79 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))) 80 | AS pct_ni_ttfb, 81 | -- Origins with poor TTFB divided by origins with any TTFB. 82 | SAFE_DIVIDE( 83 | COUNT(DISTINCT IF(IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)), 84 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)) 85 | ) AS pct_poor_ttfb, 86 | 87 | -- Origins with good FCP divided by origins with any FCP. 88 | SAFE_DIVIDE( 89 | COUNT(DISTINCT IF(IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)), 90 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL)) 91 | ) AS pct_good_fcp, 92 | -- Origins with needs improvement are anything not good, nor poor. 93 | 1 - 94 | SAFE_DIVIDE( 95 | COUNT(DISTINCT IF(IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)), 96 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL)) 97 | ) 98 | - 99 | SAFE_DIVIDE( 100 | COUNT(DISTINCT IF(IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)), 101 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))) 102 | AS pct_ni_fcp, 103 | -- Origins with poor FCP divided by origins with any FCP. 
104 | SAFE_DIVIDE( 105 | COUNT(DISTINCT IF(IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)), 106 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL)) 107 | ) AS pct_poor_fcp, 108 | 109 | -- Origins with good INP divided by origins with any INP. 110 | SAFE_DIVIDE( 111 | COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)), 112 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL)) 113 | ) AS pct_good_inp, 114 | -- Origins with needs improvement are anything not good, nor poor. 115 | 1 - 116 | SAFE_DIVIDE( 117 | COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)), 118 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL)) 119 | ) 120 | - 121 | SAFE_DIVIDE( 122 | COUNT(DISTINCT IF(IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)), 123 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))) 124 | AS pct_ni_inp, 125 | -- Origins with poor INP divided by origins with any INP. 126 | SAFE_DIVIDE( 127 | COUNT(DISTINCT IF(IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)), 128 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL)) 129 | ) AS pct_poor_inp, 130 | 131 | -- Origins with good CLS divided by origins with any CLS. 132 | SAFE_DIVIDE( 133 | COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), 134 | COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)) 135 | ) AS pct_good_cls, 136 | -- Origins with needs improvement are anything not good, nor poor. 137 | 1 - 138 | SAFE_DIVIDE( 139 | COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), 140 | COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)) 141 | ) 142 | - 143 | SAFE_DIVIDE( 144 | COUNT(DISTINCT IF(IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)), 145 | COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) 146 | AS pct_ni_cls, 147 | -- Origins with poor CLS divided by origins with any CLS. 148 | SAFE_DIVIDE( 149 | COUNT(DISTINCT IF(IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)), 150 | COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)) 151 | ) AS pct_poor_cls, 152 | 153 | -- Origins with good LCP, INP (optional), and CLS divided by origins with any LCP and CLS. 
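  -- Illustrative example (numbers are made up): with IS_GOOD defined above as
  -- good / (good + needs_improvement + poor) >= 0.75, an origin with
  -- fast_lcp = 780, avg_lcp = 150 and slow_lcp = 70 scores 780 / 1000 = 0.78 and
  -- so counts as good for LCP. INP is treated as passing when it is good or not
  -- measured (IS NOT FALSE), which is why it is described as optional above.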
154 | SAFE_DIVIDE( 155 | COUNT(DISTINCT IF( 156 | IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND 157 | IS_GOOD(fast_inp, avg_inp, slow_inp) IS NOT FALSE AND 158 | IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL 159 | )), 160 | COUNT(DISTINCT IF( 161 | IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND 162 | IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL 163 | )) 164 | ) AS pct_good_cwv 165 | FROM 166 | `chrome-ux-report.materialized.device_summary` 167 | JOIN archive_pages 168 | ON 169 | CONCAT(origin, '/') = url AND 170 | IF(device = 'desktop', 'desktop', 'mobile') = client 171 | JOIN ( 172 | -- Add in top theme name for a theme store id AS this should usually be the real theme name 173 | SELECT 174 | COUNT(DISTINCT url) AS pages_count, 175 | theme_store_id, 176 | theme_name, 177 | row_number() OVER (PARTITION BY theme_store_id ORDER BY COUNT(DISTINCT url) DESC) AS rank 178 | FROM archive_pages 179 | GROUP BY 180 | theme_store_id, 181 | theme_name 182 | ORDER BY COUNT(DISTINCT url) DESC 183 | ) theme_names 184 | -- Include null theme store ids so that we can get full market share within CrUX 185 | ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A') 186 | WHERE 187 | date = '${YYYY-MM-DD}' AND 188 | theme_names.rank = 1 189 | GROUP BY 190 | client, 191 | id, 192 | top_theme_name 193 | ORDER BY 194 | origins DESC 195 | -------------------------------------------------------------------------------- /sql/histograms/cruxTtfb.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, 12 | bin.start AS bin, 13 | SUM(bin.density) AS volume 14 | FROM ( 15 | SELECT 16 | form_factor, 17 | experimental.time_to_first_byte.histogram.bin AS bins 18 | FROM 19 | `chrome-ux-report.all.${YYYYMM}` 20 | ) 21 | CROSS JOIN 22 | UNNEST(bins) AS bin 23 | GROUP BY 24 | bin, 25 | client 26 | ) 27 | ) 28 | ORDER BY 29 | bin, 30 | client 31 | -------------------------------------------------------------------------------- /sql/histograms/dcl.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page AND 19 | FLOAT64(summary.onContentLoaded) > 0 20 | GROUP BY 21 | bin, 22 | client 23 | ) 24 | ) 25 | ORDER BY 26 | bin, 27 | client 28 | -------------------------------------------------------------------------------- /sql/histograms/evalJs.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin 14 | FROM 15 | `httparchive.crawl.requests` r 16 | INNER JOIN 17 | `httparchive.crawl.pages` 18 | USING (date, 
client, is_root_page, rank, page) 19 | WHERE 20 | date = '${YYYY-MM-DD}' AND 21 | is_root_page 22 | GROUP BY 23 | bin, 24 | client 25 | HAVING 26 | bin IS NOT NULL AND 27 | bin >= 0 28 | ) 29 | ) 30 | ORDER BY 31 | bin, 32 | client 33 | -------------------------------------------------------------------------------- /sql/histograms/fcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | HAVING 23 | bin IS NOT NULL AND 24 | bin >= 0 25 | ) 26 | ) 27 | ORDER BY 28 | bin, 29 | client 30 | -------------------------------------------------------------------------------- /sql/histograms/gzipSavings.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | HAVING 23 | bin IS NOT NULL 24 | ) 25 | ) 26 | ORDER BY 27 | bin, 28 | client 29 | -------------------------------------------------------------------------------- /sql/histograms/htmlElementPopularity.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | CREATE TEMPORARY FUNCTION getElements(payload STRING) 3 | RETURNS ARRAY LANGUAGE js AS ''' 4 | try { 5 | var elements = JSON.parse(payload); 6 | if (Array.isArray(elements) || typeof elements != 'object') return []; 7 | return Object.keys(elements); 8 | } catch (e) { 9 | return []; 10 | } 11 | '''; 12 | 13 | SELECT 14 | client, 15 | element, 16 | COUNT(DISTINCT root_page) AS pages, 17 | total, 18 | COUNT(DISTINCT root_page) / total AS pct, 19 | ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls 20 | FROM 21 | `httparchive.crawl.pages` 22 | JOIN 23 | ( 24 | SELECT 25 | date, 26 | client, 27 | COUNT(DISTINCT root_page) AS total 28 | FROM 29 | `httparchive.crawl.pages` 30 | WHERE 31 | date = '${YYYY-MM-DD}' 32 | GROUP BY 33 | date, 34 | client 35 | ) 36 | USING (date, client), 37 | UNNEST(getElements(TO_JSON_STRING(custom_metrics.element_count))) AS element 38 | WHERE 39 | date = '${YYYY-MM-DD}' 40 | GROUP BY 41 | client, 42 | total, 43 | element 44 | HAVING 45 | COUNT(DISTINCT root_page) >= 10 46 | ORDER BY 47 | pages / total DESC, 48 | client 49 | -------------------------------------------------------------------------------- /sql/histograms/imgSavings.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(FLOAT64(payload._image_savings) / 
(1024 * 10)) * 10 AS INT64) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | HAVING 23 | bin IS NOT NULL 24 | ) 25 | ) 26 | ORDER BY 27 | bin, 28 | client 29 | -------------------------------------------------------------------------------- /sql/histograms/offscreenImages.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(IFNULL( 14 | INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), 15 | INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024 16 | ) / 10240) * 10 AS INT64) AS bin 17 | FROM 18 | `httparchive.crawl.pages` 19 | WHERE 20 | date >= '2022-03-01' AND 21 | date = '${YYYY-MM-DD}' AND 22 | is_root_page 23 | GROUP BY 24 | bin, 25 | client 26 | HAVING 27 | bin IS NOT NULL 28 | ) 29 | ) 30 | ORDER BY 31 | bin, 32 | client 33 | -------------------------------------------------------------------------------- /sql/histograms/ol.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOOR(FLOAT64(summary.onLoad) / 1000) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page AND 19 | FLOAT64(summary.onLoad) > 0 20 | GROUP BY 21 | bin, 22 | client 23 | ) 24 | ) 25 | ORDER BY 26 | bin, 27 | client 28 | -------------------------------------------------------------------------------- /sql/histograms/optimizedImages.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(IFNULL( 14 | INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), 15 | INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024 16 | ) / 10240) * 10 AS INT64) AS bin 17 | FROM 18 | `httparchive.crawl.pages` 19 | WHERE 20 | date >= '2022-03-01' AND 21 | date = '${YYYY-MM-DD}' AND 22 | is_root_page 23 | GROUP BY 24 | bin, 25 | client 26 | HAVING 27 | bin IS NOT NULL 28 | ) 29 | ) 30 | ORDER BY 31 | bin, 32 | client 33 | -------------------------------------------------------------------------------- /sql/histograms/reqCss.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqCss) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | 
-------------------------------------------------------------------------------- /sql/histograms/reqFont.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqFont) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/reqHtml.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqHtml) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/reqImg.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqImg) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/reqJs.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqJS) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/reqOther.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqOther) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/reqTotal.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | 
SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOOR(FLOAT64(summary.reqTotal) / 10) * 10 AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/reqVideo.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | FLOAT64(summary.reqVideo) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | ) 23 | ) 24 | ORDER BY 25 | bin, 26 | client 27 | -------------------------------------------------------------------------------- /sql/histograms/speedIndex.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(FLOAT64(payload._SpeedIndex) / (1000)) * 1000 AS INT64) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page 19 | GROUP BY 20 | bin, 21 | client 22 | HAVING 23 | bin IS NOT NULL 24 | ) 25 | ) 26 | ORDER BY 27 | bin, 28 | client 29 | -------------------------------------------------------------------------------- /sql/histograms/tcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | INT64(summary._connections) AS bin 14 | FROM 15 | `httparchive.crawl.pages` 16 | WHERE 17 | date = '${YYYY-MM-DD}' AND 18 | is_root_page AND 19 | INT64(summary._connections) > 0 20 | GROUP BY 21 | bin, 22 | client 23 | ) 24 | ) 25 | ORDER BY 26 | bin, 27 | client 28 | -------------------------------------------------------------------------------- /sql/histograms/ttci.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | *, 4 | SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf 5 | FROM ( 6 | SELECT 7 | *, 8 | volume / SUM(volume) OVER (PARTITION BY client) AS pdf 9 | FROM ( 10 | SELECT 11 | client, 12 | COUNT(0) AS volume, 13 | CAST(FLOOR(CAST(IFNULL( 14 | FLOAT64(lighthouse.audits.interactive.numericValue), 15 | IFNULL( 16 | FLOAT64(lighthouse.audits['consistently-interactive'].rawValue), 17 | FLOAT64(lighthouse.audits.interactive.rawValue) 18 | ) 19 | ) AS FLOAT64) / 1000) AS INT64) AS bin 20 | FROM 21 | `httparchive.crawl.pages` 22 | WHERE 23 | date >= '2022-03-01' AND 24 | date = '${YYYY-MM-DD}' AND 25 | is_root_page 26 | GROUP BY 27 | bin, 28 | client 29 | HAVING 30 | bin IS NOT NULL 31 | ) 32 | ) 33 | ORDER BY 34 | bin, 35 | client 36 | 
-------------------------------------------------------------------------------- /sql/lens/drupal/crux_histograms.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | page, 4 | client 5 | FROM 6 | `httparchive.crawl.pages` 7 | WHERE 8 | date = '${YYYY-MM-DD}' AND 9 | 'Drupal' IN UNNEST(technologies.technology) 10 | GROUP BY 11 | 1, 12 | 2 13 | ) 14 | ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) 15 | -------------------------------------------------------------------------------- /sql/lens/drupal/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | 'Drupal' IN UNNEST(technologies.technology) 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/drupal/histograms.sql: -------------------------------------------------------------------------------- 1 | 'Drupal' IN UNNEST(technologies.technology) 2 | -------------------------------------------------------------------------------- /sql/lens/drupal/timeseries.sql: -------------------------------------------------------------------------------- 1 | 'Drupal' IN UNNEST(technologies.technology) 2 | -------------------------------------------------------------------------------- /sql/lens/magento/crux_histograms.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | page, 4 | client 5 | FROM 6 | `httparchive.crawl.pages` 7 | WHERE 8 | date = '${YYYY-MM-DD}' AND 9 | 'Magento' IN UNNEST(technologies.technology) 10 | GROUP BY 11 | 1, 12 | 2 13 | ) 14 | ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) 15 | -------------------------------------------------------------------------------- /sql/lens/magento/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | 'Magento' IN UNNEST(technologies.technology) 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/magento/histograms.sql: -------------------------------------------------------------------------------- 1 | 'Magento' IN UNNEST(technologies.technology) 2 | -------------------------------------------------------------------------------- /sql/lens/magento/timeseries.sql: -------------------------------------------------------------------------------- 1 | 'Magento' IN UNNEST(technologies.technology) 2 | -------------------------------------------------------------------------------- /sql/lens/top100k/crux_histograms.sql: -------------------------------------------------------------------------------- 1 | WHERE 2 | experimental.popularity.rank <= 100000 3 | -------------------------------------------------------------------------------- 
/sql/lens/top100k/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | rank = 100000 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/top100k/histograms.sql: -------------------------------------------------------------------------------- 1 | rank <= 100000 2 | -------------------------------------------------------------------------------- /sql/lens/top100k/timeseries.sql: -------------------------------------------------------------------------------- 1 | rank <= 100000 2 | -------------------------------------------------------------------------------- /sql/lens/top10k/crux_histograms.sql: -------------------------------------------------------------------------------- 1 | WHERE 2 | experimental.popularity.rank <= 10000 3 | -------------------------------------------------------------------------------- /sql/lens/top10k/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | rank = 10000 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/top10k/histograms.sql: -------------------------------------------------------------------------------- 1 | rank <= 10000 2 | -------------------------------------------------------------------------------- /sql/lens/top10k/timeseries.sql: -------------------------------------------------------------------------------- 1 | rank <= 10000 2 | -------------------------------------------------------------------------------- /sql/lens/top1k/crux_histograms.sql: -------------------------------------------------------------------------------- 1 | WHERE 2 | experimental.popularity.rank <= 1000 3 | -------------------------------------------------------------------------------- /sql/lens/top1k/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | rank = 1000 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/top1k/histograms.sql: -------------------------------------------------------------------------------- 1 | rank <= 1000 2 | -------------------------------------------------------------------------------- /sql/lens/top1k/timeseries.sql: -------------------------------------------------------------------------------- 1 | rank <= 1000 2 | -------------------------------------------------------------------------------- /sql/lens/top1m/crux_histograms.sql: -------------------------------------------------------------------------------- 1 
| WHERE 2 | experimental.popularity.rank <= 1000000 3 | -------------------------------------------------------------------------------- /sql/lens/top1m/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | rank = 1000000 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/top1m/histograms.sql: -------------------------------------------------------------------------------- 1 | rank <= 1000000 2 | -------------------------------------------------------------------------------- /sql/lens/top1m/timeseries.sql: -------------------------------------------------------------------------------- 1 | rank <= 1000000 2 | -------------------------------------------------------------------------------- /sql/lens/wordpress/crux_histograms.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | page, 4 | client 5 | FROM 6 | `httparchive.crawl.pages` 7 | WHERE 8 | date = '${YYYY-MM-DD}' AND 9 | 'WordPress' IN UNNEST(technologies.technology) 10 | GROUP BY 11 | 1, 12 | 2 13 | ) 14 | ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) 15 | -------------------------------------------------------------------------------- /sql/lens/wordpress/crux_timeseries.sql: -------------------------------------------------------------------------------- 1 | INNER JOIN ( 2 | SELECT 3 | SUBSTR(page, 0, LENGTH(page) - 1) AS origin, 4 | IF(client = 'mobile', 'phone', client) AS device, 5 | date 6 | FROM 7 | `httparchive.crawl.pages` 8 | WHERE 9 | date >= '2010-11-15' AND 10 | is_root_page AND 11 | 'WordPress' IN UNNEST(technologies.technology) 12 | GROUP BY 13 | 1, 14 | 2, 15 | 3 16 | ) 17 | USING (origin, device, date) 18 | -------------------------------------------------------------------------------- /sql/lens/wordpress/histograms.sql: -------------------------------------------------------------------------------- 1 | 'WordPress' IN UNNEST(technologies.technology) 2 | -------------------------------------------------------------------------------- /sql/lens/wordpress/timeseries.sql: -------------------------------------------------------------------------------- 1 | 'WordPress' IN UNNEST(technologies.technology) 2 | -------------------------------------------------------------------------------- /sql/new_metric.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Initializes reports for a newly added metric. 4 | # 5 | # Example usage: 6 | # 7 | # sql/new_metric.sh histograms bootupJs lighthouse 8 | # 9 | # Where the first argument is the chart type, 10 | # the second argument is the metric name, 11 | # and the third argument is the BQ dataset. 12 | 13 | set -eo pipefail 14 | 15 | VIZ=$1 16 | METRIC=$2 17 | DATASET=$3 18 | 19 | if [ -z "$VIZ" ]; then 20 | echo "Chart type argument required." >&2 21 | echo "Example usage: sql/new_metric.sh histograms bootupJs lighthouse" >&2 22 | exit 1 23 | fi 24 | 25 | if [ -z "$METRIC" ]; then 26 | echo "Metric argument required."
>&2 27 | echo "Example usage: sql/new_metric.sh histograms bootupJs lighthouse" >&2 28 | exit 1 29 | fi 30 | 31 | if [ -z "$DATASET" ]; then 32 | echo "Dataset argument required." >&2 33 | echo "Example usage: sql/new_metric.sh histograms bootupJs lighthouse" >&2 34 | exit 1 35 | fi 36 | 37 | if [ "$VIZ" == "histograms" ]; then 38 | cmd='sql/get_bigquery_dates.sh "$DATASET" "" | xargs -I date sql/generate_reports.sh -d date/"$METRIC".json' 39 | fi 40 | if [ "$VIZ" == "timeseries" ]; then 41 | cmd='sql/generate_reports.sh -d "$METRIC".json' 42 | fi 43 | 44 | eval $cmd 45 | 46 | lenses=$(ls -1 sql/lens) 47 | for lens in $lenses; do 48 | lens_cmd="$cmd -l $lens" 49 | eval "$lens_cmd" 50 | done 51 | -------------------------------------------------------------------------------- /sql/timeseries/a11yButtonName.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | /* Should really use the following to only include eligible sites. */ 11 | /* LAX_STRING(lighthouse.audits['button-name'].score) IS NOT NULL AND */ 12 | lighthouse IS NOT NULL AND 13 | TO_JSON_STRING(lighthouse) != '{}' AND 14 | is_root_page AND 15 | date >= '2017-06-01' 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/a11yColorContrast.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['color-contrast'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | /* Should really use the following to only include eligible sites. */ 11 | /* LAX_STRING(lighthouse.audits['color-contrast'].score) IS NOT NULL AND */ 12 | lighthouse IS NOT NULL AND 13 | TO_JSON_STRING(lighthouse) != '{}' AND 14 | date >= '2017-06-01' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/a11yImageAlt.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | /* Should really use the following to only include eligible sites.
*/ 11 | /* LAX_STRING(lighthouse.audits['image-alt'].score) IS NOT NULL AND */ 12 | lighthouse IS NOT NULL AND 13 | TO_JSON_STRING(lighthouse) != '{}' AND 14 | date >= '2017-06-01' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/a11yLabel.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits.label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | /* Should really use the following to only include eligible sites. */ 11 | /* LAX_STRING(lighthouse.audits.label.score) IS NOT NULL AND */ 12 | lighthouse IS NOT NULL AND 13 | TO_JSON_STRING(lighthouse) != '{}' AND 14 | date >= '2017-06-01' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/a11yLinkName.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | /* Should really use the following to only include eligible sites. */ 11 | /* LAX_STRING(lighthouse.audits['link-name'].score) IS NOT NULL AND */ 12 | lighthouse IS NOT NULL AND 13 | TO_JSON_STRING(lighthouse) != '{}' AND 14 | date >= '2017-06-01' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/a11yScores.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Lighthouse changed format of scores in v3.0.0 released in July 2018 so handle old with a UDF 3 | CREATE TEMPORARY FUNCTION getA11yScore(reportCategories JSON) 4 | RETURNS FLOAT64 DETERMINISTIC 5 | LANGUAGE js AS """ 6 | if(reportCategories) { 7 | return reportCategories.find(i => i.name === 'Accessibility').score; 8 | } 9 | """; 10 | 11 | SELECT 12 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 13 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 14 | client, 15 | ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10, 16 | ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25, 17 | ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(500)], 2) AS p50, 18 | ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(750)], 2) AS p75, 19 | ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90 20 | FROM ( 21 | SELECT 22 | date, 23 | client, 24 | IFNULL(LAX_FLOAT64(lighthouse.categories.accessibility.score) * 100, getA11yScore(lighthouse.reportCategories)) AS score 25 | FROM 26 | `httparchive.crawl.pages` 27 | WHERE 28 | lighthouse IS NOT NULL AND 29 | TO_JSON_STRING(lighthouse) != '{}' AND 30 | date >= '2017-06-01' AND 31 | is_root_page 32 | ) 33 | GROUP BY 34 | date, 35 | timestamp, 36 | client 37 | ORDER BY 38 | date DESC, 39 | client 40 | 
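The percentile columns above (and in many of the timeseries queries that follow) lean on the convention that APPROX_QUANTILES(x, 1000) returns 1001 approximate quantile boundaries, so OFFSET(100) is roughly the 10th percentile, OFFSET(500) the median, and OFFSET(900) the 90th; the byte and timing queries below use 1001 buckets with OFFSET(101), OFFSET(501), and so on to the same effect. A small self-contained check of that convention, using a hypothetical 1..1000 series instead of crawl data:

#standardSQL
WITH scores AS (
  SELECT n AS score FROM UNNEST(GENERATE_ARRAY(1, 1000)) AS n
)

SELECT
  APPROX_QUANTILES(score, 1000)[OFFSET(100)] AS p10,  -- ~100
  APPROX_QUANTILES(score, 1000)[OFFSET(500)] AS p50,  -- ~500
  APPROX_QUANTILES(score, 1000)[OFFSET(900)] AS p90   -- ~900
FROM
  scores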
-------------------------------------------------------------------------------- /sql/timeseries/asyncClipboardRead.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/badgeClear.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2727' OR feat.feature = 'BadgeClear') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/badgeSet.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2726' OR feat.feature = 'BadgeSet') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/bootupJs.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 11 | FROM ( 12 | SELECT 13 | date, 14 | client, 15 | IFNULL( 16 | FLOAT64(lighthouse.audits['bootup-time'].numericValue), 17 | FLOAT64(lighthouse.audits['bootup-time'].rawValue) 18 | ) / 1000 AS value 19 | FROM 20 | `httparchive.crawl.pages` 21 | WHERE 22 | lighthouse IS NOT NULL AND 23 | TO_JSON_STRING(lighthouse) != '{}' AND 24 | date >= '2017-06-01' AND 25 | is_root_page 26 | ) 27 | GROUP BY 28 | date, 29 | timestamp, 30 | client 31 | ORDER BY 32 | date DESC, 33 | client 34 | 
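asyncClipboardRead, badgeClear, badgeSet, and the other Blink feature-counter timeseries in this directory all share the LEFT OUTER JOIN UNNEST(features) shape: every page keeps at least one row, feat is non-NULL only when the page triggered the counter, and the ratio of non-NULL rows to all rows is the adoption percentage. A minimal sketch of that join with hypothetical inline pages; the '2369' / 'AsyncClipboardAPIRead' pair is taken from the query above.

#standardSQL
WITH pages AS (
  SELECT 'https://a.example/' AS page, [STRUCT('2369' AS id, 'AsyncClipboardAPIRead' AS feature)] AS features UNION ALL
  SELECT 'https://b.example/', ARRAY<STRUCT<id STRING, feature STRING>>[]
)

SELECT
  SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,                            -- pages that used the feature
  ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent   -- 50.0 for this toy data
FROM
  pages
LEFT OUTER JOIN UNNEST(features) AS feat
ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead')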
-------------------------------------------------------------------------------- /sql/timeseries/bytesCss.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesCss) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesFont.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesFont) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesHtml.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesHtml) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesImg.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | 
ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesImg) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesJs.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesJS) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesOther.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesOther) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesTotal.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= 
'2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesTotal) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/bytesVideo.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.bytesVideo) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/canonical.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits.canonical.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | lighthouse IS NOT NULL AND 11 | TO_JSON_STRING(lighthouse) != '{}' AND 12 | date >= '2017-06-01' AND 13 | is_root_page 14 | GROUP BY 15 | date, 16 | timestamp, 17 | client 18 | ORDER BY 19 | date DESC, 20 | client 21 | -------------------------------------------------------------------------------- /sql/timeseries/contentIndex.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2983' OR feat.feature = 'ContentIndexAdd') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastDcl.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast DOM Content Loaded by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60
* 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_dcl, avg_dcl, slow_dcl), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_dcl, avg_dcl, slow_dcl), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastFcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast FCP by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastFp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast First Paint by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_fp, avg_fp, slow_fp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fp, avg_fp, slow_fp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastInp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast INP by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor
FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') AND 24 | yyyymm >= 202202 25 | GROUP BY 26 | date, 27 | timestamp, 28 | client 29 | ORDER BY 30 | date DESC, 31 | client 32 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastLcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast LCP by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') AND 24 | yyyymm >= 201909 25 | GROUP BY 26 | date, 27 | timestamp, 28 | client 29 | ORDER BY 30 | date DESC, 31 | client 32 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastOl.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast Onload by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_ol, avg_ol, slow_ol), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ol, avg_ol, slow_ol), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxFastTtfb.sql: 
-------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Fast TTFB by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | SAFE_DIVIDE(good, (good + needs_improvement + poor)) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxLargeCls.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Large CLS by device 3 | 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | poor / (good + needs_improvement + poor) >= 0.25 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') AND 24 | yyyymm >= 201905 25 | GROUP BY 26 | date, 27 | timestamp, 28 | client 29 | ORDER BY 30 | date DESC, 31 | client 32 | -------------------------------------------------------------------------------- /sql/timeseries/cruxPassesCWV.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Passes Core Web Vitals by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF( 18 | IF( 19 | /* INP replaced FID as a CWV in March 2024 (202402 release date.
*/ 20 | yyyymm >= 202402, 21 | /* INP/FID can be null and are not mandatory for CWV */ 22 | (p75_inp IS NULL OR IS_GOOD(fast_inp, avg_inp, slow_inp)), 23 | (p75_fid IS NULL OR IS_GOOD(fast_fid, avg_fid, slow_fid)) 24 | ) AND 25 | IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND 26 | IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL 27 | )), 28 | COUNT(DISTINCT origin) 29 | ) * 100 AS percent 30 | FROM 31 | `chrome-ux-report.materialized.device_summary` 32 | WHERE 33 | device IN ('desktop', 'phone') AND 34 | yyyymm > 201909 AND 35 | p75_lcp IS NOT NULL AND p75_cls IS NOT NULL /* Must have LCP and CLS */ 36 | GROUP BY 37 | date, 38 | timestamp, 39 | client 40 | ORDER BY 41 | date DESC, 42 | client 43 | -------------------------------------------------------------------------------- /sql/timeseries/cruxSlowFcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Slow FCP by device 3 | 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | poor / (good + needs_improvement + poor) >= 0.25 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxSlowInp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Slow INP by device 3 | 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | poor / (good + needs_improvement + poor) >= 0.25 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') AND 24 | yyyymm >= 202202 25 | GROUP BY 26 | date, 27 | timestamp, 28 | client 29 | ORDER BY 30 | date DESC, 31 | client 32 | -------------------------------------------------------------------------------- /sql/timeseries/cruxSlowLcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Slow LCP by device 3 | 4 | CREATE TEMP
FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | poor / (good + needs_improvement + poor) >= 0.25 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') AND 24 | yyyymm >= 201909 25 | GROUP BY 26 | date, 27 | timestamp, 28 | client 29 | ORDER BY 30 | date DESC, 31 | client 32 | -------------------------------------------------------------------------------- /sql/timeseries/cruxSlowTtfb.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Slow TTFB by device 3 | 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | SAFE_DIVIDE(poor, (good + needs_improvement + poor)) >= 0.25 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') 24 | GROUP BY 25 | date, 26 | timestamp, 27 | client 28 | ORDER BY 29 | date DESC, 30 | client 31 | -------------------------------------------------------------------------------- /sql/timeseries/cruxSmallCls.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # Small CLS by device 3 | 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 5 | good / (good + needs_improvement + poor) >= 0.75 6 | ); 7 | 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( 9 | good + needs_improvement + poor > 0 10 | ); 11 | 12 | SELECT 13 | REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, 14 | UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, 15 | IF(device = 'desktop', 'desktop', 'mobile') AS client, 16 | SAFE_DIVIDE( 17 | COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), 18 | COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)) 19 | ) * 100 AS percent 20 | FROM 21 | `chrome-ux-report.materialized.device_summary` 22 | WHERE 23 | device IN ('desktop', 'phone') AND 24 |
yyyymm >= 201905 25 | GROUP BY 26 | date, 27 | timestamp, 28 | client 29 | ORDER BY 30 | date DESC, 31 | client 32 | -------------------------------------------------------------------------------- /sql/timeseries/dcl.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(101)] / 1000, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(251)] / 1000, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(501)] / 1000, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(751)] / 1000, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(901)] / 1000, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.onContentLoaded) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/fcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(101)] / 1000, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(251)] / 1000, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(501)] / 1000, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(751)] / 1000, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(901)] / 1000, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2016-12-15' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | HAVING 21 | p50 IS NOT NULL 22 | ORDER BY 23 | date DESC, 24 | client 25 | -------------------------------------------------------------------------------- /sql/timeseries/fontDisplay.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-display'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | lighthouse IS NOT NULL AND 11 | TO_JSON_STRING(lighthouse) != '{}' AND 12 | date >= '2017-06-01' AND 13 | is_root_page AND 14 | LAX_STRING(lighthouse.audits['font-display'].score) IS NOT NULL 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client 22 | -------------------------------------------------------------------------------- /sql/timeseries/getInstalledRelatedApps.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 |
SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '1870' OR feat.feature = 'V8Navigator_GetInstalledRelatedApps_Method') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/gzipSavings.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client 22 | -------------------------------------------------------------------------------- /sql/timeseries/h2.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(r.summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.requests` r 9 | INNER JOIN 10 | `httparchive.crawl.pages` 11 | USING (date, client, is_root_page, rank, page) 12 | WHERE 13 | is_root_page AND 14 | date >= '2016-07-15' 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client 22 | -------------------------------------------------------------------------------- /sql/timeseries/h3.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | # The amount of requests either using HTTP/3 or able to use it. 3 | # 4 | # We measure "ability to use" as well as "actual use", as HTTP Archive is a 5 | # cold crawl and so less likely to use HTTP/3 which requires prior visits. 6 | # 7 | # For "able to use" we look at the alt-svc response header. 8 | # 9 | # We also only measure official HTTP/3 (ALPN h3, h3-29) and not gQUIC or other 10 | # prior versions. h3-29 is the final draft version and will be switched to h3 11 | # when HTTP/3 is approved so we include that as it is HTTP/3 in all but name. 
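#
# As a worked (hypothetical) example, a response carrying a header such as
#   alt-svc: h3=":443"; ma=86400, h3-29=":443"; ma=86400
# advertises HTTP/3 support, so the LIKE '%h3=%' / '%h3-29=%' checks below count
# the request as able to use HTTP/3 even if the crawl itself was answered over
# HTTP/1.1 or HTTP/2.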
12 | # 13 | SELECT 14 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 15 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 16 | client, 17 | ROUND( 18 | SUM( 19 | IF( 20 | LAX_STRING(r.summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR 21 | REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR 22 | REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', 23 | 1, 0 24 | ) 25 | ) * 100 / COUNT(0), 2 26 | ) AS percent 27 | FROM 28 | `httparchive.crawl.requests` r 29 | LEFT OUTER JOIN 30 | UNNEST(response_headers) AS resp 31 | ON (resp.name = 'alt-svc') 32 | INNER JOIN 33 | `httparchive.crawl.pages` 34 | USING (date, client, is_root_page, rank, page) 35 | WHERE 36 | date >= '2020-01-01' AND 37 | is_root_page 38 | GROUP BY 39 | date, 40 | timestamp, 41 | client 42 | ORDER BY 43 | date DESC, 44 | client 45 | -------------------------------------------------------------------------------- /sql/timeseries/hreflang.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits.hreflang.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | lighthouse IS NOT NULL AND 11 | TO_JSON_STRING(lighthouse) != '{}' AND 12 | date >= '2017-06-01' AND 13 | is_root_page AND 14 | LAX_STRING(lighthouse.audits.hreflang.score) IS NOT NULL 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client 22 | -------------------------------------------------------------------------------- /sql/timeseries/idleDetection.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2834' OR feat.feature = 'IdleDetectionStart') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/imgLazy.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(COUNT(DISTINCT IF(LOWER(LAX_STRING(attr)) = 'lazy', page, NULL)) * 100 / COUNT(DISTINCT page), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | LEFT JOIN 10 | UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.other['img-loading-attr'])) AS attr 11 | WHERE 12 | is_root_page AND 13 | date > '2016-01-01' 14 | GROUP BY 15 | date, 16 | timestamp, 17 | client 18 | ORDER BY 19 | date DESC, 20 | client 21 | -------------------------------------------------------------------------------- /sql/timeseries/imgSavings.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 
24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | is_root_page AND 15 | date >= '2016-01-01' 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/legible.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-size'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | lighthouse IS NOT NULL AND 11 | date >= '2017-12-15' AND 12 | is_root_page AND 13 | LAX_STRING(lighthouse.audits['font-size'].score) IS NOT NULL 14 | GROUP BY 15 | date, 16 | timestamp, 17 | client 18 | ORDER BY 19 | date DESC, 20 | client 21 | -------------------------------------------------------------------------------- /sql/timeseries/linkText.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-text'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | lighthouse IS NOT NULL AND 11 | date >= '2017-11-15' AND 12 | is_root_page AND 13 | LAX_STRING(lighthouse.audits['link-text'].score) IS NOT NULL 14 | GROUP BY 15 | date, 16 | timestamp, 17 | client 18 | ORDER BY 19 | date DESC, 20 | client 21 | -------------------------------------------------------------------------------- /sql/timeseries/notificationTriggers.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/numUrls.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | COUNT(0) AS urls 7 | FROM 8 | `httparchive.crawl.pages` 9 | WHERE 10 | date >= '2010-11-15' AND 11 | is_root_page 12 | GROUP BY 13 | date, 14 | timestamp, 15 | client 16 
| ORDER BY 17 | date DESC, 18 | client 19 | -------------------------------------------------------------------------------- /sql/timeseries/offscreenImages.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | is_root_page AND 15 | date >= '2017-06-01' 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/ol.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(101)] / 1000, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(251)] / 1000, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(501)] / 1000, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(751)] / 1000, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(901)] / 1000, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.onLoad) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/optimizedImages.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) 
AS p25, 8 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2017-06-01' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/pctHttps.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(SUM(IF(STARTS_WITH(url, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent 7 | FROM 8 | `httparchive.crawl.requests` 9 | INNER JOIN 10 | `httparchive.crawl.pages` 11 | USING (date, client, is_root_page, rank, page) 12 | WHERE 13 | is_root_page AND 14 | date >= '2016-01-01' 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client 22 | -------------------------------------------------------------------------------- /sql/timeseries/periodicBackgroundSync.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2930' OR feat.feature = 'PeriodicBackgroundSync') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/periodicBackgroundSyncRegister.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '2931' OR feat.feature = 'PeriodicBackgroundSyncRegister') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/quicTransport.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 
4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '3184' OR feat.feature = 'QuicTransport') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/reqCss.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqCss) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/reqFont.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqFont) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/reqHtml.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqHtml) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | 
-------------------------------------------------------------------------------- /sql/timeseries/reqImg.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqImg) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/reqJs.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqJS) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/reqOther.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqOther) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/reqTotal.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(501)], 2) AS 
p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqTotal) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/reqVideo.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(101)], 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(251)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(501)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(751)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(901)], 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary.reqVideo) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/screenWakeLock.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '3005' OR feat.feature = 'WakeLockAcquireScreenLock') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/speedIndex.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(101)] / 1000, 2) AS p10, 7 | ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(251)] / 1000, 2) AS p25, 8 | ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(501)] / 1000, 2) AS p50, 9 | ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(751)] / 1000, 2) AS p75, 10 | ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(901)] / 1000, 2) AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | is_root_page AND 15 | date >= '2016-01-01' 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client 23 | -------------------------------------------------------------------------------- /sql/timeseries/storageEstimate.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS 
timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN UNNEST(features) AS feat 11 | ON (feat.id = '1371' OR feat.feature = 'DurableStorageEstimate') 12 | WHERE 13 | date >= '2016-11-15' AND 14 | is_root_page 15 | GROUP BY 16 | date, 17 | timestamp, 18 | client 19 | ORDER BY 20 | date DESC, 21 | client, 22 | num_urls DESC 23 | -------------------------------------------------------------------------------- /sql/timeseries/storagePersist.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN 11 | UNNEST(features) AS feat 12 | ON (feat.id = '3018' OR feat.feature = 'DurableStoragePersist') 13 | WHERE 14 | date >= '2016-11-15' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client, 23 | num_urls DESC 24 | -------------------------------------------------------------------------------- /sql/timeseries/swControlledPages.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN 11 | UNNEST(features) AS feat 12 | ON (feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage') 13 | WHERE 14 | date >= '2016-11-15' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client, 23 | num_urls DESC 24 | -------------------------------------------------------------------------------- /sql/timeseries/tcp.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(101)] AS p10, 7 | APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(251)] AS p25, 8 | APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(501)] AS p50, 9 | APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(751)] AS p75, 10 | APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(901)] AS p90 11 | FROM 12 | `httparchive.crawl.pages` 13 | WHERE 14 | date >= '2010-11-15' AND 15 | is_root_page AND 16 | FLOAT64(summary._connections) > 0 17 | GROUP BY 18 | date, 19 | timestamp, 20 | client 21 | ORDER BY 22 | date DESC, 23 | client 24 | -------------------------------------------------------------------------------- /sql/timeseries/ttci.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 
2) AS p10, 7 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, 8 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, 9 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75, 10 | ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 11 | FROM ( 12 | SELECT 13 | client, 14 | date, 15 | IFNULL( 16 | FLOAT64(lighthouse.audits.interactive.numericValue), 17 | IFNULL( 18 | FLOAT64(lighthouse.audits.interactive.rawValue), 19 | FLOAT64(lighthouse.audits['consistently-interactive'].rawValue) 20 | ) 21 | ) / 1000 AS value 22 | FROM 23 | `httparchive.crawl.pages` 24 | WHERE 25 | is_root_page AND 26 | date >= '2016-01-01' 27 | ) 28 | GROUP BY 29 | date, 30 | timestamp, 31 | client 32 | ORDER BY 33 | date DESC, 34 | client 35 | -------------------------------------------------------------------------------- /sql/timeseries/webSocketStream.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, 4 | UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, 5 | client, 6 | SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) AS num_urls, 7 | ROUND(SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) / COUNT(0) * 100, 5) AS percent 8 | FROM 9 | `httparchive.crawl.pages` 10 | LEFT OUTER JOIN 11 | UNNEST(features) AS feat 12 | ON (feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor') 13 | WHERE 14 | date >= '2016-11-15' AND 15 | is_root_page 16 | GROUP BY 17 | date, 18 | timestamp, 19 | client 20 | ORDER BY 21 | date DESC, 22 | client, 23 | num_urls DESC 24 | -------------------------------------------------------------------------------- /sync_csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # 5 | # ./sync_csv.sh [mobile_][Mon_D_YYYY] 6 | # 7 | # Examples: 8 | # 9 | # ./sync_csv.sh mobile_Dec_15_2018 10 | # ./sync_csv.sh Jan_1_2019 11 | 12 | DATA=$HOME/archive 13 | BASE=`pwd` 14 | 15 | if [ -n "$1" ]; then 16 | archive=$1 17 | if [[ $archive == *mobile* ]]; then 18 | mobile=1 19 | adate=${archive#mobile_} 20 | else 21 | mobile=0 22 | adate=$archive 23 | fi 24 | echo "Processing $adate, mobile: $mobile, archive: $archive" 25 | 26 | else 27 | echo "Must provide date, eg. Apr_15_2013" 28 | exit 1 29 | fi 30 | 31 | mkdir -p $DATA/processed/$archive 32 | 33 | cd $DATA 34 | 35 | YYYY_MM_DD=$(date --date="$(echo $adate | sed "s/_/ /g" -)" "+%Y_%m_%d") 36 | 37 | if [[ $mobile == 1 ]]; then 38 | client="mobile" 39 | else 40 | client="desktop" 41 | fi 42 | 43 | ptable="summary_pages.${YYYY_MM_DD}_${client}" 44 | rtable="summary_requests.${YYYY_MM_DD}_${client}" 45 | 46 | if bq show httparchive:${ptable} &> /dev/null && \ 47 | bq show httparchive:${rtable} &> /dev/null; then 48 | # Tables should be deleted from BigQuery first if the intent is to overwrite them. 49 | echo -e "BigQuery summary tables for ${YYYY_MM_DD}_${client} already exist, exiting" 50 | exit 0 51 | fi 52 | 53 | if [ ! -f httparchive_${archive}_pages.csv.gz ]; then 54 | echo -e "Downloading data for $archive" 55 | gsutil cp "gs://httparchive/downloads/httparchive_${archive}_pages.csv.gz" ./ 56 | if [ $? -ne 0 ]; then 57 | echo "Pages data for ${adate} is missing, exiting" 58 | exit 1 59 | fi 60 | else 61 | echo -e "Pages data already downloaded for $archive, skipping." 62 | fi 63 | 64 | if [ ! 
-f httparchive_${archive}_requests.csv.gz ]; then 65 | gsutil cp "gs://httparchive/downloads/httparchive_${archive}_requests.csv.gz" ./ 66 | if [ $? -ne 0 ]; then 67 | echo "Request data for ${adate} is missing, exiting" 68 | exit 1 69 | fi 70 | else 71 | echo -e "Request data already downloaded for $archive, skipping." 72 | fi 73 | 74 | if [ ! -f processed/${archive}/pages.csv.gz ]; then 75 | echo -e "Converting pages data" 76 | gunzip -c "httparchive_${archive}_pages.csv.gz" \ 77 | | sed -e 's/\\N,/"",/g' -e 's/\\N$/""/g' -e's/\([^\]\)\\"/\1""/g' -e's/\([^\]\)\\"/\1""/g' -e 's/\\"","/\\\\","/g' \ 78 | | gzip > "processed/${archive}/pages.csv.gz" 79 | else 80 | echo -e "Pages data already converted, skipping." 81 | fi 82 | 83 | if ls processed/${archive}/requests_* &> /dev/null; then 84 | echo -e "Request data already converted, skipping." 85 | else 86 | echo -e "Converting requests data" 87 | gunzip -c "httparchive_${archive}_requests.csv.gz" \ 88 | | sed -e 's/\\N,/"",/g' -e 's/\\N$/""/g' -e 's/\\"/""/g' -e 's/\\"","/\\\\","/g' \ 89 | | python fixcsv.py \ 90 | | split --lines=8000000 --filter='pigz - > $FILE.gz' - processed/$archive/requests_ 91 | fi 92 | 93 | cd processed/${archive} 94 | 95 | echo -e "Syncing data to Google Storage" 96 | gsutil cp -n * gs://httparchive/${archive}/ 97 | 98 | bq show httparchive:${ptable} &> /dev/null 99 | if [ $? -ne 0 ]; then 100 | echo -e "Submitting new pages import ${ptable} to BigQuery" 101 | bq load --max_bad_records 10 --replace $ptable gs://httparchive/${archive}/pages.csv.gz $BASE/schema/pages.json 102 | if [ $? -ne 0 ]; then 103 | echo "Error loading ${ptable}, exiting" 104 | exit 1 105 | fi 106 | else 107 | echo -e "${ptable} already exists, skipping." 108 | fi 109 | 110 | bq show httparchive:${rtable} &> /dev/null 111 | if [ $? -ne 0 ]; then 112 | echo -e "Submitting new requests import ${rtable} to BigQuery" 113 | bq load --max_bad_records 10 --replace $rtable gs://httparchive/${archive}/requests_* $BASE/schema/requests.json 114 | if [ $? -ne 0 ]; then 115 | echo "Error loading ${rtable}, exiting" 116 | exit 1 117 | fi 118 | else 119 | echo -e "${rtable} already exists, skipping." 120 | fi 121 | 122 | 123 | bq show httparchive:${rtable} &> /dev/null 124 | if [ $? -eq 0 ]; then 125 | echo -e "Deleting CSV artifacts..." 126 | rm $DATA/httparchive_${archive}_* 127 | rm -r $DATA/processed/$archive 128 | else 129 | echo "Error loading into BigQuery, exiting" 130 | exit 1 131 | fi 132 | 133 | echo -e "Attempting to generate reports for ${YYYY_MM_DD}..." 134 | cd $HOME/code 135 | 136 | gsutil -q stat gs://httparchive/reports/${YYYY_MM_DD}/* 137 | if [ $? -eq 1 ]; then 138 | . sql/generate_reports.sh -th ${YYYY_MM_DD} -l ALL 139 | else 140 | echo -e "Reports for ${YYYY_MM_DD} already exist, skipping." 
141 | fi 142 | 143 | echo "Done" 144 | -------------------------------------------------------------------------------- /sync_har.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # 5 | # ./sync_har.sh [chrome,android] [YYYY-MM-DD] 6 | # 7 | # Examples: 8 | # 9 | # ./sync_har.sh chrome 10 | # ./sync_har.sh chrome 2019-01-01 11 | # ./sync_har.sh android 2018-12-15 12 | # 13 | 14 | cd $HOME/code/dataflow/python 15 | 16 | if [ -n "$2" ]; then 17 | day=$(date -d $2 +%d) 18 | MM=$(date -d $2 +%m) 19 | month=$(date -d $2 +%b) 20 | year=$(date -d $2 +%Y) 21 | else 22 | day=$(date +%d) 23 | MM=$(date +%m) 24 | month=$(date +%b) 25 | year=$(date +%Y) 26 | fi 27 | 28 | # All crawls begin on the first of the month. 29 | import_date=$(date +"${month}_1_${year}") 30 | YYYY_MM_DD="${year}_${MM}_01" 31 | 32 | if [ -n "$1" ]; then 33 | archive=$1 34 | if [[ $1 == *chrome* ]]; then 35 | client="desktop" 36 | bucket="chrome-${import_date}" 37 | else 38 | client="mobile" 39 | bucket="android-${import_date}" 40 | fi 41 | echo "Processing $bucket, client: $client" 42 | 43 | else 44 | echo "Must provide import type (e.g. chrome) and an optional date:" 45 | echo -e "\t ./sync_har.sh chrome 2016-01-15" 46 | exit 1 47 | fi 48 | 49 | if bq show "httparchive:pages.${YYYY_MM_DD}_${client}"; then 50 | echo "Table already exists in BigQuery, exiting" 51 | exit 1 52 | else 53 | echo "Table does not exist in BigQuery, checking gs://..." 54 | fi 55 | 56 | if ! gsutil stat "gs://httparchive/crawls/${bucket}/done"; then 57 | echo "Bucket does not exist or has not finished importing" 58 | exit 1 59 | else 60 | echo "Bucket exists, initiating DataFlow import..." 61 | fi 62 | 63 | export GOOGLE_APPLICATION_CREDENTIALS="./credentials/auth.json" 64 | 65 | if [ ! -f $GOOGLE_APPLICATION_CREDENTIALS ]; then 66 | echo "ERROR: ${GOOGLE_APPLICATION_CREDENTIALS} does not exist. See README for more info." 67 | exit 1 68 | fi 69 | 70 | source env/bin/activate 71 | 72 | python bigquery_import.py \ 73 | --runner=DataflowRunner \ 74 | --project=httparchive \ 75 | --temp_location=gs://httparchive/dataflow/temp \ 76 | --staging_location=gs://httparchive/dataflow/staging \ 77 | --region=us-west1 \ 78 | --machine_type=n1-standard-32 \ 79 | --input="${bucket}" \ 80 | --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd \ 81 | --experiment=use_beam_bq_sink 82 | 83 | deactivate 84 | 85 | echo -e "Attempting to generate reports for ${YYYY_MM_DD}..." 86 | cd $HOME/code 87 | 88 | gsutil -q stat gs://httparchive/reports/${YYYY_MM_DD}/* 89 | if [ $? -eq 1 ]; then 90 | . sql/generate_reports.sh -th ${YYYY_MM_DD} -l ALL 91 | else 92 | echo -e "Reports for ${YYYY_MM_DD} already exist, skipping."
93 | fi 94 | 95 | echo "Done" 96 | -------------------------------------------------------------------------------- /urls/.gitignore: -------------------------------------------------------------------------------- 1 | Gemfile.lock 2 | -------------------------------------------------------------------------------- /urls/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'domainatrix' 4 | gem 'yajl-ruby', require: 'yajl' 5 | gem 'nokogiri' 6 | 7 | -------------------------------------------------------------------------------- /urls/process.rb: -------------------------------------------------------------------------------- 1 | require 'yajl' 2 | require 'zlib' 3 | require 'open3' 4 | require 'nokogiri' 5 | require 'optparse' 6 | require 'domainatrix' 7 | 8 | ROOT_PATH = '/' 9 | WWW = 'www' 10 | matched = 0 11 | res, options = {}, {} 12 | 13 | ARGV << "-h" if ARGV.empty? 14 | OptionParser.new do |opts| 15 | opts.banner = "Usage: process.rb [options]" 16 | 17 | opts.on('-a', '--alexa=file', 'Alexa input data') do |v| 18 | options[:alexa] = v 19 | end 20 | 21 | opts.on('-d', '--dmoz=file', 'DMOZ input data') do |v| 22 | options[:dmoz] = v 23 | end 24 | 25 | opts.on('-o', '--output=file', 'Output file') do |v| 26 | options[:output] = v || 'urls.json.gz' 27 | end 28 | 29 | opts.on('-h', '--help') do 30 | puts opts 31 | exit 32 | end 33 | end.parse! 34 | 35 | if options[:alexa].nil? or options[:dmoz].nil? 36 | raise OptionParser::MissingArgument 37 | end 38 | 39 | puts "Loading Alexa data..." 40 | IO.popen("unzip -p #{options[:alexa]}", 'rb') do |io| 41 | io.each do |line| 42 | rank, name = line.strip.split(',') 43 | res[name] = { 44 | alexa_domain: name, 45 | alexa_rank: rank.to_i, 46 | dmoz_topic: [] 47 | } 48 | end 49 | end 50 | 51 | puts "Loading DMOZ data..." 52 | Zlib::GzipReader.open(options[:dmoz]) do |gz| 53 | Nokogiri::XML::Reader(gz).each do |node| 54 | # Example ExternalPage record from the DMOZ RDF dump: 55 | # <ExternalPage about="http://animation.about.com/"> 56 | # <d:Title>About.com: Animation Guide</d:Title> 57 | # <d:Description>Keep up with developments in online animation for all skill levels. Download tools, and seek inspiration from online work.</d:Description> 58 | # <topic>Top/Arts/Animation</topic> 59 | # </ExternalPage> 60 | # 61 | if node.name == 'ExternalPage' && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT 62 | page = Nokogiri::XML(node.outer_xml).at('ExternalPage') 63 | 64 | url = Domainatrix.parse(page.attribute('about').text) 65 | next unless url.path == ROOT_PATH 66 | next unless url.subdomain.empty? or url.subdomain == WWW 67 | next if url.url.include? '?' or url.url.include? '#' 68 | 69 | if data = res[url.domain + "." + url.public_suffix] 70 | matched += 1 71 | data[:dmoz_topic] << page.at('topic').text 72 | data[:dmoz_url] ||= page.attribute('about').text 73 | data[:dmoz_title] ||= page.xpath('//d:Title').text 74 | data[:dmoz_description] ||= page.xpath('//d:Description').text 75 | end 76 | end 77 | end 78 | 79 | end 80 | 81 | File.open(options[:output] || 'urls.json.gz', 'w') do |f| 82 | gz = Zlib::GzipWriter.new(f) 83 | res.each_value do |val| 84 | gz.puts Yajl::Encoder.encode(val) 85 | end 86 | gz.close 87 | end 88 | 89 | puts "Done. Matched #{matched} DMOZ domains."
90 | 91 | -------------------------------------------------------------------------------- /urls/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE=`pwd` 4 | TIMESTAMP=$(date "+%Y%m%d") 5 | DATA=$HOME/archive/urls/$TIMESTAMP 6 | 7 | mkdir -p $DATA 8 | cd $DATA 9 | 10 | echo -e "Fetching Alexa Top 1M archive" 11 | wget -nv -N "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" 12 | if [ $? -ne 0 ]; then 13 | echo "Alexa fetch failed, exiting" 14 | exit 1 15 | fi 16 | 17 | ## http://rdf.dmoz.org/ 18 | echo -e "Fetching DMOZ open directory RDF dump" 19 | wget -nv -N "http://rdf.dmoz.org/rdf/content.rdf.u8.gz" 20 | if [ $? -ne 0 ]; then 21 | echo "DMOZ fetch failed, exiting" 22 | exit 1 23 | fi 24 | 25 | ruby $BASE/process.rb -a top-1m.csv.zip -d content.rdf.u8.gz 26 | 27 | echo -e "Syncing data to Google Storage" 28 | gsutil cp -n *.{zip,gz} gs://httparchive/urls/${TIMESTAMP}/ 29 | 30 | echo -e "Importing results to BigQuery" 31 | bq load --source_format NEWLINE_DELIMITED_JSON urls.${TIMESTAMP} \ 32 | gs://httparchive/urls/${TIMESTAMP}/urls.json.gz \ 33 | $BASE/schema.json 34 | 35 | echo -e "Done." 36 | 37 | -------------------------------------------------------------------------------- /urls/schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name": "alexa_domain", "type": "STRING"}, 3 | {"name": "alexa_rank", "type": "INTEGER"}, 4 | {"name": "dmoz_topic", "type": "STRING", "mode": "REPEATED"}, 5 | {"name": "dmoz_url", "type": "STRING"}, 6 | {"name": "dmoz_title", "type": "STRING"}, 7 | {"name": "dmoz_description", "type": "STRING"} 8 | ] 9 | 10 | -------------------------------------------------------------------------------- /util/fixcsv.py: -------------------------------------------------------------------------------- 1 | import fileinput 2 | import sys 3 | 4 | for line in fileinput.input(): 5 | # A line ending in a bare backslash has an escaped newline: drop the backslash and newline and append a comma so the row joins the next physical line. 6 | if line.endswith('\\\n'): 7 | line = line[:-2] + ',' 8 | # Write as-is (not print) so no extra newline is appended after joined rows. 9 | sys.stdout.write(line) 10 | --------------------------------------------------------------------------------