├── .editorconfig
├── .github
    ├── FUNDING.yml
    ├── dependabot.yml
    ├── linters
    │   ├── .flake8
    │   ├── .markdown-lint.yml
    │   ├── .python-lint
    │   └── .yaml-lint.yml
    └── workflows
    │   └── linter.yml
├── .gitignore
├── README.md
├── config
    └── storage-cors.json
├── crontab
├── dataflow
    ├── java
    │   ├── .gitignore
    │   ├── README.md
    │   ├── nb-configuration.xml
    │   ├── nbactions.xml
    │   ├── pom.xml
    │   └── src
    │   │   └── main
    │   │       └── java
    │   │           └── com
    │   │               └── httparchive
    │   │                   └── dataflow
    │   │                       ├── BigQueryImport.java
    │   │                       └── GcsPathCoder.java
    └── python
    │   ├── .gitignore
    │   ├── README.md
    │   ├── adblock.py
    │   ├── bigquery_import.py
    │   ├── get_rules.sh
    │   ├── requirements.txt
    │   ├── run.sh
    │   └── setup.py
├── datalab
    └── histograms.ipynb
├── docs
    └── README.md
├── schema
    ├── .sqlfluffignore
    ├── httparchive_schema.sql
    ├── pages.json
    ├── requests.json
    └── schema.rb
├── sql
    ├── .sqlfluff
    ├── .sqlfluffignore
    ├── addDate.js
    ├── delete_date_from_reports.sh
    ├── generate_reports.sh
    ├── get_bigquery_dates.sh
    ├── histograms
    │   ├── bootupJs.sql
    │   ├── bytesCss.sql
    │   ├── bytesFont.sql
    │   ├── bytesHtml.sql
    │   ├── bytesImg.sql
    │   ├── bytesJs.sql
    │   ├── bytesOther.sql
    │   ├── bytesTotal.sql
    │   ├── bytesVideo.sql
    │   ├── compileJs.sql
    │   ├── cruxCls.sql
    │   ├── cruxDcl.sql
    │   ├── cruxFcp.sql
    │   ├── cruxFp.sql
    │   ├── cruxInp.sql
    │   ├── cruxLcp.sql
    │   ├── cruxOl.sql
    │   ├── cruxShopifyThemes.sql
    │   ├── cruxTtfb.sql
    │   ├── dcl.sql
    │   ├── evalJs.sql
    │   ├── fcp.sql
    │   ├── gzipSavings.sql
    │   ├── htmlElementPopularity.sql
    │   ├── imgSavings.sql
    │   ├── offscreenImages.sql
    │   ├── ol.sql
    │   ├── optimizedImages.sql
    │   ├── reqCss.sql
    │   ├── reqFont.sql
    │   ├── reqHtml.sql
    │   ├── reqImg.sql
    │   ├── reqJs.sql
    │   ├── reqOther.sql
    │   ├── reqTotal.sql
    │   ├── reqVideo.sql
    │   ├── speedIndex.sql
    │   ├── tcp.sql
    │   └── ttci.sql
    ├── lens
    │   ├── drupal
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    │   ├── magento
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    │   ├── top100k
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    │   ├── top10k
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    │   ├── top1k
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    │   ├── top1m
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    │   └── wordpress
    │   │   ├── crux_histograms.sql
    │   │   ├── crux_timeseries.sql
    │   │   ├── histograms.sql
    │   │   └── timeseries.sql
    ├── new_metric.sh
    └── timeseries
    │   ├── a11yButtonName.sql
    │   ├── a11yColorContrast.sql
    │   ├── a11yImageAlt.sql
    │   ├── a11yLabel.sql
    │   ├── a11yLinkName.sql
    │   ├── a11yScores.sql
    │   ├── asyncClipboardRead.sql
    │   ├── badgeClear.sql
    │   ├── badgeSet.sql
    │   ├── bootupJs.sql
    │   ├── bytesCss.sql
    │   ├── bytesFont.sql
    │   ├── bytesHtml.sql
    │   ├── bytesImg.sql
    │   ├── bytesJs.sql
    │   ├── bytesOther.sql
    │   ├── bytesTotal.sql
    │   ├── bytesVideo.sql
    │   ├── canonical.sql
    │   ├── contentIndex.sql
    │   ├── cruxFastDcl.sql
    │   ├── cruxFastFcp.sql
    │   ├── cruxFastFp.sql
    │   ├── cruxFastInp.sql
    │   ├── cruxFastLcp.sql
    │   ├── cruxFastOl.sql
    │   ├── cruxFastTtfb.sql
    │   ├── cruxLargeCls.sql
    │   ├── cruxPassesCWV.sql
    │   ├── cruxSlowFcp.sql
    │   ├── cruxSlowInp.sql
    │   ├── cruxSlowLcp.sql
    │   ├── cruxSlowTtfb.sql
    │   ├── cruxSmallCls.sql
    │   ├── dcl.sql
    │   ├── fcp.sql
    │   ├── fontDisplay.sql
    │   ├── getInstalledRelatedApps.sql
    │   ├── gzipSavings.sql
    │   ├── h2.sql
    │   ├── h3.sql
    │   ├── hreflang.sql
    │   ├── idleDetection.sql
    │   ├── imgLazy.sql
    │   ├── imgSavings.sql
    │   ├── legible.sql
    │   ├── linkText.sql
    │   ├── notificationTriggers.sql
    │   ├── numUrls.sql
    │   ├── offscreenImages.sql
    │   ├── ol.sql
    │   ├── optimizedImages.sql
    │   ├── pctHttps.sql
    │   ├── periodicBackgroundSync.sql
    │   ├── periodicBackgroundSyncRegister.sql
    │   ├── quicTransport.sql
    │   ├── reqCss.sql
    │   ├── reqFont.sql
    │   ├── reqHtml.sql
    │   ├── reqImg.sql
    │   ├── reqJs.sql
    │   ├── reqOther.sql
    │   ├── reqTotal.sql
    │   ├── reqVideo.sql
    │   ├── screenWakeLock.sql
    │   ├── speedIndex.sql
    │   ├── storageEstimate.sql
    │   ├── storagePersist.sql
    │   ├── swControlledPages.sql
    │   ├── tcp.sql
    │   ├── ttci.sql
    │   └── webSocketStream.sql
├── sync_csv.sh
├── sync_har.sh
├── urls
    ├── .gitignore
    ├── Gemfile
    ├── process.rb
    ├── run.sh
    └── schema.json
└── util
    └── fixcsv.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # editorconfig.org
 2 | root = true
 3 | 
 4 | [*]
 5 | indent_style = space
 6 | end_of_line = lf
 7 | charset = utf-8
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | 
11 | [*.{html,md,js,json,css,sql}]
12 | indent_size = 2
13 | 
14 | [*.py]
15 | indent_size = 4
16 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: httparchive
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "daily"
7 | 


--------------------------------------------------------------------------------
/.github/linters/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | per-file-ignores =
4 |     # The __init__.py file imports the routes and errors file at bottom
5 |     /github/workspace/src/server/__init__.py:E402,F401
6 |     /tmp/lint/src/server/__init__.py:E402,F401
7 |     /tmp/lint/server/__init__.py:E402,F401
8 | 


--------------------------------------------------------------------------------
/.github/linters/.markdown-lint.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | ###########################
 3 | ###########################
 4 | ## Markdown Linter rules ##
 5 | ###########################
 6 | ###########################
 7 | 
 8 | # Linter rules doc:
 9 | # - https://github.com/DavidAnson/markdownlint
10 | #
11 | # Note:
12 | # To comment out a single error:
13 | #   <!-- markdownlint-disable -->
14 | #   any violations you want
15 | #   <!-- markdownlint-restore -->
16 | #
17 | 
18 | ###############
19 | # Rules by id #
20 | ###############
21 | MD004: false                  # Unordered list style
22 | MD007: false                  # Allow extra spaces for lists - don't cause issues and will just annoy authors
23 | MD009: false                  # Allow trailing spaces - don't cause issues and will just annoy authors
24 | MD013: false                  # Don't demand maximum line lengths
25 | MD024:
26 |   siblings_only: true         # Allows sub-headings to be reused under different headings
27 | MD026:
28 |   punctuation: ".,;:!。，；:"  # List of not allowed
29 | MD029: false                  # Ordered list item prefix
30 | MD033: false                  # Allow inline HTML
31 | MD034: false                  # Allow base URLs
32 | MD036: false                  # Emphasis used instead of a heading
33 | MD040: false                  # Don't demand language for all code blocks
34 | 
35 | #################
36 | # Rules by tags #
37 | #################
38 | blank_lines: false             # Error on blank lines
39 | 


--------------------------------------------------------------------------------
/.github/linters/.yaml-lint.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | ########################################################
 3 | # HTTP Archive Overrides for YAML Lint                 #
 4 | # https://yamllint.readthedocs.io/en/stable/rules.html #
 5 | ########################################################
 6 | rules:
 7 |   document-start: disable
 8 |   line-length:
 9 |     max: 120
10 | 


--------------------------------------------------------------------------------
/.github/workflows/linter.yml:
--------------------------------------------------------------------------------
 1 | ###########################
 2 | ## Linter GitHub Actions ##
 3 | ###########################
 4 | #
 5 | # Documentation: https://github.com/github/super-linter/
 6 | #
 7 | # Exception config files are in the .github/linters directory
 8 | #
 9 | name: Lint Code Base
10 | on:
11 |   - workflow_dispatch
12 |   - pull_request
13 | jobs:
14 |   lint:
15 |     name: Lint Code Base
16 |     runs-on: ubuntu-20.04
17 |     steps:
18 |       - name: Checkout Code
19 |         uses: actions/checkout@v4
20 |         with:
21 |           # Full git history is needed to get a proper list of changed files within `super-linter`
22 |           fetch-depth: 0
23 |       - name: Set VALIDATE_ALL_CODEBASE variable to false
24 |         # Only run the full workflow for manual runs or if upgrading the super linter
25 |         if: |
26 |             github.event_name != 'workflow_dispatch' &&
27 |             startsWith(github.event.pull_request.title,'Bump super-linter/super-linter') != true
28 |         run: |
29 |           echo "VALIDATE_ALL_CODEBASE=false" >> $GITHUB_ENV
30 |       - name: Lint Code Base
31 |         uses: super-linter/super-linter/slim@v7
32 |         env:
33 |           DEFAULT_BRANCH: master
34 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}s
35 |           FILTER_REGEX_EXCLUDE: .*/dataflow/
36 |           # VALIDATE_BASH: true
37 |           VALIDATE_EDITORCONFIG: true
38 |           VALIDATE_JAVASCRIPT_ES: true
39 |           VALIDATE_JSON: true
40 |           VALIDATE_MARKDOWN: true
41 |           VALIDATE_PYTHON_PYLINT: true
42 |           VALIDATE_PYTHON_FLAKE8: true
43 |           VALIDATE_SQLFLUFF: true
44 |           VALIDATE_YAML: true
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | tools
3 | .DS_Store
4 | *.out
5 | 
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # HTTP Archive + BigQuery data import
 2 | 
 3 | _Note: you don't need to import this data yourself, the BigQuery dataset is public! [Getting started](https://github.com/HTTPArchive/httparchive.org/blob/master/docs/gettingstarted_bigquery.md)._
 4 | 
 5 | However, if you do want your own private copy of the dataset... The following import and sync scripts will help you import the [HTTP Archive dataset](http://httparchive.org/downloads.php) into BigQuery and keep it up to date.
 6 | 
 7 | ```bash
 8 | $> sh sync.sh Jun_15_2013
 9 | $> sh sync.sh mobile_Jun_15_2013
10 | ```
11 | 
12 | That's all there is to it. The sync script handles all the necessary processing:
13 | 
14 | * Archives are fetched from archive.org (and cached locally)
15 | * Archived CSV is transformed to BigQuery compatible escaping
16 |   * You will need +pigz+ installed for parallel compression
17 | * Request files are split into <1GB compressed CSV's
18 | * Resulting pages and request data is synced to a Google Storage bucket
19 | * BigQuery import is kicked off for each of compressed archives on Google Storage
20 | 
21 | After the upload is complete, a copy of the latest tables can be made with:
22 | 
23 | ```bash
24 | $> bq.py cp runs.2013_06_15_pages runs.latest_pages
25 | $> bq.py cp runs.2013_06_15_pages_mobile runs.latest_pages_mobile
26 | $> bq.py cp runs.2013_06_15_requests runs.latest_requests
27 | $> bq.py cp runs.2013_06_15_requests_mobile runs.latest_requests_mobile
28 | ```
29 | 
30 | (MIT License) - Copyright (c) 2013 Ilya Grigorik
31 | 


--------------------------------------------------------------------------------
/config/storage-cors.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "origin": [
 4 |       "http://beta.httparchive.org",
 5 |       "https://beta.httparchive.org",
 6 |       "http://httparchive.org",
 7 |       "https://httparchive.org",
 8 |       "http://www.httparchive.org",
 9 |       "https://www.httparchive.org",
10 |       "http://staging.httparchive.org",
11 |       "https://staging.httparchive.org",
12 |       "http://localhost:8080",
13 |       "http://127.0.0.1:8080",
14 |       "http://httparchive.appspot.com",
15 |       "https://httparchive.appspot.com",
16 |       "http://httparchive-staging.appspot.com",
17 |       "https://httparchive-staging.appspot.com",
18 |       "https://drupal.httparchive.org",
19 |       "https://magento.httparchive.org",
20 |       "https://wordpress.httparchive.org"
21 |     ],
22 |     "responseHeader": ["Content-Type"],
23 |     "method": ["GET", "HEAD"],
24 |     "maxAgeSeconds": 3600
25 |   }
26 | ]
27 | 


--------------------------------------------------------------------------------
/crontab:
--------------------------------------------------------------------------------
 1 | #0 15 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_csv.sh `date +\%b_1_\%Y`'  >> /var/log/HAimport.log 2>&1
 2 | #0  8 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_csv.sh mobile_`date +\%b_1_\%Y`'  >> /var/log/HAimport.log 2>&1
 3 | 
 4 | #0 10 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_har.sh chrome' >> /var/log/HA-import-har-chrome.log 2>&1
 5 | #0 11 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_har.sh android' >> /var/log/HA-import-har-android.log 2>&1
 6 | 
 7 | # Attempt to run the reports everyday
 8 | 0  8 * * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date "+\%Y_\%m_01"` -l ALL' >> /var/log/generate_reports.log 2>&1
 9 | 
10 | # Run the reports on the 2nd to pick up blink table updates
11 | 0  7 2 * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date -d "-1 month" "+\%Y_\%m_01"` -l ALL' >> /var/log/generate_last_months_reports.log 2>&1
12 | 
13 | # Run the CrUX reports on 15th
14 | 0  7 15 * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date -d "-1 month" "+\%Y_\%m_01"` -r "*crux*" -l ALL' >> /var/log/crux_reruns.log 2>&1
15 | 


--------------------------------------------------------------------------------
/dataflow/java/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | 


--------------------------------------------------------------------------------
/dataflow/java/README.md:
--------------------------------------------------------------------------------
1 | # Loading data
2 | 
3 | ```
4 | mvn compile exec:java -Dexec.mainClass=com.httparchive.dataflow.BigQueryImport -Dexec.args="--project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=BlockingDataflowPipelineRunner --input=desktop-Oct_15_2015 --workerMachineType=n1-standard-4"
5 | ```
6 | 
7 | ## Installing Java on Debian
8 | - https://www.digitalocean.com/community/tutorials/how-to-manually-install-oracle-java-on-a-debian-or-ubuntu-vps
9 | 


--------------------------------------------------------------------------------
/dataflow/java/nb-configuration.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project-shared-configuration>
 3 |     <!--
 4 | This file contains additional configuration written by modules in the NetBeans IDE.
 5 | The configuration is intended to be shared among all the users of project and
 6 | therefore it is assumed to be part of version control checkout.
 7 | Without this configuration present, some functionality in the IDE may be limited or fail altogether.
 8 | -->
 9 |     <properties xmlns="http://www.netbeans.org/ns/maven-properties-data/1">
10 |         <!--
11 | Properties that influence various parts of the IDE, especially code formatting and the like. 
12 | You can copy and paste the single properties, into the pom.xml file and the IDE will pick them up.
13 | That way multiple projects can share the same settings (useful for formatting rules for example).
14 | Any value defined here will override the pom.xml file value but is only applicable to the current project.
15 | -->
16 |         <netbeans.hint.jdkPlatform>JDK_1.7</netbeans.hint.jdkPlatform>
17 |         <org-netbeans-modules-javascript2-requirejs.enabled>true</org-netbeans-modules-javascript2-requirejs.enabled>
18 |     </properties>
19 | </project-shared-configuration>
20 | 


--------------------------------------------------------------------------------
/dataflow/java/nbactions.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <actions>
 3 |         <action>
 4 |             <actionName>run</actionName>
 5 |             <packagings>
 6 |                 <packaging>jar</packaging>
 7 |             </packagings>
 8 |             <goals>
 9 |                 <goal>process-classes</goal>
10 |                 <goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
11 |             </goals>
12 |             <properties>
13 |                 <exec.args>-classpath %classpath com.httparchive.dataflow.BigQueryImport --project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=DirectPipelineRunner --input=test-Apr_25_2017</exec.args>
14 |                 <exec.executable>java</exec.executable>
15 |                 <Env.PATH>/Users/igrigorik/google-cloud-sdk/bin:/usr/local/git/current/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin/g4bin:/usr/local/git/bin</Env.PATH>
16 |             </properties>
17 |         </action>
18 |         <action>
19 |             <actionName>debug</actionName>
20 |             <packagings>
21 |                 <packaging>jar</packaging>
22 |             </packagings>
23 |             <goals>
24 |                 <goal>process-classes</goal>
25 |                 <goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
26 |             </goals>
27 |             <properties>
28 |                 <exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath com.httparchive.dataflow.BigQueryImport --project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=DirectPipelineRunner --input=test-Apr_25_2017</exec.args>
29 |                 <exec.executable>java</exec.executable>
30 |                 <Env.PATH>/Users/igrigorik/google-cloud-sdk/bin:/usr/local/git/current/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin/g4bin:/usr/local/git/bin</Env.PATH>
31 |                 <jpda.listen>true</jpda.listen>
32 |             </properties>
33 |         </action>
34 |         <action>
35 |             <actionName>profile</actionName>
36 |             <packagings>
37 |                 <packaging>jar</packaging>
38 |             </packagings>
39 |             <goals>
40 |                 <goal>process-classes</goal>
41 |                 <goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
42 |             </goals>
43 |             <properties>
44 |                 <exec.args>-classpath %classpath com.httparchive.dataflow.BigQueryImport --project=httparchive --stagingLocation=gs://httparchive/dataflow/staging --runner=DirectPipelineRunner --input=test-Apr_25_2017</exec.args>
45 |                 <exec.executable>java</exec.executable>
46 |             </properties>
47 |         </action>
48 |     </actions>
49 | 


--------------------------------------------------------------------------------
/dataflow/java/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |     <modelVersion>4.0.0</modelVersion>
 4 |     <groupId>com.httparchive</groupId>
 5 |     <artifactId>Dataflow</artifactId>
 6 |     <version>1.0-SNAPSHOT</version>
 7 |     <packaging>jar</packaging>
 8 |     <properties>
 9 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
10 |         <maven.compiler.source>1.7</maven.compiler.source>
11 |         <maven.compiler.target>1.7</maven.compiler.target>
12 |     </properties>
13 | 
14 |     <dependencies>
15 |         <dependency>
16 |             <groupId>com.google.cloud.dataflow</groupId>
17 |                 <artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
18 |             <version>1.9.0</version>
19 |         </dependency>
20 |         <dependency>
21 |             <groupId>org.slf4j</groupId>
22 |             <artifactId>slf4j-simple</artifactId>
23 |             <version>1.7.21</version>
24 |         </dependency>
25 |         <dependency>
26 |             <groupId>org.slf4j</groupId>
27 |             <artifactId>slf4j-api</artifactId>
28 |             <version>1.7.21</version>
29 |         </dependency>
30 |     </dependencies>
31 | </project>
32 | 


--------------------------------------------------------------------------------
/dataflow/java/src/main/java/com/httparchive/dataflow/GcsPathCoder.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package com.httparchive.dataflow;
 7 | 
 8 | import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
 9 | import com.google.cloud.dataflow.sdk.coders.Coder.Context;
10 | import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
11 | import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
12 | 
13 | import java.io.IOException;
14 | import java.io.InputStream;
15 | import java.io.OutputStream;
16 | import java.nio.ByteBuffer;
17 | 
18 | public class GcsPathCoder extends AtomicCoder<GcsPath> {
19 | 
20 |     public static GcsPathCoder of() {
21 |         return INSTANCE;
22 |     }
23 | 
24 |     @Override
25 |     public void encode(GcsPath value, OutputStream outStream, Context context)
26 |             throws IOException {
27 |         String strValue = value.toResourceName();
28 |         StringUtf8Coder.of().encode(strValue, outStream, context);
29 |     }
30 | 
31 |     public GcsPath decode(ByteBuffer in) {
32 |         return GcsPath.fromResourceName(in.toString());
33 |     }
34 | 
35 |     @Override
36 |     public GcsPath decode(InputStream inStream, Context context) throws IOException {
37 |         try {
38 |             String strValue = StringUtf8Coder.of().decode(inStream, context);
39 |             return GcsPath.fromResourceName(strValue);
40 |         } catch (IOException e) {
41 |             System.out.println("Failed to decode GcsPath: " + e);
42 |             System.out.println(inStream);
43 |             System.out.println(context);
44 |             throw e;
45 |         }
46 |     }
47 | 
48 |     private static final GcsPathCoder INSTANCE = new GcsPathCoder();
49 | 
50 |     /**
51 |      * TableCell can hold arbitrary Object instances, which makes the encoding
52 |      * non-deterministic.
53 |      *
54 |      * @return
55 |      */
56 |     @Deprecated
57 |     public boolean isDeterministic() {
58 |         return false;
59 |     }
60 | 
61 |     @Override
62 |     public void verifyDeterministic() throws NonDeterministicException {
63 |         throw new NonDeterministicException(this,
64 |                 "HAR can hold arbitrary instances which may be non-deterministic.");
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/dataflow/python/.gitignore:
--------------------------------------------------------------------------------
1 | env
2 | lib
3 | adblock.egg*
4 | local
5 | credentials/auth.json
6 | __pycache__


--------------------------------------------------------------------------------
/dataflow/python/README.md:
--------------------------------------------------------------------------------
 1 | # HTTP Archive Python Dataflow
 2 | 
 3 | ## Installation
 4 | 
 5 | Follow the [Quickstart using Python](https://cloud.google.com/dataflow/docs/quickstarts/quickstart-python#before-you-begin) guide.
 6 | 
 7 | 1. Create and activate a Python 3 `virtualenv`:
 8 | 
 9 |   ```
10 |   python -m virtualenv --python=python3 --clear env
11 |   source env/bin/activate
12 |   ```
13 | 
14 | 2. Install dependencies:
15 | 
16 |   ```
17 |   pip install -r requirements.txt
18 |   ```
19 | 
20 | 3. Create a service account key, save it to `credentials/cert.json`, and set the environment variable:
21 | 
22 | ```
23 | export GOOGLE_APPLICATION_CREDENTIALS="./credentials/auth.json"
24 | ```
25 | 
26 | This needs to be reset during the startup of every shell.
27 | 
28 | ## Running the pipeline
29 | 
30 | 1. Activate the Python virtual environment:
31 | 
32 |   ```
33 |   source env/bin/activate
34 |   ```
35 | 
36 | 2. Run `bigquery_import.py`:
37 | 
38 |   ```
39 | python bigquery_import.py \
40 |   --runner=DataflowRunner \
41 |   --project=httparchive \
42 |   --temp_location=gs://httparchive/dataflow/temp \
43 |   --staging_location=gs://httparchive/dataflow/staging \
44 |   --region=us-west1 \
45 |   --machine_type=n1-standard-32 \
46 |   --input=android-Dec_1_2020 \
47 |   --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd
48 |   ```
49 | 
50 |   The `--runner=DataflowRunner` option forces the pipeline to run in the cloud using Dataflow. To run locally, omit this option. Be aware that crawls consume TB of disk space, so only run locally using subsetted input datasets. To create a subset dataset, copy a few HAR files on GCS to a new directory.
51 | 
52 | 3. Decativate the virtual environment:
53 | 
54 |   ```
55 |   deactivate
56 |   ```
57 | 


--------------------------------------------------------------------------------
/dataflow/python/adblock.py:
--------------------------------------------------------------------------------
 1 | """A easylist classifier."""
 2 | 
 3 | from __future__ import absolute_import
 4 | from adblockparser import AdblockRules
 5 | 
 6 | import argparse
 7 | import logging
 8 | import re2 as re
 9 | 
10 | import google.cloud.dataflow as df
11 | 
12 | class EasylistClassifyDoFn(df.DoFn):
13 |   def process(self, *args):
14 |     row, classifiers = args[0].element, args[1]
15 |     row['type'] = ''
16 | 
17 |     for (name, classifier) in classifiers.items():
18 |       # TODO: add script initiator check
19 |       if classifier.should_block(row['url'], {
20 |                   'domain': row['domain'],
21 |                   'third-party': row['third_party']
22 |                 }):
23 |         row['type'] = name
24 |         print row
25 |         break
26 | 
27 |     del row['domain']
28 |     del row['third_party']
29 |     yield row
30 | 
31 | def run(argv=None):
32 |   parser = argparse.ArgumentParser()
33 |   parser.add_argument('--input',
34 |                       dest='input',
35 |                       required=True,
36 |                       help='BigQuery request input table.')
37 |   parser.add_argument('--output',
38 |                       dest='output',
39 |                       help='BigQuery output table.')
40 |   known_args, pipeline_args = parser.parse_known_args(argv)
41 | 
42 |   output_table = '%s' % known_args.output
43 |   input_query = """
44 |     SELECT
45 |       page, url,
46 |       DOMAIN(page) as domain,
47 |       IF (DOMAIN(page) == DOMAIN(url), false, true) AS third_party,
48 |     FROM [%s]
49 |   """ % known_args.input
50 | 
51 |   classifiers = {}
52 |   for file in ['ad', 'tracker', 'social']:
53 |     rules = [line.rstrip('\n') for line in open('local/'+file+'.txt')]
54 |     classifier = AdblockRules(rules,
55 |                     supported_options=['domain', 'third-party'],
56 |                     skip_unsupported_rules=False, use_re2=True)
57 |     del rules
58 |     classifiers[file] = classifier
59 | 
60 |   p = df.Pipeline(argv=pipeline_args)
61 | 
62 |   (p
63 |   | df.Read('read', df.io.BigQuerySource(query=input_query))
64 |   | df.ParDo('classify', EasylistClassifyDoFn(), classifiers)
65 |   # | df.io.Write('write', df.io.TextFileSink('out')))
66 |   | df.Write('write', df.io.BigQuerySink(
67 |       output_table,
68 |       schema='page:STRING, url:STRING, type:STRING',
69 |       create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
70 |       write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))
71 | 
72 |   p.run()
73 | 
74 | if __name__ == '__main__':
75 |   logging.getLogger().setLevel(logging.INFO)
76 |   run()
77 | 


--------------------------------------------------------------------------------
/dataflow/python/bigquery_import.py:
--------------------------------------------------------------------------------
  1 | """HTTP Archive dataflow pipeline for generating HAR data on BigQuery."""
  2 | 
  3 | from __future__ import absolute_import
  4 | 
  5 | import argparse
  6 | from copy import deepcopy
  7 | from datetime import datetime
  8 | from hashlib import sha256
  9 | import json
 10 | import logging
 11 | import re
 12 | 
 13 | import apache_beam as beam
 14 | import apache_beam.io.gcp.gcsio as gcsio
 15 | from apache_beam.options.pipeline_options import PipelineOptions
 16 | from apache_beam.options.pipeline_options import SetupOptions
 17 | 
 18 | 
 19 | # BigQuery can handle rows up to 100 MB.
 20 | MAX_CONTENT_SIZE = 2 * 1024 * 1024
 21 | # Number of times to partition the requests tables.
 22 | NUM_PARTITIONS = 4
 23 | 
 24 | 
 25 | def get_page(har):
 26 |   """Parses the page from a HAR object."""
 27 | 
 28 |   if not har:
 29 |     return
 30 | 
 31 |   page = har.get('log').get('pages')[0]
 32 |   url = page.get('_URL')
 33 | 
 34 |   metadata = page.get('_metadata')
 35 |   if metadata:
 36 |     # The page URL from metadata is more accurate.
 37 |     # See https://github.com/HTTPArchive/data-pipeline/issues/48
 38 |     url = metadata.get('tested_url', url)
 39 | 
 40 |   try:
 41 |     payload_json = to_json(page)
 42 |   except:
 43 |     logging.warning('Skipping pages payload for "%s": unable to stringify as JSON.' % url)
 44 |     return
 45 | 
 46 |   payload_size = len(payload_json)
 47 |   if payload_size > MAX_CONTENT_SIZE:
 48 |     logging.warning('Skipping pages payload for "%s": payload size (%s) exceeds the maximum content size of %s bytes.' % (url, payload_size, MAX_CONTENT_SIZE))
 49 |     return
 50 | 
 51 |   return [{
 52 |     'url': url,
 53 |     'payload': payload_json
 54 |   }]
 55 | 
 56 | 
 57 | def get_page_url(har):
 58 |   """Parses the page URL from a HAR object."""
 59 | 
 60 |   page = get_page(har)
 61 | 
 62 |   if not page:
 63 |     logging.warning('Unable to get URL from page (see preceding warning).')
 64 |     return
 65 | 
 66 |   return page[0].get('url')
 67 | 
 68 | 
 69 | def partition_step(fn, har, index):
 70 |   """Partitions functions across multiple concurrent steps."""
 71 | 
 72 |   logging.info(f'partitioning step {fn}, index {index}')
 73 | 
 74 |   if not har:
 75 |     logging.warning('Unable to partition step, null HAR.')
 76 |     return
 77 | 
 78 |   page = har.get('log').get('pages')[0]
 79 |   metadata = page.get('_metadata')
 80 |   if metadata.get('crawl_depth') and metadata.get('crawl_depth') != '0':
 81 |     # Only home pages have a crawl depth of 0.
 82 |     return
 83 | 
 84 |   page_url = get_page_url(har)
 85 | 
 86 |   if not page_url:
 87 |     logging.warning('Skipping HAR: unable to get page URL (see preceding warning).')
 88 |     return
 89 | 
 90 |   hash = hash_url(page_url)
 91 |   if hash % NUM_PARTITIONS != index:
 92 |     logging.info(f'Skipping partition. {hash} % {NUM_PARTITIONS} != {index}')
 93 |     return
 94 |   
 95 |   return fn(har)
 96 | 
 97 | 
 98 | def get_requests(har):
 99 |   """Parses the requests from a HAR object."""
100 | 
101 |   if not har:
102 |     return
103 | 
104 |   page_url = get_page_url(har)
105 | 
106 |   if not page_url:
107 |     # The page_url field indirectly depends on the get_page function.
108 |     # If the page data is unavailable for whatever reason, skip its requests.
109 |     logging.warning('Skipping requests payload: unable to get page URL (see preceding warning).')
110 |     return
111 | 
112 |   entries = har.get('log').get('entries')
113 | 
114 |   requests = []
115 | 
116 |   for request in entries:
117 | 
118 |     request_url = request.get('_full_url')
119 | 
120 |     try:
121 |       payload = to_json(trim_request(request))
122 |     except:
123 |       logging.warning('Skipping requests payload for "%s": unable to stringify as JSON.' % request_url)
124 |       continue
125 | 
126 |     payload_size = len(payload)
127 |     if payload_size > MAX_CONTENT_SIZE:
128 |       logging.warning('Skipping requests payload for "%s": payload size (%s) exceeded maximum content size of %s bytes.' % (request_url, payload_size, MAX_CONTENT_SIZE))
129 |       continue
130 | 
131 |     requests.append({
132 |       'page': page_url,
133 |       'url': request_url,
134 |       'payload': payload
135 |     })
136 | 
137 |   return requests
138 | 
139 | 
140 | def trim_request(request):
141 |   """Removes redundant fields from the request object."""
142 | 
143 |   # Make a copy first so the response body can be used later.
144 |   request = deepcopy(request)
145 |   request.get('response').get('content').pop('text', None)
146 |   return request
147 | 
148 | 
149 | def hash_url(url):
150 |   """Hashes a given URL to a process-stable integer value."""
151 |   return int(sha256(url.encode('utf-8')).hexdigest(), 16)
152 | 
153 | 
154 | def get_response_bodies(har):
155 |   """Parses response bodies from a HAR object."""
156 | 
157 |   page_url = get_page_url(har)
158 |   requests = har.get('log').get('entries')
159 | 
160 |   response_bodies = []
161 | 
162 |   for request in requests:
163 |     request_url = request.get('_full_url')
164 |     body = None
165 |     if request.get('response') and request.get('response').get('content'):
166 |         body = request.get('response').get('content').get('text', None)
167 | 
168 |     if body == None:
169 |       continue
170 | 
171 |     truncated = len(body) > MAX_CONTENT_SIZE
172 |     if truncated:
173 |       logging.warning('Truncating response body for "%s". Response body size %s exceeds limit %s.' % (request_url, len(body), MAX_CONTENT_SIZE))
174 | 
175 |     response_bodies.append({
176 |       'page': page_url,
177 |       'url': request_url,
178 |       'body': body[:MAX_CONTENT_SIZE],
179 |       'truncated': truncated
180 |     })
181 | 
182 |   return response_bodies
183 | 
184 | 
185 | def get_technologies(har):
186 |   """Parses the technologies from a HAR object."""
187 | 
188 |   if not har:
189 |     return
190 | 
191 |   page = har.get('log').get('pages')[0]
192 |   page_url = page.get('_URL')
193 |   app_names = page.get('_detected_apps', {})
194 |   categories = page.get('_detected', {})
195 | 
196 |   # When there are no detected apps, it appears as an empty array.
197 |   if isinstance(app_names, list):
198 |     app_names = {}
199 |     categories = {}
200 | 
201 |   app_map = {}
202 |   app_list = []
203 |   for app, info_list in app_names.items():
204 |     if not info_list:
205 |       continue
206 |     # There may be multiple info values. Add each to the map.
207 |     for info in info_list.split(','):
208 |       app_id = '%s %s' % (app, info) if len(info) > 0 else app
209 |       app_map[app_id] = app
210 | 
211 |   for category, apps in categories.items():
212 |     for app_id in apps.split(','):
213 |       app = app_map.get(app_id)
214 |       info = ''
215 |       if app == None:
216 |         app = app_id
217 |       else:
218 |         info = app_id[len(app):].strip()
219 |       app_list.append({
220 |         'url': page_url,
221 |         'category': category,
222 |         'app': app,
223 |         'info': info
224 |       })
225 | 
226 |   return app_list
227 | 
228 | 
229 | def get_lighthouse_reports(har):
230 |   """Parses Lighthouse results from a HAR object."""
231 | 
232 |   if not har:
233 |     return
234 | 
235 |   report = har.get('_lighthouse')
236 | 
237 |   if not report:
238 |     return
239 | 
240 |   page_url = get_page_url(har)
241 | 
242 |   if not page_url:
243 |     logging.warning('Skipping lighthouse report: unable to get page URL (see preceding warning).')
244 |     return
245 | 
246 |   # Omit large UGC.
247 |   report.get('audits').get('screenshot-thumbnails', {}).get('details', {}).pop('items', None)
248 | 
249 |   try:
250 |     report_json = to_json(report)
251 |   except:
252 |     logging.warning('Skipping Lighthouse report for "%s": unable to stringify as JSON.' % page_url)
253 |     return
254 | 
255 |   report_size = len(report_json)
256 |   if report_size > MAX_CONTENT_SIZE:
257 |     logging.warning('Skipping Lighthouse report for "%s": Report size (%s) exceeded maximum content size of %s bytes.' % (page_url, report_size, MAX_CONTENT_SIZE))
258 |     return
259 | 
260 |   return [{
261 |     'url': page_url,
262 |     'report': report_json
263 |   }]
264 | 
265 | 
266 | def to_json(obj):
267 |   """Returns a JSON representation of the object.
268 | 
269 |   This method attempts to mirror the output of the
270 |   legacy Java Dataflow pipeline. For the most part,
271 |   the default `json.dumps` config does the trick,
272 |   but there are a few settings to make it more consistent:
273 | 
274 |   - Omit whitespace between properties
275 |   - Do not escape non-ASCII characters (preserve UTF-8)
276 | 
277 |   One difference between this Python implementation and the
278 |   Java implementation is the way long numbers are handled.
279 |   A Python-serialized JSON string might look like this:
280 | 
281 |     "timestamp":1551686646079.9998
282 | 
283 |   while the Java-serialized string uses scientific notation:
284 | 
285 |     "timestamp":1.5516866460799998E12
286 | 
287 |   Out of a sample of 200 actual request objects, this was
288 |   the only difference between implementations. This can be
289 |   considered an improvement.
290 |   """
291 | 
292 |   if not obj:
293 |     raise ValueError
294 | 
295 |   return json.dumps(obj, separators=(',', ':'), ensure_ascii=False)
296 | 
297 | 
298 | def from_json(str):
299 |   """Returns an object from the JSON representation."""
300 | 
301 |   try:
302 |     return json.loads(str)
303 |   except Exception as e:
304 |     logging.error('Unable to parse JSON object "%s...": %s' % (str[:50], e))
305 |     return
306 | 
307 | 
308 | def get_gcs_dir(release):
309 |   """Formats a release string into a gs:// directory."""
310 | 
311 |   return 'gs://httparchive/crawls/%s/' % release
312 | 
313 | 
314 | def gcs_list(gcs_dir):
315 |   """Lists all files in a GCS directory."""
316 |   gcs = gcsio.GcsIO()
317 |   return gcs.list_prefix(gcs_dir)
318 | 
319 | 
320 | def get_bigquery_uri(release, dataset):
321 |   """Formats a release string into a BigQuery dataset/table."""
322 | 
323 |   client, date_string = release.split('-')
324 | 
325 |   if client == 'chrome':
326 |     client = 'desktop'
327 |   elif client == 'android':
328 |     client = 'mobile'
329 | 
330 |   date_obj = datetime.strptime(date_string, '%b_%d_%Y') # Mar_01_2020
331 |   date_string = date_obj.strftime('%Y_%m_%d') # 2020_03_01
332 | 
333 |   return 'httparchive:%s.%s_%s' % (dataset, date_string, client)
334 | 
335 | 
336 | def run(argv=None):
337 |   """Constructs and runs the BigQuery import pipeline."""
338 |   parser = argparse.ArgumentParser()
339 |   parser.add_argument(
340 |       '--input',
341 |       required=True,
342 |       help='Input Cloud Storage directory to process.')
343 |   known_args, pipeline_args = parser.parse_known_args(argv)
344 |   pipeline_options = PipelineOptions(pipeline_args)
345 |   pipeline_options.view_as(SetupOptions).save_main_session = True
346 | 
347 | 
348 |   with beam.Pipeline(options=pipeline_options) as p:
349 |     gcs_dir = get_gcs_dir(known_args.input)
350 | 
351 |     hars = (p
352 |       | beam.Create([gcs_dir])
353 |       | beam.io.ReadAllFromText()
354 |       | 'MapJSON' >> beam.Map(from_json))
355 | 
356 |     for i in range(NUM_PARTITIONS):
357 |       (hars
358 |         | f'MapPages{i}' >> beam.FlatMap(
359 |           (lambda i: lambda har: partition_step(get_page, har, i))(i))
360 |         | f'WritePages{i}' >> beam.io.WriteToBigQuery(
361 |           get_bigquery_uri(known_args.input, 'pages'),
362 |           schema='url:STRING, payload:STRING',
363 |           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
364 |           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
365 | 
366 |       (hars
367 |         | f'MapTechnologies{i}' >> beam.FlatMap(
368 |           (lambda i: lambda har: partition_step(get_technologies, har, i))(i))
369 |         | f'WriteTechnologies{i}' >> beam.io.WriteToBigQuery(
370 |           get_bigquery_uri(known_args.input, 'technologies'),
371 |           schema='url:STRING, category:STRING, app:STRING, info:STRING',
372 |           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
373 |           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
374 | 
375 |       (hars
376 |         | f'MapLighthouseReports{i}' >> beam.FlatMap(
377 |           (lambda i: lambda har: partition_step(get_lighthouse_reports, har, i))(i))
378 |         | f'WriteLighthouseReports{i}' >> beam.io.WriteToBigQuery(
379 |           get_bigquery_uri(known_args.input, 'lighthouse'),
380 |           schema='url:STRING, report:STRING',
381 |           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
382 |           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
383 |       (hars
384 |         | f'MapRequests{i}' >> beam.FlatMap(
385 |           (lambda i: lambda har: partition_step(get_requests, har, i))(i))
386 |         | f'WriteRequests{i}' >> beam.io.WriteToBigQuery(
387 |           get_bigquery_uri(known_args.input, 'requests'),
388 |           schema='page:STRING, url:STRING, payload:STRING',
389 |           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
390 |           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
391 | 
392 |       (hars
393 |         | f'MapResponseBodies{i}' >> beam.FlatMap(
394 |           (lambda i: lambda har: partition_step(get_response_bodies, har, i))(i))
395 |         | f'WriteResponseBodies{i}' >> beam.io.WriteToBigQuery(
396 |           get_bigquery_uri(known_args.input, 'response_bodies'),
397 |           schema='page:STRING, url:STRING, body:STRING, truncated:BOOLEAN',
398 |           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
399 |           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
400 | 
401 | 
402 | if __name__ == '__main__':
403 |   logging.getLogger().setLevel(logging.INFO)
404 |   run()
405 | 


--------------------------------------------------------------------------------
/dataflow/python/get_rules.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ ! -f easyprivacy.txt ]; then
 4 |   wget --no-check-certificate https://easylist-downloads.adblockplus.org/easyprivacy.txt -O local/tracker.txt
 5 | fi
 6 | 
 7 | if [ ! -f easylist_noelemhide.txt ]; then
 8 |   wget --no-check-certificate https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt -O local/ad.txt
 9 | fi
10 | 
11 | if [ ! -f fanboy-annoyance.txt ]; then
12 |   wget --no-check-certificate https://easylist-downloads.adblockplus.org/fanboy-annoyance.txt -O local/social.txt
13 | fi
14 | 
15 | 


--------------------------------------------------------------------------------
/dataflow/python/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-beam[gcp]==2.31
2 | 


--------------------------------------------------------------------------------
/dataflow/python/run.sh:
--------------------------------------------------------------------------------
 1 | # Omit the runner option to run the pipeline locally.
 2 | #--runner=DataflowRunner \
 3 | python bigquery_import.py \
 4 |   --runner=DataflowRunner \
 5 |   --project=httparchive \
 6 |   --temp_location=gs://httparchive/dataflow/temp \
 7 |   --staging_location=gs://httparchive/dataflow/staging \
 8 |   --region=us-west1 \
 9 |   --machine_type=n1-standard-32 \
10 |   --input=android-Jul_1_2021 \
11 |   --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd
12 | 


--------------------------------------------------------------------------------
/dataflow/python/setup.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import setuptools
 3 | 
 4 | from setuptools.command.bdist_egg import bdist_egg as _bdist_egg
 5 | 
 6 | class bdist_egg(_bdist_egg):  # pylint: disable=invalid-name
 7 |   def run(self):
 8 |     self.run_command('CustomCommands')
 9 |     _bdist_egg.run(self)
10 | 
11 | # Some custom command to run during setup.
12 | CUSTOM_COMMANDS = [
13 |   ['apt-get', 'update'],
14 |   ['apt-get', '--assume-yes', 'install', 'libre2-1', 'libre2-dev'],
15 | ]
16 | 
17 | class CustomCommands(setuptools.Command):
18 |   def initialize_options(self):
19 |     pass
20 | 
21 |   def finalize_options(self):
22 |     pass
23 | 
24 |   def RunCustomCommand(self, command_list):
25 |     print 'Running command: %s' % command_list
26 |     p = subprocess.Popen(
27 |         command_list,
28 |         stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
29 |     # Can use communicate(input='y\n'.encode()) if the command run requires
30 |     # some confirmation.
31 |     stdout_data, _ = p.communicate()
32 |     print 'Command output: %s' % stdout_data
33 |     if p.returncode != 0:
34 |       raise RuntimeError(
35 |           'Command %s failed: exit code: %s' % (command_list, p.returncode))
36 | 
37 |   def run(self):
38 |     for command in CUSTOM_COMMANDS:
39 |       self.RunCustomCommand(command)
40 | 
41 | 
42 | # Configure the required packages and scripts to install.
43 | REQUIRED_PACKAGES = [
44 |   'adblockparser',
45 |   're2'
46 | ]
47 | 
48 | setuptools.setup(
49 |     name='adblock',
50 |     version='0.0.1',
51 |     description='adblock pipeline',
52 |     install_requires=REQUIRED_PACKAGES,
53 |     packages=setuptools.find_packages(),
54 |     cmdclass={
55 |         # Command class instantiated and run during easy_install scenarios.
56 |         'bdist_egg': bdist_egg,
57 |         'CustomCommands': CustomCommands,
58 |         }
59 |     )
60 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
  1 | # BigQuery Pipeline
  2 | 
  3 | ## Dataflow
  4 | 
  5 | TODO
  6 | 
  7 | ## JSON Generation
  8 | 
  9 | After each crawl, the [generate_reports.sh](../sql/generate_reports.sh) script is run with the date of the crawl. For example:
 10 | 
 11 | ```sh
 12 | sql/generate_reports.sh -t -h 2017_09_01
 13 | ```
 14 | 
 15 | This will generate timeseries and histogram reports for all metrics using predefined SQL queries. The histogram queries will fill table placeholders with the crawl date provided. For example:
 16 | 
 17 | ```sql
 18 | SELECT ... FROM `httparchive.runs.${YYYY_MM_DD}_pages* ...`
 19 | ```
 20 | 
 21 | will become
 22 | 
 23 | ```sql
 24 | SELECT ... FROM `httparchive.runs.2017_09_01_pages* ...`
 25 | ```
 26 | 
 27 | After executing the histogram/timeseries queries for each metric on BigQuery, the results will be saved as JSON on Google Storage. For example, the `bytesJS` histogram would be saved to `gs://httparchive/reports/2017_09_01/bytesJS.json`. The timeseries for the same metric would be saved to `gs://httparchive/reports/bytesJS.json`.
 28 | 
 29 | ### Running Manually
 30 | 
 31 | Sometimes it's necessary to manually run this process, for example if a new metric is added or specific dates need to be backfilled. The generate_reports.sh script can be run with a different configuration of flags depending on your needs. From the script's documentation:
 32 | 
 33 | ```sh
 34 | # Flags:
 35 | #
 36 | #   -t: Whether to generate timeseries.
 37 | #       Note to run in incremental mode also need to use -h to pass date
 38 | #
 39 | #   -h: Whether to generate histograms. Must be accompanied by the date to query.
 40 | #
 41 | #   -f: Whether to force histogram querying and updating even if the data exists.
 42 | #       Timeseries are usually appended to from last date, but this flag forces a complete rerun
 43 | #
 44 | #   -l: Which lens to run.
 45 | #       Can also be set to ALL to run both the base (lens-less) report and all lenses.
 46 | #
 47 | #   -r: Optional pattern match for reports to be run. Use quotes to avoid the shell expanding names
 48 | #       (e.g. "*crux*")
 49 | ```
 50 | 
 51 | You can omit one of the `-t` or `-h` flags to focus only on histogram or timeseries generation. The `-f` flag ensures that histogram data gets overwritten. Omit this flag to skip queries for dates that already exist (much faster for batch jobs, see below).
 52 | 
 53 | ### Getting Dates Dynamically
 54 | 
 55 | If you're adding a new metric, it would be a pain to run the generation script manually for each date. HTTP Archive has over 300 crawls worth of dated tables in BigQuery! The [getBigQueryDates.sh](../sql/getBigQueryDates.sh) script can be used to get all of the dates in `YYYY_MM_DD` format for a particular table type. For example, if your new metric depends on the `pages` tables of the `runs` dataset (eg `httparchive.runs.2017_09_01_pages`), you could get the dates representing all of the matiching tables by running this command:
 56 | 
 57 | ```sh
 58 | sql/getBigQueryDates.sh runs pages
 59 | ```
 60 | 
 61 | Or if you want to limit the results to a particular range, you can pass in upper and lower bounds:
 62 | 
 63 | ```sh
 64 | sql/getBigQueryDates.sh runs pages 2015_01_01 2015_12_15
 65 | ```
 66 | 
 67 | The output of this script is a newline-delimited list of dates. This format enables convenient piping of the output as input to the generate_reports.sh script. For example:
 68 | 
 69 | ```sh
 70 | sql/getBigQueryDates.sh runs pages | \
 71 |   xargs -I date sql/generate_reports.sh -h date
 72 | ```
 73 | 
 74 | `xargs` handles the processing of each date and calls the other script.
 75 | 
 76 | ### Generating Specific Metrics
 77 | 
 78 | _TODO: document `sql/generate_report.sh`. This updates one histogram/timeseries at a time._
 79 | 
 80 | Running `generate_reports.sh` without the `-f` flag will result in metrics whose JSON results are already on Google Storage to skip being requeried. To regenerate results for specific metrics, the easiest thing to do may be to remove its results from Google Storage first, rather than running with the `-f` flag enabled and waiting for all other metrics to be queried and uploaded.
 81 | 
 82 | For example, if a change is made to the `reqTotal.sql` histogram query, then you can "invalidate" all histogram results for this query by deleting all respective JSON files from Google Storage:
 83 | 
 84 | ```sh
 85 | gsutil rm gs://httparchive/reports/*/reqTotal.json
 86 | ```
 87 | 
 88 | The wildcard in the YYYY_MM_DD position will instruct `gsutil` to delete all histogram results for this specific metric.
 89 | 
 90 | Now you can delete more metric-specific results or rerun `generate_reports.sh` without the `-f` flag and only the desired metrics will be requeried.
 91 | 
 92 | Note that cdn.httparchive.org may still contain the old version of the JSON file for the duration of the TTL. See below for more on invalidating the cache.
 93 | 
 94 | ## Serving the JSON Files
 95 | 
 96 | The Google Storage bucket is behind an App Engine load balancer and CDN, which is aliased as [https://cdn.httparchive.org](https://cdn.httparchive.org). Accessing the JSON data follows the same pattern as the `gs://` URL. For example, the public URL for `gs://httparchive/reports/2017_09_01/bytesJS.json` is [https://cdn.httparchive.org/reports/2017_09_01/bytesJS.json](https://cdn.httparchive.org/reports/2017_09_01/bytesJS.json). Each file is configured to be served with `Content-Type: application/json` and `Cache-Control: public, max-age=3600` headers.
 97 | 
 98 | The cache lifetime is set to 1 hour. If the cache needs to be invalidated for a particular file, this can be done by an administrator in the App Engine dashboard.
 99 | 
100 | A whitelist of origins are allowed to access the CDN. This list is maintained in [config/storage-cors.json](../config/storage-cors.json) and is configured to allow development, staging, and production servers. To save changes to this file, run:
101 | 
102 | ```sh
103 | gsutil cors set config/storage-cors.json gs://httparchive`
104 | ```
105 | 
106 | This will update the CORS settings for the Google Storage bucket.
107 | 


--------------------------------------------------------------------------------
/schema/.sqlfluffignore:
--------------------------------------------------------------------------------
1 | httparchive_schema.sql
2 | 


--------------------------------------------------------------------------------
/schema/pages.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   {
  3 |     "name": "pageid",
  4 |     "type": "INTEGER"
  5 |   },
  6 |   {
  7 |     "name": "createDate",
  8 |     "type": "INTEGER"
  9 |   },
 10 |   {
 11 |     "name": "archive",
 12 |     "type": "STRING"
 13 |   },
 14 |   {
 15 |     "name": "label",
 16 |     "type": "STRING"
 17 |   },
 18 |   {
 19 |     "name": "crawlid",
 20 |     "type": "INTEGER"
 21 |   },
 22 |   {
 23 |     "name": "wptid",
 24 |     "type": "STRING"
 25 |   },
 26 |   {
 27 |     "name": "wptrun",
 28 |     "type": "INTEGER"
 29 |   },
 30 |   {
 31 |     "name": "url",
 32 |     "type": "STRING"
 33 |   },
 34 |   {
 35 |     "name": "urlShort",
 36 |     "type": "STRING"
 37 |   },
 38 |   {
 39 |     "name": "urlhash",
 40 |     "type": "INTEGER"
 41 |   },
 42 |   {
 43 |     "name": "cdn",
 44 |     "type": "STRING"
 45 |   },
 46 |   {
 47 |     "name": "startedDateTime",
 48 |     "type": "INTEGER"
 49 |   },
 50 |   {
 51 |     "name": "TTFB",
 52 |     "type": "INTEGER"
 53 |   },
 54 |   {
 55 |     "name": "renderStart",
 56 |     "type": "INTEGER"
 57 |   },
 58 |   {
 59 |     "name": "onContentLoaded",
 60 |     "type": "INTEGER"
 61 |   },
 62 |   {
 63 |     "name": "onLoad",
 64 |     "type": "INTEGER"
 65 |   },
 66 |   {
 67 |     "name": "fullyLoaded",
 68 |     "type": "INTEGER"
 69 |   },
 70 |   {
 71 |     "name": "visualComplete",
 72 |     "type": "INTEGER"
 73 |   },
 74 |   {
 75 |     "name": "PageSpeed",
 76 |     "type": "INTEGER"
 77 |   },
 78 |   {
 79 |     "name": "SpeedIndex",
 80 |     "type": "INTEGER"
 81 |   },
 82 |   {
 83 |     "name": "rank",
 84 |     "type": "INTEGER"
 85 |   },
 86 |   {
 87 |     "name": "reqTotal",
 88 |     "type": "INTEGER"
 89 |   },
 90 |   {
 91 |     "name": "reqHtml",
 92 |     "type": "INTEGER"
 93 |   },
 94 |   {
 95 |     "name": "reqJS",
 96 |     "type": "INTEGER"
 97 |   },
 98 |   {
 99 |     "name": "reqCSS",
100 |     "type": "INTEGER"
101 |   },
102 |   {
103 |     "name": "reqImg",
104 |     "type": "INTEGER"
105 |   },
106 |   {
107 |     "name": "reqGif",
108 |     "type": "INTEGER"
109 |   },
110 |   {
111 |     "name": "reqJpg",
112 |     "type": "INTEGER"
113 |   },
114 |   {
115 |     "name": "reqPng",
116 |     "type": "INTEGER"
117 |   },
118 |   {
119 |     "name": "reqFont",
120 |     "type": "INTEGER"
121 |   },
122 |   {
123 |     "name": "reqFlash",
124 |     "type": "INTEGER"
125 |   },
126 |   {
127 |     "name": "reqJson",
128 |     "type": "INTEGER"
129 |   },
130 |   {
131 |     "name": "reqOther",
132 |     "type": "INTEGER"
133 |   },
134 |   {
135 |     "name": "bytesTotal",
136 |     "type": "INTEGER"
137 |   },
138 |   {
139 |     "name": "bytesHtml",
140 |     "type": "INTEGER"
141 |   },
142 |   {
143 |     "name": "bytesJS",
144 |     "type": "INTEGER"
145 |   },
146 |   {
147 |     "name": "bytesCSS",
148 |     "type": "INTEGER"
149 |   },
150 |   {
151 |     "name": "bytesImg",
152 |     "type": "INTEGER"
153 |   },
154 |   {
155 |     "name": "bytesGif",
156 |     "type": "INTEGER"
157 |   },
158 |   {
159 |     "name": "bytesJpg",
160 |     "type": "INTEGER"
161 |   },
162 |   {
163 |     "name": "bytesPng",
164 |     "type": "INTEGER"
165 |   },
166 |   {
167 |     "name": "bytesFont",
168 |     "type": "INTEGER"
169 |   },
170 |   {
171 |     "name": "bytesFlash",
172 |     "type": "INTEGER"
173 |   },
174 |   {
175 |     "name": "bytesJson",
176 |     "type": "INTEGER"
177 |   },
178 |   {
179 |     "name": "bytesOther",
180 |     "type": "INTEGER"
181 |   },
182 |   {
183 |     "name": "bytesHtmlDoc",
184 |     "type": "INTEGER"
185 |   },
186 |   {
187 |     "name": "numDomains",
188 |     "type": "INTEGER"
189 |   },
190 |   {
191 |     "name": "maxDomainReqs",
192 |     "type": "INTEGER"
193 |   },
194 |   {
195 |     "name": "numRedirects",
196 |     "type": "INTEGER"
197 |   },
198 |   {
199 |     "name": "numErrors",
200 |     "type": "INTEGER"
201 |   },
202 |   {
203 |     "name": "numGlibs",
204 |     "type": "INTEGER"
205 |   },
206 |   {
207 |     "name": "numHttps",
208 |     "type": "INTEGER"
209 |   },
210 |   {
211 |     "name": "numCompressed",
212 |     "type": "INTEGER"
213 |   },
214 |   {
215 |     "name": "numDomElements",
216 |     "type": "INTEGER"
217 |   },
218 |   {
219 |     "name": "maxageNull",
220 |     "type": "INTEGER"
221 |   },
222 |   {
223 |     "name": "maxage0",
224 |     "type": "INTEGER"
225 |   },
226 |   {
227 |     "name": "maxage1",
228 |     "type": "INTEGER"
229 |   },
230 |   {
231 |     "name": "maxage30",
232 |     "type": "INTEGER"
233 |   },
234 |   {
235 |     "name": "maxage365",
236 |     "type": "INTEGER"
237 |   },
238 |   {
239 |     "name": "maxageMore",
240 |     "type": "INTEGER"
241 |   },
242 |   {
243 |     "name": "gzipTotal",
244 |     "type": "INTEGER"
245 |   },
246 |   {
247 |     "name": "gzipSavings",
248 |     "type": "INTEGER"
249 |   },
250 |   {
251 |     "name": "_connections",
252 |     "type": "INTEGER"
253 |   },
254 |   {
255 |     "name": "_adult_site",
256 |     "type": "BOOLEAN"
257 |   },
258 |   {
259 |     "name": "avg_dom_depth",
260 |     "type": "INTEGER"
261 |   },
262 |   {
263 |     "name": "document_height",
264 |     "type": "INTEGER"
265 |   },
266 |   {
267 |     "name": "document_width",
268 |     "type": "INTEGER"
269 |   },
270 |   {
271 |     "name": "localstorage_size",
272 |     "type": "INTEGER"
273 |   },
274 |   {
275 |     "name": "sessionstorage_size",
276 |     "type": "INTEGER"
277 |   },
278 |   {
279 |     "name": "num_iframes",
280 |     "type": "INTEGER"
281 |   },
282 |   {
283 |     "name": "num_scripts",
284 |     "type": "INTEGER"
285 |   },
286 |   {
287 |     "name": "doctype",
288 |     "type": "STRING"
289 |   },
290 |   {
291 |     "name": "meta_viewport",
292 |     "type": "STRING"
293 |   },
294 |   {
295 |     "name": "reqAudio",
296 |     "type": "INTEGER"
297 |   },
298 |   {
299 |     "name": "reqVideo",
300 |     "type": "INTEGER"
301 |   },
302 |   {
303 |     "name": "reqText",
304 |     "type": "INTEGER"
305 |   },
306 |   {
307 |     "name": "reqXml",
308 |     "type": "INTEGER"
309 |   },
310 |   {
311 |     "name": "reqWebp",
312 |     "type": "INTEGER"
313 |   },
314 |   {
315 |     "name": "reqSvg",
316 |     "type": "INTEGER"
317 |   },
318 |   {
319 |     "name": "bytesAudio",
320 |     "type": "INTEGER"
321 |   },
322 |   {
323 |     "name": "bytesVideo",
324 |     "type": "INTEGER"
325 |   },
326 |   {
327 |     "name": "bytesText",
328 |     "type": "INTEGER"
329 |   },
330 |   {
331 |     "name": "bytesXml",
332 |     "type": "INTEGER"
333 |   },
334 |   {
335 |     "name": "bytesWebp",
336 |     "type": "INTEGER"
337 |   },
338 |   {
339 |     "name": "bytesSvg",
340 |     "type": "INTEGER"
341 |   },
342 |   {
343 |     "name": "num_scripts_async",
344 |     "type": "INTEGER"
345 |   },
346 |   {
347 |     "name": "num_scripts_sync",
348 |     "type": "INTEGER"
349 |   },
350 |   {
351 |     "name": "usertiming",
352 |     "type": "INTEGER"
353 |   }
354 | ]
355 | 


--------------------------------------------------------------------------------
/schema/requests.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   {
  3 |     "name": "requestid",
  4 |     "type": "INTEGER"
  5 |   },
  6 |   {
  7 |     "name": "pageid",
  8 |     "type": "INTEGER"
  9 |   },
 10 |   {
 11 |     "name": "startedDateTime",
 12 |     "type": "INTEGER"
 13 |   },
 14 |   {
 15 |     "name": "time",
 16 |     "type": "INTEGER"
 17 |   },
 18 |   {
 19 |     "name": "method",
 20 |     "type": "STRING"
 21 |   },
 22 |   {
 23 |     "name": "url",
 24 |     "type": "STRING"
 25 |   },
 26 |   {
 27 |     "name": "urlShort",
 28 |     "type": "STRING"
 29 |   },
 30 |   {
 31 |     "name": "redirectUrl",
 32 |     "type": "STRING"
 33 |   },
 34 |   {
 35 |     "name": "firstReq",
 36 |     "type": "BOOLEAN"
 37 |   },
 38 |   {
 39 |     "name": "firstHtml",
 40 |     "type": "BOOLEAN"
 41 |   },
 42 |   {
 43 |     "name": "reqHttpVersion",
 44 |     "type": "STRING"
 45 |   },
 46 |   {
 47 |     "name": "reqHeadersSize",
 48 |     "type": "INTEGER"
 49 |   },
 50 |   {
 51 |     "name": "reqBodySize",
 52 |     "type": "INTEGER"
 53 |   },
 54 |   {
 55 |     "name": "reqCookieLen",
 56 |     "type": "INTEGER"
 57 |   },
 58 |   {
 59 |     "name": "reqOtherHeaders",
 60 |     "type": "STRING"
 61 |   },
 62 |   {
 63 |     "name": "status",
 64 |     "type": "INTEGER"
 65 |   },
 66 |   {
 67 |     "name": "respHttpVersion",
 68 |     "type": "STRING"
 69 |   },
 70 |   {
 71 |     "name": "respHeadersSize",
 72 |     "type": "INTEGER"
 73 |   },
 74 |   {
 75 |     "name": "respBodySize",
 76 |     "type": "INTEGER"
 77 |   },
 78 |   {
 79 |     "name": "respSize",
 80 |     "type": "INTEGER"
 81 |   },
 82 |   {
 83 |     "name": "respCookieLen",
 84 |     "type": "INTEGER"
 85 |   },
 86 |   {
 87 |     "name": "expAge",
 88 |     "type": "INTEGER"
 89 |   },
 90 |   {
 91 |     "name": "mimeType",
 92 |     "type": "STRING"
 93 |   },
 94 |   {
 95 |     "name": "respOtherHeaders",
 96 |     "type": "STRING"
 97 |   },
 98 |   {
 99 |     "name": "req_accept",
100 |     "type": "STRING"
101 |   },
102 |   {
103 |     "name": "req_accept_charset",
104 |     "type": "STRING"
105 |   },
106 |   {
107 |     "name": "req_accept_encoding",
108 |     "type": "STRING"
109 |   },
110 |   {
111 |     "name": "req_accept_language",
112 |     "type": "STRING"
113 |   },
114 |   {
115 |     "name": "req_connection",
116 |     "type": "STRING"
117 |   },
118 |   {
119 |     "name": "req_host",
120 |     "type": "STRING"
121 |   },
122 |   {
123 |     "name": "req_if_modified_since",
124 |     "type": "STRING"
125 |   },
126 |   {
127 |     "name": "req_if_none_match",
128 |     "type": "STRING"
129 |   },
130 |   {
131 |     "name": "req_referer",
132 |     "type": "STRING"
133 |   },
134 |   {
135 |     "name": "req_user_agent",
136 |     "type": "STRING"
137 |   },
138 |   {
139 |     "name": "resp_accept_ranges",
140 |     "type": "STRING"
141 |   },
142 |   {
143 |     "name": "resp_age",
144 |     "type": "STRING"
145 |   },
146 |   {
147 |     "name": "resp_cache_control",
148 |     "type": "STRING"
149 |   },
150 |   {
151 |     "name": "resp_connection",
152 |     "type": "STRING"
153 |   },
154 |   {
155 |     "name": "resp_content_encoding",
156 |     "type": "STRING"
157 |   },
158 |   {
159 |     "name": "resp_content_language",
160 |     "type": "STRING"
161 |   },
162 |   {
163 |     "name": "resp_content_length",
164 |     "type": "STRING"
165 |   },
166 |   {
167 |     "name": "resp_content_location",
168 |     "type": "STRING"
169 |   },
170 |   {
171 |     "name": "resp_content_type",
172 |     "type": "STRING"
173 |   },
174 |   {
175 |     "name": "resp_date",
176 |     "type": "STRING"
177 |   },
178 |   {
179 |     "name": "resp_etag",
180 |     "type": "STRING"
181 |   },
182 |   {
183 |     "name": "resp_expires",
184 |     "type": "STRING"
185 |   },
186 |   {
187 |     "name": "resp_keep_alive",
188 |     "type": "STRING"
189 |   },
190 |   {
191 |     "name": "resp_last_modified",
192 |     "type": "STRING"
193 |   },
194 |   {
195 |     "name": "resp_location",
196 |     "type": "STRING"
197 |   },
198 |   {
199 |     "name": "resp_pragma",
200 |     "type": "STRING"
201 |   },
202 |   {
203 |     "name": "resp_server",
204 |     "type": "STRING"
205 |   },
206 |   {
207 |     "name": "resp_transfer_encoding",
208 |     "type": "STRING"
209 |   },
210 |   {
211 |     "name": "resp_vary",
212 |     "type": "STRING"
213 |   },
214 |   {
215 |     "name": "resp_via",
216 |     "type": "STRING"
217 |   },
218 |   {
219 |     "name": "resp_x_powered_by",
220 |     "type": "STRING"
221 |   },
222 |   {
223 |     "name": "_cdn_provider",
224 |     "type": "STRING"
225 |   },
226 |   {
227 |     "name": "_gzip_save",
228 |     "type": "INTEGER"
229 |   },
230 |   {
231 |     "name": "crawlid",
232 |     "type": "INTEGER"
233 |   },
234 |   {
235 |     "name": "type",
236 |     "type": "STRING"
237 |   },
238 |   {
239 |     "name": "ext",
240 |     "type": "STRING"
241 |   },
242 |   {
243 |     "name": "format",
244 |     "type": "STRING"
245 |   }
246 | ]
247 | 


--------------------------------------------------------------------------------
/schema/schema.rb:
--------------------------------------------------------------------------------
 1 | require 'json'
 2 | 
 3 | def type(t)
 4 |   case t
 5 |   when /tinyint/ then "BOOLEAN"
 6 |   when /int/ then "INTEGER"
 7 |   when /varchar|text/ then "STRING"
 8 |   end
 9 | end
10 | 
11 | def scan(table)
12 |   schema = []
13 |   s = IO.read('httparchive_schema.sql')
14 | 
15 |   m = s.match(/CREATE\sTABLE\s`#{table}`\s\((.*?)PRIMARY\sKEY/m)[1]
16 |   m.split("\n").compact.each do |f|
17 |     next if f.strip.empty?
18 |     fm = f.strip.match(/`(.*?)`\s(\w+)/m)
19 | 
20 |     schema << {
21 |       "name" => fm[1],
22 |       "type" => type(fm[2])
23 |     }
24 |   end
25 | 
26 |   schema
27 | end
28 | 
29 | jj scan(ARGV[0])
30 | 


--------------------------------------------------------------------------------
/sql/.sqlfluff:
--------------------------------------------------------------------------------
 1 | [sqlfluff]
 2 | ## verbose is an integer (0-2) indicating the level of log output
 3 | verbose = 0
 4 | ## Turn off color formatting of output
 5 | nocolor = False
 6 | ## Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html
 7 | ## Or run 'sqlfluff dialects'
 8 | dialect = bigquery
 9 | ## One of [raw|jinja|python|placeholder]
10 | templater = jinja
11 | ## Comma separated list of rules to check, or None for all
12 | rules = None
13 | ## Comma separated list of rules to exclude, or None
14 | exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
15 | # AL04 - Asks for unique table aliases meaning it complains if selecting from two 2021_07_01 tables as implicit alias is table name (not fully qualified) so same.
16 | # AL07 - Avoid aliases in from and join - why?
17 | # AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all.
18 | # AM05 - INNER JOIN must be fully qualified. Probably should use this but not our style.
19 | # CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case
20 | # CP03 - Function names will be mixed case so don't enforce case
21 | # CV02 - Use COALESCE instead of IFNULL or NVL. We think ISNULL is clearer.
22 | # CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558
23 | # LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future?
24 | # LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future?
25 | # LT14 - Keywords on newline. We have some simple, single line joins
26 | # RF01 - BigQuery uses STRUCTS which can look like incorrect table references
27 | # RF02 - Asks for qualified columns for ambiguous ones, but we not qualify our columns, and they are not really ambiguous (or BigQuery would complain)
28 | # RF03 - Insists on references in column names even if not ambiguous. Bit OTT.
29 | # RF04 - Avoids keywords as identifiers but we use this a lot (e.g. AS count, AS max...etc.)
30 | # ST07 - Uses joins instead of USING - why?
31 | # ST06 - Insists on wildcards (*) in certain SELECT order - why?
32 | # ST01 - Do not use ELSE NULL as redundant. But it's clearer!?
33 | # ST05 - Use CTEs instead of subqueries. We don't use this consistently and big rewrite to do that.
34 | # ST02 - Use coalesce instead of case if you can. But it's clearer!?
35 | 
36 | [sqlfluff:indentation]
37 | tab_space_size = 2
38 | indent_unit = space
39 | indented_using_on = False
40 | 
41 | [sqlfluff:layout:type:binary_operator]
42 | line_position = trailing
43 | 
44 | [sqlfluff:templater:jinja:context]
45 | BLINK_DATE_JOIN="AND 1=2"
46 | 
47 | [tool.sqlfluff.rules.capitalisation.keywords]
48 | capitalisation_policy = "upper"
49 | 
50 | [sqlfluff:rules:convention.count_rows]
51 | # Consistent syntax to count all rows
52 | prefer_count_0 = True
53 | 
54 | [sqlfluff:rules:references.special_chars]
55 | # Special characters in identifiers
56 | additional_allowed_characters = ".-${}"
57 | 


--------------------------------------------------------------------------------
/sql/.sqlfluffignore:
--------------------------------------------------------------------------------
1 | /lens/*/crux_histograms.sql
2 | /lens/*/crux_timeseries.sql
3 | /lens/*/histograms.sql
4 | /lens/*/timeseries.sql
5 | 


--------------------------------------------------------------------------------
/sql/addDate.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | /**
 3 |  * Adds a single date to dates.json if it doesn't already exist.
 4 |  *
 5 |  * Usage:
 6 |  *
 7 |  *     node sql/addDate.js 2017_09_01
 8 |  *
 9 |  */
10 | 
11 | const fs = require('fs');
12 | 
13 | 
14 | const date = process.argv[2];
15 | if (!date) {
16 |   console.error(`You must pass a YYYY_MM_DD-formatted date as input. For example:
17 |   sql/addDate.js 2017_09_01`);
18 |   process.exit(1);
19 | }
20 | 
21 | fs.readFile('config/dates.json', 'utf8', (err, data) => {
22 |   if (err) {
23 |     console.error(err);
24 |     return;
25 |   }
26 | 
27 |   // Use a set to dedupe.
28 |   let dates = new Set(JSON.parse(data));
29 |   dates.add(date);
30 |   dates = Array.from(dates).sort((a, b) => {
31 |     return a > b ? -1 : 1;
32 |   });
33 | 
34 |   const dateStr = JSON.stringify(dates, null, 2) + '\n';
35 | 
36 |   // Update the config file.
37 |   fs.writeFile('config/dates.json', dateStr, 'utf8', (err) => {
38 |     if (err) {
39 |       console.error(err);
40 |     }
41 | 
42 |     console.log('Updated config/dates.json');
43 |   });
44 | });
45 | 


--------------------------------------------------------------------------------
/sql/delete_date_from_reports.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Removes a particular date JSON from timeseries reports on Google Storage.
  4 | #
  5 | # Usage:
  6 | #
  7 | #   $ sql/delete_date_from_reports.sh -d YYYY_MM_DD
  8 | #   $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k
  9 | #   $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k -r "*crux*"
 10 | #
 11 | # Flags:
 12 | #
 13 | #   -l: Optional name of the report lens to generate, eg "top10k".
 14 | #
 15 | #   -r: Optional name of the report files to generate, eg "*crux*".
 16 | #
 17 | 
 18 | set -o pipefail
 19 | 
 20 | LENS_ARG=""
 21 | REPORTS="*"
 22 | VERBOSE=0
 23 | NO_CHANGES=0
 24 | 
 25 | # Read the flags.
 26 | while getopts ":nvd:l:r:" opt; do
 27 |   case "${opt}" in
 28 |     d)
 29 |       YYYY_MM_DD=${OPTARG}
 30 |       ;;
 31 |     v)
 32 |       VERBOSE=1
 33 |       ;;
 34 |     n)
 35 |       NO_CHANGES=1
 36 |       ;;
 37 |     l)
 38 |       LENS_ARG=${OPTARG}
 39 |       ;;
 40 |     r)
 41 |       REPORTS=${OPTARG}
 42 |       ;;
 43 |   esac
 44 | done
 45 | 
 46 | if [[ "${YYYY_MM_DD}" == "" ]]; then
 47 |   echo "Usage $0 -d 2021_12_01"
 48 |   exit 1
 49 | fi
 50 | 
 51 | echo "${YYYY_MM_DD}"
 52 | 
 53 | # Run all timeseries queries.
 54 | for query in sql/timeseries/$REPORTS.sql; do
 55 | 
 56 |   if [[ ! -f $query ]]; then
 57 |     echo "Nothing to do"
 58 |     continue;
 59 |   fi
 60 | 
 61 |   # Extract the metric name from the file path.
 62 |   metric=$(echo $(basename $query) | cut -d"." -f1)
 63 | 
 64 |   if [[ "${LENS_ARG}" == "" ]]; then
 65 |     LENSES=("")
 66 |     echo "Deleting ${metric} report for base"
 67 |   elif [[ "${LENS_ARG}" == "ALL" ]]; then
 68 |     LENSES=("" $(ls sql/lens))
 69 |     echo "Deleting ${metric} report for base and all lenses"
 70 |   else
 71 |     LENSES=("${LENS_ARG}")
 72 |     echo "Deleting ${metric} report for one lens"
 73 |   fi
 74 | 
 75 |   for LENS in "${LENSES[@]}"
 76 |   do
 77 | 
 78 |     gs_lens_dir=""
 79 |     if [[ $LENS != "" ]]; then
 80 |       gs_lens_dir="$LENS/"
 81 |     fi
 82 | 
 83 |     current_contents=""
 84 |     gs_url="gs://httparchive/reports/$gs_lens_dir${metric}.json"
 85 |     gsutil ls $gs_url &> /dev/null
 86 | 
 87 |     if [ $? -eq 0 ]; then
 88 | 
 89 |       echo "Updating this query: ${metric} for LENS: ${LENS}"
 90 | 
 91 |       # The file exists, so remove the requested date
 92 |       current_contents=$(gsutil cat $gs_url)
 93 | 
 94 |       if [ ${VERBOSE} -eq 1 ]; then
 95 |         echo "Current JSON:"
 96 |         echo "${current_contents}\n"
 97 |       fi
 98 | 
 99 |       new_contents=$(echo "$current_contents" | jq -c --indent 1 --arg date "${YYYY_MM_DD}" '.[] | select(.date!=$date)' | tr -d '\n' | sed 's/^/[ /' | sed 's/}$/ } ]\n/' | sed 's/}{/ }, {/g')
100 | 
101 |       if [ ${VERBOSE} -eq 1 ]; then
102 |         echo "New JSON:"
103 |         echo "${new_contents}"
104 |       fi
105 | 
106 |       # Make sure the removal succeeded.
107 |       if [ $? -eq 0 ] && [ ${NO_CHANGES} -eq 0 ]; then
108 | 
109 |         # Upload the response to Google Storage.
110 |         echo "Uploading new file to Google Storage"
111 |         echo $new_contents \
112 |           | gsutil  -h "Content-Type:application/json" cp - $gs_url
113 |       else
114 |         echo $new_contents >&2
115 |       fi
116 |     fi
117 |   done
118 | done
119 | 
120 | echo -e "Done"
121 | 


--------------------------------------------------------------------------------
/sql/generate_reports.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Updates the JSON reports on Google Storage with the latest BigQuery data.
  4 | #
  5 | # Usage:
  6 | #
  7 | #   $ sql/generateReports.sh -t -h YYYY_MM_DD
  8 | #
  9 | # Flags:
 10 | #
 11 | #   -t: Whether to generate timeseries.
 12 | #
 13 | #   -h: Whether to generate histograms. Must be accompanied by the date to query.
 14 | #
 15 | #   -f: Whether to force querying and updating even if the data exists.
 16 | #
 17 | #   -l: Optional name of the report lens to generate, eg "top10k".
 18 | #
 19 | #   -r: Optional name of the report files to generate, eg "*crux*".
 20 | #
 21 | 
 22 | set -o pipefail
 23 | 
 24 | BQ_CMD="bq --format prettyjson --project_id httparchive query --max_rows 1000000"
 25 | FORCE=0
 26 | GENERATE_HISTOGRAM=0
 27 | GENERATE_TIMESERIES=0
 28 | LENS_ARG=""
 29 | REPORTS="*"
 30 | VERBOSE=0
 31 | 
 32 | # Read the flags.
 33 | while getopts ":ftvh:l:r:" opt; do
 34 |   case "${opt}" in
 35 |     h)
 36 |       GENERATE_HISTOGRAM=1
 37 |       YYYY_MM_DD=${OPTARG}
 38 |       dateParts=(`echo ${OPTARG} | tr "_" "\\n"`)
 39 |       YYYYMM=${dateParts[0]}${dateParts[1]}
 40 |       DATE=${dateParts[0]}-${dateParts[1]}-${dateParts[2]}
 41 |       ;;
 42 |     t)
 43 |       GENERATE_TIMESERIES=1
 44 |       ;;
 45 |     v)
 46 |       VERBOSE=1
 47 |       ;;
 48 |     f)
 49 |       FORCE=1
 50 |       ;;
 51 |     l)
 52 |       LENS_ARG=${OPTARG}
 53 |       ;;
 54 |     r)
 55 |       REPORTS=${OPTARG}
 56 |       ;;
 57 |   esac
 58 | done
 59 | 
 60 | # Exit early if there is nothing to do.
 61 | if [ $GENERATE_HISTOGRAM -eq 0 -a $GENERATE_TIMESERIES -eq 0 ]; then
 62 |   echo -e "You must provide one or both -t or -h flags." >&2
 63 |   echo -e "For example: sql/generateReports.sh -t -h 2017_08_01" >&2
 64 |   exit 1
 65 | fi
 66 | 
 67 | # Check if all tables for the given date are available in BigQuery.
 68 | # Tables representing desktop/mobile and HAR/CSV data sources must exist.
 69 | DATED_TABLES_READY=0
 70 | if [ -n "$YYYY_MM_DD" ]; then
 71 |   echo "Checking if tables are ready for ${DATE}..."
 72 |   DESKTOP_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1)
 73 |   DESKTOP_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1)
 74 |   MOBILE_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1)
 75 |   MOBILE_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1)
 76 |   DESKTOP_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1)
 77 |   DESKTOP_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1)
 78 |   MOBILE_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1)
 79 |   MOBILE_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1)
 80 |   echo "Finished checking if dates are ready"
 81 |   if [[ "$DESKTOP_ROOT_PAGES_EXIST" == true && "$DESKTOP_NON_ROOT_PAGES_EXIST" == true && "$MOBILE_ROOT_PAGES_EXIST" == true && "$MOBILE_NON_ROOT_PAGES_EXIST" == true && "$DESKTOP_ROOT_REQUESTS_EXIST" == true && "$DESKTOP_NON_ROOT_REQUESTS_EXIST" == true && "$MOBILE_ROOT_REQUESTS_EXIST" == true && "$MOBILE_NON_ROOT_REQUESTS_EXIST" == true ]]; then
 82 |     DATED_TABLES_READY=1
 83 |   fi
 84 | fi
 85 | if [ $GENERATE_HISTOGRAM -ne 0 -a $DATED_TABLES_READY -ne 1 ]; then
 86 |   echo -e "The BigQuery tables for $DATE are not available." >&2
 87 | 
 88 |   # List table data for debugging
 89 |   echo $(date)
 90 |   echo "Desktop root pages ready: ${DESKTOP_ROOT_PAGES_EXIST}"
 91 |   echo "Desktop non-root pages ready: ${DESKTOP_NON_ROOT_PAGES_EXIST}"
 92 |   echo "Mobile root pages ready: ${MOBILE_ROOT_PAGES_EXIST}"
 93 |   echo "Mobile non-root pages ready: ${MOBILE_NON_ROOT_PAGES_EXIST}"
 94 |   echo "Desktop root requests ready: ${DESKTOP_ROOT_REQUESTS_EXIST}"
 95 |   echo "Desktop non-root requests ready: ${DESKTOP_NON_ROOT_REQUESTS_EXIST}"
 96 |   echo "Mobile root requests ready: ${MOBILE_ROOT_REQUESTS_EXIST}"
 97 |   echo "Mobile non-root requests ready: ${MOBILE_NON_ROOT_REQUESTS_EXIST}"
 98 |   exit 1
 99 | fi
100 | 
101 | if [ $GENERATE_HISTOGRAM -eq 0 ]; then
102 |   echo -e "Skipping histograms"
103 | else
104 |   echo -e "Generating histograms for date $DATE"
105 | 
106 |   # Run all histogram queries.
107 |   for query in sql/histograms/$REPORTS.sql; do
108 | 
109 |     if [[ ! -f $query ]]; then
110 |       echo "Nothing to do"
111 |       continue;
112 |     fi
113 | 
114 |     # Extract the metric name from the file path.
115 |     # For example, `sql/histograms/foo.sql` will produce `foo`.
116 |     metric=$(echo $(basename $query) | cut -d"." -f1)
117 | 
118 |     echo -e "Generating $metric histogram"
119 | 
120 |     if [[ "${LENS_ARG}" == "" ]]; then
121 |       LENSES=("")
122 |       echo "Generating ${metric} report for base"
123 |     elif [[ "${LENS_ARG}" == "ALL" ]]; then
124 |       LENSES=("" $(ls sql/lens))
125 |       echo "Generating ${metric} report for base and all lenses"
126 |     else
127 |       LENSES=("${LENS_ARG}")
128 |       echo "Generating ${metric} report for one lens"
129 |     fi
130 | 
131 |     for LENS in "${LENSES[@]}"
132 |     do
133 | 
134 |       gs_lens_dir=""
135 |       if [[ $LENS != "" ]]; then
136 |         if [ ! -f "sql/lens/$LENS/histograms.sql" ] || [ ! -f "sql/lens/$LENS/timeseries.sql" ]; then
137 |           echo -e "Lens histogram/timeseries files not found in sql/lens/$LENS."
138 |           exit 1
139 |         fi
140 |         gs_lens_dir="$LENS/"
141 |       fi
142 | 
143 |       gs_url="gs://httparchive/reports/$gs_lens_dir$YYYY_MM_DD/${metric}.json"
144 |       gsutil ls $gs_url &> /dev/null
145 |       if [ $? -eq 0 ] && [ $FORCE -eq 0 ]; then
146 |         # The file already exists, so skip the query.
147 |         echo -e "Skipping $gs_lens_dir$YYYY_MM_DD/$metric histogram as already exists"
148 |         continue
149 |       fi
150 | 
151 |       # Replace the date template in the query.
152 |       if [[ $LENS != "" ]]; then
153 |         echo -e "Generating ${metric} report for $LENS"
154 |         lens_clause="$(cat sql/lens/$LENS/histograms.sql)"
155 |         lens_clause_and="$(cat sql/lens/$LENS/histograms.sql) AND"
156 |         lens_join=""
157 | 
158 |         if [[ $metric == crux* ]]; then
159 |           lens_clause=""
160 |           lens_clause_and=""
161 |           if [[ -f sql/lens/$LENS/crux_histograms.sql ]]; then
162 |             echo "Using alternative crux lens join"
163 |             lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | tr '\n' ' ')"
164 |           else
165 |             echo "CrUX queries do not support histograms for this lens so skipping"
166 |             continue
167 |           fi
168 | 
169 |           sql=$(sed -e "s/\(\`chrome-ux-report[^\`]*\`\)/\1 $lens_join/" $query \
170 |             | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \
171 |             | sed -e "s/\${YYYYMM}/$YYYYMM/g")
172 |         else
173 | 
174 |           if [[ $(grep -i "WHERE" $query) ]]; then
175 |             # If WHERE clause already exists then add to it
176 |             sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \
177 |               | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \
178 |               | sed -e "s/\${YYYYMM}/$YYYYMM/g")
179 |           else
180 |             # If WHERE clause does not exists then add it, before GROUP BY
181 |             sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \
182 |               | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \
183 |               | sed -e "s/\${YYYYMM}/$YYYYMM/g")
184 |           fi
185 |         fi
186 |       else
187 |         echo -e "Generating ${metric} report for base (no lens)"
188 |         sql=$(sed -e "s/\${YYYY-MM-DD}/$DATE/g" $query \
189 |           | sed -e "s/\${YYYYMM}/$YYYYMM/g")
190 |       fi
191 | 
192 |       if [ ${VERBOSE} -eq 1 ]; then
193 |         echo "Running this query:"
194 |         echo "${sql}\n"
195 |       fi
196 | 
197 |       # Run the histogram query on BigQuery.
198 |       START_TIME=$SECONDS
199 |       result=$(echo "${sql}" | $BQ_CMD)
200 | 
201 |       # Make sure the query succeeded.
202 |       if [ $? -eq 0 ]; then
203 |         ELAPSED_TIME=$(($SECONDS - $START_TIME))
204 |         if [[ $LENS != "" ]]; then
205 |           echo "$metric for $LENS took $ELAPSED_TIME seconds"
206 |         else
207 |           echo "$metric took $ELAPSED_TIME seconds"
208 |         fi
209 |         # Upload the response to Google Storage.
210 |         echo $result \
211 |           | gsutil  -h "Content-Type:application/json" cp - $gs_url
212 |       else
213 |         echo $result >&2
214 |       fi
215 |     done
216 |   done
217 | fi
218 | 
219 | if [ $GENERATE_TIMESERIES -eq 0 ]; then
220 |   echo -e "Skipping timeseries"
221 | else
222 |   echo -e "Generating timeseries"
223 | 
224 |   # Run all timeseries queries.
225 |   for query in sql/timeseries/$REPORTS.sql; do
226 | 
227 |     if [[ ! -f $query ]]; then
228 |       echo "Nothing to do"
229 |       continue;
230 |     fi
231 | 
232 |     # Extract the metric name from the file path.
233 |     metric=$(echo $(basename $query) | cut -d"." -f1)
234 | 
235 |     if [[ "${LENS_ARG}" == "" ]]; then
236 |       LENSES=("")
237 |       echo "Generating ${metric} report for base"
238 |     elif [[ "${LENS_ARG}" == "ALL" ]]; then
239 |       LENSES=("" $(ls sql/lens))
240 |       echo "Generating ${metric} report for base and all lenses"
241 |     else
242 |       LENSES=("${LENS_ARG}")
243 |       echo "Generating ${metric} report for one lens"
244 |     fi
245 | 
246 |     for LENS in "${LENSES[@]}"
247 |     do
248 | 
249 |       gs_lens_dir=""
250 |       if [[ $LENS != "" ]]; then
251 |         if [ ! -f "sql/lens/$LENS/histograms.sql" ] || [ ! -f "sql/lens/$LENS/timeseries.sql" ]; then
252 |           echo -e "Lens histogram/timeseries files not found in sql/lens/$LENS."
253 |           exit 1
254 |         fi
255 |         gs_lens_dir="$LENS/"
256 |       fi
257 | 
258 |       date_join=""
259 |       max_date=""
260 |       current_contents=""
261 |       gs_url="gs://httparchive/reports/$gs_lens_dir${metric}.json"
262 |       gsutil ls $gs_url &> /dev/null
263 |       if [ $? -eq 0 ]; then
264 |         # The file already exists, so check max date
265 |         current_contents=$(gsutil cat $gs_url)
266 |         max_date=$(echo $current_contents | jq -r '[ .[] | .date ] | max')
267 |         if [[ $FORCE -eq 0 && -n "${max_date}" ]]; then
268 | 
269 |           # Only run if new dates
270 |           if [[ -z "${YYYY_MM_DD}" || "${max_date}" < "${YYYY_MM_DD}" ]]; then
271 |             if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that
272 |               date_join="date > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)"
273 |               # Skip 2022_05_12 tables
274 |               date_join="${date_join}"
275 |               if [[ -n "$YYYY_MM_DD" ]]; then
276 |                 # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data)
277 |                 date_join="${date_join} AND date <= \"$DATE\""
278 |               fi
279 |             fi
280 | 
281 |             echo -e "Generating $gs_lens_dir$metric timeseries in incremental mode from ${max_date} to ${YYYY_MM_DD}"
282 | 
283 |           else
284 |             echo -e "Skipping $gs_lens_dir$metric timeseries as ${YYYY_MM_DD} already exists in the data. Run in force mode (-f) if you want to rerun."
285 |             continue
286 |           fi
287 | 
288 |         elif [[ -n "$YYYY_MM_DD" ]]; then
289 |           # Even if doing a force run we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data
290 |           if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that
291 |             # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data)
292 |             date_join="date <= \"$DATE\""
293 |             # Skip 2022_05_12 tables
294 |             date_join="${date_join}"
295 |           fi
296 | 
297 |           echo -e "Force Mode=${FORCE}. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}."
298 |         fi
299 |       elif [[ -n "$YYYY_MM_DD" ]]; then
300 |         # Even if the file does not exist we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data
301 |         if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that
302 |           date_join="date <= \"$DATE\""
303 |           # Skip 2022_05_12 tables
304 |           date_join="${date_join}"
305 |         fi
306 | 
307 |         echo -e "Timeseries does not exist. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}"
308 | 
309 |       else
310 |         echo -e "Timeseries does not exist. Generating $gs_lens_dir$metric timeseries from start"
311 |       fi
312 | 
313 |       if [[ $LENS != "" ]]; then
314 | 
315 |         if [[ $metric != crux* ]]; then
316 |           lens_clause="$(cat sql/lens/$LENS/timeseries.sql)"
317 |           lens_clause_and="$(cat sql/lens/$LENS/timeseries.sql) AND"
318 |           lens_join=""
319 |         else
320 |           echo "CrUX query so using alternative lens join"
321 |           lens_clause=""
322 |           lens_clause_and=""
323 |           lens_join="$(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')"
324 |         fi
325 | 
326 |         if [[ -n "${date_join}" ]]; then
327 |           if [[ $(grep -i "WHERE" $query) ]]; then
328 |             # If WHERE clause already exists then add to it
329 |             sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \
330 |               | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/")
331 |           else
332 |             # If WHERE clause does not exists then add it, before GROUP BY
333 |             sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause_and $date_join \1/" $query \
334 |               | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/")
335 |           fi
336 |         else
337 |           if [[ $(grep -i "WHERE" $query) ]]; then
338 |             # If WHERE clause already exists then add to it
339 |             sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \
340 |               | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/")
341 |           else
342 |             # If WHERE clause does not exists then add it, before GROUP BY
343 |             sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \
344 |               | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/")
345 |           fi
346 |         fi
347 | 
348 |       else
349 |         if [[ -n "${date_join}" ]]; then
350 |           if [[ $(grep -i "WHERE" $query) ]]; then
351 |             # If WHERE clause already exists then add to it
352 |             sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND /" $query)
353 |           else
354 |             # If WHERE clause does not exists then add it, before GROUP BY
355 |             sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query)
356 |           fi
357 |         else
358 |           sql=$(cat $query)
359 |         fi
360 |       fi
361 | 
362 |       if [ ${VERBOSE} -eq 1 ]; then
363 |         echo "Running this query:"
364 |         echo "${sql}\n"
365 |       fi
366 | 
367 |       # Run the timeseries query on BigQuery.
368 |       START_TIME=$SECONDS
369 |       result=$(echo "${sql}" | $BQ_CMD)
370 | 
371 |       # Make sure the query succeeded.
372 |       if [ $? -eq 0 ]; then
373 |         ELAPSED_TIME=$(($SECONDS - $START_TIME))
374 |         if [[ $LENS != "" ]]; then
375 |           echo "$metric for $LENS took $ELAPSED_TIME seconds"
376 |         else
377 |           echo "$metric took $ELAPSED_TIME seconds"
378 |         fi
379 | 
380 |         # If it is a partial run, then combine with the current results.
381 |         if [[ $FORCE -eq 0 && -n "${current_contents}" && $metric != crux* ]]; then
382 |           result=$(echo ${result} ${current_contents} | jq '.+= input')
383 |         fi
384 | 
385 |         # Upload the response to Google Storage.
386 |         echo $result \
387 |           | gsutil  -h "Content-Type:application/json" cp - $gs_url
388 |       else
389 |         echo $result >&2
390 |       fi
391 |     done
392 |   done
393 | fi
394 | 
395 | echo -e "Done"
396 | 


--------------------------------------------------------------------------------
/sql/get_bigquery_dates.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Gets a list of dates for a given BigQuery table.
 4 | #
 5 | # Example usage:
 6 | #
 7 | #   sql/getBigQueryDates.sh har lighthouse
 8 | #
 9 | # Where the first argument is the dataset and the
10 | # second argument is the table suffix.
11 | #
12 | # Example output:
13 | #
14 | #   2017_08_15
15 | #   2017_08_01
16 | #   2017_07_15
17 | #   2017_07_01
18 | #   2017_06_15
19 | #   2017_06_01
20 | #
21 | # May be combined with the generateReports.sh script
22 | # to generate a histogram for each date. For example:
23 | #
24 | #   sql/get_bigquery_dates.sh runs pages | \
25 | #     xargs -I date sql/generate_reports.sh -h date
26 | #
27 | 
28 | set -eo pipefail
29 | 
30 | DATASET=$1
31 | SUFFIX=$2
32 | MIN=$3
33 | MAX=$4
34 | 
35 | if [ -z "$DATASET" ]; then
36 |   echo "Dataset argument required." >&2
37 |   echo "Example usage: sql/getBigQueryDates.sh har lighthouse" >&2
38 |   exit 1
39 | fi
40 | 
41 | having=""
42 | if [ ! -z "$MIN" ] || [ ! -z "$MAX" ]; then
43 |   having="HAVING
44 | "
45 |   if [ ! -z "$MIN" ]; then
46 |     having+="  date >= \"$MIN\""
47 |     if [ ! -z "$MAX" ]; then
48 |         having+=" AND
49 | "
50 |     fi
51 |   fi
52 |   if [ ! -z "$MAX" ]; then
53 |     having+="  date <= \"$MAX\""
54 |   fi
55 |   having+="
56 | "
57 | fi
58 | 
59 | query=$(cat <<EOM
60 | #standardSQL
61 | SELECT
62 |   CONCAT('20', SUBSTR(_TABLE_SUFFIX, 0, 8)) AS date
63 | FROM
64 |   \`httparchive.$DATASET.20*\`
65 | WHERE
66 |   _TABLE_SUFFIX LIKE '%_$SUFFIX%'
67 | GROUP BY
68 |   date
69 | ${having}ORDER BY
70 |   date DESC
71 | EOM
72 | )
73 | 
74 | # Output only the resulting dates.
75 | echo "$query" | bq --quiet --format csv --project_id httparchive query --max_rows 10000  | tail -n +2
76 | 


--------------------------------------------------------------------------------
/sql/histograms/bootupJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |     HAVING
23 |       bin IS NOT NULL
24 |   )
25 | )
26 | ORDER BY
27 |   bin,
28 |   client
29 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesCss.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesFont.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesHtml.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesImg.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesOther.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesTotal.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesTotal) / 102400) * 100 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/bytesVideo.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/compileJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       INT64(payload['_cpu.v8.compile']) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |     HAVING
23 |       bin IS NOT NULL AND
24 |       bin >= 0
25 |   )
26 | )
27 | ORDER BY
28 |   bin,
29 |   client
30 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxCls.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY<STRUCT<start NUMERIC, `end` NUMERIC, density FLOAT64>>)
 3 | RETURNS ARRAY<STRUCT<client STRING, start NUMERIC, density FLOAT64>>
 4 | LANGUAGE js AS """
 5 |   // Convert into 0.01 bins and spread the density around.
 6 |   const WIDTH = 0.01;
 7 |   return (bins || []).reduce((bins, bin) => {
 8 |     bin.start = +bin.start;
 9 |     bin.end = Math.min(bin.end, bin.start + 10);
10 |     const binWidth = bin.end - bin.start;
11 |     for (let start = bin.start; start < bin.end; start += WIDTH) {
12 |       bins.push({
13 |         start,
14 |         density: bin.density / (binWidth / WIDTH)
15 |       });
16 |     }
17 |     return bins;
18 |   }, []);
19 | """;
20 | 
21 | SELECT
22 |   *,
23 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
24 | FROM (
25 |   SELECT
26 |     *,
27 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
28 |   FROM (
29 |     SELECT
30 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
31 |       bin.start AS bin,
32 |       SUM(bin.density) AS volume
33 |     FROM (
34 |       SELECT
35 |         form_factor,
36 |         spreadBins(layout_instability.cumulative_layout_shift.histogram.bin) AS bins
37 |       FROM
38 |         `chrome-ux-report.all.${YYYYMM}`
39 |     )
40 |     CROSS JOIN
41 |       UNNEST(bins) AS bin
42 |     GROUP BY
43 |       bin,
44 |       client
45 |   )
46 | )
47 | ORDER BY
48 |   bin,
49 |   client
50 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxDcl.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY<STRUCT<start INT64, `end` INT64, density FLOAT64>>)
 3 | RETURNS ARRAY<STRUCT<client STRING, start INT64, density FLOAT64>>
 4 | LANGUAGE js AS """
 5 |   // Convert into 100ms bins and spread the density around.
 6 |   const WIDTH = 100;
 7 |   return (bins || []).reduce((bins, bin) => {
 8 |     bin.start = +bin.start;
 9 |     bin.end = Math.min(bin.end, bin.start + 5000);
10 |     const binWidth = bin.end - bin.start;
11 |     for (let start = bin.start; start < bin.end; start += WIDTH) {
12 |       bins.push({
13 |         start,
14 |         density: bin.density / (binWidth / WIDTH)
15 |       });
16 |     }
17 |     return bins;
18 |   }, []);
19 | """;
20 | 
21 | SELECT
22 |   *,
23 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
24 | FROM (
25 |   SELECT
26 |     *,
27 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
28 |   FROM (
29 |     SELECT
30 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
31 |       bin.start / 1000 AS bin,
32 |       SUM(bin.density) AS volume
33 |     FROM (
34 |       SELECT
35 |         form_factor,
36 |         spreadBins(dom_content_loaded.histogram.bin) AS bins
37 |       FROM
38 |         `chrome-ux-report.all.${YYYYMM}`
39 |     )
40 |     CROSS JOIN
41 |       UNNEST(bins) AS bin
42 |     GROUP BY
43 |       bin,
44 |       client
45 |   )
46 | )
47 | ORDER BY
48 |   bin,
49 |   client
50 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxFcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY<STRUCT<start INT64, `end` INT64, density FLOAT64>>)
 3 | RETURNS ARRAY<STRUCT<client STRING, start INT64, density FLOAT64>>
 4 | LANGUAGE js AS """
 5 |   // Convert into 100ms bins and spread the density around.
 6 |   const WIDTH = 100;
 7 |   return (bins || []).reduce((bins, bin) => {
 8 |     bin.start = +bin.start;
 9 |     bin.end = Math.min(bin.end, bin.start + 5000);
10 |     const binWidth = bin.end - bin.start;
11 |     for (let start = bin.start; start < bin.end; start += WIDTH) {
12 |       bins.push({
13 |         start,
14 |         density: bin.density / (binWidth / WIDTH)
15 |       });
16 |     }
17 |     return bins;
18 |   }, []);
19 | """;
20 | 
21 | SELECT
22 |   *,
23 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
24 | FROM (
25 |   SELECT
26 |     *,
27 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
28 |   FROM (
29 |     SELECT
30 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
31 |       bin.start / 1000 AS bin,
32 |       SUM(bin.density) AS volume
33 |     FROM (
34 |       SELECT
35 |         form_factor,
36 |         spreadBins(first_contentful_paint.histogram.bin) AS bins
37 |       FROM
38 |         `chrome-ux-report.all.${YYYYMM}`
39 |     )
40 |     CROSS JOIN
41 |       UNNEST(bins) AS bin
42 |     GROUP BY
43 |       bin,
44 |       client
45 |   )
46 | )
47 | ORDER BY
48 |   bin,
49 |   client
50 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxFp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY<STRUCT<start INT64, `end` INT64, density FLOAT64>>)
 3 | RETURNS ARRAY<STRUCT<client STRING, start INT64, density FLOAT64>>
 4 | LANGUAGE js AS """
 5 |   // Convert into 100ms bins and spread the density around.
 6 |   const WIDTH = 100;
 7 |   return (bins || []).reduce((bins, bin) => {
 8 |     bin.start = +bin.start;
 9 |     bin.end = Math.min(bin.end, bin.start + 5000);
10 |     const binWidth = bin.end - bin.start;
11 |     for (let start = bin.start; start < bin.end; start += WIDTH) {
12 |       bins.push({
13 |         start,
14 |         density: bin.density / (binWidth / WIDTH)
15 |       });
16 |     }
17 |     return bins;
18 |   }, []);
19 | """;
20 | 
21 | SELECT
22 |   *,
23 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
24 | FROM (
25 |   SELECT
26 |     *,
27 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
28 |   FROM (
29 |     SELECT
30 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
31 |       bin.start / 1000 AS bin,
32 |       SUM(bin.density) AS volume
33 |     FROM (
34 |       SELECT
35 |         form_factor,
36 |         spreadBins(first_paint.histogram.bin) AS bins
37 |       FROM
38 |         `chrome-ux-report.all.${YYYYMM}`
39 |     )
40 |     CROSS JOIN
41 |       UNNEST(bins) AS bin
42 |     GROUP BY
43 |       bin,
44 |       client
45 |   )
46 | )
47 | ORDER BY
48 |   bin,
49 |   client
50 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxInp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
12 |       bin.start AS bin,
13 |       SUM(bin.density) AS volume
14 |     FROM (
15 |       SELECT
16 |         form_factor,
17 |         interaction_to_next_paint.histogram.bin AS bins
18 |       FROM
19 |         `chrome-ux-report.all.${YYYYMM}`
20 |     )
21 |     CROSS JOIN
22 |       UNNEST(bins) AS bin
23 |     GROUP BY
24 |       bin,
25 |       client
26 |   )
27 | )
28 | ORDER BY
29 |   bin,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxLcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY<STRUCT<start INT64, `end` INT64, density FLOAT64>>)
 3 | RETURNS ARRAY<STRUCT<client STRING, start INT64, density FLOAT64>>
 4 | LANGUAGE js AS """
 5 |   // Convert into 100ms bins and spread the density around.
 6 |   const WIDTH = 100;
 7 |   return (bins || []).reduce((bins, bin) => {
 8 |     bin.start = +bin.start;
 9 |     bin.end = Math.min(bin.end, bin.start + 5000);
10 |     const binWidth = bin.end - bin.start;
11 |     for (let start = bin.start; start < bin.end; start += WIDTH) {
12 |       bins.push({
13 |         start,
14 |         density: bin.density / (binWidth / WIDTH)
15 |       });
16 |     }
17 |     return bins;
18 |   }, []);
19 | """;
20 | 
21 | SELECT
22 |   *,
23 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
24 | FROM (
25 |   SELECT
26 |     *,
27 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
28 |   FROM (
29 |     SELECT
30 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
31 |       bin.start / 1000 AS bin,
32 |       SUM(bin.density) AS volume
33 |     FROM (
34 |       SELECT
35 |         form_factor,
36 |         spreadBins(largest_contentful_paint.histogram.bin) AS bins
37 |       FROM
38 |         `chrome-ux-report.all.${YYYYMM}`
39 |     )
40 |     CROSS JOIN
41 |       UNNEST(bins) AS bin
42 |     GROUP BY
43 |       bin,
44 |       client
45 |   )
46 | )
47 | ORDER BY
48 |   bin,
49 |   client
50 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxOl.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY<STRUCT<start INT64, `end` INT64, density FLOAT64>>)
 3 | RETURNS ARRAY<STRUCT<client STRING, start INT64, density FLOAT64>>
 4 | LANGUAGE js AS """
 5 |   // Convert into 100ms bins and spread the density around.
 6 |   const WIDTH = 100;
 7 |   return (bins || []).reduce((bins, bin) => {
 8 |     bin.start = +bin.start;
 9 |     bin.end = Math.min(bin.end, bin.start + 5000);
10 |     const binWidth = bin.end - bin.start;
11 |     for (let start = bin.start; start < bin.end; start += WIDTH) {
12 |       bins.push({
13 |         start,
14 |         density: bin.density / (binWidth / WIDTH)
15 |       });
16 |     }
17 |     return bins;
18 |   }, []);
19 | """;
20 | 
21 | SELECT
22 |   *,
23 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
24 | FROM (
25 |   SELECT
26 |     *,
27 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
28 |   FROM (
29 |     SELECT
30 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
31 |       bin.start / 1000 AS bin,
32 |       SUM(bin.density) AS volume
33 |     FROM (
34 |       SELECT
35 |         form_factor,
36 |         spreadBins(onload.histogram.bin) AS bins
37 |       FROM
38 |         `chrome-ux-report.all.${YYYYMM}`
39 |     )
40 |     CROSS JOIN
41 |       UNNEST(bins) AS bin
42 |     GROUP BY
43 |       bin,
44 |       client
45 |   )
46 | )
47 | ORDER BY
48 |   bin,
49 |   client
50 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxShopifyThemes.sql:
--------------------------------------------------------------------------------
  1 | #standardSQL
  2 | -- Core web vitals by Shopify theme
  3 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  4 |   good / (good + needs_improvement + poor) >= 0.75
  5 | );
  6 | 
  7 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  8 |   poor / (good + needs_improvement + poor) > 0.25
  9 | );
 10 | 
 11 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 12 |   good + needs_improvement + poor > 0
 13 | );
 14 | 
 15 | -- Test CrUX data exists
 16 | WITH crux_test AS ( -- noqa: ST03
 17 |   SELECT
 18 |     1
 19 |   FROM
 20 |     `chrome-ux-report.all.${YYYYMM}`
 21 | ),
 22 | 
 23 | -- All Shopify shops in HTTPArchive
 24 | archive_pages AS (
 25 |   SELECT
 26 |     client,
 27 |     page AS url,
 28 |     JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name,
 29 |     JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id
 30 |   FROM
 31 |     `httparchive.crawl.pages`
 32 |   WHERE
 33 |     date = '${YYYY-MM-DD}' AND
 34 |     is_root_page AND
 35 |     JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share
 36 | )
 37 | 
 38 | SELECT
 39 |   client,
 40 |   archive_pages.theme_store_id AS id,
 41 |   theme_names.theme_name AS top_theme_name,
 42 |   COUNT(DISTINCT origin) AS origins,
 43 |   -- Origins with good LCP divided by origins with any LCP.
 44 |   SAFE_DIVIDE(
 45 |     COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
 46 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
 47 |   ) AS pct_good_lcp,
 48 |   -- Origins with needs improvement are anything not good, nor poor.
 49 |   1 -
 50 |   SAFE_DIVIDE(
 51 |     COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
 52 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
 53 |   )
 54 |   -
 55 |   SAFE_DIVIDE(
 56 |     COUNT(DISTINCT IF(IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
 57 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)))
 58 |     AS pct_ni_lcp,
 59 |   -- Origins with poor LCP divided by origins with any LCP.
 60 |   SAFE_DIVIDE(
 61 |     COUNT(DISTINCT IF(IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
 62 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
 63 |   ) AS pct_poor_lcp,
 64 | 
 65 |   -- Origins with good TTFB divided by origins with any TTFB.
 66 |   SAFE_DIVIDE(
 67 |     COUNT(DISTINCT IF(IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
 68 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))
 69 |   ) AS pct_good_ttfb,
 70 |   -- Origins with needs improvement are anything not good, nor poor.
 71 |   1 -
 72 |   SAFE_DIVIDE(
 73 |     COUNT(DISTINCT IF(IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
 74 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))
 75 |   )
 76 |   -
 77 |   SAFE_DIVIDE(
 78 |     COUNT(DISTINCT IF(IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
 79 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)))
 80 |     AS pct_ni_ttfb,
 81 |   -- Origins with poor TTFB divided by origins with any TTFB.
 82 |   SAFE_DIVIDE(
 83 |     COUNT(DISTINCT IF(IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
 84 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))
 85 |   ) AS pct_poor_ttfb,
 86 | 
 87 |   -- Origins with good FCP divided by origins with any FCP.
 88 |   SAFE_DIVIDE(
 89 |     COUNT(DISTINCT IF(IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
 90 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))
 91 |   ) AS pct_good_fcp,
 92 |   -- Origins with needs improvement are anything not good, nor poor.
 93 |   1 -
 94 |   SAFE_DIVIDE(
 95 |     COUNT(DISTINCT IF(IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
 96 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))
 97 |   )
 98 |   -
 99 |   SAFE_DIVIDE(
100 |     COUNT(DISTINCT IF(IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
101 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL)))
102 |     AS pct_ni_fcp,
103 |   -- Origins with poor FCP divided by origins with any FCP.
104 |   SAFE_DIVIDE(
105 |     COUNT(DISTINCT IF(IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
106 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))
107 |   ) AS pct_poor_fcp,
108 | 
109 |   -- Origins with good INP divided by origins with any INP.
110 |   SAFE_DIVIDE(
111 |     COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)),
112 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))
113 |   ) AS pct_good_inp,
114 |   -- Origins with needs improvement are anything not good, nor poor.
115 |   1 -
116 |   SAFE_DIVIDE(
117 |     COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)),
118 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))
119 |   )
120 |   -
121 |   SAFE_DIVIDE(
122 |     COUNT(DISTINCT IF(IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)),
123 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL)))
124 |     AS pct_ni_inp,
125 |   -- Origins with poor INP divided by origins with any INP.
126 |   SAFE_DIVIDE(
127 |     COUNT(DISTINCT IF(IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)),
128 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))
129 |   ) AS pct_poor_inp,
130 | 
131 |   -- Origins with good CLS divided by origins with any CLS.
132 |   SAFE_DIVIDE(
133 |     COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
134 |     COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
135 |   ) AS pct_good_cls,
136 |   -- Origins with needs improvement are anything not good, nor poor.
137 |   1 -
138 |   SAFE_DIVIDE(
139 |     COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
140 |     COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
141 |   )
142 |   -
143 |   SAFE_DIVIDE(
144 |     COUNT(DISTINCT IF(IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)),
145 |     COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)))
146 |     AS pct_ni_cls,
147 |   -- Origins with poor CLS divided by origins with any CLS.
148 |   SAFE_DIVIDE(
149 |     COUNT(DISTINCT IF(IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)),
150 |     COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
151 |   ) AS pct_poor_cls,
152 | 
153 |   -- Origins with good LCP, INP (optional), and CLS divided by origins with any LCP and CLS.
154 |   SAFE_DIVIDE(
155 |     COUNT(DISTINCT IF(
156 |       IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
157 |       IS_GOOD(fast_inp, avg_inp, slow_inp) IS NOT FALSE AND
158 |       IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL
159 |     )),
160 |     COUNT(DISTINCT IF(
161 |       IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND
162 |       IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL
163 |     ))
164 |   ) AS pct_good_cwv
165 | FROM
166 |   `chrome-ux-report.materialized.device_summary`
167 | JOIN archive_pages
168 | ON
169 |   CONCAT(origin, '/') = url AND
170 |   IF(device = 'desktop', 'desktop', 'mobile') = client
171 | JOIN (
172 |   -- Add in top theme name for a theme store id AS this should usually be the real theme name
173 |   SELECT
174 |     COUNT(DISTINCT url) AS pages_count,
175 |     theme_store_id,
176 |     theme_name,
177 |     row_number() OVER (PARTITION BY theme_store_id ORDER BY COUNT(DISTINCT url) DESC) AS rank
178 |   FROM archive_pages
179 |   GROUP BY
180 |     theme_store_id,
181 |     theme_name
182 |   ORDER BY COUNT(DISTINCT url) DESC
183 | ) theme_names
184 | -- Include null theme store ids so that we can get full market share within CrUX
185 | ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A')
186 | WHERE
187 |   date = '${YYYY-MM-DD}' AND
188 |   theme_names.rank = 1
189 | GROUP BY
190 |   client,
191 |   id,
192 |   top_theme_name
193 | ORDER BY
194 |   origins DESC
195 | 


--------------------------------------------------------------------------------
/sql/histograms/cruxTtfb.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client,
12 |       bin.start AS bin,
13 |       SUM(bin.density) AS volume
14 |     FROM (
15 |       SELECT
16 |         form_factor,
17 |         experimental.time_to_first_byte.histogram.bin AS bins
18 |       FROM
19 |         `chrome-ux-report.all.${YYYYMM}`
20 |     )
21 |     CROSS JOIN
22 |       UNNEST(bins) AS bin
23 |     GROUP BY
24 |       bin,
25 |       client
26 |   )
27 | )
28 | ORDER BY
29 |   bin,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/histograms/dcl.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page AND
19 |       FLOAT64(summary.onContentLoaded) > 0
20 |     GROUP BY
21 |       bin,
22 |       client
23 |   )
24 | )
25 | ORDER BY
26 |   bin,
27 |   client
28 | 


--------------------------------------------------------------------------------
/sql/histograms/evalJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin
14 |     FROM
15 |       `httparchive.crawl.requests` r
16 |     INNER JOIN
17 |       `httparchive.crawl.pages`
18 |     USING (date, client, is_root_page, rank, page)
19 |     WHERE
20 |       date = '${YYYY-MM-DD}' AND
21 |       is_root_page
22 |     GROUP BY
23 |       bin,
24 |       client
25 |     HAVING
26 |       bin IS NOT NULL AND
27 |       bin >= 0
28 |   )
29 | )
30 | ORDER BY
31 |   bin,
32 |   client
33 | 


--------------------------------------------------------------------------------
/sql/histograms/fcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |     HAVING
23 |       bin IS NOT NULL AND
24 |       bin >= 0
25 |   )
26 | )
27 | ORDER BY
28 |   bin,
29 |   client
30 | 


--------------------------------------------------------------------------------
/sql/histograms/gzipSavings.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |     HAVING
23 |       bin IS NOT NULL
24 |   )
25 | )
26 | ORDER BY
27 |   bin,
28 |   client
29 | 


--------------------------------------------------------------------------------
/sql/histograms/htmlElementPopularity.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | CREATE TEMPORARY FUNCTION getElements(payload STRING)
 3 | RETURNS ARRAY<STRING> LANGUAGE js AS '''
 4 | try {
 5 |   var elements = JSON.parse(payload);
 6 |   if (Array.isArray(elements) || typeof elements != 'object') return [];
 7 |   return Object.keys(elements);
 8 | } catch (e) {
 9 |   return [];
10 | }
11 | ''';
12 | 
13 | SELECT
14 |   client,
15 |   element,
16 |   COUNT(DISTINCT root_page) AS pages,
17 |   total,
18 |   COUNT(DISTINCT root_page) / total AS pct,
19 |   ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls
20 | FROM
21 |   `httparchive.crawl.pages`
22 | JOIN
23 |   (
24 |     SELECT
25 |       date,
26 |       client,
27 |       COUNT(DISTINCT root_page) AS total
28 |     FROM
29 |       `httparchive.crawl.pages`
30 |     WHERE
31 |       date = '${YYYY-MM-DD}'
32 |     GROUP BY
33 |       date,
34 |       client
35 |   )
36 | USING (date, client),
37 |   UNNEST(getElements(TO_JSON_STRING(custom_metrics.element_count))) AS element
38 | WHERE
39 |   date = '${YYYY-MM-DD}'
40 | GROUP BY
41 |   client,
42 |   total,
43 |   element
44 | HAVING
45 |   COUNT(DISTINCT root_page) >= 10
46 | ORDER BY
47 |   pages / total DESC,
48 |   client
49 | 


--------------------------------------------------------------------------------
/sql/histograms/imgSavings.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(payload._image_savings) / (1024 * 10)) * 10 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |     HAVING
23 |       bin IS NOT NULL
24 |   )
25 | )
26 | ORDER BY
27 |   bin,
28 |   client
29 | 


--------------------------------------------------------------------------------
/sql/histograms/offscreenImages.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(IFNULL(
14 |         INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes),
15 |         INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024
16 |       ) / 10240) * 10 AS INT64) AS bin
17 |     FROM
18 |       `httparchive.crawl.pages`
19 |     WHERE
20 |       date >= '2022-03-01' AND
21 |       date = '${YYYY-MM-DD}' AND
22 |       is_root_page
23 |     GROUP BY
24 |       bin,
25 |       client
26 |     HAVING
27 |       bin IS NOT NULL
28 |   )
29 | )
30 | ORDER BY
31 |   bin,
32 |   client
33 | 


--------------------------------------------------------------------------------
/sql/histograms/ol.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOOR(FLOAT64(summary.onLoad) / 1000) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page AND
19 |       FLOAT64(summary.onLoad) > 0
20 |     GROUP BY
21 |       bin,
22 |       client
23 |   )
24 | )
25 | ORDER BY
26 |   bin,
27 |   client
28 | 


--------------------------------------------------------------------------------
/sql/histograms/optimizedImages.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(IFNULL(
14 |         INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes),
15 |         INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024
16 |       ) / 10240) * 10 AS INT64) AS bin
17 |     FROM
18 |       `httparchive.crawl.pages`
19 |     WHERE
20 |       date >= '2022-03-01' AND
21 |       date = '${YYYY-MM-DD}' AND
22 |       is_root_page
23 |     GROUP BY
24 |       bin,
25 |       client
26 |     HAVING
27 |       bin IS NOT NULL
28 |   )
29 | )
30 | ORDER BY
31 |   bin,
32 |   client
33 | 


--------------------------------------------------------------------------------
/sql/histograms/reqCss.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqCss) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqFont.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqFont) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqHtml.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqHtml) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqImg.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqImg) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqJS) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqOther.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqOther) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqTotal.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOOR(FLOAT64(summary.reqTotal) / 10) * 10 AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/reqVideo.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       FLOAT64(summary.reqVideo) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |   )
23 | )
24 | ORDER BY
25 |   bin,
26 |   client
27 | 


--------------------------------------------------------------------------------
/sql/histograms/speedIndex.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(FLOAT64(payload._SpeedIndex) / (1000)) * 1000 AS INT64) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page
19 |     GROUP BY
20 |       bin,
21 |       client
22 |     HAVING
23 |       bin IS NOT NULL
24 |   )
25 | )
26 | ORDER BY
27 |   bin,
28 |   client
29 | 


--------------------------------------------------------------------------------
/sql/histograms/tcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       INT64(summary._connections) AS bin
14 |     FROM
15 |       `httparchive.crawl.pages`
16 |     WHERE
17 |       date = '${YYYY-MM-DD}' AND
18 |       is_root_page AND
19 |       INT64(summary._connections) > 0
20 |     GROUP BY
21 |       bin,
22 |       client
23 |   )
24 | )
25 | ORDER BY
26 |   bin,
27 |   client
28 | 


--------------------------------------------------------------------------------
/sql/histograms/ttci.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   *,
 4 |   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 5 | FROM (
 6 |   SELECT
 7 |     *,
 8 |     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
 9 |   FROM (
10 |     SELECT
11 |       client,
12 |       COUNT(0) AS volume,
13 |       CAST(FLOOR(CAST(IFNULL(
14 |         FLOAT64(lighthouse.audits.interactive.numericValue),
15 |         IFNULL(
16 |           FLOAT64(lighthouse.audits['consistently-interactive'].rawValue),
17 |           FLOAT64(lighthouse.audits.interactive.rawValue)
18 |         )
19 |       ) AS FLOAT64) / 1000) AS INT64) AS bin
20 |     FROM
21 |       `httparchive.crawl.pages`
22 |     WHERE
23 |       date >= '2022-03-01' AND
24 |       date = '${YYYY-MM-DD}' AND
25 |       is_root_page
26 |     GROUP BY
27 |       bin,
28 |       client
29 |     HAVING
30 |       bin IS NOT NULL
31 |   )
32 | )
33 | ORDER BY
34 |   bin,
35 |   client
36 | 


--------------------------------------------------------------------------------
/sql/lens/drupal/crux_histograms.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     page,
 4 |     client
 5 |   FROM
 6 |     `httparchive.crawl.pages`
 7 |   WHERE
 8 |     date = '${YYYY-MM-DD}' AND
 9 |     'Drupal' IN UNNEST(technologies.technology)
10 |   GROUP BY
11 |     1,
12 |     2
13 | )
14 | ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone'))
15 | 


--------------------------------------------------------------------------------
/sql/lens/drupal/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     'Drupal' IN UNNEST(technologies.technology)
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/drupal/histograms.sql:
--------------------------------------------------------------------------------
1 | 'Drupal' IN UNNEST(technologies.technology)
2 | 


--------------------------------------------------------------------------------
/sql/lens/drupal/timeseries.sql:
--------------------------------------------------------------------------------
1 | 'Drupal' IN UNNEST(technologies.technology)
2 | 


--------------------------------------------------------------------------------
/sql/lens/magento/crux_histograms.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     page,
 4 |     client
 5 |   FROM
 6 |     `httparchive.crawl.pages`
 7 |   WHERE
 8 |     date = '${YYYY-MM-DD}' AND
 9 |     'Magento' IN UNNEST(technologies.technology)
10 |   GROUP BY
11 |     1,
12 |     2
13 | )
14 | ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone'))
15 | 


--------------------------------------------------------------------------------
/sql/lens/magento/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     'Magento' IN UNNEST(technologies.technology)
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/magento/histograms.sql:
--------------------------------------------------------------------------------
1 | 'Magento' IN UNNEST(technologies.technology)
2 | 


--------------------------------------------------------------------------------
/sql/lens/magento/timeseries.sql:
--------------------------------------------------------------------------------
1 | 'Magento' IN UNNEST(technologies.technology)
2 | 


--------------------------------------------------------------------------------
/sql/lens/top100k/crux_histograms.sql:
--------------------------------------------------------------------------------
1 | WHERE
2 |   experimental.popularity.rank <= 100000
3 | 


--------------------------------------------------------------------------------
/sql/lens/top100k/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     rank = 100000
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/top100k/histograms.sql:
--------------------------------------------------------------------------------
1 | rank <= 100000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top100k/timeseries.sql:
--------------------------------------------------------------------------------
1 | rank <= 100000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top10k/crux_histograms.sql:
--------------------------------------------------------------------------------
1 | WHERE
2 |   experimental.popularity.rank <= 10000
3 | 


--------------------------------------------------------------------------------
/sql/lens/top10k/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     rank = 10000
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/top10k/histograms.sql:
--------------------------------------------------------------------------------
1 | rank <= 10000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top10k/timeseries.sql:
--------------------------------------------------------------------------------
1 | rank <= 10000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top1k/crux_histograms.sql:
--------------------------------------------------------------------------------
1 | WHERE
2 |   experimental.popularity.rank <= 1000
3 | 


--------------------------------------------------------------------------------
/sql/lens/top1k/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     rank = 1000
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/top1k/histograms.sql:
--------------------------------------------------------------------------------
1 | rank <= 1000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top1k/timeseries.sql:
--------------------------------------------------------------------------------
1 | rank <= 1000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top1m/crux_histograms.sql:
--------------------------------------------------------------------------------
1 | WHERE
2 |   experimental.popularity.rank <= 1000000
3 | 


--------------------------------------------------------------------------------
/sql/lens/top1m/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     rank = 1000000
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/top1m/histograms.sql:
--------------------------------------------------------------------------------
1 | rank <= 10000
2 | 


--------------------------------------------------------------------------------
/sql/lens/top1m/timeseries.sql:
--------------------------------------------------------------------------------
1 | rank <= 10000
2 | 


--------------------------------------------------------------------------------
/sql/lens/wordpress/crux_histograms.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     page,
 4 |     client
 5 |   FROM
 6 |     `httparchive.crawl.pages`
 7 |   WHERE
 8 |     date = '${YYYY-MM-DD}' AND
 9 |     'WordPress' IN UNNEST(technologies.technology)
10 |   GROUP BY
11 |     1,
12 |     2
13 | )
14 | ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone'))
15 | 


--------------------------------------------------------------------------------
/sql/lens/wordpress/crux_timeseries.sql:
--------------------------------------------------------------------------------
 1 | INNER JOIN (
 2 |   SELECT
 3 |     SUBSTR(page, 0, LENGTH(page) - 1) AS origin,
 4 |     IF(client = 'mobile', 'phone', client) AS device,
 5 |     date
 6 |   FROM
 7 |     `httparchive.crawl.pages`
 8 |   WHERE
 9 |     date >= '2010-11-15' AND
10 |     is_root_page AND
11 |     'WordPress' IN UNNEST(technologies.technology)
12 |   GROUP BY
13 |     1,
14 |     2,
15 |     3
16 | )
17 | USING (origin, device, date)
18 | 


--------------------------------------------------------------------------------
/sql/lens/wordpress/histograms.sql:
--------------------------------------------------------------------------------
1 | 'WordPress' IN UNNEST(technologies.technology)
2 | 


--------------------------------------------------------------------------------
/sql/lens/wordpress/timeseries.sql:
--------------------------------------------------------------------------------
1 | 'WordPress' IN UNNEST(technologies.technology)
2 | 


--------------------------------------------------------------------------------
/sql/new_metric.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Initializes reports for a newly added metric.
 4 | #
 5 | # Example usage:
 6 | #
 7 | #   sql/new_metric.sh histograms bootupJs lighthouse
 8 | #
 9 | # Where the first argument is the chart type,
10 | # the second argument is the metric name,
11 | # and the third argument is the BQ dataset.
12 | 
13 | set -eo pipefail
14 | 
15 | VIZ=$1
16 | METRIC=$2
17 | DATASET=$3
18 | 
19 | if [ -z "$VIZ" ]; then
20 |   echo "Chart type argument required." >&2
21 |   echo "Example usage: sql/new_metric.sh histograms bootupJs lighthouse" >&2
22 |   exit 1
23 | fi
24 | 
25 | if [ -z "$METRIC" ]; then
26 |   echo "Metric argument required." >&2
27 |   echo "Example usage: sql/new_metric.sh histograms bootupJs lighthouse" >&2
28 |   exit 1
29 | fi
30 | 
31 | if [ -z "$DATASET" ]; then
32 |   echo "Dataset argument required." >&2
33 |   echo "Example usage: sql/new_metric.sh histograms bootupJs lighthouse" >&2
34 |   exit 1
35 | fi
36 | 
37 | if [ "$VIZ" == "histograms" ]; then
38 |   cmd='sql/get_bigquery_dates.sh "$DATASET" "" | xargs -I date sql/generate_report.sh -d date/"$METRIC".json'
39 | fi
40 | if [ "$VIZ" == "timeseries" ]; then
41 |   cmd='sql/generate_report.sh -d "$METRIC".json'
42 | fi
43 | 
44 | eval $cmd
45 | 
46 | lenses=$(ls -1 sql/lens)
47 | for lens in $lenses; do
48 |   cmd+=" -l $lens"
49 |   eval $cmd
50 | done
51 | 


--------------------------------------------------------------------------------
/sql/timeseries/a11yButtonName.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   /* Should really use the following to only include eligible sites. */
11 |   /* LAX_STRING(lighthouse.audits['button-name'].score) IS NOT NULL AND */
12 |   lighthouse IS NOT NULL AND
13 |   TO_JSON_STRING(lighthouse) != '{}' AND
14 |   is_root_page AND
15 |   date >= '2017-06-01'
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/a11yColorContrast.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['color-contrast'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   /* Should really use the following to only include eligible sites. */
11 |   /* LAX_STRING(lighthouse.audits['color-contrast'].score) IS NOT NULL AND */
12 |   lighthouse IS NOT NULL AND
13 |   TO_JSON_STRING(lighthouse) != '{}' AND
14 |   date >= '2017-06-01' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/a11yImageAlt.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   /* Should really use the following to only include eligible sites. */
11 |   /* LAX_STRING(lighthouse.audits['image-alt'].score) IS NOT NULL AND */
12 |   lighthouse IS NOT NULL AND
13 |   TO_JSON_STRING(lighthouse) != '{}' AND
14 |   date >= '2017-06-01' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/a11yLabel.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits.label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   /* Should really use the following to only include eligible sites. */
11 |   /* LAX_STRING(lighthouse.audits.label.score) IS NOT NULL AND */
12 |   lighthouse IS NOT NULL AND
13 |   TO_JSON_STRING(lighthouse) != '{}' AND
14 |   date >= '2017-06-01' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/a11yLinkName.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   /* Should really use the following to only include eligible sites. */
11 |   /* LAX_STRING(lighthouse.audits['link-name'].score) IS NOT NULL AND */
12 |   lighthouse IS NOT NULL AND
13 |   TO_JSON_STRING(lighthouse) != '{}' AND
14 |   date >= '2017-06-01' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/a11yScores.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Lighthouse changed format of scores in v3.0.0 released in July 2018 so handle old with a UDF
 3 | CREATE TEMPORARY FUNCTION getA11yScore(reportCategories JSON)
 4 | RETURNS FLOAT64 DETERMINISTIC
 5 | LANGUAGE js AS """
 6 |   if(reportCategories) {
 7 |     return reportCategories.find(i => i.name === 'Accessibility').score;
 8 |   }
 9 | """;
10 | 
11 | SELECT
12 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
13 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
14 |   client,
15 |   ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10,
16 |   ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25,
17 |   ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(500)], 2) AS p50,
18 |   ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(750)], 2) AS p75,
19 |   ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90
20 | FROM (
21 |   SELECT
22 |     date,
23 |     client,
24 |     IFNULL(LAX_FLOAT64(lighthouse.categories.accessibility.score) * 100, getA11yScore(lighthouse.reportCategories)) AS score
25 |   FROM
26 |     `httparchive.crawl.pages`
27 |   WHERE
28 |     lighthouse IS NOT NULL AND
29 |     TO_JSON_STRING(lighthouse) != '{}' AND
30 |     date >= '2017-06-01' AND
31 |     is_root_page
32 | )
33 | GROUP BY
34 |   date,
35 |   timestamp,
36 |   client
37 | ORDER BY
38 |   date DESC,
39 |   client
40 | 


--------------------------------------------------------------------------------
/sql/timeseries/asyncClipboardRead.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/badgeClear.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2727' OR feat.feature = 'BadgeClear')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/badgeSet.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2726' OR feat.feature = 'BadgeSet')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/bootupJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90
11 | FROM (
12 |   SELECT
13 |     date,
14 |     client,
15 |     IFNULL(
16 |       FLOAT64(lighthouse.audits['bootup-time'].numericValue),
17 |       FLOAT64(lighthouse.audits['bootup-time'].rawValue)
18 |     ) / 1000 AS value
19 |   FROM
20 |     `httparchive.crawl.pages`
21 |   WHERE
22 |     lighthouse IS NOT NULL AND
23 |     TO_JSON_STRING(lighthouse) != '{}' AND
24 |     date >= '2017-06-01' AND
25 |     is_root_page
26 | )
27 | GROUP BY
28 |   date,
29 |   timestamp,
30 |   client
31 | ORDER BY
32 |   date DESC,
33 |   client
34 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesCss.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesCss) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesFont.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesFont) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesHtml.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesHtml) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesImg.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesImg) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesJS) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesOther.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesOther) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesTotal.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesTotal) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/bytesVideo.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.bytesVideo) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/canonical.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits.canonical.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   lighthouse IS NOT NULL AND
11 |   TO_JSON_STRING(lighthouse) != '{}' AND
12 |   date >= '2017-06-01' AND
13 |   is_root_page
14 | GROUP BY
15 |   date,
16 |   timestamp,
17 |   client
18 | ORDER BY
19 |   date DESC,
20 |   client
21 | 


--------------------------------------------------------------------------------
/sql/timeseries/contentIndex.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2983' OR feat.feature = 'ContentIndexAdd')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastDcl.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Fast Dopm Content Loaded by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_dcl, avg_dcl, slow_dcl), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_dcl, avg_dcl, slow_dcl), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastFcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Fast FCP by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastFp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Fast First Paint by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_fp, avg_fp, slow_fp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fp, avg_fp, slow_fp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastInp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Small CLS by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone') AND
24 |   yyyymm >= 202202
25 | GROUP BY
26 |   date,
27 |   timestamp,
28 |   client
29 | ORDER BY
30 |   date DESC,
31 |   client
32 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastLcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Fast LCP by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone') AND
24 |   yyyymm >= 201909
25 | GROUP BY
26 |   date,
27 |   timestamp,
28 |   client
29 | ORDER BY
30 |   date DESC,
31 |   client
32 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastOl.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Fast Onload by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_ol, avg_ol, slow_ol), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ol, avg_ol, slow_ol), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxFastTtfb.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Small CLS by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   SAFE_DIVIDE(good, (good + needs_improvement + poor)) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxLargeCls.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Large CLS by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   poor / (good + needs_improvement + poor) >= 0.25
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone') AND
24 |   yyyymm >= 201905
25 | GROUP BY
26 |   date,
27 |   timestamp,
28 |   client
29 | ORDER BY
30 |   date DESC,
31 |   client
32 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxPassesCWV.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Passes Core Web Vitals by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(
18 |       IF(
19 |         /* INP replaced FID as a CWV in March 2024 (202402 release date). */
20 |         yyyymm >= 202402,
21 |         /* INP/FID can be null and are not mandatory for CWV */
22 |         (p75_inp IS NULL OR IS_GOOD(fast_inp, avg_inp, slow_inp)),
23 |         (p75_fid IS NULL OR IS_GOOD(fast_fid, avg_fid, slow_fid))
24 |       ) AND
25 |       IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
26 |       IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL
27 |     )),
28 |     COUNT(DISTINCT origin)
29 |   ) * 100 AS percent
30 | FROM
31 |   `chrome-ux-report.materialized.device_summary`
32 | WHERE
33 |   device IN ('desktop', 'phone') AND
34 |   yyyymm > 201909 AND
35 |   p75_lcp IS NOT NULL AND p75_cls IS NOT NULL /* Must have LCP and CLS */
36 | GROUP BY
37 |   date,
38 |   timestamp,
39 |   client
40 | ORDER BY
41 |   date DESC,
42 |   client
43 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxSlowFcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Slow FCP by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   poor / (good + needs_improvement + poor) >= 0.25
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxSlowInp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Large CLS by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   poor / (good + needs_improvement + poor) >= 0.25
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone') AND
24 |   yyyymm >= 202202
25 | GROUP BY
26 |   date,
27 |   timestamp,
28 |   client
29 | ORDER BY
30 |   date DESC,
31 |   client
32 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxSlowLcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Slow LCP by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   poor / (good + needs_improvement + poor) >= 0.25
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone') AND
24 |   yyyymm >= 201909
25 | GROUP BY
26 |   date,
27 |   timestamp,
28 |   client
29 | ORDER BY
30 |   date DESC,
31 |   client
32 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxSlowTtfb.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Large CLS by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   SAFE_DIVIDE(poor, (good + needs_improvement + poor)) >= 0.25
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone')
24 | GROUP BY
25 |   date,
26 |   timestamp,
27 |   client
28 | ORDER BY
29 |   date DESC,
30 |   client
31 | 


--------------------------------------------------------------------------------
/sql/timeseries/cruxSmallCls.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # Small CLS by device
 3 | 
 4 | CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 5 |   good / (good + needs_improvement + poor) >= 0.75
 6 | );
 7 | 
 8 | CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
 9 |   good + needs_improvement + poor > 0
10 | );
11 | 
12 | SELECT
13 |   REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date,
14 |   UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
15 |   IF(device = 'desktop', 'desktop', 'mobile') AS client,
16 |   SAFE_DIVIDE(
17 |     COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
18 |     COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
19 |   ) * 100 AS percent
20 | FROM
21 |   `chrome-ux-report.materialized.device_summary`
22 | WHERE
23 |   device IN ('desktop', 'phone') AND
24 |   yyyymm >= 201905
25 | GROUP BY
26 |   date,
27 |   timestamp,
28 |   client
29 | ORDER BY
30 |   date DESC,
31 |   client
32 | 


--------------------------------------------------------------------------------
/sql/timeseries/dcl.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(101)] / 1000, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(251)] / 1000, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(501)] / 1000, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(751)] / 1000, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(901)] / 1000, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.onContentLoaded) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/fcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2016-12-15' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | HAVING
21 |   p50 IS NOT NULL
22 | ORDER BY
23 |   date DESC,
24 |   client
25 | 


--------------------------------------------------------------------------------
/sql/timeseries/fontDisplay.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-display'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   lighthouse IS NOT NULL AND
11 |   TO_JSON_STRING(lighthouse) != '{}' AND
12 |   date >= '2017-06-01' AND
13 |   is_root_page AND
14 |   LAX_STRING(lighthouse.audits['font-display'].score) IS NOT NULL
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client
22 | 


--------------------------------------------------------------------------------
/sql/timeseries/getInstalledRelatedApps.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '1870' OR feat.feature = 'V8Navigator_GetInstalledRelatedApps_Method')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/gzipSavings.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client
22 | 


--------------------------------------------------------------------------------
/sql/timeseries/h2.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(r.summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.requests` r
 9 | INNER JOIN
10 |   `httparchive.crawl.pages`
11 | USING (date, client, is_root_page, rank, page)
12 | WHERE
13 |   is_root_page AND
14 |   date >= '2016-07-15'
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client
22 | 


--------------------------------------------------------------------------------
/sql/timeseries/h3.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | # The amount of requests either using HTTP/3 or able to use it.
 3 | #
 4 | # We measure "ability to use" as well as "actual use", as HTTP Archive is a
 5 | # cold crawl and so less likely to use HTTP/3 which requires prior visits.
 6 | #
 7 | # For "able to use" we look at the alt-svc response header.
 8 | #
 9 | # We also only measure official HTTP/3 (ALPN h3, h3-29) and not gQUIC or other
10 | # prior versions. h3-29 is the final draft version and will be switched to h3
11 | # when HTTP/3 is approved so we include that as it is HTTP/3 in all but name.
12 | #
13 | SELECT
14 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
15 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
16 |   client,
17 |   ROUND(
18 |     SUM(
19 |       IF(
20 |         LAX_STRING(r.summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR
21 |         REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR
22 |         REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%',
23 |         1, 0
24 |       )
25 |     ) * 100 / COUNT(0), 2
26 |   ) AS percent
27 | FROM
28 |   `httparchive.crawl.requests` r
29 | LEFT OUTER JOIN
30 |   UNNEST(response_headers) AS resp
31 | ON (resp.name = 'alt-svc')
32 | INNER JOIN
33 |   `httparchive.crawl.pages`
34 | USING (date, client, is_root_page, rank, page)
35 | WHERE
36 |   date >= '2020-01-01' AND
37 |   is_root_page
38 | GROUP BY
39 |   date,
40 |   timestamp,
41 |   client
42 | ORDER BY
43 |   date DESC,
44 |   client
45 | 


--------------------------------------------------------------------------------
/sql/timeseries/hreflang.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits.hreflang.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   lighthouse IS NOT NULL AND
11 |   TO_JSON_STRING(lighthouse) != '{}' AND
12 |   date >= '2017-06-01' AND
13 |   is_root_page AND
14 |   LAX_STRING(lighthouse.audits.hreflang.score) IS NOT NULL
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client
22 | 


--------------------------------------------------------------------------------
/sql/timeseries/idleDetection.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2834' OR feat.feature = 'IdleDetectionStart')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/imgLazy.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(COUNT(DISTINCT IF(LOWER(LAX_STRING(attr)) = 'lazy', page, NULL)) * 100 / COUNT(DISTINCT page), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | LEFT JOIN
10 |   UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.other['img-loading-attr'])) AS attr
11 | WHERE
12 |   is_root_page AND
13 |   date > '2016-01-01'
14 | GROUP BY
15 |   date,
16 |   timestamp,
17 |   client
18 | ORDER BY
19 |   date DESC,
20 |   client
21 | 


--------------------------------------------------------------------------------
/sql/timeseries/imgSavings.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   is_root_page AND
15 |   date >= '2016-01-01'
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/legible.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-size'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   lighthouse IS NOT NULL AND
11 |   date >= '2017-12-15' AND
12 |   is_root_page AND
13 |   LAX_STRING(lighthouse.audits['font-size'].score) IS NOT NULL
14 | GROUP BY
15 |   date,
16 |   timestamp,
17 |   client
18 | ORDER BY
19 |   date DESC,
20 |   client
21 | 


--------------------------------------------------------------------------------
/sql/timeseries/linkText.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-text'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   lighthouse IS NOT NULL AND
11 |   date >= '2017-11-15' AND
12 |   is_root_page AND
13 |   LAX_STRING(lighthouse.audits['link-text'].score) IS NOT NULL
14 | GROUP BY
15 |   date,
16 |   timestamp,
17 |   client
18 | ORDER BY
19 |   date DESC,
20 |   client
21 | 


--------------------------------------------------------------------------------
/sql/timeseries/notificationTriggers.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/numUrls.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   COUNT(0) AS urls
 7 | FROM
 8 |   `httparchive.crawl.pages`
 9 | WHERE
10 |   date >= '2010-11-15' AND
11 |   is_root_page
12 | GROUP BY
13 |   date,
14 |   timestamp,
15 |   client
16 | ORDER BY
17 |   date DESC,
18 |   client
19 | 


--------------------------------------------------------------------------------
/sql/timeseries/offscreenImages.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   is_root_page AND
15 |   date >= '2017-06-01'
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/ol.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(101)] / 1000, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(251)] / 1000, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(501)] / 1000, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(751)] / 1000, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(901)] / 1000, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.onLoad) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/optimizedImages.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2017-06-01' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/pctHttps.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(SUM(IF(STARTS_WITH(url, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 7 | FROM
 8 |   `httparchive.crawl.requests`
 9 | INNER JOIN
10 |   `httparchive.crawl.pages`
11 | USING (date, client, is_root_page, rank, page)
12 | WHERE
13 |   is_root_page AND
14 |   date >= '2016-01-01'
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client
22 | 


--------------------------------------------------------------------------------
/sql/timeseries/periodicBackgroundSync.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2930' OR feat.feature = 'PeriodicBackgroundSync')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/periodicBackgroundSyncRegister.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '2931' OR feat.feature = 'PeriodicBackgroundSyncRegister')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/quicTransport.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '3184' OR feat.feature = 'QuicTransport')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqCss.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqCss) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqFont.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqFont) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqHtml.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqHtml) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqImg.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqImg) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqJs.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqJS) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqOther.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqOther) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqTotal.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqTotal) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/reqVideo.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(101)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(251)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(501)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(751)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(901)], 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary.reqVideo) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/screenWakeLock.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '3005' OR feat.feature = 'WakeLockAcquireScreenLock')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/speedIndex.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(101)] / 1000, 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(251)] / 1000, 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(501)] / 1000, 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(751)] / 1000, 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(901)] / 1000, 2) AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   is_root_page AND
15 |   date >= '2016-01-01'
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/storageEstimate.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN UNNEST(features) AS feat
11 | ON (feat.id = '1371' OR feat.feature = 'DurableStorageEstimate')
12 | WHERE
13 |   date >= '2016-11-15' AND
14 |   is_root_page
15 | GROUP BY
16 |   date,
17 |   timestamp,
18 |   client
19 | ORDER BY
20 |   date DESC,
21 |   client,
22 |   num_urls DESC
23 | 


--------------------------------------------------------------------------------
/sql/timeseries/storagePersist.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN
11 |   UNNEST(features) AS feat
12 | ON (feat.id = '3018' OR feat.feature = 'DurableStoragePersist')
13 | WHERE
14 |   date >= '2016-11-15' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client,
23 |   num_urls DESC
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/swControlledPages.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN
11 |   UNNEST(features) AS feat
12 | ON (feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage')
13 | WHERE
14 |   date >= '2016-11-15' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client,
23 |   num_urls DESC
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/tcp.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(101)] AS p10,
 7 |   APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(251)] AS p25,
 8 |   APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(501)] AS p50,
 9 |   APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(751)] AS p75,
10 |   APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(901)] AS p90
11 | FROM
12 |   `httparchive.crawl.pages`
13 | WHERE
14 |   date >= '2010-11-15' AND
15 |   is_root_page AND
16 |   FLOAT64(summary._connections) > 0
17 | GROUP BY
18 |   date,
19 |   timestamp,
20 |   client
21 | ORDER BY
22 |   date DESC,
23 |   client
24 | 


--------------------------------------------------------------------------------
/sql/timeseries/ttci.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10,
 7 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25,
 8 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50,
 9 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(750)], 2) AS p75,
10 |   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90
11 | FROM (
12 |   SELECT
13 |     client,
14 |     date,
15 |     IFNULL(
16 |       FLOAT64(lighthouse.audits.interactive.numericValue),
17 |       IFNULL(
18 |         FLOAT64(lighthouse.audits.interactive.rawValue),
19 |         FLOAT64(lighthouse.audits['consistently-interactive'].rawValue)
20 |       )
21 |     ) / 1000 AS value
22 |   FROM
23 |     `httparchive.crawl.pages`
24 |   WHERE
25 |     is_root_page AND
26 |     date >= '2016-01-01'
27 | )
28 | GROUP BY
29 |   date,
30 |   timestamp,
31 |   client
32 | ORDER BY
33 |   date DESC,
34 |   client
35 | 


--------------------------------------------------------------------------------
/sql/timeseries/webSocketStream.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | SELECT
 3 |   FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
 4 |   UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
 5 |   client,
 6 |   SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) AS num_urls,
 7 |   ROUND(SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) / COUNT(0) * 100, 5) AS percent
 8 | FROM
 9 |   `httparchive.crawl.pages`
10 | LEFT OUTER JOIN
11 |   UNNEST(features) AS feat
12 | ON (feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor')
13 | WHERE
14 |   date >= '2016-11-15' AND
15 |   is_root_page
16 | GROUP BY
17 |   date,
18 |   timestamp,
19 |   client
20 | ORDER BY
21 |   date DESC,
22 |   client,
23 |   num_urls DESC
24 | 


--------------------------------------------------------------------------------
/sync_csv.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Usage:
  4 | #
  5 | #   ./sync_csv.sh [mobile_][Mon_D_YYYY]
  6 | #
  7 | # Examples:
  8 | #
  9 | #   ./sync_csv.sh mobile_Dec_15_2018
 10 | #   ./sync_csv.sh Jan_1_2019
 11 | 
 12 | DATA=$HOME/archive
 13 | BASE=`pwd`
 14 | 
 15 | if [ -n "$1" ]; then
 16 |   archive=$1
 17 |   if [[ $archive == *mobile* ]]; then
 18 |     mobile=1
 19 |     adate=${archive#mobile_}
 20 |   else
 21 |     mobile=0
 22 |     adate=$archive
 23 |   fi
 24 |   echo "Processing $adate, mobile: $mobile, archive: $archive"
 25 | 
 26 | else
 27 |   echo "Must provide date, eg. Apr_15_2013"
 28 |   exit 1
 29 | fi
 30 | 
 31 | mkdir -p $DATA/processed/$archive
 32 | 
 33 | cd $DATA
 34 | 
 35 | YYYY_MM_DD=$(date --date="$(echo $adate | sed "s/_/ /g" -)" "+%Y_%m_%d")
 36 | 
 37 | if [[ $mobile == 1 ]]; then
 38 |   client="mobile"
 39 | else
 40 |   client="desktop"
 41 | fi
 42 | 
 43 | ptable="summary_pages.${YYYY_MM_DD}_${client}"
 44 | rtable="summary_requests.${YYYY_MM_DD}_${client}"
 45 | 
 46 | if bq show httparchive:${ptable} &> /dev/null && \
 47 |    bq show httparchive:${rtable} &> /dev/null; then
 48 |   # Tables should be deleted from BigQuery first if the intent is to overwrite them.
 49 |   echo -e "BigQuery summary tables for ${YYYY_MM_DD}_${client} already exist, exiting"
 50 |   exit 0
 51 | fi
 52 | 
 53 | if [ ! -f httparchive_${archive}_pages.csv.gz ]; then
 54 |   echo -e "Downloading data for $archive"
 55 |   gsutil cp "gs://httparchive/downloads/httparchive_${archive}_pages.csv.gz" ./
 56 |   if [ $? -ne 0 ]; then
 57 |     echo "Pages data for ${adate} is missing, exiting"
 58 |     exit 1
 59 |   fi
 60 | else
 61 |   echo -e "Pages data already downloaded for $archive, skipping."
 62 | fi
 63 | 
 64 | if [ ! -f httparchive_${archive}_requests.csv.gz ]; then
 65 |   gsutil cp "gs://httparchive/downloads/httparchive_${archive}_requests.csv.gz" ./
 66 |   if [ $? -ne 0 ]; then
 67 |     echo "Request data for ${adate} is missing, exiting"
 68 |     exit 1
 69 |   fi
 70 | else
 71 |   echo -e "Request data already downloaded for $archive, skipping."
 72 | fi
 73 | 
 74 | if [ ! -f processed/${archive}/pages.csv.gz ]; then
 75 |   echo -e "Converting pages data"
 76 |   gunzip -c "httparchive_${archive}_pages.csv.gz" \
 77 |   | sed -e 's/\\N,/"",/g' -e 's/\\N$/""/g' -e's/\([^\]\)\\"/\1""/g' -e's/\([^\]\)\\"/\1""/g' -e 's/\\"","/\\\\","/g' \
 78 |   | gzip > "processed/${archive}/pages.csv.gz"
 79 | else
 80 |   echo -e "Pages data already converted, skipping."
 81 | fi
 82 | 
 83 | if ls processed/${archive}/requests_* &> /dev/null; then
 84 |   echo -e "Request data already converted, skipping."
 85 | else
 86 |   echo -e "Converting requests data"
 87 |   gunzip -c "httparchive_${archive}_requests.csv.gz" \
 88 |   | sed -e 's/\\N,/"",/g' -e 's/\\N$/""/g' -e 's/\\"/""/g' -e 's/\\"","/\\\\","/g' \
 89 |   | python fixcsv.py \
 90 |   | split --lines=8000000 --filter='pigz - > $FILE.gz' - processed/$archive/requests_
 91 | fi
 92 | 
 93 | cd processed/${archive}
 94 | 
 95 | echo -e "Syncing data to Google Storage"
 96 | gsutil cp -n * gs://httparchive/${archive}/
 97 | 
 98 | bq show httparchive:${ptable} &> /dev/null
 99 | if [ $? -ne 0 ]; then
100 |   echo -e "Submitting new pages import ${ptable} to BigQuery"
101 |   bq load --max_bad_records 10 --replace $ptable gs://httparchive/${archive}/pages.csv.gz $BASE/schema/pages.json
102 |   if [ $? -ne 0 ]; then
103 |     echo "Error loading ${ptable}, exiting"
104 |     exit 1
105 |   fi
106 | else
107 |   echo -e "${ptable} already exists, skipping."
108 | fi
109 | 
110 | bq show httparchive:${rtable} &> /dev/null
111 | if [ $? -ne 0 ]; then
112 |   echo -e "Submitting new requests import ${rtable} to BigQuery"
113 |   bq load --max_bad_records 10 --replace $rtable gs://httparchive/${archive}/requests_* $BASE/schema/requests.json
114 |   if [ $? -ne 0 ]; then
115 |     echo "Error loading ${rtable}, exiting"
116 |     exit 1
117 |   fi
118 | else
119 |   echo -e "${rtable} already exists, skipping."
120 | fi
121 | 
122 | 
123 | bq show httparchive:${rtable} &> /dev/null
124 | if [ $? -eq 0 ]; then
125 |   echo -e "Deleting CSV artifacts..."
126 |   rm $DATA/httparchive_${archive}_*
127 |   rm -r $DATA/processed/$archive
128 | else
129 |   echo "Error loading into BigQuery, exiting"
130 |   exit 1
131 | fi
132 | 
133 | echo -e "Attempting to generate reports for ${YYYY_MM_DD}..."
134 | cd $HOME/code
135 | 
136 | gsutil -q stat gs://httparchive/reports/${YYYY_MM_DD}/*
137 | if [ $? -eq 1 ]; then
138 |   . sql/generate_reports.sh -th ${YYYY_MM_DD} -l ALL
139 | else
140 |   echo -e "Reports for ${YYYY_MM_DD} already exist, skipping."
141 | fi
142 | 
143 | echo "Done"
144 | 


--------------------------------------------------------------------------------
/sync_har.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Usage:
 4 | #
 5 | #   ./sync_har.sh [chrome,android] [YYYY-MM-DD]
 6 | #
 7 | # Examples:
 8 | #
 9 | #   ./sync_har.sh chrome
10 | #   ./sync_har.sh chrome 2019-01-01
11 | #   ./sync_har.sh android 2018-12-15
12 | #
13 | 
14 | cd $HOME/code/dataflow/python
15 | 
16 | if [ -n "$2" ]; then
17 |   day=$(date -d $2 +%d)
18 |   MM=$(date -d $2 +%m)
19 |   month=$(date -d $2 +%b)
20 |   year=$(date -d $2 +%Y)
21 | else
22 |   day=$(date +%d)
23 |   MM=$(date +%m)
24 |   month=$(date +%b)
25 |   year=$(date +%Y)
26 | fi
27 | 
28 | # All crawls begin on the first of the month.
29 | import_date=$(date +"${month}_1_${year}")
30 | YYYY_MM_DD="${year}_${MM}_01"
31 | 
32 | if [ -n "$1" ]; then
33 |   archive=$1
34 |   if [[ $1 == *chrome* ]]; then
35 |     client="desktop"
36 |     bucket="chrome-${import_date}"
37 |   else
38 |     client="mobile"
39 |     bucket="android-${import_date}"
40 |   fi
41 |   echo "Processing $bucket, client: $client"
42 | 
43 | else
44 |   echo "Must provide import type (e.g. chrome), and optional date:"
45 |   echo "\t script.sh chrome 2016-01-15"
46 |   exit
47 | fi
48 | 
49 | if bq show "httparchive:pages.${YYYY_MM_DD}_${client}"; then
50 |   echo "Table already exists in BigQuery, exiting"
51 |   exit 1
52 | else
53 |   echo "Table does not exist in BigQuery, checking gs://..."
54 | fi
55 | 
56 | if ! gsutil stat "gs://httparchive/crawls/${bucket}/done"; then
57 |   echo "Bucket does not exist or has not finished importing"
58 |   exit 1
59 | else
60 |   echo "Bucket exists, initiating DataFlow import..."
61 | fi
62 | 
63 | export GOOGLE_APPLICATION_CREDENTIALS="./credentials/auth.json"
64 | 
65 | if [ ! -f $GOOGLE_APPLICATION_CREDENTIALS ]; then
66 |   echo "ERROR: ${GOOGLE_APPLICATION_CREDENTIALS} does not exist. See README for more info."
67 |   exit 1
68 | fi
69 | 
70 | source env/bin/activate
71 | 
72 | python bigquery_import.py \
73 |   --runner=DataflowRunner \
74 |   --project=httparchive \
75 |   --temp_location=gs://httparchive/dataflow/temp \
76 |   --staging_location=gs://httparchive/dataflow/staging \
77 |   --region=us-west1 \
78 |   --machine_type=n1-standard-32 \
79 |   --input="${bucket}" \
80 |   --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd \
81 |   --experiment=use_beam_bq_sink
82 | 
83 | deactivate
84 | 
85 | echo -e "Attempting to generate reports for ${YYYY_MM_DD}..."
86 | cd $HOME/code
87 | 
88 | gsutil -q stat gs://httparchive/reports/${YYYY_MM_DD}/*
89 | if [ $? -eq 1 ]; then
90 |   . sql/generate_reports.sh -th ${YYYY_MM_DD} -l ALL
91 | else
92 |   echo -e "Reports for ${YYYY_MM_DD} already exist, skipping."
93 | fi
94 | 
95 | echo "Done"
96 | 


--------------------------------------------------------------------------------
/urls/.gitignore:
--------------------------------------------------------------------------------
1 | Gemfile.lock
2 | 


--------------------------------------------------------------------------------
/urls/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | gem 'domainatrix'
4 | gem 'yajl-ruby', require: 'yajl'
5 | gem 'nokogiri'
6 | 
7 | 


--------------------------------------------------------------------------------
/urls/process.rb:
--------------------------------------------------------------------------------
 1 | require 'yajl'
 2 | require 'zlib'
 3 | require 'open3'
 4 | require 'nokogiri'
 5 | require 'optparse'
 6 | require 'domainatrix'
 7 | 
 8 | ROOT_PATH = '/'
 9 | WWW = 'www'
10 | matched = 0
11 | res, options = {}, {}
12 | 
13 | ARGV << "-h" if ARGV.empty?
14 | OptionParser.new do |opts|
15 |   opts.banner = "Usage: process.rb [options]"
16 | 
17 |   opts.on('-a', '--alexa=file', 'Alexa input data') do |v|
18 |     options[:alexa] = v
19 |   end
20 | 
21 |   opts.on('-d', '--dmoz=file', 'DMOZ input data') do |v|
22 |     options[:dmoz] = v
23 |   end
24 | 
25 |   opts.on('-o', '--output=file', 'Output file') do |v|
26 |     options[:output] = v || 'urls.json.gz'
27 |   end
28 | 
29 |   opts.on('-h', '--help') do
30 |     puts opts
31 |     exit
32 |   end
33 | end.parse!
34 | 
35 | if options[:alexa].nil? or options[:dmoz].nil?
36 |   raise OptionParser::MissingArgument
37 | end
38 | 
39 | puts "Loading Alexa data..."
40 | IO.popen("unzip -p #{options[:alexa]}", 'rb') do |io|
41 |   io.each do |line|
42 |     rank, name = line.strip.split(',')
43 |     res[name] = {
44 |         alexa_domain: name,
45 |         alexa_rank: rank.to_i,
46 |         dmoz_topic: []
47 |     }
48 |   end
49 | end
50 | 
51 | puts "Loading DMOZ data..."
52 | Zlib::GzipReader.open(options[:dmoz]) do |gz|
53 |   Nokogiri::XML::Reader(gz).each do |node|
54 |     #
55 |     # <ExternalPage about="http://animation.about.com/">
56 |     # <d:Title>About.com: Animation Guide</d:Title>
57 |     #   <d:Description>Keep up with developments in online animation for all skill levels. Download tools, and seek inspiration from online work.</d:Description>
58 |     # <topic>Top/Arts/Animation</topic>
59 |     # </ExternalPage>
60 |     #
61 |     if node.name == 'ExternalPage' && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
62 |       page = Nokogiri::XML(node.outer_xml).at('ExternalPage')
63 | 
64 |       url = Domainatrix.parse(page.attribute('about').text)
65 |       next unless url.path == ROOT_PATH
66 |       next unless url.subdomain.empty? or url.subdomain == WWW
67 |       next if url.url.include? '?' or url.url.include? '#'
68 | 
69 |       if data = res[url.domain + "." + url.public_suffix]
70 |         matched += 1
71 |         data[:dmoz_topic] << page.at('topic').text
72 |         data[:dmoz_url] ||= page.attribute('about').text
73 |         data[:dmoz_title] ||= page.xpath('//d:Title').text
74 |         data[:dmoz_description] ||= page.xpath('//d:Description').text
75 |       end
76 |     end
77 |   end
78 | 
79 | end
80 | 
81 | File.open('urls.json.gz', 'w') do |f|
82 |   gz = Zlib::GzipWriter.new(f)
83 |   res.each_value do |val|
84 |     gz.puts Yajl::Encoder.encode(val)
85 |   end
86 |   gz.close
87 | end
88 | 
89 | puts "Done. Matched #{matches} DMOZ domains."
90 | 
91 | 


--------------------------------------------------------------------------------
/urls/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | BASE=`pwd`
 4 | TIMESTAMP=$(date "+%Y%m%d")
 5 | DATA=$HOME/archive/urls/$TIMESTAMP
 6 | 
 7 | mkdir -p $DATA
 8 | cd $DATA
 9 | 
10 | echo -e "Fetching Alexa Top 1M archive"
11 | wget -nv -N "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
12 | if [ $? -ne 0 ]; then
13 |   echo "Alexa fetch failed, exiting"
14 |   exit
15 | fi
16 | 
17 | ## http://rdf.dmoz.org/
18 | echo -e "Fetching DMOZ open directory RDF dump"
19 | wget -nv -N "http://rdf.dmoz.org/rdf/content.rdf.u8.gz"
20 | if [ $? -ne 0 ]; then
21 |   echo "DMOZ fetch failed, exiting"
22 |   exit
23 | fi
24 | 
25 | ruby $BASE/process.rb -a top-1m.csv.zip -d content.rdf.u8.gz
26 | 
27 | echo -e "Syncing data to Google Storage"
28 | gsutil cp -n *.{zip,gz} gs://httparchive/urls/${TIMESTAMP}/
29 | 
30 | echo -e "Importing results to BigQuery"
31 | bq load --source_format NEWLINE_DELIMITED_JSON urls.${TIMESTAMP} \
32 |   gs://httparchive/urls/${TIMESTAMP}/urls.json.gz \
33 |   $BASE/schema.json
34 | 
35 | echo -e "Done."
36 | 
37 | 


--------------------------------------------------------------------------------
/urls/schema.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {"name": "alexa_domain", "type": "STRING"},
 3 |   {"name": "alexa_rank", "type": "INTEGER"},
 4 |   {"name": "dmoz_topic", "type": "STRING", "mode":"REPEATED"},
 5 |   {"name": "dmoz_url", "type": "STRING"},
 6 |   {"name": "dmoz_title", "type": "STRING"},
 7 |   {"name": "dmoz_description", "type": "STRING"}
 8 | ]
 9 | 
10 | 


--------------------------------------------------------------------------------
/util/fixcsv.py:
--------------------------------------------------------------------------------
1 | import fileinput
2 | for line in fileinput.input():
3 |     if line.endswith('\\\n'):
4 |         line = line[:-2] + ','
5 |     print(line),
6 | 


--------------------------------------------------------------------------------