├── .gitignore ├── .travis.yml ├── ANNOTATION.md ├── Dockerfile ├── LICENSE ├── README.md ├── annotation └── exports │ ├── tcpinfo_annotation_export.sql │ └── traceroute_hopannotation1_export.sql ├── cloudbuild.yaml ├── cmd └── stats-pipeline │ └── main.go ├── compose-annotation-export.yaml ├── compose-hopannotation1-export.yaml ├── config.json ├── config └── config.go ├── cors-settings.json ├── create_statistics_api.sh ├── docs ├── api-structure.md ├── format-schema.md ├── geo-precision.md └── stats-overview.md ├── exporter ├── exporter.go ├── exporter_test.go └── testdata │ └── export_query.sql ├── formatter ├── annotation.go ├── annotation_test.go ├── hopannotation1.go ├── hopannotation1_test.go ├── stats.go └── stats_test.go ├── go.mod ├── go.sum ├── histogram ├── table.go └── table_test.go ├── k8s └── data-pipeline │ ├── config │ ├── config-annotation-export.json │ ├── config-hopannotation1-export.json │ └── config.json │ ├── deployments │ ├── hopannotation1-export-template.yaml │ └── stats-pipeline.yaml.template │ ├── jobs │ ├── hopannotation1-export-cronjob.template │ └── stats-pipeline-cronjob.yaml.template │ └── services │ ├── hopannotation1-export.yaml │ └── stats-pipeline.yaml ├── maptiles ├── Dockerfile ├── Makefile ├── README.md ├── package-lock.json ├── package.json ├── run-pipeline.sh └── scripts │ ├── combine-blocks.sh │ ├── download-blocks.js │ ├── download-mlab.js │ ├── load-fcc-477-to-db.js │ ├── process-fcc.js │ ├── process-mlab.js │ └── unzip-blocks.sh ├── output ├── writer.go └── writer_test.go ├── pipeline ├── handlers.go ├── handlers_test.go ├── headers.go └── testdata │ ├── test_export.sql │ └── test_histogram.sql └── statistics ├── exports ├── cities.sql ├── cities_asn.sql ├── continents.sql ├── continents_asn.sql ├── countries.sql ├── countries_asn.sql ├── global_asn.sql ├── regions.sql ├── regions_asn.sql ├── us_counties.sql ├── us_counties_asn.sql ├── us_states.sql ├── us_states_asn.sql ├── us_tracts.sql └── us_tracts_asn.sql ├── queries ├── canary.sql ├── continent_asn_histogram.sql ├── continent_country_asn_histogram.sql ├── continent_country_histogram.sql ├── continent_country_region_asn_histogram.sql ├── continent_country_region_city_asn_histogram.sql ├── continent_country_region_city_histogram.sql ├── continent_country_region_histogram.sql ├── continent_histogram.sql ├── global_asn_histogram.sql ├── us_census_tracts_asn_histogram.sql ├── us_census_tracts_histogram.sql ├── us_county_asn_histogram.sql ├── us_county_histogram.sql ├── us_state_territories_asn_histogram.sql └── us_state_territories_histogram.sql └── scripts └── update_stats_continent_country_region_histogram.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | stats-pipeline 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.20 5 | 6 | install: 7 | - go get -v -t ./... 8 | - go install -v ./... 9 | 10 | before_script: 11 | - go install github.com/mattn/goveralls@latest 12 | 13 | script: 14 | - go vet ./... 15 | - go build ./... 16 | - go test ./... -cover=1 -coverprofile=_c.cov 17 | # TODO: enable after passing. 18 | # - go test ./... 
-race 19 | 20 | after_script: 21 | - $GOPATH/bin/goveralls -service=travis-ci -coverprofile=_c.cov 22 | -------------------------------------------------------------------------------- /ANNOTATION.md: -------------------------------------------------------------------------------- 1 | # Annotation Export 2 | 3 | Directions for running the stats-pipeline for annotation export using the 4 | alternate `config-annotation-export.json`. 5 | 6 | ## Local development 7 | 8 | The `compose-annotation-export.yaml` specifies a docker compose configuration 9 | for running the stats-pipeline with an instance of pusher. Both services are 10 | able to use your local gcloud application default credentials. 11 | 12 | There are three "volumes" of interest: 13 | 14 | - `shared:/var/spool/ndt` - this is shared between the two containers, 15 | stats-pipeline writes files, and the pusher archives, uploads, and removes them. 16 | - `$HOME/.config/gcloud/:/root/.config/gcloud` - this provides access 17 | to your gcloud credentials. You must update the directory to your local home 18 | path. 19 | - `./:/config` - this provides access to the configuration and BigQuery SQL 20 | files in this repo. 21 | 22 | NOTE: Depending on your version of docker-compose, you may need to replace 23 | `$HOME` with your actual local directory name. 24 | 25 | You may run a local instance of the annotation export stats pipeline using 26 | docker-compose: 27 | 28 | ```sh 29 | docker-compose -f compose-annotation-export.yaml build 30 | docker-compose -f compose-annotation-export.yaml up 31 | ``` 32 | 33 | NOTE: this will upload sample archives to the configured GCS bucket in 34 | mlab-sandbox. Those files don't matter b/c it's sandbox, but be careful to 35 | distinguish between new files created from your run and archives from previous 36 | runs. 37 | 38 | You may trigger the export process using: 39 | 40 | ```sh 41 | curl -XPOST --data {} 'http://localhost:8080/v0/pipeline?step=exports&year=1' 42 | ``` 43 | 44 | Only the 'export' step is supported for annotation export, and the year is 45 | required but ignored. 46 | 47 | ## Kubernetes 48 | 49 | Only the hopannotation1 export process is available on Kubernetes. The export process is started via a CronJob which by default is scheduled to 50 | never run. 51 | 52 | To start an annotation export manually, run the following command on the 53 | `data-pipeline` cluster: 54 | 55 | ```sh 56 | kubectl create job --from=cronjob/hopannotation1-export-cronjob hopannotation1-export-manual 57 | ``` 58 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20 as build 2 | ENV CGO_ENABLED 0 3 | ADD . 
/go/src/github.com/m-lab/stats-pipeline 4 | WORKDIR /go/src/github.com/m-lab/stats-pipeline 5 | RUN go install \ 6 | -v \ 7 | -ldflags "-X github.com/m-lab/go/prometheusx.GitShortCommit=$(git log -1 --format=%h)" \ 8 | github.com/m-lab/stats-pipeline/cmd/stats-pipeline 9 | 10 | # Now copy the built image into the minimal base image 11 | FROM alpine:3.12 12 | RUN apk add ca-certificates 13 | COPY --from=build /go/bin/stats-pipeline / 14 | COPY --from=build /go/src/github.com/m-lab/stats-pipeline/statistics /statistics 15 | COPY --from=build /go/src/github.com/m-lab/stats-pipeline/annotation /annotation 16 | WORKDIR / 17 | ENTRYPOINT ["/stats-pipeline"] 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Version](https://img.shields.io/github/tag/m-lab/stats-pipeline.svg)](https://github.com/m-lab/stats-pipeline/releases) [![Build Status](https://travis-ci.com/m-lab/stats-pipeline.svg?branch=master)](https://travis-ci.com/m-lab/stats-pipeline) [![Coverage Status](https://coveralls.io/repos/github/m-lab/stats-pipeline/badge.svg?branch=master)](https://coveralls.io/github/m-lab/stats-pipeline?branch=master) [![GoDoc](https://godoc.org/github.com/m-lab/stats-pipeline?status.svg)](https://godoc.org/github.com/m-lab/stats-pipeline) [![Go Report Card](https://goreportcard.com/badge/github.com/m-lab/stats-pipeline)](https://goreportcard.com/report/github.com/m-lab/stats-pipeline) 2 | 3 | # Statistics Pipeline Service 4 | This repository contains code that processes NDT data and provides aggregate 5 | metrics by day for standard global, and some national geographies. The resulting 6 | aggregations are made available in JSON format, for use by other applications. 7 | 8 | The `stats-pipeline` service is written in Go, runs on GKE, and generates and 9 | updates daily aggregate statistics. Access is provided in public BigQuery tables 10 | and in per-year JSON formatted files hosted on GCS. 11 | 12 | ## Documentation Provided for the Statistics Pipeline Service 13 | * (This document) Overview of the `stats-pipeline` service, fields provided 14 | (schema), output formats, available geographies, and API URL structure. 15 | * [What Statistics are Provided by stats-pipeline, and How are They Calculated?][stats-overview] 16 | * [Geographic Precision in stats-pipeline][geo-precision] 17 | * [Statistics Output Format, Schema, and Field Descriptions][format-schema] 18 | * [Statistics API URL Structure, Available Geographies & Aggregations][api-structure] 19 | 20 | [stats-overview]: docs/stats-overview.md 21 | [geo-precision]: docs/geo-precision.md 22 | [format-schema]: docs/format-schema.md 23 | [api-structure]: docs/api-structure.md 24 | 25 | ## General Recommendations for All Aggregations of NDT data 26 | In general, [our recommendations][recommendations] for research aggregating NDT data are: 27 | 28 | * Don't oversimplify 29 | * Aggregate by ASN in addition to time/date and location 30 | * Be aware of, and illustrate multimodal distributions 31 | * Use histogram and logarithmic scales 32 | * Take into account, and compensate for, client bias and population drift 33 | 34 | [recommendations]: upcoming-blog-post 35 | 36 | ## Roadmap 37 | Below we list additional features, methods, geographies, etc. which may be 38 | considered for future versioned releases of `stats-pipeline`. 
39 | 40 | ### Geographies 41 | * US Zip Codes, US Congressional Districts, Block Groups, Blocks 42 | 43 | ### Output Formats 44 | * histogram_daily_stats.csv - Same data as the JSON, but in CSV. Useful for importing into a spreadsheet. 45 | * histogram_daily_stats.sql - A SQL query which returns the same rows in the corresponding .json and .csv. Useful for verifying the exported data against the source and to tweak the query as needed by different use cases. 46 | -------------------------------------------------------------------------------- /annotation/exports/tcpinfo_annotation_export.sql: -------------------------------------------------------------------------------- 1 | -- Generate a synthetic UUID annotation from the base_tables.tcpinfo. 2 | -- 3 | -- NOTES 4 | -- 5 | -- For a single UUID, there may be multiple Timestamps per day in tcpinfo due to 6 | -- some long lived connections (probably not actual NDT measurements). The 7 | -- export query groups on the UUID and year_month_day to guarantee a single 8 | -- UUID per day. The query uses any Server and Client annotation from that day 9 | -- and the first (minimum) Timestamp. 10 | -- 11 | -- This export query may be safely run multiple times *IF* the previously 12 | -- generated annotations have been copied to the ndt/annotation GCS location and 13 | -- parsed into the raw_ndt.annotation tables. 14 | -- 15 | -- The query uses the "Left Excluding JOIN" pattern to select only rows from 16 | -- TCPINFO *without* corresponding rows in the annotation table (i.e. 17 | -- "annotation.id IS NULL"). Both tcpinfo and annotation tables are filtered by 18 | -- partition dates. 19 | WITH annotations AS ( 20 | SELECT * 21 | FROM `{{ .project }}.raw_ndt.annotation` 22 | WHERE date BETWEEN 23 | DATE_SUB(DATE('{{ .partitionID }}'), INTERVAL 1 DAY) 24 | AND DATE_ADD(DATE('{{ .partitionID }}'), INTERVAL 1 DAY) 25 | ), tcpinfos AS ( 26 | SELECT * 27 | FROM `{{ .project }}.base_tables.tcpinfo` 28 | WHERE DATE(TestTime) = DATE('{{ .partitionID }}') 29 | AND DATE('{{ .partitionID }}') BETWEEN 30 | DATE_SUB(DATE(_PARTITIONTIME), INTERVAL 1 DAY) 31 | AND DATE_ADD(DATE(_PARTITIONTIME), INTERVAL 1 DAY) 32 | ) 33 | 34 | SELECT 35 | tcpinfo.UUID, 36 | MIN(tcpinfo.TestTime) AS Timestamp, 37 | ANY_VALUE(tcpinfo.ServerX) AS Server, 38 | ANY_VALUE(tcpinfo.ClientX) AS Client, 39 | REPLACE(CAST(DATE(tcpinfo.TestTime) AS STRING), "-", "/") AS year_month_day, 40 | FROM 41 | tcpinfos AS tcpinfo 42 | LEFT OUTER JOIN 43 | annotations AS annotation 44 | ON 45 | tcpinfo.UUID = annotation.id 46 | AND DATE(tcpinfo.TestTime) = annotation.date 47 | WHERE 48 | annotation.id IS NULL 49 | AND tcpinfo.UUID != "" 50 | AND tcpinfo.UUID IS NOT NULL 51 | AND tcpinfo.ServerX.Site != "" 52 | AND tcpinfo.ServerX.Geo IS NOT NULL 53 | GROUP BY 54 | UUID, 55 | year_month_day 56 | -------------------------------------------------------------------------------- /annotation/exports/traceroute_hopannotation1_export.sql: -------------------------------------------------------------------------------- 1 | -- Generate a synthetic hopannotation1 annotation from base_tables.traceroute. 
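--
-- NOTES
--
-- As with the tcpinfo annotation export above, this query uses the
-- "Left Excluding JOIN" pattern to select only traceroute hops *without*
-- corresponding rows in the hopannotation1 annotation table (i.e.
-- "annotation.id IS NULL"). Both tables are filtered by partition dates, and
-- results are grouped by hopannotation1 ID and Date so that each ID is
-- exported at most once per day.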
2 | 3 | WITH annotations AS ( 4 | SELECT * 5 | FROM `{{ .project }}.raw_ndt.hopannotation1` 6 | WHERE date BETWEEN 7 | DATE_SUB(DATE('{{ .partitionID }}'), INTERVAL 1 DAY) 8 | AND DATE_ADD(DATE('{{ .partitionID }}'), INTERVAL 1 DAY) 9 | ), traceroutes AS ( 10 | SELECT * 11 | FROM `{{ .project }}.base_tables.traceroute` 12 | WHERE DATE(TestTime) = DATE('{{ .partitionID }}') 13 | AND DATE('{{ .partitionID }}') BETWEEN 14 | DATE_SUB(DATE(_PARTITIONTIME), INTERVAL 1 DAY) 15 | AND DATE_ADD(DATE(_PARTITIONTIME), INTERVAL 1 DAY) 16 | ) 17 | 18 | SELECT 19 | -- The below fields make up the content of the hopannotation1 file and are written to disk. 20 | hop.Source.hopannotation1.ID, 21 | FORMAT_TIMESTAMP("%FT%TZ", MIN(hop.Source.hopannotation1.Timestamp)) AS Timestamp, 22 | ANY_VALUE(hop.Source.hopannotation1.Annotations) AS Annotations, 23 | -- The below fields are used to construct the file path and name. 24 | REPLACE(CAST(DATE(hop.Source.hopannotation1.Timestamp) AS STRING), "-", "/") AS Date, 25 | FORMAT_TIMESTAMP("%Y%m%dT000000Z", MIN(hop.Source.hopannotation1.Timestamp)) AS FilenameTimestamp, 26 | REGEXP_EXTRACT(hop.Source.hopannotation1.ID, r".+_(.+)_.+") AS Hostname, 27 | ANY_VALUE(hop.Source.IP) AS IP 28 | FROM 29 | traceroutes AS traceroute, UNNEST(Hop) AS hop 30 | LEFT OUTER JOIN 31 | annotations AS annotation 32 | ON 33 | hop.Source.hopannotation1.ID = annotation.id 34 | WHERE annotation.id IS NULL 35 | AND hop.Source.hopannotation1.ID != "" 36 | AND hop.Source.hopannotation1.ID IS NOT NULL 37 | AND hop.Source.hopannotation1.Annotations IS NOT NULL 38 | GROUP BY hop.Source.hopannotation1.ID, Date 39 | -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: The stats-pipeline queries always read from the measurement-lab project. 2 | # This deployment is currently disabled in staging to prevent processing production data multiple times per day. 
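# The user-defined substitution variables referenced below (_DOCKER_TAG,
# _CLUSTER_NAME, _COMPUTE_REGION, _PIPELINE_CRON_SCHEDULE and
# _ANNOTATION_EXPORT_CRON_SCHEDULE) are expected to be provided by the
# Cloud Build trigger.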
3 | steps: 4 | - name: "gcr.io/cloud-builders/docker" 5 | id: "Build the docker container" 6 | args: ["build", "-t", "gcr.io/$PROJECT_ID/stats-pipeline:$_DOCKER_TAG", "."] 7 | 8 | - name: "gcr.io/cloud-builders/docker" 9 | id: "Push the docker container to gcr.io" 10 | args: ["push", "gcr.io/$PROJECT_ID/stats-pipeline:$_DOCKER_TAG"] 11 | 12 | - name: "gcr.io/cloud-builders/kubectl" 13 | id: "Create configmap manifest" 14 | entrypoint: /bin/bash 15 | args: 16 | - -c 17 | - | 18 | kubectl create configmap stats-pipeline-config \ 19 | --from-file=k8s/$_CLUSTER_NAME/config -o yaml --dry-run > \ 20 | configmap-manifest.json 21 | 22 | - name: "gcr.io/cloud-builders/kubectl" 23 | id: "Apply configmap" 24 | args: 25 | - apply 26 | - -f 27 | - configmap-manifest.json 28 | env: 29 | - CLOUDSDK_COMPUTE_REGION=$_COMPUTE_REGION 30 | - CLOUDSDK_CONTAINER_CLUSTER=$_CLUSTER_NAME 31 | 32 | - name: "gcr.io/cloud-builders/gcloud" 33 | id: "Generate manifest for stats-pipeline deployment" 34 | entrypoint: /bin/sh 35 | args: 36 | - -c 37 | - | 38 | sed 's/{{GCLOUD_PROJECT}}/${PROJECT_ID}/g' \ 39 | k8s/$_CLUSTER_NAME/deployments/stats-pipeline.yaml.template > \ 40 | manifest.yaml 41 | 42 | - name: "gcr.io/cloud-builders/gke-deploy" 43 | id: "Create stats-pipeline deployment" 44 | args: 45 | - run 46 | - --filename=manifest.yaml 47 | - --image=gcr.io/$PROJECT_ID/stats-pipeline:$_DOCKER_TAG 48 | - --location=$_COMPUTE_REGION 49 | - --cluster=$_CLUSTER_NAME 50 | # gke-deploy will fail if the output folder is non-empty, thus we use 51 | # different folders for the two executions of this tool. 52 | - --output=pipeline/ 53 | 54 | - name: "gcr.io/cloud-builders/kubectl" 55 | id: "Create stats-pipeline service" 56 | args: 57 | - apply 58 | - -f 59 | - k8s/$_CLUSTER_NAME/services/stats-pipeline.yaml 60 | env: 61 | - CLOUDSDK_COMPUTE_REGION=$_COMPUTE_REGION 62 | - CLOUDSDK_CONTAINER_CLUSTER=$_CLUSTER_NAME 63 | 64 | - name: "gcr.io/cloud-builders/gcloud" 65 | id: "Generate manifest for hopannotation1-export deployment" 66 | entrypoint: /bin/sh 67 | args: 68 | - -c 69 | - | 70 | sed 's/{{GCLOUD_PROJECT}}/${PROJECT_ID}/g' \ 71 | k8s/$_CLUSTER_NAME/deployments/hopannotation1-export-template.yaml > \ 72 | hopannotation1-export-manifest.yaml 73 | 74 | # hopannotation1 export deployment and service. 75 | - name: "gcr.io/cloud-builders/gke-deploy" 76 | id: "Create hopannotation1-export deployment" 77 | args: 78 | - run 79 | - --filename=hopannotation1-export-manifest.yaml 80 | - --image=gcr.io/$PROJECT_ID/stats-pipeline:$_DOCKER_TAG 81 | - --location=$_COMPUTE_REGION 82 | - --cluster=$_CLUSTER_NAME 83 | # gke-deploy will fail if the output folder is non-empty, thus we use 84 | # different folders for the two executions of this tool. 
85 | - --output=hopannotation1-export/ 86 | 87 | - name: "gcr.io/cloud-builders/kubectl" 88 | id: "Create hopannotation1-export service" 89 | args: 90 | - apply 91 | - -f 92 | - k8s/$_CLUSTER_NAME/services/hopannotation1-export.yaml 93 | env: 94 | - CLOUDSDK_COMPUTE_REGION=$_COMPUTE_REGION 95 | - CLOUDSDK_CONTAINER_CLUSTER=$_CLUSTER_NAME 96 | 97 | - name: "gcr.io/cloud-builders/docker" 98 | id: "Build the stats-pipeline-runner docker container" 99 | args: ["build", "-t", "gcr.io/$PROJECT_ID/stats-pipeline-runner:$_DOCKER_TAG", "maptiles/"] 100 | 101 | - name: "gcr.io/cloud-builders/docker" 102 | id: "Push the stats-pipeline-runner docker container to gcr.io" 103 | args: ["push", "gcr.io/$PROJECT_ID/stats-pipeline-runner:$_DOCKER_TAG"] 104 | 105 | - name: "gcr.io/cloud-builders/gcloud" 106 | id: "Generate manifest for the stats-pipeline-cronjob" 107 | entrypoint: /bin/sh 108 | args: 109 | - -c 110 | - | 111 | sed -e 's/{{GCLOUD_PROJECT}}/${PROJECT_ID}/g' \ 112 | -e "s/{{PIPELINE_CRON_SCHEDULE}}/${_PIPELINE_CRON_SCHEDULE}/g" \ 113 | k8s/$_CLUSTER_NAME/jobs/stats-pipeline-cronjob.yaml.template > \ 114 | stats-pipeline-cronjob.yaml 115 | 116 | - name: "gcr.io/cloud-builders/gcloud" 117 | id: "Generate manifest for the hopannotation1-export-cronjob" 118 | entrypoint: /bin/sh 119 | args: 120 | - -c 121 | - | 122 | sed -e 's/{{GCLOUD_PROJECT}}/${PROJECT_ID}/g' \ 123 | -e "s/{{ANNOTATION_EXPORT_CRON_SCHEDULE}}/${_ANNOTATION_EXPORT_CRON_SCHEDULE}/g" \ 124 | k8s/$_CLUSTER_NAME/jobs/hopannotation1-export-cronjob.template > \ 125 | hopannotation1-export-cronjob.yaml 126 | 127 | - name: "gcr.io/cloud-builders/gke-deploy" 128 | id: "Create stats-pipeline CronJob" 129 | args: 130 | - run 131 | - --filename=stats-pipeline-cronjob.yaml 132 | - --image=gcr.io/$PROJECT_ID/stats-pipeline-runner:$_DOCKER_TAG 133 | - --location=$_COMPUTE_REGION 134 | - --cluster=$_CLUSTER_NAME 135 | # gke-deploy will fail if the output folder is non-empty, thus we use 136 | # different folders for the two executions of this tool. 137 | - --output=stats-pipeline-runner/ 138 | 139 | - name: "gcr.io/cloud-builders/gke-deploy" 140 | id: "Create hopannotation1-export CronJob" 141 | args: 142 | - run 143 | - --filename=hopannotation1-export-cronjob.yaml 144 | - --image=gcr.io/$PROJECT_ID/stats-pipeline-runner:$_DOCKER_TAG 145 | - --location=$_COMPUTE_REGION 146 | - --cluster=$_CLUSTER_NAME 147 | # gke-deploy will fail if the output folder is non-empty, thus we use 148 | # different folders for the two executions of this tool. 
149 | - --output=hopannotation1-export-runner/ 150 | -------------------------------------------------------------------------------- /cmd/stats-pipeline/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "flag" 7 | "log" 8 | "net/http" 9 | "runtime" 10 | 11 | "cloud.google.com/go/bigquery" 12 | "cloud.google.com/go/storage" 13 | "github.com/googleapis/google-cloud-go-testing/bigquery/bqiface" 14 | "github.com/googleapis/google-cloud-go-testing/storage/stiface" 15 | 16 | "github.com/m-lab/go/flagx" 17 | "github.com/m-lab/go/httpx" 18 | "github.com/m-lab/go/prometheusx" 19 | "github.com/m-lab/go/rtx" 20 | "github.com/m-lab/go/uploader" 21 | "github.com/m-lab/stats-pipeline/config" 22 | "github.com/m-lab/stats-pipeline/exporter" 23 | "github.com/m-lab/stats-pipeline/formatter" 24 | "github.com/m-lab/stats-pipeline/output" 25 | "github.com/m-lab/stats-pipeline/pipeline" 26 | ) 27 | 28 | const dateFormat = "2006-01-02" 29 | 30 | var ( 31 | project string 32 | listenAddr string 33 | bucket string 34 | outputType = flagx.Enum{ 35 | Options: []string{"gcs", "local"}, 36 | Value: "gcs", 37 | } 38 | exportType = flagx.Enum{ 39 | Options: []string{"stats", "annotation", "hopannotation1"}, 40 | Value: "stats", 41 | } 42 | 43 | configFile = flagx.File{} 44 | mainCtx = context.Background() 45 | ) 46 | 47 | func init() { 48 | flag.StringVar(&listenAddr, "listenaddr", ":8080", "Address to listen on") 49 | flag.StringVar(&project, "project", "mlab-sandbox", 50 | "GCP Project ID to use") 51 | flag.StringVar(&bucket, "bucket", "statistics-mlab-sandbox", 52 | "GCS bucket to export the result to") 53 | flag.Var(&configFile, "config", "JSON configuration file") 54 | flag.Var(&outputType, "output", "Output to gcs or local files.") 55 | flag.Var(&exportType, "export", "Generate and export the named data type.") 56 | } 57 | 58 | func makeHTTPServer(listenAddr string, h http.Handler) *http.Server { 59 | return &http.Server{ 60 | Addr: listenAddr, 61 | Handler: h, 62 | } 63 | } 64 | 65 | func main() { 66 | flag.Parse() 67 | log.SetFlags(log.LUTC | log.Lshortfile | log.LstdFlags) 68 | rtx.Must(flagx.ArgsFromEnv(flag.CommandLine), "Could not parse env args") 69 | 70 | // Try parsing provided config file. 71 | var configs map[string]config.Config 72 | err := json.Unmarshal(configFile.Get(), &configs) 73 | rtx.Must(err, "cannot parse configuration file") 74 | 75 | bqClient, err := bigquery.NewClient(mainCtx, project) 76 | rtx.Must(err, "error initializing BQ client") 77 | 78 | gcsClient, err := storage.NewClient(mainCtx) 79 | rtx.Must(err, "error initializing GCS client") 80 | 81 | var wr exporter.Writer 82 | switch outputType.Value { 83 | case "gcs": 84 | wr = output.NewGCSWriter(uploader.New(stiface.AdaptClient(gcsClient), bucket)) 85 | case "local": 86 | wr = output.NewLocalWriter(bucket) 87 | } 88 | 89 | var f exporter.Formatter 90 | switch exportType.Value { 91 | case "stats": 92 | f = formatter.NewStatsQueryFormatter() 93 | case "annotation": 94 | f = formatter.NewTCPINFOAnnotationQueryFormatter() 95 | case "hopannotation1": 96 | f = formatter.NewTracerouteHopAnnotation1QueryFormatter() 97 | } 98 | exp := exporter.New(bqiface.AdaptClient(bqClient), project, wr, f) 99 | 100 | // Initialize handlers. 101 | pipelineHandler := pipeline.NewHandler(bqiface.AdaptClient(bqClient), 102 | exp, configs) 103 | 104 | // Initialize mux. 
105 | mux := http.NewServeMux() 106 | mux.Handle("/v0/pipeline", pipelineHandler) 107 | 108 | log.Printf("GOMAXPROCS is %d", runtime.GOMAXPROCS(0)) 109 | 110 | // Start main HTTP server. 111 | s := makeHTTPServer(listenAddr, mux) 112 | rtx.Must(httpx.ListenAndServeAsync(s), "Could not start HTTP server") 113 | defer s.Close() 114 | 115 | // Start Prometheus server for monitoring. 116 | promServer := prometheusx.MustServeMetrics() 117 | defer promServer.Close() 118 | 119 | // Keep serving until the context is canceled. 120 | <-mainCtx.Done() 121 | } 122 | -------------------------------------------------------------------------------- /compose-annotation-export.yaml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | volumes: 3 | shared: 4 | services: 5 | annotation_export: 6 | build: 7 | context: . 8 | dockerfile: Dockerfile 9 | image: local-stats-pipeline 10 | volumes: 11 | - shared:/var/spool/ndt 12 | - $HOME/.config/gcloud/:/root/.config/gcloud 13 | - ./k8s:/k8s 14 | ports: 15 | - target: 8080 16 | published: 8080 17 | protocol: tcp 18 | mode: host 19 | - target: 9990 20 | published: 9990 21 | protocol: tcp 22 | mode: host 23 | - target: 9991 24 | published: 9991 25 | protocol: tcp 26 | mode: host 27 | command: 28 | - -prometheusx.listen-address=:9990 29 | - -exporter.query-workers=1 30 | - -config=/k8s/data-pipeline/config/config-annotation-export.json 31 | - -output=local 32 | - -export=annotation 33 | - -bucket=/var/spool/ndt/annotation 34 | - -project=mlab-sandbox 35 | 36 | pusher: 37 | image: measurementlab/pusher:v1.19 38 | volumes: 39 | - shared:/var/spool/ndt 40 | - $HOME/.config/gcloud/:/root/.config/gcloud 41 | network_mode: "service:annotation_export" 42 | command: 43 | - -prometheusx.listen-address=:9991 44 | - -bucket=thirdparty-annotation-mlab-sandbox 45 | - -experiment=ndt 46 | - -datatype=annotation 47 | - -directory=/var/spool/ndt 48 | - -node_name=third-party 49 | - -archive_size_threshold=20MB 50 | - -max_file_age=10m # No need to wait after writing to upload a file (default 1h). 51 | - -archive_wait_time_min=15m # (default 30m0s) 52 | - -archive_wait_time_expected=30m # (default 1h0m0s) 53 | - -archive_wait_time_max=1h # (default 2h0m0s) 54 | - -sigterm_wait_time=60s 55 | - -metadata=MLAB.server.name=$HOSTNAME 56 | - -metadata=MLAB.experiment.name=ndt 57 | - -metadata=MLAB.pusher.image=measurementlab/pusher:v1.19 58 | - -metadata=MLAB.pusher.src.url=https://github.com/m-lab/pusher/tree/v1.19 59 | -------------------------------------------------------------------------------- /compose-hopannotation1-export.yaml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | volumes: 3 | shared: 4 | services: 5 | hopannotation1_export: 6 | build: 7 | context: . 
8 | dockerfile: Dockerfile 9 | image: local-stats-pipeline 10 | volumes: 11 | - shared:/var/spool/ndt 12 | - $HOME/.config/gcloud/:/root/.config/gcloud 13 | - ./k8s:/k8s 14 | ports: 15 | - target: 8080 16 | published: 8080 17 | protocol: tcp 18 | mode: host 19 | - target: 9990 20 | published: 9990 21 | protocol: tcp 22 | mode: host 23 | - target: 9991 24 | published: 9991 25 | protocol: tcp 26 | mode: host 27 | command: 28 | - -prometheusx.listen-address=:9990 29 | - -exporter.query-workers=1 30 | - -config=/k8s/data-pipeline/config/config-hopannotation1-export.json 31 | - -output=local 32 | - -export=hopannotation1 33 | - -bucket=/var/spool/ndt/hopannotation1 34 | - -project=mlab-sandbox 35 | 36 | pusher: 37 | image: measurementlab/pusher:v1.19 38 | volumes: 39 | - shared:/var/spool/ndt 40 | - $HOME/.config/gcloud/:/root/.config/gcloud 41 | network_mode: "service:hopannotation1_export" 42 | command: 43 | - -prometheusx.listen-address=:9991 44 | - -bucket=thirdparty-annotation-mlab-sandbox 45 | - -experiment=ndt 46 | - -datatype=hopannotation1 47 | - -directory=/var/spool/ndt 48 | - -node_name=third-party 49 | - -archive_size_threshold=20MB 50 | - -max_file_age=10m # No need to wait after writing to upload a file (default 1h). 51 | - -archive_wait_time_min=15m # (default 30m0s) 52 | - -archive_wait_time_expected=30m # (default 1h0m0s) 53 | - -archive_wait_time_max=1h # (default 2h0m0s) 54 | - -sigterm_wait_time=60s 55 | - -metadata=MLAB.server.name=$HOSTNAME 56 | - -metadata=MLAB.experiment.name=ndt 57 | - -metadata=MLAB.pusher.image=measurementlab/pusher:v1.19 58 | - -metadata=MLAB.pusher.src.url=https://github.com/m-lab/pusher/tree/v1.19 59 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "continents": { 3 | "histogramQueryFile": "statistics/queries/continent_histogram.sql", 4 | "exportQueryFile": "statistics/exports/continents.sql", 5 | "dataset": "statistics", 6 | "table": "continents", 7 | "outputPath": "v0/{{ .continent_code }}/{{ .year }}/histogram_daily_stats.json" 8 | }, 9 | "countries": { 10 | "histogramQueryFile": "statistics/queries/continent_country_histogram.sql", 11 | "exportQueryFile": "statistics/exports/countries.sql", 12 | "dataset": "statistics", 13 | "table": "countries", 14 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .year }}/histogram_daily_stats.json" 15 | }, 16 | "regions": { 17 | "histogramQueryFile": "statistics/queries/continent_country_region_histogram.sql", 18 | "exportQueryFile": "statistics/exports/regions.sql", 19 | "dataset": "statistics", 20 | "table": "regions", 21 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/{{ .year }}/histogram_daily_stats.json" 22 | }, 23 | "cities": { 24 | "histogramQueryFile": "statistics/queries/continent_country_region_city_histogram.sql", 25 | "exportQueryFile": "statistics/exports/cities.sql", 26 | "dataset": "statistics", 27 | "table": "cities", 28 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/{{ .city }}/{{ .year }}/histogram_daily_stats.json" 29 | }, 30 | "tracts": { 31 | "histogramQueryFile": "statistics/queries/us_census_tracts_histogram.sql", 32 | "exportQueryFile": "statistics/exports/us_tracts.sql", 33 | "dataset": "statistics", 34 | "table": "us_tracts", 35 | "outputPath": "v0/NA/US/tracts/{{ .GEOID }}/{{ .year }}/histogram_daily_stats.json" 36 | }, 37 | "states": { 38 | 
"histogramQueryFile": "statistics/queries/us_state_territories_histogram.sql", 39 | "exportQueryFile": "statistics/exports/us_states.sql", 40 | "dataset": "statistics", 41 | "table": "us_states", 42 | "outputPath": "v0/NA/US/states/{{ .GEOID }}/{{ .year }}/histogram_daily_stats.json" 43 | }, 44 | "counties": { 45 | "histogramQueryFile": "statistics/queries/us_county_histogram.sql", 46 | "exportQueryFile": "statistics/exports/us_counties.sql", 47 | "dataset": "statistics", 48 | "table": "us_counties", 49 | "outputPath": "v0/NA/US/counties/{{ .GEOID }}/{{ .year }}/histogram_daily_stats.json" 50 | }, 51 | "continents_asn": { 52 | "histogramQueryFile": "statistics/queries/continent_asn_histogram.sql", 53 | "exportQueryFile": "statistics/exports/continents_asn.sql", 54 | "dataset": "statistics", 55 | "table": "continents_asn", 56 | "outputPath": "v0/{{ .continent_code }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 57 | }, 58 | "countries_asn": { 59 | "histogramQueryFile": "statistics/queries/continent_country_asn_histogram.sql", 60 | "exportQueryFile": "statistics/exports/countries_asn.sql", 61 | "dataset": "statistics", 62 | "table": "countries_asn", 63 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 64 | }, 65 | "regions_asn": { 66 | "histogramQueryFile": "statistics/queries/continent_country_region_asn_histogram.sql", 67 | "exportQueryFile": "statistics/exports/regions_asn.sql", 68 | "dataset": "statistics", 69 | "table": "regions_asn", 70 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 71 | }, 72 | "cities_asn": { 73 | "histogramQueryFile": "statistics/queries/continent_country_region_city_asn_histogram.sql", 74 | "exportQueryFile": "statistics/exports/cities_asn.sql", 75 | "dataset": "statistics", 76 | "table": "cities_asn", 77 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/{{ .city }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 78 | }, 79 | "states_asn": { 80 | "histogramQueryFile": "statistics/queries/us_state_territories_asn_histogram.sql", 81 | "exportQueryFile": "statistics/exports/us_states_asn.sql", 82 | "dataset": "statistics", 83 | "table": "us_states_asn", 84 | "outputPath": "v0/NA/US/states/{{ .GEOID }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 85 | }, 86 | "counties_asn": { 87 | "histogramQueryFile": "statistics/queries/us_county_asn_histogram.sql", 88 | "exportQueryFile": "statistics/exports/us_counties_asn.sql", 89 | "dataset": "statistics", 90 | "table": "us_counties_asn", 91 | "outputPath": "v0/NA/US/counties/{{ .GEOID }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 92 | }, 93 | "tracts_asn": { 94 | "histogramQueryFile": "statistics/queries/us_census_tracts_asn_histogram.sql", 95 | "exportQueryFile": "statistics/exports/us_tracts_asn.sql", 96 | "dataset": "statistics", 97 | "table": "us_tracts_asn", 98 | "outputPath": "v0/NA/US/tracts/{{ .GEOID }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 99 | }, 100 | "global_asn": { 101 | "histogramQueryFile": "statistics/queries/global_asn_histogram.sql", 102 | "exportQueryFile": "statistics/exports/global_asn.sql", 103 | "dataset": "statistics", 104 | "table": "global_asn", 105 | "outputPath": "v0/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /config/config.go: 
-------------------------------------------------------------------------------- 1 | package config 2 | 3 | // Config is a configuration object for the stats pipeline. 4 | type Config struct { 5 | // HistogramQueryFile is the path to the query generating the histogram table. 6 | // This field is required. 7 | HistogramQueryFile string 8 | 9 | // DateField is the name of the date field in the query. 10 | // This is used to determine which rows to delete from the histogram table 11 | // when updating a certain range of dates. This field is required. 12 | DateField string 13 | 14 | // PartitionField is the field used to partition the histogram table. 15 | // It may be the same as DateField or a different field. 16 | // This field is optional. 17 | PartitionField string 18 | 19 | // PartitionType is the type of partitioning used. 20 | // Possible values are: 21 | // - "time": partition by timestamp, date or datetime 22 | // - "range": partition by integer range 23 | // This field is optional. 24 | PartitionType string 25 | 26 | // ExportQueryFile is the path to the export query. 27 | // This field is required. 28 | ExportQueryFile string 29 | 30 | // Dataset is the dataset name. This field is required. 31 | Dataset string 32 | 33 | // Table is the histogram table name. This field is required. 34 | Table string 35 | 36 | // OutputPath is a template defining the output path - either local or GCS. 37 | // This field is required. 38 | OutputPath string 39 | } 40 | -------------------------------------------------------------------------------- /cors-settings.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "origin": ["*"], 4 | "method": ["GET", "POST", "PUT", "HEAD", "PATCH", "DELETE", "OPTIONS"] 5 | } 6 | ] -------------------------------------------------------------------------------- /create_statistics_api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # create_statistics_api.sh creates all GCP resources needed to serve the statistics 4 | # API from a GCS bucket. 5 | 6 | set -euxo pipefail 7 | PROJECT=${1:?Please provide project} 8 | 9 | # Create statistics GCS bucket. 10 | statistics_bucket="statistics-${PROJECT}" 11 | if ! gsutil acl get "gs://${statistics_bucket}" &> /dev/null ; then 12 | gsutil mb -p ${PROJECT} -l us-central1 "gs://${statistics_bucket}" 13 | gsutil defacl set public-read "gs://${statistics_bucket}" 14 | fi 15 | 16 | # Apply CORS settings to the statistics bucket. 17 | gsutil cors set cors-settings.json gs://${statistics_bucket} 18 | 19 | # Lookup or create loadbalancer IP. 20 | lb_ip=$( 21 | gcloud --project ${PROJECT} compute addresses describe \ 22 | statistics-lb-ip --global --format="value(address)" || : 23 | ) 24 | if [[ -z "${lb_ip}" ]] ; then 25 | lb_ip=$( 26 | gcloud --project ${PROJECT} compute addresses create \ 27 | statistics-lb-ip --ip-version=IPV4 --global --format="value(address)" 28 | ) 29 | fi 30 | 31 | # Lookup or create the backend bucket for the statistics data bucket. 32 | statistics_backend_name=$( 33 | gcloud --project ${PROJECT} compute backend-buckets describe \ 34 | statistics-bucket --format='value(name)' || : 35 | ) 36 | if [[ -z "${statistics_backend_name}" ]] ; then 37 | statistics_backend_name=$( 38 | gcloud --project ${PROJECT} compute backend-buckets create \ 39 | statistics-bucket \ 40 | --gcs-bucket-name ${statistics_bucket} --format='value(name)' 41 | ) 42 | fi 43 | 44 | # Create url-map. 
45 | urlmap_name=$( 46 | gcloud --project ${PROJECT} compute url-maps describe \ 47 | statistics-url-map --format='value(name)' || : 48 | ) 49 | if [[ -z "${urlmap_name}" ]] ; then 50 | urlmap_name=$( 51 | gcloud --project ${PROJECT} compute url-maps create \ 52 | statistics-url-map \ 53 | --default-backend-bucket=${statistics_backend_name} \ 54 | --format='value(name)' 55 | ) 56 | fi 57 | 58 | # Setup DNS for statistics..measurementlab.net. 59 | current_ip=$( 60 | gcloud dns record-sets list --zone "${PROJECT}-measurementlab-net" \ 61 | --name "statistics.${PROJECT}.measurementlab.net." \ 62 | --format "value(rrdatas[0])" --project ${PROJECT} || : ) 63 | if [[ "${current_ip}" != "${lb_ip}" ]] ; then 64 | # Add the record, deleting the existing one first. 65 | gcloud dns record-sets transaction start \ 66 | --zone "${PROJECT}-measurementlab-net" \ 67 | --project ${PROJECT} 68 | # Allow remove to fail when CURRENT_IP is empty. 69 | gcloud dns record-sets transaction remove \ 70 | --zone "${PROJECT}-measurementlab-net" \ 71 | --name "statistics.${PROJECT}.measurementlab.net." \ 72 | --type A \ 73 | --ttl 300 \ 74 | "${current_ip}" --project ${PROJECT} || : 75 | gcloud dns record-sets transaction add \ 76 | --zone "${PROJECT}-measurementlab-net" \ 77 | --name "statistics.${PROJECT}.measurementlab.net." \ 78 | --type A \ 79 | --ttl 300 \ 80 | "${lb_ip}" \ 81 | --project ${PROJECT} 82 | gcloud dns record-sets transaction execute \ 83 | --zone "${PROJECT}-measurementlab-net" \ 84 | --project ${PROJECT} 85 | fi 86 | 87 | # Create managed TLS certificates. 88 | certificate_name=$( 89 | gcloud --project ${PROJECT} beta compute ssl-certificates describe \ 90 | statistics-certificate --format='value(name)' || : 91 | ) 92 | if [[ -z "${certificate_name}" ]] ; then 93 | certificate_name=$( 94 | gcloud --project ${PROJECT} beta compute ssl-certificates create \ 95 | statistics-certificate \ 96 | --domains statistics.${PROJECT}.measurementlab.net --format='value(name)' 97 | ) 98 | fi 99 | 100 | # Create the HTTPS target proxy connecting the url-map and managed certificate. 101 | proxy_name=$( 102 | gcloud --project ${PROJECT} compute target-https-proxies describe \ 103 | statistics-lb-proxy --format='value(name)' || : 104 | ) 105 | if [[ -z "${proxy_name}" ]] ; then 106 | proxy_name=$( 107 | gcloud --project ${PROJECT} compute target-https-proxies create \ 108 | statistics-lb-proxy \ 109 | --url-map ${urlmap_name} --ssl-certificates ${certificate_name} \ 110 | --format='value(name)' 111 | ) 112 | fi 113 | 114 | # Create the forwarding rule connecting our loadbalancer IP to the target proxy. 
115 | forwarder_name=$( 116 | gcloud --project ${PROJECT} compute forwarding-rules describe \ 117 | statistics-forwarder --global --format='value(name)' || : 118 | ) 119 | if [[ -z "${forwarder_name}" ]] ; then 120 | gcloud --project ${PROJECT} compute forwarding-rules create \ 121 | statistics-forwarder \ 122 | --address ${lb_ip} --global \ 123 | --target-https-proxy ${proxy_name} \ 124 | --ports 443 125 | fi 126 | -------------------------------------------------------------------------------- /docs/api-structure.md: -------------------------------------------------------------------------------- 1 | # Statistics API URL Structure, Available Geographies & Aggregations 2 | The statistics API provides aggregations of NDT data, accessible using a well 3 | defined URL structure provided at https://statistics.measurementlab.net/ 4 | 5 | ## Versioning 6 | At the top level, a version number is used to provide incremental releases of 7 | the statistics API as new features are added. The current version of `stats-api` 8 | is **v0**, for example: `https://statistics.measurementlab.net/v0/` 9 | 10 | ## Available Geographies and ASN Aggregations 11 | The URL structure below defines the geographies available in `stats-api`. At 12 | each level, aggregates by year and ASN are provided: 13 | 14 | ### Global 15 | * `/v0/asn/<ASN>/<year>/` 16 | 17 | At the global geographic level, we aggregate by [Autonomous System Number][asn]. 18 | 19 | [asn]: https://en.wikipedia.org/wiki/Autonomous_system_%28Internet%29 20 | 21 | ### Continent 22 | * `/v0/<continent_code>/<year>/` 23 | * `/v0/<continent_code>/asn/<ASN>/<year>/` 24 | 25 | Continents are represented by the two character continent code. 26 | 27 | ### Country 28 | * `/v0/<continent_code>/<country_code>/<year>/` 29 | * `/v0/<continent_code>/<country_code>/asn/<ASN>/<year>/` 30 | 31 | Countries are identified by their two character country code. 32 | 33 | ### ISO 3166-2 region level 1 34 | * `/v0/<continent_code>/<country_code>/<region_code>/<year>/` 35 | * `/v0/<continent_code>/<country_code>/<region_code>/asn/<ASN>/<year>/` 36 | 37 | The [ISO 3166-2 standard][iso-3166] is used to identify subdivisions within countries. This 38 | code begins with the two character country code, appended with a hyphen and up to 39 | three alphanumeric characters. 40 | 41 | [iso-3166]: https://en.wikipedia.org/wiki/ISO_3166-2 42 | 43 | ### United States County 44 | * `/v0/NA/US/counties/<GEOID>/<year>/` 45 | * `/v0/NA/US/counties/<GEOID>/asn/<ASN>/<year>/` 46 | 47 | United States Counties are identified using the shapefile polygons that define 48 | them, obtained through the US Census Bureau. The `GEOID` of each test is found 49 | by looking up the polygon that contains the test's annotated latitude and longitude. 50 | 51 | ### City 52 | * `/v0/<continent_code>/<country_code>/<region_code>/<city>/<year>/` 53 | * `/v0/<continent_code>/<country_code>/<region_code>/<city>/asn/<ASN>/<year>/` 54 | 55 | Cities are identified from the IP address annotations present in NDT data after 56 | it is published. 57 | 58 | ## Accessing Statistics Using the stats-api 59 | Using the API will depend largely on how you develop your application, but 60 | accessing the statistics is a matter of knowing the geography and year of 61 | interest, and using the appropriate URL pattern to access daily statistics. 62 | 63 | For example, to get statistics for Maryland in 2020, we would use this URL: 64 | `https://statistics.measurementlab.net/v0/NA/US/US-MD/2020/histogram_daily_stats.json` 65 | 66 | ## Additional Geographies Provided for Advisory / Comparison Use Only 67 | As mentioned in [Geographic Precision in `stats-pipeline`][geo-precision], NDT 68 | data may be aggregated by any geography, but the precision of individual test 69 | location annotations is limited to the precision of IP address geolocation.
In 70 | geographies that are quite small, aggregate data should be compared with other 71 | datasets and used only in an advisory capacity. 72 | 73 | One example in the US is the Census Tract. The geographies of tracts are quite 74 | small, and to achieve address level precision would require the collection of 75 | new NDT test data using a third party integration of the test that requests 76 | location from the user in some way. A variety of [community-driven initiatives][community-tools] 77 | are doing this, but these more accurately located tests are maintained by those 78 | initiatives. However, seeing NDT data aggregated by census tract can be 79 | generally useful as a point of comparison with other datasets. As such we 80 | provide aggregation by US Census Tract for this type of use case. 81 | 82 | ### United States Census tracts 83 | * `/v0/NA/tracts/<GEOID>/<year>/` 84 | * `/v0/NA/tracts/<GEOID>/asn/<ASN>/<year>/` 85 | 86 | [geo-precision]: geo-precision.md 87 | [community-tools]: https://www.measurementlab.net/data/tools/#community 88 | -------------------------------------------------------------------------------- /docs/format-schema.md: -------------------------------------------------------------------------------- 1 | # Statistics Output Format 2 | All statistics provided by this API are for a particular geography and day, 3 | over a calendar year. In addition, we provide aggregation of tests per 4 | Autonomous System Number, identifying statistics per provider within each 5 | geography and day/year. The Statistics Pipeline exports aggregate statistics 6 | across supported years and geographies in JSON files named: **histogram_daily_stats.json** 7 | 8 | Each file contains a JSON array with daily histograms and aggregate statistics. 9 | Each object in the array represents one histogram bucket for one day, and the 10 | current histograms use 8 buckets. So for each aggregate 11 | on a given day, there will be 8 JSON objects. For a complete year the file for 12 | an aggregation will contain 365*8 objects. 13 | 14 | ## Schema and Field Descriptions 15 | 16 | Below is a list and description of the fields provided in a JSON object for a 17 | single day and bucket: 18 | 19 | | Field | Description | 20 | |:-------------|:--------------------------------------------------------| 21 | | "date":"2020-01-01", | The date in `YYYY-MM-DD` format. | 22 | | "bucket_min":0, | The lower bound of the bucket which the statistics in this object represent. | 23 | | "bucket_max":1.7782794100389228 | The upper bound of the bucket which the statistics in this object represent. | 24 | | "dl_LOG_AVG_rnd1":25.591 | The LOG Average of download measurements in this aggregation, using the first randomly selected test per IP address in the set. Value is presented in megabits per second. | 25 | | "dl_LOG_AVG_rnd2":25.577 | The LOG Average of download measurements in this aggregation, using the second randomly selected test per IP address in the set. Value is presented in megabits per second. | 26 | | "dl_minRTT_LOG_AVG_rnd1":26.256 | The LOG Average of Minimum Round Trip Time of download measurements in this aggregation, using the first randomly selected test per IP address in the set. Value is presented in milliseconds. | 27 | | "dl_minRTT_LOG_AVG_rnd2":26.268 | The LOG Average of Minimum Round Trip Time of download measurements in this aggregation, using the second randomly selected test per IP address in the set. Value is presented in milliseconds. 
| 28 | | "dl_frac_bucket":0.057 | The fraction of download measurements within this histogram bucket. | 29 | | "dl_samples_bucket":10695 | The number of download measurement samples in this bucket. | 30 | | "dl_samples_day":188725 | The number of download measurement samples on this day. | 31 | | "download_MIN":0 | The minimum download speed in megabits per second on this day. | 32 | | "download_Q25":8.141 | The first quartile (25th percentile) download speed in megabits per second on this day. | 33 | | "download_MED":31.95 | The median download speed in megabits per second on this day. | 34 | | "download_AVG":79.745 | The average or mean download speed in megabits per second on this day. | 35 | | "download_Q75":97.572 | The upper quartile (75th percentile) download speed in megabits per second on this day. | 36 | | "download_MAX":3655.15 | The maximum download speed in megabits per second on this day. | 37 | | "download_minRTT_MED":25 | The median Minimum Round Trip Time in milliseconds 38 | for download measurements on this day. | 39 | | "ul_LOG_AVG_rnd1":6.589 | The LOG Average of upload measurements in this 40 | aggregation, using the first randomly selected test per IP address in the set. Value is presented in megabits per second. | 41 | | "ul_LOG_AVG_rnd2":6.589 | The LOG Average of upload measurements in this 42 | aggregation, using the second randomly selected test per IP address in the set. 43 | Value is presented in megabits per second. | 44 | | "ul_minRTT_LOG_AVG_rnd1":24.988 | The LOG Average of Minimum Round Trip Time of upload measurements in this aggregation, using the first randomly selected test per IP address in the set. Value is presented in milliseconds. | 45 | | "ul_minRTT_LOG_AVG_rnd2":25.003 | The LOG Average of Minimum Round Trip Time of upload measurements in this aggregation, using the second randomly selected test per IP address in the set. Value is presented in milliseconds. | 46 | | "ul_frac_bucket":0.113 | The fraction of upload measurements within this histogram bucket. | 47 | | "ul_samples_bucket":20769 | The number of upload measurement samples in this bucket. | 48 | | "ul_samples_day":183326 | The number of upload measurement samples on this day. | 49 | | "upload_MIN":0 | The minimum upload speed in megabits per second on this day. | 50 | | "upload_Q25":2.356 | The first quartile (25th percentile) upload speed in megabits per second on this day. | 51 | | "upload_MED":7.857 | The median upload speed in megabits per second on this day. | 52 | | "upload_AVG":28.034 | The average or mean upload speed in megabits per second on this day. | 53 | | "upload_Q75":17.306 | The upper quartile (75th percentile) upload speed in megabits per second on this day. | 54 | | "upload_MAX":3199.958 | The maximum upload speed in megabits per second on this day. | 55 | | "upload_minRTT_MED":23.83 | The median Minimum Round Trip Time in milliseconds 56 | for upload measurements on this day. | 57 | 58 | ## Statistics Also Available in BigQuery 59 | In addition to being available in this JSON API, the same data may be queried in 60 | the following dataset: `measurement-lab.statistics` 61 | -------------------------------------------------------------------------------- /docs/geo-precision.md: -------------------------------------------------------------------------------- 1 | # Geographic Precision in `stats-pipeline` 2 | Understanding how location in NDT test results are identified is important when 3 | looking at aggregations of that data by geography. 
4 | 5 | * NDT tests are conducted between a person or device testing (client) to an 6 | available M-Lab server. 7 | * The NDT measurements, and the IP address of the client are collected on the 8 | server, and pushed to our central archives. 9 | * Along the way, the results are annotated using the IP address as a lookup key, 10 | in publicly available datasets like Maxmind. 11 | * Location fields in NDT data represent the locations of ISP's equipment that 12 | hands out IP addresses, not the address or GPS location of the client. 13 | 14 | In general, aggregate NDT data should be considered advisory for geographic 15 | areas smaller than the ISO 3166-2 second level subdivisions within a country. 16 | For example in the US, the first level in the ISO 3166-2 standard corresponds to 17 | US states. The US does not identify second level subdivisions in this standard, 18 | therefore M-Lab recommends that US geographic aggregations at the county level 19 | as the smallest level of geography appropriate for this dataset, given current 20 | understanding. 21 | -------------------------------------------------------------------------------- /docs/stats-overview.md: -------------------------------------------------------------------------------- 1 | # What Statistics are Provided by `stats-pipeline`, and How are They Calculated? 2 | This service uses the M-Lab team's best, current understanding and recommended 3 | techniques for aggregating data in the NDT dataset. Below we provide a general 4 | text description of the approaches used by all queries, and snippets of one 5 | query to illustrate how they are accomplished in BigQuery SQL. 6 | 7 | The general approach currently used in the queries `stats-pipeline` uses to 8 | generate statistics: 9 | 10 | ## Establish a set of LOG scale "buckets" within which measurement test results will be grouped 11 | 12 | Bucketing or grouping results is fairly common. Think of the buckets used in 13 | `stats-pipeline` as "speed" ranges, where all measurements used in the aggregation will 14 | fall within one of the ranges. The fraction of measurements within each bucket 15 | in a single day and geography make up the "histogram" for that day in that 16 | geography. 17 | 18 | The snippet of SQL below produces our histogram buckets: 19 | ```~sql 20 | WITH 21 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 22 | buckets AS ( 23 | SELECT POW(10, x-.25) AS bucket_min, POW(10,x+.25) AS bucket_max 24 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 25 | ), 26 | ``` 27 | 28 | returning 8 buckets with the following ranges: 29 | 30 | ``` 31 | **bucket_min** **bucket_max** 32 | 0.56234132519034907 1.7782794100389228 33 | 1.7782794100389228 5.6234132519034912 34 | 5.6234132519034912 17.782794100389228 35 | 17.782794100389228 56.234132519034908 36 | 56.234132519034908 177.82794100389228 37 | 177.82794100389228 562.341325190349 38 | 562.341325190349 1778.2794100389228 39 | 1778.2794100389228 5623.4132519034911 40 | ``` 41 | 42 | ## Select the initial set of tests and filter out those that may not be properly annotated. 43 | Each query in `stats-pipeline` gathers test rows identified between two dates and within a geographic level. 
44 | 45 | ``` 46 | --Select the initial set of tests 47 | dl_per_location AS ( 48 | SELECT 49 | date, 50 | client.Geo.ContinentCode AS continent_code, 51 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 52 | id, 53 | a.MeanThroughputMbps AS mbps, 54 | a.MinRTT AS MinRTT 55 | FROM `measurement-lab.ndt.unified_downloads` 56 | WHERE date BETWEEN @startdate AND @enddate 57 | AND a.MeanThroughputMbps != 0 58 | ), 59 | --Filter for only tests With good locations and valid IPs 60 | dl_per_location_cleaned AS ( 61 | SELECT * FROM dl_per_location 62 | WHERE 63 | continent_code IS NOT NULL 64 | AND continent_code != "" 65 | AND ip IS NOT NULL 66 | ), 67 | ``` 68 | 69 | ## Fingerprint all cleaned tests, and sort in an arbitrary, but repeatable order 70 | By using the FARM_FINGERPRINT function, an arbitrary fingerprint is assigned to 71 | each row. Sorting on the fingerprint, along with the random selection in the 72 | next section effectively randomizes the set used to aggregate our statistics. 73 | 74 | ``` 75 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 76 | dl_fingerprinted AS ( 77 | SELECT 78 | date, 79 | continent_code, 80 | ip, 81 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 82 | FROM dl_per_location_cleaned 83 | GROUP BY date, continent_code, ip 84 | ), 85 | ``` 86 | 87 | ## Select two random rows for each IP using a prime number larger than the total number of tests 88 | 89 | ``` 90 | dl_random_ip_rows_perday AS ( 91 | SELECT 92 | date, 93 | continent_code, 94 | ip, 95 | ARRAY_LENGTH(members) AS tests, 96 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 97 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 98 | FROM dl_fingerprinted 99 | ), 100 | ``` 101 | 102 | ## Calculate log averages and statistics per day from random samples 103 | 104 | ``` 105 | dl_stats_per_day AS ( 106 | SELECT 107 | date, continent_code, 108 | COUNT(*) AS dl_samples_day, 109 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 110 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 111 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 112 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 113 | ROUND(MIN(random1.mbps),3) AS download_MIN, 114 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 115 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 116 | ROUND(AVG(random1.mbps),3) AS download_AVG, 117 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 118 | ROUND(MAX(random1.mbps),3) AS download_MAX, 119 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 120 | FROM dl_random_ip_rows_perday 121 | GROUP BY continent_code, date 122 | ), 123 | ``` 124 | 125 | ## Count the samples that fall into each bucket and get frequencies for the histogram 126 | 127 | ``` 128 | dl_histogram AS ( 129 | SELECT 130 | date, 131 | continent_code, 132 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
133 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 134 | ELSE bucket_left END AS bucket_min, 135 | bucket_right AS bucket_max, 136 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 137 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 138 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 139 | GROUP BY 140 | date, 141 | continent_code, 142 | bucket_min, 143 | bucket_max 144 | ), 145 | ``` 146 | -------------------------------------------------------------------------------- /exporter/testdata/export_query.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {{ .sourceTable }} 2 | WHERE {{ .whereClause }} -------------------------------------------------------------------------------- /formatter/annotation.go: -------------------------------------------------------------------------------- 1 | package formatter 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | 8 | "cloud.google.com/go/bigquery" 9 | "cloud.google.com/go/civil" 10 | "github.com/m-lab/stats-pipeline/config" 11 | "github.com/m-lab/uuid-annotator/annotator" 12 | ) 13 | 14 | // AnnotationQueryFormatter prepares export queries for annotation data exported 15 | // by the stats pipeline. 16 | type AnnotationQueryFormatter struct { 17 | DateExpr string // BigQuery expression used to extract a row's date. 18 | } 19 | 20 | // NewTCPINFOAnnotationQueryFormatter creates a new AnnotationQueryFormatter. 21 | func NewTCPINFOAnnotationQueryFormatter() *AnnotationQueryFormatter { 22 | return &AnnotationQueryFormatter{DateExpr: "DATE(TestTime)"} 23 | } 24 | 25 | // Source returns a fully qualified bigquery table name. The year is ignored. 26 | func (f *AnnotationQueryFormatter) Source(project string, config config.Config, year int) string { 27 | return fmt.Sprintf("%s.%s.%s", project, config.Dataset, config.Table) 28 | } 29 | 30 | // Partitions returns a bigquery query for listing all partitions for a given 31 | // source table. The Annotation query partitions on `date`. 32 | func (f *AnnotationQueryFormatter) Partitions(source string) string { 33 | return fmt.Sprintf( 34 | `SELECT %s as date 35 | FROM %s 36 | WHERE %s < DATE('2020-03-11') 37 | GROUP BY date 38 | ORDER BY date`, f.DateExpr, source, f.DateExpr) 39 | } 40 | 41 | // Partition returns a date partition id based on a row returned by running the 42 | // Partitions() query. The partition id can be used in query templates. The 43 | // Annotation formatter conditions searches on the Date. 44 | func (f *AnnotationQueryFormatter) Partition(row map[string]bigquery.Value) string { 45 | date, ok := row["date"] 46 | if !ok { 47 | return "0001-01-01" // a noop expression. 48 | } 49 | partition, ok := date.(civil.Date) 50 | if !ok { 51 | return "0001-01-01" // a noop expression. 52 | } 53 | return fmt.Sprintf("%d-%02d-%02d", partition.Year, int(partition.Month), partition.Day) 54 | 55 | } 56 | 57 | // Marshal converts an export query row into a byte result suitable for writing 58 | // to disk. For annotation export, the format is marshalled to annotation.Annotations and then to JSON. 59 | func (f *AnnotationQueryFormatter) Marshal(rows []map[string]bigquery.Value) ([]byte, error) { 60 | if len(rows) == 0 { 61 | return nil, errors.New("zero length record") 62 | } 63 | // Serialize the bigquery row to JSON. This will include empty fields. 
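	// Note that only the first row (rows[0]) is marshalled here; any additional rows are ignored.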
64 | j, err := json.Marshal(rows[0]) 65 | if err != nil { 66 | return nil, err 67 | } 68 | 69 | // Load JSON into real annotation struct. 70 | v := annotator.Annotations{} 71 | err = json.Unmarshal(j, &v) 72 | if err != nil { 73 | return nil, err 74 | } 75 | 76 | // Serialize the actual type to JSON, which omits empty fields. 77 | return json.Marshal(v) 78 | } 79 | -------------------------------------------------------------------------------- /formatter/annotation_test.go: -------------------------------------------------------------------------------- 1 | // Package formatter provides query formatters for export types supported by the stats pipeline. 2 | package formatter 3 | 4 | import ( 5 | "reflect" 6 | "testing" 7 | "time" 8 | 9 | "cloud.google.com/go/civil" 10 | 11 | "cloud.google.com/go/bigquery" 12 | "github.com/m-lab/stats-pipeline/config" 13 | ) 14 | 15 | func TestAnnotationQueryFormatter_Source(t *testing.T) { 16 | tests := []struct { 17 | name string 18 | project string 19 | config config.Config 20 | year int 21 | want string 22 | }{ 23 | { 24 | name: "success", 25 | project: "mlab-testing", 26 | config: config.Config{ 27 | Dataset: "statistics", 28 | Table: "bananas", 29 | }, 30 | year: 2019, 31 | want: "mlab-testing.statistics.bananas", 32 | }, 33 | } 34 | for _, tt := range tests { 35 | t.Run(tt.name, func(t *testing.T) { 36 | f := NewTCPINFOAnnotationQueryFormatter() 37 | if got := f.Source(tt.project, tt.config, tt.year); got != tt.want { 38 | t.Errorf("AnnotationQueryFormatter.Source() = %#v, want %#v", got, tt.want) 39 | } 40 | }) 41 | } 42 | } 43 | 44 | func TestAnnotationQueryFormatter_Partitions(t *testing.T) { 45 | tests := []struct { 46 | name string 47 | source string 48 | want string 49 | }{ 50 | { 51 | name: "success", 52 | source: "a.b.c", 53 | want: `SELECT DATE(TestTime) as date 54 | FROM a.b.c 55 | WHERE DATE(TestTime) < DATE('2020-03-11') 56 | GROUP BY date 57 | ORDER BY date`, 58 | }, 59 | } 60 | for _, tt := range tests { 61 | t.Run(tt.name, func(t *testing.T) { 62 | f := NewTCPINFOAnnotationQueryFormatter() 63 | if got := f.Partitions(tt.source); got != tt.want { 64 | t.Errorf("AnnotationQueryFormatter.Partitions() = %v, want %v", got, tt.want) 65 | } 66 | }) 67 | } 68 | } 69 | 70 | func TestAnnotationQueryFormatter_Partition(t *testing.T) { 71 | tests := []struct { 72 | name string 73 | row map[string]bigquery.Value 74 | want string 75 | }{ 76 | { 77 | name: "success", 78 | row: map[string]bigquery.Value{ 79 | "date": civil.DateOf(time.Date(2020, time.June, 01, 0, 0, 0, 0, time.UTC)), 80 | }, 81 | want: `2020-06-01`, 82 | }, 83 | { 84 | name: "error-missing-date", 85 | row: map[string]bigquery.Value{ 86 | "missing_date": 10, 87 | }, 88 | want: "0001-01-01", 89 | }, 90 | { 91 | name: "error-date-wrong-type", 92 | row: map[string]bigquery.Value{ 93 | "date": time.Date(2020, time.June, 01, 0, 0, 0, 0, time.UTC), 94 | }, 95 | want: "0001-01-01", 96 | }, 97 | } 98 | for _, tt := range tests { 99 | t.Run(tt.name, func(t *testing.T) { 100 | f := NewTCPINFOAnnotationQueryFormatter() 101 | if got := f.Partition(tt.row); got != tt.want { 102 | t.Errorf("AnnotationQueryFormatter.Partition() = %v, want %v", got, tt.want) 103 | } 104 | }) 105 | } 106 | } 107 | 108 | func TestAnnotationQueryFormatter_Marshal(t *testing.T) { 109 | tests := []struct { 110 | name string 111 | rows []map[string]bigquery.Value 112 | want []byte 113 | wantErr bool 114 | }{ 115 | { 116 | name: "success", 117 | rows: []map[string]bigquery.Value{ 118 | { 119 | "UUID": "abcdefghijklmnop", 120 
| }, 121 | { 122 | "UUID": "IGNORED", 123 | }, 124 | }, 125 | want: []byte(`{"UUID":"abcdefghijklmnop","Timestamp":"0001-01-01T00:00:00Z","Server":{},"Client":{}}`), 126 | }, 127 | { 128 | name: "failure", 129 | rows: []map[string]bigquery.Value{ 130 | { 131 | // Functions are valid bigquery.Values but cannot be marshalled to JSON. 132 | "test": func() {}, 133 | }, 134 | }, 135 | wantErr: true, 136 | }, 137 | { 138 | name: "failure-empty-array", 139 | rows: []map[string]bigquery.Value{}, 140 | wantErr: true, 141 | }, 142 | } 143 | for _, tt := range tests { 144 | t.Run(tt.name, func(t *testing.T) { 145 | f := NewTCPINFOAnnotationQueryFormatter() 146 | got, err := f.Marshal(tt.rows) 147 | if (err != nil) != tt.wantErr { 148 | t.Errorf("AnnotationQueryFormatter.Marshal() error = %v, wantErr %v", err, tt.wantErr) 149 | return 150 | } 151 | if !reflect.DeepEqual(got, tt.want) { 152 | t.Errorf("AnnotationQueryFormatter.Marshal() = %q, want %q", string(got), string(tt.want)) 153 | } 154 | }) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /formatter/hopannotation1.go: -------------------------------------------------------------------------------- 1 | package formatter 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | 8 | "cloud.google.com/go/bigquery" 9 | "cloud.google.com/go/civil" 10 | "github.com/m-lab/stats-pipeline/config" 11 | "github.com/m-lab/traceroute-caller/hopannotation" 12 | ) 13 | 14 | // HopAnnotation1QueryFormatter prepares export queries for hopannotation1 data 15 | // exported by the stats pipeline. 16 | type HopAnnotation1QueryFormatter struct{} 17 | 18 | // NewTracerouteHopAnnotation1QueryFormatter creates a new HopAnnotation1QueryFormatter. 19 | func NewTracerouteHopAnnotation1QueryFormatter() *HopAnnotation1QueryFormatter { 20 | return &HopAnnotation1QueryFormatter{} 21 | } 22 | 23 | // Source returns a fully qualified bigquery table name. The year is ignored. 24 | func (f *HopAnnotation1QueryFormatter) Source(project string, config config.Config, year int) string { 25 | return fmt.Sprintf("%s.%s.%s", project, config.Dataset, config.Table) 26 | } 27 | 28 | // Partitions returns a bigquery query for listing all partitions for a given 29 | // source table. The HopAnnotation1 query partitions on `date`. 30 | func (f *HopAnnotation1QueryFormatter) Partitions(source string) string { 31 | return fmt.Sprintf( 32 | `SELECT DATE(TestTime) as date 33 | FROM %s 34 | WHERE DATE(TestTime) BETWEEN DATE('2019-03-29') AND DATE('2021-09-08') 35 | GROUP BY date 36 | ORDER BY date`, source) 37 | } 38 | 39 | // Partition returns a date partition id based on a row returned by running the 40 | // Partitions() query. The partition id can be used in query templates. The 41 | // HopAnnotation1 formatter condition searches on the Date. 42 | func (f *HopAnnotation1QueryFormatter) Partition(row map[string]bigquery.Value) string { 43 | date, ok := row["date"] 44 | if !ok { 45 | return "0001-01-01" // a noop expression. 46 | } 47 | partition, ok := date.(civil.Date) 48 | if !ok { 49 | return "0001-01-01" // a noop expression. 50 | } 51 | return fmt.Sprintf("%d-%02d-%02d", partition.Year, int(partition.Month), partition.Day) 52 | } 53 | 54 | // Marshal converts an export query row into a byte result suitable for writing 55 | // to disk. For hopannotation1 export, the format is marshalled to hopannotation.HopAnnotation1{}, 56 | // and then to JSON. 
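// Only the first row of the input is serialized; any additional rows are ignored.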
57 | func (f *HopAnnotation1QueryFormatter) Marshal(rows []map[string]bigquery.Value) ([]byte, error) { 58 | if len(rows) == 0 { 59 | return nil, errors.New("zero length record") 60 | } 61 | // Serialize the bigquery row to JSON. This will include empty fields. 62 | j, err := json.Marshal(rows[0]) 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | // Load JSON into real hopannotation1 struct. 68 | v := hopannotation.HopAnnotation1{} 69 | err = json.Unmarshal(j, &v) 70 | if err != nil { 71 | return nil, err 72 | } 73 | 74 | // Serialize the actual type to JSON, which omits empty fields. 75 | return json.Marshal(v) 76 | } 77 | -------------------------------------------------------------------------------- /formatter/hopannotation1_test.go: -------------------------------------------------------------------------------- 1 | // Package formatter provides query formatters for export types supported by the stats pipeline. 2 | package formatter 3 | 4 | import ( 5 | "reflect" 6 | "testing" 7 | "time" 8 | 9 | "cloud.google.com/go/bigquery" 10 | "cloud.google.com/go/civil" 11 | "github.com/m-lab/stats-pipeline/config" 12 | "github.com/m-lab/uuid-annotator/annotator" 13 | ) 14 | 15 | func TestHopAnnotation1QueryFormatter_Source(t *testing.T) { 16 | tests := []struct { 17 | name string 18 | project string 19 | config config.Config 20 | year int 21 | want string 22 | }{ 23 | { 24 | name: "success", 25 | project: "mlab-testing", 26 | config: config.Config{ 27 | Dataset: "base_tables", 28 | Table: "traceroute", 29 | }, 30 | year: 2019, 31 | want: "mlab-testing.base_tables.traceroute", 32 | }, 33 | } 34 | for _, tt := range tests { 35 | t.Run(tt.name, func(t *testing.T) { 36 | f := NewTracerouteHopAnnotation1QueryFormatter() 37 | if got := f.Source(tt.project, tt.config, tt.year); got != tt.want { 38 | t.Errorf("HopAnnotation1QueryFormatter.Source() = %#v, want %#v", got, tt.want) 39 | } 40 | }) 41 | } 42 | } 43 | 44 | func TestHopAnnotation1QueryFormatter_Partitions(t *testing.T) { 45 | tests := []struct { 46 | name string 47 | source string 48 | want string 49 | }{ 50 | { 51 | name: "success", 52 | source: "a.b.c", 53 | want: `SELECT DATE(TestTime) as date 54 | FROM a.b.c 55 | WHERE DATE(TestTime) BETWEEN DATE('2019-03-29') AND DATE('2021-09-08') 56 | GROUP BY date 57 | ORDER BY date`, 58 | }, 59 | } 60 | for _, tt := range tests { 61 | t.Run(tt.name, func(t *testing.T) { 62 | f := NewTracerouteHopAnnotation1QueryFormatter() 63 | if got := f.Partitions(tt.source); got != tt.want { 64 | t.Errorf("HopAnnotation1QueryFormatter.Partitions() = %v, want %v", got, tt.want) 65 | } 66 | }) 67 | } 68 | } 69 | 70 | func TestHopAnnotation1QueryFormatter_Partition(t *testing.T) { 71 | tests := []struct { 72 | name string 73 | row map[string]bigquery.Value 74 | want string 75 | }{ 76 | { 77 | name: "success", 78 | row: map[string]bigquery.Value{ 79 | "date": civil.DateOf(time.Date(2020, time.June, 01, 0, 0, 0, 0, time.UTC)), 80 | }, 81 | want: `2020-06-01`, 82 | }, 83 | { 84 | name: "error-missing-date", 85 | row: map[string]bigquery.Value{ 86 | "missing_date": 10, 87 | }, 88 | want: "0001-01-01", 89 | }, 90 | { 91 | name: "error-date-wrong-type", 92 | row: map[string]bigquery.Value{ 93 | "date": time.Date(2020, time.June, 01, 0, 0, 0, 0, time.UTC), 94 | }, 95 | want: "0001-01-01", 96 | }, 97 | { 98 | name: "error-empty-map", 99 | row: map[string]bigquery.Value{}, 100 | want: "0001-01-01", 101 | }, 102 | } 103 | for _, tt := range tests { 104 | t.Run(tt.name, func(t *testing.T) { 105 | f := 
NewTracerouteHopAnnotation1QueryFormatter() 106 | if got := f.Partition(tt.row); got != tt.want { 107 | t.Errorf("HopAnnotation1QueryFormatter.Partition() = %v, want %v", got, tt.want) 108 | } 109 | }) 110 | } 111 | } 112 | 113 | func TestHopAnnotation1QueryFormatter_Marshal(t *testing.T) { 114 | tests := []struct { 115 | name string 116 | rows []map[string]bigquery.Value 117 | want []byte 118 | wantErr bool 119 | }{ 120 | { 121 | name: "hopannotation1-schema-success", 122 | rows: []map[string]bigquery.Value{ 123 | { 124 | "ID": "abcdefghijklmnop", 125 | "Timestamp": "2021-03-21T11:09:00Z", 126 | "Annotations": &annotator.ClientAnnotations{ 127 | Geo: &annotator.Geolocation{ 128 | ContinentCode: "EU", 129 | CountryCode: "ES", 130 | CountryName: "Spain", 131 | Region: "CT", 132 | Subdivision1ISOCode: "CT", 133 | Subdivision1Name: "Catalonia", 134 | Subdivision2ISOCode: "B", 135 | Subdivision2Name: "Barcelona", 136 | City: "Canet de Mar", 137 | PostalCode: "08360", 138 | Latitude: 1, 139 | Longitude: 2, 140 | AccuracyRadiusKm: 1, 141 | }, 142 | Network: &annotator.Network{ 143 | CIDR: "84.88.0.0/17", 144 | ASNumber: 13041, 145 | ASName: "Consorci de Universitaris de Catalunya", 146 | }, 147 | }, 148 | }, 149 | }, 150 | want: []byte(`{"ID":"abcdefghijklmnop","Timestamp":"2021-03-21T11:09:00Z","Annotations":{"Geo":{"ContinentCode":"EU","CountryCode":"ES","CountryName":"Spain","Region":"CT","Subdivision1ISOCode":"CT","Subdivision1Name":"Catalonia","Subdivision2ISOCode":"B","Subdivision2Name":"Barcelona","City":"Canet de Mar","PostalCode":"08360","Latitude":1,"Longitude":2,"AccuracyRadiusKm":1},"Network":{"CIDR":"84.88.0.0/17","ASNumber":13041,"ASName":"Consorci de Universitaris de Catalunya"}}}`), 151 | }, 152 | { 153 | name: "annotation-schema-ignored", 154 | rows: []map[string]bigquery.Value{ 155 | { 156 | "UUID": "abcdefghijklmnop", 157 | "Timestamp": "0001-01-01T00:00:00Z", 158 | "Server": annotator.ServerAnnotations{}, 159 | "Client": annotator.ClientAnnotations{}, 160 | }, 161 | }, 162 | want: []byte(`{"ID":"","Timestamp":"0001-01-01T00:00:00Z","Annotations":null}`), 163 | }, 164 | { 165 | name: "failure", 166 | rows: []map[string]bigquery.Value{ 167 | { 168 | // Functions are valid bigquery.Values but cannot be marshalled to JSON. 169 | "test": func() {}, 170 | }, 171 | }, 172 | wantErr: true, 173 | }, 174 | { 175 | name: "failure-empty-array", 176 | rows: []map[string]bigquery.Value{}, 177 | wantErr: true, 178 | }, 179 | } 180 | for _, tt := range tests { 181 | t.Run(tt.name, func(t *testing.T) { 182 | f := NewTracerouteHopAnnotation1QueryFormatter() 183 | got, err := f.Marshal(tt.rows) 184 | if (err != nil) != tt.wantErr { 185 | t.Errorf("HopAnnotation1QueryFormatter.Marshal() error = %v, wantErr %v", err, tt.wantErr) 186 | return 187 | } 188 | if !reflect.DeepEqual(got, tt.want) { 189 | t.Errorf("HopAnnotation1QueryFormatter.Marshal() = %q, want %q", string(got), string(tt.want)) 190 | } 191 | }) 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /formatter/stats.go: -------------------------------------------------------------------------------- 1 | // Package formatter provides query formatters for export types supported by the stats pipeline. 2 | package formatter 3 | 4 | import ( 5 | "encoding/json" 6 | "fmt" 7 | 8 | "cloud.google.com/go/bigquery" 9 | "github.com/m-lab/stats-pipeline/config" 10 | ) 11 | 12 | // StatsQueryFormatter prepares export queries for statstics in the stats pipeline. 
13 | type StatsQueryFormatter struct{} 14 | 15 | // NewStatsQueryFormatter creates a new StatsQueryFormatter. 16 | func NewStatsQueryFormatter() *StatsQueryFormatter { 17 | return &StatsQueryFormatter{} 18 | } 19 | 20 | // Source returns a fully qualified bigquery table name including a year suffix 21 | // used by the stats pipeline. 22 | func (f *StatsQueryFormatter) Source(project string, config config.Config, year int) string { 23 | return fmt.Sprintf("%s.%s.%s_%d", project, config.Dataset, config.Table, year) 24 | } 25 | 26 | // Partitions returns a bigquery query for listing all partitions for a given 27 | // source table. 28 | func (f *StatsQueryFormatter) Partitions(source string) string { 29 | return fmt.Sprintf( 30 | `SELECT shard 31 | FROM %s 32 | GROUP BY shard 33 | ORDER BY COUNT(*) DESC`, source) 34 | } 35 | 36 | // Partition returns a shard partition id based on a row returned by running the 37 | // Partitions() query. The partition id can be used in query templates. 38 | func (f *StatsQueryFormatter) Partition(row map[string]bigquery.Value) string { 39 | shard, ok := row["shard"] 40 | if !ok { 41 | return "-1" // a noop expression. 42 | } 43 | partition, ok := shard.(int64) 44 | if !ok { 45 | return "-1" // a noop expression. 46 | } 47 | return fmt.Sprintf("%d", partition) 48 | } 49 | 50 | // Marshal converts an export query row into a byte result suitable for writing 51 | // to disk. For stats pipeline export, the format is JSON. 52 | func (f *StatsQueryFormatter) Marshal(rows []map[string]bigquery.Value) ([]byte, error) { 53 | j, err := json.Marshal(rows) 54 | if err != nil { 55 | return nil, err 56 | } 57 | return j, nil 58 | } 59 | -------------------------------------------------------------------------------- /formatter/stats_test.go: -------------------------------------------------------------------------------- 1 | // Package formatter provides query formatters for export types supported by the stats pipeline. 
2 | package formatter 3 | 4 | import ( 5 | "reflect" 6 | "testing" 7 | 8 | "cloud.google.com/go/bigquery" 9 | "github.com/m-lab/stats-pipeline/config" 10 | ) 11 | 12 | func TestStatsQueryFormatter_Source(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | project string 16 | config config.Config 17 | year int 18 | want string 19 | }{ 20 | { 21 | name: "success", 22 | project: "mlab-testing", 23 | config: config.Config{ 24 | Dataset: "statistics", 25 | Table: "bananas", 26 | }, 27 | year: 2019, 28 | want: "mlab-testing.statistics.bananas_2019", 29 | }, 30 | } 31 | for _, tt := range tests { 32 | t.Run(tt.name, func(t *testing.T) { 33 | f := NewStatsQueryFormatter() 34 | if got := f.Source(tt.project, tt.config, tt.year); got != tt.want { 35 | t.Errorf("StatsQueryFormatter.Source() = %#v, want %#v", got, tt.want) 36 | } 37 | }) 38 | } 39 | } 40 | 41 | func TestStatsQueryFormatter_Partitions(t *testing.T) { 42 | tests := []struct { 43 | name string 44 | source string 45 | want string 46 | }{ 47 | { 48 | name: "success", 49 | source: "a.b.c", 50 | want: `SELECT shard 51 | FROM a.b.c 52 | GROUP BY shard 53 | ORDER BY COUNT(*) DESC`, 54 | }, 55 | } 56 | for _, tt := range tests { 57 | t.Run(tt.name, func(t *testing.T) { 58 | f := NewStatsQueryFormatter() 59 | if got := f.Partitions(tt.source); got != tt.want { 60 | t.Errorf("StatsQueryFormatter.Partitions() = %v, want %v", got, tt.want) 61 | } 62 | }) 63 | } 64 | } 65 | 66 | func TestStatsQueryFormatter_Partition(t *testing.T) { 67 | tests := []struct { 68 | name string 69 | row map[string]bigquery.Value 70 | want string 71 | }{ 72 | { 73 | name: "success", 74 | row: map[string]bigquery.Value{ 75 | "shard": int64(1234), 76 | }, 77 | want: "1234", 78 | }, 79 | { 80 | name: "error-missing-date", 81 | row: map[string]bigquery.Value{ 82 | "missing_shard": 10, 83 | }, 84 | want: "-1", 85 | }, 86 | { 87 | name: "error-date-wrong-type", 88 | row: map[string]bigquery.Value{ 89 | "shard": float64(1.3), 90 | }, 91 | want: "-1", 92 | }, 93 | } 94 | for _, tt := range tests { 95 | t.Run(tt.name, func(t *testing.T) { 96 | f := NewStatsQueryFormatter() 97 | if got := f.Partition(tt.row); got != tt.want { 98 | t.Errorf("StatsQueryFormatter.Partition() = %v, want %v", got, tt.want) 99 | } 100 | }) 101 | } 102 | } 103 | 104 | func TestStatsQueryFormatter_Marshal(t *testing.T) { 105 | tests := []struct { 106 | name string 107 | rows []map[string]bigquery.Value 108 | want []byte 109 | wantErr bool 110 | }{ 111 | { 112 | name: "success", 113 | rows: []map[string]bigquery.Value{ 114 | { 115 | "test": 1234, 116 | }, 117 | }, 118 | want: []byte(`[{"test":1234}]`), 119 | }, 120 | { 121 | name: "failure", 122 | rows: []map[string]bigquery.Value{ 123 | { 124 | // Functions are valid bigquery.Values but cannot be marshalled to JSON. 
125 | "test": func() {}, 126 | }, 127 | }, 128 | wantErr: true, 129 | }, 130 | } 131 | for _, tt := range tests { 132 | t.Run(tt.name, func(t *testing.T) { 133 | f := NewStatsQueryFormatter() 134 | got, err := f.Marshal(tt.rows) 135 | if (err != nil) != tt.wantErr { 136 | t.Errorf("StatsQueryFormatter.Marshal() error = %v, wantErr %v", err, tt.wantErr) 137 | return 138 | } 139 | if !reflect.DeepEqual(got, tt.want) { 140 | t.Errorf("StatsQueryFormatter.Marshal() = %q, want %q", string(got), string(tt.want)) 141 | } 142 | }) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/m-lab/stats-pipeline 2 | 3 | go 1.20 4 | 5 | require ( 6 | cloud.google.com/go v0.68.0 7 | cloud.google.com/go/bigquery v1.12.0 8 | cloud.google.com/go/storage v1.10.0 9 | github.com/googleapis/google-cloud-go-testing v0.0.0-20191008195207-8e1d251e947d 10 | github.com/m-lab/go v0.1.66 11 | github.com/m-lab/traceroute-caller v0.9.1 12 | github.com/m-lab/uuid-annotator v0.4.5 13 | github.com/prometheus/client_golang v1.11.0 14 | github.com/prometheus/client_model v0.2.0 15 | google.golang.org/api v0.32.0 16 | ) 17 | 18 | require ( 19 | github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1 // indirect 20 | github.com/beorn7/perks v1.0.1 // indirect 21 | github.com/cespare/xxhash/v2 v2.1.1 // indirect 22 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect 23 | github.com/golang/protobuf v1.4.3 // indirect 24 | github.com/googleapis/gax-go/v2 v2.0.5 // indirect 25 | github.com/jstemmer/go-junit-report v0.9.1 // indirect 26 | github.com/m-lab/tcp-info v1.5.3 // indirect 27 | github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect 28 | github.com/oschwald/geoip2-golang v1.5.0 // indirect 29 | github.com/oschwald/maxminddb-golang v1.8.0 // indirect 30 | github.com/prometheus/common v0.26.0 // indirect 31 | github.com/prometheus/procfs v0.6.0 // indirect 32 | go.opencensus.io v0.22.4 // indirect 33 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b // indirect 34 | golang.org/x/mod v0.3.0 // indirect 35 | golang.org/x/net v0.0.0-20200927032502-5d4f70055728 // indirect 36 | golang.org/x/oauth2 v0.0.0-20200902213428-5d25da1a8d43 // indirect 37 | golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40 // indirect 38 | golang.org/x/text v0.3.3 // indirect 39 | golang.org/x/tools v0.0.0-20201011145850-ed2f50202694 // indirect 40 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect 41 | google.golang.org/appengine v1.6.6 // indirect 42 | google.golang.org/genproto v0.0.0-20201009135657-4d944d34d83c // indirect 43 | google.golang.org/grpc v1.32.0 // indirect 44 | google.golang.org/protobuf v1.26.0-rc.1 // indirect 45 | ) 46 | -------------------------------------------------------------------------------- /histogram/table.go: -------------------------------------------------------------------------------- 1 | package histogram 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "errors" 7 | "log" 8 | "net/http" 9 | "text/template" 10 | "time" 11 | 12 | "cloud.google.com/go/bigquery" 13 | "github.com/googleapis/google-cloud-go-testing/bigquery/bqiface" 14 | "github.com/prometheus/client_golang/prometheus" 15 | "github.com/prometheus/client_golang/prometheus/promauto" 16 | "google.golang.org/api/googleapi" 17 | ) 18 | 19 | var ( 20 | queryBytesProcessMetric = promauto.NewGaugeVec(prometheus.GaugeOpts{ 21 | Name: 
"stats_pipeline_histograms_bytes_processed", 22 | Help: "Bytes processed by the histogram query", 23 | }, []string{ 24 | "table", 25 | }) 26 | ) 27 | 28 | const ( 29 | dateFormat = "2006-01-02" 30 | deleteRowsTpl = "DELETE FROM {{.Table}} WHERE {{.DateField}} BETWEEN \"{{.Start}}\" AND \"{{.End}}\"" 31 | ) 32 | 33 | const ( 34 | // TimePartitioning represents date-based partitioning. 35 | TimePartitioning = "date" 36 | 37 | // RangePartitioning represents range-based partitioning. 38 | RangePartitioning = "range" 39 | ) 40 | 41 | type QueryConfig struct { 42 | // Query is the SQL Query to run. 43 | Query string 44 | 45 | // DateField is the field to use to determine which rows must be deleted 46 | // on a table update. It can be the same as partitionField, or different. 47 | DateField string 48 | 49 | // PartitionField is the field to use for date or range partitioning. 50 | PartitionField string 51 | 52 | // PartitionType is the type of partitioning to use (date or range). 53 | PartitionType string 54 | } 55 | 56 | // Table represents a bigquery table containing histogram data. 57 | // It embeds bigquery.Table and extends it with an UpdateHistogram method. 58 | type Table struct { 59 | bqiface.Table 60 | 61 | // config is the configuration for the query generating this table. 62 | config QueryConfig 63 | 64 | // client is the bigquery client used to execute the query. 65 | client bqiface.Client 66 | } 67 | 68 | // NewTable returns a new Table with the specified destination table, query 69 | // and BQ client. 70 | func NewTable(name string, ds string, config QueryConfig, 71 | client bqiface.Client) *Table { 72 | return &Table{ 73 | Table: client.Dataset(ds).Table(name), 74 | config: config, 75 | client: client, 76 | } 77 | } 78 | 79 | func (t *Table) queryConfig(query string) bqiface.QueryConfig { 80 | qc := bqiface.QueryConfig{} 81 | qc.Q = query 82 | return qc 83 | } 84 | 85 | // deleteRows removes rows where dateField is within the provided range. 86 | func (t *Table) deleteRows(ctx context.Context, start, end time.Time) error { 87 | tpl := template.Must(template.New("query").Parse(deleteRowsTpl)) 88 | q := &bytes.Buffer{} 89 | err := tpl.Execute(q, map[string]string{ 90 | "Table": t.DatasetID() + "." + t.TableID(), 91 | "DateField": t.config.DateField, 92 | "Start": start.Format(dateFormat), 93 | "End": end.Format(dateFormat), 94 | }) 95 | if err != nil { 96 | return err 97 | } 98 | // Check that table exists. 99 | _, err = t.client.Dataset(t.DatasetID()).Table(t.TableID()).Metadata(ctx) 100 | if e, ok := err.(*googleapi.Error); ok && e.Code == http.StatusNotFound { 101 | // deleting rows from a table that does not exist is a no-op. So, return 102 | // without error. 103 | return nil 104 | } 105 | log.Printf("Deleting existing histogram rows: %s\n", q.String()) 106 | query := t.client.Query(q.String()) 107 | _, err = query.Read(ctx) 108 | if err != nil { 109 | log.Printf("Warning: cannot remove previous rows (%v)", err) 110 | } 111 | return err 112 | } 113 | 114 | // UpdateHistogram generates the histogram data for the specified time range. 115 | // If any data for this time range exists already, it's overwritten. 
116 | func (t *Table) UpdateHistogram(ctx context.Context, start, end time.Time) error { 117 | log.Printf("Updating table %s\n", t.TableID()) 118 | 119 | if t.config.DateField == "" || t.config.Query == "" { 120 | return errors.New("the Query and DateField must be specified") 121 | } 122 | 123 | // Make sure there aren't multiple histograms for this date range by 124 | // removing any previously inserted rows. 125 | err := t.deleteRows(ctx, start, end) 126 | if err != nil { 127 | return err 128 | } 129 | 130 | // Configure the histogram generation query. 131 | qc := t.queryConfig(t.config.Query) 132 | switch t.config.PartitionType { 133 | case RangePartitioning: 134 | qc.RangePartitioning = &bigquery.RangePartitioning{ 135 | Field: t.config.PartitionField, 136 | Range: &bigquery.RangePartitioningRange{ 137 | Start: 0, 138 | End: 3999, 139 | Interval: 1, 140 | }, 141 | } 142 | case TimePartitioning: 143 | qc.TimePartitioning = &bigquery.TimePartitioning{ 144 | Field: t.config.PartitionField, 145 | } 146 | default: 147 | // do nothing, since there is no need to partition the output. 148 | } 149 | 150 | qc.Dst = t.Table 151 | qc.WriteDisposition = bigquery.WriteAppend 152 | qc.Parameters = []bigquery.QueryParameter{ 153 | { 154 | Name: "startdate", 155 | Value: start.Format(dateFormat), 156 | }, 157 | { 158 | Name: "enddate", 159 | Value: end.Format(dateFormat), 160 | }, 161 | } 162 | query := t.client.Query(t.config.Query) 163 | query.SetQueryConfig(qc) 164 | 165 | // Run the histogram generation query. 166 | log.Printf("Generating histogram data for table %s\n", t.TableID()) 167 | bqJob, err := query.Run(ctx) 168 | if err != nil { 169 | return err 170 | } 171 | status, err := bqJob.Wait(ctx) 172 | if err != nil { 173 | return err 174 | } 175 | // Get bytes processed by the current query. 176 | queryBytesProcessMetric.WithLabelValues(t.Table.FullyQualifiedName()). 
177 | Add(float64(status.Statistics.TotalBytesProcessed)) 178 | if status.Err() != nil { 179 | return status.Err() 180 | } 181 | return nil 182 | } 183 | -------------------------------------------------------------------------------- /histogram/table_test.go: -------------------------------------------------------------------------------- 1 | package histogram 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "net/http" 7 | "reflect" 8 | "testing" 9 | "time" 10 | 11 | "cloud.google.com/go/bigquery" 12 | "github.com/googleapis/google-cloud-go-testing/bigquery/bqiface" 13 | "github.com/m-lab/go/prometheusx/promtest" 14 | "github.com/m-lab/go/rtx" 15 | "google.golang.org/api/googleapi" 16 | ) 17 | 18 | // ***** mockClient ***** 19 | type mockClient struct { 20 | bqiface.Client 21 | queryReadMustFail bool 22 | queryRunMustFail bool 23 | tableMissingErr bool 24 | queries []string 25 | } 26 | 27 | func (c *mockClient) Dataset(name string) bqiface.Dataset { 28 | return &mockDataset{ 29 | name: name, 30 | tableMissingErr: c.tableMissingErr, 31 | } 32 | } 33 | 34 | func (c *mockClient) Query(query string) bqiface.Query { 35 | return &mockQuery{ 36 | client: c, 37 | q: query, 38 | readMustFail: c.queryReadMustFail, 39 | runMustFail: c.queryRunMustFail, 40 | } 41 | } 42 | 43 | // ***** mockDataset ***** 44 | type mockDataset struct { 45 | bqiface.Dataset 46 | name string 47 | tableMissingErr bool 48 | } 49 | 50 | func (ds *mockDataset) Table(name string) bqiface.Table { 51 | return &mockTable{ 52 | ds: ds.name, 53 | name: name, 54 | tableMissingErr: ds.tableMissingErr, 55 | } 56 | } 57 | 58 | // ***** mockTable ***** 59 | type mockTable struct { 60 | bqiface.Table 61 | ds string 62 | name string 63 | tableMissingErr bool 64 | } 65 | 66 | func (t *mockTable) DatasetID() string { 67 | return t.ds 68 | } 69 | 70 | func (t *mockTable) TableID() string { 71 | return t.name 72 | } 73 | 74 | func (t *mockTable) FullyQualifiedName() string { 75 | return t.name 76 | } 77 | 78 | func (t *mockTable) Metadata(ctx context.Context) (*bigquery.TableMetadata, error) { 79 | if t.tableMissingErr { 80 | return nil, &googleapi.Error{ 81 | Code: http.StatusNotFound, 82 | } 83 | } 84 | return nil, nil 85 | } 86 | 87 | // ********** mockQuery ********** 88 | type mockQuery struct { 89 | bqiface.Query 90 | client *mockClient 91 | q string 92 | qc bqiface.QueryConfig 93 | readMustFail bool 94 | runMustFail bool 95 | } 96 | 97 | func (q *mockQuery) Run(context.Context) (bqiface.Job, error) { 98 | if q.runMustFail { 99 | return nil, errors.New("Run() failed") 100 | } 101 | // Store the query's content into the client so it can be checked later. 102 | q.client.queries = append(q.client.queries, q.q) 103 | return &mockJob{}, nil 104 | } 105 | 106 | func (q *mockQuery) Read(context.Context) (bqiface.RowIterator, error) { 107 | if q.readMustFail { 108 | return nil, errors.New("Read() failed") 109 | } 110 | // Store the query's content into the client so it can be checked later. 
111 | q.client.queries = append(q.client.queries, q.q) 112 | return &mockRowIterator{}, nil 113 | } 114 | 115 | func (q *mockQuery) SetQueryConfig(qc bqiface.QueryConfig) { 116 | q.qc = qc 117 | } 118 | 119 | // ***** mockRowIterator ***** 120 | type mockRowIterator struct { 121 | bqiface.RowIterator 122 | } 123 | 124 | // ***** mockJob ***** 125 | type mockJob struct { 126 | bqiface.Job 127 | waitMustFail bool 128 | } 129 | 130 | func (j *mockJob) Wait(context.Context) (*bigquery.JobStatus, error) { 131 | if j.waitMustFail { 132 | return nil, errors.New("Wait() failed") 133 | } 134 | return &bigquery.JobStatus{ 135 | State: bigquery.Done, 136 | Statistics: &bigquery.JobStatistics{ 137 | TotalBytesProcessed: 10, 138 | }, 139 | }, nil 140 | } 141 | 142 | // ***** Tests ***** 143 | func TestNewTable(t *testing.T) { 144 | table := NewTable("test_table", "dataset", QueryConfig{}, &mockClient{}) 145 | if table == nil { 146 | t.Errorf("NewTable() returned nil.") 147 | } 148 | } 149 | 150 | func TestTable_queryConfig(t *testing.T) { 151 | testQuery := "SELECT 1" 152 | table := NewTable("test", "dataset", QueryConfig{}, &mockClient{}) 153 | q := table.queryConfig(testQuery) 154 | if q.Q != testQuery { 155 | t.Errorf("queryConfig(): expected %s, got %s.", testQuery, q.Q) 156 | } 157 | } 158 | 159 | func TestTable_deleteRows(t *testing.T) { 160 | emptyConfig := QueryConfig{} 161 | table := NewTable("test", "dataset", emptyConfig, &mockClient{}) 162 | err := table.deleteRows(context.Background(), time.Now(), time.Now().Add(1*time.Minute)) 163 | if err != nil { 164 | t.Errorf("deleteRows() returned err: %v", err) 165 | } 166 | 167 | table = NewTable("test", "dataset", emptyConfig, &mockClient{ 168 | tableMissingErr: true, 169 | }) 170 | err = table.deleteRows(context.Background(), time.Now(), time.Now().Add(1*time.Minute)) 171 | if err != nil { 172 | t.Errorf("deleteRows() returned err: %v", err) 173 | } 174 | 175 | table = NewTable("test", "dataset", emptyConfig, &mockClient{ 176 | queryReadMustFail: true, 177 | }) 178 | err = table.deleteRows(context.Background(), time.Now(), time.Now().Add(1*time.Minute)) 179 | if err == nil { 180 | t.Errorf("deleteRows(): expected err, returned nil.") 181 | } 182 | } 183 | 184 | func TestTable_UpdateHistogram(t *testing.T) { 185 | start, err := time.Parse(dateFormat, "2020-01-01") 186 | rtx.Must(err, "cannot parse start time") 187 | end, err := time.Parse(dateFormat, "2020-12-31") 188 | rtx.Must(err, "cannot parse end time") 189 | tests := []struct { 190 | name string 191 | config QueryConfig 192 | client *mockClient 193 | want []string 194 | wantErr bool 195 | }{ 196 | { 197 | name: "ok", 198 | config: QueryConfig{ 199 | Query: "histogram generation query", 200 | DateField: "date", 201 | }, 202 | client: &mockClient{}, 203 | want: []string{ 204 | "DELETE FROM test_ds.test_table WHERE date BETWEEN \"2020-01-01\" AND \"2020-12-31\"", 205 | "histogram generation query", 206 | }, 207 | }, 208 | { 209 | name: "missing-date-field", 210 | config: QueryConfig{ 211 | Query: "", 212 | }, 213 | client: &mockClient{}, 214 | wantErr: true, 215 | }, 216 | { 217 | name: "missing-query-field", 218 | config: QueryConfig{ 219 | DateField: "date", 220 | }, 221 | client: &mockClient{}, 222 | wantErr: true, 223 | }, 224 | { 225 | name: "delete-rows-failure", 226 | config: QueryConfig{ 227 | Query: "test", 228 | }, 229 | client: &mockClient{ 230 | queryReadMustFail: true, 231 | }, 232 | wantErr: true, 233 | }, 234 | { 235 | name: "query-run-failure", 236 | config: QueryConfig{ 237 | 
Query: "test", 238 | }, 239 | client: &mockClient{ 240 | queryRunMustFail: true, 241 | }, 242 | wantErr: true, 243 | }, 244 | } 245 | for _, tt := range tests { 246 | t.Run(tt.name, func(t *testing.T) { 247 | hist := &Table{ 248 | Table: tt.client.Dataset("test_ds").Table("test_table"), 249 | config: tt.config, 250 | client: tt.client, 251 | } 252 | if err := hist.UpdateHistogram(context.Background(), start, 253 | end); (err != nil) != tt.wantErr { 254 | t.Errorf("Table.UpdateHistogram() error = %v, wantErr %v", err, tt.wantErr) 255 | } 256 | 257 | if mockClient, ok := hist.client.(*mockClient); ok { 258 | if tt.want != nil && !reflect.DeepEqual(mockClient.queries, tt.want) { 259 | t.Errorf("UpdateHistogram(): expected %v, got %v", tt.want, 260 | mockClient.queries) 261 | } 262 | } else { 263 | t.Fatalf("UpdateHistogram(): client isn't a mockClient.") 264 | } 265 | }) 266 | } 267 | } 268 | 269 | func TestPrometheusMetrics(t *testing.T) { 270 | queryBytesProcessMetric.WithLabelValues("x") 271 | 272 | promtest.LintMetrics(t) 273 | } 274 | -------------------------------------------------------------------------------- /k8s/data-pipeline/config/config-annotation-export.json: -------------------------------------------------------------------------------- 1 | { 2 | "tcpinfo": { 3 | "exportQueryFile": "/annotation/exports/tcpinfo_annotation_export.sql", 4 | "dataset": "base_tables", 5 | "table": "tcpinfo", 6 | "outputPath": "{{ .year_month_day }}/{{ .UUID }}.json" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /k8s/data-pipeline/config/config-hopannotation1-export.json: -------------------------------------------------------------------------------- 1 | { 2 | "traceroute": { 3 | "exportQueryFile": "/annotation/exports/traceroute_hopannotation1_export.sql", 4 | "dataset": "base_tables", 5 | "table": "traceroute", 6 | "outputPath": "{{ .Date }}/{{ .FilenameTimestamp }}_{{ .Hostname }}_{{ .IP }}.json" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /k8s/data-pipeline/config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "continents": { 3 | "histogramQueryFile": "statistics/queries/continent_histogram.sql", 4 | "exportQueryFile": "statistics/exports/continents.sql", 5 | "dataset": "statistics", 6 | "table": "continents", 7 | "dateField": "date", 8 | "partitionField": "shard", 9 | "partitionType": "range", 10 | "outputPath": "v0/{{ .continent_code }}/{{ .year }}/histogram_daily_stats.json" 11 | }, 12 | "countries": { 13 | "histogramQueryFile": "statistics/queries/continent_country_histogram.sql", 14 | "exportQueryFile": "statistics/exports/countries.sql", 15 | "dataset": "statistics", 16 | "table": "countries", 17 | "dateField": "date", 18 | "partitionField": "shard", 19 | "partitionType": "range", 20 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .year }}/histogram_daily_stats.json" 21 | }, 22 | "regions": { 23 | "histogramQueryFile": "statistics/queries/continent_country_region_histogram.sql", 24 | "exportQueryFile": "statistics/exports/regions.sql", 25 | "dataset": "statistics", 26 | "table": "regions", 27 | "dateField": "date", 28 | "partitionField": "shard", 29 | "partitionType": "range", 30 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/{{ .year }}/histogram_daily_stats.json" 31 | }, 32 | "cities": { 33 | "histogramQueryFile": 
"statistics/queries/continent_country_region_city_histogram.sql", 34 | "exportQueryFile": "statistics/exports/cities.sql", 35 | "dataset": "statistics", 36 | "table": "cities", 37 | "dateField": "date", 38 | "partitionField": "shard", 39 | "partitionType": "range", 40 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/{{ .city }}/{{ .year }}/histogram_daily_stats.json" 41 | }, 42 | "tracts": { 43 | "histogramQueryFile": "statistics/queries/us_census_tracts_histogram.sql", 44 | "exportQueryFile": "statistics/exports/us_tracts.sql", 45 | "dataset": "statistics", 46 | "table": "us_tracts", 47 | "dateField": "date", 48 | "partitionField": "shard", 49 | "partitionType": "range", 50 | "outputPath": "v0/NA/US/tracts/{{ .GEOID }}/{{ .year }}/histogram_daily_stats.json" 51 | }, 52 | "states": { 53 | "histogramQueryFile": "statistics/queries/us_state_territories_histogram.sql", 54 | "exportQueryFile": "statistics/exports/us_states.sql", 55 | "dataset": "statistics", 56 | "table": "us_states", 57 | "dateField": "date", 58 | "partitionField": "shard", 59 | "partitionType": "range", 60 | "outputPath": "v0/NA/US/states/{{ .GEOID }}/{{ .year }}/histogram_daily_stats.json" 61 | }, 62 | "counties": { 63 | "histogramQueryFile": "statistics/queries/us_county_histogram.sql", 64 | "exportQueryFile": "statistics/exports/us_counties.sql", 65 | "dataset": "statistics", 66 | "table": "us_counties", 67 | "dateField": "date", 68 | "partitionField": "shard", 69 | "partitionType": "range", 70 | "outputPath": "v0/NA/US/counties/{{ .GEOID }}/{{ .year }}/histogram_daily_stats.json" 71 | }, 72 | "continents_asn": { 73 | "histogramQueryFile": "statistics/queries/continent_asn_histogram.sql", 74 | "exportQueryFile": "statistics/exports/continents_asn.sql", 75 | "dataset": "statistics", 76 | "table": "continents_asn", 77 | "dateField": "date", 78 | "partitionField": "shard", 79 | "partitionType": "range", 80 | "outputPath": "v0/{{ .continent_code }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 81 | }, 82 | "countries_asn": { 83 | "histogramQueryFile": "statistics/queries/continent_country_asn_histogram.sql", 84 | "exportQueryFile": "statistics/exports/countries_asn.sql", 85 | "dataset": "statistics", 86 | "table": "countries_asn", 87 | "dateField": "date", 88 | "partitionField": "shard", 89 | "partitionType": "range", 90 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 91 | }, 92 | "regions_asn": { 93 | "histogramQueryFile": "statistics/queries/continent_country_region_asn_histogram.sql", 94 | "exportQueryFile": "statistics/exports/regions_asn.sql", 95 | "dataset": "statistics", 96 | "table": "regions_asn", 97 | "dateField": "date", 98 | "partitionField": "shard", 99 | "partitionType": "range", 100 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 101 | }, 102 | "cities_asn": { 103 | "histogramQueryFile": "statistics/queries/continent_country_region_city_asn_histogram.sql", 104 | "exportQueryFile": "statistics/exports/cities_asn.sql", 105 | "dataset": "statistics", 106 | "table": "cities_asn", 107 | "dateField": "date", 108 | "partitionField": "shard", 109 | "partitionType": "range", 110 | "outputPath": "v0/{{ .continent_code }}/{{ .country_code }}/{{ .ISO3166_2region1 }}/{{ .city }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 111 | }, 112 | "states_asn": { 113 | "histogramQueryFile": 
"statistics/queries/us_state_territories_asn_histogram.sql", 114 | "exportQueryFile": "statistics/exports/us_states_asn.sql", 115 | "dataset": "statistics", 116 | "table": "us_states_asn", 117 | "dateField": "date", 118 | "partitionField": "shard", 119 | "partitionType": "range", 120 | "outputPath": "v0/NA/US/states/{{ .GEOID }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 121 | }, 122 | "counties_asn": { 123 | "histogramQueryFile": "statistics/queries/us_county_asn_histogram.sql", 124 | "exportQueryFile": "statistics/exports/us_counties_asn.sql", 125 | "dataset": "statistics", 126 | "table": "us_counties_asn", 127 | "dateField": "date", 128 | "partitionField": "shard", 129 | "partitionType": "range", 130 | "outputPath": "v0/NA/US/counties/{{ .GEOID }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 131 | }, 132 | "tracts_asn": { 133 | "histogramQueryFile": "statistics/queries/us_census_tracts_asn_histogram.sql", 134 | "exportQueryFile": "statistics/exports/us_tracts_asn.sql", 135 | "dataset": "statistics", 136 | "table": "us_tracts_asn", 137 | "dateField": "date", 138 | "partitionField": "shard", 139 | "partitionType": "range", 140 | "outputPath": "v0/NA/US/tracts/{{ .GEOID }}/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 141 | }, 142 | "global_asn": { 143 | "histogramQueryFile": "statistics/queries/global_asn_histogram.sql", 144 | "exportQueryFile": "statistics/exports/global_asn.sql", 145 | "dataset": "statistics", 146 | "table": "global_asn", 147 | "dateField": "date", 148 | "partitionField": "shard", 149 | "partitionType": "range", 150 | "outputPath": "v0/asn/{{ .asn }}/{{ .year }}/histogram_daily_stats.json" 151 | }, 152 | "canary": { 153 | "histogramQueryFile": "statistics/queries/canary.sql", 154 | "dataset": "statistics", 155 | "table": "ndt_canary", 156 | "dateField": "test_date", 157 | "partitionField": "test_date", 158 | "partitionType": "date" 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /k8s/data-pipeline/deployments/hopannotation1-export-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: hopannotation1-export 5 | spec: 6 | strategy: 7 | type: Recreate 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | run: hopannotation1-export 12 | template: 13 | metadata: 14 | labels: 15 | run: hopannotation1-export 16 | annotations: 17 | prometheus.io/scrape: 'true' 18 | spec: 19 | containers: 20 | - name: stats-pipeline 21 | # The exact image to be deployed is replaced by gke-deploy, this is 22 | # a placeholder. 23 | image: gcr.io/{{GCLOUD_PROJECT}}/stats-pipeline 24 | args: 25 | # NOTE: in "local" output mode, and export "hopannotation1" mode, the 26 | # stats-pipeline will write results to subdirectories of the named 27 | # -bucket directory. 28 | - -prometheusx.listen-address=:9990 29 | - -exporter.query-workers=3 30 | - -config=/etc/hopannotation1-export/config-hopannotation1-export.json 31 | - -export=hopannotation1 32 | - -output=local 33 | - -bucket=/var/spool/ndt/hopannotation1 34 | - -project={{GCLOUD_PROJECT}} 35 | ports: 36 | # This is so Prometheus can be scraped. 37 | - name: prometheus-port 38 | containerPort: 9990 39 | - name: service-port 40 | containerPort: 8080 41 | livenessProbe: 42 | httpGet: 43 | path: /metrics 44 | port: prometheus-port 45 | # Note: This service runs on a dedicated 8-CPU node. 
46 | resources: 47 | requests: 48 | cpu: "5" 49 | memory: "2Gi" 50 | volumeMounts: 51 | - name: config-volume 52 | mountPath: /etc/hopannotation1-export 53 | - name: shared-export-dir 54 | mountPath: /var/spool/ndt 55 | - name: pusher 56 | image: measurementlab/pusher:v1.20 57 | ports: 58 | - name: pusher-port 59 | containerPort: 9991 60 | args: 61 | - -prometheusx.listen-address=:9991 62 | - -bucket=thirdparty-annotation-{{GCLOUD_PROJECT}} 63 | - -experiment=ndt 64 | - -datatype=hopannotation1 65 | - -directory=/var/spool/ndt 66 | - -node_name=third-party 67 | # The following thresholds create archive uploads more quickly than defaults. 68 | # NOTE: JSON files compress around 60x, so 3MB archives are about 180MB on disk. 69 | - -archive_size_threshold=2MB 70 | - -max_file_age=10m # After writing, No need to wait to upload a file (default 1h). 71 | - -archive_wait_time_min=5m # (default 30m0s) 72 | - -archive_wait_time_expected=10m # (default 1h0m0s) 73 | - -archive_wait_time_max=15m # (default 2h0m0s) 74 | - -sigterm_wait_time=60s 75 | - -metadata=MLAB.server.name=data-pipeline 76 | - -metadata=MLAB.experiment.name=ndt 77 | - -metadata=MLAB.pusher.image=measurementlab/pusher:v1.20 78 | - -metadata=MLAB.pusher.src.url=https://github.com/m-lab/pusher/tree/v1.20 79 | resources: 80 | requests: 81 | cpu: "1500m" 82 | volumeMounts: 83 | - name: shared-export-dir 84 | mountPath: /var/spool/ndt 85 | 86 | # Run a node-exporter as part of the pod so that it has access to the same 87 | # namespace and volumes. This allows simple disk usage monitoring of the 88 | # shared disk. 89 | - image: prom/node-exporter:v0.18.1 90 | name: node-exporter 91 | # Note: only enable the filesystem collector, and ignore system paths. 92 | args: ["--no-collector.arp", 93 | "--no-collector.bcache", 94 | "--no-collector.bonding", 95 | "--no-collector.conntrack", 96 | "--no-collector.cpu", 97 | "--no-collector.cpufreq", 98 | "--no-collector.diskstats", 99 | "--no-collector.edac", 100 | "--no-collector.entropy", 101 | "--no-collector.filefd", 102 | "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)", 103 | "--no-collector.hwmon", 104 | "--no-collector.infiniband", 105 | "--no-collector.ipvs", 106 | "--no-collector.loadavg", 107 | "--no-collector.mdadm", 108 | "--no-collector.meminfo", 109 | "--no-collector.netclass", 110 | "--no-collector.netdev", 111 | "--no-collector.netstat", 112 | "--no-collector.nfs", 113 | "--no-collector.nfsd", 114 | "--no-collector.pressure", 115 | "--no-collector.sockstat", 116 | "--no-collector.stat", 117 | "--no-collector.textfile", 118 | "--no-collector.time", 119 | "--no-collector.timex", 120 | "--no-collector.uname", 121 | "--no-collector.vmstat", 122 | "--no-collector.xfs", 123 | "--no-collector.zfs"] 124 | ports: 125 | - containerPort: 9100 126 | resources: 127 | requests: 128 | memory: "10Mi" 129 | cpu: "500m" 130 | limits: 131 | memory: "10Mi" 132 | cpu: "500m" 133 | volumeMounts: 134 | - name: shared-export-dir 135 | mountPath: /var-spool-ndt 136 | nodeSelector: 137 | statistics-node: 'true' 138 | volumes: 139 | - name: config-volume 140 | configMap: 141 | name: stats-pipeline-config 142 | - name: shared-export-dir 143 | emptyDir: 144 | # NOTE: allocates 50% of available RAM for tmpfs. 
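        # (A memory-backed emptyDir is tmpfs, so files written here count against the
        # pod's memory usage until the pusher container uploads them.)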
145 | medium: Memory 146 | -------------------------------------------------------------------------------- /k8s/data-pipeline/deployments/stats-pipeline.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: stats-pipeline 5 | spec: 6 | strategy: 7 | type: Recreate 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | run: stats-pipeline 12 | template: 13 | metadata: 14 | labels: 15 | run: stats-pipeline 16 | annotations: 17 | prometheus.io/scrape: 'true' 18 | spec: 19 | containers: 20 | - name: stats-pipeline 21 | # The exact image to be deployed is replaced by gke-deploy, this is 22 | # a placeholder. 23 | image: gcr.io/{{GCLOUD_PROJECT}}/stats-pipeline 24 | env: 25 | - name: PROJECT 26 | value: "{{GCLOUD_PROJECT}}" 27 | - name: CONFIG 28 | value: "/etc/stats-pipeline/config.json" 29 | - name: BUCKET 30 | value: "statistics-{{GCLOUD_PROJECT}}" 31 | ports: 32 | # This is so Prometheus can be scraped. 33 | - name: prometheus-port 34 | containerPort: 9990 35 | - name: service-port 36 | containerPort: 8080 37 | livenessProbe: 38 | httpGet: 39 | path: /metrics 40 | port: prometheus-port 41 | # Note: This service runs on a dedicated 8-CPU node. 42 | resources: 43 | limits: 44 | cpu: "8" 45 | memory: "30Gi" 46 | requests: 47 | cpu: "0.5" 48 | memory: "2Gi" 49 | volumeMounts: 50 | - name: config-volume 51 | mountPath: /etc/stats-pipeline 52 | nodeSelector: 53 | statistics-node: 'true' 54 | volumes: 55 | - name: config-volume 56 | configMap: 57 | name: stats-pipeline-config 58 | -------------------------------------------------------------------------------- /k8s/data-pipeline/jobs/hopannotation1-export-cronjob.template: -------------------------------------------------------------------------------- 1 | # cronjob.yaml 2 | apiVersion: batch/v1 3 | kind: CronJob 4 | metadata: 5 | name: hopannotation1-export-cronjob 6 | spec: 7 | schedule: "{{ANNOTATION_EXPORT_CRON_SCHEDULE}}" 8 | concurrencyPolicy: Forbid 9 | jobTemplate: 10 | spec: 11 | template: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: maptiles-runner 16 | # The exact image to be deployed is replaced by gke-deploy, this is 17 | # a placeholder 18 | image: gcr.io/{{GCLOUD_PROJECT}}/stats-pipeline-runner 19 | args: 20 | - /bin/bash 21 | - run-pipeline.sh 22 | - "hopannotation1-export-service:8080" 23 | -------------------------------------------------------------------------------- /k8s/data-pipeline/jobs/stats-pipeline-cronjob.yaml.template: -------------------------------------------------------------------------------- 1 | # cronjob.yaml 2 | apiVersion: batch/v1 3 | kind: CronJob 4 | metadata: 5 | name: stats-pipeline-cronjob 6 | spec: 7 | schedule: "{{PIPELINE_CRON_SCHEDULE}}" 8 | concurrencyPolicy: Forbid 9 | jobTemplate: 10 | spec: 11 | template: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: maptiles-runner 16 | # The exact image to be deployed is replaced by gke-deploy, this is 17 | # a placeholder 18 | image: gcr.io/{{GCLOUD_PROJECT}}/stats-pipeline-runner 19 | args: 20 | - /bin/bash 21 | - run-pipeline.sh 22 | - "stats-pipeline-service:8080" 23 | -------------------------------------------------------------------------------- /k8s/data-pipeline/services/hopannotation1-export.yaml: -------------------------------------------------------------------------------- 1 | # A ClusterIP service to make stats-pipeline available to other pods. 
2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: hopannotation1-export-service 6 | spec: 7 | type: ClusterIP 8 | selector: 9 | run: hopannotation1-export 10 | ports: 11 | - protocol: TCP 12 | port: 8080 13 | -------------------------------------------------------------------------------- /k8s/data-pipeline/services/stats-pipeline.yaml: -------------------------------------------------------------------------------- 1 | # A ClusterIP service to make stats-pipeline available to other pods. 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: stats-pipeline-service 6 | spec: 7 | type: ClusterIP 8 | selector: 9 | run: stats-pipeline 10 | ports: 11 | - protocol: TCP 12 | port: 8080 13 | -------------------------------------------------------------------------------- /maptiles/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile builds a minimal container with nodejs and the dependencies 2 | # to run the maptiles generation. 3 | 4 | # Build tippecanoe since it isn't included in the Alpine repo. 5 | FROM alpine:3.12 AS build-tippecanoe 6 | 7 | ARG TIPPECANOE_RELEASE="1.30.0" 8 | 9 | RUN mkdir -p /tmp/tippecanoe-src 10 | WORKDIR /tmp/tippecanoe-src 11 | 12 | RUN apk add --no-cache git make g++ libgcc libstdc++ sqlite-libs sqlite-dev zlib-dev bash 13 | 14 | RUN git clone https://github.com/mapbox/tippecanoe.git tippecanoe \ 15 | && cd tippecanoe \ 16 | && git checkout tags/$TIPPECANOE_RELEASE \ 17 | && make -j$(nproc) \ 18 | && make install 19 | 20 | # The actual container is based on the officiale cloud-sdk:alpine image. 21 | FROM gcr.io/google.com/cloudsdktool/cloud-sdk:324.0.0-alpine 22 | 23 | # Copy the only binary we need from the tippecanoe container. 24 | COPY --from=build-tippecanoe /usr/local/bin/tippecanoe /usr/local/bin/tippecanoe 25 | COPY --from=build-tippecanoe /usr/local/bin/tile-join /usr/local/bin/tile-join 26 | 27 | # Make sure we have make, nodejs/npm and tippecanoe's dependencies. 28 | RUN apk add --no-cache libstdc++ sqlite-libs make nodejs npm ca-certificates 29 | 30 | COPY ./ /home/node 31 | WORKDIR /home/node 32 | 33 | # These modules are required by the maptiles generation scripts. 34 | RUN npm install -g mapshaper 35 | RUN npm install 36 | -------------------------------------------------------------------------------- /maptiles/Makefile: -------------------------------------------------------------------------------- 1 | # Check that the GCS_BUCKET variable is set. 2 | check-env: 3 | ifndef GCS_BUCKET 4 | $(error Please set the GCS_BUCKET environment variable.) 5 | endif 6 | 7 | # download the block level shapefiles from the census 8 | # they serve them as a single zip per state so we have 9 | # to download them, unzip them, & convert them to geojson 10 | geographies/blocks/.downloaded: 11 | mkdir -p geographies/blocks 12 | node scripts/download-blocks.js 13 | ./scripts/unzip-blocks.sh 14 | mapshaper geographies/blocks/*.shp -o geographies/blocks/ format=geojson 15 | 16 | # clean up a bunch of the intermediary files because they are big! 
17 | rm -f geographies/blocks/*.cpg 18 | rm -f geographies/blocks/*.dbf 19 | rm -f geographies/blocks/*.prj 20 | rm -f geographies/blocks/*.shp 21 | rm -f geographies/blocks/*.shx 22 | rm -f geographies/blocks/*.xml 23 | rm -f geographies/blocks/*.zip 24 | 25 | touch geographies/blocks/.downloaded 26 | 27 | # merge the block level geojson files into tract level files 28 | geographies/tracts: geographies/blocks/.downloaded 29 | mkdir -p geographies/tracts 30 | mapshaper -i geographies/blocks/tl*.json -dissolve TRACTCE10 copy-fields STATEFP10,COUNTYFP10 -o geographies/tracts/ format=geojson 31 | 32 | # and merge all of the tract level shapefiles into one national file 33 | geographies/tracts-geo.json: geographies/tracts 34 | mapshaper -i geographies/tracts/*.json merge-files -each 'fips = STATEFP10 + "" + COUNTYFP10 + TRACTCE10' -o geographies/tracts-geo.json format=geojson 35 | 36 | geographies/states-geo.json: check-env 37 | mkdir -p geographies 38 | gsutil -m cp gs://${GCS_BUCKET}/geographies/states-geo.json geographies/ 39 | 40 | geographies/counties-geo.json: check-env 41 | mkdir -p geographies 42 | gsutil -m cp gs://${GCS_BUCKET}/geographies/counties-geo.json geographies/ 43 | 44 | census/county-demographics-acs-5.csv: check-env 45 | mkdir -p census 46 | gsutil -m cp gs://${GCS_BUCKET}/census/county-demographics-acs-5.csv census/ 47 | 48 | census/county-internet-acs-5.csv: check-env 49 | mkdir -p census 50 | gsutil -m cp gs://${GCS_BUCKET}/census/county-internet-acs-5.csv census/ 51 | 52 | census/tract-demographics-acs-5.csv: check-env 53 | mkdir -p census 54 | gsutil -m cp gs://${GCS_BUCKET}/census/tract-demographics-acs-5.csv census/ 55 | 56 | census/tract-internet-acs-5.csv: check-env 57 | mkdir -p census 58 | gsutil -m cp gs://${GCS_BUCKET}/census/tract-internet-acs-5.csv census/ 59 | 60 | # dowload fcc 477 sqlite database from GCS 61 | fcc-477.sqlite: check-env 62 | gsutil -m cp gs://${GCS_BUCKET}/fcc/202006/fcc-477.sqlite fcc-477.sqlite 63 | 64 | # summarize 477 data by block 65 | fcc-block.json: fcc-477.sqlite 66 | node scripts/process-fcc.js block 67 | 68 | # summarize 477 data by county 69 | fcc-county.json: fcc-477.sqlite 70 | node scripts/process-fcc.js county 71 | 72 | # summarize 477 data by tract 73 | fcc-tract.json: fcc-477.sqlite 74 | node scripts/process-fcc.js tract 75 | 76 | # download mlab statistics from gcs 77 | mlab/counties/.downloaded: 78 | mkdir -p mlab/counties 79 | node scripts/download-mlab.js counties 80 | touch mlab/counties/.downloaded 81 | 82 | mlab/tracts/.downloaded: 83 | mkdir -p mlab/tracts 84 | node scripts/download-mlab.js tracts 85 | touch mlab/tracts/.downloaded 86 | 87 | # and then summarize the mlab statistics 88 | mlab-counties.json: mlab/counties/.downloaded 89 | node scripts/process-mlab.js counties 90 | 91 | mlab-tracts.json: mlab/tracts/.downloaded 92 | node scripts/process-mlab.js tracts 93 | 94 | # merge county data with census demographic information 95 | counties-geo-with-demographics.json: geographies/counties-geo.json census/county-demographics-acs-5.csv 96 | mapshaper geographies/counties-geo.json \ 97 | -join census/county-demographics-acs-5.csv keys=fips,geo_id field-types=fips:str,geo_id:str \ 98 | -each 'white_pct=white_pop / total_pop' \ 99 | -each 'black_pct=black_pop / total_pop' \ 100 | -each 'asian_pct=asian_pop / total_pop' \ 101 | -each 'hispanic_pct=hispanic_pop / total_pop' \ 102 | -each 'amerindian_pct=amerindian_pop / total_pop' \ 103 | -o format=geojson \ 104 | counties-geo-with-demographics.json 105 | 106 | # merge 
the above county file with acs internet info 107 | counties-geo-with-census.json: counties-geo-with-demographics.json census/county-internet-acs-5.csv 108 | mapshaper counties-geo-with-demographics.json \ 109 | -join census/county-internet-acs-5.csv keys=fips,fips field-types=fips:str,fips:str \ 110 | -o format=geojson \ 111 | counties-geo-with-census.json 112 | 113 | # merge the above county file with fcc 477 data 114 | counties-geo-with-census-fcc.json: counties-geo-with-census.json fcc-county.json 115 | mapshaper counties-geo-with-census.json \ 116 | -join fcc-county.json keys=fips,geo_id field-types=fips:str,geo_id:str \ 117 | -o format=geojson \ 118 | counties-geo-with-census-fcc.json 119 | 120 | # and finally, merge in county level mlab data. this file will 121 | # have census demographics, acs internet info, fcc 477 data, 122 | # and mlab statistics 123 | counties-geo-with-census-fcc-mlab.json: counties-geo-with-census-fcc.json mlab-counties.json 124 | mapshaper counties-geo-with-census-fcc.json \ 125 | -join mlab-counties.json keys=fips,geo_id field-types=fips:str,geo_id:str \ 126 | -o format=geojson \ 127 | counties-geo-with-census-fcc-mlab.json 128 | 129 | # and do the same thing for tracts 130 | 131 | # merge tract data with census demographic information 132 | tracts-geo-with-demographics.json: geographies/tracts-geo.json census/tract-demographics-acs-5.csv 133 | mapshaper geographies/tracts-geo.json \ 134 | -join census/tract-demographics-acs-5.csv keys=fips,geo_id field-types=fips:str,geo_id:str \ 135 | -each 'white_pct=white_pop / total_pop' \ 136 | -each 'black_pct=black_pop / total_pop' \ 137 | -each 'asian_pct=asian_pop / total_pop' \ 138 | -each 'hispanic_pct=hispanic_pop / total_pop' \ 139 | -each 'amerindian_pct=amerindian_pop / total_pop' \ 140 | -o format=geojson \ 141 | tracts-geo-with-demographics.json 142 | 143 | tracts-geo-with-census.json: tracts-geo-with-demographics.json census/tract-internet-acs-5.csv 144 | mapshaper tracts-geo-with-demographics.json \ 145 | -join census/tract-internet-acs-5.csv keys=fips,fips field-types=fips:str,fips:str \ 146 | -o format=geojson \ 147 | tracts-geo-with-census.json 148 | 149 | # and now with FCC 150 | tracts-geo-with-fcc.json: tracts-geo-with-census.json fcc-tract.json 151 | mapshaper tracts-geo-with-census.json \ 152 | -join fcc-tract.json keys=fips,geo_id field-types=fips:str,geo_id:str \ 153 | -o format=geojson \ 154 | tracts-geo-with-fcc.json 155 | 156 | tracts-geo-with-fcc-mlab.json: tracts-geo-with-fcc.json mlab-tracts.json 157 | mapshaper tracts-geo-with-fcc.json \ 158 | -join mlab-tracts.json keys=fips,geo_id field-types=fips:str,geo_id:str \ 159 | -o format=geojson \ 160 | tracts-geo-with-fcc-mlab.json 161 | 162 | # blocks.mbtiles: geographies/blocks/.downloaded 163 | # ./scripts/combine-blocks.sh 164 | 165 | counties.mbtiles: counties-geo-with-census-fcc-mlab.json 166 | tippecanoe -z12 -f -o counties.mbtiles -l counties counties-geo-with-census-fcc-mlab.json 167 | 168 | states.mbtiles: geographies/states-geo.json 169 | tippecanoe -z12 -f -o states.mbtiles -l states geographies/states-geo.json 170 | 171 | tracts.mbtiles: tracts-geo-with-fcc-mlab.json 172 | tippecanoe -z12 -f -o tracts.mbtiles -l tracts --no-tile-size-limit -x STATEFP10 -x COUNTYFP10 -x TRACTCE10 tracts-geo-with-fcc-mlab.json 173 | 174 | piecewise: check-env counties.mbtiles states.mbtiles tracts.mbtiles 175 | tile-join -pk -pC -z10 -f -e piecewise counties.mbtiles states.mbtiles tracts.mbtiles 176 | gsutil -m cp -r piecewise gs://${GCS_BUCKET} 177 
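# Usage sketch (not a make target): with the tools listed in maptiles/README.md
# installed and GCS_BUCKET exported, the whole chain above runs end to end via
# the `piecewise` target, e.g.
#
#   export GCS_BUCKET=<bucket-name>   # placeholder; use a bucket you can write to
#   make piecewise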
| -------------------------------------------------------------------------------- /maptiles/README.md: -------------------------------------------------------------------------------- 1 | # Vectortiles 2 | 3 | This pipeline generates a vector tileset for the map tab on the admin interface. It pulls data from the Census, the FCC, and Measurement Lab, creates the tileset, and uploads it to Google Cloud Storage. It runs to completion with `make piecewise`. 4 | 5 | 6 | ## Required tools 7 | In addition to the things you normally need to run the `piecewise` project, you'll need the following things installed on your `PATH`. Installation instructions are in parenthesis. 8 | 9 | 1. `mapshaper` (`npm install -g mapshaper`) 10 | 1. `tippecanoe` (`brew install tippecanoe` or [other os instructions](https://github.com/mapbox/tippecanoe#installation)) 11 | 12 | It is recommended to use the same version of Node that `piecewise` uses. Some of this workflow's dependencies are installed through the top level `package.json`. 13 | 14 | ## Generated tileset 15 | 16 | The `piecewise` tileset has three layers: 17 | 18 | 1. `states` 19 | 1. `counties` 20 | 1. `tracts` 21 | 22 | Features in the `states` layer have the following properties: 23 | 24 | * `fips` (Source: Census) 25 | * `name` (Source: Census) 26 | 27 | Features in both the `counties` and `tracts` layers have the following properties: 28 | 29 | * `amerindian_pct` (Source: Census) 30 | * `amerindian_pop` (Source: Census) 31 | * `asian_pct` (Source: Census) 32 | * `asian_pop` (Source: Census) 33 | * `black_pct` (Source: Census) 34 | * `black_pop` (Source: Census) 35 | * `fips` (Source: Census) 36 | * `hispanic_pct` (Source: Census) 37 | * `hispanic_pop` (Source: Census) 38 | * `households_with_broadband_moe` (Source: Census) 39 | * `households_with_broadband_pct` (Source: Census) 40 | * `households_without_internet_moe` (Source: Census) 41 | * `households_without_internet_pct` (Source: Census) 42 | * `median_income` (Source: Census) 43 | * `name` (Source: Census) 44 | * `total_pop` (Source: Census) 45 | * `white_pct` (Source: Census) 46 | * `white_pop` (Source: Census) 47 | * `mean_max_ad_down` (Source: FCC) 48 | * `mean_max_ad_up` (Source: FCC) 49 | * `provider_count` (Source: FCC) 50 | * `source_rows` (Source: FCC) 51 | * `2020_jan_jun_median_dl` (Source: Measurement Lab) 52 | * `2020_july_dec_median_dl` (Source: Measurement Lab) 53 | * `2020_jan_jun_median_ul` (Source: Measurement Lab) 54 | * `2020_july_dec_median_ul` (Source: Measurement Lab) 55 | * `2020_jan_jun_percent_over_audio_threshold` (Source: Measurement Lab) 56 | * `2020_july_dec_percent_over_audio_threshold` (Source: Measurement Lab) 57 | * `2020_jan_jun_percent_over_video_threshold` (Source: Measurement Lab) 58 | * `2020_july_dec_percent_over_video_threshold` (Source: Measurement Lab) 59 | * `2020_jan_jun_total_dl_samples` (Source: Measurement Lab) 60 | * `2020_july_dec_total_dl_samples` (Source: Measurement Lab) 61 | * `2020_jan_jun_total_ul_samples` (Source: Measurement Lab) 62 | * `2020_july_dec_total_ul_samples` (Source: Measurement Lab) -------------------------------------------------------------------------------- /maptiles/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "maptiles", 3 | "version": "1.0.0", 4 | "description": "This pipeline generates a vector tileset for the map tab on the admin interface. 
It pulls data from the Census, the FCC, and Measurement Lab, creates the tileset, and uploads it to Google Cloud Storage. It runs to completion with `make piecewise`.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "csv": "^5.3.2", 13 | "d3": "^6.3.1", 14 | "d3-collection": "^1.0.7", 15 | "p-queue": "^6.6.2", 16 | "sqlite": "^4.0.14", 17 | "sqlite3": "5.0.0", 18 | "wget-improved": "^3.2.1" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /maptiles/run-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # run-pipeline.sh starts stats-pipeline for the current year and then 4 | # generates updated maptiles. 5 | 6 | set -euxo pipefail 7 | ENDPOINT=${1?"Please provide the endpoint (hostname + port). Usage: $0 "} 8 | 9 | # Start the pipeline for the past 2 days. 10 | start=$(date -d "@$(( $(date +%s) - 86400 * 2 ))" +%Y-%m-%d) 11 | end=$(date +%Y-%m-%d) 12 | 13 | if ! curl -X POST "http://$ENDPOINT/v0/pipeline?start=${start}&end=${end}&step=all"; then 14 | echo "Running the pipeline failed, please check the container logs." 15 | exit 1 16 | fi 17 | 18 | echo "The pipeline completed successfully" 19 | # Note: this is disabled until the maptiles generation can run on multiple 20 | # years. Currently, 2020 is hardcoded and it would be pointless to regenerate 21 | # the maptiles every time the stats-pipeline runs. 22 | #export GCS_BUCKET=maptiles-${PROJECT} 23 | #make piecewise 24 | -------------------------------------------------------------------------------- /maptiles/scripts/combine-blocks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # mapshaper blocks/**/*.shp -o blocks/ format=geojson 4 | # GEOJSON=$(ls blocks/tl_2019_06_tabblock10.json) 5 | # for FILE in $GEOJSON; 6 | # do 7 | # BASENAME=$(basename $FILE .json) 8 | # tippecanoe --force -l blocks -o blocks/$BASENAME.mbtiles $FILE 9 | # done 10 | 11 | # MBTILES=$(ls blocks/*.mbtiles) 12 | MBTILES="geographies/blocks/tl_2019_06_tabblock10.mbtiles" 13 | tile-join --no-tile-size-limit -o blocks.mbtiles $MBTILES -------------------------------------------------------------------------------- /maptiles/scripts/download-blocks.js: -------------------------------------------------------------------------------- 1 | const { default: Queue } = require('p-queue'); 2 | const wget = require('wget-improved'); 3 | 4 | const queue = new Queue({ concurrency: 2 }); 5 | 6 | function downloadBlocksForFips(fips) { 7 | const url = `https://www2.census.gov/geo/tiger/TIGER2019/TABBLOCK/tl_2019_${fips}_tabblock10.zip`; 8 | const dest = `./geographies/blocks/${fips}_blocks.zip`; 9 | 10 | return new Promise((resolve, reject) => { 11 | const download = wget.download(url, dest); 12 | 13 | download.on('error', function(err) { 14 | console.error(`Error downloading ${fips}:`, err); 15 | console.error(`\t${url}`); 16 | reject(err); 17 | }); 18 | 19 | download.on('end', function() { 20 | console.log(`Downloaded ${fips}`); 21 | resolve(); 22 | }); 23 | }); 24 | } 25 | 26 | function addToQueue(fips) { 27 | const nonFips = ['03', '07', '14', '43', '52']; 28 | return async () => { 29 | if (nonFips.includes(fips)) { 30 | console.log(`Skipping ${fips} because it isn't a US state`); 31 | return; 32 | } 33 | 34 | try { 35 | console.log(`Downloading ${fips}...`); 36 | await 
downloadBlocksForFips(fips); 37 | } catch { 38 | console.log(`Error with ${fips}, adding back to queue`); 39 | queue.add(addToQueue(fips)); 40 | } 41 | }; 42 | } 43 | 44 | for (let i = 1; i < 57; i++) { 45 | const fips = i < 10 ? `0${i}` : `${i}`; 46 | queue.add(addToQueue(fips)); 47 | } 48 | 49 | queue.onIdle().then(() => { 50 | console.log(`All done downloading shapefiles`); 51 | return; 52 | }); 53 | -------------------------------------------------------------------------------- /maptiles/scripts/download-mlab.js: -------------------------------------------------------------------------------- 1 | const { exec: execWithCallback } = require('child_process'); 2 | 3 | const { default: Queue } = require('p-queue'); 4 | 5 | function exec(args) { 6 | return new Promise((resolve, reject) => { 7 | execWithCallback(args, { maxBuffer: 2000 * 1024 }, (err, data) => { 8 | if (err) return reject(err); 9 | resolve(data); 10 | }); 11 | }); 12 | } 13 | 14 | const args = process.argv; 15 | const geographicLevel = args[2] || 'counties'; // can also be "tracts" 16 | const rootGsUrl = `gs://statistics-mlab-sandbox/v0/NA/US/${geographicLevel}`; 17 | 18 | let fileCount = 0; 19 | 20 | async function download(fips, i) { 21 | const cmd = `gsutil cp ${rootGsUrl}/${fips}/2020/histogram_daily_stats.json mlab/${geographicLevel}/${fips}.json`; 22 | try { 23 | console.log(`Downloading JSON file (FIPS: ${fips}, ${i} / ${fileCount})`); 24 | await exec(cmd); 25 | } catch (e) { 26 | console.error(`Error with ${fips}\n\t`, e); 27 | } 28 | } 29 | 30 | async function main() { 31 | console.log(`Finding MLab data files to download for ${geographicLevel}`); 32 | const filesList = await exec(`gsutil ls ${rootGsUrl}/`); 33 | const files = filesList.split('\n'); 34 | fileCount = files.length; 35 | 36 | console.log(`Found ${fileCount} files to download, starting now`); 37 | 38 | const queue = new Queue({ concurrency: 16 }); 39 | const fipsLength = geographicLevel === 'counties' ? 
'5' : '11'; 40 | const r = new RegExp(`${geographicLevel}\\/(\\d{${fipsLength}})\\/$`); 41 | 42 | files.forEach((file, fileIndex) => { 43 | const match = file.match(r); 44 | if (!match) return; 45 | 46 | const fips = match[1]; 47 | queue.add(async () => { 48 | await download(fips, fileIndex); 49 | }); 50 | }); 51 | 52 | await queue.onIdle(); 53 | 54 | console.log(`Downloaded ${files.length} JSON files`); 55 | } 56 | 57 | main(); 58 | -------------------------------------------------------------------------------- /maptiles/scripts/load-fcc-477-to-db.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | 3 | const { parse, transform } = require('csv'); 4 | const sqlite = require('sqlite3'); 5 | const { open } = require('sqlite'); 6 | 7 | const args = process.argv; 8 | const inputFileArg = args[2]; 9 | const dbFileName = `./fcc-477.sqlite`; 10 | 11 | (async () => { 12 | const db = await open({ 13 | filename: dbFileName, 14 | driver: sqlite.Database, 15 | }); 16 | await db.run( 17 | `CREATE TABLE rows(block_fips TEXT,tract_fips TEXT,county_fips TEXT,max_ad_down,max_ad_up,provider_id);`, 18 | ); 19 | 20 | // Optimizations 21 | await db.run( 22 | `PRAGMA synchronous = OFF`, 23 | ); 24 | 25 | await db.run( 26 | `BEGIN TRANSACTION`, 27 | ); 28 | 29 | let insertStmt = await db.prepare("INSERT INTO rows (block_fips,tract_fips,county_fips,max_ad_down,max_ad_up,provider_id) VALUES (?,?,?,?,?,?)"); 30 | let rowCount = 0; 31 | 32 | async function processRow(row, cb) { 33 | const consumer = row['Consumer']; 34 | const providerId = row['Provider_Id']; 35 | const blockCode = row['BlockCode']; 36 | const maxAdDown = row['MaxAdDown']; 37 | const maxAdUp = row['MaxAdUp']; 38 | if (consumer !== '1') return cb(null); 39 | const tractFips = blockCode.slice(0, 11); 40 | const countyFips = blockCode.slice(0, 5); 41 | 42 | try { 43 | await insertStmt.run(blockCode, tractFips, countyFips, maxAdDown, maxAdUp, providerId); 44 | rowCount += 1; 45 | if (rowCount % 100000 === 0) { 46 | console.log(`Stored ${rowCount} rows in the database`); 47 | } 48 | cb(null, blockCode); 49 | } catch (err) { 50 | console.log({ row }); 51 | cb(err); 52 | } 53 | } 54 | 55 | console.log(`Loading ${inputFileArg} into ${dbFileName}`); 56 | 57 | const input = fs.createReadStream(inputFileArg); 58 | const parser = parse({ columns: true }); 59 | const transformer = transform(processRow); 60 | 61 | transformer.on('end', async () => { 62 | console.log(`Added ${rowCount} rows, creating indexes now`); 63 | await db.run(`CREATE INDEX block_fips_index ON rows (block_fips);`); 64 | await db.run(`CREATE INDEX county_fips_index ON rows (county_fips);`); 65 | await db.run(`CREATE INDEX tract_fips_index ON rows (tract_fips);`); 66 | await db.run( 67 | `END TRANSACTION`, 68 | ); 69 | insertStmt.finalize(); 70 | console.log(`Generated ${dbFileName}`); 71 | }); 72 | 73 | input.pipe(parser).pipe(transformer); 74 | transformer.resume(); 75 | })(); 76 | -------------------------------------------------------------------------------- /maptiles/scripts/process-fcc.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | 3 | const sqlite = require('sqlite3'); 4 | const { open } = require('sqlite'); 5 | const { default: Queue } = require('p-queue'); 6 | 7 | const args = process.argv; 8 | const geographyType = args[2] || 'county'; // can also be "tract" 9 | 10 | let count = 0; 11 | const geographyColumn = `${geographyType}_fips`; 12 | let 
fipsLength = geographyType === 'county' ? 5 : 11; 13 | let isFirst = true; 14 | const outputFile = `fcc-${geographyType}.json`; 15 | const queue = new Queue({ concurrency: 50 }); 16 | const writeStream = fs.createWriteStream(outputFile); 17 | 18 | function mean(arr) { 19 | if (arr.length === 0) return 0; 20 | 21 | const sum = arr.reduce((accum, next) => { 22 | return accum + next; 23 | }, 0); 24 | return sum / arr.length; 25 | } 26 | 27 | async function processId(id, db) { 28 | count += 1; 29 | const query = `SELECT * FROM rows WHERE ${geographyColumn} = "${id}";`; 30 | 31 | const downs = []; 32 | const ups = []; 33 | const providers = new Set(); 34 | const rows = await db.all(query); 35 | console.log(`${rows.length} rows to analyze for ${id} (#${count})`); 36 | 37 | rows.forEach(r => { 38 | const { max_ad_down, max_ad_up, provider_id } = r; 39 | 40 | downs.push(+max_ad_down); 41 | ups.push(+max_ad_up); 42 | providers.add(provider_id); 43 | }); 44 | 45 | const delim = isFirst ? '' : ','; 46 | if (isFirst) isFirst = false; 47 | 48 | const json = JSON.stringify({ 49 | geo_id: `${id}`.padStart(fipsLength, '0'), // block code for blocks and fips for counties 50 | provider_count: providers.size, 51 | mean_max_ad_down: mean(downs), 52 | mean_max_ad_up: mean(ups), 53 | source_rows: rows.length, 54 | }); 55 | 56 | writeStream.write(`${delim}\n${json}`); 57 | return; 58 | } 59 | 60 | (async () => { 61 | const db = await open({ 62 | filename: `./fcc-477.sqlite`, 63 | driver: sqlite.Database, 64 | mode: sqlite.OPEN_READONLY, 65 | }); 66 | 67 | sqlite.verbose(); 68 | 69 | const ids = await db.all(`SELECT DISTINCT ${geographyColumn} FROM rows;`); 70 | 71 | writeStream.write('['); 72 | console.log(`Found ${ids.length} unique values for ${geographyColumn}`); 73 | 74 | await db.each(`SELECT DISTINCT ${geographyColumn} FROM rows;`, (err, row) => { 75 | if (err) { 76 | console.error(err); 77 | return; 78 | } 79 | 80 | const id = row[geographyColumn]; 81 | queue.add(() => processId(id, db)); 82 | }); 83 | 84 | await queue.onIdle(); 85 | writeStream.write(']'); 86 | })(); 87 | -------------------------------------------------------------------------------- /maptiles/scripts/process-mlab.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | 3 | const { median } = require('d3'); 4 | const { nest } = require('d3-collection') 5 | const glob = require('glob'); 6 | const { default: Queue } = require('p-queue'); 7 | 8 | const args = process.argv; 9 | const geographicLevel = args[2] || 'counties'; // can also be "tracts" 10 | 11 | glob(`mlab/${geographicLevel}/*.json`, async (err, files) => { 12 | let isFirst = true; 13 | const output = fs.createWriteStream(`mlab-${geographicLevel}.json`); 14 | const queue = new Queue({ concurrency: 4 }); 15 | 16 | async function processFile(filePath) { 17 | let delimiter = ','; 18 | const fipsMatch = filePath.match(/(\d+)\.json$/); 19 | const fips = fipsMatch ? fipsMatch[1] : null; 20 | const f = await fs.promises.readFile(filePath); 21 | const j = JSON.parse(f.toString()); 22 | 23 | const grouped = nest() 24 | .key(d => { 25 | const { date } = d; 26 | const [year, month] = date.split('-'); 27 | const halfOfTheYear = ['01', '02', '03', '04', '05', '06'].includes( 28 | month, 29 | ) 30 | ? 
'jan_jun' 31 | : 'july_dec'; 32 | 33 | return `${year}_${halfOfTheYear}`; 34 | }) 35 | .entries(j); 36 | 37 | // currently only looks at download speeds 38 | const analyzed = grouped.map(grouped => { 39 | const { key, values } = grouped; 40 | 41 | const dates = new Set(); 42 | const dlMedians = []; 43 | const ulMedians = []; 44 | let dlSamplesOverAudio = 0; 45 | let dlSamplesOverVideo = 0; 46 | let totalDlSamples = 0; 47 | let totalUlSamples = 0; 48 | 49 | values.forEach(v => { 50 | const { 51 | bucket_min, 52 | date, 53 | dl_samples_bucket, 54 | download_MED, 55 | ul_samples_bucket, 56 | upload_MED, 57 | } = v; 58 | 59 | totalDlSamples += dl_samples_bucket; 60 | totalUlSamples += ul_samples_bucket; 61 | 62 | if (bucket_min > 2.5) { 63 | dlSamplesOverAudio += dl_samples_bucket; 64 | } 65 | 66 | if (bucket_min > 10) { 67 | dlSamplesOverVideo += dl_samples_bucket; 68 | } 69 | 70 | if (dates.has(date)) return; 71 | 72 | dlMedians.push(download_MED); 73 | ulMedians.push(upload_MED); 74 | dates.add(date); 75 | }); 76 | 77 | return { 78 | [`${key}_median_dl`]: median(dlMedians), 79 | [`${key}_median_ul`]: median(ulMedians), 80 | [`${key}_total_dl_samples`]: totalDlSamples, 81 | [`${key}_total_ul_samples`]: totalUlSamples, 82 | [`${key}_percent_over_audio_threshold`]: 83 | dlSamplesOverAudio / totalDlSamples, 84 | [`${key}_percent_over_video_threshold`]: 85 | dlSamplesOverVideo / totalDlSamples, 86 | }; 87 | }); 88 | 89 | const d = { geo_id: fips }; 90 | analyzed.forEach(a => { 91 | Object.assign(d, a); 92 | }); 93 | 94 | if (isFirst) delimiter = ''; 95 | isFirst = false; 96 | 97 | output.write(`${delimiter}\n${JSON.stringify(d)}`); 98 | console.log(`Processed ${filePath}`); 99 | } 100 | 101 | output.write('['); 102 | files.forEach(file => { 103 | queue.add(() => processFile(file)); 104 | }); 105 | 106 | await queue.onIdle(); 107 | output.write(']'); 108 | 109 | console.log(`Processed data from ${files.length} source files`); 110 | }); 111 | -------------------------------------------------------------------------------- /maptiles/scripts/unzip-blocks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for FILE in $(ls geographies/blocks/*.zip); 4 | do 5 | unzip -o -d geographies/blocks $FILE 6 | done -------------------------------------------------------------------------------- /output/writer.go: -------------------------------------------------------------------------------- 1 | package output 2 | 3 | import ( 4 | "context" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | 9 | "github.com/m-lab/go/uploader" 10 | ) 11 | 12 | // GCSWriter provides Write operations to a GCS bucket. 13 | type GCSWriter struct { 14 | up *uploader.Uploader 15 | } 16 | 17 | // NewGCSWriter creates a new GCSWriter from the given uploader.Uploader. 18 | func NewGCSWriter(up *uploader.Uploader) *GCSWriter { 19 | return &GCSWriter{up: up} 20 | } 21 | 22 | // Write creates a new object at path containing content. 23 | func (u *GCSWriter) Write(ctx context.Context, path string, content []byte) error { 24 | _, err := u.up.Upload(ctx, path, content) 25 | return err 26 | } 27 | 28 | // LocalWriter provides Write operations to a local directory. 29 | type LocalWriter struct { 30 | dir string 31 | } 32 | 33 | // NewLocalWriter creates a new LocalWriter for the given output directory. 34 | func NewLocalWriter(dir string) *LocalWriter { 35 | return &LocalWriter{dir: dir} 36 | } 37 | 38 | // Write creates a new file at path containing content. 
39 | func (lu *LocalWriter) Write(ctx context.Context, path string, content []byte) error { 40 | p := filepath.Join(lu.dir, path) 41 | d := filepath.Dir(p) // path may include additional directory elements. 42 | err := os.MkdirAll(d, os.ModePerm) 43 | if err != nil { 44 | return err 45 | } 46 | return ioutil.WriteFile(p, content, 0664) 47 | } 48 | -------------------------------------------------------------------------------- /output/writer_test.go: -------------------------------------------------------------------------------- 1 | package output 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "path/filepath" 7 | "testing" 8 | 9 | "github.com/m-lab/go/cloudtest/gcsfake" 10 | "github.com/m-lab/go/testingx" 11 | "github.com/m-lab/go/uploader" 12 | ) 13 | 14 | func TestGCSWriter_Write(t *testing.T) { 15 | failingBucket := gcsfake.NewBucketHandle() 16 | failingBucket.WritesMustFail = true 17 | 18 | client := &gcsfake.GCSClient{} 19 | client.AddTestBucket("test_bucket", gcsfake.NewBucketHandle()) 20 | client.AddTestBucket("failing_bucket", failingBucket) 21 | 22 | tests := []struct { 23 | name string 24 | path string 25 | content []byte 26 | wantErr bool 27 | }{ 28 | { 29 | name: "success-write", 30 | path: "output/name", 31 | content: []byte{0, 1, 2}, 32 | }, 33 | } 34 | for _, tt := range tests { 35 | t.Run(tt.name, func(t *testing.T) { 36 | u := NewGCSWriter(uploader.New(client, "test_bucket")) 37 | if err := u.Write(context.Background(), tt.path, tt.content); (err != nil) != tt.wantErr { 38 | t.Errorf("GCSWriter.Write() error = %v, wantErr %v", err, tt.wantErr) 39 | } 40 | }) 41 | } 42 | } 43 | 44 | func TestLocalWriter_Write(t *testing.T) { 45 | tests := []struct { 46 | name string 47 | dir string 48 | path string 49 | content []byte 50 | wantErr bool 51 | }{ 52 | { 53 | name: "success", 54 | dir: t.TempDir(), 55 | path: "output/name", 56 | content: []byte{0, 1, 2}, 57 | }, 58 | { 59 | name: "error", 60 | dir: t.TempDir(), 61 | path: "file.not-a-dir/name", 62 | wantErr: true, 63 | }, 64 | } 65 | for _, tt := range tests { 66 | t.Run(tt.name, func(t *testing.T) { 67 | if tt.wantErr { 68 | p := filepath.Join(tt.dir, tt.path) 69 | err := os.MkdirAll(filepath.Dir(filepath.Dir(p)), os.ModePerm) 70 | testingx.Must(t, err, "failed to mkdir") 71 | // create a file where a directory should be. 
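// This turns filepath.Dir(p) into an existing regular file, so the os.MkdirAll
// call inside LocalWriter.Write fails and the error path is exercised.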
72 | f, err := os.Create(filepath.Dir(p)) 73 | testingx.Must(t, err, "failed to create file") 74 | f.Close() 75 | } 76 | lu := NewLocalWriter(tt.dir) 77 | if err := lu.Write(context.Background(), tt.path, tt.content); (err != nil) != tt.wantErr { 78 | t.Errorf("LocalWriter.Write() error = %v, wantErr %v", err, tt.wantErr) 79 | } 80 | }) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /pipeline/handlers_test.go: -------------------------------------------------------------------------------- 1 | package pipeline 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "io/ioutil" 8 | "net/http" 9 | "net/http/httptest" 10 | "reflect" 11 | "testing" 12 | "text/template" 13 | "time" 14 | 15 | "github.com/googleapis/google-cloud-go-testing/bigquery/bqiface" 16 | "github.com/m-lab/stats-pipeline/config" 17 | "github.com/m-lab/stats-pipeline/histogram" 18 | ) 19 | 20 | type mockClient struct { 21 | bqiface.Client 22 | } 23 | 24 | type mockExporter struct{} 25 | 26 | type mockHistogramTable struct{} 27 | 28 | func (ex *mockExporter) Export(context.Context, config.Config, *template.Template, int) error { 29 | return nil 30 | } 31 | 32 | func (h *mockHistogramTable) UpdateHistogram(context.Context, time.Time, time.Time) error { 33 | return nil 34 | } 35 | 36 | func TestHandler_ServeHTTP(t *testing.T) { 37 | mc := &mockClient{} 38 | me := &mockExporter{} 39 | conf := map[string]config.Config{ 40 | "test": { 41 | HistogramQueryFile: "testdata/test_histogram.sql", 42 | ExportQueryFile: "testdata/test_export.sql", 43 | Dataset: "test", 44 | Table: "testtable", 45 | }, 46 | } 47 | 48 | newHistogramTable = func(name, ds string, config histogram.QueryConfig, 49 | client bqiface.Client) HistogramTable { 50 | return &mockHistogramTable{} 51 | } 52 | 53 | tests := []struct { 54 | name string 55 | w http.ResponseWriter 56 | r *http.Request 57 | bqClient bqiface.Client 58 | exporter Exporter 59 | config map[string]config.Config 60 | statusCode int 61 | response *pipelineResult 62 | }{ 63 | { 64 | name: "ok", 65 | bqClient: mc, 66 | exporter: me, 67 | config: conf, 68 | r: httptest.NewRequest(http.MethodPost, 69 | "/v0/pipeline?start=2021-01-01&end=2021-12-31&step=all", 70 | bytes.NewReader([]byte{})), 71 | statusCode: http.StatusOK, 72 | response: &pipelineResult{ 73 | CompletedSteps: []pipelineStep{histogramsStep, exportsStep}, 74 | Errors: []string{}, 75 | }, 76 | }, 77 | { 78 | name: "invalid-method", 79 | r: httptest.NewRequest(http.MethodGet, 80 | "/v0/pipeline?year=2020&step=all", nil), 81 | statusCode: http.StatusMethodNotAllowed, 82 | response: &pipelineResult{ 83 | CompletedSteps: []pipelineStep{}, 84 | Errors: []string{ 85 | http.StatusText(http.StatusMethodNotAllowed), 86 | }, 87 | }, 88 | }, 89 | { 90 | name: "missing-parameter-start", 91 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?end=2021-12-31&step=all", 92 | nil), 93 | statusCode: http.StatusBadRequest, 94 | response: &pipelineResult{ 95 | CompletedSteps: []pipelineStep{}, 96 | Errors: []string{errMissingStartDate.Error()}, 97 | }, 98 | }, 99 | { 100 | name: "missing-parameter-end", 101 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=2021-01-01&step=all", 102 | nil), 103 | statusCode: http.StatusBadRequest, 104 | response: &pipelineResult{ 105 | CompletedSteps: []pipelineStep{}, 106 | Errors: []string{errMissingEndDate.Error()}, 107 | }, 108 | }, 109 | { 110 | name: "missing-parameter-step", 111 | r: httptest.NewRequest(http.MethodPost, 112 | 
"/v0/pipeline?start=2021-01-01&end=2021-12-31", nil), 113 | statusCode: http.StatusBadRequest, 114 | response: &pipelineResult{ 115 | CompletedSteps: []pipelineStep{}, 116 | Errors: []string{errMissingStep.Error()}, 117 | }, 118 | }, 119 | { 120 | name: "action-histograms", 121 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=2021-01-01&end=2021-12-31&step=histograms", bytes.NewReader([]byte{})), 122 | statusCode: http.StatusOK, 123 | response: &pipelineResult{ 124 | CompletedSteps: []pipelineStep{histogramsStep}, 125 | Errors: []string{}, 126 | }, 127 | }, 128 | { 129 | name: "action-exports", 130 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=2021-01-01&end=2021-12-31&step=exports", bytes.NewReader([]byte{})), 131 | statusCode: http.StatusOK, 132 | response: &pipelineResult{ 133 | CompletedSteps: []pipelineStep{exportsStep}, 134 | Errors: []string{}, 135 | }, 136 | }, 137 | { 138 | name: "action-all", 139 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=2021-01-01&end=2021-12-31&step=all", bytes.NewReader([]byte{})), 140 | statusCode: http.StatusOK, 141 | response: &pipelineResult{ 142 | CompletedSteps: []pipelineStep{histogramsStep, exportsStep}, 143 | Errors: []string{}, 144 | }, 145 | }, 146 | { 147 | name: "invalid-start", 148 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=xyz&end=2021-12-31&step=all", bytes.NewReader([]byte{})), 149 | statusCode: http.StatusBadRequest, 150 | response: &pipelineResult{ 151 | CompletedSteps: []pipelineStep{}, 152 | Errors: []string{"parsing time \"xyz\" as \"2006-01-02\": cannot parse \"xyz\" as \"2006\""}, 153 | }, 154 | }, 155 | { 156 | name: "invalid-end", 157 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=2021-01-01&end=xyz&step=all", bytes.NewReader([]byte{})), 158 | statusCode: http.StatusBadRequest, 159 | response: &pipelineResult{ 160 | CompletedSteps: []pipelineStep{}, 161 | Errors: []string{"parsing time \"xyz\" as \"2006-01-02\": cannot parse \"xyz\" as \"2006\""}, 162 | }, 163 | }, 164 | { 165 | name: "invalid-range-start-after-end", 166 | r: httptest.NewRequest(http.MethodPost, "/v0/pipeline?start=2021-01-01&end=2020-12-31&step=all", bytes.NewReader([]byte{})), 167 | statusCode: http.StatusBadRequest, 168 | response: &pipelineResult{ 169 | CompletedSteps: []pipelineStep{}, 170 | Errors: []string{errInvalidDateRange.Error()}, 171 | }, 172 | }, 173 | } 174 | for _, tt := range tests { 175 | t.Run(tt.name, func(t *testing.T) { 176 | h := NewHandler(tt.bqClient, tt.exporter, tt.config) 177 | recorder := httptest.NewRecorder() 178 | h.ServeHTTP(recorder, tt.r) 179 | statusCode := recorder.Result().StatusCode 180 | if statusCode != tt.statusCode { 181 | t.Errorf("ServeHTTP(): expected %v, got %v", tt.statusCode, statusCode) 182 | } 183 | // Read response body and compare with the expected value. 
184 | if tt.response != nil { 185 | body, err := ioutil.ReadAll(recorder.Result().Body) 186 | if err != nil { 187 | t.Errorf("Error while reading response body") 188 | } 189 | var responseJSON pipelineResult 190 | err = json.Unmarshal(body, &responseJSON) 191 | if err != nil { 192 | t.Errorf("Error while unmarshalling response body") 193 | } 194 | 195 | if !reflect.DeepEqual(responseJSON, *tt.response) { 196 | t.Errorf("Invalid response body: %v, expected %v", responseJSON, 197 | tt.response) 198 | } 199 | } 200 | 201 | }) 202 | } 203 | } 204 | 205 | func TestNewHandler(t *testing.T) { 206 | mc := &mockClient{} 207 | me := &mockExporter{} 208 | config := map[string]config.Config{} 209 | h := NewHandler(mc, me, config) 210 | if h == nil { 211 | t.Fatalf("NewHandler() returned nil") 212 | } 213 | if h.bqClient != mc || h.exporter != me || !reflect.DeepEqual(h.configs, config) { 214 | t.Errorf("NewHandler() didn't return the expected handler") 215 | } 216 | // Check we can read from the channel. 217 | if _, ok := <-h.pipelineCanRun; !ok { 218 | t.Errorf("NewHandler() didn't return a properly initialized handler.") 219 | } 220 | } 221 | 222 | func Test_getYearlyRanges(t *testing.T) { 223 | tests := []struct { 224 | name string 225 | start time.Time 226 | end time.Time 227 | want [][]time.Time 228 | }{ 229 | { 230 | name: "single-year", 231 | start: time.Date(2021, time.January, 1, 0, 0, 0, 0, time.UTC), 232 | end: time.Date(2021, time.December, 31, 0, 0, 0, 0, time.UTC), 233 | want: [][]time.Time{ 234 | { 235 | time.Date(2021, time.January, 1, 0, 0, 0, 0, time.UTC), 236 | time.Date(2021, time.December, 31, 0, 0, 0, 0, time.UTC), 237 | }, 238 | }, 239 | }, 240 | { 241 | name: "two-adjacent-years", 242 | start: time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC), 243 | end: time.Date(2021, time.August, 31, 0, 0, 0, 0, time.UTC), 244 | want: [][]time.Time{ 245 | { 246 | time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC), 247 | time.Date(2020, time.December, 31, 0, 0, 0, 0, time.UTC), 248 | }, 249 | { 250 | time.Date(2021, time.January, 1, 0, 0, 0, 0, time.UTC), 251 | time.Date(2021, time.August, 31, 0, 0, 0, 0, time.UTC), 252 | }, 253 | }, 254 | }, 255 | { 256 | name: "multiple-years", 257 | start: time.Date(2020, time.March, 1, 0, 0, 0, 0, time.UTC), 258 | end: time.Date(2022, time.August, 31, 0, 0, 0, 0, time.UTC), 259 | want: [][]time.Time{ 260 | { 261 | time.Date(2020, time.March, 1, 0, 0, 0, 0, time.UTC), 262 | time.Date(2020, time.December, 31, 0, 0, 0, 0, time.UTC), 263 | }, 264 | { 265 | time.Date(2021, time.January, 1, 0, 0, 0, 0, time.UTC), 266 | time.Date(2021, time.December, 31, 0, 0, 0, 0, time.UTC), 267 | }, 268 | { 269 | time.Date(2022, time.January, 1, 0, 0, 0, 0, time.UTC), 270 | time.Date(2022, time.August, 31, 0, 0, 0, 0, time.UTC), 271 | }, 272 | }, 273 | }, 274 | } 275 | for _, tt := range tests { 276 | t.Run(tt.name, func(t *testing.T) { 277 | if got := getYearlyRanges(tt.start, tt.end); !reflect.DeepEqual(got, tt.want) { 278 | t.Errorf("getYearlyRanges() = %v, want %v", got, tt.want) 279 | } 280 | }) 281 | } 282 | } 283 | -------------------------------------------------------------------------------- /pipeline/headers.go: -------------------------------------------------------------------------------- 1 | package pipeline 2 | 3 | import "errors" 4 | 5 | var ( 6 | errMissingStartDate = errors.New("missing mandatory parameter: start") 7 | errMissingEndDate = errors.New("missing mandatory parameter: end") 8 | errInvalidDateRange = errors.New("the end date must be after 
the start date") 9 | errMissingStep = errors.New("missing mandatory parameter: step") 10 | errAlreadyRunning = errors.New("the pipeline is running already") 11 | ) 12 | -------------------------------------------------------------------------------- /pipeline/testdata/test_export.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {{.sourceTable}} -------------------------------------------------------------------------------- /pipeline/testdata/test_histogram.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM test -------------------------------------------------------------------------------- /statistics/exports/cities.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, country_code, ISO3166_2region1, city, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/cities_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, country_code, ISO3166_2region1, city, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/continents.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, date 5 | -------------------------------------------------------------------------------- /statistics/exports/continents_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/countries.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, country_code, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/countries_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, country_code, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/global_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/regions.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER 
BY continent_code, country_code, ISO3166_2region1, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/regions_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY continent_code, country_code, ISO3166_2region1, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/us_counties.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY GEOID, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/us_counties_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY GEOID, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/us_states.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY GEOID, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/us_states_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY GEOID, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/us_tracts.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY GEOID, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/exports/us_tracts_asn.sql: -------------------------------------------------------------------------------- 1 | SELECT *, EXTRACT(YEAR from date) as year 2 | FROM {{ .sourceTable }} 3 | WHERE shard = {{ .partitionID }} 4 | ORDER BY GEOID, asn, date, bucket_min 5 | -------------------------------------------------------------------------------- /statistics/queries/continent_asn_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | dl_per_location AS ( 9 | SELECT 10 | date, 11 | client.Geo.ContinentCode AS continent_code, 12 | client.Network.ASNumber AS asn, 13 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 14 | id, 15 | a.MeanThroughputMbps AS mbps, 16 | a.MinRTT AS MinRTT 17 | FROM `measurement-lab.ndt.unified_downloads` 18 | WHERE date BETWEEN @startdate AND @enddate 19 | AND a.MeanThroughputMbps != 0 20 | ), 21 | --Filter for only tests With good locations and valid IPs 22 | 
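-- (NET.SAFE_IP_FROM_STRING returns NULL instead of raising an error for a
-- malformed client IP, so the ip IS NOT NULL condition below drops those rows.)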
dl_per_location_cleaned AS ( 23 | SELECT * FROM dl_per_location 24 | WHERE 25 | continent_code IS NOT NULL 26 | AND continent_code != "" 27 | AND asn IS NOT NULL 28 | AND ip IS NOT NULL 29 | ), 30 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 31 | dl_fingerprinted AS ( 32 | SELECT 33 | date, 34 | continent_code, 35 | asn, 36 | ip, 37 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 38 | FROM dl_per_location_cleaned 39 | GROUP BY date, continent_code, asn, ip 40 | ), 41 | --Select two random rows for each IP using a prime number larger than the 42 | -- total number of tests. random1 is used for per day/geo statistics in 43 | -- `dl_stats_per_day` and log averages using both random1 and random2 44 | dl_random_ip_rows_perday AS ( 45 | SELECT 46 | date, 47 | continent_code, 48 | asn, 49 | ip, 50 | ARRAY_LENGTH(members) AS tests, 51 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 52 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 53 | FROM dl_fingerprinted 54 | ), 55 | --Calculate log averages and statistics per day from random samples 56 | dl_stats_per_day AS ( 57 | SELECT 58 | date, continent_code, asn, 59 | COUNT(*) AS dl_samples_day, 60 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 61 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 62 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 63 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 64 | ROUND(MIN(random1.mbps),3) AS download_MIN, 65 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 66 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 67 | ROUND(AVG(random1.mbps),3) AS download_AVG, 68 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 69 | ROUND(MAX(random1.mbps),3) AS download_MAX, 70 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 71 | FROM dl_random_ip_rows_perday 72 | GROUP BY continent_code, asn, date 73 | ), 74 | --Count the samples that fall into each bucket and get frequencies 75 | dl_histogram AS ( 76 | SELECT 77 | date, 78 | continent_code, 79 | asn, 80 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
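-- (0.5623413251903491 is POW(10, 0 - .25), the left edge of the first bucket
-- generated above; the edges run roughly 0.56, 1.78, 5.6, 17.8, 56, 178, 562,
-- 1778, with the last bucket ending near 5623 Mbps.)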
81 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 82 | ELSE bucket_left END AS bucket_min, 83 | bucket_right AS bucket_max, 84 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 85 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 86 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 87 | GROUP BY 88 | date, 89 | continent_code, 90 | asn, 91 | bucket_min, 92 | bucket_max 93 | ), 94 | --Repeat for Upload tests 95 | --Select the initial set of tests 96 | ul_per_location AS ( 97 | SELECT 98 | date, 99 | client.Geo.ContinentCode AS continent_code, 100 | client.Network.ASNumber AS asn, 101 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 102 | id, 103 | a.MeanThroughputMbps AS mbps, 104 | a.MinRTT AS MinRTT 105 | FROM `measurement-lab.ndt.unified_uploads` 106 | WHERE date BETWEEN @startdate AND @enddate 107 | AND a.MeanThroughputMbps != 0 108 | ), 109 | --Filter for only tests With good locations and valid IPs 110 | ul_per_location_cleaned AS ( 111 | SELECT * FROM ul_per_location 112 | WHERE 113 | continent_code IS NOT NULL 114 | AND continent_code != "" 115 | AND asn IS NOT NULL 116 | AND ip IS NOT NULL 117 | ), 118 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 119 | ul_fingerprinted AS ( 120 | SELECT 121 | date, 122 | continent_code, 123 | asn, 124 | ip, 125 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 126 | FROM ul_per_location_cleaned 127 | GROUP BY date, continent_code, asn, ip 128 | ), 129 | --Select two random rows for each IP using a prime number larger than the 130 | -- total number of tests. random1 is used for per day/geo statistics in 131 | -- `ul_stats_per_day` and log averages using both random1 and random2 132 | ul_random_ip_rows_perday AS ( 133 | SELECT 134 | date, 135 | continent_code, 136 | asn, 137 | ip, 138 | ARRAY_LENGTH(members) AS tests, 139 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 140 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 141 | FROM ul_fingerprinted 142 | ), 143 | --Calculate log averages and statistics per day from random samples 144 | ul_stats_per_day AS ( 145 | SELECT 146 | date, continent_code, asn, 147 | COUNT(*) AS ul_samples_day, 148 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 149 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 150 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 151 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 152 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 153 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 154 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 155 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 156 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 157 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 158 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 159 | FROM ul_random_ip_rows_perday 160 | GROUP BY continent_code, asn, date 161 | ), 162 | --Count the samples that fall into each bucket and get frequencies 163 | ul_histogram AS ( 164 | SELECT 165 | date, 166 | continent_code, 167 | asn, 168 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
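-- (These are the same bucket edges as in dl_histogram, which is what lets the
-- results CTE below join the two histograms on bucket_min and bucket_max.)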
169 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 170 | ELSE bucket_left END AS bucket_min, 171 | bucket_right AS bucket_max, 172 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 173 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 174 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 175 | GROUP BY 176 | date, 177 | continent_code, 178 | asn, 179 | bucket_min, 180 | bucket_max 181 | ), 182 | --Gather final result set 183 | results AS ( 184 | SELECT *, MOD(ABS(FARM_FINGERPRINT(continent_code)), 1000) as shard FROM 185 | dl_histogram 186 | JOIN ul_histogram USING (date, continent_code, asn, bucket_min, bucket_max) 187 | JOIN dl_stats_per_day USING (date, continent_code, asn) 188 | JOIN ul_stats_per_day USING (date, continent_code, asn) 189 | ) 190 | --Show the results 191 | SELECT * FROM results 192 | -------------------------------------------------------------------------------- /statistics/queries/continent_country_asn_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | dl_per_location AS ( 9 | SELECT 10 | date, 11 | client.Geo.ContinentCode AS continent_code, 12 | client.Geo.CountryCode AS country_code, 13 | client.Network.ASNumber AS asn, 14 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 15 | id, 16 | a.MeanThroughputMbps AS mbps, 17 | a.MinRTT AS MinRTT 18 | FROM `measurement-lab.ndt.unified_downloads` 19 | WHERE date BETWEEN @startdate AND @enddate 20 | AND a.MeanThroughputMbps != 0 21 | ), 22 | --Filter for only tests With good locations and valid IPs 23 | dl_per_location_cleaned AS ( 24 | SELECT * FROM dl_per_location 25 | WHERE 26 | continent_code IS NOT NULL AND continent_code != "" 27 | AND country_code IS NOT NULL AND country_code != "" 28 | AND asn IS NOT NULL 29 | AND ip IS NOT NULL 30 | ), 31 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 32 | dl_fingerprinted AS ( 33 | SELECT 34 | date, 35 | continent_code, 36 | country_code, 37 | asn, 38 | ip, 39 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 40 | FROM dl_per_location_cleaned 41 | GROUP BY date, continent_code, country_code, asn, ip 42 | ), 43 | --Select two random rows for each IP using a prime number larger than the 44 | -- total number of tests. 
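-- For illustration only (the counts are hypothetical): an IP with 7 tests on a
-- given day yields MOD(511232941, 7) = 2 and MOD(906686609, 7) = 3, so the
-- members at offsets 2 and 3 of the fingerprint-ordered array are picked, and
-- the same two rows are selected every time the query runs.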
random1 is used for per day/geo statistics in 45 | -- `dl_stats_per_day` and log averages using both random1 and random2 46 | dl_random_ip_rows_perday AS ( 47 | SELECT 48 | date, 49 | continent_code, 50 | country_code, 51 | asn, 52 | ip, 53 | ARRAY_LENGTH(members) AS tests, 54 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 55 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 56 | FROM dl_fingerprinted 57 | ), 58 | --Calculate log averages and statistics per day from random samples 59 | dl_stats_per_day AS ( 60 | SELECT 61 | date, continent_code, country_code, asn, 62 | COUNT(*) AS dl_samples_day, 63 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 64 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 65 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 66 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 67 | ROUND(MIN(random1.mbps),3) AS download_MIN, 68 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 69 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 70 | ROUND(AVG(random1.mbps),3) AS download_AVG, 71 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 72 | ROUND(MAX(random1.mbps),3) AS download_MAX, 73 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 74 | FROM dl_random_ip_rows_perday 75 | GROUP BY date, continent_code, country_code, asn 76 | ), 77 | --Count the samples that fall into each bucket and get frequencies 78 | dl_histogram AS ( 79 | SELECT 80 | date, 81 | continent_code, 82 | country_code, 83 | asn, 84 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 85 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 86 | ELSE bucket_left END AS bucket_min, 87 | bucket_right AS bucket_max, 88 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 89 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 90 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 91 | GROUP BY 92 | date, 93 | continent_code, 94 | country_code, 95 | asn, 96 | bucket_min, 97 | bucket_max 98 | ), 99 | --Repeat for Upload tests 100 | --Select the initial set of tests 101 | ul_per_location AS ( 102 | SELECT 103 | date, 104 | client.Geo.ContinentCode AS continent_code, 105 | client.Geo.CountryCode AS country_code, 106 | client.Network.ASNumber AS asn, 107 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 108 | id, 109 | a.MeanThroughputMbps AS mbps, 110 | a.MinRTT AS MinRTT 111 | FROM `measurement-lab.ndt.unified_uploads` 112 | WHERE date BETWEEN @startdate AND @enddate 113 | AND a.MeanThroughputMbps != 0 114 | ), 115 | --Filter for only tests With good locations and valid IPs 116 | ul_per_location_cleaned AS ( 117 | SELECT * FROM ul_per_location 118 | WHERE 119 | continent_code IS NOT NULL AND continent_code != "" 120 | AND country_code IS NOT NULL AND country_code != "" 121 | AND asn IS NOT NULL 122 | AND ip IS NOT NULL 123 | ), 124 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 
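-- Because tests were grouped per (date, geography, asn, ip) above, each client
-- IP contributes at most the two sampled rows per day to the statistics below,
-- which helps keep heavy repeat testers from dominating the aggregates.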
125 | ul_fingerprinted AS ( 126 | SELECT 127 | date, 128 | continent_code, 129 | country_code, 130 | asn, 131 | ip, 132 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 133 | FROM ul_per_location_cleaned 134 | GROUP BY date, continent_code, country_code, asn, ip 135 | ), 136 | --Select two random rows for each IP using a prime number larger than the 137 | -- total number of tests. random1 is used for per day/geo statistics in 138 | -- `ul_stats_per_day` and log averages using both random1 and random2 139 | ul_random_ip_rows_perday AS ( 140 | SELECT 141 | date, 142 | continent_code, 143 | country_code, 144 | asn, 145 | ip, 146 | ARRAY_LENGTH(members) AS tests, 147 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 148 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 149 | FROM ul_fingerprinted 150 | ), 151 | --Calculate log averages and statistics per day from random samples 152 | ul_stats_per_day AS ( 153 | SELECT 154 | date, continent_code, country_code, asn, 155 | COUNT(*) AS ul_samples_day, 156 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 157 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 158 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 159 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 160 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 161 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 162 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 163 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 164 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 165 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 166 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 167 | FROM ul_random_ip_rows_perday 168 | GROUP BY date, continent_code, country_code, asn 169 | ), 170 | --Count the samples that fall into each bucket and get frequencies 171 | ul_histogram AS ( 172 | SELECT 173 | date, 174 | continent_code, 175 | country_code, 176 | asn, 177 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
178 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 179 | ELSE bucket_left END AS bucket_min, 180 | bucket_right AS bucket_max, 181 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 182 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 183 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 184 | GROUP BY 185 | date, 186 | continent_code, 187 | country_code, 188 | asn, 189 | bucket_min, 190 | bucket_max 191 | ), 192 | --Gather final result set 193 | results AS ( 194 | SELECT *, MOD(ABS(FARM_FINGERPRINT(CAST(asn AS STRING))), 4000) as shard FROM 195 | dl_histogram 196 | JOIN ul_histogram USING (date, continent_code, country_code, asn, bucket_min, 197 | bucket_max) 198 | JOIN dl_stats_per_day USING (date, continent_code, country_code, asn) 199 | JOIN ul_stats_per_day USING (date, continent_code, country_code, asn) 200 | ) 201 | --Show the results 202 | SELECT * FROM results 203 | -------------------------------------------------------------------------------- /statistics/queries/continent_country_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | dl_per_location AS ( 9 | SELECT 10 | date, 11 | client.Geo.ContinentCode AS continent_code, 12 | client.Geo.CountryCode AS country_code, 13 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 14 | id, 15 | a.MeanThroughputMbps AS mbps, 16 | a.MinRTT AS MinRTT 17 | FROM `measurement-lab.ndt.unified_downloads` 18 | WHERE date BETWEEN @startdate AND @enddate 19 | AND a.MeanThroughputMbps != 0 20 | ), 21 | --Filter for only tests With good locations and valid IPs 22 | dl_per_location_cleaned AS ( 23 | SELECT * FROM dl_per_location 24 | WHERE 25 | continent_code IS NOT NULL AND continent_code != "" 26 | AND country_code IS NOT NULL AND country_code != "" 27 | AND ip IS NOT NULL 28 | ), 29 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 30 | dl_fingerprinted AS ( 31 | SELECT 32 | date, 33 | continent_code, 34 | country_code, 35 | ip, 36 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 37 | FROM dl_per_location_cleaned 38 | GROUP BY date, continent_code, country_code, ip 39 | ), 40 | --Select two random rows for each IP using a prime number larger than the 41 | -- total number of tests. 
random1 is used for per day/geo statistics in 42 | -- `dl_stats_per_day` and log averages using both random1 and random2 43 | dl_random_ip_rows_perday AS ( 44 | SELECT 45 | date, 46 | continent_code, 47 | country_code, 48 | ip, 49 | ARRAY_LENGTH(members) AS tests, 50 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 51 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 52 | FROM dl_fingerprinted 53 | ), 54 | --Calculate log averages and statistics per day from random samples 55 | dl_stats_per_day AS ( 56 | SELECT 57 | date, continent_code, country_code, 58 | COUNT(*) AS dl_samples_day, 59 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 60 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 61 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 62 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 63 | ROUND(MIN(random1.mbps),3) AS download_MIN, 64 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 65 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 66 | ROUND(AVG(random1.mbps),3) AS download_AVG, 67 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 68 | ROUND(MAX(random1.mbps),3) AS download_MAX, 69 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 70 | FROM dl_random_ip_rows_perday 71 | GROUP BY continent_code, country_code, date 72 | ), 73 | --Count the samples that fall into each bucket and get frequencies 74 | dl_histogram AS ( 75 | SELECT 76 | date, 77 | continent_code, 78 | country_code, 79 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 80 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 81 | ELSE bucket_left END AS bucket_min, 82 | bucket_right AS bucket_max, 83 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 84 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 85 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 86 | GROUP BY 87 | date, 88 | continent_code, 89 | country_code, 90 | bucket_min, 91 | bucket_max 92 | ), 93 | --Repeat for Upload tests 94 | --Select the initial set of tests 95 | ul_per_location AS ( 96 | SELECT 97 | date, 98 | client.Geo.ContinentCode AS continent_code, 99 | client.Geo.CountryCode AS country_code, 100 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 101 | id, 102 | a.MeanThroughputMbps AS mbps, 103 | a.MinRTT AS MinRTT 104 | FROM `measurement-lab.ndt.unified_uploads` 105 | WHERE date BETWEEN @startdate AND @enddate 106 | AND a.MeanThroughputMbps != 0 107 | ), 108 | --Filter for only tests With good locations and valid IPs 109 | ul_per_location_cleaned AS ( 110 | SELECT * FROM ul_per_location 111 | WHERE 112 | continent_code IS NOT NULL AND continent_code != "" 113 | AND country_code IS NOT NULL AND country_code != "" 114 | AND ip IS NOT NULL 115 | ), 116 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 
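-- FARM_FINGERPRINT(id) is a deterministic 64-bit hash of the test id, so ordering each
-- (date, geo, ip) group's ARRAY_AGG by ABS(FARM_FINGERPRINT(id)) produces a stable element
-- order that is unrelated to the measured values; together with the fixed primes used in the
-- next CTE, the same "random" rows are re-selected on every run over the same data.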
117 | ul_fingerprinted AS ( 118 | SELECT 119 | date, 120 | continent_code, 121 | country_code, 122 | ip, 123 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 124 | FROM ul_per_location_cleaned 125 | GROUP BY date, continent_code, country_code, ip 126 | ), 127 | --Select two random rows for each IP using a prime number larger than the 128 | -- total number of tests. random1 is used for per day/geo statistics in 129 | -- `ul_stats_per_day` and log averages using both random1 and random2 130 | ul_random_ip_rows_perday AS ( 131 | SELECT 132 | date, 133 | continent_code, 134 | country_code, 135 | ip, 136 | ARRAY_LENGTH(members) AS tests, 137 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 138 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 139 | FROM ul_fingerprinted 140 | ), 141 | --Calculate log averages and statistics per day from random samples 142 | ul_stats_per_day AS ( 143 | SELECT 144 | date, continent_code, country_code, 145 | COUNT(*) AS ul_samples_day, 146 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 147 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 148 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 149 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 150 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 151 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 152 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 153 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 154 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 155 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 156 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 157 | FROM ul_random_ip_rows_perday 158 | GROUP BY continent_code, country_code, date 159 | ), 160 | --Count the samples that fall into each bucket and get frequencies 161 | ul_histogram AS ( 162 | SELECT 163 | date, 164 | continent_code, 165 | country_code, 166 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
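-- How the bucket counts work: the CROSS JOIN pairs every sampled row with every bucket, so
-- within one (date, continent_code, country_code, bucket) group COUNT(*) is the total number
-- of samples for that day and geography, and COUNTIF(mbps >= bucket_left AND mbps < bucket_right)
-- counts the samples landing in that half-open bucket. Note that the comparison uses the
-- generated bucket_left; the CASE below only relabels the lowest edge in the reported bucket_min: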
167 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 168 | ELSE bucket_left END AS bucket_min, 169 | bucket_right AS bucket_max, 170 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 171 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 172 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 173 | GROUP BY 174 | date, 175 | continent_code, 176 | country_code, 177 | bucket_min, 178 | bucket_max 179 | ), 180 | --Gather final result set 181 | results AS ( 182 | SELECT *, MOD(ABS(FARM_FINGERPRINT(country_code)), 1000) as shard FROM dl_histogram 183 | JOIN ul_histogram USING (date, continent_code, country_code, bucket_min, bucket_max) 184 | JOIN dl_stats_per_day USING (date, continent_code, country_code) 185 | JOIN ul_stats_per_day USING (date, continent_code, country_code) 186 | ) 187 | --Show the results 188 | SELECT * FROM results 189 | -------------------------------------------------------------------------------- /statistics/queries/continent_country_region_asn_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | --Filter for only tests With good locations and valid IPs 9 | dl_per_location_cleaned AS ( 10 | SELECT 11 | date, 12 | client.Geo.ContinentCode AS continent_code, 13 | client.Geo.CountryCode AS country_code, 14 | CASE WHEN client.Geo.Subdivision1ISOCode != "" AND client.Geo.Subdivision1ISOCode IS NOT NULL 15 | THEN CONCAT(client.Geo.CountryCode,"-",client.Geo.Subdivision1ISOCode) 16 | ELSE CONCAT(client.Geo.CountryCode,"-",client.Geo.region) 17 | END AS ISO3166_2region1, 18 | client.Network.ASNumber AS asn, 19 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 20 | id, 21 | a.MeanThroughputMbps AS mbps, 22 | a.MinRTT AS MinRTT 23 | FROM `measurement-lab.ndt.unified_downloads` 24 | WHERE date BETWEEN @startdate AND @enddate 25 | AND a.MeanThroughputMbps != 0 26 | AND client.Geo.ContinentCode IS NOT NULL AND client.Geo.ContinentCode != "" 27 | AND client.Geo.CountryCode IS NOT NULL AND client.Geo.CountryCode != "" 28 | AND (client.Geo.Subdivision1ISOCode IS NOT NULL OR client.Geo.Region IS NOT NULL) 29 | AND (client.Geo.Subdivision1ISOCode != "" OR client.Geo.Region != "") 30 | AND client.Network.ASNumber IS NOT NULL 31 | AND Client.IP IS NOT NULL 32 | ), 33 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 34 | dl_fingerprinted AS ( 35 | SELECT 36 | date, 37 | continent_code, 38 | country_code, 39 | asn, 40 | ip, 41 | ISO3166_2region1, 42 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 43 | FROM dl_per_location_cleaned 44 | GROUP BY date, continent_code, country_code, ISO3166_2region1, asn, ip 45 | ), 46 | --Select two random rows for each IP using a prime number larger than the 47 | -- total number of tests. 
random1 is used for per day/geo statistics in 48 | -- `dl_stats_per_day` and log averages using both random1 and random2 49 | dl_random_ip_rows_perday AS ( 50 | SELECT 51 | date, 52 | continent_code, 53 | country_code, 54 | ISO3166_2region1, 55 | asn, 56 | ip, 57 | ARRAY_LENGTH(members) AS tests, 58 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 59 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 60 | FROM dl_fingerprinted 61 | ), 62 | --Calculate log averages and statistics per day from random samples 63 | dl_stats_per_day AS ( 64 | SELECT 65 | date, continent_code, country_code, ISO3166_2region1, asn, 66 | COUNT(*) AS dl_samples_day, 67 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 68 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 69 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 70 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 71 | ROUND(MIN(random1.mbps),3) AS download_MIN, 72 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 73 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 74 | ROUND(AVG(random1.mbps),3) AS download_AVG, 75 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 76 | ROUND(MAX(random1.mbps),3) AS download_MAX, 77 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 78 | FROM dl_random_ip_rows_perday 79 | GROUP BY date, continent_code, country_code, ISO3166_2region1, asn 80 | ), 81 | --Count the samples that fall into each bucket and get frequencies 82 | dl_histogram AS ( 83 | SELECT 84 | date, 85 | continent_code, 86 | country_code, 87 | ISO3166_2region1, 88 | asn, 89 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
90 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 91 | ELSE bucket_left END AS bucket_min, 92 | bucket_right AS bucket_max, 93 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 94 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 95 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 96 | GROUP BY 97 | date, 98 | continent_code, 99 | country_code, 100 | ISO3166_2region1, 101 | asn, 102 | bucket_min, 103 | bucket_max 104 | ), 105 | --Repeat for Upload tests 106 | --Select the initial set of tests 107 | --Filter for only tests With good locations and valid IPs 108 | ul_per_location_cleaned AS ( 109 | SELECT 110 | date, 111 | client.Geo.ContinentCode AS continent_code, 112 | client.Geo.CountryCode AS country_code, 113 | CASE WHEN client.Geo.Subdivision1ISOCode != "" AND client.Geo.Subdivision1ISOCode IS NOT NULL 114 | THEN CONCAT(client.Geo.CountryCode,"-",client.Geo.Subdivision1ISOCode) 115 | ELSE CONCAT(client.Geo.CountryCode,"-",client.Geo.region) 116 | END AS ISO3166_2region1, 117 | client.Network.ASNumber AS asn, 118 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 119 | id, 120 | a.MeanThroughputMbps AS mbps, 121 | a.MinRTT AS MinRTT 122 | FROM `measurement-lab.ndt.unified_uploads` 123 | WHERE date BETWEEN @startdate AND @enddate 124 | AND a.MeanThroughputMbps != 0 125 | AND client.Geo.ContinentCode IS NOT NULL AND client.Geo.ContinentCode != "" 126 | AND client.Geo.CountryCode IS NOT NULL AND client.Geo.CountryCode != "" 127 | AND (client.Geo.Subdivision1ISOCode IS NOT NULL OR client.Geo.Region IS NOT NULL) 128 | AND (client.Geo.Subdivision1ISOCode != "" OR client.Geo.Region != "") 129 | AND client.Network.ASNumber IS NOT NULL 130 | AND Client.IP IS NOT NULL 131 | ), 132 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 133 | ul_fingerprinted AS ( 134 | SELECT 135 | date, 136 | continent_code, 137 | country_code, 138 | ISO3166_2region1, 139 | asn, 140 | ip, 141 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 142 | FROM ul_per_location_cleaned 143 | GROUP BY date, continent_code, country_code, ISO3166_2region1, asn, ip 144 | ), 145 | --Select two random rows for each IP using a prime number larger than the 146 | -- total number of tests. 
random1 is used for per day/geo statistics in 147 | -- `ul_stats_per_day` and log averages using both random1 and random2 148 | ul_random_ip_rows_perday AS ( 149 | SELECT 150 | date, 151 | continent_code, 152 | country_code, 153 | ISO3166_2region1, 154 | asn, 155 | ip, 156 | ARRAY_LENGTH(members) AS tests, 157 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 158 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 159 | FROM ul_fingerprinted 160 | ), 161 | --Calculate log averages and statistics per day from random samples 162 | ul_stats_per_day AS ( 163 | SELECT 164 | date, continent_code, country_code, ISO3166_2region1, asn, 165 | COUNT(*) AS ul_samples_day, 166 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 167 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 168 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 169 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 170 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 171 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 172 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 173 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 174 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 175 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 176 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 177 | FROM ul_random_ip_rows_perday 178 | GROUP BY date, continent_code, country_code, ISO3166_2region1, asn 179 | ), 180 | --Count the samples that fall into each bucket and get frequencies 181 | ul_histogram AS ( 182 | SELECT 183 | date, 184 | continent_code, 185 | country_code, 186 | ISO3166_2region1, 187 | asn, 188 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
189 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 190 | ELSE bucket_left END AS bucket_min, 191 | bucket_right AS bucket_max, 192 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 193 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 194 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 195 | GROUP BY 196 | date, 197 | continent_code, 198 | country_code, 199 | ISO3166_2region1, 200 | asn, 201 | bucket_min, 202 | bucket_max 203 | ), 204 | --Gather final result set 205 | results AS ( 206 | SELECT *, MOD(ABS(FARM_FINGERPRINT(CAST(asn AS STRING))), 1000) AS shard FROM 207 | dl_histogram 208 | JOIN ul_histogram USING (date, continent_code, country_code, 209 | ISO3166_2region1, asn, bucket_min, bucket_max) 210 | JOIN dl_stats_per_day USING (date, continent_code, country_code, 211 | ISO3166_2region1, asn) 212 | JOIN ul_stats_per_day USING (date, continent_code, country_code, 213 | ISO3166_2region1, asn) 214 | ) 215 | --Show the results 216 | SELECT * FROM results 217 | -------------------------------------------------------------------------------- /statistics/queries/continent_country_region_city_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | --Filter for only tests With good locations and valid IPs 9 | dl_per_location_cleaned AS ( 10 | SELECT 11 | date, 12 | client.Geo.ContinentCode AS continent_code, 13 | client.Geo.CountryCode AS country_code, 14 | CASE WHEN client.Geo.Subdivision1ISOCode != "" AND client.Geo.Subdivision1ISOCode IS NOT NULL 15 | THEN CONCAT(client.Geo.CountryCode,"-",client.Geo.Subdivision1ISOCode) 16 | ELSE CONCAT(client.Geo.CountryCode,"-",client.Geo.region) 17 | END AS ISO3166_2region1, 18 | client.Geo.City AS city, 19 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 20 | id, 21 | a.MeanThroughputMbps AS mbps, 22 | a.MinRTT AS MinRTT 23 | FROM `measurement-lab.ndt.unified_downloads` 24 | WHERE date BETWEEN @startdate AND @enddate 25 | AND a.MeanThroughputMbps != 0 26 | AND client.Geo.ContinentCode IS NOT NULL AND client.Geo.ContinentCode != "" 27 | AND client.Geo.CountryCode IS NOT NULL AND client.Geo.CountryCode != "" 28 | AND (client.Geo.Subdivision1ISOCode IS NOT NULL OR client.Geo.Region IS NOT NULL) 29 | AND (client.Geo.Subdivision1ISOCode != "" OR client.Geo.Region != "") 30 | AND client.Geo.City IS NOT NULL AND client.Geo.City != "" 31 | AND Client.IP IS NOT NULL 32 | ), 33 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 34 | dl_fingerprinted AS ( 35 | SELECT 36 | date, 37 | continent_code, 38 | country_code, 39 | ISO3166_2region1, 40 | city, 41 | ip, 42 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 43 | FROM dl_per_location_cleaned 44 | GROUP BY date, continent_code, country_code, ISO3166_2region1, city, ip 45 | ), 46 | --Select two random rows for each IP using a prime number larger than the 47 | -- total number of tests. 
random1 is used for per day/geo statistics in 48 | -- `dl_stats_per_day` and log averages using both random1 and random2 49 | dl_random_ip_rows_perday AS ( 50 | SELECT 51 | date, 52 | continent_code, 53 | country_code, 54 | ISO3166_2region1, 55 | city, 56 | ip, 57 | ARRAY_LENGTH(members) AS tests, 58 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 59 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 60 | FROM dl_fingerprinted 61 | ), 62 | --Calculate log averages and statistics per day from random samples 63 | dl_stats_per_day AS ( 64 | SELECT 65 | date, continent_code, country_code, ISO3166_2region1, city, 66 | COUNT(*) AS dl_samples_day, 67 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 68 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 69 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 70 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 71 | ROUND(MIN(random1.mbps),3) AS download_MIN, 72 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 73 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 74 | ROUND(AVG(random1.mbps),3) AS download_AVG, 75 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 76 | ROUND(MAX(random1.mbps),3) AS download_MAX, 77 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 78 | FROM dl_random_ip_rows_perday 79 | GROUP BY date, continent_code, country_code, ISO3166_2region1, city 80 | ), 81 | --Count the samples that fall into each bucket and get frequencies 82 | dl_histogram AS ( 83 | SELECT 84 | date, 85 | continent_code, 86 | country_code, 87 | ISO3166_2region1, 88 | city, 89 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
90 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 91 | ELSE bucket_left END AS bucket_min, 92 | bucket_right AS bucket_max, 93 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 94 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 95 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 96 | GROUP BY 97 | date, 98 | continent_code, 99 | country_code, 100 | ISO3166_2region1, 101 | city, 102 | bucket_min, 103 | bucket_max 104 | ), 105 | --Repeat for Upload tests 106 | --Select the initial set of tests 107 | --Filter for only tests With good locations and valid IPs 108 | ul_per_location_cleaned AS ( 109 | SELECT 110 | date, 111 | client.Geo.ContinentCode AS continent_code, 112 | client.Geo.CountryCode AS country_code, 113 | CASE WHEN client.Geo.Subdivision1ISOCode != "" AND client.Geo.Subdivision1ISOCode IS NOT NULL 114 | THEN CONCAT(client.Geo.CountryCode,"-",client.Geo.Subdivision1ISOCode) 115 | ELSE CONCAT(client.Geo.CountryCode,"-",client.Geo.region) 116 | END AS ISO3166_2region1, 117 | client.Geo.City AS city, 118 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 119 | id, 120 | a.MeanThroughputMbps AS mbps, 121 | a.MinRTT AS MinRTT 122 | FROM `measurement-lab.ndt.unified_uploads` 123 | WHERE date BETWEEN @startdate AND @enddate 124 | AND a.MeanThroughputMbps != 0 125 | AND client.Geo.ContinentCode IS NOT NULL AND client.Geo.ContinentCode != "" 126 | AND client.Geo.CountryCode IS NOT NULL AND client.Geo.CountryCode != "" 127 | AND (client.Geo.Subdivision1ISOCode IS NOT NULL OR client.Geo.Region IS NOT NULL) 128 | AND (client.Geo.Subdivision1ISOCode != "" OR client.Geo.Region != "") 129 | AND client.Geo.City IS NOT NULL AND client.Geo.City != "" 130 | AND Client.IP IS NOT NULL 131 | ), 132 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 133 | ul_fingerprinted AS ( 134 | SELECT 135 | date, 136 | continent_code, 137 | country_code, 138 | ISO3166_2region1, 139 | city, 140 | ip, 141 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 142 | FROM ul_per_location_cleaned 143 | GROUP BY date, continent_code, country_code, ISO3166_2region1, city, ip 144 | ), 145 | --Select two random rows for each IP using a prime number larger than the 146 | -- total number of tests. 
random1 is used for per day/geo statistics in 147 | -- `ul_stats_per_day` and log averages using both random1 and random2 148 | ul_random_ip_rows_perday AS ( 149 | SELECT 150 | date, 151 | continent_code, 152 | country_code, 153 | ISO3166_2region1, 154 | city, 155 | ip, 156 | ARRAY_LENGTH(members) AS tests, 157 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 158 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 159 | FROM ul_fingerprinted 160 | ), 161 | --Calculate log averages and statistics per day from random samples 162 | ul_stats_per_day AS ( 163 | SELECT 164 | date, continent_code, country_code, ISO3166_2region1, city, 165 | COUNT(*) AS ul_samples_day, 166 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 167 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 168 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 169 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 170 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 171 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 172 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 173 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 174 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 175 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 176 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 177 | FROM ul_random_ip_rows_perday 178 | GROUP BY date, continent_code, country_code, ISO3166_2region1, city 179 | ), 180 | --Count the samples that fall into each bucket and get frequencies 181 | ul_histogram AS ( 182 | SELECT 183 | date, 184 | continent_code, 185 | country_code, 186 | ISO3166_2region1, 187 | city, 188 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
189 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 190 | ELSE bucket_left END AS bucket_min, 191 | bucket_right AS bucket_max, 192 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 193 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 194 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 195 | GROUP BY 196 | date, 197 | continent_code, 198 | country_code, 199 | ISO3166_2region1, 200 | city, 201 | bucket_min, 202 | bucket_max 203 | ), 204 | --Gather final result set 205 | results AS ( 206 | SELECT *, MOD(ABS(FARM_FINGERPRINT(city)), 4000) as shard FROM dl_histogram 207 | JOIN ul_histogram USING (date, continent_code, country_code, ISO3166_2region1, city, bucket_min, bucket_max) 208 | JOIN dl_stats_per_day USING (date, continent_code, country_code, ISO3166_2region1, city) 209 | JOIN ul_stats_per_day USING (date, continent_code, country_code, ISO3166_2region1, city) 210 | ) 211 | --Show the results 212 | SELECT * FROM results 213 | -------------------------------------------------------------------------------- /statistics/queries/continent_country_region_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | --Filter for only tests With good locations and valid IPs 9 | dl_per_location_cleaned AS ( 10 | SELECT 11 | date, 12 | client.Geo.ContinentCode AS continent_code, 13 | client.Geo.CountryCode AS country_code, 14 | CASE WHEN client.Geo.Subdivision1ISOCode != "" AND client.Geo.Subdivision1ISOCode IS NOT NULL 15 | THEN CONCAT(client.Geo.CountryCode,"-",client.Geo.Subdivision1ISOCode) 16 | ELSE CONCAT(client.Geo.CountryCode,"-",client.Geo.region) 17 | END AS ISO3166_2region1, 18 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 19 | id, 20 | a.MeanThroughputMbps AS mbps, 21 | a.MinRTT AS MinRTT 22 | FROM `measurement-lab.ndt.unified_downloads` 23 | WHERE date BETWEEN @startdate AND @enddate 24 | AND a.MeanThroughputMbps != 0 25 | AND client.Geo.ContinentCode IS NOT NULL AND client.Geo.ContinentCode != "" 26 | AND client.Geo.CountryCode IS NOT NULL AND client.Geo.CountryCode != "" 27 | AND (client.Geo.Subdivision1ISOCode IS NOT NULL OR client.Geo.Region IS NOT NULL) 28 | AND (client.Geo.Subdivision1ISOCode != "" OR client.Geo.Region != "") 29 | AND Client.IP IS NOT NULL 30 | ), 31 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 32 | dl_fingerprinted AS ( 33 | SELECT 34 | date, 35 | continent_code, 36 | country_code, 37 | ip, 38 | ISO3166_2region1, 39 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 40 | FROM dl_per_location_cleaned 41 | GROUP BY date, continent_code, country_code, ISO3166_2region1, ip 42 | ), 43 | --Select two random rows for each IP using a prime number larger than the 44 | -- total number of tests. 
random1 is used for per day/geo statistics in 45 | -- `dl_stats_per_day` and log averages using both random1 and random2 46 | dl_random_ip_rows_perday AS ( 47 | SELECT 48 | date, 49 | continent_code, 50 | country_code, 51 | ISO3166_2region1, 52 | ip, 53 | ARRAY_LENGTH(members) AS tests, 54 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 55 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 56 | FROM dl_fingerprinted 57 | ), 58 | --Calculate log averages and statistics per day from random samples 59 | dl_stats_per_day AS ( 60 | SELECT 61 | date, continent_code, country_code, ISO3166_2region1, 62 | COUNT(*) AS dl_samples_day, 63 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 64 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 65 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 66 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 67 | ROUND(MIN(random1.mbps),3) AS download_MIN, 68 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 69 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 70 | ROUND(AVG(random1.mbps),3) AS download_AVG, 71 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 72 | ROUND(MAX(random1.mbps),3) AS download_MAX, 73 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 74 | FROM dl_random_ip_rows_perday 75 | GROUP BY date, continent_code, country_code, ISO3166_2region1 76 | ), 77 | --Count the samples that fall into each bucket and get frequencies 78 | dl_histogram AS ( 79 | SELECT 80 | date, 81 | continent_code, 82 | country_code, 83 | ISO3166_2region1, 84 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
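-- Aside on dl_stats_per_day above: ROUND(POW(10, AVG(Safe.LOG10(x))), 3) is the geometric mean
-- of x over the sampled rows; SAFE.LOG10 returns NULL for non-positive inputs and AVG skips
-- NULLs, so zero MinRtt values simply drop out of the log averages. The CASE below relabels
-- the lowest generated bucket edge, POW(10, -0.25), as 0: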
85 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 86 | ELSE bucket_left END AS bucket_min, 87 | bucket_right AS bucket_max, 88 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 89 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 90 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 91 | GROUP BY 92 | date, 93 | continent_code, 94 | country_code, 95 | ISO3166_2region1, 96 | bucket_min, 97 | bucket_max 98 | ), 99 | --Repeat for Upload tests 100 | --Select the initial set of tests 101 | --Filter for only tests With good locations and valid IPs 102 | ul_per_location_cleaned AS ( 103 | SELECT 104 | date, 105 | client.Geo.ContinentCode AS continent_code, 106 | client.Geo.CountryCode AS country_code, 107 | CASE WHEN client.Geo.Subdivision1ISOCode != "" AND client.Geo.Subdivision1ISOCode IS NOT NULL 108 | THEN CONCAT(client.Geo.CountryCode,"-",client.Geo.Subdivision1ISOCode) 109 | ELSE CONCAT(client.Geo.CountryCode,"-",client.Geo.region) 110 | END AS ISO3166_2region1, 111 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 112 | id, 113 | a.MeanThroughputMbps AS mbps, 114 | a.MinRTT AS MinRTT 115 | FROM `measurement-lab.ndt.unified_uploads` 116 | WHERE date BETWEEN @startdate AND @enddate 117 | AND a.MeanThroughputMbps != 0 118 | AND client.Geo.ContinentCode IS NOT NULL AND client.Geo.ContinentCode != "" 119 | AND client.Geo.CountryCode IS NOT NULL AND client.Geo.CountryCode != "" 120 | AND (client.Geo.Subdivision1ISOCode IS NOT NULL OR client.Geo.Region IS NOT NULL) 121 | AND (client.Geo.Subdivision1ISOCode != "" OR client.Geo.Region != "") 122 | AND Client.IP IS NOT NULL 123 | ), 124 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 125 | ul_fingerprinted AS ( 126 | SELECT 127 | date, 128 | continent_code, 129 | country_code, 130 | ISO3166_2region1, 131 | ip, 132 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 133 | FROM ul_per_location_cleaned 134 | GROUP BY date, continent_code, country_code, ISO3166_2region1, ip 135 | ), 136 | --Select two random rows for each IP using a prime number larger than the 137 | -- total number of tests. 
random1 is used for per day/geo statistics in 138 | -- `ul_stats_per_day` and log averages using both random1 and random2 139 | ul_random_ip_rows_perday AS ( 140 | SELECT 141 | date, 142 | continent_code, 143 | country_code, 144 | ISO3166_2region1, 145 | ip, 146 | ARRAY_LENGTH(members) AS tests, 147 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 148 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 149 | FROM ul_fingerprinted 150 | ), 151 | --Calculate log averages and statistics per day from random samples 152 | ul_stats_per_day AS ( 153 | SELECT 154 | date, continent_code, country_code, ISO3166_2region1, 155 | COUNT(*) AS ul_samples_day, 156 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 157 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 158 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 159 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 160 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 161 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 162 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 163 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 164 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 165 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 166 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 167 | FROM ul_random_ip_rows_perday 168 | GROUP BY date, continent_code, country_code, ISO3166_2region1 169 | ), 170 | --Count the samples that fall into each bucket and get frequencies 171 | ul_histogram AS ( 172 | SELECT 173 | date, 174 | continent_code, 175 | country_code, 176 | ISO3166_2region1, 177 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 
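-- Looking ahead to the results CTE: the dl_* and ul_* sides are combined with JOIN ... USING,
-- i.e. inner joins, so only (date, geography, bucket) combinations that have both download and
-- upload samples appear in the output. As before, the CASE below relabels the lowest generated
-- bucket edge as 0: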
178 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 179 | ELSE bucket_left END AS bucket_min, 180 | bucket_right AS bucket_max, 181 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 182 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 183 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 184 | GROUP BY 185 | date, 186 | continent_code, 187 | country_code, 188 | ISO3166_2region1, 189 | bucket_min, 190 | bucket_max 191 | ), 192 | --Gather final result set 193 | results AS ( 194 | SELECT *, MOD(ABS(FARM_FINGERPRINT(country_code)), 1000) as shard FROM dl_histogram 195 | JOIN ul_histogram USING (date, continent_code, country_code, ISO3166_2region1, bucket_min, bucket_max) 196 | JOIN dl_stats_per_day USING (date, continent_code, country_code, ISO3166_2region1) 197 | JOIN ul_stats_per_day USING (date, continent_code, country_code, ISO3166_2region1) 198 | ) 199 | --Show the results 200 | SELECT * FROM results 201 | -------------------------------------------------------------------------------- /statistics/queries/continent_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | dl_per_location AS ( 9 | SELECT 10 | date, 11 | client.Geo.ContinentCode AS continent_code, 12 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 13 | id, 14 | a.MeanThroughputMbps AS mbps, 15 | a.MinRTT AS MinRTT 16 | FROM `measurement-lab.ndt.unified_downloads` 17 | WHERE date BETWEEN @startdate AND @enddate 18 | AND a.MeanThroughputMbps != 0 19 | ), 20 | --Filter for only tests With good locations and valid IPs 21 | dl_per_location_cleaned AS ( 22 | SELECT * FROM dl_per_location 23 | WHERE 24 | continent_code IS NOT NULL 25 | AND continent_code != "" 26 | AND ip IS NOT NULL 27 | ), 28 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 29 | dl_fingerprinted AS ( 30 | SELECT 31 | date, 32 | continent_code, 33 | ip, 34 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 35 | FROM dl_per_location_cleaned 36 | GROUP BY date, continent_code, ip 37 | ), 38 | --Select two random rows for each IP using a prime number larger than the 39 | -- total number of tests. 
random1 is used for per day/geo statistics in 40 | -- `dl_stats_per_day` and log averages using both random1 and random2 41 | dl_random_ip_rows_perday AS ( 42 | SELECT 43 | date, 44 | continent_code, 45 | ip, 46 | ARRAY_LENGTH(members) AS tests, 47 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 48 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 49 | FROM dl_fingerprinted 50 | ), 51 | --Calculate log averages and statistics per day from random samples 52 | dl_stats_per_day AS ( 53 | SELECT 54 | date, continent_code, 55 | COUNT(*) AS dl_samples_day, 56 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 57 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 58 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 59 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 60 | ROUND(MIN(random1.mbps),3) AS download_MIN, 61 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 62 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 63 | ROUND(AVG(random1.mbps),3) AS download_AVG, 64 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 65 | ROUND(MAX(random1.mbps),3) AS download_MAX, 66 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 67 | FROM dl_random_ip_rows_perday 68 | GROUP BY continent_code, date 69 | ), 70 | --Count the samples that fall into each bucket and get frequencies 71 | dl_histogram AS ( 72 | SELECT 73 | date, 74 | continent_code, 75 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 76 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 77 | ELSE bucket_left END AS bucket_min, 78 | bucket_right AS bucket_max, 79 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 80 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 81 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 82 | GROUP BY 83 | date, 84 | continent_code, 85 | bucket_min, 86 | bucket_max 87 | ), 88 | --Repeat for Upload tests 89 | --Select the initial set of tests 90 | ul_per_location AS ( 91 | SELECT 92 | date, 93 | client.Geo.ContinentCode AS continent_code, 94 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 95 | id, 96 | a.MeanThroughputMbps AS mbps, 97 | a.MinRTT AS MinRTT 98 | FROM `measurement-lab.ndt.unified_uploads` 99 | WHERE date BETWEEN @startdate AND @enddate 100 | AND a.MeanThroughputMbps != 0 101 | ), 102 | --Filter for only tests With good locations and valid IPs 103 | ul_per_location_cleaned AS ( 104 | SELECT * FROM ul_per_location 105 | WHERE 106 | continent_code IS NOT NULL 107 | AND continent_code != "" 108 | AND ip IS NOT NULL 109 | ), 110 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 111 | ul_fingerprinted AS ( 112 | SELECT 113 | date, 114 | continent_code, 115 | ip, 116 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 117 | FROM ul_per_location_cleaned 118 | GROUP BY date, continent_code, ip 119 | ), 120 | --Select two random rows for each IP using a prime number larger than the 121 | -- total number of tests. 
random1 is used for per day/geo statistics in 122 | -- `ul_stats_per_day` and log averages using both random1 and random2 123 | ul_random_ip_rows_perday AS ( 124 | SELECT 125 | date, 126 | continent_code, 127 | ip, 128 | ARRAY_LENGTH(members) AS tests, 129 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 130 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 131 | FROM ul_fingerprinted 132 | ), 133 | --Calculate log averages and statistics per day from random samples 134 | ul_stats_per_day AS ( 135 | SELECT 136 | date, continent_code, 137 | COUNT(*) AS ul_samples_day, 138 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 139 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 140 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 141 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 142 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 143 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 144 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 145 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 146 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 147 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 148 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 149 | FROM ul_random_ip_rows_perday 150 | GROUP BY continent_code, date 151 | ), 152 | --Count the samples that fall into each bucket and get frequencies 153 | ul_histogram AS ( 154 | SELECT 155 | date, 156 | continent_code, 157 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 158 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 159 | ELSE bucket_left END AS bucket_min, 160 | bucket_right AS bucket_max, 161 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 162 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 163 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 164 | GROUP BY 165 | date, 166 | continent_code, 167 | bucket_min, 168 | bucket_max 169 | ), 170 | --Gather final result set 171 | results AS ( 172 | SELECT *, MOD(ABS(FARM_FINGERPRINT(continent_code)), 1000) as shard FROM dl_histogram 173 | JOIN ul_histogram USING (date, continent_code, bucket_min, bucket_max) 174 | JOIN dl_stats_per_day USING (date, continent_code) 175 | JOIN ul_stats_per_day USING (date, continent_code) 176 | ) 177 | --Show the results 178 | SELECT * FROM results 179 | -------------------------------------------------------------------------------- /statistics/queries/global_asn_histogram.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | --Generate equal sized buckets in log-space between near 0 Mbps and ~1 Gbps+ 3 | buckets AS ( 4 | SELECT POW(10, x-.25) AS bucket_left, POW(10,x+.25) AS bucket_right 5 | FROM UNNEST(GENERATE_ARRAY(0, 3.5, .5)) AS x 6 | ), 7 | --Select the initial set of tests 8 | dl_per_location AS ( 9 | SELECT 10 | date, 11 | client.Network.ASNumber AS asn, 12 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 13 | id, 14 | a.MeanThroughputMbps AS mbps, 15 | a.MinRTT AS MinRTT 16 | FROM `measurement-lab.ndt.unified_downloads` 17 | WHERE date BETWEEN @startdate AND @enddate 18 | AND a.MeanThroughputMbps != 0 19 | ), 20 | --Filter for only tests With good locations and valid IPs 21 | 
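-- NET.SAFE_IP_FROM_STRING returns NULL, rather than raising an error, when Client.IP is not a
-- parseable IPv4/IPv6 literal, so the ip IS NOT NULL condition below drops malformed as well as
-- missing client addresses.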
dl_per_location_cleaned AS ( 22 | SELECT * FROM dl_per_location 23 | WHERE 24 | asn IS NOT NULL 25 | AND ip IS NOT NULL 26 | ), 27 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order 28 | dl_fingerprinted AS ( 29 | SELECT 30 | date, 31 | asn, 32 | ip, 33 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 34 | FROM dl_per_location_cleaned 35 | GROUP BY date, asn, ip 36 | ), 37 | --Select two random rows for each IP using a prime number larger than the 38 | -- total number of tests. random1 is used for per day/geo statistics in 39 | -- `dl_stats_per_day` and log averages using both random1 and random2 40 | dl_random_ip_rows_perday AS ( 41 | SELECT 42 | date, 43 | asn, 44 | ip, 45 | ARRAY_LENGTH(members) AS tests, 46 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 47 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 48 | FROM dl_fingerprinted 49 | ), 50 | --Calculate log averages and statistics per day from random samples 51 | dl_stats_per_day AS ( 52 | SELECT 53 | date, asn, 54 | COUNT(*) AS dl_samples_day, 55 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS dl_LOG_AVG_rnd1, 56 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS dl_LOG_AVG_rnd2, 57 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd1, 58 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS dl_minRTT_LOG_AVG_rnd2, 59 | ROUND(MIN(random1.mbps),3) AS download_MIN, 60 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS download_Q25, 61 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS download_MED, 62 | ROUND(AVG(random1.mbps),3) AS download_AVG, 63 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS download_Q75, 64 | ROUND(MAX(random1.mbps),3) AS download_MAX, 65 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS download_minRTT_MED, 66 | FROM dl_random_ip_rows_perday 67 | GROUP BY asn, date 68 | ), 69 | --Count the samples that fall into each bucket and get frequencies 70 | dl_histogram AS ( 71 | SELECT 72 | date, 73 | asn, 74 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 75 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 76 | ELSE bucket_left END AS bucket_min, 77 | bucket_right AS bucket_max, 78 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS dl_samples_bucket, 79 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS dl_frac_bucket 80 | FROM dl_random_ip_rows_perday CROSS JOIN buckets 81 | GROUP BY 82 | date, 83 | asn, 84 | bucket_min, 85 | bucket_max 86 | ), 87 | --Repeat for Upload tests 88 | --Select the initial set of tests 89 | ul_per_location AS ( 90 | SELECT 91 | date, 92 | client.Network.ASNumber AS asn, 93 | NET.SAFE_IP_FROM_STRING(Client.IP) AS ip, 94 | id, 95 | a.MeanThroughputMbps AS mbps, 96 | a.MinRTT AS MinRTT 97 | FROM `measurement-lab.ndt.unified_uploads` 98 | WHERE date BETWEEN @startdate AND @enddate 99 | AND a.MeanThroughputMbps != 0 100 | ), 101 | --Filter for only tests With good locations and valid IPs 102 | ul_per_location_cleaned AS ( 103 | SELECT * FROM ul_per_location 104 | WHERE 105 | asn IS NOT NULL 106 | AND ip IS NOT NULL 107 | ), 108 | --Fingerprint all cleaned tests, in an arbitrary but repeatable order. 
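-- The per-IP arrays built in these *_fingerprinted CTEs feed the MOD-based selection in the CTE
-- that follows each of them. Worked example: for an IP with 5 tests on a day, MOD(511232941, 5) = 1
-- and MOD(906686609, 5) = 4, so random1 and random2 are the 2nd and 5th elements of the
-- fingerprint-ordered array (SAFE_OFFSET is zero-based); with a single test, both MODs are 0 and
-- random1 = random2 is that test.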
109 | ul_fingerprinted AS ( 110 | SELECT 111 | date, 112 | asn, 113 | ip, 114 | ARRAY_AGG(STRUCT(ABS(FARM_FINGERPRINT(id)) AS ffid, mbps, MinRTT) ORDER BY ABS(FARM_FINGERPRINT(id))) AS members 115 | FROM ul_per_location_cleaned 116 | GROUP BY date, asn, ip 117 | ), 118 | --Select two random rows for each IP using a prime number larger than the 119 | -- total number of tests. random1 is used for per day/geo statistics in 120 | -- `ul_stats_per_day` and log averages using both random1 and random2 121 | ul_random_ip_rows_perday AS ( 122 | SELECT 123 | date, 124 | asn, 125 | ip, 126 | ARRAY_LENGTH(members) AS tests, 127 | members[SAFE_OFFSET(MOD(511232941,ARRAY_LENGTH(members)))] AS random1, 128 | members[SAFE_OFFSET(MOD(906686609,ARRAY_LENGTH(members)))] AS random2 129 | FROM ul_fingerprinted 130 | ), 131 | --Calculate log averages and statistics per day from random samples 132 | ul_stats_per_day AS ( 133 | SELECT 134 | date, asn, 135 | COUNT(*) AS ul_samples_day, 136 | ROUND(POW(10,AVG(Safe.LOG10(random1.mbps))),3) AS ul_LOG_AVG_rnd1, 137 | ROUND(POW(10,AVG(Safe.LOG10(random2.mbps))),3) AS ul_LOG_AVG_rnd2, 138 | ROUND(POW(10,AVG(Safe.LOG10(random1.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd1, 139 | ROUND(POW(10,AVG(Safe.LOG10(random2.MinRtt))),3) AS ul_minRTT_LOG_AVG_rnd2, 140 | ROUND(MIN(random1.mbps),3) AS upload_MIN, 141 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(25)],3) AS upload_Q25, 142 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(50)],3) AS upload_MED, 143 | ROUND(AVG(random1.mbps),3) AS upload_AVG, 144 | ROUND(APPROX_QUANTILES(random1.mbps, 100) [SAFE_ORDINAL(75)],3) AS upload_Q75, 145 | ROUND(MAX(random1.mbps),3) AS upload_MAX, 146 | ROUND(APPROX_QUANTILES(random1.MinRTT, 100) [SAFE_ORDINAL(50)],3) AS upload_minRTT_MED, 147 | FROM ul_random_ip_rows_perday 148 | GROUP BY asn, date 149 | ), 150 | --Count the samples that fall into each bucket and get frequencies 151 | ul_histogram AS ( 152 | SELECT 153 | date, 154 | asn, 155 | --Set the lowest bucket's min to zero, so all tests below the generated min of the lowest bin are included. 156 | CASE WHEN bucket_left = 0.5623413251903491 THEN 0 157 | ELSE bucket_left END AS bucket_min, 158 | bucket_right AS bucket_max, 159 | COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) AS ul_samples_bucket, 160 | ROUND(COUNTIF(random1.mbps < bucket_right AND random1.mbps >= bucket_left) / COUNT(*), 3) AS ul_frac_bucket 161 | FROM ul_random_ip_rows_perday CROSS JOIN buckets 162 | GROUP BY 163 | date, 164 | asn, 165 | bucket_min, 166 | bucket_max 167 | ), 168 | --Gather final result set 169 | results AS ( 170 | SELECT *, MOD(ABS(FARM_FINGERPRINT(CAST(asn AS STRING))), 1000) as shard FROM 171 | dl_histogram 172 | JOIN ul_histogram USING (date, asn, bucket_min, bucket_max) 173 | JOIN dl_stats_per_day USING (date, asn) 174 | JOIN ul_stats_per_day USING (date, asn) 175 | ) 176 | --Show the results 177 | SELECT * FROM results 178 | -------------------------------------------------------------------------------- /statistics/scripts/update_stats_continent_country_region_histogram.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | PROJECT="mlab-sandbox" 5 | USERNAME="critzo" 6 | PUB_LOC="test-critzo-statistics" 7 | 8 | # Initially set the project to measurement-lab. 
9 | gcloud config set project mlab-sandbox 10 | 11 | declare -a query_jobs=("continent_country_region_histogram") 12 | 13 | 14 | startday=2020-01-01 15 | endday=2020-01-01 16 | 17 | ######################### 18 | endday=$(TZ=GMT date -I -d "$endday + 1 day") 19 | yeararray=($(echo $startday | tr "-" "\n")) 20 | year=${yeararray[0]} 21 | 22 | for val in ${query_jobs[@]}; do 23 | RESULT_NAME="$val" 24 | QUERY="${RESULT_NAME}.sql" 25 | QUALIFIED_TABLE="${PROJECT}:test_critzo_statistics.${RESULT_NAME}" 26 | QUALIFIED_TABLE_IN_QUERY="${PROJECT}.test_critzo_statistics.${RESULT_NAME}" 27 | DATASET="test_critzo_statistics" 28 | TEMP_TABLE="temp_continent_country_region_stats" 29 | TEMP_STATS="${PROJECT}:${DATASET}.${TEMP_TABLE}" 30 | 31 | # Run bq query with generous row limit. Write results to temp table created above. 32 | # By default, bq fetches the query results to display in the shell, consuming a lot of memory. 33 | # Use --nosync to "fire-and-forget", then implement our own wait loop to defer the next command 34 | # until the table is populated. 35 | 36 | # TODO: add a check to see if this table exists already, and create it if not. 37 | 38 | while [ "$startday" != "$endday" ]; do 39 | JOB_ID=$(bq --nosync --project_id "${PROJECT}" query \ 40 | --parameter=startdate:DATE:$startday --parameter=enddate:DATE:$startday --allow_large_results --destination_table "${QUALIFIED_TABLE}" \ 41 | --append_table --use_legacy_sql=false --max_rows=4000000 \ 42 | "$(cat "queries/${QUERY}")") 43 | 44 | JOB_ID="${JOB_ID#Successfully started query }" 45 | 46 | until [ DONE == $(bq --format json show --job "${JOB_ID}" | jq -r '.status.state') ] 47 | do 48 | sleep 30 49 | done 50 | 51 | startday=$(date -I -d "$startday + 1 day") 52 | done 53 | 54 | # Automate stats and outputs by continent, country, region, etc. using query params. 55 | # Get all combinations of continent, country, and region codes & save to a local csv. 56 | 57 | declare -a location_combos_query=("get_continent_country_region_codes_sample") 58 | 59 | for v in ${location_combos_query[@]}; do 60 | RESULT2_NAME="$v" 61 | QUERY2="${RESULT2_NAME}.sql" 62 | 63 | JOB_ID2=$(bq --format=csv --project_id "${PROJECT}" query \ 64 | --use_legacy_sql=false --max_rows=4000000 \ 65 | "$(cat "queries/${QUERY2}")" > continent_country_region_codes.csv ) 66 | done 67 | 68 | # bq exports csvs with a header. remove the header. 69 | sed -i '1d' continent_country_region_codes.csv 70 | 71 | # Make a temporary GCS bucket to store results. 72 | gsutil mb gs://${USERNAME}_temp_stats_continent_country_region 73 | 74 | # Loop through the csv lines, using three values as query parameters for a series of queries.
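# Illustrative input (hypothetical values; the real rows come from the
# get_continent_country_region_codes_sample query above): a CSV line such as
#   NA,US,NY
# is read as continent="NA", country="US", region="NY", and yields iso_region="US-NY" below.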
75 | while IFS=, read -r continent country region; 76 | do 77 | iso_region="$country-$region" 78 | 79 | JOB_ID3=$(bq --nosync query \ 80 | --use_legacy_sql=false \ 81 | --max_rows=4000000 \ 82 | --project_id "${PROJECT}" \ 83 | --allow_large_results --destination_table "${TEMP_STATS}" \ 84 | --replace "SELECT * FROM ${QUALIFIED_TABLE_IN_QUERY} WHERE continent_code = \"${continent}\" AND country_code = \"${country}\" AND ISO3166_2region1 = \"${iso_region}\" ORDER BY test_date, continent_code, country_code, country_name, ISO3166_2region1, bucket_min, bucket_max") 85 | 86 | JOB_ID3="${JOB_ID3#Successfully started query }" 87 | 88 | until [ DONE == $(bq --format json show --job "${JOB_ID3}" | jq -r '.status.state') ] 89 | do 90 | sleep 30 91 | done 92 | 93 | # Extract the rows to JSON and/or other output formats 94 | bq extract --destination_format NEWLINE_DELIMITED_JSON \ 95 | ${QUALIFIED_TABLE} \ 96 | gs://${USERNAME}_temp_stats_continent_country_region/${continent}/${country}/${region}/${year}/histogram_daily_stats.json 97 | 98 | done < continent_country_region_codes.csv 99 | 100 | # Copy the full list of generated stats from measurement-lab project temp GCS bucket 101 | gsutil -m cp -r gs://${USERNAME}_temp_stats_continent_country_region/* tmp/ 102 | 103 | # Convert all new line json files to json array format 104 | find ./tmp/ -type f -exec sed -i '1s/^/[/; $!s/$/,/; $s/$/]/' {} + 105 | 106 | # Publish the json array files to public GCS bucket 107 | gsutil -m cp -r tmp/* gs://${PUB_LOC}/ 108 | 109 | done 110 | 111 | # Cleanup 112 | ## Remove the temporary GCS bucket. 113 | gsutil rm -r gs://${USERNAME}_temp_stats_continent_country_region 114 | 115 | ## Remove local copies. 116 | rm -r ./tmp/* 117 | rm continent_country_region_codes.csv 118 | --------------------------------------------------------------------------------