├── .gitignore ├── docs ├── data │ ├── observable-latency.parquet.sh │ ├── flights-200k.parquet │ ├── seattle-weather.parquet │ ├── gaia.parquet.sh │ └── nyc-taxi.parquet.sh ├── components │ └── mosaic.js ├── style.css ├── nyc-taxi-rides.md ├── gaia-star-catalog.md ├── mosaic-duckdb-wasm.md ├── index.md ├── data-loading.md ├── observable-latency.md └── flight-delays.md ├── package.json ├── README.md ├── observablehq.config.ts ├── LICENSE └── .github └── workflows └── deploy.yml /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | dist/ 3 | docs/.observablehq/cache/ 4 | node_modules/ 5 | yarn-error.log 6 | -------------------------------------------------------------------------------- /docs/data/observable-latency.parquet.sh: -------------------------------------------------------------------------------- 1 | curl https://idl.uw.edu/mosaic-datasets/data/observable-latency.parquet 2 | -------------------------------------------------------------------------------- /docs/data/flights-200k.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/mosaic-framework-example/main/docs/data/flights-200k.parquet -------------------------------------------------------------------------------- /docs/data/seattle-weather.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/mosaic-framework-example/main/docs/data/seattle-weather.parquet -------------------------------------------------------------------------------- /docs/components/mosaic.js: -------------------------------------------------------------------------------- 1 | import * as vg from "npm:@uwdata/vgplot"; 2 | 3 | export async function vgplot(queries) { 4 | const mc = vg.coordinator(); 5 | const api = vg.createAPIContext({ coordinator: mc }); 6 | mc.databaseConnector(vg.wasmConnector()); 7 | if (queries) { 8 | 
await mc.exec(queries(api)); 9 | } 10 | return api; 11 | } 12 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mosaic-framework-example", 3 | "type": "module", 4 | "private": true, 5 | "scripts": { 6 | "clean": "rm -rf docs/.observablehq/cache", 7 | "build": "rm -rf dist && observable build", 8 | "dev": "observable preview", 9 | "deploy": "observable deploy", 10 | "observable": "observable" 11 | }, 12 | "dependencies": { 13 | "@observablehq/framework": "^1.9.0" 14 | }, 15 | "engines": { 16 | "node": ">=18" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mosaic + Framework Examples 2 | 3 | This site shares examples of integrating [Mosaic](https://idl.uw.edu/mosaic) and DuckDB into [Observable Framework](https://observablehq.com/framework). 
4 | 5 | **[View the deployed examples](https://idl.uw.edu/mosaic-framework-example)** 6 | 7 | The examples demonstrate: 8 | 9 | - Visualization and real-time interaction with massive data sets 10 | - Using Mosaic and DuckDB-WASM within Framework pages 11 | - Using DuckDB within a data loader and configuring GitHub Actions 12 | -------------------------------------------------------------------------------- /docs/style.css: -------------------------------------------------------------------------------- 1 | @import url("observablehq:default.css"); 2 | @import url("observablehq:theme-air.css"); 3 | 4 | #observablehq-header .banner { 5 | display: flex; 6 | align-items: center; 7 | justify-content: end; 8 | gap: 0.5rem; 9 | height: 2.2rem; 10 | margin: -1.5rem -2rem 2rem -2rem; 11 | padding: 0.5rem 2rem; 12 | border-bottom: solid 1px var(--theme-foreground-faintest); 13 | font: 500 14px var(--sans-serif); 14 | } 15 | 16 | #observablehq-header a[href] { 17 | color: inherit; 18 | } 19 | 20 | #observablehq-header a[target="_blank"]:hover span { 21 | text-decoration: underline; 22 | } 23 | 24 | #observablehq-header a[target="_blank"]:not(:hover, :focus)::after { 25 | color: var(--theme-foreground-muted); 26 | } 27 | 28 | .input label { 29 | margin-right: 0.5em; 30 | } 31 | -------------------------------------------------------------------------------- /docs/data/gaia.parquet.sh: -------------------------------------------------------------------------------- 1 | # The DuckDB executable must be on your environment path! 2 | # Write to a named file as portable file descriptors such as 3 | # (/dev/stdout) appear to be unavailable in GitHub actions. 
4 | duckdb :memory: << EOF 5 | -- Compute u and v coordinates via natural earth projection 6 | CREATE TABLE gaia AS 7 | WITH prep AS ( 8 | SELECT 9 | radians((-l + 540) % 360 - 180) AS lambda, 10 | radians(b) AS phi, 11 | asin(sqrt(3)/2 * sin(phi)) AS t, 12 | t^2 AS t2, 13 | t2^3 AS t6, 14 | * 15 | FROM 'https://idl.uw.edu/mosaic-datasets/data/gaia-5m.parquet' 16 | ) 17 | SELECT 18 | ( 19 | (1.340264 * lambda * cos(t)) / 20 | (sqrt(3)/2 * (1.340264 + (-0.081106 * 3 * t2) + (t6 * (0.000893 * 7 + 0.003796 * 9 * t2)))) 21 | )::FLOAT AS u, 22 | (t * (1.340264 + (-0.081106 * t2) + (t6 * (0.000893 + 0.003796 * t2))))::FLOAT AS v, 23 | bp_rp::FLOAT AS bp_rp, 24 | phot_g_mean_mag::FLOAT AS phot_g_mean_mag, 25 | parallax::FLOAT AS parallax 26 | FROM prep 27 | WHERE parallax BETWEEN -5 AND 20; 28 | 29 | -- Write output parquet file 30 | COPY gaia TO 'gaia.parquet' WITH (FORMAT PARQUET); 31 | EOF 32 | 33 | cat gaia.parquet >&1 # Write output to stdout 34 | rm gaia.parquet # Clean up 35 | -------------------------------------------------------------------------------- /docs/data/nyc-taxi.parquet.sh: -------------------------------------------------------------------------------- 1 | # The DuckDB executable must be on your environment path! 2 | # Use DuckDB version 0.9.2 or later 3 | # Write to a named file as portable file descriptors such as 4 | # (/dev/stdout) appear to be unavailable in GitHub actions. 
5 | duckdb :memory: << EOF 6 | -- Load spatial extension 7 | INSTALL spatial; LOAD spatial; 8 | 9 | -- Project, following the example at https://github.com/duckdb/duckdb_spatial 10 | CREATE TEMP TABLE rides AS SELECT 11 | pickup_datetime::TIMESTAMP AS datetime, 12 | ST_Transform(ST_Point(pickup_latitude, pickup_longitude), 'EPSG:4326', 'ESRI:102718') AS pick, 13 | ST_Transform(ST_Point(dropoff_latitude, dropoff_longitude), 'EPSG:4326', 'ESRI:102718') AS drop 14 | FROM 'https://idl.uw.edu/mosaic-datasets/data/nyc-rides-2010.parquet'; 15 | 16 | -- Write output parquet file 17 | COPY (SELECT 18 | HOUR(datetime) + MINUTE(datetime) / 60 AS time, 19 | ST_X(pick)::INTEGER AS px, -- extract pickup x-coord 20 | ST_Y(pick)::INTEGER AS py, -- extract pickup y-coord 21 | ST_X(drop)::INTEGER AS dx, -- extract dropff x-coord 22 | ST_Y(drop)::INTEGER AS dy -- extract dropff y-coord 23 | FROM rides) TO 'trips.parquet' WITH (FORMAT PARQUET); 24 | EOF 25 | 26 | cat trips.parquet >&1 # Write output to stdout 27 | rm trips.parquet # Clean up 28 | -------------------------------------------------------------------------------- /observablehq.config.ts: -------------------------------------------------------------------------------- 1 | // See https://observablehq.com/framework/config for documentation. 2 | export default { 3 | // The project’s title; used in the sidebar and webpage titles. 4 | title: "Mosaic + Framework", 5 | 6 | // The pages and sections in the sidebar. If you don’t specify this option, 7 | // all pages will be listed in alphabetical order. Listing pages explicitly 8 | // lets you organize them into sections and have unlisted pages. 
9 | pages: [ 10 | { 11 | name: "Example Articles", 12 | pages: [ 13 | {name: "Flight Delays", path: "/flight-delays"}, 14 | {name: "NYC Taxi Rides", path: "/nyc-taxi-rides"}, 15 | {name: "Gaia Star Catalog", path: "/gaia-star-catalog"}, 16 | {name: "Observable Latency", path: "/observable-latency"}, 17 | ] 18 | }, 19 | { 20 | name: "Implementation Notes", 21 | pages: [ 22 | {name: "Data Loading with DuckDB", path: "/data-loading"}, 23 | {name: "Mosaic & DuckDB-WASM", path: "/mosaic-duckdb-wasm"} 24 | ] 25 | } 26 | ], 27 | 28 | // Some additional configuration options and their defaults: 29 | // theme: "default", // try "light", "dark", "slate", etc. 30 | style: "style.css", 31 | footer: `Interactive Data Lab, University of Washington`, 32 | toc: false, // whether to show the table of contents 33 | pager: true, // whether to show previous & next links in the footer 34 | }; 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, UW Interactive Data Lab 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | concurrency: 15 | group: "pages" 16 | cancel-in-progress: false 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | 22 | environment: 23 | name: github-pages 24 | url: ${{ steps.deployment.outputs.page_url }} 25 | 26 | steps: 27 | - name: Install DuckDB CLI 28 | run: | 29 | wget https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip 30 | unzip duckdb_cli-linux-amd64.zip 31 | mkdir /opt/duckdb && mv duckdb /opt/duckdb && chmod +x /opt/duckdb/duckdb && sudo ln -s /opt/duckdb/duckdb /usr/bin/duckdb 32 | rm duckdb_cli-linux-amd64.zip 33 | 34 | - uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - uses: actions/setup-node@v4 39 | with: 40 | node-version: "20.x" 41 | cache: "npm" 42 | 43 | - name: Install Node dependencies 44 | run: npm ci 45 | 46 | - 
id: date 47 | run: echo "date=$(TZ=America/Los_Angeles date +'%Y-%m-%d')" >> $GITHUB_OUTPUT 48 | 49 | - id: cache-data 50 | uses: actions/cache@v4 51 | with: 52 | path: | 53 | docs/.observablehq/cache 54 | key: data-${{ hashFiles('docs/data/*') }}-${{ steps.date.outputs.date }} 55 | 56 | - if: steps.cache-data.outputs.cache-hit == 'true' 57 | run: find docs/.observablehq/cache -type f -exec touch {} + 58 | 59 | - name: Build 60 | run: npm run build 61 | 62 | - uses: actions/configure-pages@v4 63 | - uses: actions/upload-pages-artifact@v3 64 | with: 65 | path: dist 66 | - name: Deploy 67 | id: deployment 68 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /docs/nyc-taxi-rides.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: NYC Taxi Rides 3 | header: | 4 | 7 | sql: 8 | trips: data/nyc-taxi.parquet 9 | --- 10 | 11 | # NYC Taxi Rides 12 | ## Pickup and dropoff points for 1M NYC taxi rides on Jan 1-3, 2010. 13 | 14 | Using a data loader, we ingest a remote file into DuckDB and project [_longitude_, _latitude_] coordinates (in the database!) to spatial positions with units of feet (1 foot = 12 inches). 15 | We then load the prepared data to visualize taxi pickup and dropoff locations, as well as the volume of rides by the time of day. 
16 | 17 | _Please wait a few seconds for the dataset to load._ 18 | 19 | ```js 20 | const $filter = vg.Selection.crossfilter(); 21 | 22 | const defaultAttributes = [ 23 | vg.width(335), 24 | vg.height(550), 25 | vg.margin(0), 26 | vg.xAxis(null), 27 | vg.yAxis(null), 28 | vg.xDomain([975000, 1005000]), 29 | vg.yDomain([190000, 240000]), 30 | vg.colorScale("symlog") 31 | ]; 32 | ``` 33 | 34 | ```js 35 | vg.hconcat( 36 | vg.plot( 37 | vg.raster( 38 | vg.from("trips", { filterBy: $filter }), 39 | { x: "px", y: "py", imageRendering: "pixelated" } 40 | ), 41 | vg.intervalXY({ as: $filter }), 42 | vg.text( 43 | [{label: "Taxi Pickups"}], 44 | { 45 | dx: 10, 46 | dy: 10, 47 | text: "label", 48 | fill: "black", 49 | fontSize: "1.2em", 50 | frameAnchor: "top-left" 51 | } 52 | ), 53 | ...defaultAttributes, 54 | vg.colorScheme("turbo") 55 | ), 56 | vg.hspace(10), 57 | vg.plot( 58 | vg.raster( 59 | vg.from("trips", { filterBy: $filter }), 60 | { x: "dx", y: "dy", imageRendering: "pixelated" } 61 | ), 62 | vg.intervalXY({ as: $filter }), 63 | vg.text( 64 | [{label: "Taxi Dropoffs"}], 65 | { 66 | dx: 10, 67 | dy: 10, 68 | text: "label", 69 | fill: "black", 70 | fontSize: "1.2em", 71 | frameAnchor: "top-left" 72 | } 73 | ), 74 | ...defaultAttributes, 75 | vg.colorScheme("turbo") 76 | ) 77 | ) 78 | ``` 79 | 80 | ```js 81 | vg.plot( 82 | vg.rectY( 83 | vg.from("trips"), 84 | { x: vg.bin("time"), y: vg.count(), inset: 0.5 } 85 | ), 86 | vg.intervalX({ as: $filter }), 87 | vg.yTickFormat("s"), 88 | vg.xLabel("Pickup Hour"), 89 | vg.yLabel("Number of Rides"), 90 | vg.width(680), 91 | vg.height(100) 92 | ) 93 | ``` 94 | 95 | Select an interval in a plot to filter the maps. 
96 | _What spatial patterns can you find?_ 97 | -------------------------------------------------------------------------------- /docs/gaia-star-catalog.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Gaia Star Catalog 3 | header: | 4 | 7 | sql: 8 | gaia: data/gaia.parquet 9 | --- 10 | 11 | # Gaia Star Catalog 12 | ## Explore a 5M record sample of the 1.8B star catalog 13 | 14 | [Gaia](https://gea.esac.esa.int/archive/) is a European space mission providing astrometry, photometry, and spectroscopy of nearly 2000 million stars in the Milky Way as well as significant samples of extragalactic and solar system objects. 15 | 16 | Here we visualize a 5M star sample. 17 | A raster sky map reveals our Milky Way galaxy. 18 | Select higher parallax (≥ 6) stars in the histogram to reveal a [Hertzsprung-Russell diagram](https://en.wikipedia.org/wiki/Hertzsprung%E2%80%93Russell_diagram) in the plot of stellar color vs. magnitude on the right. 19 | 20 | ```js 21 | const $brush = vg.Selection.crossfilter(); 22 | ``` 23 | 24 | ```js 25 | vg.hconcat( 26 | vg.vconcat( 27 | vg.plot( 28 | vg.raster( 29 | vg.from("gaia", {filterBy: $brush}), 30 | { x: "u", y: "v", fill: "density", pixelSize: 2 } 31 | ), 32 | vg.intervalXY({pixelSize: 2, as: $brush}), 33 | vg.xyDomain(vg.Fixed), 34 | vg.colorScale("sqrt"), 35 | vg.colorScheme("viridis"), 36 | vg.xAxis(null), 37 | vg.yAxis(null), 38 | vg.width(560), 39 | vg.height(320), 40 | vg.margins({ top: 20, bottom: 10, left: 5, right: 5 }) 41 | ), 42 | vg.hconcat( 43 | vg.plot( 44 | vg.rectY( 45 | vg.from("gaia", {filterBy: $brush}), 46 | { 47 | x: vg.bin("phot_g_mean_mag"), 48 | y: vg.count(), 49 | fill: "steelblue", 50 | inset: 0.5 51 | } 52 | ), 53 | vg.intervalX({as: $brush}), 54 | vg.xDomain(vg.Fixed), 55 | vg.xTicks(5), 56 | vg.yScale("sqrt"), 57 | vg.yGrid(true), 58 | vg.width(280), 59 | vg.height(180), 60 | vg.marginLeft(65) 61 | ), 62 | vg.plot( 63 | vg.rectY( 64 | vg.from("gaia", 
{filterBy: $brush}), 65 | {x: vg.bin("parallax"), y: vg.count(), fill: "steelblue", inset: 0.5} 66 | ), 67 | vg.intervalX({as: $brush}), 68 | vg.xDomain(vg.Fixed), 69 | vg.xTicks(5), 70 | vg.yScale("sqrt"), 71 | vg.yGrid(true), 72 | vg.width(280), 73 | vg.height(180), 74 | vg.marginLeft(65) 75 | ) 76 | ) 77 | ), 78 | vg.hspace(10), 79 | vg.plot( 80 | vg.raster( 81 | vg.from("gaia", {filterBy: $brush}), 82 | { x: "bp_rp", y: "phot_g_mean_mag", fill: "density", pixelSize: 2 } 83 | ), 84 | vg.intervalXY({pixelSize: 2, as: $brush}), 85 | vg.xyDomain(vg.Fixed), 86 | vg.colorScale("sqrt"), 87 | vg.colorScheme("viridis"), 88 | vg.xTicks(5), 89 | vg.yReverse(true), 90 | vg.width(320), 91 | vg.height(500), 92 | vg.marginLeft(25), 93 | vg.marginTop(20), 94 | vg.marginRight(1) 95 | ) 96 | ) 97 | ``` 98 | -------------------------------------------------------------------------------- /docs/mosaic-duckdb-wasm.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Using Mosaic & DuckDB-WASM 3 | header: | 4 | 7 | --- 8 | 9 | # Using Mosaic & DuckDB-WASM 10 | 11 | Behind the scenes, a number of steps are needed for Mosaic and DuckDB-WASM to "play nice" with Observable's reactive runtime. 12 | Unlike standard JavaScript, the Observable runtime will happily run JavaScript "out-of-order". 13 | Observable uses dependencies among code blocks, rather than the order within the file, to determine what to run and when to run it. 14 | This reactivity can cause problems for code that depends on "side effects" that are not tracked by Observable's runtime. 15 | 16 | In the past, we had to carefully work our way around these side effects when manually loading data and initializing Mosaic. 17 | Fortunately, as of version 1.3.0 onward, Observable Framework includes built-in [DuckDB data loading](https://observablehq.com/framework/sql) and [Mosaic initialization](https://observablehq.com/framework/lib/mosaic) support to handle this for us. 
18 | 19 | ## Loading Data into DuckDB-WASM 20 | 21 | Observable supports loading files by simply listing them in a page's YAML front matter under the `sql` key. The following example loads 200,000 flight records into DuckDB-WASM from a backing parquet file: 22 | 23 | ```yaml 24 | --- 25 | sql: 26 | flights: data/flights-200k.parquet 27 | --- 28 | ``` 29 | 30 | Observable ensures `sql` data loading is performed prior to downstream code execution, preventing out-of-order issues. If the data file is produced using a [data loader](https://observablehq.com/framework/loaders), the loader will be invoked, akin to using an Observable `FileAttachment`. 31 | 32 | ## Mosaic vgplot Initialization 33 | 34 | Observable Framework includes [Mosaic vgplot](https://idl.uw.edu/mosaic/what-is-mosaic/) as a "built-in" standard library component. If Observable sees the `vg` variable referenced but not otherwise defined, it automatically imports vgplot and includes it as a dependency. 35 | 36 | Observable Framework will instantiate a new API instance (bound to the `vg` variable) and configure it to use the built-in [DuckDBClient](https://observablehq.com/framework/lib/duckdb) in the Mosaic coordinator's [database connector](https://idl.uw.edu/mosaic/core/#data-source). 37 | 38 | Here's what the internal vgplot initialization looks like: 39 | 40 | ```js run=false 41 | import * as vgplot from "npm:@uwdata/vgplot"; 42 | import {getDefaultClient} from "observablehq:stdlib/duckdb"; 43 | 44 | export default async function vg() { 45 | const coordinator = new vgplot.Coordinator(); 46 | const api = vgplot.createAPIContext({coordinator}); 47 | const duckdb = (await getDefaultClient())._db; 48 | coordinator.databaseConnector(vgplot.wasmConnector({duckdb})); 49 | return api; 50 | } 51 | ``` 52 | 53 | This code first instantiates a new central coordinator, which manages all queries. 54 | It then creates a new API context, which is what is ultimately returned. 
55 | 56 | Next, the code configures Mosaic to use DuckDB-WASM as an in-browser database. 57 | Normally the `wasmConnector()` method creates a new database instance in a worker thread, but here we instead pass in Observable's own DuckDB client. 58 | 59 | Once that completes, we're ready to use the API! 60 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Mosaic + Framework Examples 3 | header: | 4 | 7 | sql: 8 | weather: data/seattle-weather.parquet 9 | --- 10 | 11 | # Mosaic + Framework Examples 12 | ## Using Mosaic and DuckDB in Observable Framework 13 | 14 | [Mosaic](https://idl.uw.edu/mosaic) is a system for linking data visualizations, tables, and input widgets, all leveraging a database ([DuckDB](https://duckdb.org/)) for scalable processing. With Mosaic, you can interactively visualize and explore millions and even billions of data points. 15 | 16 | This site shows how to publish Mosaic and DuckDB-powered interactive dashboards and data-driven articles using [Observable Framework](https://observablehq.com/framework/). The examples illustrate: 17 | 18 | - Visualization and real-time interaction with massive data sets 19 | - Using Mosaic and DuckDB-WASM within Framework pages 20 | - Using DuckDB in a data loader and in GitHub Actions 21 | 22 | All source markup and code is available at <https://github.com/uwdata/mosaic-framework-example>. Or, use the source links at the top of each page! 23 | 24 | ## Example: Seattle Weather 25 | 26 | Our first example is an interactive dashboard of Seattle’s weather, including temperatures, precipitation, and the type of weather. Drag on the scatter plot to see the proportion of days that have sun, fog, drizzle, rain, or snow. 
27 | 28 | ```js 29 | const $click = vg.Selection.single(); 30 | const $domain = vg.Param.array(["sun", "fog", "drizzle", "rain", "snow"]); 31 | const $colors = vg.Param.array(["#e7ba52", "#a7a7a7", "#aec7e8", "#1f77b4", "#9467bd"]); 32 | const $range = vg.Selection.intersect(); 33 | ``` 34 | 35 | ```js 36 | vg.vconcat( 37 | vg.hconcat( 38 | vg.plot( 39 | vg.dot( 40 | vg.from("weather", {filterBy: $click}), 41 | { 42 | x: vg.dateMonthDay("date"), 43 | y: "temp_max", 44 | fill: "weather", 45 | r: "precipitation", 46 | fillOpacity: 0.7 47 | } 48 | ), 49 | vg.intervalX({as: $range, brush: {fill: "none", stroke: "#888"}}), 50 | vg.highlight({by: $range, fill: "#ccc", fillOpacity: 0.2}), 51 | vg.colorLegend({as: $click, columns: 1}), 52 | vg.xyDomain(vg.Fixed), 53 | vg.xScale("utc"), 54 | vg.xTickFormat("%b"), 55 | vg.colorDomain($domain), 56 | vg.colorRange($colors), 57 | vg.rDomain(vg.Fixed), 58 | vg.rRange([2, 10]), 59 | vg.marginLeft(45), 60 | vg.width(660), 61 | vg.height(300) 62 | ) 63 | ), 64 | vg.plot( 65 | vg.barX( 66 | vg.from("weather"), 67 | {x: vg.count(), y: "weather", fill: "#ccc", fillOpacity: 0.2} 68 | ), 69 | vg.barX( 70 | vg.from("weather", {filterBy: $range}), 71 | {x: vg.count(), y: "weather", fill: "weather"} 72 | ), 73 | vg.toggleY({as: $click}), 74 | vg.highlight({by: $click}), 75 | vg.xDomain(vg.Fixed), 76 | vg.yDomain($domain), 77 | vg.yLabel(null), 78 | vg.colorDomain($domain), 79 | vg.colorRange($colors), 80 | vg.marginLeft(45), 81 | vg.width(660) 82 | ) 83 | ) 84 | ``` 85 | 86 | The examples linked below involve much larger datasets and a variety of visualization types. 
87 | 88 | ## Example Articles 89 | 90 | - [Flight Delays](flight-delays) - examine over 200,000 flight records 91 | - [NYC Taxi Rides](nyc-taxi-rides) - load and visualize 1M NYC taxi cab rides 92 | - [Gaia Star Catalog](gaia-star-catalog) - explore a 5M star sample of the 1.8B star catalog 93 | - [Observable Web Latency](observable-latency) - re-visiting a view of over 7M web requests 94 | 95 | ## Implementation Notes 96 | 97 | - [Using DuckDB in Data Loaders and GitHub Actions](data-loading) 98 | - [Using Mosaic + DuckDB-WASM in Observable Framework](mosaic-duckdb-wasm) 99 | -------------------------------------------------------------------------------- /docs/data-loading.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Loading with DuckDB 3 | header: | 4 | 7 | --- 8 | 9 | # Data Loading with DuckDB 10 | 11 | Looking under the hood, this page provides guidance for using DuckDB in Framework data loaders and deploying it within GitHub Actions. 12 | 13 | ## Using DuckDB in Data Loaders 14 | 15 | The [NYC Taxi Rides](nyc-taxi-rides) and [Gaia Star Catalog](gaia-star-catalog) examples use [data loaders](https://observablehq.com/framework/loaders) to perform data preparation, generating pre-projected data and writing it to a Parquet file. 16 | 17 | The [shell script below](https://github.com/uwdata/mosaic-framework-example/blob/main/docs/data/nyc-taxi.parquet.sh) loads taxi data using the command line interface to DuckDB. 18 | The `duckdb` executable must be on your environment path... we'll come back to that! 
19 | 20 | ```sh 21 | duckdb :memory: << EOF 22 | -- Load spatial extension 23 | INSTALL spatial; LOAD spatial; 24 | 25 | -- Project, following the example at https://github.com/duckdb/duckdb_spatial 26 | CREATE TEMP TABLE rides AS SELECT 27 | pickup_datetime::TIMESTAMP AS datetime, 28 | ST_Transform(ST_Point(pickup_latitude, pickup_longitude), 'EPSG:4326', 'ESRI:102718') AS pick, 29 | ST_Transform(ST_Point(dropoff_latitude, dropoff_longitude), 'EPSG:4326', 'ESRI:102718') AS drop 30 | FROM 'https://idl.uw.edu/mosaic-datasets/data/nyc-rides-2010.parquet'; 31 | 32 | -- Write output parquet file 33 | COPY (SELECT 34 | HOUR(datetime) + MINUTE(datetime) / 60 AS time, 35 | ST_X(pick)::INTEGER AS px, -- extract pickup x-coord 36 | ST_Y(pick)::INTEGER AS py, -- extract pickup y-coord 37 | ST_X(drop)::INTEGER AS dx, -- extract dropoff x-coord 38 | ST_Y(drop)::INTEGER AS dy -- extract dropoff y-coord 39 | FROM rides) TO 'trips.parquet' WITH (FORMAT PARQUET); 40 | EOF 41 | 42 | cat trips.parquet >&1 # Write output to stdout 43 | rm trips.parquet # Clean up 44 | ``` 45 | 46 | We invoke DuckDB with the `:memory:` argument to indicate an in-memory database. 47 | We also use the `<< EOF` shell script syntax to provide multi-line input, consisting of the desired SQL queries to run. 48 | 49 | The last query (`COPY ...`) writes a Parquet file to disk. 50 | However, Observable Framework requires that we instead write data to [`stdout`](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)). 51 | On some platforms we can do this by writing to the file descriptor `/dev/stdout`. 52 | However, this file does not exist on all platforms – including in GitHub Actions, where a query that writes to `/dev/stdout` will fail. 53 | 54 | So we complete the script with two additional commands: 55 | 56 | - Write (`cat`) the bytes of the Parquet file to `stdout`. 57 | - Remove (`rm`) the generated file, as we no longer need it. 
58 | 59 | ## Using DuckDB in GitHub Actions 60 | 61 | To deploy our Observable Framework site on GitHub, we use a [GitHub Actions workflow](https://github.com/uwdata/mosaic-framework-example/blob/main/.github/workflows/deploy.yml). 62 | As noted earlier, one issue when running in GitHub Actions is the lack of file-based access to `stdout`. 63 | But another, even more basic, issue is that we need to have DuckDB installed! 64 | 65 | This snippet installs DuckDB within a workflow. 66 | We download a zip file of the official release, unpack it, copy the `duckdb` executable to `/opt/duckdb`, and then link to `duckdb` in the directory `/usr/bin`, ensuring it is accessible to subsequent scripts: 67 | 68 | ```yaml 69 | steps: 70 | - name: Install DuckDB CLI 71 | run: | 72 | wget https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip 73 | unzip duckdb_cli-linux-amd64.zip 74 | mkdir /opt/duckdb && mv duckdb /opt/duckdb && chmod +x /opt/duckdb/duckdb && sudo ln -s /opt/duckdb/duckdb /usr/bin/duckdb 75 | rm duckdb_cli-linux-amd64.zip 76 | ``` 77 | 78 | We perform installation before the site build steps, ensuring `duckdb` is ready to go. -------------------------------------------------------------------------------- /docs/observable-latency.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Observable Web Latency 3 | header: | 4 | 7 | sql: 8 | latency: data/observable-latency.parquet 9 | --- 10 | 11 | # Observable Web Latency 12 | ## Recreating a custom graphic using Mosaic vgplot 13 | 14 | The Observable Framework documentation includes a wonderful example about [analyzing web logs](https://observablehq.com/framework/examples/api/), visualizing the latency (response time) of various routes on the Observable.com site. 15 | The marquee graphic is a pixel-level heatmap of over 7 million requests to Observable servers over the course of a week. 16 | The chart plots time vs. 
latency, where each pixel is colored according to the most common route (URL pattern) in that time and latency bin. 17 | 18 | That said, a lot is going on in the original [custom heatmap component](https://github.com/observablehq/framework/blob/main/examples/api/docs/components/apiHeatmap.js): 19 | 20 | - The data is pre-binned and aggregated for fast loading 21 | - Observable Plot and HTML Canvas code are intermixed in non-trivial ways 22 | - Frame-based animation is used to progressively render the graphic, presumably to combat sluggish rendering 23 | 24 | Here we re-create this graphic with [Mosaic vgplot](https://idl.uw.edu/mosaic/what-is-mosaic/), resulting in a simpler, standalone specification. 25 | We further leverage Mosaic's support for cross-chart linking and scalable filtering for real-time updates. 26 | 27 | ```js 28 | const $filter = vg.Selection.crossfilter(); 29 | const $highlight = vg.Selection.single(); 30 | ``` 31 | 32 | ```js 33 | vg.plot( 34 | vg.frame({fill: "black"}), 35 | vg.raster( 36 | vg.from("latency", {filterBy: $filter}), 37 | { 38 | x: "time", 39 | y: "latency", 40 | fill: vg.argmax("route", "count"), 41 | fillOpacity: vg.sum("count"), 42 | width: 2016, 43 | height: 500, 44 | imageRendering: "pixelated" 45 | } 46 | ), 47 | vg.intervalXY({as: $filter}), 48 | vg.colorDomain(vg.Fixed), 49 | vg.colorScheme("observable10"), 50 | vg.opacityDomain([0, 25]), 51 | vg.opacityClamp(true), 52 | vg.yScale("log"), 53 | vg.yLabel("↑ Duration (ms)"), 54 | vg.yDomain([0.5, 10000]), 55 | vg.yTickFormat("s"), 56 | vg.xScale("utc"), 57 | vg.xLabel(null), 58 | vg.xDomain([1706227200000, 1706832000000]), 59 | vg.width(1063), 60 | vg.height(550), 61 | vg.margins({left: 35, top: 20, bottom: 30, right: 20}) 62 | ) 63 | ``` 64 | 65 | ```js 66 | vg.plot( 67 | vg.barX( 68 | vg.from("latency", {filterBy: $filter}), 69 | { 70 | x: vg.sum("count"), 71 | y: "route", 72 | fill: "route", 73 | sort: {y: "-x", limit: 15} 74 | } 75 | ), 76 | vg.toggleY({as: $filter}), 77 
| vg.toggleY({as: $highlight}), 78 | vg.highlight({by: $highlight}), 79 | vg.colorDomain(vg.Fixed), 80 | vg.xLabel("Routes by Total Requests"), 81 | vg.xTickFormat("s"), 82 | vg.yLabel(null), 83 | vg.width(1063), 84 | vg.height(300), 85 | vg.marginTop(5), 86 | vg.marginLeft(220), 87 | vg.marginBottom(35) 88 | ) 89 | ``` 90 | 91 | _Select bars in the chart of most-requested routes above to filter the heatmap and isolate patterns. Or, select a range in the heatmap to show only corresponding routes._ 92 | 93 | ## Implementation Notes 94 | 95 | While the original uses a pre-binned dataset, we might want to create graphics like this in a more exploratory context. So first we "reverse-engineered" the data into original units, with columns for `time` and `latency` values, in addition to `route` and request `count`. We can leverage DuckDB to re-bin and filter data on the fly! 96 | 97 | We then implement the latency heatmap using a vgplot `raster` mark. Here is what that looks like when using a declarative Mosaic specification in YAML: 98 | 99 | ```yaml 100 | plot: 101 | - mark: frame 102 | fill: black 103 | - mark: raster 104 | data: { from: latency, filterBy: $filter } 105 | x: time 106 | y: latency 107 | fill: { argmax: [route, count] } 108 | fillOpacity: { sum: count } 109 | width: 2016 110 | height: 500 111 | imageRendering: pixelated 112 | - select: intervalXY 113 | as: $filter 114 | colorDomain: Fixed 115 | colorScheme: observable10 116 | opacityDomain: [0, 25] 117 | opacityClamp: true 118 | yScale: log 119 | yLabel: ↑ Duration (ms) 120 | yDomain: [0.5, 10000] 121 | yTickFormat: s 122 | xScale: utc 123 | xLabel: null 124 | xDomain: [1706227200000, 1706832000000] 125 | width: 1063 126 | height: 550 127 | margins: { left: 35, top: 20, bottom: 30, right: 20 } 128 | ``` 129 | 130 | Key bits of the specification include: 131 | 132 | - Binning to a pixel grid based on `time` (_x_) and `latency` (_y_). 
133 | - Mapping the pixel fill color to the `route` with the largest request `count` per bin. 134 | - Mapping the pixel fill opacity to the sum of `count`s within a bin. 135 | - Interactive filtering using a selection (`$filter`). Setting `colorDomain: Fixed` ensures consistent colors; it prevents re-coloring when data is filtered. 136 | 137 | However, this re-creation does diverge from the original in a few ways: 138 | 139 | - The coloring is not identical. Ideally, vgplot should provide greater control over sorting scale domains (here, the list of unique `route` values). 140 | - The re-creation above does not include nice tooltips like the original. 141 | -------------------------------------------------------------------------------- /docs/flight-delays.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Flight Delays 3 | header: | 4 | 7 | sql: 8 | flights: data/flights-200k.parquet 9 | --- 10 | 11 | # Flight Delays 12 | ## Interactive exploration of large-scale transportation data 13 | 14 | What contributes to delayed airline flights? Let's examine a sample of over 200,000 flight records provided by the [U.S. DOT Bureau of Transportation Statistics](https://www.transtats.bts.gov/ontime/). 15 | 16 | We use [Mosaic vgplot](https://idl.uw.edu/mosaic/) to create scalable, interactive visualizations. Mosaic loads data from a Parquet file into DuckDB-WASM, running in the browser. Mosaic queries the database to transform data as part of the visualization process. 17 | 18 | ## Cross-Filtered Histograms 19 | 20 | The histograms below visualize the arrival delay, departure time, and distance flown. Select a region in any histogram to cross-filter the charts. 21 | _How are time and/or distance predictive of a flight being late? 
What is predictive of a flight being early?_ 22 | 23 | ```js 24 | // a selection instance to manage selected intervals from each plot 25 | const $brush = vg.Selection.crossfilter(); 26 | ``` 27 | 28 | ```js 29 | vg.vconcat( 30 | vg.plot( 31 | vg.rectY( 32 | vg.from("flights", { filterBy: $brush }), 33 | { x: vg.bin("delay"), y: vg.count(), fill: "steelblue", inset: 0.5 } 34 | ), 35 | vg.intervalX({ as: $brush }), 36 | vg.xDomain(vg.Fixed), 37 | vg.yTickFormat("s"), 38 | vg.xLabel("Delay (minutes)"), 39 | vg.yLabel("Number of Flights"), 40 | vg.width(600), 41 | vg.height(150) 42 | ), 43 | vg.plot( 44 | vg.rectY( 45 | vg.from("flights", { filterBy: $brush }), 46 | { x: vg.bin("time"), y: vg.count(), fill: "steelblue", inset: 0.5 } 47 | ), 48 | vg.intervalX({ as: $brush }), 49 | vg.xDomain(vg.Fixed), 50 | vg.yTickFormat("s"), 51 | vg.xLabel("Time (hour of day)"), 52 | vg.yLabel("Number of Flights"), 53 | vg.width(600), 54 | vg.height(150) 55 | ), 56 | vg.plot( 57 | vg.rectY( 58 | vg.from("flights", { filterBy: $brush }), 59 | { x: vg.bin("distance"), y: vg.count(), fill: "steelblue", inset: 0.5 } 60 | ), 61 | vg.intervalX({ as: $brush }), 62 | vg.xDomain(vg.Fixed), 63 | vg.yTickFormat("s"), 64 | vg.xLabel("Distance (miles)"), 65 | vg.yLabel("Number of Flights"), 66 | vg.width(600), 67 | vg.height(150) 68 | ) 69 | ) 70 | ``` 71 | 72 | When a selection changes, we need to filter the data and recount the number of records in each bin. The Mosaic system analyzes these queries and automatically optimizes updates by building indexes of pre-aggregated data ("data cubes") in the database, binned at the level of input pixels for the currently active view. 73 | 74 | While 200,000 points will stress many web-based visualization tools, Mosaic doesn't break a sweat. Now go ahead and try this with [10 million records](https://idl.uw.edu/mosaic/examples/flights-10m.html)! 75 | 76 | 77 | ## Density Hexbins 78 | 79 | The histograms above provide a useful first-look at the data. 
However, to discover relations among the data we had to interactively explore. Instead of "hiding" patterns behind interactions, let's visualize relationships directly. 80 | 81 | Below we use hexagonal bins to visualize the density (number of flights) by both time of day and arrival delay. Interactive histograms along the edges show marginal distributions for both. 82 | 83 | ```js 84 | const $scale = vg.Param.value("log"); // color scale type 85 | const $query = vg.Selection.intersect(); // interval ranges 86 | ``` 87 | 88 | ```js 89 | vg.vconcat( 90 | vg.hconcat( 91 | vg.menu({ label: "Color Scale", as: $scale, options: ["log", "linear", "sqrt"] }), 92 | vg.hspace(20), 93 | vg.colorLegend({ for: "hexbins" }) 94 | ), 95 | vg.hconcat( 96 | vg.plot( 97 | vg.rectY( 98 | vg.from("flights"), 99 | { x: vg.bin("time"), y: vg.count(), fill: "steelblue", inset: 0.5 } 100 | ), 101 | vg.intervalX({ as: $query }), 102 | vg.margins({"left":5,"right":5,"top":30,"bottom":0}), 103 | vg.xDomain(vg.Fixed), 104 | vg.xAxis("top"), 105 | vg.yAxis(null), 106 | vg.xLabelAnchor("center"), 107 | vg.xLabel("Time (hour of day)"), 108 | vg.width(605), 109 | vg.height(70) 110 | ), 111 | vg.hspace(80) 112 | ), 113 | vg.hconcat( 114 | vg.plot( 115 | vg.hexbin( 116 | vg.from("flights", { filterBy: $query }), 117 | { x: "time", y: "delay", fill: vg.count(), binWidth: 10 } 118 | ), 119 | vg.hexgrid({ binWidth: 10 }), 120 | vg.name("hexbins"), 121 | vg.colorScheme("ylgnbu"), 122 | vg.colorScale($scale), 123 | vg.margins({"left":5,"right":0,"top":0,"bottom":5}), 124 | vg.xAxis(null), 125 | vg.yAxis(null), 126 | vg.xyDomain(vg.Fixed), 127 | vg.width(600), 128 | vg.height(455) 129 | ), 130 | vg.plot( 131 | vg.rectX( 132 | vg.from("flights"), 133 | { x: vg.count(), y: vg.bin("delay"), fill: "steelblue", inset: 0.5 } 134 | ), 135 | vg.intervalY({ as: $query }), 136 | vg.margins({"left":0,"right":50,"top":4,"bottom":5}), 137 | vg.yDomain([-60,180]), 138 | vg.xAxis(null), 139 | vg.yAxis("right"), 140 | 
vg.yLabelAnchor("center"), 141 | vg.yLabel("Delay (minutes)"), 142 | vg.width(80), 143 | vg.height(455) 144 | ) 145 | ) 146 | ) 147 | ``` 148 | 149 | We can see right away that flights are more likely to be delayed if they leave later in the day. Delays may accrue as a single plane flies from airport to airport. 150 | 151 | The number of records in a hexbin varies from 0 to over 2,000, spanning multiple orders of magnitude. To see these orders more clearly, we default to a logarithmic color scale. _Try adjusting the color scale menu to see the effects of different choices._ 152 | 153 | ## Density Heatmaps 154 | 155 | For finer-grained detail, we can bin all the way down to the level of individual pixels. 156 | 157 | ```js 158 | const $filter = vg.Selection.crossfilter(); // interval ranges 159 | ``` 160 | 161 | ```js 162 | vg.hconcat( 163 | vg.plot( 164 | vg.raster( 165 | vg.from("flights", { filterBy: $filter }), 166 | { x: "time", y: "delay", fill: "density", imageRendering: "pixelated" } 167 | ), 168 | vg.intervalX({ as: $filter, brush: {fill: "none", stroke: "#888"} }), 169 | vg.colorScheme("blues"), 170 | vg.colorScale("symlog"), 171 | vg.xZero(true), 172 | vg.xLabel("Time (hour of day)"), 173 | vg.yLabel("Delay (minutes)"), 174 | vg.xyDomain(vg.Fixed), 175 | vg.width(315), 176 | vg.height(300) 177 | ), 178 | vg.hspace(10), 179 | vg.plot( 180 | vg.raster( 181 | vg.from("flights", { filterBy: $filter }), 182 | { x: "distance", y: "delay", fill: "density", imageRendering: "pixelated" } 183 | ), 184 | vg.intervalX({ as: $filter, brush: {fill: "none", stroke: "#888"} }), 185 | vg.colorScheme("blues"), 186 | vg.colorScale("symlog"), 187 | vg.xScale("log"), 188 | vg.xLabel("Distance (miles, log scale)"), 189 | vg.yLabel("Delay (minutes)"), 190 | vg.xyDomain(vg.Fixed), 191 | vg.width(315), 192 | vg.height(300) 193 | ) 194 | ) 195 | ``` 196 | 197 | The result is a raster, or heatmap, view. 
198 | We now see some striping, which reveals that data values are truncated to a limited precision. 199 | As before, we can use interactive selections to cross-filter the charts. 200 | --------------------------------------------------------------------------------