├── Makefile
├── .env.sample
├── smoke-test.mjs
├── embeddings-nicar-solution.mjs
├── embeddings-nbc-search.mjs
├── LICENSE
├── package.json
├── embeddings-nbc-build.mjs
├── .gitignore
├── README.md
├── embeddings-nicar.mjs
└── structured.ipynb

/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | .PHONY: deps
3 | deps:
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | # Copy+paste this file to .env and fill in the proper values!
2 | OPENAI_API_KEY=
--------------------------------------------------------------------------------
/smoke-test.mjs:
--------------------------------------------------------------------------------
 1 | //import * as x from "@ai-sdk/openai";
 2 | import * as x1 from "@huggingface/transformers";
 3 | import * as x2 from "ai";
 4 | import * as x3 from "ollama";
 5 | import * as x4 from "zod";
 6 | 
 7 | import * as x5 from "better-sqlite3";
 8 | import * as x6 from "sqlite-vec";
 9 | 
10 | console.log("NICAR25 Everything works!");
--------------------------------------------------------------------------------
/embeddings-nicar-solution.mjs:
--------------------------------------------------------------------------------
 1 | import Database from "better-sqlite3";
 2 | import * as sqliteVec from "sqlite-vec";
 3 | import { Ollama } from 'ollama';
 4 | 
 5 | const ollama = new Ollama();
 6 | 
 7 | // solution to the searchSpeakers() exercise in embeddings-nicar.mjs!
 8 | async function searchSpeakers(path, query) {
 9 |   console.log(`Searching for NICAR speakers related to: ${query}...`);
10 |   const db = new Database(path);
11 |   sqliteVec.load(db);
12 |   const result = await ollama.embed({
13 |     model: "all-minilm",
14 |     input: query
15 |   });
16 |   const queryEmbedding = result.embeddings[0];
17 |   const knnQuery = db.prepare(`
18 |     SELECT
19 |       rowid,
20 |       distance,
21 |       speakers.bio
22 |     FROM vec_speakers
23 |     LEFT JOIN speakers ON speakers.speaker_id = vec_speakers.rowid
24 |     WHERE bio_embedding MATCH ?
25 |       AND k = 10;
26 |   `);
27 | 
28 |   console.log(query);
29 |   for(const {rowid, distance, bio} of knnQuery.all(JSON.stringify(queryEmbedding))) {
30 |     console.log(rowid, distance, bio);
31 |   }
32 | }
--------------------------------------------------------------------------------
/embeddings-nbc-search.mjs:
--------------------------------------------------------------------------------
 1 | import Database from "better-sqlite3";
 2 | import * as sqliteVec from "sqlite-vec";
 3 | import { Ollama } from 'ollama';
 4 | 
 5 | const ollama = new Ollama();
 6 | 
 7 | async function main() {
 8 | 
 9 |   const db = new Database("nbc-articles-nicar.db");
10 |   sqliteVec.load(db);
11 | 
12 |   const query = "reproductive rights";
13 | 
14 |   const result = await ollama.embed({
15 |     model: "all-minilm",
16 |     input: query
17 |   });
18 |   const queryEmbedding = result.embeddings[0];
19 |   const knnQuery = db.prepare(`
20 |     SELECT
21 |       rowid,
22 |       distance,
23 |       articles.headline
24 |     FROM vec_articles
25 |     LEFT JOIN articles ON articles.id = vec_articles.rowid
26 |     WHERE headline_embedding MATCH ?
27 |       AND k = 10;
28 |   `);
29 | 
30 |   console.log(query);
31 |   for(const {rowid, distance, headline} of knnQuery.all(JSON.stringify(queryEmbedding))) {
32 |     console.log(rowid, distance, headline);
33 |   }
34 | }
35 | 
36 | main();
37 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Alex Garcia
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "nicar25-ai-starter",
 3 |   "version": "1.0.0",
 4 |   "description": "",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "download:nbc": "curl -o nbc-articles-nicar.db 'https://delicate-dream-7407.fly.storage.tigris.dev/nicar25/nbc-articles-nicar.db'",
 8 |     "download:nicar25-schedule": "curl -o nicar-2025-schedule.json 'https://schedules.ire.org/nicar-2025/nicar-2025-schedule.json'",
 9 |     "download:emails": "curl -o dwillis-emails.db 'https://delicate-dream-7407.fly.storage.tigris.dev/nicar25/dwillis-emails.db'",
10 |     "download": "npm run download:nicar25-schedule && npm run download:nbc && npm run download:emails",
11 |     "clean": "rm dwillis-emails.db nicar-2025-schedule.json nbc-articles-nicar.db"
12 |   },
13 |   "repository": {
14 |     "type": "git",
15 |     "url": "git+https://github.com/asg017/nicar25-ai-starter-js.git"
16 |   },
17 |   "keywords": [],
18 |   "author": "",
19 |   "license": "MIT",
20 |   "bugs": {
21 |     "url": "https://github.com/asg017/nicar25-ai-starter-js/issues"
22 |   },
23 |   "homepage": "https://github.com/asg017/nicar25-ai-starter-js#readme",
24 |   "dependencies": {
25 |     "@ai-sdk/openai": "^1.1.10",
26 |     "@huggingface/transformers": "^3.3.3",
27 |     "ai": "^4.1.36",
28 |     "better-sqlite3": "^11.8.1",
29 |     "deno": "^2.1.9",
30 |     "ollama": "^0.5.13",
31 |     "sqlite-vec": "^0.1.7-alpha.2",
32 |     "zod": "^3.24.2"
33 |   }
34 | }
35 | 
--------------------------------------------------------------------------------
/embeddings-nbc-build.mjs:
--------------------------------------------------------------------------------
 1 | import Database from "better-sqlite3";
 2 | import * as sqliteVec from "sqlite-vec";
 3 | import { Ollama } from 'ollama';
 4 | 
 5 | const ollama = new Ollama();
 6 | 
 7 | async function main() {
 8 | 
 9 |   // Step 1: Query headlines from SQLite database
10 |   const db = new Database("nbc-articles-nicar.db");
11 |   sqliteVec.load(db);
12 | 
13 |   for(const {id, headline} of db.prepare('SELECT id, headline FROM articles LIMIT 10;').all()) {
14 |     console.log(id, headline);
15 |   }
16 |   // comment out this `return` to continue to step 2
17 |   return;
18 | 
19 |   // Step 2: Try out embeddings with ollama
20 |   const {embeddings} = await ollama.embed({
21 |     model: "all-minilm",
22 |     input: "This is a test"
23 |   });
24 |   console.log(embeddings);
25 |   // comment out this `return` to continue to step 3
26 |   return;
27 | 
28 |   // Step 3: Save these embeddings into a SQLite database
29 |   db.exec(`
30 |     CREATE VIRTUAL TABLE IF NOT EXISTS vec_articles USING vec0(
31 |       headline_embedding float[384]
32 |     );
33 |   `);
34 |   const insertEmbedding = db.prepare(`
35 |     INSERT INTO vec_articles (rowid, headline_embedding) VALUES
36 |     (cast(? as integer), ?)
37 |   `);
38 | 
39 | 
40 |   for(const {id, headline} of db.prepare('SELECT id, headline FROM articles;').all()) {
41 |     console.log(id);
42 |     const {embeddings} = await ollama.embed({
43 |       model: "all-minilm",
44 |       input: headline
45 |     });
46 | 
47 |     insertEmbedding.run(id, JSON.stringify(embeddings[0]));
48 |   }
49 | }
50 | 
51 | main();
52 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | .pnpm-debug.log*
  9 | 
 10 | # Diagnostic reports (https://nodejs.org/api/report.html)
 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 12 | 
 13 | # Runtime data
 14 | pids
 15 | *.pid
 16 | *.seed
 17 | *.pid.lock
 18 | 
 19 | # Directory for instrumented libs generated by jscoverage/JSCover
 20 | lib-cov
 21 | 
 22 | # Coverage directory used by tools like istanbul
 23 | coverage
 24 | *.lcov
 25 | 
 26 | # nyc test coverage
 27 | .nyc_output
 28 | 
 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 30 | .grunt
 31 | 
 32 | # Bower dependency directory (https://bower.io/)
 33 | bower_components
 34 | 
 35 | # node-waf configuration
 36 | .lock-wscript
 37 | 
 38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 39 | build/Release
 40 | 
 41 | # Dependency directories
 42 | node_modules/
 43 | jspm_packages/
 44 | 
 45 | # Snowpack dependency directory (https://snowpack.dev/)
 46 | web_modules/
 47 | 
 48 | # TypeScript cache
 49 | *.tsbuildinfo
 50 | 
 51 | # Optional npm cache directory
 52 | .npm
 53 | 
 54 | # Optional eslint cache
 55 | .eslintcache
 56 | 
 57 | # Optional stylelint cache
 58 | .stylelintcache
 59 | 
 60 | # Microbundle cache
 61 | .rpt2_cache/
 62 | .rts2_cache_cjs/
 63 | .rts2_cache_es/
 64 | .rts2_cache_umd/
 65 | 
 66 | # Optional REPL history
 67 | .node_repl_history
 68 | 
 69 | # Output of 'npm pack'
 70 | *.tgz
 71 | 
 72 | # Yarn Integrity file
 73 | .yarn-integrity
 74 | 
 75 | # dotenv environment variable files
 76 | .env
 77 | .env.development.local
 78 | .env.test.local
 79 | .env.production.local
 80 | .env.local
 81 | 
 82 | # parcel-bundler cache (https://parceljs.org/)
 83 | .cache
 84 | .parcel-cache
 85 | 
 86 | # Next.js build output
 87 | .next
 88 | out
 89 | 
 90 | # Nuxt.js build / generate output
 91 | .nuxt
 92 | dist
 93 | 
 94 | # Gatsby files
 95 | .cache/
 96 | # Comment in the public line in if your project uses Gatsby and not Next.js
 97 | # https://nextjs.org/blog/next-9-1#public-directory-support
 98 | # public
 99 | 
100 | # vuepress build output
101 | .vuepress/dist
102 | 
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 | 
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 | 
110 | # Serverless directories
111 | .serverless/
112 | 
113 | # FuseBox cache
114 | .fusebox/
115 | 
116 | # DynamoDB Local files
117 | .dynamodb/
118 | 
119 | # TernJS port file
120 | .tern-port
121 | 
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 | 
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 | 
132 | 
133 | # nicar
134 | nicar-2025-schedule.json
135 | nbc-articles-nicar.db
136 | 
137 | *.db
138 | *.csv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NICAR25: AI Starter Pack in JavaScript for Data Journalism
 2 | 
 3 | This repo contains the code and tipsheet for the [*"AI starter pack: JavaScript"*](https://schedules.ire.org/nicar-2025/index.html#1175) class at [NICAR25](https://www.ire.org/training/conferences/nicar-2025/), happening on Friday, March 7 at 2:15pm.
 4 | 
 5 | 
 6 | [SLIDES](https://docs.google.com/presentation/d/1bFHQg6DAoiKJ7G_yGF2mM1X3WWtN6tTGQMbZuvg2678/edit?usp=sharing) • [FEEDBACK FORM](https://forms.gle/ZSp9fzGEidcM9eJT6)
 7 | 
 8 | ## Part 1: Embeddings
 9 | 
10 | - [Simon Willison Embeddings Talk](https://www.youtube.com/watch?time_continue=50&v=ArnMdc-ICCM&source_ve_path=MjM4NTE)
11 | - Embeddings inference:
12 |   - [Ollama](https://ollama.com/)
13 |   - [transformers.js](https://huggingface.co/docs/transformers.js/en/index)
14 |   - [llama.cpp](https://github.com/ggml-org/llama.cpp)
15 |   - [sqlite-lembed](https://github.com/asg017/sqlite-lembed)
16 | - Datasets:
17 |   - [NBC News Headlines Scraper](https://github.com/asg017/nbc-headlines-scraper)
18 |   - [NICAR25 Schedule](https://schedules.ire.org/nicar-2025/), w/ JSON
19 | - [sqlite-vec](https://github.com/asg017/sqlite-vec)
20 |   - [`sqlite-vec` embedding visualizer demo](https://observablehq.com/d/04bc1c1b0de9db7c)
21 | - Additional links:
22 |   - [Nomic Atlas](https://atlas.nomic.ai/) (embeddings visualization tool, product)
23 |   - [Latent Scopes](https://github.com/enjalot/latent-scope) (embedding visualization tool, open source)
24 | 
25 | Embeddings models I recommend (local-only):
26 | 
27 | | Name | #Dimensions | Release Date |
28 | | ---- | ----------- | ------------ |
29 | | [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 384 | ~August 2021 |
30 | | [`mixedbread-ai/mxbai-embed-xsmall-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1) | 384 | ~September 2024 |
31 | | [`nomic-ai/nomic-embed-text-v1.5`](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | 768 | ~February 2024 |
32 | | [`Snowflake/snowflake-arctic-embed-m-v2.0`](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0) | 768 | ~December 2024 |
33 | 
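34 | Any of these plug into the `vec0` tables used throughout this repo, as long as the table's `float[N]` size matches the model's dimension count. Here's a minimal sketch of the whole loop with Ollama and `all-minilm` (not one of the class files; the `vec_items` table and sample input are made up for illustration):
35 | 
36 | ```js
37 | import Database from "better-sqlite3";
38 | import * as sqliteVec from "sqlite-vec";
39 | import { Ollama } from "ollama";
40 | 
41 | const db = new Database(":memory:");
42 | sqliteVec.load(db);
43 | // all-minilm outputs 384-dimension vectors, hence float[384]
44 | db.exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[384]);");
45 | 
46 | const ollama = new Ollama();
47 | const { embeddings } = await ollama.embed({ model: "all-minilm", input: "hello NICAR" });
48 | db.prepare("INSERT INTO vec_items(rowid, embedding) VALUES (1, ?)").run(JSON.stringify(embeddings[0]));
49 | 
50 | // KNN query: the stored vector matches itself with distance 0
51 | const rows = db.prepare("SELECT rowid, distance FROM vec_items WHERE embedding MATCH ? AND k = 5").all(JSON.stringify(embeddings[0]));
52 | console.log(rows);
53 | ```
54 | 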
55 | ## Part 2: Structured Output Generation
56 | 
57 | - [Derek Willis "LLM Extraction Challenge"](https://thescoop.org/archives/2025/01/27/llm-extraction-challenge-fundraising-emails/index.html) ([Repository](https://github.com/dwillis/LLM-Extraction-Challenge))
58 | - [Vercel AI SDK](https://sdk.vercel.ai/)
59 | - [OpenAI's Structured Output Guide](https://platform.openai.com/docs/guides/structured-outputs)
60 | 
61 | 
62 | Small, **local** LLMs I recommend for trying structured output generation:
63 | 
64 | - [`microsoft/Phi-4-mini-instruct`](https://huggingface.co/microsoft/Phi-4-mini-instruct)
65 | - [`meta-llama/Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B)
66 | - [`google/gemma-2-2b`](https://huggingface.co/google/gemma-2-2b)
67 | - [`mistralai/Mistral-Small-24B-Instruct-2501`](https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501)
68 | 
69 | ## Running yourself
70 | 
71 | Download the following software:
72 | - [Node.js](https://nodejs.org/en/download)
73 | - [Ollama](https://ollama.com/download)
74 | - VS Code, with the Jupyter notebooks extension
75 | 
76 | ```bash
77 | # step 1: `git clone` this repository
78 | git clone https://github.com/asg017/nicar25-ai-starter-js.git
79 | 
80 | 
81 | # step 2: with ollama installed, run:
82 | ollama pull all-minilm
83 | ```
84 | 
85 | Open the `nicar25-ai-starter-js` folder in VS Code. Then in the terminal inside VS Code, run:
86 | 
87 | ```bash
88 | npm install
89 | npm run download
90 | ```
91 | 
92 | And you should be set!
--------------------------------------------------------------------------------
/embeddings-nicar.mjs:
--------------------------------------------------------------------------------
  1 | import Database from "better-sqlite3";
  2 | import * as sqliteVec from "sqlite-vec";
  3 | import { Ollama } from 'ollama';
  4 | import { readFileSync } from "node:fs";
  5 | 
  6 | const ollama = new Ollama();
  7 | const NICAR_SCHEDULE_JSON_PATH = 'nicar-2025-schedule.json';
  8 | 
  9 | function importSchedule(db) {
 10 |   const data = JSON.parse(readFileSync(NICAR_SCHEDULE_JSON_PATH));
 11 |   const insertSession = db.prepare(`
 12 |     INSERT INTO sessions (session_id, session_title, description, session_type, start_time, end_time, duration_mins, skill_level, room_name, day)
 13 |     VALUES (:session_id, :session_title, :description, :session_type, :start_time, :end_time, :duration_mins, :skill_level, :room_name, :day);
 14 |   `);
 15 | 
 16 |   const insertSpeaker = db.prepare(`
 17 |     INSERT INTO speakers (session_id, first_name, last_name, affiliation, bio)
 18 |     VALUES (:session_id, :first_name, :last_name, :affiliation, :bio);
 19 |   `);
 20 | 
 21 |   for(const session of data) {
 22 |     insertSession.run({...session, room_name: session.room.room_name});
 23 |     for(const speaker of session.speakers) {
 24 |       insertSpeaker.run({
 25 |         session_id: session.session_id,
 26 |         first_name: speaker.first,
 27 |         last_name: speaker.last,
 28 |         affiliation: speaker.affiliation,
 29 |         bio: speaker.bio
 30 |       });
 31 |     }
 32 |   }
 33 | }
 34 | 
 35 | async function generateEmbeddings(db) {
 36 |   db.exec(`
 37 |     CREATE VIRTUAL TABLE IF NOT EXISTS vec_sessions USING vec0(
 38 |       description_embedding float[384]
 39 |     );
 40 | 
 41 |     CREATE VIRTUAL TABLE IF NOT EXISTS vec_speakers USING vec0(
 42 |       bio_embedding float[384]
 43 |     );
 44 |   `);
 45 | 
 46 |   const insertSessionEmbedding = db.prepare(`
 47 |     INSERT INTO vec_sessions (rowid, description_embedding) VALUES
 48 |     (cast(? as integer), ?)
 49 |   `);
 50 |   const insertSpeakerEmbedding = db.prepare(`
 51 |     INSERT INTO vec_speakers (rowid, bio_embedding) VALUES
 52 |     (cast(? as integer), ?)
 53 |   `);
 54 | 
 55 |   for(const {session_id, description} of db.prepare('SELECT session_id, description FROM sessions').all()) {
 56 |     // skip sessions with empty descriptions
 57 |     if(!description) {
 58 |       continue;
 59 |     }
 60 |     console.log(session_id);
 61 |     const {embeddings} = await ollama.embed({
 62 |       model: "all-minilm",
 63 |       input: description
 64 |     });
 65 |     insertSessionEmbedding.run(session_id, JSON.stringify(embeddings[0]));
 66 |   }
 67 | 
 68 |   for(const {speaker_id, bio} of db.prepare('SELECT speaker_id, bio FROM speakers').all()) {
 69 |     // skip speakers with empty bios
 70 |     if(!bio) {
 71 |       continue;
 72 |     }
 73 |     console.log(speaker_id);
 74 |     const {embeddings} = await ollama.embed({
 75 |       model: "all-minilm",
 76 |       input: bio
 77 |     });
 78 |     insertSpeakerEmbedding.run(speaker_id, JSON.stringify(embeddings[0]));
 79 |   }
 80 | }
 81 | 
 82 | async function build(path) {
 83 |   const db = new Database(":memory:");
 84 |   sqliteVec.load(db);
 85 | 
 86 |   db.exec(`
 87 |     CREATE TABLE IF NOT EXISTS sessions (
 88 |       session_id INTEGER PRIMARY KEY,
 89 |       session_title TEXT,
 90 |       description TEXT,
 91 |       session_type TEXT,
 92 |       start_time TEXT,
 93 |       end_time TEXT,
 94 |       duration_mins INTEGER,
 95 |       evergreen BOOLEAN,
 96 |       skill_level TEXT,
 97 |       room_name TEXT,
 98 |       day TEXT
 99 |     );
100 | 
101 |     CREATE TABLE IF NOT EXISTS speakers (
102 |       speaker_id INTEGER PRIMARY KEY AUTOINCREMENT,
103 |       session_id INTEGER,
104 |       first_name TEXT,
105 |       last_name TEXT,
106 |       affiliation TEXT,
107 |       bio TEXT,
108 |       FOREIGN KEY (session_id) REFERENCES sessions(session_id)
109 |     );
110 |   `);
111 |   importSchedule(db);
112 |   await generateEmbeddings(db);
113 |   db.prepare('vacuum into ?').run(path);
114 |   console.log(`Successfully saved database to ${path}`);
115 | }
116 | 
117 | async function searchSessions(path, query) {
118 |   console.log(`Searching for NICAR sessions related to: ${query}...`);
119 |   const db = new Database(path);
120 |   sqliteVec.load(db);
121 |   const result = await ollama.embed({
122 |     model: "all-minilm",
123 |     input: query
124 |   });
125 |   const queryEmbedding = result.embeddings[0];
126 |   const knnQuery = db.prepare(`
127 |     SELECT
128 |       rowid,
129 |       distance,
130 |       sessions.description
131 |     FROM vec_sessions
132 |     LEFT JOIN sessions ON sessions.session_id = vec_sessions.rowid
133 |     WHERE description_embedding MATCH ?
134 |       AND k = 10;
135 |   `);
136 | 
137 |   for(const {rowid, distance, description} of knnQuery.all(JSON.stringify(queryEmbedding))) {
138 |     console.log(rowid, distance, description);
139 |   }
140 | }
141 | async function searchSpeakers(path, query) {
142 |   // TODO try yourself!
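143 |   // Hint: mirror searchSessions() above, swapping in the vec_speakers table,
144 |   // the bio_embedding column, and speakers.bio. One worked answer lives in embeddings-nicar-solution.mjs.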
145 | }
146 | 
147 | async function main() {
148 |   switch( process.argv[2] ) {
149 |     case "build": {
150 |       await build(process.argv[3]);
151 |       break;
152 |     }
153 |     case "search-sessions":{
154 |       await searchSessions(process.argv[3], process.argv[4]);
155 |       break;
156 |     }
157 |     case "search-speakers":{
158 |       await searchSpeakers(process.argv[3], process.argv[4]);
159 |       break;
160 |     }
161 |     default: {
162 |       console.log(
163 |         `Usage:
164 |           node embeddings-nicar.mjs build <path>
165 |           node embeddings-nicar.mjs search-sessions <path> <query>
166 |           node embeddings-nicar.mjs search-speakers <path> <query>
167 |         `
168 |       );
169 |     }
170 |   }
171 | }
172 | 
173 | main();
174 | 
--------------------------------------------------------------------------------
/structured.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Part 2: Structured Output Generation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Second part of the \"AI Starter Pack JavaScript\" NICAR25 class - let's try out structured output generation!\n",
 15 |     "\n",
 16 |     "Structured output generation involves giving an LLM a JSON schema that it will follow when generating output. Instead of replying back with English or \"Sure thing!\", it will return parse-able JSON that follows the exact schema you care about. \n",
 17 |     "\n",
 18 |     "Structured output generation is great for:\n",
 19 |     "\n",
 20 |     "- Natural language processing\n",
 21 |     "- Extracting structured data out of super messy text\n",
 22 |     "- Fake data generation\n",
 23 |     "\n",
 24 |     "Though I wouldn't trust structured outputs for *real* data, like 'Return the FIPS code for Kern County' or \"return a list of current Lakers players and their season scores\"."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "## Loading Environment variables for OpenAI\n",
 32 |     "\n",
 33 |     "Copy+paste the `.env.sample` file to a new `.env` file, then paste in the OpenAI key that I will share during the class."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import \"jsr:@std/dotenv/load\";"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Sample: Hitting the OpenAI API with the Vercel AI SDK"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import { generateText } from \"npm:ai\";\n",
 59 |     "import { openai } from \"npm:@ai-sdk/openai\";\n",
 60 |     "\n",
 61 |     "const {text} = await generateText({\n",
 62 |     "  model: openai(\"gpt-4o-mini\"),\n",
 63 |     "  prompt: \"Short haiku about a lonely mountain\",\n",
 64 |     "});\n",
 65 |     "console.log(text);"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "// How do we get JSON back? Can we just ask for it?\n",
 75 |     "const {text} = await generateText({\n",
 76 |     "  model: openai(\"gpt-4o-mini\"),\n",
 77 |     "  prompt: \"JSON of a person with name and age\",\n",
 78 |     "});\n",
 79 |     "console.log(text);"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "JSON.parse(text); // :("
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "## Solution: `generateObject()` with `zod`"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "import { generateObject } from \"npm:ai\";\n",
105 |     "import { z } from \"npm:zod\";\n",
106 |     "\n",
107 |     "const PersonSchema = z.object({\n",
108 |     "  name: z.string(),\n",
109 |     "  age: z.number(),\n",
110 |     "});\n",
111 |     "\n",
112 |     "const {object} = await generateObject({\n",
113 |     "  model: openai(\"gpt-4o-mini\"),\n",
114 |     "  prompt: \"JSON of a person with name and age\",\n",
115 |     "  schema: PersonSchema\n",
116 |     "});\n",
117 |     "\n",
118 |     "console.log(object);\n",
119 |     "console.log(object.name);\n",
120 |     "console.log(object.age);"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "## Now let's cook with campaign emails"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "import {Database} from \"jsr:@db/sqlite\";\n",
137 |     "const db = new Database(\"dwillis-emails.db\");\n",
138 |     "\n",
139 |     "for (const { rowid, body } of db.sql`select rowid, body from emails_raw limit 10`) {\n",
140 |     "  console.log(rowid, body.substring(0, 100));\n",
141 |     "}"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "const PROMPT = `\n",
151 |     "  Parse the following political email and return a JSON object with the following schema:\n",
152 |     "  \n",
153 |     "  \"committee\": Name of the committee in the disclaimer that begins with 'Paid for by' \\\n",
154 |     "  but does not include 'Paid for by', the committee address, or the treasurer name. \n",
155 |     "  Should be null if not present.\n",
156 |     "  \n",
157 |     "  \"sender\": Name of the person, if any, mentioned as the author of the email. \n",
158 |     "  Should be null if not present.\n",
159 |     "  \n",
160 |     "  Do not include any other text, no yapping.\n",
161 |     "`;\n",
162 |     "\n",
163 |     "const EmailSchema = z.object({\n",
164 |     "  committee: z.string().nullable(),\n",
165 |     "  sender: z.string().nullable(),\n",
166 |     "});\n",
167 |     "\n",
168 |     "for (const { rowid, body } of db.sql`select rowid, body from emails_raw limit 3`) {\n",
169 |     "  const { object } = await generateObject({\n",
170 |     "    model: openai(\"gpt-4o-mini\"),\n",
171 |     "    prompt: PROMPT + body,\n",
172 |     "    schema: EmailSchema,\n",
173 |     "  });\n",
174 |     "  console.log(object);\n",
175 |     "}\n"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": []
184 |   }
185 |  ],
186 |  "metadata": {
187 |   "kernelspec": {
188 |    "display_name": "Deno",
189 |    "language": "typescript",
190 |    "name": "deno"
191 |   },
192 |   "language_info": {
193 |    "codemirror_mode": "typescript",
194 |    "file_extension": ".ts",
195 |    "mimetype": "text/x.typescript",
196 |    "name": "typescript",
197 |    "nbconvert_exporter": "script",
198 |    "pygments_lexer": "typescript",
199 |    "version": "5.6.2"
200 |   }
201 |  },
202 |  "nbformat": 4,
203 |  "nbformat_minor": 2
204 | }
--------------------------------------------------------------------------------