├── Makefile
├── .env.sample
├── smoke-test.mjs
├── embeddings-nicar-solution.mjs
├── embeddings-nbc-search.mjs
├── LICENSE
├── package.json
├── embeddings-nbc-build.mjs
├── .gitignore
├── README.md
├── embeddings-nicar.mjs
└── structured.ipynb

/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | .PHONY: deps
3 | deps:
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | # Copy+paste this file to .env and fill in the proper values!
2 | OPENAI_API_KEY=
--------------------------------------------------------------------------------
/smoke-test.mjs:
--------------------------------------------------------------------------------
 1 | //import * as x from "@ai-sdk/openai";
 2 | import * as x1 from "@huggingface/transformers";
 3 | import * as x2 from "ai";
 4 | import * as x3 from "ollama";
 5 | import * as x4 from "zod";
 6 | 
 7 | import * as x5 from "better-sqlite3";
 8 | import * as x6 from "sqlite-vec";
 9 | 
10 | console.log("NICAR25 Everything works!");
--------------------------------------------------------------------------------
/embeddings-nicar-solution.mjs:
--------------------------------------------------------------------------------
 1 | import Database from "better-sqlite3";
 2 | import * as sqliteVec from "sqlite-vec";
 3 | import { Ollama } from 'ollama';
 4 | 
 5 | const ollama = new Ollama();
 6 | 
 7 | // solution to the searchSpeakers() exercise in embeddings-nicar.mjs!
 8 | async function searchSpeakers(path, query) {
 9 |   console.log(`Searching for NICAR speakers related to: ${query}...`);
10 |   const db = new Database(path);
11 |   sqliteVec.load(db);
12 |   const result = await ollama.embed({
13 |     model: "all-minilm",
14 |     input: query
15 |   });
16 |   const queryEmbedding = result.embeddings[0];
17 |   const knnQuery = db.prepare(`
18 |     SELECT
19 |       rowid,
20 |       distance,
21 |       speakers.bio
22 |     FROM vec_speakers
23 |     LEFT JOIN speakers ON speakers.speaker_id = vec_speakers.rowid
24 |     WHERE bio_embedding MATCH ?
25 |       AND k = 10;
26 |   `);
27 | 
28 |   console.log(query);
29 |   for(const {rowid, distance, bio} of knnQuery.all(JSON.stringify(queryEmbedding))) {
30 |     console.log(rowid, distance, bio);
31 |   }
32 | }
--------------------------------------------------------------------------------
/embeddings-nbc-search.mjs:
--------------------------------------------------------------------------------
 1 | import Database from "better-sqlite3";
 2 | import * as sqliteVec from "sqlite-vec";
 3 | import { Ollama } from 'ollama';
 4 | 
 5 | const ollama = new Ollama();
 6 | 
 7 | async function main() {
 8 | 
 9 |   const db = new Database("nbc-articles-nicar.db");
10 |   sqliteVec.load(db);
11 | 
12 |   const query = "reproductive rights";
13 | 
14 |   const result = await ollama.embed({
15 |     model: "all-minilm",
16 |     input: query
17 |   });
18 |   const queryEmbedding = result.embeddings[0];
19 |   const knnQuery = db.prepare(`
20 |     SELECT
21 |       rowid,
22 |       distance,
23 |       articles.headline
24 |     FROM vec_articles
25 |     LEFT JOIN articles ON articles.id = vec_articles.rowid
26 |     WHERE headline_embedding MATCH ?
27 |       AND k = 10;
28 |   `);
29 | 
30 |   console.log(query);
31 |   for(const {rowid, distance, headline} of knnQuery.all(JSON.stringify(queryEmbedding))) {
32 |     console.log(rowid, distance, headline);
33 |   }
34 | }
35 | 
36 | main();
37 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Alex Garcia
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "nicar25-ai-starter",
 3 |   "version": "1.0.0",
 4 |   "description": "",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "download:nbc": "curl -o nbc-articles-nicar.db 'https://delicate-dream-7407.fly.storage.tigris.dev/nicar25/nbc-articles-nicar.db'",
 8 |     "download:nicar25-schedule": "curl -o nicar-2025-schedule.json 'https://schedules.ire.org/nicar-2025/nicar-2025-schedule.json'",
 9 |     "download:emails": "curl -o dwillis-emails.db 'https://delicate-dream-7407.fly.storage.tigris.dev/nicar25/dwillis-emails.db'",
10 |     "download": "npm run download:nicar25-schedule && npm run download:nbc && npm run download:emails",
11 |     "clean": "rm dwillis-emails.db nicar-2025-schedule.json nbc-articles-nicar.db"
12 |   },
13 |   "repository": {
14 |     "type": "git",
15 |     "url": "git+https://github.com/asg017/nicar25-ai-starter-js.git"
16 |   },
17 |   "keywords": [],
18 |   "author": "",
19 |   "license": "MIT",
20 |   "bugs": {
21 |     "url": "https://github.com/asg017/nicar25-ai-starter-js/issues"
22 |   },
23 |   "homepage": "https://github.com/asg017/nicar25-ai-starter-js#readme",
24 |   "dependencies": {
25 |     "@ai-sdk/openai": "^1.1.10",
26 |     "@huggingface/transformers": "^3.3.3",
27 |     "ai": "^4.1.36",
28 |     "better-sqlite3": "^11.8.1",
29 |     "deno": "^2.1.9",
30 |     "ollama": "^0.5.13",
31 |     "sqlite-vec": "^0.1.7-alpha.2",
32 |     "zod": "^3.24.2"
33 |   }
34 | }
35 | 
--------------------------------------------------------------------------------
/embeddings-nbc-build.mjs:
--------------------------------------------------------------------------------
 1 | import Database from "better-sqlite3";
 2 | import * as sqliteVec from "sqlite-vec";
 3 | import { Ollama } from 'ollama';
 4 | 
 5 | const ollama = new Ollama();
 6 | 
 7 | async function main() {
 8 | 
 9 |   // Step 1: Query headlines from SQLite database
10 |   const db = new Database("nbc-articles-nicar.db");
11 |   sqliteVec.load(db);
12 | 
13 |   for(const {id, headline} of db.prepare('SELECT id, headline FROM articles LIMIT 10;').all()) {
14 |     console.log(id, headline);
15 |   }
16 |   // comment out this `return` to continue to step 2
17 |   return;
18 | 
19 |   // Step 2: Try out embeddings with ollama
20 |   const {embeddings} = await ollama.embed({
21 |     model: "all-minilm",
22 |     input: "This is a test"
23 |   });
24 |   console.log(embeddings);
25 |   // comment out this `return` to continue to step 3
26 |   return;
27 | 
28 |   // Step 3: Save these embeddings into a SQLite database
29 |   db.exec(`
30 |     CREATE VIRTUAL TABLE IF NOT EXISTS vec_articles USING vec0(
31 |       headline_embedding float[384]
32 |     );
33 |   `);
34 |   const insertEmbedding = db.prepare(`
35 |     INSERT INTO vec_articles (rowid, headline_embedding) VALUES
36 |     (cast(? as integer), ?)
37 |   `);
38 | 
39 | 
40 |   for(const {id, headline} of db.prepare('SELECT id, headline FROM articles;').all()) {
41 |     console.log(id);
42 |     const {embeddings} = await ollama.embed({
43 |       model: "all-minilm",
44 |       input: headline
45 |     });
46 | 
47 |     insertEmbedding.run(id, JSON.stringify(embeddings[0]));
48 |   }
49 | }
50 | 
51 | main();
52 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | .pnpm-debug.log*
  9 | 
 10 | # Diagnostic reports (https://nodejs.org/api/report.html)
 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 12 | 
 13 | # Runtime data
 14 | pids
 15 | *.pid
 16 | *.seed
 17 | *.pid.lock
 18 | 
 19 | # Directory for instrumented libs generated by jscoverage/JSCover
 20 | lib-cov
 21 | 
 22 | # Coverage directory used by tools like istanbul
 23 | coverage
 24 | *.lcov
 25 | 
 26 | # nyc test coverage
 27 | .nyc_output
 28 | 
 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 30 | .grunt
 31 | 
 32 | # Bower dependency directory (https://bower.io/)
 33 | bower_components
 34 | 
 35 | # node-waf configuration
 36 | .lock-wscript
 37 | 
 38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 39 | build/Release
 40 | 
 41 | # Dependency directories
 42 | node_modules/
 43 | jspm_packages/
 44 | 
 45 | # Snowpack dependency directory (https://snowpack.dev/)
 46 | web_modules/
 47 | 
 48 | # TypeScript cache
 49 | *.tsbuildinfo
 50 | 
 51 | # Optional npm cache directory
 52 | .npm
 53 | 
 54 | # Optional eslint cache
 55 | .eslintcache
 56 | 
 57 | # Optional stylelint cache
 58 | .stylelintcache
 59 | 
 60 | # Microbundle cache
 61 | .rpt2_cache/
 62 | .rts2_cache_cjs/
 63 | .rts2_cache_es/
 64 | .rts2_cache_umd/
 65 | 
 66 | # Optional REPL history
 67 | .node_repl_history
 68 | 
 69 | # Output of 'npm pack'
 70 | *.tgz
 71 | 
 72 | # Yarn Integrity file
 73 | .yarn-integrity
 74 | 
 75 | # dotenv environment variable files
 76 | .env
 77 | .env.development.local
 78 | .env.test.local
 79 | .env.production.local
 80 | .env.local
 81 | 
 82 | # parcel-bundler cache (https://parceljs.org/)
 83 | .cache
 84 | .parcel-cache
 85 | 
 86 | # Next.js build output
 87 | .next
 88 | out
 89 | 
 90 | # Nuxt.js build / generate output
 91 | .nuxt
 92 | dist
 93 | 
 94 | # Gatsby files
 95 | .cache/
 96 | # Comment in the public line in if your project uses Gatsby and not Next.js
 97 | # https://nextjs.org/blog/next-9-1#public-directory-support
 98 | # public
 99 | 
100 | # vuepress build output
101 | .vuepress/dist
102 | 
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 | 
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 | 
110 | # Serverless directories
111 | .serverless/
112 | 
113 | # FuseBox cache
114 | .fusebox/
115 | 
116 | # DynamoDB Local files
117 | .dynamodb/
118 | 
119 | # TernJS port file
120 | .tern-port
121 | 
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 | 
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 | 
132 | 
133 | # nicar
134 | nicar-2025-schedule.json
135 | nbc-articles-nicar.db
136 | 
137 | *.db
138 | *.csv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NICAR25: AI Starter Pack in JavaScript for Data Journalism
 2 | 
 3 | This repo contains the code and tipsheet for the [*"AI starter pack: JavaScript"*](https://schedules.ire.org/nicar-2025/index.html#1175) class at [NICAR25](https://www.ire.org/training/conferences/nicar-2025/), happening on Friday, March 7 at 2:15pm.
 4 | 
 5 | 
 6 | [SLIDES](https://docs.google.com/presentation/d/1bFHQg6DAoiKJ7G_yGF2mM1X3WWtN6tTGQMbZuvg2678/edit?usp=sharing) • [FEEDBACK FORM](https://forms.gle/ZSp9fzGEidcM9eJT6)
 7 | 
 8 | ## Part 1: Embeddings
 9 | 
10 | - [Simon Willison Embeddings Talk](https://www.youtube.com/watch?time_continue=50&v=ArnMdc-ICCM&source_ve_path=MjM4NTE)
11 | - Embeddings inference:
12 |   - [Ollama](https://ollama.com/)
13 |   - [transformers.js](https://huggingface.co/docs/transformers.js/en/index)
14 |   - [llama.cpp](https://github.com/ggml-org/llama.cpp)
15 |   - [sqlite-lembed](https://github.com/asg017/sqlite-lembed)
16 | - Datasets:
17 |   - [NBC News Headlines Scraper](https://github.com/asg017/nbc-headlines-scraper)
18 |   - [NICAR25 Schedule](https://schedules.ire.org/nicar-2025/), w/ JSON
19 | - [sqlite-vec](https://github.com/asg017/sqlite-vec)
20 |   - [`sqlite-vec` embedding visualizer demo](https://observablehq.com/d/04bc1c1b0de9db7c)
21 | - Additional links:
22 |   - [Nomic Atlas](https://atlas.nomic.ai/) (embeddings visualization tool, product)
23 |   - [Latent Scopes](https://github.com/enjalot/latent-scope) (embedding visualization tool, open source)
24 | 
25 | Embeddings models I recommend (local-only):
26 | 
27 | | Name | #Dimensions | Release Date |
28 | | ---- | ----------- | ------------ |
29 | | [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 384 | ~August 2021 |
30 | | [`mixedbread-ai/mxbai-embed-xsmall-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1) | 384 | ~September 2024 |
31 | | [`nomic-ai/nomic-embed-text-v1.5`](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | 768 | ~February 2024 |
32 | | [`Snowflake/snowflake-arctic-embed-m-v2.0`](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0) | 768 | ~December 2024 |
33 | 
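34 | Any of these plug into the `vec0` tables used throughout this repo, as long as the table's `float[N]` size matches the model's dimension count. Here's a minimal sketch of the whole loop with Ollama and `all-minilm` (not one of the class files; the `vec_items` table and sample input are made up for illustration):
35 | 
36 | ```js
37 | import Database from "better-sqlite3";
38 | import * as sqliteVec from "sqlite-vec";
39 | import { Ollama } from "ollama";
40 | 
41 | const db = new Database(":memory:");
42 | sqliteVec.load(db);
43 | // all-minilm outputs 384-dimension vectors, hence float[384]
44 | db.exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[384]);");
45 | 
46 | const ollama = new Ollama();
47 | const { embeddings } = await ollama.embed({ model: "all-minilm", input: "hello NICAR" });
48 | db.prepare("INSERT INTO vec_items(rowid, embedding) VALUES (1, ?)").run(JSON.stringify(embeddings[0]));
49 | 
50 | // KNN query: the stored vector matches itself with distance 0
51 | const rows = db.prepare("SELECT rowid, distance FROM vec_items WHERE embedding MATCH ? AND k = 5").all(JSON.stringify(embeddings[0]));
52 | console.log(rows);
53 | ```
54 | 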
55 | ## Part 2: Structured Output Generation
56 | 
57 | - [Derek Willis "LLM Extraction Challenge"](https://thescoop.org/archives/2025/01/27/llm-extraction-challenge-fundraising-emails/index.html) ([Repository](https://github.com/dwillis/LLM-Extraction-Challenge))
58 | - [Vercel AI SDK](https://sdk.vercel.ai/)
59 | - [OpenAI's Structured Output Guide](https://platform.openai.com/docs/guides/structured-outputs)
60 | 
61 | 
62 | Small, **local** LLMs I recommend for trying structured output generation:
63 | 
64 | - [`microsoft/Phi-4-mini-instruct`](https://huggingface.co/microsoft/Phi-4-mini-instruct)
65 | - [`meta-llama/Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B)
66 | - [`google/gemma-2-2b`](https://huggingface.co/google/gemma-2-2b)
67 | - [`mistralai/Mistral-Small-24B-Instruct-2501`](https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501)
68 | 
69 | ## Running yourself
70 | 
71 | Download the following software:
72 | - [Node.js](https://nodejs.org/en/download)
73 | - [Ollama](https://ollama.com/download)
74 | - VS Code, with the Jupyter notebooks extension
75 | 
76 | ```bash
77 | # step 1: `git clone` this repository
78 | git clone https://github.com/asg017/nicar25-ai-starter-js.git
79 | 
80 | 
81 | # step 2: with ollama installed, run:
82 | ollama pull all-minilm
83 | ```
84 | 
85 | Open the `nicar25-ai-starter-js` folder in VS Code. Then in the terminal inside VS Code, run:
86 | 
87 | ```bash
88 | npm install
89 | npm run download
90 | ```
91 | 
92 | And you should be set!
--------------------------------------------------------------------------------
/embeddings-nicar.mjs:
--------------------------------------------------------------------------------
  1 | import Database from "better-sqlite3";
  2 | import * as sqliteVec from "sqlite-vec";
  3 | import { Ollama } from 'ollama';
  4 | import { readFileSync } from "node:fs";
  5 | 
  6 | const ollama = new Ollama();
  7 | const NICAR_SCHEDULE_JSON_PATH = 'nicar-2025-schedule.json';
  8 | 
  9 | function importSchedule(db) {
 10 |   const data = JSON.parse(readFileSync(NICAR_SCHEDULE_JSON_PATH));
 11 |   const insertSession = db.prepare(`
 12 |     INSERT INTO sessions (session_id, session_title, description, session_type, start_time, end_time, duration_mins, skill_level, room_name, day)
 13 |     VALUES (:session_id, :session_title, :description, :session_type, :start_time, :end_time, :duration_mins, :skill_level, :room_name, :day);
 14 |   `);
 15 | 
 16 |   const insertSpeaker = db.prepare(`
 17 |     INSERT INTO speakers (session_id, first_name, last_name, affiliation, bio)
 18 |     VALUES (:session_id, :first_name, :last_name, :affiliation, :bio);
 19 |   `);
 20 | 
 21 |   for(const session of data) {
 22 |     insertSession.run({...session, room_name: session.room.room_name});
 23 |     for(const speaker of session.speakers) {
 24 |       insertSpeaker.run({
 25 |         session_id: session.session_id,
 26 |         first_name: speaker.first,
 27 |         last_name: speaker.last,
 28 |         affiliation: speaker.affiliation,
 29 |         bio: speaker.bio
 30 |       });
 31 |     }
 32 |   }
 33 | }
 34 | 
 35 | async function generateEmbeddings(db) {
 36 |   db.exec(`
 37 |     CREATE VIRTUAL TABLE IF NOT EXISTS vec_sessions USING vec0(
 38 |       description_embedding float[384]
 39 |     );
 40 | 
 41 |     CREATE VIRTUAL TABLE IF NOT EXISTS vec_speakers USING vec0(
 42 |       bio_embedding float[384]
 43 |     );
 44 |   `);
 45 | 
 46 |   const insertSessionEmbedding = db.prepare(`
 47 |     INSERT INTO vec_sessions (rowid, description_embedding) VALUES
 48 |     (cast(? as integer), ?)
 49 |   `);
 50 |   const insertSpeakerEmbedding = db.prepare(`
 51 |     INSERT INTO vec_speakers (rowid, bio_embedding) VALUES
 52 |     (cast(? as integer), ?)
 53 |   `);
 54 | 
 55 |   for(const {session_id, description} of db.prepare('SELECT session_id, description FROM sessions').all()) {
 56 |     // skip sessions with empty descriptions
 57 |     if(!description) {
 58 |       continue;
 59 |     }
 60 |     console.log(session_id);
 61 |     const {embeddings} = await ollama.embed({
 62 |       model: "all-minilm",
 63 |       input: description
 64 |     });
 65 |     insertSessionEmbedding.run(session_id, JSON.stringify(embeddings[0]));
 66 |   }
 67 | 
 68 |   for(const {speaker_id, bio} of db.prepare('SELECT speaker_id, bio FROM speakers').all()) {
 69 |     // skip speakers with empty bios
 70 |     if(!bio) {
 71 |       continue;
 72 |     }
 73 |     console.log(speaker_id);
 74 |     const {embeddings} = await ollama.embed({
 75 |       model: "all-minilm",
 76 |       input: bio
 77 |     });
 78 |     insertSpeakerEmbedding.run(speaker_id, JSON.stringify(embeddings[0]));
 79 |   }
 80 | }
 81 | 
 82 | async function build(path) {
 83 |   const db = new Database(":memory:");
 84 |   sqliteVec.load(db);
 85 | 
 86 |   db.exec(`
 87 |     CREATE TABLE IF NOT EXISTS sessions (
 88 |       session_id INTEGER PRIMARY KEY,
 89 |       session_title TEXT,
 90 |       description TEXT,
 91 |       session_type TEXT,
 92 |       start_time TEXT,
 93 |       end_time TEXT,
 94 |       duration_mins INTEGER,
 95 |       evergreen BOOLEAN,
 96 |       skill_level TEXT,
 97 |       room_name TEXT,
 98 |       day TEXT
 99 |     );
100 | 
101 |     CREATE TABLE IF NOT EXISTS speakers (
102 |       speaker_id INTEGER PRIMARY KEY AUTOINCREMENT,
103 |       session_id INTEGER,
104 |       first_name TEXT,
105 |       last_name TEXT,
106 |       affiliation TEXT,
107 |       bio TEXT,
108 |       FOREIGN KEY (session_id) REFERENCES sessions(session_id)
109 |     );
110 |   `);
111 |   importSchedule(db);
112 |   await generateEmbeddings(db);
113 |   db.prepare('vacuum into ?').run(path);
114 |   console.log(`Successfully saved database to ${path}`);
115 | }
116 | 
117 | async function searchSessions(path, query) {
118 |   console.log(`Searching for NICAR sessions related to: ${query}...`);
119 |   const db = new Database(path);
120 |   sqliteVec.load(db);
121 |   const result = await ollama.embed({
122 |     model: "all-minilm",
123 |     input: query
124 |   });
125 |   const queryEmbedding = result.embeddings[0];
126 |   const knnQuery = db.prepare(`
127 |     SELECT
128 |       rowid,
129 |       distance,
130 |       sessions.description
131 |     FROM vec_sessions
132 |     LEFT JOIN sessions ON sessions.session_id = vec_sessions.rowid
133 |     WHERE description_embedding MATCH ?
134 |       AND k = 10;
135 |   `);
136 | 
137 |   for(const {rowid, distance, description} of knnQuery.all(JSON.stringify(queryEmbedding))) {
138 |     console.log(rowid, distance, description);
139 |   }
140 | }
141 | async function searchSpeakers(path, query) {
142 |   // TODO try yourself!
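143 |   // Hint: mirror searchSessions() above, swapping in the vec_speakers table,
144 |   // the bio_embedding column, and speakers.bio. One worked answer lives in embeddings-nicar-solution.mjs.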
145 | }
146 | 
147 | async function main() {
148 |   switch( process.argv[2] ) {
149 |     case "build": {
150 |       await build(process.argv[3]);
151 |       break;
152 |     }
153 |     case "search-sessions":{
154 |       await searchSessions(process.argv[3], process.argv[4]);
155 |       break;
156 |     }
157 |     case "search-speakers":{
158 |       await searchSpeakers(process.argv[3], process.argv[4]);
159 |       break;
160 |     }
161 |     default: {
162 |       console.log(
163 |         `Usage:
164 |           node embeddings-nicar.mjs build <path>
165 |           node embeddings-nicar.mjs search-sessions <path> <query>
166 |           node embeddings-nicar.mjs search-speakers <path> <query>
167 |         `
168 |       );
169 |     }
170 |   }
171 | }
172 | 
173 | main();
174 | 
--------------------------------------------------------------------------------
/structured.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Part 2: Structured Output Generation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Second part of the \"AI Starter Pack JavaScript\" NICAR25 class - let's try out structured output generation!\n",
 15 |     "\n",
 16 |     "Structured output generation involves giving an LLM a JSON schema that it will follow when generating output. Instead of replying back with English or \"Sure thing!\", it will return parse-able JSON that follows the exact schema you care about. \n",
 17 |     "\n",
 18 |     "Structured output generation is great for:\n",
 19 |     "\n",
 20 |     "- Natural language processing\n",
 21 |     "- Extracting structured data out of super messy text\n",
 22 |     "- Fake data generation\n",
 23 |     "\n",
 24 |     "Though I wouldn't trust structured outputs for *real* data, like 'Return the FIPS code for Kern County' or \"return a list of current Lakers players and their season scores\"."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "## Loading Environment variables for OpenAI\n",
 32 |     "\n",
 33 |     "Copy+paste the `.env.sample` file to a new `.env` file, then paste in the OpenAI key that I will share during the class."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import \"jsr:@std/dotenv/load\";"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Sample: Hitting the OpenAI API with the Vercel AI SDK"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import { generateText } from \"npm:ai\";\n",
 59 |     "import { openai } from \"npm:@ai-sdk/openai\";\n",
 60 |     "\n",
 61 |     "const {text} = await generateText({\n",
 62 |     "  model: openai(\"gpt-4o-mini\"),\n",
 63 |     "  prompt: \"Short haiku about a lonely mountain\",\n",
 64 |     "});\n",
 65 |     "console.log(text);"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "// How do we get JSON back? Can we just ask for it?\n",
 75 |     "const {text} = await generateText({\n",
 76 |     "  model: openai(\"gpt-4o-mini\"),\n",
 77 |     "  prompt: \"JSON of a person with name and age\",\n",
 78 |     "});\n",
 79 |     "console.log(text);"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "JSON.parse(text); // :("
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "## Solution: `generateObject()` with `zod`"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "import { generateObject } from \"npm:ai\";\n",
105 |     "import { z } from \"npm:zod\";\n",
106 |     "\n",
107 |     "const PersonSchema = z.object({\n",
108 |     "  name: z.string(),\n",
109 |     "  age: z.number(),\n",
110 |     "});\n",
111 |     "\n",
112 |     "const {object} = await generateObject({\n",
113 |     "  model: openai(\"gpt-4o-mini\"),\n",
114 |     "  prompt: \"JSON of a person with name and age\",\n",
115 |     "  schema: PersonSchema\n",
116 |     "});\n",
117 |     "\n",
118 |     "console.log(object);\n",
119 |     "console.log(object.name);\n",
120 |     "console.log(object.age);"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "## Now let's cook with campaign emails"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "import {Database} from \"jsr:@db/sqlite\";\n",
137 |     "const db = new Database(\"dwillis-emails.db\");\n",
138 |     "\n",
139 |     "for (const { rowid, body } of db.sql`select rowid, body from emails_raw limit 10`) {\n",
140 |     "  console.log(rowid, body.substring(0, 100));\n",
141 |     "}"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "const PROMPT = `\n",
151 |     "  Parse the following political email and return a JSON object with the following schema:\n",
152 |     "  \n",
153 |     "  \"committee\": Name of the committee in the disclaimer that begins with 'Paid for by' \\\n",
154 |     "  but does not include 'Paid for by', the committee address, or the treasurer name. \n",
155 |     "  Should be null if not present.\n",
156 |     "  \n",
157 |     "  \"sender\": Name of the person, if any, mentioned as the author of the email. \n",
158 |     "  Should be null if not present.\n",
159 |     "  \n",
160 |     "  Do not include any other text, no yapping.\n",
161 |     "`;\n",
162 |     "\n",
163 |     "const EmailSchema = z.object({\n",
164 |     "  committee: z.string().nullable(),\n",
165 |     "  sender: z.string().nullable(),\n",
166 |     "});\n",
167 |     "\n",
168 |     "for (const { rowid, body } of db.sql`select rowid, body from emails_raw limit 3`) {\n",
169 |     "  const { object } = await generateObject({\n",
170 |     "    model: openai(\"gpt-4o-mini\"),\n",
171 |     "    prompt: PROMPT + body,\n",
172 |     "    schema: EmailSchema,\n",
173 |     "  });\n",
174 |     "  console.log(object);\n",
175 |     "}\n"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": []
184 |   }
185 |  ],
186 |  "metadata": {
187 |   "kernelspec": {
188 |    "display_name": "Deno",
189 |    "language": "typescript",
190 |    "name": "deno"
191 |   },
192 |   "language_info": {
193 |    "codemirror_mode": "typescript",
194 |    "file_extension": ".ts",
195 |    "mimetype": "text/x.typescript",
196 |    "name": "typescript",
197 |    "nbconvert_exporter": "script",
198 |    "pygments_lexer": "typescript",
199 |    "version": "5.6.2"
200 |   }
201 |  },
202 |  "nbformat": 4,
203 |  "nbformat_minor": 2
204 | }
--------------------------------------------------------------------------------