├── BENCH__efficient_language_models
│   ├── prompt.txt
│   ├── promptfooconfig.yaml
│   └── tests.yaml
├── BENCH__minimal_test_suite
│   ├── prompt.txt
│   ├── promptfooconfig.yaml
│   └── tests.yaml
├── imgs
│   └── zero-cost-prompts.png
├── .gitignore
├── .env.sample
├── package.json
├── scripts
│   ├── ollama_local_model_call.ts
│   └── new_prompt_test.ts
├── README.md
└── custom_models
    └── ollamaModelBase.js

/BENCH__efficient_language_models/prompt.txt:
--------------------------------------------------------------------------------
1 | {{prompt}}
--------------------------------------------------------------------------------
/BENCH__minimal_test_suite/prompt.txt:
--------------------------------------------------------------------------------
1 | Hey my name is {{name}}, are you ready to build?
--------------------------------------------------------------------------------
/imgs/zero-cost-prompts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/disler/elm-itv-benchmark/HEAD/imgs/zero-cost-prompts.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | custom_models/*.llamafile
 2 | custom_models/*.gguf
 3 | *.log
 4 | logs/
 5 | 
 6 | node_modules/
 7 | *.zip
 8 | 
 9 | .env
10 | 
11 | *.json
12 | !package.json
13 | 
14 | .DS_Store
15 | 
16 | llama_files/*
17 | 
18 | bun.lockb
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY=
2 | OPENAI_API_KEY=
3 | ANTHROPIC_API_KEY=
4 | PROMPTFOO_DELAY_MS=0
5 | VERTEX_API_KEY=
6 | VERTEX_PROJECT_ID=
7 | OLLAMA_BASE_URL="http://127.0.0.1:11434"
8 | REQUEST_TIMEOUT_MS=5000
9 | PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY=3
--------------------------------------------------------------------------------
/BENCH__minimal_test_suite/promptfooconfig.yaml:
--------------------------------------------------------------------------------
 1 | # test suite name
 2 | description: "Minimal Test Suite"
 3 | # model providers
 4 | providers:
 5 |   # OpenAI GPT3
 6 |   - id: openai:gpt-3.5-turbo-1106
 7 |   # Meta Llama3
 8 |   - id: ollama:chat:llama3
 9 |     config:
10 |       modelName: "llama3"
11 |       stream: false
12 | 
13 | # Repeat test N times
14 | evaluateOptions:
15 |   repeat: 1
--------------------------------------------------------------------------------
/BENCH__minimal_test_suite/tests.yaml:
--------------------------------------------------------------------------------
1 | # name of the test
2 | - description: "Minimal Test"
3 |   # these variables are passed into your prompt.txt
4 |   vars:
5 |     name: Dan # update this to be your name!
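    # any variable defined under vars can be referenced from prompt.txt
    # using {{variable}} syntax (e.g. {{name}} in this suite's prompt.txt)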
6 | # these assertions are run on the output of your model 7 | assert: 8 | # this test passes if the output contains any of the values in the list 9 | - type: icontains-any 10 | value: ["yes", "absolutely", "ready", "build"] 11 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "elm-is-this-viable-benchmark", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "view": "promptfoo view", 8 | "elm": "promptfoo eval -c ./BENCH__efficient_language_models/promptfooconfig.yaml -t ./BENCH__efficient_language_models/tests.yaml -p ./BENCH__efficient_language_models/prompt.txt --no-cache", 9 | "minimal": "promptfoo eval -c ./BENCH__minimal_test_suite/promptfooconfig.yaml -t ./BENCH__minimal_test_suite/tests.yaml -p ./BENCH__minimal_test_suite/prompt.txt --no-cache" 10 | }, 11 | "author": "IndyDevDan", 12 | "license": "ISC", 13 | "dependencies": { 14 | "dotenv": "^16.4.5", 15 | "promptfoo": "^0.55.0" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /scripts/ollama_local_model_call.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Use to test a local ollama model. 3 | */ 4 | async function prompt() { 5 | 6 | const readline = require('readline'); 7 | const rl = readline.createInterface({ 8 | input: process.stdin, 9 | output: process.stdout 10 | }); 11 | 12 | const inPrompt = await new Promise((resolve) => { 13 | rl.question('Enter your prompt > ', (userInput) => { 14 | rl.close(); 15 | resolve(userInput); 16 | }); 17 | }); 18 | 19 | let response; 20 | try { 21 | response = await fetch( 22 | `${process.env.OLLAMA_BASE_URL || 'http://localhost:11434'}/api/chat`, 23 | { 24 | method: 'POST', 25 | headers: { 26 | 'Content-Type': 'application/json', 27 | }, 28 | body: JSON.stringify({ 29 | model: "llama3", 30 | messages: [{ role: 'user', content: inPrompt }], 31 | stream: false 32 | }), 33 | }, 34 | ); 35 | } catch (err) { 36 | console.error(`API call error: ${err}`); 37 | } 38 | console.log(`response`, response) 39 | 40 | const asJson = await response.json() 41 | console.log(`asJson`, asJson) 42 | } 43 | 44 | async function main() { 45 | prompt() 46 | } 47 | 48 | main() -------------------------------------------------------------------------------- /BENCH__efficient_language_models/promptfooconfig.yaml: -------------------------------------------------------------------------------- 1 | description: "ELM ITV Benchmarks (Efficient Language Models - Is It Viable?)" 2 | providers: 3 | 4 | # Control CLOUD LLMs 5 | - id: openai:gpt-3.5-turbo-1106 6 | 7 | # Experimental Local ELMs 8 | - id: ollama:chat:llama3 9 | config: 10 | modelName: "llama3" 11 | stream: false 12 | temperature: 0.2 13 | - id: ollama:chat:phi3 14 | config: 15 | modelName: "phi3" 16 | stream: false 17 | temperature: 0.2 18 | - id: ollama:chat:gemma 19 | config: 20 | modelName: "gemma" 21 | stream: false 22 | temperature: 0.2 23 | 24 | # IF YOUR COMPUTER CAN HANDLE IT 25 | # - id: ollama:chat:llama3:70b 26 | # config: 27 | # modelName: "llama3:70b" 28 | # stream: false 29 | # temperature: 0.2 30 | 31 | # -------------------------------------------------- 32 | 33 | # Experimental Local ELMs (with tokens information) 34 | # Use this to get the token usage information 35 | # - id: ../custom_models/ollamaModelBase.js 36 | # config: 37 | # modelName: "llama3" 38 | # 
stream: false 39 | # temperature: 0.2 40 | # - id: ../custom_models/ollamaModelBase.js 41 | # config: 42 | # modelName: "phi3" 43 | # stream: false 44 | # temperature: 0.2 45 | # - id: ../custom_models/ollamaModelBase.js 46 | # config: 47 | # modelName: "gemma" 48 | # stream: false 49 | # temperature: 0.2 50 | # IF YOUR COMPUTER CAN HANDLE IT 51 | # - id: ../custom_models/ollamaModelBase.js 52 | # config: 53 | # modelName: "llama3:70b" 54 | # stream: false 55 | # temperature: 0.2 56 | evaluateOptions: 57 | repeat: 1 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient Language Model Personal Viability Benchmarking 2 | > Simple, Opinionated benchmark for testing the viability of Efficient Language Models (ELMs) for personal use cases. 3 | > 4 | > Uses [bun](https://bun.sh/), [promptfoo](https://promptfoo.dev/), and [ollama](https://ollama.com/) for a minimalist, cross-platform, local LLM prompt testing & benchmarking experience. 5 | 6 | ![Zero Cost Prompts](./imgs/zero-cost-prompts.png) 7 | 8 | ## Setup 9 | - [Install Bun](https://bun.sh/docs/installation#macos-and-linux) 10 | - [Install Ollama](https://ollama.com/download) 11 | - [Install llama3](https://ollama.com/library/llama3) `ollama run llama3` 12 | - [Install phi3](https://ollama.com/library/phi3) `ollama run phi3` 13 | - [Install gemma](https://ollama.com/library/gemma) `ollama run gemma` 14 | - Setup .env variables 15 | - `cp .env.sample .env` 16 | - Add your OpenAI API key to the .env file 17 | - Install dependencies: `bun i` 18 | - Run the minimal tests: `bun minimal` 19 | - Open test viewer: `bun view` 20 | - Run the ELM-ITV tests: `bun elm` 21 | 22 | ## Guide 23 | - First, [watch the video](https://youtu.be/sb9wSWeOPI4) where we walk through ELMs and this codebase. 24 | - To get started take a look at `BENCH__minimal_test_suite/` to get an idea of how to structure a basic test suite. 25 | - Next take a look at the `BENCH__efficient_language_models/` test suite to get an idea of how you can setup tests for your own viability tests for ELMs. 
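  For example, registering another local model in `BENCH__efficient_language_models/promptfooconfig.yaml` is just one more provider entry; the sketch below mirrors the `phi3` entry already in that file, and every provider runs each prompt in `prompt.txt` against every case in `tests.yaml`.
  ```yaml
  providers:
    - id: ollama:chat:phi3
      config:
        modelName: "phi3"
        stream: false
        temperature: 0.2
  ```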
26 | - Explore other [ollama based models](https://promptfoo.dev/docs/providers/ollama) you can test 27 | - Or [OpenAI models](https://promptfoo.dev/docs/providers/openai) 28 | - Or [Anthropic models](https://promptfoo.dev/docs/providers/anthropic) 29 | - Or [Groq models](https://promptfoo.dev/docs/providers/groq) 30 | - Modify the `BENCH__minimal_test_suite/` or `BENCH__efficient_language_models/` to suit your needs 31 | - Create a new test with the [Create a new test suite](#scripts) script 32 | 33 | ## Folder Structure 34 | - `/BENCH__` 35 | - `/prompt.txt` - the prompt(s) to test 36 | - `/test.yaml` - variables and assertions 37 | - `/promptfooconfig.yaml` - llm model config 38 | 39 | ## Scripts 40 | - Create a new test suite: `bun run ./scripts/new_prompt_test` 41 | - Run a test prompt against a running ollama server `bun run ./scripts/ollama_local_model_call` 42 | 43 | ## Resources 44 | - Ollama model library 45 | - https://ollama.com/library 46 | - LMSYS Chatbot Arena Leaderboard 47 | - https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard 48 | - Ollama api.md docs 49 | - https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion 50 | - Promptfoo Ollama Provider 51 | - https://promptfoo.dev/docs/providers/ollama 52 | - Promptfoo LLM Providers 53 | - https://www.promptfoo.dev/docs/providers 54 | - Promptfoo Assertions 55 | - https://www.promptfoo.dev/docs/configuration/expected-outputs/ -------------------------------------------------------------------------------- /scripts/new_prompt_test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Use this to quickly generate a new promptfoo test suite. 3 | */ 4 | import * as fs from 'fs'; 5 | import * as path from 'path'; 6 | import * as readline from 'readline'; 7 | 8 | 9 | interface FileAndContents { 10 | fileName: string 11 | contents: string 12 | } 13 | 14 | const PROMPT_TXT: FileAndContents = { 15 | fileName: "prompt.txt", 16 | contents: `convert the following natural language query into SQL: {{nlq}} 17 | 18 | SQL:` 19 | } 20 | 21 | const PROMPTFOO_CONFIG_YAML: FileAndContents = { 22 | fileName: "promptfooconfig.yaml", 23 | contents: `description: "" 24 | providers: 25 | - id: openai:gpt-3.5-turbo-1106 26 | evaluateOptions: 27 | repeat: 1 28 | ` 29 | } 30 | 31 | const PROMPT_TESTS: FileAndContents = { 32 | fileName: "tests.yaml", 33 | contents: `- description: "NLQ to SQL" 34 | vars: 35 | nlq: select all users 36 | id: 1 37 | assert: 38 | - type: icontains-all 39 | value: ["select", "*", "from", "users"] 40 | ` 41 | } 42 | 43 | function buildPackageScriptsCall(scriptName: string, testName: string) { 44 | return `promptfoo eval -c ./${testName}/promptfooconfig.yaml -t ./${testName}/tests.yaml -p ./${testName}/prompt.txt --no-cache` 45 | } 46 | 47 | function updateScripts(scriptName: string, testName: string) { 48 | const packageJsonPath = path.join(__dirname, '..', 'package.json'); 49 | const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8')); 50 | packageJson.scripts[scriptName] = buildPackageScriptsCall(scriptName, testName); 51 | fs.writeFileSync(packageJsonPath, JSON.stringify(packageJson, null, 2)); 52 | } 53 | 54 | async function userInput(prompt: string): Promise { 55 | const rl = readline.createInterface({ 56 | input: process.stdin, 57 | output: process.stdout 58 | }); 59 | 60 | const answer = await new Promise((resolve) => { 61 | rl.question(prompt, (input) => { 62 | resolve(input); 63 | }); 64 | }); 65 | 66 | rl.close(); 67 | return 
answer; 68 | } 69 | 70 | async function main() { 71 | 72 | let testName = await userInput('Please enter the test name: '); 73 | 74 | const ensureAlphaNumericAndUnderscore = (str: string) => { 75 | return str.replace(/[^a-zA-Z0-9_]/g, ''); 76 | } 77 | 78 | testName = ensureAlphaNumericAndUnderscore(testName); 79 | 80 | console.log(`Generating test suite for: ${testName}`); 81 | 82 | // prefix testName with BENCH__ 83 | testName = `BENCH__${testName}`; 84 | 85 | const testDirectory = path.join(__dirname, '..', testName); 86 | if (!fs.existsSync(testDirectory)) { 87 | fs.mkdirSync(testDirectory, { recursive: true }); 88 | } 89 | 90 | const files: FileAndContents[] = [PROMPT_TXT, PROMPTFOO_CONFIG_YAML, PROMPT_TESTS]; 91 | 92 | files.forEach(file => { 93 | const filePath = path.join(testDirectory, file.fileName); 94 | fs.writeFileSync(filePath, file.contents); 95 | console.log(`Created file: ${filePath}`); 96 | }); 97 | 98 | console.log(`Test suite '${testName}' created successfully. Update the test files to test your unique prompt`); 99 | 100 | let scriptName = await userInput('Please enter a name for the package.json script that will run the tests: '); 101 | scriptName = ensureAlphaNumericAndUnderscore(scriptName); 102 | updateScripts(scriptName, testName); 103 | console.log(`Script '${scriptName}' for test suite '${testName}' added to package.json successfully.`); 104 | } 105 | 106 | 107 | main() -------------------------------------------------------------------------------- /custom_models/ollamaModelBase.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This is basically the same as using the ollama provider by promptfoo. 3 | * 4 | * The difference is that we report the token usage which gives promptfoo the info it needs to calculate tokens per second. 
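 *
 * (Token usage is derived from the prompt_eval_count and eval_count fields returned
 * by Ollama's /api/chat endpoint when stream is false; promptfoo reads the returned
 * tokenUsage object to compute tokens per second.)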
5 | */ 6 | 7 | const OllamaCompletionOptionKeys = new Set([ 8 | 'num_predict', 9 | 'top_k', 10 | 'top_p', 11 | 'tfs_z', 12 | 'seed', 13 | 'useNUMA', 14 | 'num_ctx', 15 | 'num_keep', 16 | 'num_batch', 17 | 'num_gqa', 18 | 'num_gpu', 19 | 'main_gpu', 20 | 'low_vram', 21 | 'f16_kv', 22 | 'logits_all', 23 | 'vocab_only', 24 | 'use_mmap', 25 | 'use_mlock', 26 | 'embedding_only', 27 | 'rope_frequency_base', 28 | 'rope_frequency_scale', 29 | 'typical_p', 30 | 'repeat_last_n', 31 | 'temperature', 32 | 'repeat_penalty', 33 | 'presence_penalty', 34 | 'frequency_penalty', 35 | 'mirostat', 36 | 'mirostat_tau', 37 | 'mirostat_eta', 38 | 'penalize_newline', 39 | 'stop', 40 | 'num_thread', 41 | ]); 42 | 43 | class OllamaChatProvider { 44 | constructor(options) { 45 | 46 | const { id, config, modelName } = options; 47 | this.modelName = config.modelName; 48 | this.config = config || {}; 49 | this.options = options 50 | } 51 | 52 | id() { 53 | return `ollama:chat:${this.modelName}`; 54 | } 55 | 56 | toString() { 57 | return `[Ollama Chat Provider ${this.modelName}]`; 58 | } 59 | 60 | async callApi(prompt) { 61 | const messages = [{ role: 'user', content: prompt }] 62 | 63 | const params = { 64 | model: this.config.modelName, 65 | messages, 66 | stream: false, 67 | options: Object.keys(this.config).reduce((options, key) => { 68 | if (OllamaCompletionOptionKeys.has(key)) { 69 | options[key] = this.config[key]; 70 | } 71 | return options; 72 | }, {}), 73 | }; 74 | 75 | let response; 76 | try { 77 | response = await fetch( 78 | `${process.env.OLLAMA_BASE_URL || 'http://localhost:11434'}/api/chat`, 79 | { 80 | method: 'POST', 81 | headers: { 82 | 'Content-Type': 'application/json', 83 | }, 84 | body: JSON.stringify(params), 85 | }, 86 | ); 87 | } catch (err) { 88 | console.log(`err`, err) 89 | return { 90 | error: `API call error: ${String(err)}. 
Output:\n${response?.data}`, 91 | }; 92 | } 93 | 94 | 95 | if (response.status !== 200) { 96 | return { 97 | error: `Ollama error: ${response.statusText}`, 98 | }; 99 | } 100 | /** 101 | { 102 | "model": "llama3", 103 | "created_at": "2024-04-27T19:26:40.21726Z", 104 | "message": { 105 | "role": "assistant", 106 | "content": "Here is the text converted into bullet points:\n\n- Here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic " 107 | }, 108 | "done": true, 109 | "total_duration": 2610576500, 110 | "load_duration": 748167, 111 | "prompt_eval_count": 165, 112 | "prompt_eval_duration": 300519000, 113 | "eval_count": 131, 114 | "eval_duration": 2307267000 115 | } 116 | */ 117 | const jsonResponse = await response.json() 118 | 119 | // console.log(`---------- RES Ollama generate API response: ${JSON.stringify(jsonResponse, null, 2)}`); 120 | 121 | try { 122 | 123 | const output = jsonResponse.message.content 124 | 125 | const tokenUsage = { 126 | total: jsonResponse.prompt_eval_count + jsonResponse.eval_count, 127 | prompt: jsonResponse.prompt_eval_count, 128 | completion: jsonResponse.eval_count, 129 | } 130 | 131 | return { 132 | output, 133 | tokenUsage, 134 | }; 135 | } catch (err) { 136 | return { 137 | error: `Ollama API response error: ${String(err)}: ${JSON.stringify(response.data)}`, 138 | }; 139 | } 140 | } 141 | } 142 | 143 | module.exports = OllamaChatProvider; 144 | 145 | -------------------------------------------------------------------------------- /BENCH__efficient_language_models/tests.yaml: -------------------------------------------------------------------------------- 1 | # String Manipulation 2 | - description: "Bullet Summary" 3 | vars: 4 | prompt: | 5 | Create a summary of the following text in bullet points. 6 | 7 | Here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic workflows. I have to warn you, this is one of those things that sounds really obvious after you hear it because it's hiding in plain sight. The idea is simple; it's called the two-way prompt. So, what is this? Why is it useful? And how can it help you build better AI agent workflows? Two-way prompting happens all the time in real collaborative workspaces. You are effectively two or more agents prompting each other to drive outcomes. Two-way prompting happens all the time when you're at work, with friends, with family, online, in comment sections, on PR reviews. You ask a question; your co-worker responds. They ask a question; you respond. Now, let's double click into what this looks like for your agentic tools. Right in agentic workflows, you are the critical communication process between you and your AI agents that are aiming to drive outcomes. In most agentic workflows, we fire off one prompt or configure some system prompt, and that's it. But we're missing a ton of opportunity here that we can unlock using two-way prompts. Let me show you a concrete example with Ada. So, Ada, of course, is our proof of concept personal AI assistant. And let me just go ahead and kick off this workflow so I can show you exactly how useful the two-way prompt can be. Ada, let's create some example code. 8 | assert: 9 | - type: icontains-all 10 | value: ["agentic", "workflows", "two-way", "prompt"] 11 | - description: "Bullet Each Sentence" 12 | vars: 13 | prompt: | 14 | Take the following text and for each sentence, convert it into a bullet point. Do not change the sentence. Maintain the punctuation. 
The punctuations '.!?' should trigger bullet points. Use - for the bullet point." 15 | 16 | Here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic workflows. I have to warn you, this is one of those things that sounds really obvious after you hear it because it's hiding in plain sight. The idea is simple; it's called the two-way prompt. So, what is this? Why is it useful? And how can it help you build better AI agent workflows? Two-way prompting happens all the time in real collaborative workspaces. You are effectively two or more agents prompting each other to drive outcomes. 17 | assert: 18 | - type: icontains-all 19 | value: 20 | - "- Here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic workflows." 21 | - "- I have to warn you, this is one of those things that sounds really obvious after you hear it because it's hiding in plain sight." 22 | - "- The idea is simple; it's called the two-way prompt." 23 | - "- So, what is this?" 24 | - "- Why is it useful?" 25 | - "- And how can it help you build better AI agent workflows?" 26 | - "- Two-way prompting happens all the time in real collaborative workspaces." 27 | - "- You are effectively two or more agents prompting each other to drive outcomes." 28 | - description: "Script to Markdown" 29 | vars: 30 | prompt: | 31 | Convert the following SCRIPT to markdown, follow the SCRIPTING_RULES. 32 | 33 | SCRIPTING_RULES 34 | - Create 1 h1 header with a interesting title. 35 | - Create 2 h2 sub headers, one for the summary and one for the details. 36 | - Each section should contain bullet points. 37 | - Start each section with a hook. 38 | - Use short paragraphs. 39 | - Use emojis to indicate the hooks. 40 | 41 | SCRIPT 42 | Here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic workflows. I have to warn you, this is one of those things that sounds really obvious after you hear it because it's hiding in plain sight. The idea is simple; it's called the two-way prompt. So, what is this? Why is it useful? And how can it help you build better AI agent workflows? Two-way prompting happens all the time in real collaborative workspaces. You are effectively two or more agents prompting each other to drive outcomes. 43 | assert: 44 | - type: icontains-all 45 | value: ["#", "##", "two-way", "prompt", "agentic", "workflows", "useful", "valuable", "better", "ai", "agent", "workflow", "prompting", "collaborative", "workspace", "two or more", "agents", "prompt each other", "outcome"] 46 | - type: javascript 47 | value: 'output.split("#").length > 4' 48 | # Command Generation 49 | - description: "Mac Bash Command: 'ls -a'" 50 | vars: 51 | prompt: "Mac: Bash: Concise: How do I list all hidden files in a directory?" 52 | assert: 53 | - type: icontains 54 | value: "ls -a" 55 | - description: "Mac Bash Command: 'find . -name'" 56 | vars: 57 | prompt: "Mac: Bash: Concise: How do I recursively search a directory for a file by name?" 58 | assert: 59 | - type: icontains-all 60 | value: ["find", "-name"] 61 | - description: "Complex Command: Git Branch Merge Conflict Resolution" 62 | vars: 63 | prompt: "Mac: Bash: Concise: How do I resolve merge conflicts in Git when trying to merge two branches?" 
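  # icontains-all passes only when the output contains every value in the list (case-insensitive);
  # see the promptfoo assertions docs linked in the README for other assertion types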
64 | assert: 65 | - type: icontains-all 66 | value: ["git", "merge", "conflict", "resolve", "git status", "git add", "git commit"] 67 | # Code Explanation 68 | - description: "Python Generator Functions Explanation" 69 | vars: 70 | prompt: "Concisely explain how I can use generator functions in Python in less than 100 words." 71 | assert: 72 | - type: icontains-all 73 | value: ["generator", "function", "python", "yield", "iter"] 74 | - type: javascript 75 | value: 'output.split(" ").length <= 100' 76 | - description: "Python Agent Routing Explanation" 77 | vars: 78 | prompt: | 79 | Explain what the PYTHON_CODE does in 100 words or less. 80 | 81 | PYTHON_CODE 82 | def get_first_keyword_in_prompt(prompt: str): 83 | map_keywords_to_agents = { 84 | "bash,browser": run_bash_command_workflow, 85 | "question": question_answer_workflow, 86 | "hello,hey,hi": soft_talk_workflow, 87 | "exit": end_conversation_workflow, 88 | } 89 | for keyword_group, agent in map_keywords_to_agents.items(): 90 | keywords = keyword_group.split(",") 91 | for keyword in keywords: 92 | if keyword in prompt.lower(): 93 | return agent, keyword 94 | return None, None 95 | assert: 96 | - type: icontains-all 97 | value: ["python", "keyword", "prompt", "and", "found"] 98 | - type: javascript 99 | value: 'output.split(" ").length <= 100' 100 | - description: "Python List Comprehensions Explanation" 101 | vars: 102 | prompt: "Concisely explain how I can use list comprehensions in Python in less than 100 words." 103 | assert: 104 | - type: icontains-all 105 | value: ["list", "comprehension", "python", "expression", "[", "]"] 106 | - type: javascript 107 | value: 'output.split(" ").length <= 100' 108 | # Text Classification 109 | - description: "SQL Natural Language Query Detection" 110 | vars: 111 | prompt: | 112 | Is the following BLOCK_OF_TEXT a SQL Natural Language Query (NLQ)? Respond Exclusively with 'yes' or 'no'. 113 | 114 | BLOCK_OF_TEXT 115 | 116 | select 10 users over the age of 21 with gmail address 117 | assert: 118 | - type: equals 119 | value: "yes" 120 | - description: "Positive vs Negative Sentiment Detection" 121 | vars: 122 | prompt: | 123 | Determine if the sentiment of the following TEXT is positive or negative. Respond exclusively with 'positive' or 'negative'. 124 | 125 | TEXT 126 | 127 | I love sunny days there's nothing like getting out in nature 128 | assert: 129 | - type: equals 130 | value: "positive" 131 | - description: "Detect Programming Language from Code Snippet" 132 | vars: 133 | prompt: | 134 | Identify the programming language used in the following CODE_SNIPPET: 135 | 136 | CODE_SNIPPET 137 | 138 | def example_function(): 139 | print("Hello, World!") 140 | 141 | assert: 142 | - type: icontains 143 | value: "python" 144 | # Email Management 145 | - description: "Email Categorization: Work" 146 | vars: 147 | prompt: | 148 | Categorize the following email into one of the following categories: work, personal, newsletter, other. Respond exclusively with the category name. 149 | 150 | EMAIL 151 | 152 | Subject: 153 | Action Items for next week 154 | From: 155 | john@workhard.com 156 | Body: 157 | Hey can you send over the action items for the week? 158 | assert: 159 | - type: equals 160 | value: "work" 161 | - description: "Email Categorization: Personal" 162 | vars: 163 | prompt: | 164 | Categorize the following email into one of the following categories: work, personal, newsletter, other. Respond exclusively with the category name. 
165 | 166 | EMAIL 167 | 168 | Subject: 169 | Dinner plans this weekend 170 | From: 171 | sarah@gmail.com 172 | Body: 173 | Hey! Just wanted to see if you're free for dinner this Saturday? Let me know! 174 | assert: 175 | - type: equals 176 | value: "personal" 177 | - description: "Email Categorization: Newsletter" 178 | vars: 179 | prompt: | 180 | Categorize the following email into one of the following categories: work, personal, newsletter, other. Respond exclusively with the category name. 181 | 182 | EMAIL 183 | 184 | Subject: 185 | Your weekly tech newsletter 186 | From: 187 | newsletter@techdigest.com 188 | Body: 189 | Here are the top tech stories for this week... 190 | assert: 191 | - type: equals 192 | value: "newsletter" 193 | - description: "Email Categorization: Other" 194 | vars: 195 | prompt: | 196 | Categorize the following email into one of the following categories: work, personal, newsletter, other. Respond exclusively with the category name. 197 | 198 | EMAIL 199 | 200 | Subject: 201 | Your order has shipped! 202 | From: 203 | orders@onlinestore.com 204 | Body: 205 | Good news! Your recent order has shipped and is on its way to you. 206 | assert: 207 | - type: equals 208 | value: "other" 209 | - description: "Response Template: Concise Email Reply" 210 | vars: 211 | prompt: | 212 | Create a concise response to the USER_EMAIL. Follow the RESPONSE_STRUCTURE. 213 | 214 | RESPONSE_STRUCTURE: 215 | - Hey 216 | - Appreciate reach out 217 | - Reframe email to confirm details 218 | - Next steps, schedule meeting next week 219 | - Thanks for your time. Stay Focused, Keep building. - Dan. 220 | 221 | USER_EMAIL: 222 | Subject: 223 | Let's move forward on the project 224 | From: 225 | john@aiconsultingco.com 226 | Body: 227 | Hey Dan, I was thinking more about the product requirements for the idea we brainstormed last week. 228 | Let's discuss pricing, timeline and move forward with the proof of concept. Can we sync next week? 229 | Thanks for your time. 230 | assert: 231 | - type: icontains-all 232 | value: ["john", "product", "schedule", "pricing", "timeline", "proof",] 233 | - description: "Email Summarization: Concise Bullet Points" 234 | vars: 235 | prompt: | 236 | Create a concise summary of the DENSE_EMAIL. Extract the most important information into bullet points and then summarize the email in the SUMMARY_FORMAT. 237 | 238 | SUMMARY_FORMAT: 239 | Oneline Summary 240 | ... 241 | Bullet points 242 | - a 243 | - b 244 | - c 245 | 246 | DENSE_EMAIL: 247 | Subject: 248 | Project Update - New Requirements and Timeline Changes 249 | From: 250 | sarah@techconsultingfirm.com 251 | Body: 252 | Hi Dan, 253 | 254 | I wanted to provide an update on the ERP system implementation project. After our last meeting with the client, they have requested some additional features and changes to the original requirements. This includes integrating with their existing CRM system, adding advanced reporting capabilities, and supporting multi-currency transactions. 255 | 256 | Due to these new requirements, we will need to adjust our project timeline and milestones. I estimate that these changes will add approximately 3-4 weeks to our original schedule. We should also plan for additional testing and quality assurance to ensure the new features are working as expected. 257 | 258 | Please let me know if you have any concerns or questions about these changes. I think it's important that we communicate this to the client as soon as possible and set expectations around the revised timeline. 
259 | 260 | Thanks, 261 | Sarah 262 | assert: 263 | - type: icontains-all 264 | value: ["ERP", "system", "-", "additional features", "adjust project timeline", "3-4 weeks", "revised timeline"] 265 | # Code Generation 266 | - description: "Simple Python function prefix_string" 267 | vars: 268 | prompt: | 269 | Implement the following python function prefix_string("abc", 2) -> "abcabc" 270 | assert: 271 | - type: icontains-all 272 | value: ["def", "prefix_string", "return"] 273 | - description: "Simple Python function prefix_string" 274 | vars: 275 | prompt: | 276 | Implement the following python function prefix_string("abc", 2) -> "abcabc" 277 | assert: 278 | - type: icontains-all 279 | value: ["def", "prefix_string", "return"] 280 | - description: "Python function to check if a string is a palindrome" 281 | vars: 282 | prompt: | 283 | Implement a Python function is_palindrome(s) that takes a string s and returns True if it is a palindrome, False otherwise. 284 | assert: 285 | - type: icontains-all 286 | value: ["def", "is_palindrome", "return", "True", "False", "==", "[::-1]"] 287 | - description: "Python function to find the longest common subsequence" 288 | vars: 289 | prompt: | 290 | Implement a Python function longest_common_subsequence(s1, s2) that takes two strings s1 and s2 and returns the longest common subsequence between them. 291 | assert: 292 | - type: icontains-all 293 | value: ["def", "longest_common_subsequence", "return", "max", "if", "else", "len"] 294 | - description: "Python class to implement a stack using a linked list" 295 | vars: 296 | prompt: | 297 | Implement a Python class Stack that represents a stack using a singly linked list. It should support push, pop, and is_empty operations. 298 | assert: 299 | - type: icontains-all 300 | value: ["class", "Stack", "Node", "__init__", "push", "pop", "is_empty", "self.head"] 301 | # Code Debugging 302 | - description: "Find the bug in this code" 303 | vars: 304 | prompt: | 305 | Find the bug in this code: 306 | 307 | def mult_and_sum_array(arr, multiple): 308 | multi_arr = [x * multiple for x in arr] 309 | sum = 0 310 | sum = sum(multi_arr) 311 | return sum 312 | assert: 313 | - type: icontains-all 314 | value: ["sum", "function", "built-in"] 315 | - type: icontains-any 316 | value: ["over", "both"] 317 | - description: "Find the bug in this code" 318 | vars: 319 | prompt: | 320 | Find the bug in this code: 321 | 322 | def find_max(nums): 323 | max_num = float('-inf') 324 | for num in nums: 325 | if num < max_num: 326 | max_num = num 327 | return max_num 328 | assert: 329 | - type: icontains-all 330 | value: ["find_max"] 331 | - type: icontains-any 332 | value: ["if", "condition"] 333 | - type: icontains-any 334 | value: ["<", "less"] 335 | - type: icontains-any 336 | value: [">", "greater"] 337 | # Context Window testing 338 | - description: "Context Window: Needle in the haystack test #1" 339 | vars: 340 | prompt: | 341 | What was the end of year prediction made in the SCRIPT below? 342 | 343 | SCRIPT 344 | Gemma Phi 3, OpenELM, and Llama 3. Open source language models are becoming more viable with every single release. The terminology from Apple's new OpenELM model is spot on. These efficient language models are taking center stage in the LLM ecosystem. Why are ELMs so important? Because they reshape the business model of your agentic tools and products. When you can run a prompt directly on your device, the cost of building goes to zero. The pace of innovation has been incredible, especially with the release of Llama 3. 
But every time a new model drops, I'm always asking the same question. Are efficient language models truly ready for on-device use? And how do you know your ELM meets your standards? I'm going to give you a couple of examples here. The first one is that you need to know your ELM. Everyone has different standards for their prompts, prompt chains, AI agents, and agentic workflows. How do you know your personal standards are being met by Phi 3, by Llama 3, and whatever's coming next? This is something that we stress on the channel a lot. Always look at where the ball is going, not where it is. If this trend of incredible local models continue, how soon will it be until we can do what GPT-4 does right on our device? With Llama 3, it's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. That time is coming very soon. In this video, we're going to answer the question, are efficient language models ready for on-device use? How do you know if they're ready for your specific use cases? Here are all the big ideas. We're going to set some standards for what ELM attributes we actually care about. There are things like RAM consumption, tokens per second, accuracy. We're going to look at some specific attributes of ELMs and talk about where they need to be for them to work on-device for us. We're going to break down the IT V-Benchmark. We'll explain exactly what that is. That's going to help us answer the question, is this model good enough for your specific use cases? And then we're going to actually run the IT V-Benchmark on Gemma 5.3 and Llama 3 for real on-device use. So we're going to look at a concrete example of the IT V-Benchmark running on my M2 MacBook Pro with 64 gigabytes of RAM and really try to answer the question in a concrete way. Is this ready for prime time? Are these ELMs, are these efficient language models ready for prime time? Let's first walk through some standards and then I'll share some of my personal standards for ELMs. So we'll look at it through the lens of how I'm approaching this as I'm building out agentic tools and products. How do we know we're ready for on-device use? First two most important metrics we need to look at, accuracy and speed. Given your test suite that validates that this model works for your use case, what accuracy do you need? Is it okay if it fails a couple of tests giving you 90% or are you okay with, you know, 60, 70 or 80%? I think accuracy is the most important benchmark we should all be paying attention to. Something like speed is also a complete blocker if it's too low. So we'll be measuring speed and TPS, tokens per second. We'll look at a range from one token per second, all the way up to grok levels, right? Of something like 500 plus, you know, 1000 tokens per second level. What else do we need to pay attention to? Memory and context window. So memory coupled with speed are the big two constraints for ELMs right now. Efficient language model, models that can run on your device. They chew up anywhere from four gigabytes of RAM, of GPU, of CPU, all the way up to 128 and beyond. To run Lama 3, 70 billion parameter on my MacBook, it will chew up something like half of all my available RAM. We also have context window. This is a classic one. Then we have JSON response and vision support. We're not gonna focus on these too much. These are more yes, no, do they have it or do they not? 
Is it multimodal or not? There are a couple other things that we need to pay attention to. First of all, we need to pay attention to these other attributes that we're missing here, but I don't think they matter as much as these six and specifically these four at the top here. So let's go ahead and walk through this through the lens of my personal standards for efficient language models. Let's break it down. So first things first, the accuracy for the ITV benchmark, which we're about to get to must hit 80%. So if a model is not passing about 80% here, I automatically disqualify it. Tokens per second. I require at least 20 tokens per second minimum. If it's below this, it's honestly just not worth it. It's too slow. There's not enough happening. Anything above this, of course we'll accept. So keep in mind when you're setting your personal standards, you're really looking for ranges, right? Anything above 80% for me is golden. Anything above 20 tokens per second at a very minimum is what we're looking for. So let's look at memory. For me, I am only willing to consume up to about 32 gigabytes of RAM, GPU, CPU. However, it ends up getting sliced. On my 64 gigabyte, I have several Docker instances and other applications that are basically running 24 seven that constrain my dev environment. Regardless, I'm looking for ELMs that consume less than 32 gigabytes of memory. Context window, for me, the sweet spot is 32K and above. Lama 3 released with 8K. I said, cool. Benchmarks look great, but it's a little too small. For some of the larger prompts and prompt chains that I'm building up, I'm looking for 32K minimum context. I highly recommend you go through and set your personal standard for each one of these metrics, as they're likely to be the most important for getting your ELM, for getting a model running on your device. So JSON response, vision support. I don't really care about vision support. This is not a high priority for me. Of course, it's a nice to have. There are image models that can run in isolation. That does the trick for me. I'm not super concerned about having local on device multimodal models, at least right now. JSON response support is a must have. For me, this is built into a lot of the model providers, and it's typically not a problem anymore. So these are my personal standards. The most important ones are up here. 80% accuracy on the ITP benchmark, which we'll talk about in just a second. We have the speed. I'm looking for 20 tokens per second at a minimum. I'm looking for a memory consumption maximum of 32. And then of course, the context window. I am simplifying a lot of the details here, especially around the memory usage. I just want to give you a high level of how to think about what your standards are for ELMs. So that when they come around, you're ready to start using it for your personal tools and products. Having this ready to go as soon as these models are ready will save you time and money, especially as you scale up your usage of language models. So let's talk about the ITP benchmark. What is this? It's simple. It's nothing fancy. ITP is just, is this viable? That's what the test is all about. I just want to know, is this ELM viable? Are these efficient language models, AKA on device language models good enough? This code repository we're about to dive into. It's a personalized use case specific benchmark to quickly swap in and out ELMs, AKA on device language models to know if it's ready for your tools and applications. 
So let's go ahead and take a quick look at this code base. Link for this is going to be in the description. Let's go ahead and crack open VS code and let's just start with the README. So let's preview this and it's simple. This uses Bunn, PromptFu, and Alama for a minimalist cross-platform local LLM prompt testing and benchmarking experience. So before we dive into this anymore, I'm just going to go ahead, open up the terminal. I'm going to type Bunn run ELM, and that's going to kick off the test. So you can see right away, I have four models running, starting with GPT 3.5 as a control model to test against. And then you can see here, we have Alama Chat, Alama 3, we have PHY, and we have Gemma running as well. So while this is running through our 12 test cases, let's go ahead and take a look at what this code base looks like. So all the details that get set up are going to be in the README. Once you're able to get set up with this in less than a minute, this code base was designed specifically for you to help you benchmark local models for your use cases so that when they're ready, you can start saving time and saving money immediately. If we look at the structure, it's very simple. We have some setup, some minor scripts, and then we have the most important thing, bench, underscore, underscore, and then whatever the test suite name is. This one's called Efficient Language Models. So let's go ahead and look at the prompt. So the prompt is just a simple template. This gets filled in with each individual test run. And if we open up our test files, you can see here, let's go ahead and collapse everything. You can see here we have a list of what do we have here, 12 tests. They're sectioned off. You can see we have string manipulation here, command generation, code explanation, text classification. This is a work in progress of my personal ELM accuracy benchmark. By the time you're watching this, there'll likely be a few additional tests here. They'll be generic enough though, so that you can come in, understand them, and tweak them to fit your own specific use case. So let's go ahead and take a look at this. So this is the test file, and we'll look into this in more detail in just a second here. But if you go to the most important file, prompt through configuration, you can see here, let's go ahead and collapse this. We have our control cloud LLM. So I like to have a kind of control and an experimental group. The control group is going to be our cloud LLM that we want to prove our local models are as good as or near the performance of. Right now I'm using dbt 3.5. And then we have our experimental local ELMs. So we're going to go ahead and take a look at this. So in here, you can see we have LLM 3, we have 5.3, and we have Gemma. Again, you can tweak these. This is all built on top of LLM. Let's go ahead and run through our tool set quickly. We're using Bun, which is an all in one JavaScript runtime. Over the past year, the engineers have really matured the ecosystem. This is my go-to tool for all things JavaScript and TypeScript related. They recently just launched Windows support, which means that this code base will work out of the box for Mac, Linux, and Windows users. You can go ahead and click on this, and you'll be able to see the code base. Huge shout out to the Bun developers on all the great work here. We're using Ollama to serve our local language models. I probably don't need to introduce them. And last but not least, we're using PromptFu. 
I've talked about PromptFu in a few videos in the past, but it's super, super important to bring back up. This is how you can test your individual prompts against expectations. So what does that look like? If we scroll down to the hero here, you can see exactly what a test case looks like. So you have your prompts that you're going to test. So this is what you would normally type in a chat input field. And then you can go ahead and click test. And then you can go ahead and you have your individual models. Let's say you want to test OpenAI, Plod, and Mistral Large. You would put those all here. So for each provider, it's going to run every single prompt. And then at the bottom, you have your test cases. Your test cases can pass in variables to your prompts, as you can see here. And then most importantly, your test cases can assert specific expectations on the output of your LLM. So you can see here where you're running this type contains. We need to make sure that it has this string in it. We're making sure that the cost is below this amount, latency below this, etc. There are many different assertion types. The ITV benchmark repo uses these three key pieces of technology for a really, really simplistic experience. So you have your prompt configuration where you specify what models you want to use. You have your tests, which specify the details. So let's go ahead and look at one of these tests. You can see here, this is a simple bullet summary test. So I'm saying create a summary of the following text in bullet points. And then here's the script to one of our previous videos. So, you know, here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic workflows. We're asserting case insensitively that all of these items are in the response of the prompt. So let's go ahead and look at our output. Let's see if our prompts completed. Okay, so we have 33 success and 15 failed tests. So LLM3 ran every single one of these test cases here and reported its results. So let's go ahead and take a look at what that looks like. So after you run that was Bon ELM, after you run that you can run Bon View and if we open up package.json, and you can see Bon view just runs prompt foo view Bon view. This is going to kick off a local prompt foo server that shows us exactly what happened in the test runs. So right away, you can see we have a great summary of the results. So we have our control test failing at only one test, right. So it passed 91% accuracy. 
This and then we have llama 3 so close to my 80 standard we'll dig into where it went wrong in just a second here we then have phi 3 failed half of the 12 test cases and then we have gemma looks like it did one better 7 out of 12 so you can see here this is why it's important to have a control group specifically for testing elms it's really good to compare against a kind of high performing model and you know gpg 3.5 turbo it's not really even high performing anymore but it's a good benchmark for testing against local models because really if we use opus or gpt4 here the local models won't even come close so that's why i like to compare to something like gpg 3.5 you can also use cloud 3 haiku here this right away gives you a great benchmark on how local models are performing let's go ahead and look at one of these tests what happened where did things go wrong let's look at our text classification this is a simple test the prompt is is the following block of text a sql natural language query nlq respond exclusively with yes or no so this test here is going to look at how well the model can both answer correctly and answer precisely right it needs to say yes or no and then the block of text is select 10 users over the age of 21 with a gmail address and then we have the assertion type equals yes so our test case validates this test if it returns exclusively yes and we can look at the prompt test to see exactly what that looks like so if you go to test.yaml we can see we're looking for just yes this is what that test looks like right so this is our one of our text classification tests and and we have this assertion type equals yes so equals is used when you know exactly what you want the response to be a lot of the times you'll want something like a i contains all so case insensitive contains everything or a case insensitive contains any and there are lots of different assertions you can make you can easily dive into that i've linked that in the readme you'll want to look at the assertions documentation in prompt foo they have a whole list here of different assertions you can make to improve and strengthen your prompt test so that's what that test looks like and and you can kind of go through the line over each model to see exactly what went right what went wrong etc so feel free to check out the other test cases the long story short here is that by running the itv benchmark by running your personal benchmarks against local models you can have higher confidence and you can have first movers advantage on getting your hands on these local models and truly utilizing them as you can see here llama 3 is nearly within my standard of what i need an elm to do based on these 12 test cases i'll increase this to add a lot more of the use cases that i use out of these 12 test cases llama 3 is performing really really well and this is the 8b model right so if we look at a llama you can see here the default version that comes in here is the 8 billion parameter model that's the 4b quantization so pretty good stuff here i don't need to talk about how great llama 3 is the rest of the internet is doing that but it is really awesome to see how it performs on your specific use cases the closer you get to the metal here the closer you understand how these models are performing next to each other the better and the faster you're going to be able to take these models and productionize them in your tools and products i also just want to shout out how incredible it is to actually run these tests over and over and over 
again with the same model without thinking about the cost for a single second. You can see here, we're getting about 12 tokens per second across the board. So not ideal, not super great, but still everything completed fine. You can walk through the examples. A lot of these test cases are passing. This is really great. I'm gonna be keeping a pretty close eye on this stuff. So definitely like and subscribe if you're interested in the best local performing models. I feel like we're gonna have a few different classes of models, right? If we break this down, fastest, cheapest, and then it was best, slowest. And now what I think we need to do is take this and add a nest to it. So we basically say something like this, right? We say cloud, right? And then we say the slowest, most expensive. And then we say local, fastest, lower accuracy, and best, slowest, right? So things kind of change when you're at the local level. Now we're just trading off speed and accuracy, which simplifies things a lot, right? Because basically we were doing this where we had the fastest, cheapest, and we had lower accuracy. And then we had best, slowest, most expensive, right? So this is your Opus, this is your GPT-4, and this is your Haiku, GPT-3. But now we're getting into this interesting place where now we have things like this, right? Now we have PHY-3, we have LAMA-3, LAMA-3 is seven or eight billion. We also have Gemma. And then in the slowest, we have our bigger models, right? So this is where like LAMA-3 was at 70 billion, that's where this goes. And then, you know, whatever other big models that come out that are, you know, going to really trip your RAM, they're going to run slower, but they will give you the best performance that you can possibly have locally. So I'm keeping an eye on this. Hit the like and hit the sub if you want to stay up to date with how cloud versus local models progress. We're going to be covering these on the channel and I'll likely use, you know, this class system to separate them to keep an eye on these, right? First thing that needs to happen is we need anything at all. To run locally, right? So this is kind of, you know, in the future, same with this. Right now we need just anything to run well enough. So, you know, we need decent accuracy, any speed, right? So this is what we're looking for right now. And this stuff is going to come in the future. So that's the way I'm looking at this. The ITV benchmark can help you gain confidence in your prompts. Link for the code is going to be in the description. I built this to be ultra simple. Just follow the README to get started. Thanks to Bunn. Pramphu and Ollama. This should be completely cross-platform and I'll be updating this with some additional test cases. By the time you watch this, I'll likely have added several additional tests. I'm missing some things in here like code generation, context window length testing, and a couple other sections. So look forward to that. I hope all of this makes sense. Up your feeling, the speed of the open source community building toward usable viable ELMs. I think this is something that we've all been really excited about. And it's finally starting to happen. I'm going to predict by the end of the year, we're going to have an on-device Haiku to GPT-4 level model running, consuming less than 8 gigabytes of RAM. As soon as OpenELM hits Ollama, we'll be able to test this as well. And that's one of the highlights of using the ITV benchmark inside of this code base. 
You'll be able to quickly and seamlessly get that up and running by just updating the model name, adding a new configuration here like this. And then it'll look something like this, OpenELM, and then whatever the size is going to be, say it's the 3B, and that's it. Then you just run the test again, right? So that's the beauty of having a test suite like this set up and ready to go. You can, of course, come in here and customize this. You can add Opus, you can add Haiku, you can add other models, tweak it to your liking. That's what this is all about. I highly recommend you get in here and test this. This was important enough for me to take a break from personal AI assistance, and HSE, and all of that stuff. And I'll see you guys in the next video. Bye-bye. MacBook Pro M4 chip is released. And as the LLM community rolls out permutations of Llama 3, I think very soon, possibly before mid-2024, ELM's efficient language models will be ready for on-device use. Again, this is use case specific, which is really the whole point of me creating this video is to share this code base with you so that you can know exactly what your use case specific standards are. Because after you have standards set and a great prompting framework like PromptFu, you can then answer the question for yourself, for your tools, and for your products, is this efficient language model ready for my device? For me personally, the answer to this question is very soon. If you enjoyed this video, you know what to do. Thanks so much for watching. Stay focused and keep building. 345 | assert: 346 | - type: icontains-all 347 | value: ["on-device", "model", "GPT-4", "8", "RAM"] 348 | - description: "Context Window: Needle in the haystack test #2" 349 | vars: 350 | prompt: | 351 | What was the speakers personal accuracy requirement for the benchmark made in the SCRIPT below? 352 | 353 | SCRIPT 354 | Gemma Phi 3, OpenELM, and Llama 3. Open source language models are becoming more viable with every single release. The terminology from Apple's new OpenELM model is spot on. These efficient language models are taking center stage in the LLM ecosystem. Why are ELMs so important? Because they reshape the business model of your agentic tools and products. When you can run a prompt directly on your device, the cost of building goes to zero. The pace of innovation has been incredible, especially with the release of Llama 3. But every time a new model drops, I'm always asking the same question. Are efficient language models truly ready for on-device use? And how do you know your ELM meets your standards? I'm going to give you a couple of examples here. The first one is that you need to know your ELM. Everyone has different standards for their prompts, prompt chains, AI agents, and agentic workflows. How do you know your personal standards are being met by Phi 3, by Llama 3, and whatever's coming next? This is something that we stress on the channel a lot. Always look at where the ball is going, not where it is. If this trend of incredible local models continue, how soon will it be until we can do what GPT-4 does right on our device? With Llama 3, it's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. It's looking like this. That time is coming very soon. In this video, we're going to answer the question, are efficient language models ready for on-device use? 
How do you know if they're ready for your specific use cases? Here are all the big ideas. We're going to set some standards for what ELM attributes we actually care about. There are things like RAM consumption, tokens per second, accuracy. We're going to look at some specific attributes of ELMs and talk about where they need to be for them to work on-device for us. We're going to break down the IT V-Benchmark. We'll explain exactly what that is. That's going to help us answer the question, is this model good enough for your specific use cases? And then we're going to actually run the IT V-Benchmark on Gemma 5.3 and Llama 3 for real on-device use. So we're going to look at a concrete example of the IT V-Benchmark running on my M2 MacBook Pro with 64 gigabytes of RAM and really try to answer the question in a concrete way. Is this ready for prime time? Are these ELMs, are these efficient language models ready for prime time? Let's first walk through some standards and then I'll share some of my personal standards for ELMs. So we'll look at it through the lens of how I'm approaching this as I'm building out agentic tools and products. How do we know we're ready for on-device use? First two most important metrics we need to look at, accuracy and speed. Given your test suite that validates that this model works for your use case, what accuracy do you need? Is it okay if it fails a couple of tests giving you 90% or are you okay with, you know, 60, 70 or 80%? I think accuracy is the most important benchmark we should all be paying attention to. Something like speed is also a complete blocker if it's too low. So we'll be measuring speed and TPS, tokens per second. We'll look at a range from one token per second, all the way up to grok levels, right? Of something like 500 plus, you know, 1000 tokens per second level. What else do we need to pay attention to? Memory and context window. So memory coupled with speed are the big two constraints for ELMs right now. Efficient language model, models that can run on your device. They chew up anywhere from four gigabytes of RAM, of GPU, of CPU, all the way up to 128 and beyond. To run Lama 3, 70 billion parameter on my MacBook, it will chew up something like half of all my available RAM. We also have context window. This is a classic one. Then we have JSON response and vision support. We're not gonna focus on these too much. These are more yes, no, do they have it or do they not? Is it multimodal or not? There are a couple other things that we need to pay attention to. First of all, we need to pay attention to these other attributes that we're missing here, but I don't think they matter as much as these six and specifically these four at the top here. So let's go ahead and walk through this through the lens of my personal standards for efficient language models. Let's break it down. So first things first, the accuracy for the ITV benchmark, which we're about to get to must hit 80%. So if a model is not passing about 80% here, I automatically disqualify it. Tokens per second. I require at least 20 tokens per second minimum. If it's below this, it's honestly just not worth it. It's too slow. There's not enough happening. Anything above this, of course we'll accept. So keep in mind when you're setting your personal standards, you're really looking for ranges, right? Anything above 80% for me is golden. Anything above 20 tokens per second at a very minimum is what we're looking for. So let's look at memory. 
For me, I am only willing to consume up to about 32 gigabytes of RAM, GPU, CPU, however it ends up getting sliced. On my 64-gigabyte machine, I have several Docker instances and other applications that are basically running 24/7 that constrain my dev environment. Regardless, I'm looking for ELMs that consume less than 32 gigabytes of memory. Context window: for me, the sweet spot is 32K and above. Llama 3 released with 8K. I said, cool. Benchmarks look great, but it's a little too small. For some of the larger prompts and prompt chains that I'm building up, I'm looking for 32K minimum context. I highly recommend you go through and set your personal standard for each one of these metrics, as they're likely to be the most important for getting your ELM, for getting a model, running on your device. So JSON response, vision support. I don't really care about vision support. This is not a high priority for me. Of course, it's a nice-to-have. There are image models that can run in isolation. That does the trick for me. I'm not super concerned about having local on-device multimodal models, at least right now. JSON response support is a must-have. For me, this is built into a lot of the model providers, and it's typically not a problem anymore. So these are my personal standards. The most important ones are up here: 80% accuracy on the ITV benchmark, which we'll talk about in just a second. We have the speed. I'm looking for 20 tokens per second at a minimum. I'm looking for a memory consumption maximum of 32 gigabytes. And then, of course, the context window. I am simplifying a lot of the details here, especially around the memory usage. I just want to give you a high level of how to think about what your standards are for ELMs, so that when they come around, you're ready to start using them for your personal tools and products. Having this ready to go as soon as these models are ready will save you time and money, especially as you scale up your usage of language models. So let's talk about the ITV benchmark. What is this? It's simple. It's nothing fancy. ITV is just: is this viable? That's what the test is all about. I just want to know, is this ELM viable? Are these efficient language models, AKA on-device language models, good enough? This code repository we're about to dive into is a personalized, use-case-specific benchmark to quickly swap ELMs, AKA on-device language models, in and out to know if they're ready for your tools and applications. So let's go ahead and take a quick look at this code base. Link for this is going to be in the description. Let's go ahead and crack open VS Code and let's just start with the README. So let's preview this and it's simple. This uses Bun, promptfoo, and Ollama for a minimalist, cross-platform, local LLM prompt testing and benchmarking experience. So before we dive into this any further, I'm just going to go ahead, open up the terminal, and type bun run elm, and that's going to kick off the test. So you can see right away, I have four models running, starting with GPT-3.5 as a control model to test against. And then you can see here, we have ollama chat llama3, we have Phi 3, and we have Gemma running as well. So while this is running through our 12 test cases, let's go ahead and take a look at what this code base looks like. So all the details that get set up are going to be in the README.
You're able to get set up with this in less than a minute. This code base was designed specifically to help you benchmark local models for your use cases so that when they're ready, you can start saving time and saving money immediately. If we look at the structure, it's very simple. We have some setup, some minor scripts, and then we have the most important thing: BENCH, underscore, underscore, and then whatever the test suite name is. This one's called Efficient Language Models. So let's go ahead and look at the prompt. So the prompt is just a simple template. This gets filled in with each individual test run. And if we open up our test files, you can see here, let's go ahead and collapse everything. You can see here we have a list of, what do we have here, 12 tests. They're sectioned off. You can see we have string manipulation here, command generation, code explanation, text classification. This is a work in progress of my personal ELM accuracy benchmark. By the time you're watching this, there'll likely be a few additional tests here. They'll be generic enough, though, so that you can come in, understand them, and tweak them to fit your own specific use case. So let's go ahead and take a look at this. So this is the test file, and we'll look into this in more detail in just a second here. But if you go to the most important file, the promptfoo configuration, you can see here, let's go ahead and collapse this. We have our control cloud LLM. So I like to have a kind of control and an experimental group. The control group is going to be our cloud LLM that we want to prove our local models are as good as, or near the performance of. Right now I'm using GPT-3.5. And then we have our experimental local ELMs. So we're going to go ahead and take a look at this. So in here, you can see we have Llama 3, we have Phi 3, and we have Gemma. Again, you can tweak these. This is all built on top of Ollama. Let's go ahead and run through our tool set quickly. We're using Bun, which is an all-in-one JavaScript runtime. Over the past year, the engineers have really matured the ecosystem. This is my go-to tool for all things JavaScript and TypeScript related. They recently just launched Windows support, which means that this code base will work out of the box for Mac, Linux, and Windows users. You can go ahead and click on this, and you'll be able to see the code base. Huge shout out to the Bun developers on all the great work here. We're using Ollama to serve our local language models. I probably don't need to introduce them. And last but not least, we're using promptfoo. I've talked about promptfoo in a few videos in the past, but it's super, super important to bring back up. This is how you can test your individual prompts against expectations. So what does that look like? If we scroll down to the hero here, you can see exactly what a test case looks like. So you have your prompts that you're going to test. So this is what you would normally type in a chat input field. And then you can go ahead and click test. Then you have your individual models. Let's say you want to test OpenAI, Claude, and Mistral Large. You would put those all here. So for each provider, it's going to run every single prompt. And then at the bottom, you have your test cases. Your test cases can pass in variables to your prompts, as you can see here. And then most importantly, your test cases can assert specific expectations on the output of your LLM.
So you can see here we're running this type, contains. We need to make sure that it has this string in it. We're making sure that the cost is below this amount, latency below this, etc. There are many different assertion types. The ITV benchmark repo uses these three key pieces of technology for a really, really simplistic experience. So you have your prompt configuration, where you specify what models you want to use. You have your tests, which specify the details. So let's go ahead and look at one of these tests. You can see here, this is a simple bullet summary test. So I'm saying create a summary of the following text in bullet points, and then here's the script to one of our previous videos. So, you know, here's a simple yet powerful idea that can help you take a large step toward useful and valuable agentic workflows. We're asserting, case insensitively, that all of these items are in the response of the prompt. So let's go ahead and look at our output. Let's see if our prompts completed. Okay, so we have 33 successful and 15 failed tests. So the LLMs ran every single one of these test cases here and reported their results. So let's go ahead and take a look at what that looks like. So after you run that, that was bun elm, you can run bun view. And if we open up package.json, you can see bun view just runs promptfoo view. This is going to kick off a local promptfoo server that shows us exactly what happened in the test runs. So right away, you can see we have a great summary of the results. So we have our control model failing only one test, right, so it passed with 91% accuracy. And then we have Llama 3, so close to my 80% standard. We'll dig into where it went wrong in just a second here. We then have Phi 3, which failed half of the 12 test cases, and then we have Gemma, which looks like it did one better, 7 out of 12. So you can see here, this is why it's important to have a control group specifically for testing ELMs. It's really good to compare against a kind of high-performing model, and you know, GPT-3.5 Turbo, it's not really even high performing anymore, but it's a good benchmark for testing against local models, because really, if we used Opus or GPT-4 here, the local models wouldn't even come close. So that's why I like to compare to something like GPT-3.5. You can also use Claude 3 Haiku here. This right away gives you a great benchmark on how local models are performing. Let's go ahead and look at one of these tests. What happened? Where did things go wrong? Let's look at our text classification. This is a simple test. The prompt is: is the following block of text a SQL natural language query (NLQ)? Respond exclusively with yes or no. So this test here is going to look at how well the model can both answer correctly and answer precisely, right? It needs to say yes or no. And then the block of text is: select 10 users over the age of 21 with a Gmail address. And then we have the assertion type equals yes. So our test case validates this test if it returns exclusively yes, and we can look at the prompt test to see exactly what that looks like. So if you go to tests.yaml, we can see we're looking for just yes. This is what that test looks like, right? So this is one of our text classification tests, and we have this assertion type equals yes. So equals is used when you know exactly what you want the response to be. A lot of the time you'll want something like an icontains-all, so case-insensitive contains everything, or a case-insensitive contains any. And there are lots of different assertions you can make. You
can easily dive into that. I've linked that in the README. You'll want to look at the assertions documentation in promptfoo. They have a whole list here of different assertions you can make to improve and strengthen your prompt tests. So that's what that test looks like, and you can kind of go through, line over each model, to see exactly what went right, what went wrong, etc. So feel free to check out the other test cases. The long story short here is that by running the ITV benchmark, by running your personal benchmarks against local models, you can have higher confidence, and you can have first-mover advantage on getting your hands on these local models and truly utilizing them. As you can see here, Llama 3 is nearly within my standard of what I need an ELM to do, based on these 12 test cases. I'll increase this to add a lot more of the use cases that I use. Out of these 12 test cases, Llama 3 is performing really, really well, and this is the 8B model, right? So if we look at Ollama, you can see here the default version that comes in here is the 8-billion-parameter model, that's the 4-bit quantization. So pretty good stuff here. I don't need to talk about how great Llama 3 is, the rest of the internet is doing that, but it is really awesome to see how it performs on your specific use cases. The closer you get to the metal here, the closer you understand how these models are performing next to each other, the better and the faster you're going to be able to take these models and productionize them in your tools and products. I also just want to shout out how incredible it is to actually run these tests over and over and over again with the same model without thinking about the cost for a single second. You can see here, we're getting about 12 tokens per second across the board. So not ideal, not super great, but still, everything completed fine. You can walk through the examples. A lot of these test cases are passing. This is really great. I'm gonna be keeping a pretty close eye on this stuff. So definitely like and subscribe if you're interested in the best-performing local models. I feel like we're gonna have a few different classes of models, right? If we break this down, fastest, cheapest, and then it was best, slowest. And now what I think we need to do is take this and add a nest to it. So we basically say something like this, right? We say cloud, right? And then we say the slowest, most expensive. And then we say local, fastest, lower accuracy, and best, slowest, right? So things kind of change when you're at the local level. Now we're just trading off speed and accuracy, which simplifies things a lot, right? Because basically we were doing this, where we had the fastest, cheapest, and we had lower accuracy. And then we had best, slowest, most expensive, right? So this is your Opus, this is your GPT-4, and this is your Haiku, GPT-3. But now we're getting into this interesting place where now we have things like this, right? Now we have Phi 3, we have Llama 3, Llama 3 at seven or eight billion. We also have Gemma. And then in the slowest, we have our bigger models, right? So this is where, like, Llama 3 at 70 billion, that's where this goes. And then, you know, whatever other big models come out that are, you know, really going to trip up your RAM, they're going to run slower, but they will give you the best performance that you can possibly have locally. So I'm keeping an eye on this. Hit the like and hit the sub if you want to stay up to date with how cloud versus local models progress.
We're going to be covering these on the channel, and I'll likely use, you know, this class system to separate them, to keep an eye on these, right? The first thing that needs to happen is we need anything at all to run locally, right? So this is kind of, you know, in the future, same with this. Right now we need just anything to run well enough. So, you know, we need decent accuracy, any speed, right? So this is what we're looking for right now. And this stuff is going to come in the future. So that's the way I'm looking at this. The ITV benchmark can help you gain confidence in your prompts. Link for the code is going to be in the description. I built this to be ultra simple. Just follow the README to get started. Thanks to Bun, promptfoo, and Ollama, this should be completely cross-platform, and I'll be updating this with some additional test cases. By the time you watch this, I'll likely have added several additional tests. I'm missing some things in here like code generation, context window length testing, and a couple other sections. So look forward to that. I hope all of this makes sense. I hope you're feeling the speed of the open source community building toward usable, viable ELMs. I think this is something that we've all been really excited about. And it's finally starting to happen. I'm going to predict by the end of the year, we're going to have an on-device Haiku to GPT-4 level model running, consuming less than 8 gigabytes of RAM. As soon as OpenELM hits Ollama, we'll be able to test this as well. And that's one of the highlights of using the ITV benchmark inside of this code base. You'll be able to quickly and seamlessly get that up and running by just updating the model name, adding a new configuration here like this. And then it'll look something like this, OpenELM, and then whatever the size is going to be, say it's the 3B, and that's it. Then you just run the test again, right? So that's the beauty of having a test suite like this set up and ready to go. You can, of course, come in here and customize this. You can add Opus, you can add Haiku, you can add other models, tweak it to your liking. That's what this is all about. I highly recommend you get in here and test this. This was important enough for me to take a break from personal AI assistants, and HSE, and all of that stuff. And I'll see you guys in the next video. Bye-bye. MacBook Pro M4 chip is released. And as the LLM community rolls out permutations of Llama 3, I think very soon, possibly before mid-2024, ELMs, efficient language models, will be ready for on-device use. Again, this is use case specific, which is really the whole point of me creating this video: to share this code base with you so that you can know exactly what your use case specific standards are. Because after you have standards set and a great prompting framework like promptfoo, you can then answer the question for yourself, for your tools, and for your products: is this efficient language model ready for my device? For me personally, the answer to this question is very soon. If you enjoyed this video, you know what to do. Thanks so much for watching. Stay focused and keep building.
355 |   assert:
356 |     - type: icontains-any
357 |       value: ["80%", "80 percent"]
358 | # Personal AI Assistant Responses
359 | - description: "Prompt Test: Ada AI Assistant"
360 |   vars:
361 |     prompt: |
362 |       You are a friendly, ultra helpful, attentive, concise AI assistant named 'Ada'.
363 |
364 |       You work with your human companion 'Dan' to build valuable experience through software.
365 |
366 |       We both like short, concise, back-and-forth conversations.
367 |
368 |       Concisely communicate the following message to your human companion: 'Select an image to generate a Vue component from'.
369 |   assert:
370 |     - type: icontains-all
371 |       value: ["select", "image", "generate", "Vue", "component", "dan"]
372 | - description: "Prompt Test: Personal AI Assistant"
373 |   vars:
374 |     prompt: |
375 |       You are a friendly, ultra helpful, attentive, concise AI assistant named 'Ada'.
376 |
377 |       You work with your human companion 'Dan' to build valuable experience through software.
378 |
379 |       We both like short, concise, back-and-forth conversations.
380 |
381 |       Communicate the following message to your human companion: 'I've found the URL in your clipboard. I'll scrape the URL and generate example code for you. But first, what about the example code would you like me to focus on?'
382 |   assert:
383 |     - type: icontains-all
384 |       value: ["URL", "clipboard", "scrape", "generate", "code", "dan", "what", "focus"]
385 | - description: "Prompt Test: Personal AI Assistant"
386 |   vars:
387 |     prompt: |
388 |       You are a friendly, ultra helpful, attentive, concise AI assistant named 'Ada'.
389 |
390 |       You work with your human companion 'Dan' to build valuable experience through software.
391 |
392 |       We both like short, concise, back-and-forth conversations.
393 |
394 |       Communicate the following message to your human companion: 'Code has been written to the working directory.'
395 |   assert:
396 |     - type: icontains-all
397 |       value: ["code", "written", "working", "directory", "dan"]
398 |
--------------------------------------------------------------------------------
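As a rough sketch of the "just add a new provider entry and rerun" workflow the transcript describes, assuming a new model such as OpenELM eventually ships in Ollama under a hypothetical openelm:3b tag (it is not available at the time of writing, and the tag name is a guess), the extra entry in promptfooconfig.yaml might look like this:

# Hypothetical additional Experimental Local ELM provider (tag name assumed, not yet in Ollama)
- id: ollama:chat:openelm:3b
  config:
    modelName: "openelm:3b"
    stream: false
    temperature: 0.2

With that entry in place, rerunning `bun run elm` would include the new model alongside the existing control and local providers.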
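And as an illustration of encoding the personal standards discussed in the transcript as assertions (the 80% pass rate is read off the suite summary, but per-test cost and latency ceilings can be asserted directly, as the assertions walkthrough mentions), a hypothetical extra test case for this tests.yaml might look like the following; the prompt, expected values, and thresholds are made up for the example:

# Hypothetical example test: functional check plus latency/cost ceilings as a rough proxy for the speed standard
- description: "Command Generation: list hidden files (with speed and cost ceiling)"
  vars:
    prompt: |
      Respond exclusively with the shell command that lists all files, including hidden ones, in the current directory.
  assert:
    - type: icontains-any
      value: ["ls -a", "ls -la", "ls -al"]
    - type: latency
      threshold: 5000   # milliseconds; stand-in for the tokens-per-second requirement
    - type: cost
      threshold: 0.001  # dollars; local models should register effectively zero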