├── .editorconfig
├── .env.template
├── .gitattributes
├── .gitignore
├── .yarn
    └── patches
    │   └── promptfoo-npm-0.113.3-239bf96f0e.patch
├── CITATION.cff
├── README.md
├── assertions
    ├── fhirPathEquals.mjs
    ├── isBundle.mjs
    ├── metaElementMissing.mjs
    └── validateOperation.mjs
├── etc
    └── fhir-gpt.yaml
├── evals
    ├── extraction
    │   ├── config-minimalist.yaml
    │   ├── config-specialist.yaml
    │   ├── providers.yaml
    │   └── tests
    │   │   ├── basic-demographics.yaml
    │   │   ├── conditions.yaml
    │   │   ├── explanations-of-benefit.yaml
    │   │   ├── medication-requests.yaml
    │   │   ├── observations.yaml
    │   │   ├── patient-history.json
    │   │   └── patient-history.yaml
    └── generation
    │   ├── config-multi-turn-tool-use.js
    │   ├── config-zero-shot-bundle.yaml
    │   ├── markdown-transformer.js
    │   ├── providers.yaml
    │   └── tests.yaml
├── package.json
├── providers
    ├── AnthropicMessagesWithRecursiveToolCallsProvider.ts
    └── OpenAiResponsesWithRecursiveToolCallsProvider.ts
├── tools
    └── validateFhirBundle.mjs
├── tsconfig.json
└── yarn.lock


/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | end_of_line = lf
 5 | insert_final_newline = true
 6 | 
 7 | [*.{js,json,yml}]
 8 | charset = utf-8
 9 | indent_style = space
10 | indent_size = 2
11 | 


--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 | ANTHROPIC_API_KEY=
3 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | /.yarn/**            linguist-vendored
2 | /.yarn/releases/*    binary
3 | /.yarn/plugins/**/*  binary
4 | /.pnp.*              binary linguist-generated
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
  2 | 
  3 | # Logs
  4 | 
  5 | logs
  6 | *.log
  7 | npm-debug.log*
  8 | yarn-debug.log*
  9 | yarn-error.log*
 10 | lerna-debug.log*
 11 | .pnpm-debug.log*
 12 | 
 13 | # Caches
 14 | 
 15 | .cache
 16 | 
 17 | # Diagnostic reports (https://nodejs.org/api/report.html)
 18 | 
 19 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 20 | 
 21 | # Runtime data
 22 | 
 23 | pids
 24 | *.pid
 25 | *.seed
 26 | *.pid.lock
 27 | 
 28 | # Directory for instrumented libs generated by jscoverage/JSCover
 29 | 
 30 | lib-cov
 31 | 
 32 | # Coverage directory used by tools like istanbul
 33 | 
 34 | coverage
 35 | *.lcov
 36 | 
 37 | # nyc test coverage
 38 | 
 39 | .nyc_output
 40 | 
 41 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 42 | 
 43 | .grunt
 44 | 
 45 | # Bower dependency directory (https://bower.io/)
 46 | 
 47 | bower_components
 48 | 
 49 | # node-waf configuration
 50 | 
 51 | .lock-wscript
 52 | 
 53 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 54 | 
 55 | build/Release
 56 | 
 57 | # Dependency directories
 58 | 
 59 | node_modules/
 60 | jspm_packages/
 61 | 
 62 | # Snowpack dependency directory (https://snowpack.dev/)
 63 | 
 64 | web_modules/
 65 | 
 66 | # TypeScript cache
 67 | 
 68 | *.tsbuildinfo
 69 | 
 70 | # Optional npm cache directory
 71 | 
 72 | .npm
 73 | 
 74 | # Optional eslint cache
 75 | 
 76 | .eslintcache
 77 | 
 78 | # Optional stylelint cache
 79 | 
 80 | .stylelintcache
 81 | 
 82 | # Microbundle cache
 83 | 
 84 | .rpt2_cache/
 85 | .rts2_cache_cjs/
 86 | .rts2_cache_es/
 87 | .rts2_cache_umd/
 88 | 
 89 | # Optional REPL history
 90 | 
 91 | .node_repl_history
 92 | 
 93 | # Output of 'npm pack'
 94 | 
 95 | *.tgz
 96 | 
 97 | # Yarn Integrity file
 98 | 
 99 | .yarn-integrity
100 | 
101 | # dotenv environment variable files
102 | 
103 | .env
104 | .env.development.local
105 | .env.test.local
106 | .env.production.local
107 | .env.local
108 | 
109 | # parcel-bundler cache (https://parceljs.org/)
110 | 
111 | .parcel-cache
112 | 
113 | # Next.js build output
114 | 
115 | .next
116 | out
117 | 
118 | # Nuxt.js build / generate output
119 | 
120 | .nuxt
121 | dist
122 | 
123 | # Gatsby files
124 | 
125 | # Comment in the public line in if your project uses Gatsby and not Next.js
126 | 
127 | # https://nextjs.org/blog/next-9-1#public-directory-support
128 | 
129 | # public
130 | 
131 | # vuepress build output
132 | 
133 | .vuepress/dist
134 | 
135 | # vuepress v2.x temp and cache directory
136 | 
137 | .temp
138 | 
139 | # Docusaurus cache and generated files
140 | 
141 | .docusaurus
142 | 
143 | # Serverless directories
144 | 
145 | .serverless/
146 | 
147 | # FuseBox cache
148 | 
149 | .fusebox/
150 | 
151 | # DynamoDB Local files
152 | 
153 | .dynamodb/
154 | 
155 | # TernJS port file
156 | 
157 | .tern-port
158 | 
159 | # Stores VSCode versions used for testing VSCode extensions
160 | 
161 | .vscode-test
162 | 
163 | # yarn v2
164 | 
165 | .yarn/cache
166 | .yarn/unplugged
167 | .yarn/build-state.yml
168 | .yarn/install-state.gz
169 | .pnp.*
170 | 
171 | # IntelliJ based IDEs
172 | .idea
173 | 
174 | # Finder (MacOS) folder config
175 | .DS_Store
176 | 
177 | evals/private-*


--------------------------------------------------------------------------------
/.yarn/patches/promptfoo-npm-0.113.3-239bf96f0e.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/package.json b/package.json
 2 | index e17eeb5e4ecf61ac3c8b6f96d6a16445f31a60e1..142ad6371c613270ece06fa02543cdda934d1ce0 100644
 3 | --- a/package.json
 4 | +++ b/package.json
 5 | @@ -14,6 +14,42 @@
 6 |      ".": {
 7 |        "import": "./dist/src/index.js",
 8 |        "require": "./dist/src/index.js"
 9 | +    },
10 | +    "./dist/src/providers/openai/responses": {
11 | +      "import": "./dist/src/providers/openai/responses.js",
12 | +      "require": "./dist/src/providers/openai/responses.js"
13 | +    },
14 | +    "./dist/src/providers/openai/util": {
15 | +      "import": "./dist/src/providers/openai/util.js",
16 | +      "require": "./dist/src/providers/openai/util.js"
17 | +    },
18 | +    "./dist/src/logger": {
19 | +      "import": "./dist/src/logger.js",
20 | +      "require": "./dist/src/logger.js"
21 | +    },
22 | +    "./dist/src/providers/openai/types": {
23 | +      "import": "./dist/src/providers/openai/types.js",
24 | +      "require": "./dist/src/providers/openai/types.js"
25 | +    },
26 | +    "./dist/src/envars": {
27 | +      "import": "./dist/src/envars.js",
28 | +      "require": "./dist/src/envars.js"
29 | +    },
30 | +    "./dist/src/util": {
31 | +      "import": "./dist/src/util/index.js",
32 | +      "require": "./dist/src/util/index.js"
33 | +    },
34 | +    "./dist/src/providers/anthropic/generic": {
35 | +      "import": "./dist/src/providers/anthropic/generic.js",
36 | +      "require": "./dist/src/providers/anthropic/generic.js"
37 | +    },
38 | +    "./dist/src/providers/anthropic/types": {
39 | +      "import": "./dist/src/providers/anthropic/types.js",
40 | +      "require": "./dist/src/providers/anthropic/types.js"
41 | +    },
42 | +    "./dist/src/providers/anthropic/util": {
43 | +      "import": "./dist/src/providers/anthropic/util.js",
44 | +      "require": "./dist/src/providers/anthropic/util.js"
45 |      }
46 |    },
47 |    "workspaces": [
48 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: 'If you use this software, please cite it as below.'
 3 | authors:
 4 |   - family-names: 'Kelly'
 5 |     given-names: 'Joshua'
 6 |     orcid: 'https://orcid.org/0009-0000-7191-0595'
 7 | title: 'FHIR LLM Eval'
 8 | version: 0.0.1
 9 | date-released: 2024-11-22
10 | url: 'https://github.com/flexpa/fhir-llm-evals'
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # @flexpa/llm-fhir-eval
 2 | 
 3 | > [!NOTE]
 4 | > Follow the development progress on [FHIR Chat](https://chat.fhir.org/#narrow/channel/323443-Artificial-Intelligence.2FMachine-Learning-.28AI.2FML.29/topic/LLM.20FHIR.20Eval.20Preview/near/483998202).
 5 | 
 6 | ## Overview
 7 | 
 8 | `@flexpa/llm-fhir-eval` is an evaluation framework designed to benchmark the performance of LLMs on FHIR-specific tasks including generation, validation, and extraction. This framework systematically tests and validates the capabilities of LLMs in handling various healthcare-interoperability related tasks, ensuring they meet the standards required for effective FHIR implementations. It implements evaluations from prior art such as [FHIR-GPT](https://ai.nejm.org/doi/10.1056/AIcs2300301).
 9 | 
10 | ## Benchmark
11 | 
12 | `@flexpa/llm-fhir-eval` benchmarks FHIR-specific tasks including:
13 | 
14 | 1. **FHIR Resource Generation**:
15 |    - Generate accurate FHIR resources such as `Patient`, `Observation`, `MedicationStatement`, etc.
16 |    - Test the ability to create complex resource relationships and validate terminology bindings.
17 | 
18 | 2. **FHIR Resource Validation**:
19 |    - Validate FHIR resources using operations like `$validate`.
20 |    - Check for schema compliance, required field presence, and value set binding verification.
21 | 
22 | 3. **Data Extraction**:
23 |    - Extract structured FHIR-compliant data from clinical notes and other unstructured data.
24 |    - Evaluate the proficiency of LLMs in extracting specific healthcare data elements.
25 | 
26 | 4. **Tool Use**:
27 |    - Test models' ability to use FHIR validation tools and other healthcare-specific functions.
28 |    - Validate proper tool calling for FHIR operations.
29 | 
30 | ## Available Evaluations
31 | 
32 | 1. **Data Extraction** (`evals/extraction/`)
33 |    - Description: Comprehensive evaluation of extracting structured FHIR data from unstructured clinical text.
34 |    - Configurations: Both minimalist and specialist approaches available.
35 |    - Test categories: Basic demographics, conditions, explanations of benefit, medication requests, observations.
36 | 
37 | 2. **FHIR Resource Generation** (`evals/generation/`)
38 |    - Description: Tests the ability to generate valid FHIR resources and bundles.
39 |    - Configurations: Zero-shot bundle generation and multi-turn tool use scenarios.
40 |    - Models supported: GPT-3.5-turbo, GPT-4.1, O3 (low/high reasoning), Claude 3.5 Haiku, Claude 3.5 Sonnet, Claude Sonnet 4, Claude Opus 4
41 | 
42 | ## Custom Assertions
43 | 
44 | The framework includes custom assertion functions:
45 | 
46 | - `fhirPathEquals.mjs`: Validates FHIR Path expressions
47 | - `isBundle.mjs`: Checks if output is a valid FHIR Bundle
48 | - `metaElementMissing.mjs`: Validates required metadata elements
49 | - `validateOperation.mjs`: Validates FHIR operation results
50 | 
51 | ## Tools
52 | 
53 | - `validateFhirBundle.mjs`: Tool for validating FHIR Bundle resources
54 | 
55 | ## Custom Providers
56 | 
57 | - `AnthropicMessagesWithRecursiveToolCallsProvider.ts`: Enhanced Anthropic provider with recursive tool calling (up to 10 depth levels)
58 | - `OpenAiResponsesWithRecursiveToolCallsProvider.ts`: Enhanced OpenAI provider with recursive tool calling
59 | 
60 | These providers enable multi-turn tool interactions where models can iteratively call validation tools to improve their FHIR resource generation.
61 | 
62 | ## Commands to Run Evaluations
63 | 
64 | Install dependencies and set up environment variables:
65 | 
66 | ```bash
67 | yarn install
68 | ```
69 | 
70 | Copy the `.env.template` file to `.env` and supply your API keys for the models you plan to test.
71 | 
72 | Run an evaluation:
73 | 
74 | ```bash
75 | # Example: Run the extraction evaluation with minimalist config
76 | promptfoo eval -c evals/extraction/config-minimalist.yaml
77 | 
78 | # Example: Run the FHIR bundle generation evaluation
79 | promptfoo eval -c evals/generation/config-zero-shot-bundle.yaml
80 | 
81 | # Example: Run multi-turn tool use evaluation
82 | promptfoo eval -c evals/generation/config-multi-turn-tool-use.js
83 | ```
84 | 
85 | The evaluation will print its performance metrics to the console and optionally save results to files.
86 | 
87 | 


--------------------------------------------------------------------------------
/assertions/fhirPathEquals.mjs:
--------------------------------------------------------------------------------
 1 | import { evalFhirPath } from '@medplum/core';
 2 | 
 3 | export default (output, context) => {
 4 |   try {
 5 |     const result = JSON.parse(output);
 6 |     const fhirPath = context.fhirpath;
 7 |     const evalResults = evalFhirPath(fhirPath, result);
 8 |     return evalResults.length > 0;
 9 |   } catch {
10 |     return false;
11 |   }
12 | };
13 | 


--------------------------------------------------------------------------------
/assertions/isBundle.mjs:
--------------------------------------------------------------------------------
1 | export default (output, _context) => {
2 |   try {
3 |     const result = JSON.parse(output);
4 |     return result.resourceType === 'Bundle';
5 |   } catch {
6 |     return false;
7 |   }
8 | };
9 | 


--------------------------------------------------------------------------------
/assertions/metaElementMissing.mjs:
--------------------------------------------------------------------------------
 1 | export default (output, _context) => {
 2 |   try {
 3 |     const result = JSON.parse(output);
 4 |     if (!result || typeof result !== 'object') {
 5 |       return false;
 6 |     }
 7 | 
 8 |     // Bundle itself should not include a meta element.
 9 |     if (result.meta !== undefined) {
10 |       return false;
11 |     }
12 | 
13 |     if (!Array.isArray(result.entry)) {
14 |       return false;
15 |     }
16 | 
17 |     // Each resource must either:
18 |     // 1. Have no meta element, OR
19 |     // 2. Have a meta element whose only property is `profile`.
20 |     const resourceMetaIsValid = (meta) => {
21 |       if (meta === undefined) return true;
22 |       // meta must be an object with exactly one key: "profile"
23 |       return typeof meta === 'object' && meta !== null && Object.keys(meta).length === 1 && 'profile' in meta;
24 |     };
25 | 
26 |     return result.entry.every((e) => resourceMetaIsValid(e.resource?.meta));
27 |   } catch {
28 |     return false;
29 |   }
30 | };
31 | 


--------------------------------------------------------------------------------
/assertions/validateOperation.mjs:
--------------------------------------------------------------------------------
 1 | import { randomUUID } from 'crypto';
 2 | 
 3 | // Validate a FHIR resource using the local validator and return error issues or true if none found
 4 | export async function validate(modelResponse) {
 5 |   const response = await fetch('http://localhost:8082/validate', {
 6 |     method: 'POST',
 7 |     headers: {
 8 |       accept: 'application/json',
 9 |       'Content-Type': 'application/json',
10 |     },
11 |     body: JSON.stringify({
12 |       cliContext: {
13 |         sv: '4.0.1',
14 |         ig: ['hl7.fhir.us.core#4.0.0'],
15 |         locale: 'en',
16 |       },
17 |       filesToValidate: [
18 |         {
19 |           fileName: 'manually_entered_file.json',
20 |           fileContent: modelResponse,
21 |           fileType: 'json',
22 |         },
23 |       ],
24 |       sessionId: randomUUID(),
25 |     }),
26 |   });
27 |   const data = await response.json();
28 | 
29 |   const errorIssues = data.outcomes.flatMap((outcome) => outcome.issues).filter((issue) => issue.level === 'ERROR');
30 | 
31 |   return errorIssues;
32 | }
33 | 
34 | export default async function evaluate(modelResponse) {
35 |   const response = await validate(modelResponse);
36 | 
37 |   return response.length === 0 ? true : response;
38 | }
39 | 


--------------------------------------------------------------------------------
/etc/fhir-gpt.yaml:
--------------------------------------------------------------------------------
  1 | description: 'FHIR-GPT Prompt Supplementary Appendix DOI: 10.1056/AIcs2300301'
  2 | 
  3 | providers:
  4 |   - anthropic:messages:claude-3-5-sonnet-20241022
  5 |   - openai:chat:gpt-4o
  6 |   - openai:chat:gpt-4o-mini
  7 | 
  8 | prompts:
  9 |   - label: 'FHIR-GPT Prompt'
 10 |     description: 'Prompt used in FHIR-GPT'
 11 |     raw: |
 12 |       You are a helpful assistant that can help with medication data extraction.
 13 |       User will paste a short narrative that describes the administration of a drug.
 14 |       Please extract the drug route (How drug should enter body), e.g. PO, IV.
 15 |       All other drug information, e,g. dosage, frequency, reason shall be discarded.
 16 | 
 17 |       Please MUST ONLY return the converted .json result without any explanations, or contexts.
 18 |       The output itself must be parse-able with python's json.loads()
 19 |       The output should start and end with brackets.
 20 | 
 21 |       If you cannot find related drug route, you MUST leave it as blank and MUST return a blank json {}
 22 |       You MUST use information only from the original text, MUST NOT infer from the context.
 23 | 
 24 |       For each drug route, please extract the originial text and find its most related SNOMED code in o
 25 |       If you cannot find an exact same meaning SNOMED drug route code, just leave it as blank.
 26 | 
 27 |       For example, the narrative "Oxycodone-Acetaminophen 5-325 mg Tablet Sig: 1-2 Tablets PO\nQ4-6H (e
 28 |       You should return a json format: {'text': 'PO', 'coding': [{'system': 'http://snomed.info/sct',
 29 | 
 30 |       Another example, the narrative "Daptomycin, intravenously for a total of 14\ndays"
 31 |       You should return a json format: {'text': 'intravenously', 'coding': [{'system': 'http://snomed.i
 32 | 
 33 |       Another example, the narrative "heparin sodium, porcine 5000 UNT/ML Injectable Solution"
 34 |       You should return a json format: {'text': 'injections'}
 35 | 
 36 |       Another example, the narrative "Oxycodone-Acetaminophen 5-325 mg q4h prn torn ACL pain"
 37 |       You should return blank json {}
 38 | 
 39 |       SNOMED codes to select from:
 40 | 
 41 |       Code Display
 42 |       284009009 Route of administration values
 43 |       6064005 Topical route
 44 |       10547007 Otic route
 45 |       12130007 Intra-articular route
 46 |       16857009 Per vagina
 47 |       26643006 Oral route
 48 |       34206005 Subcutaneous route
 49 |       37161004 Per rectum
 50 |       37737002 Intraluminal route
 51 |       37839007 Sublingual route
 52 |       38239002 Intraperitoneal route
 53 |       45890007 Transdermal route
 54 |       46713006 Nasal route
 55 |       47625008 Intravenous route
 56 |       54471007 Buccal route
 57 |       54485002 Ophthalmic route
 58 |       58100008 Intra-arterial route
 59 |       60213007 Intramedullary route
 60 |       62226000 Intrauterine route
 61 |       72607000 Intrathecal route
 62 |       78421000 Intramuscular route
 63 |       90028008 Urethral route
 64 |       127490009 Gastrostomy route
 65 |       127491008 Jejunostomy route
 66 |       127492001 Nasogastric route
 67 |       372449004 Dental use
 68 |       372450004 Endocervical use
 69 |       372451000 Endosinusial use
 70 |       372452007 Endotracheopulmonary use
 71 |       372453002 Extra-amniotic use
 72 |       372454008 Gastroenteral use
 73 |       372457001 Gingival use
 74 |       372458006 Intraamniotic use
 75 |       372459003 Intrabursal use
 76 |       372460008 Intracardiac use
 77 |       372461007 Intracavernous use
 78 |       372462000 Intracervical route
 79 |       372463005 Intracoronary use
 80 |       372464004 Intradermal use
 81 |       372465003 Intradiscal use
 82 |       372466002 Intralesional use
 83 |       372467006 Intralymphatic use
 84 |       372468001 Intraocular use
 85 |       372469009 Intrapleural use
 86 |       372470005 Intrasternal use
 87 |       372471009 Intravesical use
 88 |       372472002 Ocular route
 89 |       372473007 Oromucosal use
 90 |       372474001 Periarticular use
 91 |       372475000 Perineural use
 92 |       372476004 Subconjunctival use
 93 |       404815008 Transmucosal route
 94 |       404818005 Intratracheal route
 95 |       404819002 Intrabiliary route
 96 |       404820008 Epidural route
 97 |       416174007 Suborbital route
 98 |       417070009 Caudal route
 99 |       417255000 Intraosseous route
100 |       417950001 Intrathoracic route
101 |       417985001 Enteral route
102 |       417989007 Intraductal route
103 |       418091004 Intratympanic route
104 |       418114005 Intravenous central route
105 |       418133000 Intramyometrial route
106 |       418136008 Gastro-intestinal stoma route
107 |       418162004 Colostomy route
108 |       418204005 Periurethral route
109 |       418287000 Intracoronal route
110 |       418321004 Retrobulbar route
111 |       418331006 Intracartilaginous route
112 |       418401004 Intravitreal route
113 |       418418000 Intraspinal route
114 |       418441008 Orogastric route
115 |       418511008 Transurethral route
116 |       418586008 Intratendinous route
117 |       418608002 Intracorneal route
118 |       418664002 Oropharyngeal route
119 |       418722009 Peribulbar route
120 |       418730005 Nasojejunal route
121 |       418743005 Fistula route
122 |       418813001 Surgical drain route
123 |       418821007 Intracameral route
124 |       418851001 Paracervical route
125 |       418877009 Intrasynovial route
126 |       418887008 Intraduodenal route
127 |       418892005 Intracisternal route
128 |       418947002 Intratesticular route
129 |       418987007 Intracranial route
130 |       419021003 Tumour cavity route
131 |       419165009 Paravertebral route
132 |       419231003 Intrasinal route
133 |       419243002 Transcervical route
134 |       419320008 Subtendinous route
135 |       419396008 Intraabdominal route
136 |       419601003 Subgingival route
137 |       419631009 Intraovarian route
138 |       419684008 Ureteral route
139 |       419762003 Peritendinous route
140 |       419778001 Intrabronchial route
141 |       419810008 Intraprostatic route
142 |       419874009 Submucosal route
143 |       419894000 Surgical cavity route
144 |       419954003 Ileostomy route
145 |       419993007 Intravenous peripheral route
146 | 
147 |       User input:
148 |       {{note}}
149 | 
150 | defaultAssert:
151 |   - type: is-json
152 | 
153 | tests:
154 |   - vars:
155 |       note: |
156 |         Carvedilol
157 |         6.25 mg PO BID
158 |     assert:
159 |       - type: equals
160 |         value:
161 |           {
162 |             'text': 'PO',
163 |             'coding': [{ 'system': 'http://snomed.info/sct', 'code': '26643006', 'display': 'Oral route' }],
164 |           }
165 | 


--------------------------------------------------------------------------------
/evals/extraction/config-minimalist.yaml:
--------------------------------------------------------------------------------
 1 | # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 2 | description: 'Structured FHIR Data Extraction'
 3 | 
 4 | providers:
 5 |   - file://./providers.yaml
 6 | 
 7 | prompts:
 8 |   - label: 'Minimalist'
 9 |     raw: |
10 |       Extract the answer to the question from the FHIR resource.
11 | 
12 |       <fhir-resource>
13 |       {{resource}}
14 |       </fhir-resource>
15 | 
16 |       <question>
17 |       {{question}}
18 |       </question>
19 | 
20 | defaultTest:
21 |   options:
22 |     transform: output.trim()
23 | 
24 | tests:
25 |   - file://tests/basic-demographics.yaml
26 |   - file://tests/conditions.yaml
27 |   - file://tests/observations.yaml
28 |   - file://tests/explanations-of-benefit.yaml
29 |   - file://tests/medication-requests.yaml
30 |   - file://tests/patient-history.yaml
31 | 


--------------------------------------------------------------------------------
/evals/extraction/config-specialist.yaml:
--------------------------------------------------------------------------------
 1 | # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 2 | description: 'Structured FHIR Data Extraction'
 3 | 
 4 | providers:
 5 |   - file://./providers.yaml
 6 | 
 7 | prompts:
 8 |   - label: 'Specialist'
 9 |     raw: |
10 |       You are a FHIR data extraction specialist.
11 |       Given a FHIR resource and a question, extract the requested information.
12 |       Return only the specific answer without explanation.
13 |       If the question cannot be answered with the information provided, return "N/A".
14 |       Do not infer or make assumptions.
15 |       When the question is about a specific value, return the value only.
16 |       When the value exists literally in the FHIR resource, return the value only.
17 |       If a unit is specified, return the value with unit, in the normally expected format.
18 |       Do not return extra text or formatting including unnecesary quotes around strings.
19 |       Do not append or prepend any newlines.
20 | 
21 |       <fhir-resource>
22 |       {{resource}}
23 |       </fhir-resource>
24 | 
25 |       <question>
26 |       {{question}}
27 |       </question>
28 | 
29 | defaultTest:
30 |   options:
31 |     transform: output.trim()
32 | 
33 | tests:
34 |   - file://tests/basic-demographics.yaml
35 |   - file://tests/conditions.yaml
36 |   - file://tests/observations.yaml
37 |   - file://tests/explanations-of-benefit.yaml
38 |   - file://tests/medication-requests.yaml
39 |   - file://tests/patient-history.yaml
40 | 


--------------------------------------------------------------------------------
/evals/extraction/providers.yaml:
--------------------------------------------------------------------------------
 1 | - label: openai-gpt-3.5-turbo
 2 |   id: openai:chat:gpt-3.5-turbo
 3 |   config:
 4 |     max_output_tokens: 8092
 5 | - label: openai-gpt-4.1
 6 |   id: openai:responses:gpt-4.1
 7 |   config:
 8 |     max_output_tokens: 8092
 9 | - label: openai-o3-low
10 |   id: openai:responses:o3
11 |   config:
12 |     max_output_tokens: 16184
13 |     reasoning_effort: 'low'
14 | - label: openai-o3-high
15 |   id: openai:responses:o3
16 |   config:
17 |     max_output_tokens: 16184
18 |     reasoning_effort: 'high'
19 | - label: claude-3-5-haiku-20241022
20 |   id: anthropic:messages:claude-3-5-haiku-20241022
21 |   config:
22 |     max_output_tokens: 8092
23 | - label: anthropic-claude-3-5-sonnet-20241022
24 |   id: anthropic:messages:claude-3-5-sonnet-20241022
25 |   config:
26 |     max_output_tokens: 8092
27 | - label: anthropic-claude-sonnet-4-20250514
28 |   id: anthropic:messages:claude-sonnet-4-20250514
29 |   config:
30 |     max_output_tokens: 8092
31 | - label: anthropic-claude-opus-4-20250514
32 |   id: anthropic:messages:claude-opus-4-20250514
33 |   config:
34 |     max_output_tokens: 8092
35 | - label: google-gemini-2.0-flash
36 |   id: google:gemini-2.0-flash
37 |   config:
38 |     max_output_tokens: 8092
39 | - label: google-gemini-2.5-flash-preview-05-20
40 |   id: google:gemini-2.5-flash-preview-05-20
41 |   config:
42 |     max_output_tokens: 8092
43 | - label: google-gemini-2.5-pro-preview-05-06
44 |   id: google:gemini-2.5-pro-preview-05-06
45 |   config:
46 |     max_output_tokens: 8092
47 | - label: ii-medical-8b
48 |   id: openai:chat:II-Medical-8B
49 |   config:
50 |     max_output_tokens: 16184
51 |     apiBaseUrl: https://g6ifi04b81u9oza5.us-east-1.aws.endpoints.huggingface.cloud/v1
52 |     # @note set API KEY
53 |     # apiKey: 
54 |     showThinking: false
55 |   transform: |
56 |     output = output.replace(/<think>.*<\/think>/gis, '').trim();
57 |     output = output.replace(/<\s*\/?\s*Answer\s*>/gi, '').trim();
58 |     return output;
59 | - label: medgemma-4b-it
60 |   id: openai:chat:medgemma-4b-it
61 |   config:
62 |     max_output_tokens: 16184
63 |     apiBaseUrl: https://a6pf0b0uqcuajaua.us-east-1.aws.endpoints.huggingface.cloud/v1
64 |     # @note set API KEY
65 |     # apiKey: 
66 |     showThinking: false
67 |   transform: |
68 |     output = output.replace(/<\s*\/?\s*Answer\s*>/gi, '').trim();
69 |     return output;
70 | - label: medgemma-27b-text-it
71 |   id: openai:chat:medgemma-27b-text-it
72 |   config:
73 |     max_output_tokens: 16184
74 |     apiBaseUrl: https://i7n97jz1el3l39h5.us-east-1.aws.endpoints.huggingface.cloud/v1
75 |     # @note set API KEY
76 |     # apiKey: 
77 |     showThinking: false
78 |   transform: |
79 |     output = output.replace(/<\s*\/?\s*Answer\s*>/gi, '').trim();
80 |     return output;
81 | 


--------------------------------------------------------------------------------
/evals/extraction/tests/basic-demographics.yaml:
--------------------------------------------------------------------------------
  1 | - description: Full name
  2 |   vars:
  3 |     resource: &patient_resource_json |
  4 |       {
  5 |         "resourceType": "Patient",
  6 |         "id": "example",
  7 |         "name": [
  8 |           {
  9 |             "use": "official",
 10 |             "family": "Smith-Jones",
 11 |             "given": ["John", "Jacob", "Jingleheimer"],
 12 |             "prefix": ["Dr."],
 13 |             "suffix": ["Jr."]
 14 |           },
 15 |           {
 16 |             "use": "nickname",
 17 |             "given": ["Jack"]
 18 |           },
 19 |           {
 20 |             "use": "maiden",
 21 |             "family": "Johnson"
 22 |           }
 23 |         ],
 24 |         "gender": "other",
 25 |         "birthDate": "1990-01-15",
 26 |         "address": [
 27 |           {
 28 |             "use": "home",
 29 |             "line": ["123 Main St", "Apt 4B"],
 30 |             "city": "Anytown",
 31 |             "state": "CA",
 32 |             "postalCode": "90210"
 33 |           },
 34 |           {
 35 |             "use": "old",
 36 |             "line": ["456 Elm St"],
 37 |             "city": "Oldtown",
 38 |             "state": "NY",
 39 |             "postalCode": "54321"
 40 |           }
 41 |         ],
 42 |         "telecom": [
 43 |           {
 44 |             "system": "phone",
 45 |             "value": "555-123-4567",
 46 |             "use": "home"
 47 |           },
 48 |           {
 49 |             "system": "email",
 50 |             "value": "john.smith@example.com",
 51 |             "use": "work"
 52 |           }
 53 |         ],
 54 |         "maritalStatus": {
 55 |           "coding": [
 56 |             {
 57 |               "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus",
 58 |               "code": "M",
 59 |               "display": "Married"
 60 |             }
 61 |           ]
 62 |         },
 63 |         "multipleBirthBoolean": true
 64 |       }
 65 |     question: What is the patient's full name?
 66 |   assert:
 67 |     - type: equals
 68 |       value: Dr. John Jacob Jingleheimer Smith-Jones Jr.
 69 | 
 70 | - description: Date of birth
 71 |   vars:
 72 |     resource: *patient_resource_json
 73 |     question: What is the patient's date of birth?
 74 |   assert:
 75 |     - type: equals
 76 |       value: '1990-01-15'
 77 | 
 78 | - description: Nickname
 79 |   vars:
 80 |     resource: *patient_resource_json
 81 |     question: What is the patient's nickname?
 82 |   assert:
 83 |     - type: equals
 84 |       value: Jack
 85 | 
 86 | - description: Complete home address
 87 |   vars:
 88 |     resource: *patient_resource_json
 89 |     question: What is the patient's complete home address in standard format?
 90 |   assert:
 91 |     - type: contains-any
 92 |       value:
 93 |         - 123 Main St, Apt 4B, Anytown, CA 90210
 94 |         - 123 Main St Apt 4B, Anytown, CA 90210
 95 |         - 123 Main St, Apt 4B Anytown CA 90210
 96 |         - 123 Main St Apt 4B Anytown, CA 90210
 97 | 
 98 | - description: State of residence
 99 |   vars:
100 |     resource: *patient_resource_json
101 |     question: What state does the patient live in?
102 |   assert:
103 |     - type: equals
104 |       value: CA
105 | 
106 | - description: All given names in official name
107 |   vars:
108 |     resource: *patient_resource_json
109 |     question: List all given names for the patient's official name.
110 |   assert:
111 |     - type: contains-any
112 |       value:
113 |         - John, Jacob, Jingleheimer
114 |         - John Jacob Jingleheimer
115 |         - "John\nJacob\nJingleheimer"
116 | 
117 | - description: Maiden name
118 |   vars:
119 |     resource: *patient_resource_json
120 |     question: What is the patient's maiden name?
121 |   assert:
122 |     - type: equals
123 |       value: Johnson
124 | 
125 | - description: Previous address
126 |   vars:
127 |     resource: *patient_resource_json
128 |     question: What was the patient's previous address?
129 |   assert:
130 |     - type: contains-any
131 |       value:
132 |         - 456 Elm St, Oldtown, NY 54321
133 |         - 456 Elm St, Oldtown, NY, 54321
134 | 
135 | - description: Home address, line 2
136 |   vars:
137 |     resource: *patient_resource_json
138 |     question: What is the second line of the patient's home address?
139 |   assert:
140 |     - type: equals
141 |       value: Apt 4B
142 | 
143 | - description: All telecoms
144 |   vars:
145 |     resource: *patient_resource_json
146 |     question: List all contact methods for the patient in the format "value (system), value (system), etc.".
147 |   assert:
148 |     - type: equals
149 |       value: 555-123-4567 (phone), john.smith@example.com (email)
150 | 
151 | - description: Marital status code
152 |   vars:
153 |     resource: *patient_resource_json
154 |     question: What is the patient's marital status code?
155 |   assert:
156 |     - type: equals
157 |       value: M
158 | 
159 | - description: Is the patient a multiple birth? True or false?
160 |   vars:
161 |     resource: *patient_resource_json
162 |     question: Was the patient part of a multiple birth?
163 |   assert:
164 |     - type: equals
165 |       value: true
166 | 
167 | - description: Gender value
168 |   vars:
169 |     resource: *patient_resource_json
170 |     question: What is the patient's gender?
171 |   assert:
172 |     - type: equals
173 |       value: other
174 | 


--------------------------------------------------------------------------------
/evals/extraction/tests/conditions.yaml:
--------------------------------------------------------------------------------
  1 | - vars:
  2 |     resource: &condition_resource_json |
  3 |       {
  4 |         "resourceType": "Condition",
  5 |         "id": "example",
  6 |         "clinicalStatus": {
  7 |           "coding": [
  8 |             {
  9 |               "system": "http://terminology.hl7.org/CodeSystem/condition-clinical",
 10 |               "code": "active",
 11 |               "display": "Active"
 12 |             }
 13 |           ]
 14 |         },
 15 |         "verificationStatus": {
 16 |           "coding": [
 17 |             {
 18 |               "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status",
 19 |               "code": "confirmed",
 20 |               "display": "Confirmed"
 21 |             }
 22 |           ]
 23 |         },
 24 |         "category": [
 25 |           {
 26 |             "coding": [
 27 |               {
 28 |                 "system": "http://terminology.hl7.org/CodeSystem/condition-category",
 29 |                 "code": "problem-list-item",
 30 |                 "display": "Problem List Item"
 31 |               }
 32 |             ]
 33 |           }
 34 |         ],
 35 |         "severity": {
 36 |           "coding": [
 37 |             {
 38 |               "system": "http://snomed.info/sct",
 39 |               "code": "24484000",
 40 |               "display": "Severe"
 41 |             }
 42 |           ]
 43 |         },
 44 |         "code": {
 45 |           "coding": [
 46 |             {
 47 |               "system": "http://snomed.info/sct",
 48 |               "code": "195967001",
 49 |               "display": "Asthma"
 50 |             }
 51 |           ]
 52 |         },
 53 |         "onsetDateTime": "2020-03-15"
 54 |       }
 55 |     question: Is this an active severe condition, and if so, what is it? You must specify both the condition and the severity.
 56 |   assert:
 57 |     - type: factuality
 58 |       value: Yes, severe active asthma
 59 | - vars:
 60 |     resource: *condition_resource_json
 61 |     question: When did the patient develop the condition?
 62 |   assert:
 63 |     - type: equals
 64 |       value: '2020-03-15'
 65 | - vars:
 66 |     resource: *condition_resource_json
 67 |     question: What coding system is used to specify the condition's severity?
 68 |   assert:
 69 |     - type: equals
 70 |       value: http://snomed.info/sct
 71 | - description: Condition with multiple codings for code
 72 |   vars:
 73 |     resource: |
 74 |       {
 75 |         "resourceType": "Condition",
 76 |         "clinicalStatus": {
 77 |           "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ]
 78 |         },
 79 |         "code": {
 80 |           "coding": [
 81 |             { "system": "http://snomed.info/sct", "code": "44054006", "display": "Diabetes mellitus type 2" },
 82 |             { "system": "http://icd10who.org", "code": "E11", "display": "Type 2 diabetes mellitus" }
 83 |           ]
 84 |         },
 85 |         "onsetDateTime": "2018-06-01"
 86 |       }
 87 |     question: What is the SNOMED code for the condition?
 88 |   assert:
 89 |     - type: equals
 90 |       value: 44054006
 91 | - description: Condition with missing severity
 92 |   vars:
 93 |     resource: |
 94 |       {
 95 |         "resourceType": "Condition",
 96 |         "clinicalStatus": {
 97 |           "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ]
 98 |         },
 99 |         "code": {
100 |           "coding": [ { "system": "http://snomed.info/sct", "code": "38341003", "display": "Hypertension" } ]
101 |         },
102 |         "onsetDateTime": "2019-01-01"
103 |       }
104 |     question: What is the severity of the condition?
105 |   assert:
106 |     - type: equals
107 |       value: N/A
108 | - description: Condition with ambiguous severity (multiple codings)
109 |   vars:
110 |     resource: |
111 |       {
112 |         "resourceType": "Condition",
113 |         "clinicalStatus": {
114 |           "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ]
115 |         },
116 |         "severity": {
117 |           "coding": [
118 |             { "system": "http://snomed.info/sct", "code": "255604002", "display": "Mild" },
119 |             { "system": "http://snomed.info/sct", "code": "24484000", "display": "Severe" }
120 |           ]
121 |         },
122 |         "code": {
123 |           "coding": [ { "system": "http://snomed.info/sct", "code": "195967001", "display": "Asthma" } ]
124 |         },
125 |         "onsetDateTime": "2021-05-10"
126 |       }
127 |     question: List all severities coded for this condition.
128 |   assert:
129 |     - type: contains-any
130 |       value:
131 |         - Mild, Severe
132 |         - Mild,Severe
133 |         - "Mild\nSevere"
134 | - description: Condition with evidence reference
135 |   vars:
136 |     resource: |
137 |       {
138 |         "resourceType": "Condition",
139 |         "clinicalStatus": {
140 |           "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ]
141 |         },
142 |         "code": {
143 |           "coding": [ { "system": "http://snomed.info/sct", "code": "195967001", "display": "Asthma" } ]
144 |         },
145 |         "onsetDateTime": "2022-02-02",
146 |         "evidence": [
147 |           { "detail": [ { "reference": "Observation/obs123" } ] }
148 |         ]
149 |       }
150 |     question: What is the referenced evidence resource type?
151 |   assert:
152 |     - type: equals
153 |       value: Observation
154 | - description: Ultimate Condition extraction challenge - Staging and Progression
155 |   vars:
156 |     resource: |
157 |       {
158 |         "resourceType": "Bundle",
159 |         "type": "collection",
160 |         "entry": [
161 |           {
162 |             "fullUrl": "urn:uuid:cond-diabetes",
163 |             "resource": {
164 |               "resourceType": "Condition",
165 |               "id": "cond-diabetes",
166 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ] },
167 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed" } ] },
168 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "44054006", "display": "Diabetes mellitus type 2" } ], "text": "Diabetes mellitus type 2" },
169 |               "subject": { "reference": "Patient/example" },
170 |               "onsetDateTime": "2015-01-01"
171 |             }
172 |           },
173 |           {
174 |             "fullUrl": "urn:uuid:cond-hypertension",
175 |             "resource": {
176 |               "resourceType": "Condition",
177 |               "id": "cond-hypertension",
178 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ] },
179 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed" } ] },
180 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "38341003", "display": "Hypertensive disorder, systemic arterial" } ], "text": "Hypertension" },
181 |               "subject": { "reference": "Patient/example" },
182 |               "onsetDateTime": "2016-01-01"
183 |             }
184 |           },
185 |           {
186 |             "fullUrl": "urn:uuid:cond-ckd-stage2-resolved",
187 |             "resource": {
188 |               "resourceType": "Condition",
189 |               "id": "cond-ckd-stage2-resolved",
190 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "resolved" } ] },
191 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed" } ] },
192 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "431856001", "display": "Chronic kidney disease stage 2" } ], "text": "CKD Stage 2" },
193 |               "subject": { "reference": "Patient/example" },
194 |               "onsetDateTime": "2020-03-15",
195 |               "abatementDateTime": "2021-01-10"
196 |             }
197 |           },
198 |           {
199 |             "fullUrl": "urn:uuid:cond-ckd-stage3a-active",
200 |             "resource": {
201 |               "resourceType": "Condition",
202 |               "id": "cond-ckd-stage3a-active",
203 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ] },
204 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed" } ] },
205 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "441208003", "display": "Chronic kidney disease stage 3A" } ], "text": "CKD Stage 3a" },
206 |               "subject": { "reference": "Patient/example" },
207 |               "onsetDateTime": "2021-02-20"
208 |             }
209 |           },
210 |           {
211 |             "fullUrl": "urn:uuid:cond-ckd-general-error",
212 |             "resource": {
213 |               "resourceType": "Condition",
214 |               "id": "cond-ckd-general-error",
215 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ] },
216 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "entered-in-error" } ] },
217 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "709044004", "display": "Chronic kidney disease" } ], "text": "CKD general" },
218 |               "subject": { "reference": "Patient/example" },
219 |               "onsetDateTime": "2021-01-05"
220 |             }
221 |           },
222 |           {
223 |             "fullUrl": "urn:uuid:cond-ckd-stage1-refuted",
224 |             "resource": {
225 |               "resourceType": "Condition",
226 |               "id": "cond-ckd-stage1-refuted",
227 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ] },
228 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "refuted" } ] },
229 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "431855002", "display": "Chronic kidney disease stage 1" } ], "text": "CKD Stage 1" },
230 |               "subject": { "reference": "Patient/example" },
231 |               "onsetDateTime": "2019-11-01"
232 |             }
233 |           },
234 |           {
235 |             "fullUrl": "urn:uuid:cond-ckd-stage2-differential",
236 |             "resource": {
237 |               "resourceType": "Condition",
238 |               "id": "cond-ckd-stage2-differential",
239 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active" } ] },
240 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "differential" } ] },
241 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "431856001", "display": "Chronic kidney disease stage 2" } ], "text": "CKD Stage 2 (Differential)" },
242 |               "subject": { "reference": "Patient/example" },
243 |               "onsetDateTime": "2020-12-01"
244 |             }
245 |           },
246 |           {
247 |             "fullUrl": "urn:uuid:cond-aki-resolved",
248 |             "resource": {
249 |               "resourceType": "Condition",
250 |               "id": "cond-aki-resolved",
251 |               "clinicalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "resolved" } ] },
252 |               "verificationStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed" } ] },
253 |               "code": { "coding": [ { "system": "http://snomed.info/sct", "code": "66547009", "display": "Acute kidney injury" } ], "text": "Acute Kidney Injury" },
254 |               "subject": { "reference": "Patient/example" },
255 |               "onsetDateTime": "2022-06-01",
256 |               "abatementDateTime": "2022-07-15"
257 |             }
258 |           }
259 |         ]
260 |       }
261 |     question: What is the SNOMED CT code for the highest currently active and confirmed stage of Chronic Kidney Disease (CKD) based on the patient's record?
262 |   assert:
263 |     - type: equals
264 |       value: 441208003 # CKD Stage 3a
265 | 


--------------------------------------------------------------------------------
/evals/extraction/tests/medication-requests.yaml:
--------------------------------------------------------------------------------
  1 | - vars:
  2 |     resource: &medication_request_resource_json |
  3 |       {
  4 |         "resourceType": "MedicationRequest",
  5 |         "id": "example",
  6 |         "status": "active",
  7 |         "intent": "order",
  8 |         "medicationCodeableConcept": {
  9 |           "coding": [
 10 |             {
 11 |               "system": "http://www.nlm.nih.gov/research/umls/rxnorm",
 12 |               "code": "1049502",
 13 |               "display": "Acetaminophen 325 MG"
 14 |             }
 15 |           ]
 16 |         },
 17 |         "dosageInstruction": [
 18 |           {
 19 |             "sequence": 1,
 20 |             "timing": {
 21 |               "repeat": {
 22 |                 "frequency": 4,
 23 |                 "period": 1,
 24 |                 "periodUnit": "d"
 25 |               }
 26 |             },
 27 |             "doseAndRate": [
 28 |               {
 29 |                 "type": {
 30 |                   "coding": [
 31 |                     {
 32 |                       "system": "http://terminology.hl7.org/CodeSystem/dose-rate-type",
 33 |                       "code": "ordered",
 34 |                       "display": "Ordered"
 35 |                     }
 36 |                   ]
 37 |                 },
 38 |                 "doseQuantity": {
 39 |                   "value": 1,
 40 |                   "unit": "tablet"
 41 |                 }
 42 |               }
 43 |             ]
 44 |           }
 45 |         ]
 46 |       }
 47 |     question: What is the daily frequency?
 48 |   assert:
 49 |     - type: equals
 50 |       value: 4
 51 | - vars:
 52 |     resource: *medication_request_resource_json
 53 |     question: What is the medication name and strength?
 54 |   assert:
 55 |     - type: equals
 56 |       value: Acetaminophen 325 MG
 57 | - vars:
 58 |     resource: *medication_request_resource_json
 59 |     question: What is the dose quantity?
 60 |   assert:
 61 |     - type: equals
 62 |       value: 1 tablet
 63 | - vars:
 64 |     resource: *medication_request_resource_json
 65 |     question: What is the status of the medication request?
 66 |   assert:
 67 |     - type: equals
 68 |       value: active
 69 | - vars:
 70 |     resource: *medication_request_resource_json
 71 |     question: Does the medication have an RxNorm code? If yes, what is it?
 72 |   assert:
 73 |     - type: contains
 74 |       value: '1049502'
 75 | - description: MedicationRequest with multiple codings in medicationCodeableConcept
 76 |   vars:
 77 |     resource: |
 78 |       {
 79 |         "resourceType": "MedicationRequest",
 80 |         "status": "active",
 81 |         "intent": "order",
 82 |         "medicationCodeableConcept": {
 83 |           "coding": [
 84 |             { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "316074", "display": "Ibuprofen 200 MG" },
 85 |             { "system": "http://snomed.info/sct", "code": "387207008", "display": "Ibuprofen" }
 86 |           ]
 87 |         },
 88 |         "dosageInstruction": [
 89 |           {
 90 |             "doseAndRate": [
 91 |               { "doseQuantity": { "value": 2, "unit": "tablet" } }
 92 |             ]
 93 |           }
 94 |         ]
 95 |       }
 96 |     question: What is the RxNorm code for the medication?
 97 |   assert:
 98 |     - type: equals
 99 |       value: 316074
100 | - description: MedicationRequest with missing dose unit
101 |   vars:
102 |     resource: |
103 |       {
104 |         "resourceType": "MedicationRequest",
105 |         "status": "active",
106 |         "intent": "order",
107 |         "medicationCodeableConcept": {
108 |           "coding": [
109 |             { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "197361", "display": "Lisinopril 10 MG" }
110 |           ]
111 |         },
112 |         "dosageInstruction": [
113 |           {
114 |             "doseAndRate": [
115 |               { "doseQuantity": { "value": 1 } }
116 |             ]
117 |           }
118 |         ]
119 |       }
120 |     question: What is the dose quantity (including unit, if available)?
121 |   assert:
122 |     - type: equals
123 |       value: 1
124 | - description: MedicationRequest with dose as a range
125 |   vars:
126 |     resource: |
127 |       {
128 |         "resourceType": "MedicationRequest",
129 |         "status": "active",
130 |         "intent": "order",
131 |         "medicationCodeableConcept": {
132 |           "coding": [
133 |             { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "617314", "display": "Metformin 500 MG" }
134 |           ]
135 |         },
136 |         "dosageInstruction": [
137 |           {
138 |             "doseAndRate": [
139 |               { "doseRange": { "low": { "value": 1, "unit": "tablet" }, "high": { "value": 2, "unit": "tablet" } } }
140 |             ]
141 |           }
142 |         ]
143 |       }
144 |     question: What is the dose range (including units)?
145 |   assert:
146 |     - type: contains-any
147 |       value:
148 |         - '1-2 tablet'
149 |         - '1 tablet - 2 tablet'
150 |         - '1-2 tablets'
151 |         - '1 to 2 tablets'
152 |         - '1 to 2 tablet'
153 |         - 1 tablet-2 tablet
154 | - description: MedicationRequest with coded reason for prescription
155 |   vars:
156 |     resource: |
157 |       {
158 |         "resourceType": "MedicationRequest",
159 |         "status": "active",
160 |         "intent": "order",
161 |         "medicationCodeableConcept": {
162 |           "coding": [
163 |             { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "1049630", "display": "Atorvastatin 20 MG" }
164 |           ]
165 |         },
166 |         "reasonCode": [
167 |           { "coding": [ { "system": "http://snomed.info/sct", "code": "13644009", "display": "Hypercholesterolemia" } ] }
168 |         ],
169 |         "dosageInstruction": [
170 |           {
171 |             "doseAndRate": [
172 |               { "doseQuantity": { "value": 1, "unit": "tablet" } }
173 |             ]
174 |           }
175 |         ]
176 |       }
177 |     question: What is the coded reason for this prescription? Please return the code only.
178 |   assert:
179 |     - type: equals
180 |       value: 13644009
181 | - description: MedicationRequest with on-hold status
182 |   vars:
183 |     resource: |
184 |       {
185 |         "resourceType": "MedicationRequest",
186 |         "status": "on-hold",
187 |         "intent": "order",
188 |         "medicationCodeableConcept": {
189 |           "coding": [
190 |             { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "617314", "display": "Metformin 500 MG" }
191 |           ]
192 |         },
193 |         "dosageInstruction": [
194 |           {
195 |             "doseAndRate": [
196 |               { "doseQuantity": { "value": 1, "unit": "tablet" } }
197 |             ]
198 |           }
199 |         ]
200 |       }
201 |     question: What is the status of this medication request?
202 |   assert:
203 |     - type: equals
204 |       value: on-hold
205 | - description: Ultimate MedicationRequest extraction challenge
206 |   vars:
207 |     resource: |
208 |       {
209 |         "resourceType": "Bundle",
210 |         "type": "collection",
211 |         "entry": [
212 |           {
213 |             "resource": {
214 |               "resourceType": "MedicationRequest",
215 |               "status": "active",
216 |               "intent": "order",
217 |               "medicationCodeableConcept": {
218 |                 "coding": [
219 |                   { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "860975", "display": "Ibuprofen 200 MG" },
220 |                   { "system": "http://snomed.info/sct", "code": "387207008", "display": "Ibuprofen" }
221 |                 ]
222 |               },
223 |               "dosageInstruction": [
224 |                 {
225 |                   "doseAndRate": [
226 |                     { "doseQuantity": { "value": 2, "unit": "tablet" } }
227 |                   ]
228 |                 }
229 |               ],
230 |               "reasonCode": [
231 |                 { "coding": [ { "system": "http://snomed.info/sct", "code": "386661006", "display": "Fever" } ] }
232 |               ],
233 |               "extension": [
234 |                 {
235 |                   "url": "http://example.org/fhir/StructureDefinition/medicationrequest-priority",
236 |                   "valueCode": "routine"
237 |                 }
238 |               ]
239 |             }
240 |           },
241 |           {
242 |             "resource": {
243 |               "resourceType": "MedicationRequest",
244 |               "status": "on-hold",
245 |               "intent": "order",
246 |               "medicationCodeableConcept": {
247 |                 "coding": [
248 |                   { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "617314", "display": "Metformin 500 MG" },
249 |                   { "system": "http://snomed.info/sct", "code": "860975", "display": "Ibuprofen 200 MG" }
250 |                 ]
251 |               },
252 |               "dosageInstruction": [
253 |                 {
254 |                   "doseAndRate": [
255 |                     { "doseRange": { "low": { "value": 1, "unit": "tablet" }, "high": { "value": 2, "unit": "tablet" } } }
256 |                   ]
257 |                 }
258 |               ],
259 |               "reasonCode": [
260 |                 { "coding": [ { "system": "http://snomed.info/sct", "code": "44054006", "display": "Diabetes mellitus type 2" } ] }
261 |               ]
262 |             }
263 |           },
264 |           {
265 |             "resource": {
266 |               "resourceType": "MedicationRequest",
267 |               "status": "cancelled",
268 |               "intent": "order",
269 |               "medicationCodeableConcept": {
270 |                 "coding": [
271 |                   { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "1049630", "display": "Atorvastatin 20 MG" }
272 |                 ]
273 |               },
274 |               "dosageInstruction": [
275 |                 {
276 |                   "doseAndRate": [
277 |                     { "doseQuantity": { "value": 1 } }
278 |                   ]
279 |                 }
280 |               ],
281 |               "reasonCode": [
282 |                 { "coding": [ { "system": "http://snomed.info/sct", "code": "13644009", "display": "Hypercholesterolemia" } ] }
283 |               ]
284 |             }
285 |           },
286 |           {
287 |             "resource": {
288 |               "resourceType": "MedicationRequest",
289 |               "status": "active",
290 |               "intent": "order",
291 |               "medicationCodeableConcept": {
292 |                 "coding": [
293 |                   { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "197361", "display": "Lisinopril 10 MG" }
294 |                 ]
295 |               },
296 |               "dosageInstruction": [
297 |                 {
298 |                   "doseAndRate": [
299 |                     { "doseQuantity": { "value": 1, "unit": "tablet" } }
300 |                   ]
301 |                 }
302 |               ],
303 |               "reasonCode": [
304 |                 { "coding": [ { "system": "http://snomed.info/sct", "code": "38341003", "display": "Hypertension" } ] }
305 |               ]
306 |             }
307 |           },
308 |           {
309 |             "resource": {
310 |               "resourceType": "MedicationRequest",
311 |               "status": "active",
312 |               "intent": "order",
313 |               "medicationCodeableConcept": {
314 |                 "coding": [
315 |                   { "system": "http://www.nlm.nih.gov/research/umls/rxnorm", "code": "617314", "display": "Metformin 500 MG" }
316 |                 ]
317 |               },
318 |               "dosageInstruction": [
319 |                 {
320 |                   "doseAndRate": [
321 |                     { "doseQuantity": { "value": 1, "unit": "tablet" } }
322 |                   ]
323 |                 }
324 |               ],
325 |               "reasonCode": [
326 |                 { "coding": [ { "system": "http://snomed.info/sct", "code": "44054006", "display": "Diabetes mellitus type 2" } ] }
327 |               ]
328 |             }
329 |           }
330 |         ]
331 |       }
332 |     question: What is the active medication for diabetes mellitus type 2 (RxNorm code)?
333 |   assert:
334 |     - type: equals
335 |       value: 617314
336 | 


--------------------------------------------------------------------------------
/evals/extraction/tests/observations.yaml:
--------------------------------------------------------------------------------
  1 | - description: Systolic blood pressure extraction
  2 |   vars:
  3 |     resource: &observation_resource_json |
  4 |       {
  5 |         "resourceType": "Observation",
  6 |         "id": "blood-pressure",
  7 |         "status": "final",
  8 |         "category": [
  9 |           {
 10 |             "coding": [
 11 |               {
 12 |                 "system": "http://terminology.hl7.org/CodeSystem/observation-category",
 13 |                 "code": "vital-signs",
 14 |                 "display": "Vital Signs"
 15 |               }
 16 |             ]
 17 |           }
 18 |         ],
 19 |         "code": {
 20 |           "coding": [
 21 |             {
 22 |               "system": "http://loinc.org",
 23 |               "code": "85354-9",
 24 |               "display": "Blood pressure panel"
 25 |             }
 26 |           ]
 27 |         },
 28 |         "component": [
 29 |           {
 30 |             "code": {
 31 |               "coding": [
 32 |                 {
 33 |                   "system": "http://loinc.org",
 34 |                   "code": "8480-6",
 35 |                   "display": "Systolic blood pressure"
 36 |                 }
 37 |               ]
 38 |             },
 39 |             "valueQuantity": {
 40 |               "value": 120,
 41 |               "unit": "mmHg"
 42 |             }
 43 |           },
 44 |           {
 45 |             "code": {
 46 |               "coding": [
 47 |                 {
 48 |                   "system": "http://loinc.org",
 49 |                   "code": "8462-4",
 50 |                   "display": "Diastolic blood pressure"
 51 |                 }
 52 |               ]
 53 |             },
 54 |             "valueQuantity": {
 55 |               "value": 80,
 56 |               "unit": "mmHg"
 57 |             }
 58 |           }
 59 |         ]
 60 |       }
 61 |     question: What is the systolic blood pressure value (including unit)?
 62 |   assert:
 63 |     - type: equals
 64 |       value: 120 mmHg
 65 | - description: Diastolic blood pressure extraction
 66 |   vars:
 67 |     resource: *observation_resource_json
 68 |     question: What is the diastolic blood pressure value (including unit)?
 69 |   assert:
 70 |     - type: equals
 71 |       value: 80 mmHg
 72 | - description: Complete blood pressure reading in systolic/diastolic format
 73 |   vars:
 74 |     resource: *observation_resource_json
 75 |     question: What is the complete blood pressure reading in systolic/diastolic format?
 76 |   assert:
 77 |     - type: equals
 78 |       value: 120/80 mmHg
 79 | - description: CodeableConcept extraction without display
 80 |   vars:
 81 |     resource: |
 82 |       {
 83 |         "resourceType": "Observation",
 84 |         "id": "dce7c80f-36fa-4693-bce8-75ca9d90a53c",
 85 |         "status": "final",
 86 |         "category": [
 87 |           {
 88 |             "coding": [
 89 |               {
 90 |                 "system": "http://terminology.hl7.org/CodeSystem/observation-category",
 91 |                 "code": "vital-signs",
 92 |               }
 93 |             ]
 94 |           }
 95 |         ],
 96 |         "code": {
 97 |           "coding": [
 98 |             {
 99 |               "system": "http://loinc.org",
100 |               "code": "85354-9",
101 |             }
102 |           ]
103 |         },
104 |         "component": [
105 |           {
106 |             "code": {
107 |               "coding": [
108 |                 {
109 |                   "system": "http://loinc.org",
110 |                   "code": "8480-6",
111 |                 }
112 |               ]
113 |             },
114 |             "valueQuantity": {
115 |               "value": 120,
116 |               "unit": "mmHg"
117 |             }
118 |           },
119 |           {
120 |             "code": {
121 |               "coding": [
122 |                 {
123 |                   "system": "http://loinc.org",
124 |                   "code": "8462-4",
125 |                 }
126 |               ]
127 |             },
128 |             "valueQuantity": {
129 |               "value": 80,
130 |               "unit": "mmHg"
131 |             }
132 |           }
133 |         ]
134 |       }
135 |     question: What is the diastolic blood pressure value (including unit)?
136 |   assert:
137 |     - type: equals
138 |       value: 80 mmHg
139 | - description: Systolic blood pressure with missing unit
140 |   vars:
141 |     resource: |
142 |       {
143 |         "resourceType": "Observation",
144 |         "id": "a1b2c3d4",
145 |         "status": "final",
146 |         "code": {
147 |           "coding": [
148 |             { "system": "http://loinc.org", "code": "85354-9" }
149 |           ]
150 |         },
151 |         "component": [
152 |           {
153 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "8480-6" } ] },
154 |             "valueQuantity": { "value": 130 }
155 |           }
156 |         ]
157 |       }
158 |     question: What is the systolic blood pressure value?
159 |   assert:
160 |     - type: equals
161 |       value: 130
162 | - description: Diastolic blood pressure as string value
163 |   vars:
164 |     resource: |
165 |       {
166 |         "resourceType": "Observation",
167 |         "id": "e5f6g7h8",
168 |         "status": "final",
169 |         "code": {
170 |           "coding": [
171 |             { "system": "http://loinc.org", "code": "85354-9" }
172 |           ]
173 |         },
174 |         "component": [
175 |           {
176 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "8462-4" } ] },
177 |             "valueString": "eighty-two mmHg"
178 |           }
179 |         ]
180 |       }
181 |     question: What is the diastolic blood pressure value?
182 |   assert:
183 |     - type: contains-any
184 |       value:
185 |         - eighty-two mmHg
186 |         - 82 mmHg
187 | - description: Blood pressure with extra irrelevant component
188 |   vars:
189 |     resource: |
190 |       {
191 |         "resourceType": "Observation",
192 |         "id": "i9j0k1l2",
193 |         "status": "final",
194 |         "code": {
195 |           "coding": [
196 |             { "system": "http://loinc.org", "code": "85354-9" }
197 |           ]
198 |         },
199 |         "component": [
200 |           {
201 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "8480-6" } ] },
202 |             "valueQuantity": { "value": 110, "unit": "mmHg" }
203 |           },
204 |           {
205 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "8462-4" } ] },
206 |             "valueQuantity": { "value": 70, "unit": "mmHg" }
207 |           },
208 |           {
209 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "9999-9" } ] },
210 |             "valueQuantity": { "value": 999, "unit": "foo" }
211 |           }
212 |         ]
213 |       }
214 |     question: What is the complete blood pressure reading in systolic/diastolic format?
215 |   assert:
216 |     - type: contains-any
217 |       value:
218 |         - 110/70 mmHg
219 |         - 110 mmHg/70 mmHg
220 | - description: Blood pressure with non-standard code
221 |   vars:
222 |     resource: |
223 |       {
224 |         "resourceType": "Observation",
225 |         "id": "m3n4o5p6",
226 |         "status": "final",
227 |         "code": {
228 |           "coding": [
229 |             { "system": "http://loinc.org", "code": "99999-9" }
230 |           ]
231 |         },
232 |         "component": [
233 |           {
234 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "8480-6" } ] },
235 |             "valueQuantity": { "value": 115, "unit": "mmHg" }
236 |           },
237 |           {
238 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "8462-4" } ] },
239 |             "valueQuantity": { "value": 75, "unit": "mmHg" }
240 |           }
241 |         ]
242 |       }
243 |     question: What is the complete blood pressure reading in systolic/diastolic format?
244 |   assert:
245 |     - type: contains-any
246 |       value:
247 |         - 115/75 mmHg
248 |         - 115 mmHg/75 mmHg
249 | - description: Potassium value only in extension
250 |   vars:
251 |     resource: |
252 |       {
253 |         "resourceType": "Observation",
254 |         "id": "z1x2c3v4",
255 |         "status": "final",
256 |         "code": {
257 |           "coding": [
258 |             { "system": "http://loinc.org", "code": "2823-3", "display": "Potassium [Moles/volume] in Serum or Plasma" }
259 |           ]
260 |         },
261 |         "extension": [
262 |           {
263 |             "url": "http://example.org/fhir/StructureDefinition/observation-value",
264 |             "valueQuantity": { "value": 4.2, "unit": "mmol/L" }
265 |           }
266 |         ]
267 |       }
268 |     question: What is the potassium value (including unit)?
269 |   assert:
270 |     - type: equals
271 |       value: 4.2 mmol/L
272 | - description: Glucose with multiple value types
273 |   vars:
274 |     resource: |
275 |       {
276 |         "resourceType": "Observation",
277 |         "id": "g5h6j7k8",
278 |         "status": "final",
279 |         "code": {
280 |           "coding": [
281 |             { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" }
282 |           ]
283 |         },
284 |         "valueQuantity": { "value": 95, "unit": "mg/dL" },
285 |         "interpretation": [
286 |           { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v3-ObservationInterpretation", "code": "N", "display": "Normal" } ] }
287 |         ]
288 |       }
289 |     question: What is the glucose value (including unit)?
290 |   assert:
291 |     - type: equals
292 |       value: 95 mg/dL
293 | - description: Sodium with coded interpretation only
294 |   vars:
295 |     resource: |
296 |       {
297 |         "resourceType": "Observation",
298 |         "id": "s1o2d3i4",
299 |         "status": "final",
300 |         "code": {
301 |           "coding": [
302 |             { "system": "http://loinc.org", "code": "2951-2", "display": "Sodium [Moles/volume] in Serum or Plasma" }
303 |           ]
304 |         },
305 |         "valueQuantity": { "value": 150, "unit": "mmol/L" },
306 |         "interpretation": [
307 |           { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v3-ObservationInterpretation", "code": "H", "display": "High" } ] }
308 |         ]
309 |       }
310 |     question: What is the sodium interpretation code?
311 |   assert:
312 |     - type: equals
313 |       value: H
314 | - description: Cholesterol as a range
315 |   vars:
316 |     resource: |
317 |       {
318 |         "resourceType": "Observation",
319 |         "id": "c1h2o3l4",
320 |         "status": "final",
321 |         "code": {
322 |           "coding": [
323 |             { "system": "http://loinc.org", "code": "2093-3", "display": "Cholesterol [Mass/volume] in Serum or Plasma" }
324 |           ]
325 |         },
326 |         "valueRange": {
327 |           "low": { "value": 180, "unit": "mg/dL" },
328 |           "high": { "value": 200, "unit": "mg/dL" }
329 |         }
330 |       }
331 |     question: What is the cholesterol value range (including units)?
332 |   assert:
333 |     - type: contains-any
334 |       value:
335 |         - 180-200 mg/dL
336 |         - 180 to 200 mg/dL
337 |         - 180 mg/dL - 200 mg/dL
338 |         - 180 mg/dL to 200 mg/dL
339 |         - 180 - 200 mg/dL
340 | - description: Hemoglobin A1c with multiple codings
341 |   vars:
342 |     resource: |
343 |       {
344 |         "resourceType": "Observation",
345 |         "id": "h1a2b3c4",
346 |         "status": "final",
347 |         "code": {
348 |           "coding": [
349 |             { "system": "http://loinc.org", "code": "4548-4", "display": "Hemoglobin A1c/Hemoglobin.total in Blood" },
350 |             { "system": "http://snomed.info/sct", "code": "43396009", "display": "Hemoglobin A1c measurement" }
351 |           ]
352 |         },
353 |         "valueQuantity": { "value": 6.1, "unit": "%" }
354 |       }
355 |     question: What is the hemoglobin A1c value (including unit)?
356 |   assert:
357 |     - type: contains-any
358 |       value:
359 |         - 6.1 %
360 |         - 6.1%
361 | - description: TSH value only in narrative
362 |   vars:
363 |     resource: |
364 |       {
365 |         "resourceType": "Observation",
366 |         "id": "t1s2h3n4",
367 |         "status": "final",
368 |         "code": {
369 |           "coding": [
370 |             { "system": "http://loinc.org", "code": "3016-3", "display": "Thyrotropin [Units/volume] in Serum or Plasma" }
371 |           ]
372 |         },
373 |         "text": {
374 |           "status": "generated",
375 |           "div": "<div>TSH: 2.5 mIU/L</div>"
376 |         }
377 |       }
378 |     question: What is the TSH value (including unit)?
379 |   assert:
380 |     - type: equals
381 |       value: 2.5 mIU/L
382 | - description: Creatinine value in referenced DiagnosticReport
383 |   vars:
384 |     resource: |
385 |       {
386 |         "resourceType": "Bundle",
387 |         "type": "collection",
388 |         "entry": [
389 |           {
390 |             "resource": {
391 |               "resourceType": "Observation",
392 |               "id": "cr1e2a3t4",
393 |               "status": "final",
394 |               "code": {
395 |                 "coding": [
396 |                   { "system": "http://loinc.org", "code": "2160-0", "display": "Creatinine [Mass/volume] in Serum or Plasma" }
397 |                 ]
398 |               },
399 |               "derivedFrom": [
400 |                 { "reference": "DiagnosticReport/dr1234" }
401 |               ]
402 |             }
403 |           },
404 |           {
405 |             "resource": {
406 |               "resourceType": "DiagnosticReport",
407 |               "id": "dr1234",
408 |               "result": [
409 |                 { "reference": "Observation/cr1e2a3t4-value" }
410 |               ]
411 |             }
412 |           },
413 |           {
414 |             "resource": {
415 |               "resourceType": "Observation",
416 |               "id": "cr1e2a3t4-value",
417 |               "valueQuantity": { "value": 1.1, "unit": "mg/dL" }
418 |             }
419 |           }
420 |         ]
421 |       }
422 |     question: What is the creatinine value (including unit)?
423 |   assert:
424 |     - type: equals
425 |       value: 1.1 mg/dL
426 | - description: Vitamin D with localized display
427 |   vars:
428 |     resource: |
429 |       {
430 |         "resourceType": "Observation",
431 |         "id": "v1d2e3f4",
432 |         "status": "final",
433 |         "code": {
434 |           "coding": [
435 |             { "system": "http://loinc.org", "code": "1989-3", "display": "Vitamina D" }
436 |           ]
437 |         },
438 |         "valueQuantity": { "value": 30, "unit": "ng/mL" }
439 |       }
440 |     question: What is the vitamin D value (including unit)?
441 |   assert:
442 |     - type: equals
443 |       value: 30 ng/mL
444 | - description: White blood cell count with conflicting values
445 |   vars:
446 |     resource: |
447 |       {
448 |         "resourceType": "Observation",
449 |         "id": "w1b2c3c4",
450 |         "status": "final",
451 |         "code": {
452 |           "coding": [
453 |             { "system": "http://loinc.org", "code": "6690-2", "display": "Leukocytes [#/volume] in Blood by Automated count" }
454 |           ]
455 |         },
456 |         "valueQuantity": { "value": 7.0, "unit": "10^3/uL" },
457 |         "component": [
458 |           {
459 |             "code": { "coding": [ { "system": "http://loinc.org", "code": "6690-2" } ] },
460 |             "valueQuantity": { "value": 6.8, "unit": "10^3/uL" }
461 |           }
462 |         ]
463 |       }
464 |     question: What is the main white blood cell count value (including unit)?
465 |   assert:
466 |     - type: equals
467 |       value: 7.0 10^3/uL
468 | - description: Calcium with unusual units
469 |   vars:
470 |     resource: |
471 |       {
472 |         "resourceType": "Observation",
473 |         "id": "ca1l2c3i4",
474 |         "status": "final",
475 |         "code": {
476 |           "coding": [
477 |             { "system": "http://loinc.org", "code": "17861-6", "display": "Calcium [Moles/volume] in Serum or Plasma" }
478 |           ]
479 |         },
480 |         "valueQuantity": { "value": 2.2, "unit": "mmol/L", "code": "mg/dL" }
481 |       }
482 |     question: What is the calcium value (including unit)?
483 |   assert:
484 |     - type: equals
485 |       value: 2.2 mmol/L
486 | - description: Glucose - find the most recent value from a bundle of many observations (with out-of-order dates)
487 |   vars:
488 |     resource: |
489 |       {
490 |         "resourceType": "Bundle",
491 |         "type": "collection",
492 |         "entry": [
493 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 90, "unit": "mg/dL" }, "effectiveDateTime": "2022-01-01T08:00:00Z" } },
494 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 110, "unit": "mg/dL" }, "effectiveDateTime": "2022-02-01T08:00:00Z" } },
495 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 105, "unit": "mg/dL" }, "effectiveDateTime": "2022-01-15T08:00:00Z" } },
496 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 99, "unit": "mg/dL" }, "effectiveDateTime": "2021-12-31T08:00:00Z" } },
497 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 115, "unit": "mg/dL" }, "effectiveDateTime": "2022-03-01T08:00:00Z" } },
498 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 101, "unit": "mg/dL" }, "effectiveDateTime": "2022-02-28T23:59:59Z" } },
499 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 112, "unit": "mg/dL" }, "effectiveDateTime": "2022-02-01T07:59:59Z" } },
500 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 108, "unit": "mg/dL" }, "effectiveDateTime": "2022-03-01T07:59:59Z" } },
501 |           { "resource": { "resourceType": "Observation", "status": "final", "code": { "coding": [ { "system": "http://loinc.org", "code": "2345-7", "display": "Glucose [Mass/volume] in Blood" } ] }, "valueQuantity": { "value": 113, "unit": "mg/dL" }, "effectiveDateTime": "2022-02-01T08:00:01Z" } }
502 |         ]
503 |       }
504 |     question: What is the most recent glucose value (including unit)?
505 |   assert:
506 |     - type: equals
507 |       value: 115 mg/dL
508 | - description: Ultimate lab value extraction challenge
509 |   vars:
510 |     resource: |
511 |       {
512 |         "resourceType": "Bundle",
513 |         "type": "collection",
514 |         "entry": [
515 |           {
516 |             "resource": {
517 |               "resourceType": "Observation",
518 |               "status": "final",
519 |               "subject": { "reference": "Patient/123" },
520 |               "specimen": { "display": "Serum" },
521 |               "code": {
522 |                 "coding": [
523 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" },
524 |                   { "system": "http://snomed.info/sct", "code": "43396009", "display": "Fasting glucose" }
525 |                 ]
526 |               },
527 |               "valueQuantity": { "value": 98, "unit": "mg/dL" },
528 |               "effectiveDateTime": "2022-04-01T09:00:00Z"
529 |             }
530 |           },
531 |           {
532 |             "resource": {
533 |               "resourceType": "Observation",
534 |               "status": "final",
535 |               "subject": { "reference": "Patient/123" },
536 |               "specimen": { "display": "Serum" },
537 |               "code": {
538 |                 "coding": [
539 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
540 |                 ]
541 |               },
542 |               "extension": [
543 |                 {
544 |                   "url": "http://example.org/fhir/StructureDefinition/observation-value",
545 |                   "valueQuantity": { "value": 102, "unit": "mg/dL" }
546 |                 }
547 |               ],
548 |               "effectiveDateTime": "2022-05-01T09:00:00Z"
549 |             }
550 |           },
551 |           {
552 |             "resource": {
553 |               "resourceType": "Observation",
554 |               "status": "entered-in-error",
555 |               "subject": { "reference": "Patient/123" },
556 |               "specimen": { "display": "Serum" },
557 |               "code": {
558 |                 "coding": [
559 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
560 |                 ]
561 |               },
562 |               "valueQuantity": { "value": 200, "unit": "mg/dL" },
563 |               "effectiveDateTime": "2022-06-01T09:00:00Z"
564 |             }
565 |           },
566 |           {
567 |             "resource": {
568 |               "resourceType": "Observation",
569 |               "status": "final",
570 |               "subject": { "reference": "Patient/456" },
571 |               "specimen": { "display": "Serum" },
572 |               "code": {
573 |                 "coding": [
574 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
575 |                 ]
576 |               },
577 |               "valueQuantity": { "value": 99, "unit": "mg/dL" },
578 |               "effectiveDateTime": "2022-07-01T09:00:00Z"
579 |             }
580 |           },
581 |           {
582 |             "resource": {
583 |               "resourceType": "Observation",
584 |               "status": "final",
585 |               "subject": { "reference": "Patient/123" },
586 |               "specimen": { "display": "Plasma" },
587 |               "code": {
588 |                 "coding": [
589 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
590 |                 ]
591 |               },
592 |               "valueQuantity": { "value": 100, "unit": "mg/dL" },
593 |               "effectiveDateTime": "2022-05-02T09:00:00Z"
594 |             }
595 |           },
596 |           {
597 |             "resource": {
598 |               "resourceType": "Observation",
599 |               "status": "final",
600 |               "subject": { "reference": "Patient/123" },
601 |               "specimen": { "display": "Serum" },
602 |               "code": {
603 |                 "coding": [
604 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
605 |                 ]
606 |               },
607 |               "valueString": "one hundred and five mg/dL",
608 |               "effectiveDateTime": "2022-05-03T09:00:00Z"
609 |             }
610 |           },
611 |           {
612 |             "resource": {
613 |               "resourceType": "Observation",
614 |               "status": "final",
615 |               "subject": { "reference": "Patient/123" },
616 |               "specimen": { "display": "Serum" },
617 |               "code": {
618 |                 "coding": [
619 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
620 |                 ]
621 |               },
622 |               "valueQuantity": { "value": 5.8, "unit": "mmol/L" },
623 |               "effectiveDateTime": "2022-05-04T09:00:00Z"
624 |             }
625 |           },
626 |           {
627 |             "resource": {
628 |               "resourceType": "Observation",
629 |               "status": "final",
630 |               "subject": { "reference": "Patient/123" },
631 |               "specimen": { "display": "Serum" },
632 |               "code": {
633 |                 "coding": [
634 |                   { "system": "http://loinc.org", "code": "1558-6", "display": "Glucose [Mass/volume] in Serum or Plasma --Fasting" }
635 |                 ]
636 |               },
637 |               "text": {
638 |                 "status": "generated",
639 |                 "div": "<div>Glucose: 106 mg/dL</div>"
640 |               },
641 |               "effectiveDateTime": "2022-05-05T09:00:00Z"
642 |             }
643 |           }
644 |         ]
645 |       }
646 |     question: What is the most recent, valid, fasting serum glucose value (in mg/dL) for patient "Patient/123", considering all available information, and normalizing units if necessary?
647 |   assert:
648 |     - type: equals
649 |       value: 106 mg/dL
650 | 


--------------------------------------------------------------------------------
/evals/extraction/tests/patient-history.yaml:
--------------------------------------------------------------------------------
 1 | - description: Marital status
 2 |   vars:
 3 |     resource:
 4 |       - file://tests/patient-history.json
 5 |     question: What is the patient's recorded marital status (just the code)?
 6 |   assert:
 7 |     - type: equals
 8 |       value: D
 9 | - description: Clinical event
10 |   vars:
11 |     resource:
12 |       - file://tests/patient-history.json
13 |     question: Which SNOMED CT code identifies the procedure documented in the April 18 1979 encounter?
14 |   assert:
15 |     - type: equals
16 |       value: 162673000
17 | - description: BMI
18 |   vars:
19 |     resource:
20 |       - file://tests/patient-history.json
21 |     question: What was the patient's Body-mass index (BMI) value (with units) measured in May 2024?
22 |   assert:
23 |     - type: equals
24 |       value: 28.14 kg/m2
25 | - description: Hard-mode
26 |   vars:
27 |     resource:
28 |       - file://tests/patient-history.json
29 |     question: On which exact date did the patient transition from full-time to part-time employment, and which insurer was billed for the encounter where that change was first documented?
30 |   assert:
31 |     - type: factuality
32 |       value: May 8 2024 and Anthem
33 | - description: Age calculation and temporal reasoning
34 |   vars:
35 |     resource:
36 |       - file://tests/patient-history.json
37 |     question: How old was the patient when they received their higher education finding, and how many years passed before they had their first lipid panel performed?
38 |   assert:
39 |     - type: factuality
40 |       value: 18 years old when received higher education finding, and 46 years passed before first lipid panel in May 2025
41 | - description: Insurance transitions and coverage gaps
42 |   vars:
43 |     resource:
44 |       - file://tests/patient-history.json
45 |     question: List all the insurance providers the patient has had in chronological order, including when they had no insurance, and identify the longest continuous period with the same coverage.
46 |   assert:
47 |     - type: factuality
48 |       value: NO_INSURANCE (1979), Aetna (2001), Cigna Health (2017), Anthem (2023-2025). Longest continuous coverage was with Anthem from 2023-2025.
49 | - description: Substance use screening interpretation
50 |   vars:
51 |     resource:
52 |       - file://tests/patient-history.json
53 |     question: What substance use screening tools were administered in May 2024, and what were the scores?
54 |   assert:
55 |     - type: factuality
56 |       value: PHQ-2 (score 0) and DAST-10 (score 1) were administered.
57 | - description: Social determinants and clinical correlations
58 |   vars:
59 |     resource:
60 |       - file://tests/patient-history.json
61 |     question: According to the PRAPARE assessment, what was the patient's employment status, education level, and stress level?
62 |   assert:
63 |     - type: factuality
64 |       value: Part-time employment, completed more than high school education, and reported no stress.
65 | - description: Vital signs trend analysis
66 |   vars:
67 |     resource:
68 |       - file://tests/patient-history.json
69 |     question: Compare the patient's blood pressure readings between May 2024 and May 2025, and determine if there was improvement or deterioration. What was the pain score trend during the same period?
70 |   assert:
71 |     - type: factuality
72 |       value: Blood pressure improved from 102/84 mmHg (May 2024) to 104/79 mmHg (May 2025) - diastolic decreased by 5 mmHg. Pain score worsened from 2/10 to 4/10.
73 | - description: Complex condition timeline
74 |   vars:
75 |     resource:
76 |       - file://tests/patient-history.json
77 |     question: What conditions were documented as resolved during the patient's care, when were they first diagnosed, and what was the duration of each condition?
78 |   assert:
79 |     - type: factuality
80 |       value: Three conditions resolved - Full-time employment (May 3, 2023 to May 8, 2024, duration 370 days), Social isolation (May 3, 2023 to May 8, 2024, duration 370 days), and Medication review due (May 8, 2024 to May 14, 2025, duration 371 days)
81 | 


--------------------------------------------------------------------------------
/evals/generation/config-multi-turn-tool-use.js:
--------------------------------------------------------------------------------
  1 | import validateFhirBundle from '../../tools/validateFhirBundle.mjs';
  2 | 
  3 | const openAItools = {
  4 |   tools: [
  5 |     {
  6 |       type: 'function',
  7 |       name: 'validate_fhir_bundle',
  8 |       description:
  9 |         'Validate a FHIR bundle - you should use this tool recursively to fix errors, using it again after you have called it to ensure that FHIR resources are fully valid after making changes',
 10 |       parameters: {
 11 |         type: 'object',
 12 |         properties: {
 13 |           bundle: {
 14 |             type: 'object',
 15 |             description: 'The FHIR bundle to validate',
 16 |           },
 17 |         },
 18 |         required: ['bundle'],
 19 |       },
 20 |     },
 21 |   ],
 22 |   functionToolCallbacks: {
 23 |     validate_fhir_bundle: validateFhirBundle,
 24 |   },
 25 | };
 26 | 
 27 | const anthropicTools = {
 28 |   tools: [
 29 |     {
 30 |       name: 'validate_fhir_bundle',
 31 |       description:
 32 |         'Validate a FHIR bundle - you should use this tool recursively to fix errors, using it again after you have called it to ensure that FHIR resources are fully valid after making changes',
 33 |       input_schema: {
 34 |         type: 'object',
 35 |         properties: {
 36 |           bundle: {
 37 |             type: 'object',
 38 |             description: 'The FHIR bundle to validate',
 39 |           },
 40 |         },
 41 |         required: ['bundle'],
 42 |       },
 43 |     },
 44 |   ],
 45 |   functionToolCallbacks: {
 46 |     validate_fhir_bundle: validateFhirBundle,
 47 |   },
 48 | };
 49 | 
 50 | const anthropicModel = {
 51 |   id: 'file://../../providers/AnthropicMessagesWithRecursiveToolCallsProvider.ts',
 52 |   transform: 'file://./markdown-transformer.js',
 53 | };
 54 | 
 55 | const anthropicModelConfig = {
 56 |   max_tokens: 8092,
 57 |   // tool_choice: 'auto',
 58 |   max_tool_calls: 10,
 59 |   ...anthropicTools,
 60 | };
 61 | 
 62 | const openAIModel = {
 63 |   id: 'file://../../providers/OpenAiResponsesWithRecursiveToolCallsProvider.ts',
 64 |   transform: 'file://./markdown-transformer.js',
 65 | };
 66 | 
 67 | const openAIModelConfig = {
 68 |   max_output_tokens: 16184,
 69 |   tool_choice: 'auto',
 70 |   max_tool_calls: 10,
 71 |   ...openAItools,
 72 | };
 73 | 
 74 | /** @type {import('promptfoo').TestSuiteConfig} */
 75 | const config = {
 76 |   description: 'FHIR Bundle Generation (Multi Turn Tool Use)',
 77 | 
 78 |   providers: [
 79 |     {
 80 |       ...openAIModel,
 81 |       config: {
 82 |         ...openAIModelConfig,
 83 |         model: 'gpt-3.5-turbo',
 84 |       },
 85 |       label: 'openai-gpt-3.5-turbo',
 86 |     },
 87 |     {
 88 |       ...openAIModel,
 89 |       config: {
 90 |         ...openAIModelConfig,
 91 |         model: 'gpt-4.1',
 92 |       },
 93 |       label: 'openai-gpt-4.1',
 94 |     },
 95 |     {
 96 |       ...openAIModel,
 97 |       config: {
 98 |         ...openAIModelConfig,
 99 |         model: 'o3',
100 |         reasoning_effort: 'low',
101 |       },
102 |       label: 'openai-o3-low',
103 |     },
104 |     {
105 |       ...openAIModel,
106 |       config: {
107 |         ...openAIModelConfig,
108 |         model: 'o3',
109 |         reasoning_effort: 'high',
110 |       },
111 |       label: 'openai-o3-high',
112 |     },
113 |     {
114 |       ...anthropicModel,
115 |       config: {
116 |         ...anthropicModelConfig,
117 |         model: 'claude-3-5-haiku-20241022',
118 |       },
119 |       label: 'anthropic-claude-3-5-haiku-20241022',
120 |     },
121 |     {
122 |       ...anthropicModel,
123 |       config: {
124 |         ...anthropicModelConfig,
125 |         model: 'claude-3-5-sonnet-20241022',
126 |       },
127 |       label: 'anthropic-claude-3-5-sonnet-202410224',
128 |     },
129 |     {
130 |       ...anthropicModel,
131 |       config: {
132 |         ...anthropicModelConfig,
133 |         model: 'claude-sonnet-4-20250514',
134 |       },
135 |       label: 'anthropic-claude-sonnet-4-20250514',
136 |     },
137 |     {
138 |       ...anthropicModel,
139 |       config: {
140 |         ...anthropicModelConfig,
141 |         model: 'claude-opus-4-20250514',
142 |       },
143 |       label: 'anthropic-claude-opus-4-20250514',
144 |     },
145 |   ],
146 | 
147 |   prompts: [
148 |     {
149 |       label: 'Unstructured Note to FHIR',
150 |       raw: `You are a health informaticist expert in FHIR. 
151 | You will receive unstructured notes and you need to structure them into FHIR resources.
152 | You must only include data that is present in the note.
153 | You must only return a valid FHIR JSON Bundle, with the appropriate resources, with no additional explanation.
154 | You may include multiple resources in the bundle.
155 | You must follow the FHIR R4 specification.
156 | You mut not include a meta element in the resources.
157 | When generating a CodeableConcept, you must include a coding element with a system, code, and display.
158 | When generating a CodeableConcept, you must use a display matching what is expected by the CodeSystem.
159 | Each entry in a Bundle must have a fullUrl which is the identity of the resource in the entry.
160 | The id of a resource must be a valid UUID in lowercase.
161 | 
162 | You have access to a validator tool that will validate the FHIR bundle.
163 | You should use this tool recursively to fix errors, using it again after you have called it to ensure that FHIR resources are fully valid after making changes.
164 | 
165 | Include the FHIR JSON bundle in your final response.
166 | <note>
167 | {{note}}
168 | </note>`,
169 |     },
170 |   ],
171 | 
172 |   defaultTest: {
173 |     assert: [
174 |       { type: 'is-json' },
175 |       { type: 'javascript', value: 'file://../../assertions/isBundle.mjs' },
176 |       { type: 'javascript', value: 'file://../../assertions/metaElementMissing.mjs' },
177 |       { type: 'javascript', value: 'file://../../assertions/validateOperation.mjs' },
178 |     ],
179 |   },
180 | 
181 |   tests: ['file://tests.yaml'],
182 | };
183 | 
184 | export default config;
185 | 


--------------------------------------------------------------------------------
/evals/generation/config-zero-shot-bundle.yaml:
--------------------------------------------------------------------------------
 1 | # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 2 | description: 'FHIR Bundle Generation (Zero Shot)'
 3 | 
 4 | providers:
 5 |   - file://providers.yaml
 6 | 
 7 | prompts:
 8 |   - label: 'Unstructured Note to FHIR'
 9 |     raw: |
10 |       You are a health informaticist expert in FHIR. 
11 |       You will receive unstructured notes and you need to structure them into FHIR resources.
12 |       You must only include data that is present in the note.
13 |       You must only return a valid FHIR JSON Bundle, with the appropriate resources, with no additional explanation.
14 |       You may include multiple resources in the bundle.
15 |       You must follow the FHIR R4 specification.
 16 |       You must not include a meta element in the resources.
 17 |       When generating a CodeableConcept, you must include a coding element with a system, code, and display.
18 |       When generating a CodeableConcept, you must use a display matching what is expected by the CodeSystem.
19 |       Each entry in a Bundle must have a fullUrl which is the identity of the resource in the entry.
20 |       The id of a resource must be a valid UUID in lowercase.
21 | 
22 |       You must only return JSON with no additional markup or explanation.
23 | 
24 |       <note>
25 |       {{note}}
26 |       </note>
27 | 
28 | defaultTest:
29 |   assert:
30 |     - type: is-json
31 |     - type: javascript
32 |       value: file://../../assertions/isBundle.mjs
33 |     - type: javascript
34 |       value: file://../../assertions/metaElementMissing.mjs
35 |     - type: javascript
36 |       value: file://../../assertions/validateOperation.mjs
37 | 
38 | tests:
39 |   - file://tests.yaml
40 | 


--------------------------------------------------------------------------------
/evals/generation/markdown-transformer.js:
--------------------------------------------------------------------------------
 1 | module.exports = (output, _context) => {
 2 |   // If the output contains a fenced JSON code block, extract and return the JSON inside the fence.
 3 |   if (typeof output === 'string') {
 4 |     // Look for ```json ... ``` first
 5 |     const jsonFenceMatch = output.match(/```json\s*([\s\S]*?)```/i);
 6 |     if (jsonFenceMatch && jsonFenceMatch[1]) {
 7 |       return jsonFenceMatch[1].trim();
 8 |     }
 9 | 
10 |     const alternateFenchMatch = output.match(/```\s*([\s\S]*?)```/i);
11 |     if (alternateFenchMatch && alternateFenchMatch[1]) {
12 |       return alternateFenchMatch[1].trim();
13 |     }
14 |   }
15 | 
16 |   // Default: return the original output untouched.
17 |   return output;
18 | };
19 | 


--------------------------------------------------------------------------------
/evals/generation/providers.yaml:
--------------------------------------------------------------------------------
 1 | - label: openai-gpt-3.5-turbo
 2 |   id: openai:chat:gpt-3.5-turbo
 3 |   config:
 4 |     max_output_tokens: 16184
 5 | - label: openai-gpt-4.1
 6 |   id: openai:responses:gpt-4.1
 7 |   config:
 8 |     max_output_tokens: 16184
 9 | - label: openai-o3-low
10 |   id: openai:responses:o3
11 |   config:
12 |     max_output_tokens: 16184
13 |     reasoning_effort: 'low'
14 | - label: openai-o3-high
15 |   id: openai:responses:o3
16 |   config:
17 |     max_output_tokens: 16184
18 |     reasoning_effort: 'high'
# Label carries the `anthropic-` prefix like the other Anthropic entries, and the output
# limit uses `max_tokens`, the key the sibling Anthropic entries in this file use.
- label: anthropic-claude-3-5-haiku-20241022
  id: anthropic:messages:claude-3-5-haiku-20241022
  config:
    max_tokens: 8192
23 | - label: anthropic-claude-3-5-sonnet-20241022
24 |   id: anthropic:messages:claude-3-5-sonnet-20241022
25 |   config:
26 |     max_tokens: 8192
27 | - label: anthropic-claude-sonnet-4-20250514
28 |   id: anthropic:messages:claude-sonnet-4-20250514
29 |   config:
30 |     max_tokens: 8192
31 |   transform: |
32 |     output = output.replace(/^```json\n/, '').replace(/\n```$/, '').trim();
33 |     return output;
34 | - label: anthropic-claude-opus-4-20250514
35 |   id: anthropic:messages:claude-opus-4-20250514
36 |   config:
37 |     max_tokens: 8192
38 |   transform: |
39 |     output = output.replace(/^```json\n/, '').replace(/\n```$/, '').trim();
40 |     return output;
41 | - label: google-gemini-2.0-flash
42 |   id: google:gemini-2.0-flash
43 |   config:
44 |     max_output_tokens: 16184
45 |     generationConfig:
46 |       response_mime_type: 'application/json'
47 | - label: google-gemini-2.5-flash-preview-05-20
48 |   id: google:gemini-2.5-flash-preview-05-20
49 |   config:
50 |     max_output_tokens: 16184
51 |     generationConfig:
52 |       response_mime_type: 'application/json'
53 | - label: google-gemini-2.5-pro-preview-05-06
54 |   id: google:gemini-2.5-pro-preview-05-06
55 |   config:
56 |     max_output_tokens: 16184
57 |     generationConfig:
58 |       response_mime_type: 'application/json'
59 | - label: ii-medical-8b
60 |   id: openai:chat:II-Medical-8B
61 |   config:
62 |     max_output_tokens: 16184
63 |     apiBaseUrl: https://g6ifi04b81u9oza5.us-east-1.aws.endpoints.huggingface.cloud/v1
64 |     showThinking: false
65 |   transform: |
66 |     output = output.replace(/<think>.*<\/think>/gis, '').trim();
67 |     output = output.replace(/<\s*\/?\s*Answer\s*>/gi, '').trim();
68 |     return output;
69 | - label: medgemma-4b-it
70 |   id: openai:chat:medgemma-4b-it
71 |   config:
72 |     max_output_tokens: 16184
73 |     apiBaseUrl: https://a6pf0b0uqcuajaua.us-east-1.aws.endpoints.huggingface.cloud/v1
74 |     showThinking: false
75 |   transform: |
76 |     output = output.replace(/<\s*\/?\s*Answer\s*>/gi, '').trim();
 77 |     output = output.replace(/```json\s*([\s\S]*?)```/i, '$1').trim();
78 |     return output;
79 | - label: medgemma-27b-text-it
80 |   id: openai:chat:medgemma-27b-text-it
81 |   config:
82 |     max_output_tokens: 16184
83 |     apiBaseUrl: https://i7n97jz1el3l39h5.us-east-1.aws.endpoints.huggingface.cloud/v1
84 |     showThinking: false
85 |   transform: |
86 |     output = output.replace(/<\s*\/?\s*Answer\s*>/gi, '').trim();
87 |     output = output.replace(/```json\s*([\s\S]*?)```/i, '$1').trim();
88 |     return output;
89 | 


--------------------------------------------------------------------------------
/evals/generation/tests.yaml:
--------------------------------------------------------------------------------
  1 | - description: Basic patient resource
  2 |   vars:
  3 |     note: |
  4 |       Patient Marie Curie (DOB: 1867-11-07)
  5 |   assert:
  6 |     - type: javascript
  7 |       value: |
  8 |         const result = JSON.parse(output);
  9 |         return (
 10 |           result.entry.some(e => e.resource.resourceType === 'Patient') &&
 11 |           result.entry.some(e => e.resource.name.some(n => n.given && n.given.includes('Marie') && n.family && n.family.includes('Curie'))) &&
 12 |           result.entry.some(e => e.resource.birthDate === '1867-11-07')
 13 |         );
 14 | 
 15 | - vars:
 16 |     note: |
 17 |       Patient: Emily Chen, born 2010-11-05, received influenza vaccine on 2023-10-15.
 18 |   assert:
 19 |     - type: javascript
 20 |       value: |
 21 |         const result = JSON.parse(output);
 22 |         return (
 23 |           result.entry.some(e =>
 24 |             e.resource.resourceType === 'Immunization' &&
 25 |             e.resource.vaccineCode?.coding?.some(c => c.display?.toLowerCase().includes('influenza'))
 26 |           )
 27 |         );
 28 | 
 29 | - vars:
 30 |     note: |
 31 |       Patient John Smith (DOB: 1990-01-15) presented with fever (39.2°C) and cough on 2024-03-15.
 32 |       BP was 120/80. History of asthma. Prescribed azithromycin 500mg daily for 3 days.
 33 |   assert:
 34 |     - type: javascript
 35 |       value: |
 36 |         const result = JSON.parse(output);
 37 |         return (
 38 |           result.entry.some(e => e.resource.resourceType === 'Patient') &&
 39 |           result.entry.some(e => e.resource.resourceType === 'Observation') &&
 40 |           result.entry.some(e => e.resource.resourceType === 'MedicationRequest')
 41 |         );
 42 | 
 43 | - description: Blood chemistry diagnostic report
 44 |   vars:
 45 |     note: |
 46 |       Patient: Carlos Ramirez (DOB: 1972-02-09) had routine blood work on 2024-04-20 showing elevated LDL cholesterol (160 mg/dL).
 47 |   assert:
 48 |     - type: javascript
 49 |       value: |
 50 |         const result = JSON.parse(output);
 51 |         return (
 52 |           result.entry.some(e => e.resource.resourceType === 'Patient') &&
 53 |           result.entry.some(e => e.resource.resourceType === 'DiagnosticReport') &&
 54 |           result.entry.some(e => e.resource.resourceType === 'Observation' && e.resource.code?.coding?.[0]?.display?.toLowerCase().includes('ldl'))
 55 |         );
 56 | 
 57 | - description: Documented peanut allergy
 58 |   vars:
 59 |     note: |
 60 |       Patient Sarah Johnson (DOB: 1985-07-22) has a severe peanut allergy resulting in anaphylaxis.
 61 |   assert:
 62 |     - type: javascript
 63 |       value: |
 64 |         const result = JSON.parse(output);
 65 |         return (
 66 |           result.entry.some(e => e.resource.resourceType === 'Patient') &&
 67 |           result.entry.some(e => e.resource.resourceType === 'AllergyIntolerance' && e.resource.code?.coding?.[0]?.display?.toLowerCase().includes('peanut'))
 68 |         );
 69 | 
 70 | - description: Outpatient encounter with blood pressure observation
 71 |   vars:
 72 |     note: |
 73 |       Patient Michael Lee (DOB: 1998-12-01) had an outpatient visit on 2024-05-02 with blood pressure 118/76 mmHg.
 74 |   assert:
 75 |     - type: javascript
 76 |       value: |
 77 |         const result = JSON.parse(output);
 78 |         return (
 79 |           result.entry.some(e => e.resource.resourceType === 'Encounter') &&
 80 |           result.entry.some(e => e.resource.resourceType === 'Observation' && (e.resource.code?.text?.toLowerCase().includes('blood pressure') || e.resource.code?.coding?.some(c => c.display?.toLowerCase().includes('blood pressure'))))
 81 |         );
 82 | 
 83 | - description: Chronic condition diabetes mellitus type 2
 84 |   vars:
 85 |     note: |
 86 |       Patient Olivia Nguyen (DOB: 1960-06-14) diagnosed with Type 2 diabetes mellitus on 2015-08-10.
 87 |   assert:
 88 |     - type: javascript
 89 |       value: |
 90 |         const result = JSON.parse(output);
 91 |         return (
 92 |           result.entry.some(e => e.resource.resourceType === 'Condition' && (e.resource.code?.text?.toLowerCase().includes('diabetes') || e.resource.code?.coding?.some(c => c.display?.toLowerCase().includes('diabetes'))))
 93 |         );
 94 | 
 95 | - description: Appendectomy procedure
 96 |   vars:
 97 |     note: |
 98 |       Patient Liam Patel (DOB: 2002-03-30) underwent an emergency appendectomy on 2024-01-12.
 99 |   assert:
100 |     - type: javascript
101 |       value: |
102 |         const result = JSON.parse(output);
103 |         return (
104 |           result.entry.some(e => e.resource.resourceType === 'Procedure' && (e.resource.code?.text?.toLowerCase().includes('appendectomy') || e.resource.code?.coding?.some(c => c.display?.toLowerCase().includes('appendectomy'))))
105 |         );
106 | 
107 | - description: Medication statement – metformin therapy
108 |   vars:
109 |     note: |
110 |       Patient Noah Kim (DOB: 1975-11-19) takes metformin 500mg twice daily for Type 2 diabetes.
111 |   assert:
112 |     - type: javascript
113 |       value: |
114 |         const result = JSON.parse(output);
115 |         return (
116 |           result.entry.some(e => e.resource.resourceType === 'MedicationStatement' && (e.resource.medicationCodeableConcept?.text?.toLowerCase().includes('metformin') || e.resource.medicationCodeableConcept?.coding?.some(c => c.display?.toLowerCase().includes('metformin'))))
117 |         );
118 | 
119 | - description: Healthcare organization details
120 |   vars:
121 |     note: |
122 |       Redwood Medical Center located at 456 Elm St, Springfield, phone 555-6789.
123 |   assert:
124 |     - type: javascript
125 |       value: |
126 |         const result = JSON.parse(output);
127 |         return (
128 |           result.entry.some(e => e.resource.resourceType === 'Organization' && e.resource.name?.includes('Redwood Medical Center'))
129 |         );
130 | 
131 | - description: Practitioner profile cardiologist
132 |   vars:
133 |     note: |
134 |       Cardiologist Dr. Jane Taylor, National Provider Identifier (NPI): 1234567890.
135 |   assert:
136 |     - type: javascript
137 |       value: |
138 |         const result = JSON.parse(output);
139 |         return (
140 |           result.entry.some(e => e.resource.resourceType === 'Practitioner' && e.resource.name?.some(n => n.family?.includes('Taylor'))) &&
141 |           result.entry.some(e => e.resource.resourceType === 'Practitioner' && e.resource.identifier?.some(id => id.value === '1234567890'))
142 |         );
143 | 
144 | - description: Insurance plan coverage period
145 |   vars:
146 |     note: |
147 |       UnitedHealthcare Gold Plan (policy ID 987654) valid from 2024-01-01 through 2024-12-31 for Daniel Williams (DOB: 1980-05-15).
148 |   assert:
149 |     - type: javascript
150 |       value: |
151 |         const result = JSON.parse(output);
152 |         return (
153 |           result.entry.some(e => e.resource.resourceType === 'Coverage') &&
154 |           result.entry.some(e => e.resource.resourceType === 'Patient')
155 |         );
156 | 
157 | - description: Outpatient claim for office visit
158 |   vars:
159 |     note: |
160 |       Claim of $123.45 for CPT code 99213 submitted on 2024-04-10.
161 |   assert:
162 |     - type: javascript
163 |       value: |
164 |         const result = JSON.parse(output);
165 |         return (
166 |           result.entry.some(e => e.resource.resourceType === 'Claim' && e.resource.total?.value === 123.45) &&
167 |           result.entry.some(e => e.resource.resourceType === 'Claim' && e.resource.item?.some(i => i.productOrService?.coding?.some(c => c.code === '99213')))
168 |         );
169 | 
170 | - description: Complex patient case
171 |   vars:
172 |     note: |
173 |       Patient John Smith (DOB: 1945-08-20) has hypertension and type 2 diabetes. Blood pressure reading on 2024-03-15 was 140/90 mmHg. A1C measured 7.2% on 2024-03-15.
174 |   assert:
175 |     - type: javascript
176 |       value: |
177 |         const result = JSON.parse(output);
178 |         return (
179 |           result.entry.some(e => e.resource.resourceType === 'Patient' && e.resource.name?.some(n => n.family === 'Smith')) &&
180 |           result.entry.some(e => e.resource.resourceType === 'Condition' && e.resource.code?.coding?.some(c => c.code === '44054006')) && // Type 2 diabetes
181 |           result.entry.some(e => e.resource.resourceType === 'Condition' && e.resource.code?.coding?.some(c => c.code === '38341003')) && // Hypertension
182 |           result.entry.some(e => e.resource.resourceType === 'Observation' && e.resource.code?.coding?.some(c => c.code === '85354-9')) && // Blood pressure
183 |           result.entry.some(e => e.resource.resourceType === 'Observation' && e.resource.code?.coding?.some(c => c.code === '4548-4')) // Hemoglobin A1c
184 |         );
185 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "@flexpa/llm-fhir-eval",
 3 |   "module": "index.ts",
 4 |   "version": "0.0.4",
 5 |   "scripts": {
 6 |     "eval": "promptfoo eval"
 7 |   },
 8 |   "devDependencies": {
 9 |     "promptfoo": "0.113.3"
10 |   },
11 |   "resolutions": {
12 |     "promptfoo@0.113.3": "patch:promptfoo@npm%3A0.113.3#./.yarn/patches/promptfoo-npm-0.113.3-239bf96f0e.patch"
13 |   },
14 |   "peerDependencies": {
15 |     "typescript": "^5.0.0"
16 |   },
17 |   "flexpa": {
18 |     "publishableRepo": true
19 |   },
20 |   "packageManager": "yarn@4.9.1"
21 | }
22 | 


--------------------------------------------------------------------------------
/providers/AnthropicMessagesWithRecursiveToolCallsProvider.ts:
--------------------------------------------------------------------------------
  1 | import { type EnvOverrides, type ProviderResponse } from 'promptfoo';
  2 | import type Anthropic from '@anthropic-ai/sdk';
  3 | import { APIError } from '@anthropic-ai/sdk';
  4 | import { getEnvInt, getEnvFloat } from 'promptfoo/dist/src/envars';
  5 | import logger from 'promptfoo/dist/src/logger';
  6 | import { maybeLoadToolsFromExternalFile } from 'promptfoo/dist/src/util';
  7 | import { AnthropicGenericProvider } from 'promptfoo/dist/src/providers/anthropic/generic';
  8 | import type { AnthropicMessageOptions } from 'promptfoo/dist/src/providers/anthropic/types';
  9 | import {
 10 |   outputFromMessage,
 11 |   parseMessages,
 12 |   calculateAnthropicCost,
 13 |   getTokenUsage,
 14 |   ANTHROPIC_MODELS,
 15 | } from 'promptfoo/dist/src/providers/anthropic/util';
 16 | 
 17 | /* eslint-disable @typescript-eslint/no-explicit-any */
 18 | export default class AnthropicMessagesProvider extends AnthropicGenericProvider {
 19 |   declare config: AnthropicMessageOptions;
 20 |   private initializationPromise: Promise<void> | null = null;
 21 | 
 22 |   static ANTHROPIC_MODELS = ANTHROPIC_MODELS;
 23 | 
 24 |   static ANTHROPIC_MODELS_NAMES = ANTHROPIC_MODELS.map((model) => model.id);
 25 | 
 26 |   constructor(options: { id?: string; config?: AnthropicMessageOptions; env?: EnvOverrides } = {}) {
 27 |     if (!AnthropicMessagesProvider.ANTHROPIC_MODELS_NAMES.includes(options.config!.model!)) {
 28 |       logger.warn(`Using unknown Anthropic model: ${options.config?.model}`);
 29 |     }
 30 |     super(options.config?.model || 'claude-3-5-sonnet-20241022', options);
 31 |     const { id } = options;
 32 |     this.id = id ? () => id : this.id;
 33 |   }
 34 | 
 35 |   async cleanup(): Promise<void> {}
 36 | 
 37 |   toString(): string {
 38 |     if (!this.modelName) {
 39 |       throw new Error('Anthropic model name is not set. Please provide a valid model name.');
 40 |     }
 41 |     return `[Anthropic Messages Provider ${this.modelName}]`;
 42 |   }
 43 | 
 44 |   private extractToolUses(message: any): {
 45 |     toolUses: Array<{ id: string; name: string; arguments: any }>;
 46 |     resultText: string;
 47 |   } {
 48 |     const toolUses: Array<{ id: string; name: string; arguments: any }> = [];
 49 |     let resultText = '';
 50 | 
 51 |     // The assistant message content is an array of blocks. Iterate over them to find tool calls.
 52 |     if (Array.isArray((message as any).content)) {
 53 |       for (const block of (message as any).content) {
 54 |         if (block.type === 'tool_use') {
 55 |           toolUses.push({ id: block.id, name: block.name, arguments: block.input });
 56 |         } else if (block.type === 'text') {
 57 |           resultText += block.text;
 58 |         }
 59 |       }
 60 |     } else if (typeof (message as any).content === 'string') {
 61 |       // Fallback: if the content is a simple string, treat it as text.
 62 |       resultText += (message as any).content;
 63 |     }
 64 | 
 65 |     return { toolUses, resultText };
 66 |   }
 67 | 
 68 |   private async executeToolCall(
 69 |     toolUse: { id: string; name: string; arguments: any },
 70 |     // eslint-disable-next-line @typescript-eslint/no-explicit-any
 71 |     callbacks: Record<string, (...args: any[]) => any>,
 72 |   ): Promise<{ role: 'user'; content: Array<{ type: 'tool_result'; tool_use_id: string; content: string }> }> {
 73 |     const { name, id, arguments: args } = toolUse;
 74 | 
 75 |     if (!callbacks[name]) {
 76 |       logger.warn(`No callback configured for tool '${name}'. Returning placeholder result.`);
 77 |       return {
 78 |         role: 'user',
 79 |         content: [
 80 |           {
 81 |             type: 'tool_result',
 82 |             tool_use_id: id,
 83 |             content: `No callback configured for tool '${name}'.`,
 84 |           },
 85 |         ],
 86 |       } as any;
 87 |     }
 88 | 
 89 |     try {
 90 |       const toolResult = await callbacks[name](typeof args === 'string' ? args : JSON.stringify(args));
 91 |       return {
 92 |         role: 'user',
 93 |         content: [
 94 |           {
 95 |             type: 'tool_result',
 96 |             tool_use_id: id,
 97 |             content: typeof toolResult === 'string' ? toolResult : JSON.stringify(toolResult),
 98 |           },
 99 |         ],
100 |       } as any;
101 |     } catch (error) {
102 |       logger.error(`Error executing tool '${name}': ${error}`);
103 |       return {
104 |         role: 'user',
105 |         content: [
106 |           {
107 |             type: 'tool_result',
108 |             tool_use_id: id,
109 |             content: `error: ${error instanceof Error ? error.message : String(error)}`,
110 |           },
111 |         ],
112 |       } as any;
113 |     }
114 |   }
115 | 
116 |   async callApi(prompt: string): Promise<ProviderResponse> {
117 |     // Wait for MCP initialization if it's in progress
118 |     if (this.initializationPromise) {
119 |       await this.initializationPromise;
120 |     }
121 | 
122 |     if (!this.apiKey) {
123 |       throw new Error(
124 |         'Anthropic API key is not set. Set the ANTHROPIC_API_KEY environment variable or add `apiKey` to the provider config.',
125 |       );
126 |     }
127 | 
128 |     if (!this.modelName) {
129 |       throw new Error('Anthropic model name is not set. Please provide a valid model name.');
130 |     }
131 | 
132 |     const { system, extractedMessages, thinking } = parseMessages(prompt);
133 | 
134 |     // Get MCP tools if client is initialized
135 |     const fileTools = maybeLoadToolsFromExternalFile(this.config.tools) || [];
136 |     const allTools = [...fileTools];
137 | 
138 |     const maxDepth = (this.config as any)?.maxDepth ?? 10;
139 |     const toolCallbacks: Record<string, (...args: any[]) => any> | undefined = (this.config as any)
140 |       ?.functionToolCallbacks;
141 | 
142 |     let depth = 0;
143 |     let messages: any[] = extractedMessages as any[];
144 | 
145 |     let accumTokenUsage = { prompt: 0, completion: 0, total: 0 } as {
146 |       prompt: number;
147 |       completion: number;
148 |       total: number;
149 |     };
150 |     let accumCost = 0;
151 | 
152 |     const headers: Record<string, string> = {
153 |       ...(this.config.headers || {}),
154 |     };
155 | 
156 |     // Add beta features header if specified
157 |     if (this.config.beta?.length) {
158 |       headers['anthropic-beta'] = this.config.beta.join(',');
159 |     }
160 | 
161 |     while (depth <= maxDepth) {
162 |       const params: Anthropic.MessageCreateParams = {
163 |         model: this.modelName,
164 |         ...(system ? { system } : {}),
165 |         max_tokens:
166 |           this.config?.max_tokens || getEnvInt('ANTHROPIC_MAX_TOKENS', this.config.thinking || thinking ? 2048 : 1024),
167 |         messages,
168 |         stream: false,
169 |         temperature:
170 |           this.config.thinking || thinking
171 |             ? this.config.temperature
172 |             : this.config.temperature || getEnvFloat('ANTHROPIC_TEMPERATURE', 0),
173 |         ...(allTools.length > 0 ? { tools: allTools } : {}),
174 |         ...(this.config.tool_choice ? { tool_choice: this.config.tool_choice } : {}),
175 |         ...(this.config.thinking || thinking ? { thinking: this.config.thinking || thinking } : {}),
176 |         ...(typeof this.config?.extra_body === 'object' && this.config.extra_body ? this.config.extra_body : {}),
177 |       };
178 | 
179 |       logger.debug(`Calling Anthropic Messages API (depth ${depth}): ${JSON.stringify(params)}`);
180 | 
181 |       let response: any;
182 |       try {
183 |         response = await this.anthropic.messages.create(params, {
184 |           ...(Object.keys(headers).length > 0 ? { headers } : {}),
185 |         });
186 |       } catch (err) {
187 |         logger.error(`Anthropic Messages API call error: ${err instanceof Error ? err.message : String(err)}`);
188 |         if (err instanceof APIError && err.error) {
189 |           const errorDetails = err.error as { error: { message: string; type: string } };
190 |           return {
191 |             error: `API call error: ${errorDetails.error.message}, status ${err.status}, type ${errorDetails.error.type}`,
192 |           };
193 |         }
194 |         return {
195 |           error: `API call error: ${err instanceof Error ? err.message : String(err)}`,
196 |         };
197 |       }
198 | 
199 |       // Aggregate token usage and cost
200 |       const tokenUsage = getTokenUsage(response, false);
201 |       accumTokenUsage = {
202 |         prompt: (accumTokenUsage.prompt || 0) + (tokenUsage.prompt || 0),
203 |         completion: (accumTokenUsage.completion || 0) + (tokenUsage.completion || 0),
204 |         total: (accumTokenUsage.total || 0) + (tokenUsage.total || 0),
205 |       };
206 | 
207 |       const callCost = calculateAnthropicCost(
208 |         this.modelName,
209 |         this.config,
210 |         (response as any).usage?.input_tokens,
211 |         (response as any).usage?.output_tokens,
212 |       );
213 |       accumCost += callCost || 0;
214 | 
215 |       // Push assistant response to history for next iteration (if any)
216 |       messages.push({ role: 'assistant', content: (response as any).content } as any);
217 | 
218 |       // Extract tool uses from this response
219 |       const { toolUses } = this.extractToolUses(response);
220 | 
221 |       if (toolUses.length === 0 || depth >= maxDepth) {
222 |         return {
223 |           output: outputFromMessage(response, this.config.showThinking ?? true),
224 |           tokenUsage: accumTokenUsage,
225 |           cost: accumCost,
226 |           metadata: {
227 |             depth,
228 |           },
229 |         };
230 |       }
231 | 
232 |       // Execute tools and append their results to history
233 |       const toolResultMessages: any[] = [];
234 |       for (const toolUse of toolUses) {
235 |         const toolMessage = await this.executeToolCall(toolUse, toolCallbacks || {});
236 |         toolResultMessages.push(toolMessage);
237 |       }
238 | 
239 |       messages = [...messages, ...toolResultMessages];
240 | 
241 |       depth += 1;
242 |     }
243 | 
244 |     // If loop exits without returning, it means we exceeded maxDepth with pending tool calls
245 |     return {
246 |       output: 'Reached maximum recursion depth without completing all tool calls.',
247 |       tokenUsage: accumTokenUsage,
248 |       cost: accumCost,
249 |       metadata: {
250 |         depth,
251 |         reachedMaxDepth: true,
252 |       },
253 |     };
254 |   }
255 | }
256 | 


--------------------------------------------------------------------------------
/providers/OpenAiResponsesWithRecursiveToolCallsProvider.ts:
--------------------------------------------------------------------------------
  1 | /* eslint-disable @typescript-eslint/no-explicit-any */
  2 | import promptfoo from 'promptfoo';
  3 | import { OpenAiResponsesProvider } from 'promptfoo/dist/src/providers/openai/responses';
  4 | import { getTokenUsage, calculateOpenAICost, formatOpenAiError } from 'promptfoo/dist/src/providers/openai/util';
  5 | import logger from 'promptfoo/dist/src/logger';
  6 | import type { CallApiContextParams, CallApiOptionsParams, ProviderResponse, EnvOverrides } from 'promptfoo';
  7 | import type { OpenAiCompletionOptions } from 'promptfoo/dist/src/providers/openai/types';
  8 | 
  9 | const fetchWithCache = promptfoo.cache.fetchWithCache;
 10 | 
 11 | const REQUEST_TIMEOUT_MS = 1200000; // 20 minutes
 12 | 
 13 | type ApiResponse = {
 14 |   data: any;
 15 |   cached: boolean;
 16 |   status: number;
 17 |   statusText: string;
 18 | };
 19 | 
 20 | type FunctionCall = {
 21 |   call_id: string;
 22 |   name: string;
 23 |   input?: any;
 24 |   arguments?: any;
 25 | };
 26 | 
 27 | type OpenAiResponsesWithRecursiveToolCallsProviderOptions = OpenAiCompletionOptions & {
 28 |   model: string;
 29 | };
 30 | 
 31 | /**
 32 |  * OpenAI Responses provider that keeps calling the API while the model requests function calls.
 33 |  *
 34 |  * Each requested function is looked up in `config.functionToolCallbacks`, executed locally, and its
 35 |  * output is sent back with `previous_response_id`. Processing stops when the model requests no more
 36 |  * functions, refuses, an error occurs, no requested function has a callback, or max depth is reached.
 37 |  */
 38 | export default class OpenAiResponsesWithRecursiveToolCallsProvider extends OpenAiResponsesProvider {
 39 |   /**
 40 |    * @param options Provider options; `options.config.model` selects the model, defaulting to 'o3'.
 41 |    */
 42 |   constructor(
 43 |     options: { config?: OpenAiResponsesWithRecursiveToolCallsProviderOptions; id?: string; env?: EnvOverrides } = {},
 44 |   ) {
 45 |     super(options.config?.model || 'o3', options);
 46 |   }
 47 | 
 48 |   /**
 49 |    * POSTs `body` as JSON to the `/responses` endpoint through `fetchWithCache`.
 50 |    *
 51 |    * @param body Request payload.
 52 |    * @param config Provider config; `config.headers` is spread last and overrides the default headers.
 53 |    * @returns The value resolved by `fetchWithCache`.
 54 |    */
 55 |   private async makeApiCall(body: any, config: any): Promise<ApiResponse> {
 56 |     return await fetchWithCache(
 57 |       `${this.getApiUrl()}/responses`,
 58 |       {
 59 |         method: 'POST',
 60 |         headers: {
 61 |           'Content-Type': 'application/json',
 62 |           Authorization: `Bearer ${this.getApiKey()}`,
 63 |           ...(this.getOrganization() ? { 'OpenAI-Organization': this.getOrganization() } : {}),
 64 |           ...config.headers,
 65 |         },
 66 |         body: JSON.stringify(body),
 67 |       },
 68 |       REQUEST_TIMEOUT_MS,
 69 |     );
 70 |   }
 71 | 
 72 |   /**
 73 |    * Collects function calls, text and refusals from the `output` items of a response.
 74 |    *
 75 |    * @param output The `output` array of a response payload.
 76 |    * @returns `functionCalls` in the order found; `result`, built from `output_text` parts but overwritten
 77 |    *   with the JSON of any function call or `tool_result` item; and the last `refusal` seen, or `null`.
 78 |    */
 79 |   private extractFunctionCalls(output: any[]): {
 80 |     functionCalls: FunctionCall[];
 81 |     result: string;
 82 |     refusal: string | null;
 83 |   } {
 84 |     let result = '';
 85 |     let refusal: string | null = null;
 86 |     const functionCalls: FunctionCall[] = [];
 87 | 
 88 |     for (const item of output) {
 89 |       if (item.type === 'function_call') {
 90 |         functionCalls.push(item);
 91 |         result = JSON.stringify(item);
 92 |       } else if (item.type === 'message' && item.role === 'assistant') {
 93 |         if (item.content) {
 94 |           for (const contentItem of item.content) {
 95 |             if (contentItem.type === 'output_text') {
 96 |               result += contentItem.text;
 97 |             } else if (contentItem.type === 'tool_use' || contentItem.type === 'function_call') {
 98 |               functionCalls.push(contentItem);
 99 |               result = JSON.stringify(contentItem);
100 |             } else if (contentItem.type === 'refusal') {
101 |               refusal = contentItem.refusal;
102 |             }
103 |           }
104 |         } else if (item.refusal) {
105 |           refusal = item.refusal;
106 |         }
107 |       } else if (item.type === 'tool_result') {
108 |         result = JSON.stringify(item);
109 |       }
110 |     }
111 | 
112 |     return { functionCalls, result, refusal };
113 |   }
114 | 
115 |   /**
116 |    * Runs the callback registered for `functionCall` and wraps its outcome as a `function_call_output` item.
117 |    *
118 |    * The callback receives the arguments as a string (non-string arguments are JSON-stringified). A thrown
119 |    * error is logged and reported to the model as `error: …` instead of being rethrown.
120 |    *
121 |    * @param functionCall Function call requested by the model.
122 |    * @param callbacks Callbacks keyed by function name.
123 |    * @returns Item carrying the call's `call_id` and a string `output`.
124 |    */
125 |   private async executeFunctionCall(
126 |     functionCall: FunctionCall,
127 |     // eslint-disable-next-line @typescript-eslint/no-unsafe-function-type
128 |     callbacks: Record<string, Function>,
129 |   ): Promise<{ type: string; call_id: string; output: any }> {
130 |     const functionName = functionCall.name;
131 |     const functionArgs = functionCall.input || functionCall.arguments;
132 |     let output: string;
133 | 
134 |     try {
135 |       const functionResult = await callbacks[functionName](
136 |         typeof functionArgs === 'string' ? functionArgs : JSON.stringify(functionArgs),
137 |       );
138 |       // The API expects `output` to be a string, so serialize anything else the callback returns.
139 |       output = typeof functionResult === 'string' ? functionResult : (JSON.stringify(functionResult) ?? '');
140 |     } catch (error) {
141 |       logger.error(`Error executing function ${functionName}: ${error}`);
142 |       output = `error: ${error}`;
143 |     }
144 | 
145 |     return { type: 'function_call_output', call_id: functionCall.call_id, output };
146 |   }
147 | 
148 |   /**
149 |    * Turns one API response into a provider result, recursing while the model requests function calls.
150 |    *
151 |    * @param initialBody Body of the request that produced `response`; base of the follow-up request.
152 |    * @param response Response to process.
153 |    * @param config Provider config; `functionToolCallbacks` maps function names to callbacks.
154 |    * @param depth Number of follow-up requests made so far.
155 |    * @param maxDepth Depth at which pending function calls are no longer executed.
156 |    * @param accumTokenUsage Token usage accumulated over earlier responses.
157 |    * @param accumCost Cost accumulated over earlier responses.
158 |    * @returns The final provider result; token usage and cost are summed over the chain of responses.
159 |    */
160 |   private async processResponse(
161 |     initialBody: any,
162 |     response: ApiResponse,
163 |     config: any,
164 |     depth: number = 0,
165 |     maxDepth: number = 10,
166 |     accumTokenUsage: { prompt: number; completion: number; total: number } = { prompt: 0, completion: 0, total: 0 },
167 |     accumCost: number = 0,
168 |   ): Promise<ProviderResponse> {
169 |     const { data, cached } = response;
170 | 
171 |     if (data.error) {
172 |       await data.deleteFromCache?.();
173 |       return { error: formatOpenAiError(data) };
174 |     }
175 | 
176 |     const { functionCalls, result, refusal } = this.extractFunctionCalls(data.output || []);
177 | 
178 |     // Add this response's usage and cost to the totals carried over from earlier responses.
179 |     const currentTokenUsage = getTokenUsage(data, cached);
180 |     const currentCost = calculateOpenAICost(
181 |       this.modelName,
182 |       config,
183 |       data.usage?.input_tokens,
184 |       data.usage?.output_tokens,
185 |       0,
186 |       0,
187 |     );
188 |     const totalTokenUsage = {
189 |       prompt: (accumTokenUsage.prompt || 0) + (currentTokenUsage.prompt || 0),
190 |       completion: (accumTokenUsage.completion || 0) + (currentTokenUsage.completion || 0),
191 |       total: (accumTokenUsage.total || 0) + (currentTokenUsage.total || 0),
192 |     };
193 |     const totalCost = (accumCost || 0) + (currentCost || 0);
194 | 
195 |     // Result for the current response; the returns below reuse it and override individual fields.
196 |     const current = {
197 |       output: result,
198 |       tokenUsage: totalTokenUsage,
199 |       cached,
200 |       cost: totalCost,
201 |       raw: data,
202 |       metadata: { depth },
203 |     };
204 | 
205 |     if (refusal) {
206 |       return { ...current, output: refusal, isRefusal: true };
207 |     }
208 | 
209 |     const callbacks = config.functionToolCallbacks;
210 |     const canRunAny = Boolean(callbacks) && functionCalls.some((call) => Boolean(callbacks[call.name]));
211 | 
212 |     // Stop when no function was requested, the depth budget is spent, or none of the requested functions
213 |     // has a callback (a follow-up request would have no real tool output to send).
214 |     if (functionCalls.length === 0 || depth >= maxDepth || !canRunAny) {
215 |       return current;
216 |     }
217 | 
218 |     // Every requested call must be answered in the follow-up request, so calls without a callback get
219 |     // an error output instead of being dropped.
220 |     const toolOutputs: { type: string; call_id: string; output: any }[] = [];
221 |     for (const functionCall of functionCalls) {
222 |       if (callbacks[functionCall.name]) {
223 |         toolOutputs.push(await this.executeFunctionCall(functionCall, callbacks));
224 |       } else {
225 |         toolOutputs.push({
226 |           type: 'function_call_output',
227 |           call_id: functionCall.call_id,
228 |           output: `error: no callback registered for function ${functionCall.name}`,
229 |         });
230 |       }
231 |     }
232 | 
233 |     const nextBody = {
234 |       ...initialBody,
235 |       input: toolOutputs,
236 |       previous_response_id: response.data.id,
237 |     };
238 | 
239 |     try {
240 |       const nextResponse = await this.makeApiCall(nextBody, config);
241 | 
242 |       if (nextResponse.status < 200 || nextResponse.status >= 300) {
243 |         logger.info(`API ERROR: ${JSON.stringify(nextResponse.data)}`);
244 |         return {
245 |           ...current,
246 |           error: `API error: ${nextResponse.status} ${nextResponse.statusText}`,
247 |           cached: false,
248 |         };
249 |       }
250 | 
251 |       const nextResult = await this.processResponse(
252 |         nextBody,
253 |         nextResponse,
254 |         config,
255 |         depth + 1,
256 |         maxDepth,
257 |         totalTokenUsage,
258 |         totalCost,
259 |       );
260 | 
261 |       // A result produced by executing functions is reported as uncached.
262 |       return { ...nextResult, cached: false };
263 |     } catch (err) {
264 |       logger.error(`API call error in recursive call: ${err}`);
265 |       return {
266 |         ...current,
267 |         error: `API call error in recursive call: ${err}`,
268 |         cached: false,
269 |       };
270 |     }
271 |   }
272 | 
273 |   /**
274 |    * Sends `prompt` to the Responses API and resolves any function calls the model makes.
275 |    *
276 |    * @param prompt Prompt passed to `getOpenAiBody` to build the request body.
277 |    * @param context Call context forwarded to `getOpenAiBody`.
278 |    * @param callApiOptions Call options forwarded to `getOpenAiBody`.
279 |    * @returns The final result; a failed first request yields an object carrying `error`.
280 |    * @throws Error when no API key is configured.
281 |    */
282 |   async callApi(
283 |     prompt: string,
284 |     context?: CallApiContextParams,
285 |     callApiOptions?: CallApiOptionsParams,
286 |   ): Promise<ProviderResponse> {
287 |     if (!this.getApiKey()) {
288 |       throw new Error(
289 |         'OpenAI API key is not set. Set the OPENAI_API_KEY environment variable or add `apiKey` to the provider config.',
290 |       );
291 |     }
292 | 
293 |     const { body: initialBody, config } = this.getOpenAiBody(prompt, context, callApiOptions);
294 |     logger.debug(`Calling OpenAI Responses API: ${JSON.stringify(initialBody)}`);
295 | 
296 |     try {
297 |       const initialResponse = await this.makeApiCall(initialBody, config);
298 | 
299 |       if (initialResponse.status < 200 || initialResponse.status >= 300) {
300 |         return {
301 |           error: `API error: ${initialResponse.status} ${initialResponse.statusText}\n${
302 |             typeof initialResponse.data === 'string' ? initialResponse.data : JSON.stringify(initialResponse.data)
303 |           }`,
304 |         };
305 |       }
306 | 
307 |       // Start recursive processing
308 |       return await this.processResponse(initialBody, initialResponse, config);
309 |     } catch (err) {
310 |       logger.error(`Initial API call error: ${err}`);
311 |       return {
312 |         error: `Initial API call error: ${err}`,
313 |       };
314 |     }
315 |   }
316 | }
317 | 


--------------------------------------------------------------------------------
/tools/validateFhirBundle.mjs:
--------------------------------------------------------------------------------
 1 | import { validate } from '../assertions/validateOperation.mjs';
 2 | 
 3 | export default async function validateFhirBundle(bundle) {
 4 |   const response = await validate(JSON.stringify(JSON.parse(bundle).bundle));
 5 |   if (response.length > 0) {
 6 |     return JSON.stringify(response);
 7 |   }
 8 |   return `No errors found. Here is the bundle: ${JSON.stringify(JSON.parse(bundle).bundle)}`;
 9 | }
10 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     // Enable latest features
 4 |     "lib": ["ESNext", "DOM"],
 5 |     "target": "ESNext",
 6 |     "module": "ESNext",
 7 |     "moduleDetection": "force",
 8 |     "jsx": "react-jsx",
 9 |     "allowJs": true,
10 | 
11 |     // Module resolution and emit settings
12 |     "moduleResolution": "node",
13 |     "allowImportingTsExtensions": true,
14 |     "verbatimModuleSyntax": true,
15 |     "noEmit": true,
16 | 
17 |     // Best practices
18 |     "strict": true,
19 |     "skipLibCheck": true,
20 |     "noFallthroughCasesInSwitch": true,
21 | 
22 |     // Some stricter flags (disabled by default)
23 |     "noUnusedLocals": false,
24 |     "noUnusedParameters": false,
25 |     "noPropertyAccessFromIndexSignature": false
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------