├── tests
│   ├── __init__.py
│   ├── helpers
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── data
│   │   ├── invalid.jsonl
│   │   └── correctness_sm.jsonl
│   ├── custom_metric_test.py
│   ├── classification_metrics_test.py
│   ├── generation_semantic_test.py
│   ├── llm_test.py
│   ├── generation_metrics_test.py
│   └── retrieval_metrics_test.py
├── continuous_eval
│   ├── __init__.py
│   ├── metrics
│   │   ├── code
│   │   │   ├── __init__.py
│   │   │   ├── python
│   │   │   │   └── __init__.py
│   │   │   └── sql
│   │   │       ├── __init__.py
│   │   │       ├── prompts
│   │   │       │   ├── sql_correctness_user.jinja2
│   │   │       │   └── sql_correctness_sys.jinja2
│   │   │       └── llm.py
│   │   ├── __init__.py
│   │   ├── base
│   │   │   ├── __init__.py
│   │   │   └── llm.py
│   │   ├── custom
│   │   │   ├── __init__.py
│   │   │   ├── custom_metric_user.jinja2
│   │   │   ├── custom_metric_sys_probabilistic.jinja2
│   │   │   ├── custom_metric_sys.jinja2
│   │   │   └── custom_metric.py
│   │   ├── generation
│   │   │   └── text
│   │   │       ├── prompts
│   │   │       │   ├── faithfulness_user.jinja2
│   │   │       │   ├── ans_relevance_user.jinja2
│   │   │       │   ├── style_consistency_user.jinja2
│   │   │       │   ├── ans_correctness_user.jinja2
│   │   │       │   ├── ans_relevance_sys.jinja2
│   │   │       │   ├── faithfulness_sys.jinja2
│   │   │       │   ├── style_consistency_sys.jinja2
│   │   │       │   └── ans_correctness_sys.jinja2
│   │   │       ├── __init__.py
│   │   │       ├── utils.py
│   │   │       └── bert.py
│   │   ├── retrieval
│   │   │   ├── prompts
│   │   │   │   ├── context_precision_user.jinja2
│   │   │   │   ├── context_coverage_user.jinja2
│   │   │   │   ├── context_precision_sys.jinja2
│   │   │   │   └── context_coverage_sys.jinja2
│   │   │   ├── __init__.py
│   │   │   ├── simple_tokenizer.py
│   │   │   ├── tokens.py
│   │   │   ├── precision_recall_f1.py
│   │   │   ├── matching_strategy.py
│   │   │   └── ranked.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   └── classification.py
│   │   └── tools
│   │       └── match.py
│   ├── utils
│   │   ├── generic.py
│   │   ├── types.py
│   │   └── telemetry.py
│   ├── llms
│   │   ├── __init__.py
│   │   ├── openai.py
│   │   ├── cohere.py
│   │   ├── base.py
│   │   ├── anthropic.py
│   │   ├── bedrock.py
│   │   ├── anthropic_bedrock.py
│   │   ├── google.py
│   │   └── azure.py
│   └── eval
│       ├── __init__.py
│       ├── types.py
│       ├── tests.py
│       ├── logger.py
│       └── modules.py
├── docs
│   ├── tsconfig.json
│   ├── src
│   │   ├── env.d.ts
│   │   ├── content
│   │   │   ├── config.ts
│   │   │   └── docs
│   │   │       ├── metrics
│   │   │       │   ├── Generation
│   │   │       │   │   ├── LLM-Based
│   │   │       │   │   │   ├── faithfulness.md
│   │   │       │   │   │   ├── relevance.md
│   │   │       │   │   │   ├── correctness.md
│   │   │       │   │   │   └── style.md
│   │   │       │   │   ├── Semantic
│   │   │       │   │   │   ├── bert_answer_relevance.md
│   │   │       │   │   │   ├── bert_answer_similarity.md
│   │   │       │   │   │   └── deberta_answer_scores.md
│   │   │       │   │   └── Deterministic
│   │   │       │   │       ├── flesch_kincaid_readability.md
│   │   │       │   │       ├── correctness.md
│   │   │       │   │       └── faithfulness.md
│   │   │       │   ├── Code
│   │   │       │   │   ├── Deterministic
│   │   │       │   │   │   ├── code_string_match.md
│   │   │       │   │   │   ├── sql_syntax_match.md
│   │   │       │   │   │   ├── python_ast_similarity.md
│   │   │       │   │   │   └── sql_ast_similarity.md
│   │   │       │   │   └── Probabilistic
│   │   │       │   │       └── llm-based.md
│   │   │       │   ├── Tools
│   │   │       │   │   └── Deterministic
│   │   │       │   │       └── tool_selection.md
│   │   │       │   ├── Retrieval
│   │   │       │   │   ├── LLM-Based
│   │   │       │   │   │   ├── context_precision.md
│   │   │       │   │   │   └── context_coverage.md
│   │   │       │   │   └── Deterministic
│   │   │       │   │       ├── token_count.md
│   │   │       │   │       ├── rank_aware_metrics.md
│   │   │       │   │       └── precision_recall.md
│   │   │       │   ├── Classification
│   │   │       │   │   └── Deterministic
│   │   │       │   │       └── classification_metrics.md
│   │   │       │   ├── base.md
│   │   │       │   └── overview.md
│   │   │       ├── getting-started
│   │   │       │   ├── Introduction.md
│   │   │       │   ├── installation.md
│   │   │       │   └── quickstart.md
│   │   │       ├── pipeline
│   │   │       │   ├── eval_runner.mdx
│   │   │       │   ├── pipeline_logger.mdx
│   │   │       │   ├── llms.mdx
│   │   │       │   ├── pipeline.md
│   │   │       │   └── metrics_and_tests.md
│   │   │       └── index.mdx
│   │   ├── components
│   │   │   └── ThemeSelect.astro
│   │   └── styles
│   │       └── custom.css
│   ├── public
│   │   ├── module-level-eval.png
│   │   ├── continuous-eval-logo.png
│   │   ├── synthetic-data-demo.png
│   │   └── favicon.svg
│   ├── .gitignore
│   ├── package.json
│   ├── README.md
│   └── astro.config.mjs
├── .env.example
├── examples
│   ├── single_metric.py
│   ├── evaluation_on_dataset.py
│   └── modular_evaluation.py
├── .github
│   └── workflows
│       └── codeflash-optimize.yaml
├── .pre-commit-config.yaml
├── pyproject.toml
├── CODE_OF_CONDUCT.md
└── .gitignore

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/continuous_eval/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": "astro/tsconfigs/strict"
3 | }
4 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.base import Metric
2 | 
--------------------------------------------------------------------------------
/docs/src/env.d.ts:
--------------------------------------------------------------------------------
1 | ///
2 | ///
3 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .metric import Arg, Field, Metric
2 | from .prompt import MetricPrompt
3 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/__init__.py:
--------------------------------------------------------------------------------
1 | from .custom_metric import ProbabilisticCustomMetric, CustomMetric, Example
2 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/python/__init__.py:
--------------------------------------------------------------------------------
1 | from .code_deterministic_metrics import CodeStringMatch, PythonASTSimilarity
2 | 
--------------------------------------------------------------------------------
/docs/public/module-level-eval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/relari-ai/continuous-eval/HEAD/docs/public/module-level-eval.png
--------------------------------------------------------------------------------
/continuous_eval/utils/generic.py:
--------------------------------------------------------------------------------
1 | def all_sets_equal(d: dict) -> bool:
2 |     return len(set(map(frozenset, d.values()))) == 1
3 | 
--------------------------------------------------------------------------------
/docs/public/continuous-eval-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/relari-ai/continuous-eval/HEAD/docs/public/continuous-eval-logo.png
--------------------------------------------------------------------------------
/docs/public/synthetic-data-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/relari-ai/continuous-eval/HEAD/docs/public/synthetic-data-demo.png
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/sql/__init__.py:
--------------------------------------------------------------------------------
1 | from .deterministic import SQLASTSimilarity, SQLSyntaxMatch
2 | from .llm import SQLCorrectness
3 | 
--------------------------------------------------------------------------------
/continuous_eval/llms/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import LLMFactory
2 | from .openai import OpenAI
3 | 
4 | LLMFactory.register_provider("openai", OpenAI)
5 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/faithfulness_user.jinja2:
--------------------------------------------------------------------------------
1 | Context:
2 | ```
3 | {{ context }}
4 | ```
5 | 
6 | Statement: `{{ statement }}`
7 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/prompts/context_precision_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: {{question}}
2 | Context:
3 | ```
4 | {{context}}
5 | ```
6 | Response:
7 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/classification/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.classification.classification import (
2 |     SingleLabelClassification,
3 | )
4 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/ans_relevance_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: `{{ question }}`
2 | Generated answer: `{{ answer }}`
3 | 
4 | Evaluation:
5 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/custom_metric_user.jinja2:
--------------------------------------------------------------------------------
1 | {%- for key in arguments.keys() %}{{ key }}: `{{ "{{ " + key + " }}" }}`
2 | {% endfor %}
3 | Evaluation:
4 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/prompts/context_coverage_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: `{{question}}`
2 | Context:
3 | {% for ctx in context %}
4 | `{{ ctx }}`
5 | {% endfor %}
6 | Answer: `{{answer}}`
7 | 
8 | Your response:
9 | 
--------------------------------------------------------------------------------
/continuous_eval/eval/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.eval.dataset import Dataset
2 | from continuous_eval.eval.modules import Metric, Module, Test, Tool
3 | from continuous_eval.eval.pipeline import (
4 |     CalledTools,
5 |     ModuleOutput,
6 |     Pipeline,
7 |     SingleModulePipeline,
8 | )
9 | from continuous_eval.eval.runner import EvaluationRunner
10 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/style_consistency_user.jinja2:
--------------------------------------------------------------------------------
1 | Generated answer: `{{ answer }}`
2 | {% if ground_truth_answers | length > 1 %}
3 | Reference Answer(s):
4 | {% for answer in ground_truth_answers %}
5 | - `{{ answer }}`
6 | {% endfor %}
7 | {% else %}
8 | Reference Answer: `{{ ground_truth_answers[0] }}`
9 | {% endif %}
10 | 
11 | Response:
12 | 
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | # build output
2 | dist/
3 | # generated types
4 | .astro/
5 | 
6 | # dependencies
7 | node_modules/
8 | 
9 | # logs
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | pnpm-debug.log*
14 | 
15 | 
16 | # environment variables
17 | .env
18 | .env.production
19 | 
20 | # macOS-specific files
21 | .DS_Store
22 | 
23 | # Generated by d2
24 | public/d2
--------------------------------------------------------------------------------
/continuous_eval/eval/types.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, TypedDict
2 | 
3 | UID = str
4 | 
5 | 
6 | def args_to_dict(*args, **kwargs):
7 |     arg_dict = dict(kwargs)
8 |     for index, value in enumerate(args):
9 |         arg_dict[f"_arg{index}"] = value
10 |     return arg_dict
11 | 
12 | 
13 | class ToolCall(TypedDict):
14 |     name: str
15 |     kwargs: Dict[str, Any]
16 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/ans_correctness_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: `{{ question }}`
2 | {% if ground_truth_answers | length > 1 %}
3 | Ground truth answers:
4 | {% for answer in ground_truth_answers %}
5 | - `{{ answer }}`
6 | {% endfor %}
7 | {% else %}
8 | Ground truth answer: `{{ ground_truth_answers[0] }}`
9 | {% endif %}
10 | Generated answer: `{{ answer }}`
11 | 
12 | Evaluation:
13 | 
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # For OpenAI
2 | OPENAI_API_KEY="sk-xxxx"
3 | 
4 | # For Anthropic
5 | ANTHROPIC_API_KEY="sk-ant-xxxx"
6 | 
7 | # For Gemini
8 | GEMINI_API_KEY="xxxx"
9 | 
10 | # For Cohere
11 | COHERE_API_KEY="xxxx"
12 | 
13 | # For Azure OpenAI
14 | AZURE_OPENAI_API_KEY="sk-xxxx"
15 | AZURE_OPENAI_API_VERSION="2023-03-15-preview"
16 | AZURE_ENDPOINT="https://xxx.openai.azure.com/"
17 | AZURE_DEPLOYMENT="gpt-35-turbo-16k"
18 | 
19 | # Set default LLM
20 | EVAL_LLM="gpt-3.5-turbo-0125"
21 | 
--------------------------------------------------------------------------------
/docs/src/content/config.ts:
--------------------------------------------------------------------------------
1 | import { defineCollection } from 'astro:content';
2 | import { docsSchema, i18nSchema } from '@astrojs/starlight/schema';
3 | 
4 | export const collections = {
5 |   docs: defineCollection({ schema: docsSchema() }),
6 |   i18n: defineCollection({ type: 'data', schema: i18nSchema() }),
7 | };
8 | 
9 | export const base = '/';
10 | 
11 | export const versions = [
12 |   ['v0.3', 'v0.3.x (canary)'],
13 |   ['v0.2', 'v0.2.x (latest)'],
14 | ];
15 | 
16 | export const defaultVersion = 'v0.3';
17 | 
--------------------------------------------------------------------------------
/docs/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "docs",
3 |   "type": "module",
4 |   "version": "0.0.1",
5 |   "scripts": {
6 |     "dev": "astro dev",
7 |     "start": "astro dev",
8 |     "build": "astro check && astro build",
9 |     "preview": "astro preview",
10 |     "astro": "astro"
11 |   },
12 |   "dependencies": {
13 |     "@astrojs/check": "^0.9.4",
14 |     "@astrojs/starlight": "^0.30.3",
15 |     "astro": "^5.0.2",
16 |     "rehype-mathjax": "^6.0.0",
17 |     "remark-math": "^6.0.0",
18 |     "sharp": "^0.32.5",
19 |     "typescript": "^5.7.2"
20 |   },
21 |   "engines": {
22 |     "node": ">=18.0.0"
23 |   }
24 | }
25 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.generation.text.deterministic import (
2 |     DeterministicAnswerCorrectness,
3 |     DeterministicFaithfulness,
4 |     FleschKincaidReadability,
5 | )
6 | 
7 | try:
8 |     from continuous_eval.metrics.generation.text.semantic import (
9 |         BertAnswerRelevance,
10 |         BertAnswerSimilarity,
11 |         DebertaAnswerScores,
12 |     )
13 | except ImportError:
14 |     pass
15 | from continuous_eval.metrics.generation.text.llm_based import (
16 |     AnswerCorrectness,
17 |     AnswerRelevance,
18 |     Faithfulness,
19 |     StyleConsistency,
20 | )
21 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/custom_metric_sys_probabilistic.jinja2:
--------------------------------------------------------------------------------
1 | You are an expert evaluator system for a somewhat intelligent system.
2 | You need to evaluate the following criteria:
3 | {{ criteria }}
4 | 
5 | -- GUIDELINES --
6 | When evaluating the answer, strictly adhere to the following guidelines:
7 | {{ rubric }}
8 | -- END OF GUIDELINES --
9 | 
10 | {% if examples %}
11 | -- EXAMPLES --
12 | {%- for example in examples %}
13 | INPUT:
14 | {%- for key, value in example.input.items() %}
15 | {{ key }}: `{{ value }}`
16 | {%- endfor %}
17 | 
18 | EVALUATION:
19 | {{ example.output | tojson }}
20 | --
21 | {%- endfor %} END OF EXAMPLES --
22 | {% endif %}
23 | 
--------------------------------------------------------------------------------
/tests/data/invalid.jsonl:
--------------------------------------------------------------------------------
1 | {"question": "who is playing halftime show super bowl 2018"}
2 | {"question": "who said beware of the ides of march"}
3 | {"question": "what kind of beast is the beast from beauty and the beast"}
4 | {"question": "when does the second half of vikings season 5 air"}
5 | {"question": "who's the guy in call me maybe"}
6 | {"question": "who sang what are we doing in love"}
7 | {"question": "who signed the largest on the declaration of independence"}
8 | {"question": "who has won the 2017 mens singles mutua madrid open tennis"}
9 | {"question": "original cast of natasha pierre and the great comet of 1812"}
10 | {"question": "when was where have all the flowers gone written"}
11 | 
--------------------------------------------------------------------------------
/examples/single_metric.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.retrieval import PrecisionRecallF1
2 | 
3 | # A dataset is just a list of dictionaries containing the relevant information
4 | datum = {
5 |     "question": "What is the capital of France?",
6 |     "retrieved_context": [
7 |         "Paris is the capital of France and its largest city.",
8 |         "Lyon is a major city in France.",
9 |     ],
10 |     "ground_truth_context": ["Paris is the capital of France."],
11 |     "answer": "Paris",
12 |     "ground_truths": ["Paris"],
13 | }
14 | 
15 | # Let's initialize the metric
16 | metric = PrecisionRecallF1()
17 | 
18 | # Let's calculate the metric for the first datum
19 | print(metric(**datum))
20 | 
--------------------------------------------------------------------------------
/docs/public/favicon.svg:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/sql/prompts/sql_correctness_user.jinja2:
--------------------------------------------------------------------------------
1 | Carefully read the following question that describes what the SQL query should do:
2 | 
3 | {{question}}
4 | 
5 | 
6 | Now, examine the generated SQL query:
7 | 
8 | {{answer}}
9 | 
10 | 
11 | Compare it to the correct SQL query:
12 | {%- set ground_truth_answers = [ground_truth_answers] if ground_truth_answers is string else ground_truth_answers -%}
13 | {% for gt in ground_truth_answers %}
14 | 
15 | {{ gt }}
16 | 
17 | {%- endfor %}
18 | 
19 | {% if schema %}
20 | The schema of the database is as follows:
21 | 
22 | {{ schema }}
23 | 
24 | {% endif %}
25 | 
26 | Your evaluation:
27 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.retrieval.llm_based import (
2 |     ContextCoverage,
3 |     ContextPrecision,
4 | )
5 | from continuous_eval.metrics.retrieval.matching_strategy import (
6 |     ExactChunkMatch,
7 |     ExactSentenceMatch,
8 |     RougeChunkMatch,
9 |     RougeSentenceMatch,
10 | )
11 | from continuous_eval.metrics.retrieval.precision_recall_f1 import PrecisionRecallF1
12 | from continuous_eval.metrics.retrieval.ranked import RankedRetrievalMetrics
13 | from continuous_eval.metrics.retrieval.tokens import TokenCount
14 | 
15 | from nltk import download as nltk_download
16 | 
17 | nltk_download("punkt", quiet=True)
18 | nltk_download("punkt_tab", quiet=True)
19 | nltk_download("stopwords", quiet=True)
20 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/prompts/context_precision_sys.jinja2:
--------------------------------------------------------------------------------
1 | Given the following question and context, verify if the information in the given context is useful in answering the question.
2 | Respond with either Yes or No, followed by reasoning.
3 | {% if use_few_shot %}
4 | 
5 | --EXAMPLES--
6 | Example 1
7 | Question: What is the capital of France?
8 | Context: Paris is the largest city and the capital of France. It has many historical monuments.
9 | Response: Yes
10 | Reasoning: The context states that Paris is the capital of France.
11 | 
12 | Example 2:
13 | Question: What is the capital of France?
14 | Context: Lyon is a major city in France. It is known for its culinary arts.
15 | Response: No
16 | Reasoning: The context does not mention any city that is the capital of France.
17 | {% endif %}
18 | Now evaluate the following:
19 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/custom_metric_sys.jinja2:
--------------------------------------------------------------------------------
1 | You are an expert evaluator system for a somewhat intelligent system.
2 | You need to evaluate the following criteria:
3 | {{ criteria }}
4 | 
5 | -- GUIDELINES --
6 | When evaluating the answer, strictly adhere to the following guidelines:
7 | {{ rubric }}
8 | -- END OF GUIDELINES --
9 | 
10 | -- RESPONSE FORMAT --
11 | Output your evaluation in JSON format following this schema:
12 | {
13 | {% for key, value in response_format.items() -%}
14 | "{{ key }}": ({{ value.type }}) {{ value.description }},
15 | {% endfor -%}
16 | }
17 | -- END OF RESPONSE FORMAT --
18 | {% if examples %}
19 | -- EXAMPLES --
20 | {%- for example in examples %}
21 | INPUT:
22 | {%- for key, value in example.input.items() %}
23 | {{ key }}: `{{ value }}`
24 | {%- endfor %}
25 | 
26 | EVALUATION:
27 | {{ example.output | tojson }}
28 | --
29 | {%- endfor %} END OF EXAMPLES --
30 | {% endif %}
31 | 
--------------------------------------------------------------------------------
/docs/src/content/docs/metrics/Generation/LLM-Based/faithfulness.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: LLM-based Faithfulness
3 | ---
4 | 
5 | ### Definition
6 | 
7 | **LLM-based Faithfulness** measures how grounded the generated answer is in the retrieved contexts.
8 | 
9 | ### Example Usage
10 | 
11 | Required data items: `answer` and `retrieved_context`
12 | 
13 | ```python
14 | from continuous_eval.metrics.generation.text import Faithfulness
15 | 
16 | datum = {
17 |     "question": "Who wrote 'Romeo and Juliet'?",
18 |     "retrieved_context": ["William Shakespeare is the author of 'Romeo and Juliet'."],
19 |     "answer": "Shakespeare wrote 'Romeo and Juliet'",
20 |     "ground_truth_answers": "Shakespeare",
21 | }
22 | metric = Faithfulness()
23 | print(metric(**datum))
24 | ```
25 | 
26 | ### Sample Output
27 | 
28 | ```python
29 | {
30 |     "faithfulness": 1.0,
31 |     "reasoning": "The statement directly reflects the context.",
32 | }
33 | ```
34 | 
--------------------------------------------------------------------------------
/.github/workflows/codeflash-optimize.yaml:
--------------------------------------------------------------------------------
1 | name: Codeflash
2 | 
3 | on:
4 |   pull_request:
5 |   workflow_dispatch:
6 | 
7 | jobs:
8 |   optimize:
9 |     name: Optimize new code in this PR
10 |     if: ${{ github.actor != 'codeflash-ai[bot]' }}
11 |     runs-on: ubuntu-latest
12 |     env:
13 |       CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }}
14 |       CODEFLASH_PR_NUMBER: ${{ github.event.number }}
15 |     steps:
16 |       - uses: actions/checkout@v4
17 |         with:
18 |           fetch-depth: 0
19 |       - name: Set up Python
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: '3.12'
23 |       - name: Install Project Dependencies
24 |         run: |
25 |           python -m pip install --upgrade pip
26 |           pip install poetry
27 |           poetry install --all-extras
28 |       - name: Run Codeflash to optimize code
29 |         id: optimize_code
30 |         run: |
31 |           poetry env use python
32 |           poetry run codeflash
33 | 
--------------------------------------------------------------------------------
/docs/src/content/docs/metrics/Code/Deterministic/code_string_match.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Code String Match
3 | sidebar:
4 |   order: 1
5 | ---
6 | 
7 | ### Definitions
8 | 
9 | **Code String Match** measures how close the generated code string is to the ground truth code string.
10 | 
11 | It outputs both the binary exact match score and the fuzzy match score in the range of (0.0 - 1.0).
12 | 
13 | 
14 | 
15 | ### Example Usage
16 | 
17 | Required data items: `answer`, `ground_truth_answers`
18 | 
19 | ```python
20 | from continuous_eval.metrics.code.python import CodeStringMatch
21 | 
22 | datum = {
23 |     "answer": "def function(x, y):\n    return x + y",
24 |     "ground_truth_answers": [
25 |         "def foo(x, y):\n    return x * y",
26 |         "def foo(x, y):\n    return x + y",
27 |     ],
28 | }
29 | 
30 | metric = CodeStringMatch()
31 | print(metric(**datum))
32 | ```
33 | 
34 | ### Example Output
35 | 
36 | ```JSON
37 | {
38 |     "Exact_Match_Score": 0,
39 |     "Fuzzy_Match_Score": 0.89
40 | }
41 | ```
42 | 
--------------------------------------------------------------------------------
/docs/src/content/docs/getting-started/Introduction.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Introduction
3 | description: Overview
4 | sidebar:
5 |   badge:
6 |     text: new
7 |     variant: tip
8 | ---
9 | 
10 | ## What is continuous-eval?
11 | 
12 | `continuous-eval` is an open-source package created for granular and holistic evaluation of GenAI application pipelines.
13 | 
14 | 
15 | 
16 | ## How is continuous-eval different?
17 | 
18 | - **Modularized Evaluation**: Measure each module in the pipeline with tailored metrics.
19 | 
20 | - **Comprehensive Metric Library**: Covers Retrieval-Augmented Generation (RAG), Code Generation, Agent Tool Use, Classification and a variety of other LLM use cases. Mix and match Deterministic, Semantic and LLM-based metrics.
21 | 
22 | ## Resources
23 | 
24 | - **Relari Blog:** Useful articles on how to evaluate LLM applications [link](https://www.relari.ai/blog)
25 | - **Discord:** Join our community of LLM developers [Discord](https://discord.gg/GJnM8SRsHr)
26 | - **Reach out to founders:** [Email](mailto:founders@relari.ai) or [Schedule a chat](https://cal.com/pasquale/continuous-eval)
27 | 
--------------------------------------------------------------------------------
/docs/src/components/ThemeSelect.astro:
--------------------------------------------------------------------------------
1 | ---
2 | import type { Props } from '@astrojs/starlight/props';
3 | import Default from '@astrojs/starlight/components/ThemeSelect.astro';
4 | import Select from '@astrojs/starlight/components/Select.astro';
5 | import { defaultVersion, versions } from '../content/config.ts';
6 | 
7 | const url = new URL(Astro.url);
8 | const currentVersion = /^\/v\d\.\d+/.test(url.pathname)
9 |   ? url.pathname.split('/')[1]
10 |   : defaultVersion;
11 | console.log(currentVersion);
12 | const options = versions.map(([version, label]) => ({
13 |   label,
14 |   selected: currentVersion === version,
15 |   value: `/${version}`,
16 | }));
17 | ---
18 | 
19 | 
28 | 
29 | 
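The sketch below is editorial and is not a file in the repository. It illustrates how the pieces excerpted above might fit together: the CustomMetric and Example names exported by /continuous_eval/metrics/custom/__init__.py, and the criteria, rubric, response_format, and examples variables consumed by custom_metric_sys.jinja2 and custom_metric_user.jinja2. The actual constructor lives in custom_metric.py, which is not included in this excerpt, so the keyword arguments shown here are assumptions inferred from the templates, not the library's documented API.

# Hypothetical usage sketch -- argument names are assumed from the Jinja templates
# above; custom_metric.py (the real signature) is not part of this excerpt.
from continuous_eval.metrics.custom import CustomMetric, Example

conciseness = CustomMetric(
    name="Conciseness",  # assumed keyword
    criteria="The answer should not repeat information or add irrelevant detail.",
    rubric="1 = concise; 0 = verbose or repetitive.",
    # Keys listed here are rendered by custom_metric_user.jinja2 as `key: value` pairs.
    arguments={"answer": "The generated answer to evaluate."},
    # Each entry's type/description is interpolated into the RESPONSE FORMAT block
    # of custom_metric_sys.jinja2.
    response_format={
        "reasoning": {"type": "str", "description": "One-sentence justification."},
        "score": {"type": "int", "description": "0 or 1, per the rubric."},
    },
    # Examples are rendered in the EXAMPLES block via example.input and example.output.
    examples=[
        Example(
            input={"answer": "Paris is the capital of France."},
            output={"reasoning": "Single fact, no repetition.", "score": 1},
        )
    ],
)

# Metrics in this library are invoked with keyword arguments (see examples/single_metric.py).
print(conciseness(answer="Paris, which is located in France, is the capital city of France."))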