├── tests
│   ├── __init__.py
│   ├── helpers
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── data
│   │   ├── invalid.jsonl
│   │   └── correctness_sm.jsonl
│   ├── custom_metric_test.py
│   ├── classification_metrics_test.py
│   ├── generation_semantic_test.py
│   ├── llm_test.py
│   ├── generation_metrics_test.py
│   └── retrieval_metrics_test.py
├── continuous_eval
│   ├── __init__.py
│   ├── metrics
│   │   ├── code
│   │   │   ├── __init__.py
│   │   │   ├── python
│   │   │   │   └── __init__.py
│   │   │   └── sql
│   │   │       ├── __init__.py
│   │   │       ├── prompts
│   │   │       │   ├── sql_correctness_user.jinja2
│   │   │       │   └── sql_correctness_sys.jinja2
│   │   │       └── llm.py
│   │   ├── __init__.py
│   │   ├── base
│   │   │   ├── __init__.py
│   │   │   └── llm.py
│   │   ├── custom
│   │   │   ├── __init__.py
│   │   │   ├── custom_metric_user.jinja2
│   │   │   ├── custom_metric_sys_probabilistic.jinja2
│   │   │   ├── custom_metric_sys.jinja2
│   │   │   └── custom_metric.py
│   │   ├── generation
│   │   │   └── text
│   │   │       ├── prompts
│   │   │       │   ├── faithfulness_user.jinja2
│   │   │       │   ├── ans_relevance_user.jinja2
│   │   │       │   ├── style_consistency_user.jinja2
│   │   │       │   ├── ans_correctness_user.jinja2
│   │   │       │   ├── ans_relevance_sys.jinja2
│   │   │       │   ├── faithfulness_sys.jinja2
│   │   │       │   ├── style_consistency_sys.jinja2
│   │   │       │   └── ans_correctness_sys.jinja2
│   │   │       ├── __init__.py
│   │   │       ├── utils.py
│   │   │       └── bert.py
│   │   ├── retrieval
│   │   │   ├── prompts
│   │   │   │   ├── context_precision_user.jinja2
│   │   │   │   ├── context_coverage_user.jinja2
│   │   │   │   ├── context_precision_sys.jinja2
│   │   │   │   └── context_coverage_sys.jinja2
│   │   │   ├── __init__.py
│   │   │   ├── simple_tokenizer.py
│   │   │   ├── tokens.py
│   │   │   ├── precision_recall_f1.py
│   │   │   ├── matching_strategy.py
│   │   │   └── ranked.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   └── classification.py
│   │   └── tools
│   │       └── match.py
│   ├── utils
│   │   ├── generic.py
│   │   ├── types.py
│   │   └── telemetry.py
│   ├── llms
│   │   ├── __init__.py
│   │   ├── openai.py
│   │   ├── cohere.py
│   │   ├── base.py
│   │   ├── anthropic.py
│   │   ├── bedrock.py
│   │   ├── anthropic_bedrock.py
│   │   ├── google.py
│   │   └── azure.py
│   └── eval
│       ├── __init__.py
│       ├── types.py
│       ├── tests.py
│       ├── logger.py
│       └── modules.py
├── docs
│   ├── tsconfig.json
│   ├── src
│   │   ├── env.d.ts
│   │   ├── content
│   │   │   ├── config.ts
│   │   │   └── docs
│   │   │       ├── metrics
│   │   │       │   ├── Generation
│   │   │       │   │   ├── LLM-Based
│   │   │       │   │   │   ├── faithfulness.md
│   │   │       │   │   │   ├── relevance.md
│   │   │       │   │   │   ├── correctness.md
│   │   │       │   │   │   └── style.md
│   │   │       │   │   ├── Semantic
│   │   │       │   │   │   ├── bert_answer_relevance.md
│   │   │       │   │   │   ├── bert_answer_similarity.md
│   │   │       │   │   │   └── deberta_answer_scores.md
│   │   │       │   │   └── Deterministic
│   │   │       │   │       ├── flesch_kincaid_readability.md
│   │   │       │   │       ├── correctness.md
│   │   │       │   │       └── faithfulness.md
│   │   │       │   ├── Code
│   │   │       │   │   ├── Deterministic
│   │   │       │   │   │   ├── code_string_match.md
│   │   │       │   │   │   ├── sql_syntax_match.md
│   │   │       │   │   │   ├── python_ast_similarity.md
│   │   │       │   │   │   └── sql_ast_similarity.md
│   │   │       │   │   └── Probabilistic
│   │   │       │   │       └── llm-based.md
│   │   │       │   ├── Tools
│   │   │       │   │   └── Deterministic
│   │   │       │   │       └── tool_selection.md
│   │   │       │   ├── Retrieval
│   │   │       │   │   ├── LLM-Based
│   │   │       │   │   │   ├── context_precision.md
│   │   │       │   │   │   └── context_coverage.md
│   │   │       │   │   └── Deterministic
│   │   │       │   │       ├── token_count.md
│   │   │       │   │       ├── rank_aware_metrics.md
│   │   │       │   │       └── precision_recall.md
│   │   │       │   ├── Classification
│   │   │       │   │   └── Deterministic
│   │   │       │   │       └── classification_metrics.md
│   │   │       │   ├── base.md
│   │   │       │   └── overview.md
│   │   │       ├── getting-started
│   │   │       │   ├── Introduction.md
│   │   │       │   ├── installation.md
│   │   │       │   └── quickstart.md
│   │   │       ├── pipeline
│   │   │       │   ├── eval_runner.mdx
│   │   │       │   ├── pipeline_logger.mdx
│   │   │       │   ├── llms.mdx
│   │   │       │   ├── pipeline.md
│   │   │       │   └── metrics_and_tests.md
│   │   │       └── index.mdx
│   │   ├── components
│   │   │   └── ThemeSelect.astro
│   │   └── styles
│   │       └── custom.css
│   ├── public
│   │   ├── module-level-eval.png
│   │   ├── continuous-eval-logo.png
│   │   ├── synthetic-data-demo.png
│   │   └── favicon.svg
│   ├── .gitignore
│   ├── package.json
│   ├── README.md
│   └── astro.config.mjs
├── .env.example
├── examples
│   ├── single_metric.py
│   ├── evaluation_on_dataset.py
│   └── modular_evaluation.py
├── .github
│   └── workflows
│       └── codeflash-optimize.yaml
├── .pre-commit-config.yaml
├── pyproject.toml
├── CODE_OF_CONDUCT.md
└── .gitignore

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/continuous_eval/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": "astro/tsconfigs/strict"
3 | }
4 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.base import Metric
2 | 
--------------------------------------------------------------------------------
/docs/src/env.d.ts:
--------------------------------------------------------------------------------
1 | ///
2 | ///
3 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .metric import Arg, Field, Metric
2 | from .prompt import MetricPrompt
3 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/__init__.py:
--------------------------------------------------------------------------------
1 | from .custom_metric import ProbabilisticCustomMetric, CustomMetric, Example
2 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/python/__init__.py:
--------------------------------------------------------------------------------
1 | from .code_deterministic_metrics import CodeStringMatch, PythonASTSimilarity
2 | 
--------------------------------------------------------------------------------
/docs/public/module-level-eval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/relari-ai/continuous-eval/HEAD/docs/public/module-level-eval.png
--------------------------------------------------------------------------------
/continuous_eval/utils/generic.py:
--------------------------------------------------------------------------------
1 | def all_sets_equal(d: dict) -> bool:
2 |     return len(set(map(frozenset, d.values()))) == 1
3 | 
--------------------------------------------------------------------------------
/docs/public/continuous-eval-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/relari-ai/continuous-eval/HEAD/docs/public/continuous-eval-logo.png
--------------------------------------------------------------------------------
/docs/public/synthetic-data-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/relari-ai/continuous-eval/HEAD/docs/public/synthetic-data-demo.png
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/sql/__init__.py:
--------------------------------------------------------------------------------
1 | from .deterministic import SQLASTSimilarity, SQLSyntaxMatch
2 | from .llm import SQLCorrectness
3 | 
--------------------------------------------------------------------------------
/continuous_eval/llms/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import LLMFactory
2 | from .openai import OpenAI
3 | 
4 | LLMFactory.register_provider("openai", OpenAI)
5 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/faithfulness_user.jinja2:
--------------------------------------------------------------------------------
1 | Context:
2 | ```
3 | {{ context }}
4 | ```
5 | 
6 | Statement: `{{ statement }}`
7 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/prompts/context_precision_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: {{question}}
2 | Context:
3 | ```
4 | {{context}}
5 | ```
6 | Response:
7 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/classification/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.classification.classification import (
2 |     SingleLabelClassification,
3 | )
4 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/ans_relevance_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: `{{ question }}`
2 | Generated answer: `{{ answer }}`
3 | 
4 | Evaluation:
5 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/custom_metric_user.jinja2:
--------------------------------------------------------------------------------
1 | {%- for key in arguments.keys() %}{{ key }}: `{{ "{{ " + key + " }}" }}`
2 | {% endfor %}
3 | Evaluation:
4 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/prompts/context_coverage_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: `{{question}}`
2 | Context:
3 | {% for ctx in context %}
4 | `{{ ctx }}`
5 | {% endfor %}
6 | Answer: `{{answer}}`
7 | 
8 | Your response:
9 | 
--------------------------------------------------------------------------------
/continuous_eval/eval/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.eval.dataset import Dataset
2 | from continuous_eval.eval.modules import Metric, Module, Test, Tool
3 | from continuous_eval.eval.pipeline import (
4 |     CalledTools,
5 |     ModuleOutput,
6 |     Pipeline,
7 |     SingleModulePipeline,
8 | )
9 | from continuous_eval.eval.runner import EvaluationRunner
10 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/style_consistency_user.jinja2:
--------------------------------------------------------------------------------
1 | Generated answer: `{{ answer }}`
2 | {% if ground_truth_answers | length > 1 %}
3 | Reference Answer(s):
4 | {% for answer in ground_truth_answers %}
5 | - `{{ answer }}`
6 | {% endfor %}
7 | {% else %}
8 | Reference Answer: `{{ ground_truth_answers[0] }}`
9 | {% endif %}
10 | 
11 | Response:
12 | 
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | # build output
2 | dist/
3 | # generated types
4 | .astro/
5 | 
6 | # dependencies
7 | node_modules/
8 | 
9 | # logs
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | pnpm-debug.log*
14 | 
15 | 
16 | # environment variables
17 | .env
18 | .env.production
19 | 
20 | # macOS-specific files
21 | .DS_Store
22 | 
23 | # Generated by d2
24 | public/d2
--------------------------------------------------------------------------------
/continuous_eval/eval/types.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, TypedDict
2 | 
3 | UID = str
4 | 
5 | 
6 | def args_to_dict(*args, **kwargs):
7 |     arg_dict = dict(kwargs)
8 |     for index, value in enumerate(args):
9 |         arg_dict[f"_arg{index}"] = value
10 |     return arg_dict
11 | 
12 | 
13 | class ToolCall(TypedDict):
14 |     name: str
15 |     kwargs: Dict[str, Any]
16 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/prompts/ans_correctness_user.jinja2:
--------------------------------------------------------------------------------
1 | Question: `{{ question }}`
2 | {% if ground_truth_answers | length > 1 %}
3 | Ground truth answers:
4 | {% for answer in ground_truth_answers %}
5 | - `{{ answer }}`
6 | {% endfor %}
7 | {% else %}
8 | Ground truth answer: `{{ ground_truth_answers[0] }}`
9 | {% endif %}
10 | Generated answer: `{{ answer }}`
11 | 
12 | Evaluation:
13 | 
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # For OpenAI
2 | OPENAI_API_KEY="sk-xxxx"
3 | 
4 | # For Anthropic
5 | ANTHROPIC_API_KEY="sk-ant-xxxx"
6 | 
7 | # For Gemini
8 | GEMINI_API_KEY="xxxx"
9 | 
10 | # For Cohere
11 | COHERE_API_KEY="xxxx"
12 | 
13 | # For Azure OpenAI
14 | AZURE_OPENAI_API_KEY="sk-xxxx"
15 | AZURE_OPENAI_API_VERSION="2023-03-15-preview"
16 | AZURE_ENDPOINT="https://xxx.openai.azure.com/"
17 | AZURE_DEPLOYMENT="gpt-35-turbo-16k"
18 | 
19 | # Set default LLM
20 | EVAL_LLM="gpt-3.5-turbo-0125"
21 | 
--------------------------------------------------------------------------------
/docs/src/content/config.ts:
--------------------------------------------------------------------------------
1 | import { defineCollection } from 'astro:content';
2 | import { docsSchema, i18nSchema } from '@astrojs/starlight/schema';
3 | 
4 | export const collections = {
5 |   docs: defineCollection({ schema: docsSchema() }),
6 |   i18n: defineCollection({ type: 'data', schema: i18nSchema() }),
7 | };
8 | 
9 | export const base = '/';
10 | 
11 | export const versions = [
12 |   ['v0.3', 'v0.3.x (canary)'],
13 |   ['v0.2', 'v0.2.x (latest)'],
14 | ];
15 | 
16 | export const defaultVersion = 'v0.3';
17 | 
--------------------------------------------------------------------------------
/docs/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "docs",
3 |   "type": "module",
4 |   "version": "0.0.1",
5 |   "scripts": {
6 |     "dev": "astro dev",
7 |     "start": "astro dev",
8 |     "build": "astro check && astro build",
9 |     "preview": "astro preview",
10 |     "astro": "astro"
11 |   },
12 |   "dependencies": {
13 |     "@astrojs/check": "^0.9.4",
14 |     "@astrojs/starlight": "^0.30.3",
15 |     "astro": "^5.0.2",
16 |     "rehype-mathjax": "^6.0.0",
17 |     "remark-math": "^6.0.0",
18 |     "sharp": "^0.32.5",
19 |     "typescript": "^5.7.2"
20 |   },
21 |   "engines": {
22 |     "node": ">=18.0.0"
23 |   }
24 | }
25 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/generation/text/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.generation.text.deterministic import (
2 |     DeterministicAnswerCorrectness,
3 |     DeterministicFaithfulness,
4 |     FleschKincaidReadability,
5 | )
6 | 
7 | try:
8 |     from continuous_eval.metrics.generation.text.semantic import (
9 |         BertAnswerRelevance,
10 |         BertAnswerSimilarity,
11 |         DebertaAnswerScores,
12 |     )
13 | except ImportError:
14 |     pass
15 | from continuous_eval.metrics.generation.text.llm_based import (
16 |     AnswerCorrectness,
17 |     AnswerRelevance,
18 |     Faithfulness,
19 |     StyleConsistency,
20 | )
21 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/custom_metric_sys_probabilistic.jinja2:
--------------------------------------------------------------------------------
1 | You are an expert evaluator system for a somewhat intelligent system.
2 | You need to evaluate the following criteria:
3 | {{ criteria }}
4 | 
5 | -- GUIDELINES --
6 | When evaluating the answer, strictly adhere to the following guidelines:
7 | {{ rubric }}
8 | -- END OF GUIDELINES --
9 | 
10 | {% if examples %}
11 | -- EXAMPLES --
12 | {%- for example in examples %}
13 | INPUT:
14 | {%- for key, value in example.input.items() %}
15 | {{ key }}: `{{ value }}`
16 | {%- endfor %}
17 | 
18 | EVALUATION:
19 | {{ example.output | tojson }}
20 | --
21 | {%- endfor %} END OF EXAMPLES --
22 | {% endif %}
23 | 
--------------------------------------------------------------------------------
/tests/data/invalid.jsonl:
--------------------------------------------------------------------------------
1 | {"question": "who is playing halftime show super bowl 2018"}
2 | {"question": "who said beware of the ides of march"}
3 | {"question": "what kind of beast is the beast from beauty and the beast"}
4 | {"question": "when does the second half of vikings season 5 air"}
5 | {"question": "who's the guy in call me maybe"}
6 | {"question": "who sang what are we doing in love"}
7 | {"question": "who signed the largest on the declaration of independence"}
8 | {"question": "who has won the 2017 mens singles mutua madrid open tennis"}
9 | {"question": "original cast of natasha pierre and the great comet of 1812"}
10 | {"question": "when was where have all the flowers gone written"}
11 | 
--------------------------------------------------------------------------------
/examples/single_metric.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.retrieval import PrecisionRecallF1
2 | 
3 | # A dataset is just a list of dictionaries containing the relevant information
4 | datum = {
5 |     "question": "What is the capital of France?",
6 |     "retrieved_context": [
7 |         "Paris is the capital of France and its largest city.",
8 |         "Lyon is a major city in France.",
9 |     ],
10 |     "ground_truth_context": ["Paris is the capital of France."],
11 |     "answer": "Paris",
12 |     "ground_truths": ["Paris"],
13 | }
14 | 
15 | # Let's initialize the metric
16 | metric = PrecisionRecallF1()
17 | 
18 | # Let's calculate the metric for the first datum
19 | print(metric(**datum))
20 | 
--------------------------------------------------------------------------------
/docs/public/favicon.svg:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/code/sql/prompts/sql_correctness_user.jinja2:
--------------------------------------------------------------------------------
1 | Carefully read the following question that describes what the SQL query should do:
2 | 
3 | {{question}}
4 | 
5 | 
6 | Now, examine the generated SQL query:
7 | 
8 | {{answer}}
9 | 
10 | 
11 | Compare it to the correct SQL query:
12 | {%- set ground_truth_answers = [ground_truth_answers] if ground_truth_answers is string else ground_truth_answers -%}
13 | {% for gt in ground_truth_answers %}
14 | 
15 | {{ gt }}
16 | 
17 | {%- endfor %}
18 | 
19 | {% if schema %}
20 | The schema of the database is as follows:
21 | 
22 | {{ schema }}
23 | 
24 | {% endif %}
25 | 
26 | Your evaluation:
27 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/__init__.py:
--------------------------------------------------------------------------------
1 | from continuous_eval.metrics.retrieval.llm_based import (
2 |     ContextCoverage,
3 |     ContextPrecision,
4 | )
5 | from continuous_eval.metrics.retrieval.matching_strategy import (
6 |     ExactChunkMatch,
7 |     ExactSentenceMatch,
8 |     RougeChunkMatch,
9 |     RougeSentenceMatch,
10 | )
11 | from continuous_eval.metrics.retrieval.precision_recall_f1 import PrecisionRecallF1
12 | from continuous_eval.metrics.retrieval.ranked import RankedRetrievalMetrics
13 | from continuous_eval.metrics.retrieval.tokens import TokenCount
14 | 
15 | from nltk import download as nltk_download
16 | 
17 | nltk_download("punkt", quiet=True)
18 | nltk_download("punkt_tab", quiet=True)
19 | nltk_download("stopwords", quiet=True)
20 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/retrieval/prompts/context_precision_sys.jinja2:
--------------------------------------------------------------------------------
1 | Given the following question and context, verify if the information in the given context is useful in answering the question.
2 | Respond with either Yes or No, followed by reasoning.
3 | {% if use_few_shot %}
4 | 
5 | --EXAMPLES--
6 | Example 1
7 | Question: What is the capital of France?
8 | Context: Paris is the largest city and the capital of France. It has many historical monuments.
9 | Response: Yes
10 | Reasoning: The context states that Paris is the capital of France.
11 | 
12 | Example 2:
13 | Question: What is the capital of France?
14 | Context: Lyon is a major city in France. It is known for its culinary arts.
15 | Response: No
16 | Reasoning: The context does not mention any city that is the capital of France.
17 | {% endif %}
18 | Now evaluate the following:
19 | 
--------------------------------------------------------------------------------
/continuous_eval/metrics/custom/custom_metric_sys.jinja2:
--------------------------------------------------------------------------------
1 | You are an expert evaluator system for a somewhat intelligent system.
2 | You need to evaluate the following criteria:
3 | {{ criteria }}
4 | 
5 | -- GUIDELINES --
6 | When evaluating the answer, strictly adhere to the following guidelines:
7 | {{ rubric }}
8 | -- END OF GUIDELINES --
9 | 
10 | -- RESPONSE FORMAT --
11 | Output your evaluation in JSON format following this schema:
12 | {
13 | {% for key, value in response_format.items() -%}
14 | "{{ key }}": ({{ value.type }}) {{ value.description }},
15 | {% endfor -%}
16 | }
17 | -- END OF RESPONSE FORMAT --
18 | {% if examples %}
19 | -- EXAMPLES --
20 | {%- for example in examples %}
21 | INPUT:
22 | {%- for key, value in example.input.items() %}
23 | {{ key }}: `{{ value }}`
24 | {%- endfor %}
25 | 
26 | EVALUATION:
27 | {{ example.output | tojson }}
28 | --
29 | {%- endfor %} END OF EXAMPLES --
30 | {% endif %}
31 | 
--------------------------------------------------------------------------------
/docs/src/content/docs/metrics/Generation/LLM-Based/faithfulness.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: LLM-based Faithfulness
3 | ---
4 | 
5 | ### Definition
6 | 
7 | **LLM-based Faithfulness** measures how grounded the generated answer is in the retrieved contexts.
8 | 
9 | ### Example Usage
10 | 
11 | Required data items: `answer` and `retrieved_context`
12 | 
13 | ```python
14 | from continuous_eval.metrics.generation.text import Faithfulness
15 | 
16 | datum = {
17 |     "question": "Who wrote 'Romeo and Juliet'?",
18 |     "retrieved_context": ["William Shakespeare is the author of 'Romeo and Juliet'."],
19 |     "answer": "Shakespeare wrote 'Romeo and Juliet'",
20 |     "ground_truth_answers": "Shakespeare",
21 | }
22 | metric = Faithfulness()
23 | print(metric(**datum))
24 | ```
25 | 
26 | ### Sample Output
27 | 
28 | ```python
29 | {
30 |     "faithfulness": 1.0,
31 |     "reasoning": "The statement directly reflects the context.",
32 | }
33 | ```
34 | 
--------------------------------------------------------------------------------
/.github/workflows/codeflash-optimize.yaml:
--------------------------------------------------------------------------------
1 | name: Codeflash
2 | 
3 | on:
4 |   pull_request:
5 |   workflow_dispatch:
6 | 
7 | jobs:
8 |   optimize:
9 |     name: Optimize new code in this PR
10 |     if: ${{ github.actor != 'codeflash-ai[bot]' }}
11 |     runs-on: ubuntu-latest
12 |     env:
13 |       CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }}
14 |       CODEFLASH_PR_NUMBER: ${{ github.event.number }}
15 |     steps:
16 |       - uses: actions/checkout@v4
17 |         with:
18 |           fetch-depth: 0
19 |       - name: Set up Python
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: '3.12'
23 |       - name: Install Project Dependencies
24 |         run: |
25 |           python -m pip install --upgrade pip
26 |           pip install poetry
27 |           poetry install --all-extras
28 |       - name: Run Codeflash to optimize code
29 |         id: optimize_code
30 |         run: |
31 |           poetry env use python
32 |           poetry run codeflash
33 | 
--------------------------------------------------------------------------------
/docs/src/content/docs/metrics/Code/Deterministic/code_string_match.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Code String Match
3 | sidebar:
4 |   order: 1
5 | ---
6 | 
7 | ### Definitions
8 | 
9 | **Code String Match** measures how close the generated code string is to the ground truth code string.
10 | 
11 | It outputs both the binary exact match score and the fuzzy match score in the range of (0.0 - 1.0).
12 | 
13 | 
14 | 
15 | ### Example Usage
16 | 
17 | Required data items: `answer`, `ground_truth_answers`
18 | 
19 | ```python
20 | from continuous_eval.metrics.code.python import CodeStringMatch
21 | 
22 | datum = {
23 |     "answer": "def function(x, y):\n    return x + y",
24 |     "ground_truth_answers": [
25 |         "def foo(x, y):\n    return x * y",
26 |         "def foo(x, y):\n    return x + y",
27 |     ],
28 | }
29 | 
30 | metric = CodeStringMatch()
31 | print(metric(**datum))
32 | ```
33 | 
34 | ### Example Output
35 | 
36 | ```JSON
37 | {
38 |     "Exact_Match_Score": 0,
39 |     "Fuzzy_Match_Score": 0.89
40 | }
41 | ```
42 | 
--------------------------------------------------------------------------------
/docs/src/content/docs/getting-started/Introduction.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Introduction
3 | description: Overview
4 | sidebar:
5 |   badge:
6 |     text: new
7 |     variant: tip
8 | ---
9 | 
10 | ## What is continuous-eval?
11 | 
12 | `continuous-eval` is an open-source package created for granular and holistic evaluation of GenAI application pipelines.
13 | 
14 | 
15 | 
16 | ## How is continuous-eval different?
17 | 
18 | - **Modularized Evaluation**: Measure each module in the pipeline with tailored metrics.
19 | 
20 | - **Comprehensive Metric Library**: Covers Retrieval-Augmented Generation (RAG), Code Generation, Agent Tool Use, Classification and a variety of other LLM use cases. Mix and match Deterministic, Semantic and LLM-based metrics.
21 | 
22 | ## Resources
23 | 
24 | - **Relari Blog:** Useful articles on how to evaluate LLM applications [link](https://www.relari.ai/blog)
25 | - **Discord:** Join our community of LLM developers [Discord](https://discord.gg/GJnM8SRsHr)
26 | - **Reach out to founders:** [Email](mailto:founders@relari.ai) or [Schedule a chat](https://cal.com/pasquale/continuous-eval)
27 | 
--------------------------------------------------------------------------------
/docs/src/components/ThemeSelect.astro:
--------------------------------------------------------------------------------
1 | ---
2 | import type { Props } from '@astrojs/starlight/props';
3 | import Default from '@astrojs/starlight/components/ThemeSelect.astro';
4 | import Select from '@astrojs/starlight/components/Select.astro';
5 | import { defaultVersion, versions } from '../content/config.ts';
6 | 
7 | const url = new URL(Astro.url);
8 | const currentVersion = /^\/v\d\.\d+/.test(url.pathname)
9 |   ? url.pathname.split('/')[1]
10 |   : defaultVersion;
11 | console.log(currentVersion);
12 | const options = versions.map(([version, label]) => ({
13 |   label,
14 |   selected: currentVersion === version,
15 |   value: `/${version}`,
16 | }));
17 | ---
18 | 
19 | 
28 | 
29 | 
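The sketch below is editorial and is not a file in the repository. It illustrates how the pieces excerpted above might fit together: the CustomMetric and Example names exported by /continuous_eval/metrics/custom/__init__.py, and the criteria, rubric, response_format, and examples variables consumed by custom_metric_sys.jinja2 and custom_metric_user.jinja2. The actual constructor lives in custom_metric.py, which is not included in this excerpt, so the keyword arguments shown here are assumptions inferred from the templates, not the library's documented API.

# Hypothetical usage sketch -- argument names are assumed from the Jinja templates
# above; custom_metric.py (the real signature) is not part of this excerpt.
from continuous_eval.metrics.custom import CustomMetric, Example

conciseness = CustomMetric(
    name="Conciseness",  # assumed keyword
    criteria="The answer should not repeat information or add irrelevant detail.",
    rubric="1 = concise; 0 = verbose or repetitive.",
    # Keys listed here are rendered by custom_metric_user.jinja2 as `key: value` pairs.
    arguments={"answer": "The generated answer to evaluate."},
    # Each entry's type/description is interpolated into the RESPONSE FORMAT block
    # of custom_metric_sys.jinja2.
    response_format={
        "reasoning": {"type": "str", "description": "One-sentence justification."},
        "score": {"type": "int", "description": "0 or 1, per the rubric."},
    },
    # Examples are rendered in the EXAMPLES block via example.input and example.output.
    examples=[
        Example(
            input={"answer": "Paris is the capital of France."},
            output={"reasoning": "Single fact, no repetition.", "score": 1},
        )
    ],
)

# Metrics in this library are invoked with keyword arguments (see examples/single_metric.py).
print(conciseness(answer="Paris, which is located in France, is the capital city of France."))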