├── data ├── Readme.md ├── SuperBEIR │ └── SuperBEIR-categories-with-rationales.json ├── WikiQuestions-2.0.json ├── WikiQuestions-builder.py ├── WikiQuestions.json ├── duplicate-answer.py ├── wiki-abstract-titles.json └── wiki-answerable-questions.json ├── docs ├── 1_run_test.md ├── 2_result_visualization.md └── task-zoo.md ├── notebooks ├── OPRO-Compiled-JSON-Mode.ipynb └── StructuredRAG_Experimental_Visualization.ipynb ├── poetry.lock ├── pyproject.toml ├── readme.md ├── related-works.md ├── setup.py ├── structured_rag ├── mock_gfl │ ├── __init__.py │ ├── dspy_program.py │ ├── dspy_signatures.py │ ├── fstring_program.py │ ├── fstring_prompts.py │ └── modal_vllm_outlines │ │ ├── download_llama.py │ │ ├── modal_web_server.py │ │ ├── quick_setup_test.py │ │ ├── readme.md │ │ ├── setup.sh │ │ └── vllm_outlines_setup.py ├── models.py ├── readme.md └── run_test │ ├── __init__.py │ ├── readme.md │ ├── result_visualization │ ├── aggregate_result_jsons.py │ ├── boxplot_success_rates_per_model.png │ ├── boxplot_success_rates_per_task.png │ ├── compute_averages.py │ ├── dspy_error_analysis.py │ ├── new_aggregate_result_jsons.py │ ├── success_rate_heatmap.png │ ├── success_rates.png │ ├── success_rates_per_test.png │ ├── visualize.py │ └── visualize_single_result.py │ ├── results │ ├── 10-6-24 │ │ ├── AssessAnswerability-gpt-4o-2024-08-06-dspy.json │ │ ├── AssessAnswerability-gpt-4o-2024-08-06-dspy_NO_OPRO_JSON.json │ │ ├── AssessAnswerability-gpt-4o-2024-08-06-dspy_WITH_OPRO_JSON.json │ │ ├── AssessAnswerability-gpt-4o-2024-08-06-fstring.json │ │ ├── AssessAnswerability-gpt-4o-2024-08-06-fstring_with_structured_outputs.json │ │ └── AssessAnswerability-gpt-4o-2024-08-06-fstring_without_structured_outputs.json │ ├── 9-26-24 │ │ ├── AssessAnswerability-llama3.2:3b-instruct-fp16-dspy.json │ │ └── AssessAnswerability-llama3.2:3b-instruct-fp16-fstring.json │ ├── Gemini-1.5-Pro-9-11-24 │ │ ├── AssessAnswerability-gemini-1.5-pro-dspy.json │ │ ├── AssessAnswerability-gemini-1.5-pro-fstring.json │ │ ├── GenerateAnswer-gemini-1.5-pro-dspy.json │ │ ├── GenerateAnswer-gemini-1.5-pro-fstring.json │ │ ├── GenerateAnswerWithConfidence-gemini-1.5-pro-dspy.json │ │ ├── GenerateAnswerWithConfidence-gemini-1.5-pro-fstring.json │ │ ├── GenerateAnswersWithConfidence-gemini-1.5-pro-dspy.json │ │ ├── GenerateAnswersWithConfidence-gemini-1.5-pro-fstring.json │ │ ├── ParaphraseQuestions-gemini-1.5-pro-dspy.json │ │ ├── ParaphraseQuestions-gemini-1.5-pro-fstring.json │ │ ├── RAGAS-gemini-1.5-pro-dspy.json │ │ ├── RAGAS-gemini-1.5-pro-fstring.json │ │ ├── RateContext-gemini-1.5-pro-dspy.json │ │ └── RateContext-gemini-1.5-pro-fstring.json │ ├── batch-9-13-24 │ │ ├── AssessAnswerability-BATCH-llama3-8b-instruct-Modal.json │ │ ├── GenerateAnswer-BATCH-llama3-8b-instruct-Modal.json │ │ ├── GenerateAnswerWithConfidence-BATCH-llama3-8b-instruct-Modal.json │ │ ├── GenerateAnswersWithConfidence-BATCH-llama3-8b-instruct-Modal.json │ │ ├── ParaphraseQuestions-BATCH-llama3-8b-instruct-Modal.json │ │ ├── RAGAS-BATCH-llama3-8b-instruct-Modal.json │ │ └── RateContext-BATCH-llama3-8b-instruct-Modal.json │ ├── experimental-results-8-26-24 │ │ ├── aggregated_results-8-26-24.json │ │ ├── model_comparison.png │ │ ├── model_comparison_results-trial-1.png │ │ ├── model_comparison_results-trial-2.png │ │ ├── trial-1 │ │ │ ├── AssessAnswerability-gemini-1.5-pro.json │ │ │ ├── AssessAnswerability-llama3:instruct.json │ │ │ ├── GenerateAnswer-gemini-1.5-pro.json │ │ │ ├── GenerateAnswer-llama3:instruct.json │ │ │ ├── 
GenerateAnswerWithConfidence-gemini-1.5-pro.json │ │ │ ├── GenerateAnswerWithConfidence-llama3:instruct.json │ │ │ ├── GenerateAnswersWithConfidence-gemini-1.5-pro.json │ │ │ ├── GenerateAnswersWithConfidence-llama3:instruct.json │ │ │ ├── ParaphraseQuestions-gemini-1.5-pro.json │ │ │ ├── ParaphraseQuestions-llama3:instruct.json │ │ │ ├── RateContext-gemini-1.5-pro.json │ │ │ └── RateContext-llama3:instruct.json │ │ └── trial-2 │ │ │ ├── AssessAnswerability-gemini-1.5-pro.json │ │ │ ├── AssessAnswerability-llama3:instruct.json │ │ │ ├── GenerateAnswer-gemini-1.5-pro.json │ │ │ ├── GenerateAnswer-llama3:instruct.json │ │ │ ├── GenerateAnswerWithConfidence-gemini-1.5-pro.json │ │ │ ├── GenerateAnswerWithConfidence-llama3:instruct.json │ │ │ ├── GenerateAnswersWithConfidence-gemini-1.5-pro.json │ │ │ ├── GenerateAnswersWithConfidence-llama3:instruct.json │ │ │ ├── ParaphraseQuestions-gemini-1.5-pro.json │ │ │ ├── ParaphraseQuestions-llama3:instruct.json │ │ │ ├── RateContext-gemini-1.5-pro.json │ │ │ └── RateContext-llama3:instruct.json │ ├── experimental-results-9-11-24 │ │ ├── AssessAnswerability-claude-3-5-sonnet-20240620-dspy.json │ │ ├── AssessAnswerability-claude-3-5-sonnet-20240620-fstring.json │ │ ├── AssessAnswerability-gemini-1.5-pro-dspy.json │ │ ├── AssessAnswerability-gemini-1.5-pro-fstring.json │ │ ├── AssessAnswerability-gpt-4o-dspy.json │ │ ├── AssessAnswerability-gpt-4o-fstring.json │ │ ├── AssessAnswerability-llama3:instruct-dspy.json │ │ ├── AssessAnswerability-llama3:instruct-fstring.json │ │ ├── GenerateAnswer-claude-3-5-sonnet-20240620-dspy.json │ │ ├── GenerateAnswer-claude-3-5-sonnet-20240620-fstring.json │ │ ├── GenerateAnswer-gemini-1.5-pro-dspy.json │ │ ├── GenerateAnswer-gemini-1.5-pro-fstring.json │ │ ├── GenerateAnswer-gpt-4o-dspy.json │ │ ├── GenerateAnswer-gpt-4o-fstring.json │ │ ├── GenerateAnswer-llama3:instruct-dspy.json │ │ ├── GenerateAnswer-llama3:instruct-fstring.json │ │ ├── GenerateAnswerWithConfidence-claude-3-5-sonnet-20240620-dspy.json │ │ ├── GenerateAnswerWithConfidence-claude-3-5-sonnet-20240620-fstring.json │ │ ├── GenerateAnswerWithConfidence-gemini-1.5-pro-dspy.json │ │ ├── GenerateAnswerWithConfidence-gemini-1.5-pro-fstring.json │ │ ├── GenerateAnswerWithConfidence-gpt-4o-dspy.json │ │ ├── GenerateAnswerWithConfidence-gpt-4o-fstring.json │ │ ├── GenerateAnswerWithConfidence-llama3:instruct-dspy.json │ │ ├── GenerateAnswerWithConfidence-llama3:instruct-fstring.json │ │ ├── GenerateAnswersWithConfidence-claude-3-5-sonnet-20240620-dspy.json │ │ ├── GenerateAnswersWithConfidence-claude-3-5-sonnet-20240620-fstring.json │ │ ├── GenerateAnswersWithConfidence-gemini-1.5-pro-dspy.json │ │ ├── GenerateAnswersWithConfidence-gemini-1.5-pro-fstring.json │ │ ├── GenerateAnswersWithConfidence-gpt-4o-dspy.json │ │ ├── GenerateAnswersWithConfidence-gpt-4o-fstring.json │ │ ├── GenerateAnswersWithConfidence-llama3:instruct-dspy.json │ │ ├── GenerateAnswersWithConfidence-llama3:instruct-fstring.json │ │ ├── ParaphraseQuestions-claude-3-5-sonnet-20240620-dspy.json │ │ ├── ParaphraseQuestions-claude-3-5-sonnet-20240620-fstring.json │ │ ├── ParaphraseQuestions-gemini-1.5-pro-dspy.json │ │ ├── ParaphraseQuestions-gemini-1.5-pro-fstring.json │ │ ├── ParaphraseQuestions-gpt-4o-dspy.json │ │ ├── ParaphraseQuestions-gpt-4o-fstring.json │ │ ├── ParaphraseQuestions-llama3:instruct-dspy.json │ │ ├── ParaphraseQuestions-llama3:instruct-fstring.json │ │ ├── RAGAS-claude-3-5-sonnet-20240620-dspy.json │ │ ├── RAGAS-claude-3-5-sonnet-20240620-fstring.json │ │ ├── 
RAGAS-gemini-1.5-pro-dspy.json │ │ ├── RAGAS-gemini-1.5-pro-fstring.json │ │ ├── RAGAS-gpt-4o-dspy.json │ │ ├── RAGAS-gpt-4o-fstring.json │ │ ├── RAGAS-llama3:instruct-dspy.json │ │ ├── RAGAS-llama3:instruct-fstring.json │ │ ├── RateContext-claude-3-5-sonnet-20240620-dspy.json │ │ ├── RateContext-claude-3-5-sonnet-20240620-fstring.json │ │ ├── RateContext-gemini-1.5-pro-dspy.json │ │ ├── RateContext-gemini-1.5-pro-fstring.json │ │ ├── RateContext-gpt-4o-dspy.json │ │ ├── RateContext-gpt-4o-fstring.json │ │ ├── RateContext-llama3:instruct-dspy.json │ │ └── RateContext-llama3:instruct-fstring.json │ └── results │ │ ├── AssessAnswerability-gpt-4o-dspy_NO_OPRO_JSON-2024-11-29.json │ │ ├── AssessAnswerability-gpt-4o-dspy_WITH_OPRO_JSON-2024-11-29.json │ │ ├── AssessAnswerability-gpt-4o-fstring_with_structured_outputs-2024-11-29.json │ │ └── AssessAnswerability-gpt-4o-fstring_without_structured_outputs-2024-11-29.json │ ├── run_scripts │ ├── experiment-log.md │ ├── results │ │ ├── 9-25-24 │ │ │ ├── AssessAnswerability-llama3:instruct-dspy.json │ │ │ └── AssessAnswerability-llama3:instruct-fstring.json │ │ ├── AssessAnswerability-BATCH-llama3-8b-instruct-Modal.json │ │ ├── ClassifyDocumentWithRationale-Modal-vLLM.json │ │ └── GenerateAnswer-BATCH-llama3.2-1b-instruct-Modal.json │ ├── run_batch_test.py │ └── run_test.py │ └── utils_and_metrics │ ├── __pycache__ │ ├── helpers.cpython-310.pyc │ └── metrics.cpython-310.pyc │ ├── count-tokens.py │ ├── helpers.py │ └── metrics.py └── test-cost.md /data/Readme.md: -------------------------------------------------------------------------------- 1 | # Datasets used in StructuredRAG Benchmarking 2 | 3 | StructuredRAG currently only uses the `WikiQuestions` dataset. 4 | 5 | We have also made the dataset available on HuggingFace [here](https://huggingface.co/datasets/weaviate/WikiQuestions)! 6 | -------------------------------------------------------------------------------- /data/WikiQuestions-builder.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | import json 3 | 4 | api_key = "sk-foobar" 5 | 6 | claude = dspy.Claude(model="claude-3-5-sonnet-20240620", api_key=api_key) 7 | dspy.settings.configure(lm=claude) 8 | 9 | from pydantic import BaseModel, validator 10 | 11 | class Answer(BaseModel): 12 | answer: str 13 | 14 | @validator("answer") 15 | def validate_answer(cls, v): 16 | if v is None or v == "": 17 | raise ValueError("Answer cannot be empty") 18 | if v.strip().lower().startswith("answer:") or v.strip().lower().startswith("context"): 19 | raise ValueError("Answer should not start with 'Answer:' or 'Context'") 20 | return v 21 | 22 | class GenerateAnswer(dspy.Signature): 23 | """Assess the context and answer the question.""" 24 | 25 | context: str = dspy.InputField(description="The context to use for answering the question.") 26 | question: str = dspy.InputField(description="The question to answer.") 27 | answer: Answer = dspy.OutputField(description="The answer to the question. 
ONLY OUTPUT THE ANSWER AND NOTHING ELSE!!") 28 | 29 | generate_answer = dspy.TypedPredictor(GenerateAnswer) 30 | 31 | #rag(context="foo", question="bar").answer 32 | 33 | with open("./WikiQuestions.json", 'r') as json_file: 34 | data = json.load(json_file) 35 | 36 | print(data[0]) 37 | 38 | # Rename the "answer" column to "llama_3_1_8b_instruct_answer" 39 | for item in data: 40 | item["llama_3_1_8b_instruct_answer"] = item.pop("answer") 41 | 42 | print(data[0]) 43 | 44 | for item in data: 45 | context = item["context"] 46 | question = item["question"] 47 | answerable = item["answerable"] 48 | claude_sonnet_answer_obj = generate_answer(context=context, question=question).answer 49 | claude_sonnet_answer = claude_sonnet_answer_obj.answer 50 | print(f"\033[94m{question}\n\033[0m") 51 | print(f"\033[93m{answerable}\n\033[0m") 52 | print(f"\033[92m{claude_sonnet_answer}\n\033[0m") 53 | item["claude_sonnet_answer"] = claude_sonnet_answer 54 | 55 | with open("./WikiQuestions-2.0.json", 'w') as json_file: 56 | json.dump(data, json_file, indent=4) 57 | -------------------------------------------------------------------------------- /data/duplicate-answer.py: -------------------------------------------------------------------------------- 1 | # Temporary Solution for RAGASmetrics test 2 | import json 3 | 4 | with open("./WikiQuestions-2.0.json", 'r') as json_file: 5 | data = json.load(json_file) 6 | 7 | for item in data: 8 | item["answer"] = item["llama_3_1_8b_instruct_answer"] 9 | 10 | with open("./WikiQuestions-2.1.json", 'w') as json_file: 11 | json.dump(data, json_file, indent=4) 12 | -------------------------------------------------------------------------------- /docs/1_run_test.md: -------------------------------------------------------------------------------- 1 | # Run StructuredRAG Test 2 | 3 | To run the tests, create the python environment using `poetry install`. 4 | 5 | You can then run the script using `poetry run python tests/run_test.py`. 6 | 7 | `run_test.py` accepts the following command-line arguments: 8 | 9 | - `--model_name`: The name of the model to use. 10 | - `--model_provider`: The provider of the model. 11 | - `--api_key`: The API key for the model provider (not needed for Ollama). 12 | - `--test`: The type of test to run. 13 | 14 | StructuredRAG currently supports the following tests: 15 | 16 | - `GenerateAnswer` (string) 17 | - `RateContext` (integer) 18 | - `AssessAnswerability` (boolean) 19 | - `ParaphraseQuestions` (list of strings) 20 | - `GenerateAnswerWithConfidence` (AnswerWithConfidence) 21 | - `GenerateAnswersWithConfidence` (list of AnswerWithConfidence) 22 | 23 | You can also run all tests by using the `--all` flag. -------------------------------------------------------------------------------- /docs/2_result_visualization.md: -------------------------------------------------------------------------------- 1 | # Result Visualization 2 | 3 | To visualize the results of your experiments, follow these steps: 4 | 5 | 1. Aggregate results from each trial into a single file: 6 | ``` 7 | python tests/aggregate_result_jsons.py experimental-results 8 | ``` 9 | 10 | 2. This script will generate several outputs: 11 | - A summary of the experiment results printed to the console 12 | - Bar charts comparing model performance: 13 | - One chart for each trial 14 | - One chart showing the average across all trials 15 | - An aggregated JSON file containing all results 16 | 17 | 3. 
The bar charts will be saved as PNG files: 18 | - `model_comparison.png` for the average across all trials 19 | - `model_comparison_trial-X.png` for each individual trial 20 | 21 | 4. The aggregated results will be saved as `aggregated_results.json` in the `experimental-results` directory. 22 | 23 | 5. The bar charts provide a visual comparison of different models and providers across various test types. They show: 24 | - Performance for each test type 25 | - Comparison between DSPy and f-string implementations 26 | - Results for different models and providers 27 | 28 | 6. You can use these visualizations to quickly identify: 29 | - Which models perform best for each test type 30 | - How DSPy compares to f-string implementations 31 | - Any significant differences between trials 32 | 33 | Remember to run this script after completing your experiments to get a comprehensive view of your results. -------------------------------------------------------------------------------- /docs/task-zoo.md: -------------------------------------------------------------------------------- 1 | # StructuredRAG Task Zoo 2 | 3 | ### GenerateAnswer 4 | 5 | ```python 6 | class GenerateAnswer(BaseModel): 7 | answer: str 8 | ``` 9 | 10 | ### ParaphraseQuestions 11 | 12 | ```python 13 | class ParaphraseQuestions(BaseModel): 14 | questions: list[str] 15 | ``` 16 | 17 | ### RateContext 18 | 19 | ```python 20 | class RateContext(BaseModel): 21 | context_score: int 22 | ``` 23 | 24 | ### RAGAS 25 | 26 | ```python 27 | class RAGASmetrics(BaseModel): 28 | faithfulness_score: float 29 | answer_relevance_score: float 30 | context_relevance_score: float 31 | ``` 32 | 33 | ### AnswerWithConfidence 34 | 35 | ```python 36 | class AnswerWithConfidence(BaseModel): 37 | answer: str 38 | confidence: float 39 | ``` 40 | 41 | ### AnswersWithConfidences 42 | 43 | ```python 44 | class AnswersWithConfidences(BaseModel): 45 | answers_with_confidences: list[AnswerWithConfidence] 46 | ``` 47 | 48 | ### ResponseOrToolCall 49 | 50 | ```python 51 | class ToolArgument(BaseModel): 52 | argument_name: str 53 | argument_value: str 54 | 55 | class ToolCall(BaseModel): 56 | function_name: str 57 | arguments: list[ToolArgument] 58 | 59 | class ResponseOrToolCalls(BaseModel): 60 | reflection_about_tool_use: str = Field( 61 | default=None, 62 | description="A rationale regarding whether the tool calls are needed to answer the question." 63 | ) 64 | use_tools: bool = Field() 65 | response: str = Field( 66 | default=None, 67 | description="A direct response from the LLM without calling any tools." 68 | ) 69 | tool_calls: List[ToolCall] = Field( 70 | default=None, 71 | description="A list of tool calls requested by the LLM." 
72 | ) 73 | ``` 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "structured-rag" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Your Name "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | dspy_ai = "^2.4.13" 11 | ollama = "^0.2.1" 12 | matplotlib = "^3.9.1" 13 | 14 | [build-system] 15 | requires = ["poetry-core"] 16 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # StructuredRAG: JSON Response Formatting with Large Language Models 2 | 3 | ## News 📯 4 | 5 | Weaviate Podcast #119 with Will Kurt and Cameron Pfiffer from dottxt.ai is live on [YouTube](https://www.youtube.com/watch?v=3PdEYG6OusA) and [Spotify](https://spotifycreators-web.app.link/e/b8MEmkkbrSb) 6 | 7 | Our research paper is live on [ArXiv](https://arxiv.org/abs/2408.11061)! 8 | 9 | Weaviate Podcast #108 with Zhi Rui Tam, lead author of "Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models", is live on [YouTube](https://www.youtube.com/watch?v=UsVIX9NJ_a4) and [Spotify](https://spotifyanchor-web.app.link/e/KkmrH99LkOb)! 10 | 11 | Large Language Models have become extremely powerful at Zero-Shot Instruction Following. This benchmarker measures how well LLMs can follow the instruction to format their output in a particular JSON template. It is extremely important that these outputs follow these instructions for building reliable LLM systems such as metadata extraction, reasoning, report generation, agents, and more! 12 | 13 | This benchmarker first compares `f-String` prompting with the `Follow the Format (FF)` method used in DSPy. 14 | 15 | It also compares `Gemini` with `Llama3 (Ollama)`.
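As a rough illustration of a single trial, the f-string variant renders a prompt like the sketch below. This mirrors `get_prompt()` in `structured_rag/mock_gfl/fstring_prompts.py` and the task definitions in `structured_rag/models.py`; the context and question values here are placeholders, not items from the dataset.

```python
# Sketch of the f-string prompt for the AssessAnswerability task.
# task_instructions / response_format are taken from structured_rag/models.py;
# the context and question below are placeholder values.
task_instructions = "Determine if the question is answerable based on the context."
response_format = '{"answerable_question": "bool"}'
references = "context: <retrieved Wikipedia abstract> | question: <user question>"

prompt = f"""Instructions: {task_instructions}
References: {references}
Output the result as a JSON string with the following format: {response_format}
IMPORTANT!! Do not start the JSON with ```json or end it with ```."""
```

Roughly speaking, a response is scored as a success when it parses into the expected JSON, e.g. `{"answerable_question": true}`.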
16 | 17 | The benchmarker explores different RAG inspired tasks with structured outputs as follows: 18 | 19 | | Output Type | Task | Example | 20 | |-----------------------------|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| 21 | | `string` | GenerateAnswer | {"answer": "The National Gallery of Art, Washington D.C., and the Pinacoteca di Brera, Milan, Italy."} | 22 | | `integer` | RateContext | {"context_score": 5} | 23 | | `boolean` | AssessAnswerability | {"answerable_question": True} | 24 | | `List[string]` | ParaphraseQuestions | {"paraphrased_questions": ["Where can some of Vincenzo Civerchio’s works be found?", "Where are some pieces by Vincenzo Civerchio displayed?", "Where can I find some of Vincenzo Civerchio’s art?"]} | 25 | | `RAGASmetrics` | RAGAS | {"faithfulness_score": 2.5, "answer_relevance_score": 1.0, "context_relevance_score": 3.5} 26 | | `AnswerWithConfidence` | GenerateAnswerWithConfidence| {"answer": "The National Gallery of Art, Washington D.C., and the Pinacoteca di Brera, Milan, Italy.", "confidence": 5} | 27 | | `List[AnswerWithConfidence]` | GenerateAnswersWithConfidence| [{"answer": "National Gallery of Art, Washington D.C.", "confidence": 5}, {"answer": "Pinacoteca di Brera, Milan, Italy", "confidence": 4}] | 28 | 29 | ## Additional Models 30 | 31 | ```python 32 | class RAGASmetrics(BaseModel): 33 | faithfulness_score: float 34 | answer_relevance_score: float 35 | context_relevance_score: float 36 | 37 | class AnswerWithConfidence(BaseModel): 38 | answer: str 39 | confidence: float 40 | ``` 41 | 42 | The WikiQuestions dataset can also be found on [HuggingFace Datasets](https://huggingface.co/datasets/weaviate/Wiki-Answerable-Questions)! 43 | 44 | ![Experimental Results](./structured_rag/run_test/result_visualization/success_rates_per_test.png) 45 | 46 | ## Citation 47 | Please consider citing our paper if you find this work useful: 48 | 49 | ```bibtex 50 | @misc{shorten2024, 51 | title={StructuredRAG: JSON Response Formatting with Large Language Models}, 52 | author={Connor Shorten and Charles Pierse and Thomas Benjamin Smith and Erika Cardenas and Akanksha Sharma and John Trengrove and Bob van Luijt}, 53 | year={2024}, 54 | eprint={2408.11061}, 55 | archivePrefix={arXiv}, 56 | primaryClass={cs.CL}, 57 | url={https://arxiv.org/abs/2408.11061}, 58 | } 59 | ``` 60 | -------------------------------------------------------------------------------- /related-works.md: -------------------------------------------------------------------------------- 1 | # Related Works 2 | This file contains links and thoughts on related works measuring the impact of JSON mode on LLM output quality. 3 | 4 | Please open an [issue](https://github.com/weaviate/structured-rag/issues/new) if we have missed an important paper, and we will look into it! 5 | 6 | ## Benchmarking Structured Output Generation Methods 7 | 1. Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models, Tam et al. 2024. [Arxiv Link](https://arxiv.org/pdf/2408.02442) 8 | 9 | Weaviate Podcast interview with Zhi Rui Tam! [YouTube Link](https://www.youtube.com/watch?v=UsVIX9NJ_a4) [Spotify Link](https://spotifyanchor-web.app.link/e/KkmrH99LkOb) 10 | 11 | 2. Say What You Mean: A Response to 'Let Me Speak Freely'. 2024. dottxtai. [Blog Post Link](https://blog.dottxt.co/say-what-you-mean.html) 12 | 3. 
Structured outputs can hurt the performance of LLMs. 2024. Dylan Castillo. [Blog Post Link](https://dylancastillo.co/posts/say-what-you-mean-sometimes.html). 13 | 14 | Note #3 is a response to #2, which is a response to #1 🤠 15 | 16 | 4. Does Prompt Formatting Have Any Impact on LLM Performance? Jia He, Mukund Rungta, David Koleczek, Arshdeep Sekhon, Franklin X. Wang, Sadid Hasan. 2024. [Arxiv Link](https://arxiv.org/pdf/2411.10541) 17 | 5. Instruction-Following Evaluation for Large Language Models. Jeffrey Zhou, Tianjin Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, Le Hou. 2023. [Arxiv Link](https://arxiv.org/abs/2311.07911) 18 | 6. InfoBench: Evaluating Instruction Following Ability in Large Language Models. Yiwei Qin, Kaiqiang Song, Yebowen Hu, Wenlin Yao, Sangwoo Cho, Xiaoyang Wang, Xuansheng Wu, Fei Liu, Pengfei Liu, Dong Yu. [Arxiv Link](https://arxiv.org/pdf/2401.03601) 19 | 20 | ## Structured Generation 21 | 1. Outlines 22 | 2. XGrammar. [Blog Post](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar) 23 | 24 | ## Motivating Applications of Structured Outputs 25 | Most papers in this area focus on their role in Function Calling, with an emerging emphasis on their use in Chain-of-Thought generation. 26 | 1. Chain of Thought Empowers Transformers to Solve Inherently Serial Problems. Zhiyuan Li, Hong Liu, Denny Zhou, Tengyu Ma. 2024. [Arxiv Link](https://arxiv.org/pdf/2402.12875) 27 | 2. Reasoning with Inference Time Compute by Sean Welleck. Language Technologies Institute at Carnegie Mellon (LTI at CMU). [YouTube Link](https://www.youtube.com/watch?v=lGr-O2rK7WQ) 28 | 29 | ## Tasks related to operating or constructing RAG Systems 30 | 1. Baleen: Robust Multi-Hop Reasoning at Scale via Condensed Retrieval. Omar Khattab, Christopher Potts, and Matei Zaharia. 2022. [Arxiv Link](https://arxiv.org/pdf/2101.00436) 31 | 2. RAGAS: Automated Evaluation of Retrieval Augmented Generation. Shahul Es, Jithin James, Espinosa-Anke, Steven Schockaert. 2023. [Arxiv Link](https://arxiv.org/abs/2309.15217) 32 | 3. Introducing Contextual Retrieval. Anthropic, 2024. [Blog Post Link](https://www.anthropic.com/news/contextual-retrieval) 33 | 34 | ## Tool Use and Structured Outputs 35 | 1. CATP-LLM: Empowering Large Language Models for Cost-Aware Tool Planning. Duo Wu, Jinghe Wang, Yuan Meng, Yanning Zhang, Le Sun, Zhi Wang. 2024. [Arxiv Link](https://arxiv.org/pdf/2411.16313) 36 | 37 | ## Adversarial Attacks and Structured Outputs 38 | 1. Universal and Context-Independent Triggers for Precise Control of LLM Outputs. Jiashuo Liang, Guancheng Li, Yang Yu. 2024. [Arxiv Link](https://arxiv.org/abs/2411.14738). 39 | 40 | # Deep Dive Reviews 41 | 42 | ### 1. Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models, Tam et al. 2024. [Arxiv Link](https://arxiv.org/pdf/2408.02442) 43 | 44 | Tests 3 methods for achieving structured outputs: (1) Constrained Decoding (JSON-mode), (2) Format-Restricting Instructions (FRI), and (3) NL-to-Format (interestingly they able using more powerful models for the format part). Tested across 3 reasoning tasks, (1) GSM8K, (2) Last Letter Concatenation, (3) Shuffled Objects, and 4 classification tasks, (1) DDXPlus (49 class medical diagnosis), (2) MultiFin (5 classes for financial paragraphs), (3) Sports Understanding (binary plausibility), and (4) NI - Task 280. 
Tests `gpt-3.5-turbo-0125`, `claude-3-haiku-20240307`, `gemini-1.5-flash`, `LLaMA-3-8B-Instruct`, and `Gemma-2-9B-Instruct`. 45 | 46 | Findings Summarized at a High-Level: 47 | - Significant decline in LLMs' reasoning abilities under format restrictions. 48 | - Stricter format constraints generally lead to greater performance degradation in reasoning tasks. 49 | 50 | Interesting nuggets: 51 | - Looser format restrictions improve performance on reasoning tasks, whereas JSON mode performs better on classification tasks. 52 | - Parsing errors can be mitigated through corrective prompting (NL-to-format). 53 | - JSON-mode performed significantly worse than FRI on the Last Letter Task because 100% of GPT 3.5 Turbo JSON-mode response placed the "answer" key before the "reason" key -- interesting nugget for Chain-of-Thought prompting with respect to output key ordering. 54 | - YAML results in fewer tokens used versus JSON / XML. 55 | 56 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="structured-rag", 5 | version="0.1", 6 | packages=find_packages(where="src"), 7 | package_dir={"": "src"}, 8 | ) -------------------------------------------------------------------------------- /structured_rag/mock_gfl/__init__.py: -------------------------------------------------------------------------------- 1 | from .dspy_program import * 2 | from .dspy_signatures import * 3 | from .fstring_program import * 4 | from .fstring_prompts import * -------------------------------------------------------------------------------- /structured_rag/mock_gfl/dspy_program.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | from typing import Optional, Any, Dict 3 | from structured_rag.mock_gfl.dspy_signatures import GenerateResponse, OPRO_JSON 4 | from pydantic import BaseModel 5 | 6 | class dspy_Program(dspy.Module): 7 | def __init__(self, 8 | test_params: Dict[str, str], 9 | model_name: str, model_provider: str, api_key: Optional[str] = None, 10 | use_OPRO_JSON: bool = False) -> None: 11 | super().__init__() 12 | self.test_params = test_params 13 | self.model_name = model_name 14 | self.model_provider = model_provider 15 | self.use_OPRO_JSON = use_OPRO_JSON 16 | self.configure_llm(api_key) 17 | # ToDo, Interface `TypedPredictor` here 18 | if self.use_OPRO_JSON: 19 | self.generate_response = dspy.Predict(OPRO_JSON) 20 | else: 21 | self.generate_response = dspy.ChainOfThought(GenerateResponse) 22 | 23 | def configure_llm(self, api_key: Optional[str] = None): 24 | if self.model_provider == "ollama": 25 | llm = dspy.OllamaLocal(model=self.model_name, max_tokens=4000, timeout_s=480) 26 | elif self.model_provider == "google": 27 | llm = dspy.Google(model=self.model_name, api_key=api_key) 28 | elif self.model_provider == "openai": 29 | import openai 30 | 31 | openai.api_key = api_key 32 | llm = dspy.OpenAI(model=self.model_name) 33 | elif self.model_provider == "anthropic": 34 | import anthropic 35 | llm = dspy.Claude(model=self.model_name, api_key=api_key) 36 | # ToDo, add Cohere 37 | else: 38 | raise ValueError(f"Unsupported model provider: {self.model_provider}") 39 | 40 | print("Running LLM connection test (say hello)...") 41 | print(llm("say hello")) 42 | dspy.settings.configure(lm=llm) 43 | 44 | # Note, this needs to be cleaned up with the abstraction around DSPy / LLM APIs 45 | 
def forward(self, output_model: Optional[BaseModel], test: str, question: str, context: Optional[str] = "", answer: Optional[str] = "") -> Any: 46 | references = {"context": context, "question": question, "answer": answer} 47 | references = "".join(f"{k}: {v}" for k, v in references.items()) 48 | response = self.generate_response( 49 | task_instructions=self.test_params['task_instructions'], 50 | response_format=self.test_params['response_format'], 51 | references=references 52 | ).response 53 | 54 | return response -------------------------------------------------------------------------------- /structured_rag/mock_gfl/dspy_signatures.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | from typing import List 3 | 4 | class GenerateResponse(dspy.Signature): 5 | """Follow the task_instructions (Input Field) and generate the response (Output Field) according to the output format given by response_format (Input Field). You will be given references from (Task-Specific Input Field).""" 6 | 7 | task_instructions = dspy.InputField(desc="(Input Field)") 8 | response_format = dspy.InputField(desc="(Input Field)") 9 | references = dspy.InputField(desc="Task-Specific Input Field") 10 | response = dspy.OutputField(desc="(Output Field)") 11 | 12 | # ToDo, OPRO_JSON is derived from a compiled version of GenerateResponse 13 | # -- would load the optimized program from disk in `dspy_program.py` 14 | 15 | class OPRO_JSON(dspy.Signature): 16 | """Carefully interpret the task_instructions provided in the Input Field, synthesizing the necessary information from the Task-Specific Input Field to construct a response. Your response should be formatted exclusively in JSON and must conform precisely to the structure dictated by the response_format Input Field. Ensure that your JSON-formatted response is devoid of extraneous characters or elements, such as markdown code block ticks (```), and includes only the keys specified by the response_format. 
Your attention to detail in following these instructions is paramount for the accuracy and relevance of your output.""" 17 | 18 | task_instructions = dspy.InputField(desc="(Input Field)") 19 | response_format = dspy.InputField(desc="(Input Field)") 20 | references = dspy.InputField(desc="Task-Specific Input Field") 21 | response = dspy.OutputField(desc="(Output Field)") -------------------------------------------------------------------------------- /structured_rag/mock_gfl/fstring_program.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict 2 | import ollama 3 | import google.generativeai as genai 4 | import openai 5 | from structured_rag.mock_gfl.fstring_prompts import get_prompt 6 | from pydantic import BaseModel 7 | import json 8 | 9 | class fstring_Program(): 10 | def __init__(self, 11 | test_params: Dict[str, str], structured_outputs: bool, 12 | model_name: str, model_provider: str, api_key: Optional[str]) -> None: 13 | self.test_params = test_params 14 | self.model_name = model_name 15 | self.model_provider = model_provider 16 | self.structured_outputs = structured_outputs 17 | if self.model_provider == "google": 18 | genai.configure(api_key=api_key) 19 | self.model = genai.GenerativeModel(self.model_name) 20 | elif self.model_provider == "openai": 21 | import openai 22 | self.model = openai.OpenAI(api_key=api_key) 23 | elif self.model_provider == "anthropic": 24 | import anthropic 25 | self.model = anthropic.Anthropic(api_key=api_key) 26 | print("Running LLM connection test (say hello)...") 27 | print(self.test_connection()) 28 | 29 | def test_connection(self) -> str: 30 | # For now this tests without structured outputs, could be an idea to add this 31 | connection_prompt = "say hello" 32 | print(f"Saying hello to {self.model_provider}'s {self.model_name}...\n") 33 | if self.model_provider == "google": 34 | # how to add a BaseModel to this? 
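# NOTE (added comment, a sketch rather than a confirmed plan): one way to add a
# BaseModel here is the approach already used in forward() below -- pass
# genai.GenerationConfig(response_mime_type="application/json", response_schema=output_model)
# to generate_content() and parse response.text. The connection check is left
# unconstrained on purpose, since it only verifies that the API key and model work.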
35 | response = self.model.generate_content(connection_prompt) 36 | return response.text 37 | elif self.model_provider == "ollama": 38 | response = ollama.chat(model=self.model_name, messages=[{"role": "user", "content": connection_prompt}]) 39 | return response['message']['content'] 40 | elif self.model_provider == "openai": 41 | response = self.model.chat.completions.create( 42 | model=self.model_name, 43 | messages=[ 44 | {"role": "system", "content": "You are a helpful assistant."}, 45 | {"role": "user", "content": connection_prompt} 46 | ] 47 | ) 48 | return response.choices[0].message.content 49 | elif self.model_provider == "anthropic": 50 | response = self.model.messages.create( 51 | model=self.model_name, 52 | max_tokens=2048, 53 | messages=[ 54 | {"role": "user", "content": connection_prompt} 55 | ] 56 | ) 57 | return response.content[0].text 58 | 59 | def forward(self, output_model: Optional[BaseModel], test: str, 60 | context: str = "", question: str = "", answer: str = "", 61 | ) -> str: 62 | references: Dict[str, str] = {} 63 | if test != "ParaphraseQuestions": 64 | references = {"context": context, "question": question} 65 | elif test == "RAGAS": 66 | references = {"context": context, "question": question, "answer": answer} 67 | else: 68 | references = {"question": question} 69 | 70 | prompt = get_prompt(test, references, self.test_params) 71 | 72 | if self.model_provider == "ollama": 73 | # ToDo, add structured outputs to Ollama 74 | response = ollama.chat(model=self.model_name, messages=[{"role": "user", "content": prompt}]) 75 | return response['message']['content'] 76 | elif self.model_provider == "google": 77 | if self.structured_outputs: 78 | response = self.model.generate_content( 79 | prompt, 80 | generation_config=genai.GenerationConfig( 81 | response_mime_type="application/json", response_schema=output_model 82 | ), 83 | ) 84 | else: 85 | response = self.model.generate_content(prompt) 86 | return response.text 87 | elif self.model_provider == "openai": 88 | if self.structured_outputs: 89 | # Super likely this is moved out of the `.beta` prefix eventually 90 | # Note, this currently suppored with: 91 | # -- `gpt-4o-mini-2024-07-18` 92 | # -- `gpt-4o-2024-08-06` 93 | response = self.model.beta.chat.completions.parse( 94 | model=self.model_name, 95 | messages=[ 96 | {"role": "system", "content": "You are a helpful assistant. 
Follow the response format instructions."}, 97 | {"role": "user", "content": prompt} 98 | ], 99 | response_format=output_model 100 | ) 101 | parsed_response = response.choices[0].message.parsed 102 | # Convert the parsed response to JSON for the parsing later on 103 | json_response = json.dumps({key: value for key, value in parsed_response.__dict__.items()}) 104 | print(f"\n JSON RESPONSE: \n {json_response}\n") 105 | return json_response 106 | else: 107 | response = self.model.chat.completions.create( 108 | model=self.model_name, 109 | messages=[ 110 | {"role": "system", "content": "You are a helpful assistant."}, 111 | {"role": "user", "content": prompt} 112 | ] 113 | ) 114 | return response.choices[0].message.content 115 | elif self.model_provider == "anthropic": 116 | response = self.model.messages.create( 117 | model=self.model_name, 118 | max_tokens=2048, 119 | messages=[ 120 | {"role": "user", "content": prompt} 121 | ] 122 | ) 123 | return response.content[0].text -------------------------------------------------------------------------------- /structured_rag/mock_gfl/fstring_prompts.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | def get_prompt(test: str, references: Dict[str, str], test_params: Dict[str, str]) -> str: 4 | references_str = ' | '.join(f"{k}: {v}" for k, v in references.items()) 5 | 6 | return f"""Instructions: {test_params['task_instructions']} 7 | References: {references_str} 8 | Output the result as a JSON string with the following format: {test_params['response_format']} 9 | IMPORTANT!! Do not start the JSON with ```json or end it with ```.""" -------------------------------------------------------------------------------- /structured_rag/mock_gfl/modal_vllm_outlines/download_llama.py: -------------------------------------------------------------------------------- 1 | import modal 2 | 3 | HF_TOKEN = "YOUR_HF_TOKEN" # Replace this with your HuggingFace Token 4 | MODELS_DIR = "/llamas" 5 | 6 | # Model IDs 7 | Llama_3_1_8B_Instruct_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct" 8 | Llama_3_2_1B_Instruct_MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" 9 | Llama_3_2_3B_Instruct_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct" 10 | 11 | # Model Revisions 12 | Llama_3_1_8B_Instruct_MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16" # pin model revisions to prevent unexpected changes! 
13 | Llama_3_2_1B_Instruct_MODEL_REVISION = "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14" 14 | Llama_3_2_3B_Instruct_MODEL_REVISION = "392a143b624368100f77a3eafaa4a2468ba50a72" 15 | 16 | volume = modal.Volume.from_name("llamas", create_if_missing=True) 17 | 18 | image = ( 19 | modal.Image.debian_slim(python_version="3.10") 20 | .pip_install( 21 | [ 22 | "huggingface_hub", # download models from the Hugging Face Hub 23 | "hf-transfer", # download models faster with Rust 24 | ] 25 | ) 26 | .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) 27 | ) 28 | 29 | MINUTES = 60 30 | HOURS = 60 * MINUTES 31 | 32 | 33 | app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface-secret")]) 34 | 35 | 36 | @app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS) 37 | def download_model(model_name, model_revision, force_download=False): 38 | 39 | from huggingface_hub import snapshot_download 40 | 41 | volume.reload() 42 | 43 | snapshot_download( 44 | Llama_3_1_8B_Instruct_MODEL_ID, 45 | local_dir=MODELS_DIR, 46 | ignore_patterns=[ 47 | "*.pt", 48 | "*.bin", 49 | "*.pth", 50 | "original/*", 51 | ], # Ensure safetensors 52 | revision=Llama_3_1_8B_Instruct_MODEL_REVISION, 53 | token=HF_TOKEN, 54 | ) 55 | 56 | volume.commit() 57 | 58 | 59 | @app.local_entrypoint() 60 | def main( 61 | model_name: str = Llama_3_1_8B_Instruct_MODEL_ID, 62 | model_revision: str = Llama_3_1_8B_Instruct_MODEL_REVISION, 63 | force_download: bool = False, 64 | ): 65 | download_model.remote(model_name, model_revision, force_download) 66 | 67 | 68 | def extract_assistant_response(output_text): 69 | """Model-specific code to extract model responses. 70 | 71 | See this doc for LLaMA 3: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/. 72 | """ 73 | # Split the output text by the assistant header token 74 | parts = output_text.split("<|start_header_id|>assistant<|end_header_id|>") 75 | 76 | if len(parts) > 1: 77 | # Join the parts after the first occurrence of the assistant header token 78 | response = parts[1].split("<|eot_id|>")[0].strip() 79 | 80 | # Remove any remaining special tokens and whitespace 81 | response = response.replace("<|eot_id|>", "").strip() 82 | 83 | return response 84 | else: 85 | return output_text 86 | -------------------------------------------------------------------------------- /structured_rag/mock_gfl/modal_vllm_outlines/modal_web_server.py: -------------------------------------------------------------------------------- 1 | import modal 2 | import modal.gpu 3 | from fastapi import Depends, HTTPException, status 4 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 5 | 6 | from vllm_outlines_setup import Model, app 7 | 8 | MINUTE = 60 9 | 10 | web_image = modal.Image.debian_slim(python_version="3.10") 11 | 12 | auth_scheme = HTTPBearer() 13 | 14 | 15 | @app.function( 16 | image=web_image, 17 | # secrets=[modal.Secret.from_name("my-inference-secret")], 18 | container_idle_timeout=MINUTE 19 | * 20, # keeps web container alive for 20 minutes (the max) 20 | ) 21 | @modal.web_endpoint(method="POST") 22 | def generate_web( 23 | data: dict, token: HTTPAuthorizationCredentials = Depends(auth_scheme) 24 | ): 25 | import os 26 | if data["with_outlines"] == True: 27 | return Model.generate_with_outlines.remote(data["prompts"], data["output_model"], settings=None) 28 | else: 29 | return Model.generate.remote_gen(data["prompts"], settings=None) -------------------------------------------------------------------------------- 
/structured_rag/mock_gfl/modal_vllm_outlines/quick_setup_test.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | import requests 4 | import time 5 | import argparse 6 | from pydantic import BaseModel 7 | 8 | class Answer(BaseModel): 9 | answer: str 10 | confidence_rating: float 11 | 12 | url = "YOUR_MODAL_URL" 13 | 14 | headers = { 15 | "Content-Type": "application/json", 16 | "Authorization": "Bearer YOUR_MODAL_API_KEY", # replace with your Modal API Key 17 | } 18 | 19 | prompts = [ 20 | "What is the capital of France?", 21 | "What is the capital of Germany?", 22 | "What is the capital of Italy?", 23 | "What is the capital of Spain?", 24 | "What is the capital of Portugal?", 25 | "What is the capital of the United Kingdom?", 26 | "What is the capital of Ireland?", 27 | "What is the capital of Sweden?", 28 | "What is the capital of Norway?", 29 | "What is the capital of Finland?", 30 | "What is the capital of Denmark?", 31 | "What is the capital of Poland?", 32 | "What is the capital of Austria?", 33 | "What is the capital of Switzerland?", 34 | "What is the capital of Greece?", 35 | "What is the capital of Turkey?", 36 | "What is the capital of Russia?", 37 | "What is the capital of Ukraine?", 38 | "What is the capital of Romania?", 39 | "What is the capital of Bulgaria?" 40 | ] 41 | 42 | prompt_preface = """<|begin_of_text|> 43 | <|start_header_id|>system<|end_header_id|> 44 | 45 | Cutting Knowledge Date: December 2023 46 | Today Date: 23 Jul 2024 47 | 48 | You are a helpful assistant<|eot_id|> 49 | <|start_header_id|>user<|end_header_id|> 50 | """ 51 | 52 | prompt_ending = """<|eot_id|> 53 | <|start_header_id|>assistant<|end_header_id|>""" 54 | 55 | # Loops through `prompts` and prefaces each with the prompt_preface 56 | prefaced_prompts = [prompt_preface + prompt for prompt in prompts] 57 | # Loops through `prefaced_prompts` and appends the prompt_ending to each 58 | prefaced_prompts_with_ending = [prompt + prompt_ending for prompt in prefaced_prompts] 59 | 60 | ''' 61 | prompts_with_ids = [] 62 | 63 | for idx, prompt in enumerate(prefaced_prompts_with_ending): 64 | prompts_with_ids.append(PromptWithID(prompt=prompt, id=idx)) 65 | ''' 66 | 67 | def run_test(with_outlines): 68 | payload = { 69 | "prompts": prefaced_prompts_with_ending, 70 | "with_outlines": with_outlines, 71 | } 72 | if with_outlines: 73 | payload["output_model"] = Answer.schema() 74 | 75 | start_time = time.time() 76 | response = requests.post(url, headers=headers, json=payload) 77 | 78 | 79 | end_time = time.time() 80 | 81 | if response.status_code == 200: 82 | response_list = ast.literal_eval(response.text) 83 | print(f"\nResults {'with' if with_outlines else 'without'} Outlines:") 84 | results_dict = {int(result["id"]): result["answer"] for result in response_list} 85 | sorted_results = dict(sorted(results_dict.items())) 86 | 87 | for id, answer in sorted_results.items(): 88 | print(f"Prompt {id + 1}: {prompts[id]}") 89 | print("=" * 50) 90 | print(f"ID: {id+1}") 91 | print(f"Answer: {answer}") 92 | print("=" * 50) 93 | 94 | total_time = end_time - start_time 95 | num_tasks = len(response_list) 96 | print(f"Number of answers: {num_tasks}") 97 | print(f"Total time taken: {total_time:.2f} seconds") 98 | print(f"Average time per task: {total_time / num_tasks:.2f} seconds") 99 | else: 100 | print(f"Error: {response.status_code}") 101 | print(response.text) 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser(description="Run 
test with or without Outlines") 105 | parser.add_argument("--with-outlines", action="store_true", help="Run test with Outlines") 106 | args = parser.parse_args() 107 | 108 | run_test(with_outlines=args.with_outlines) 109 | 110 | # To run this script: 111 | # Without Outlines: python query_test.py 112 | # With Outlines: python query_test.py --with-outlines 113 | -------------------------------------------------------------------------------- /structured_rag/mock_gfl/modal_vllm_outlines/readme.md: -------------------------------------------------------------------------------- 1 | # Run these commands: 2 | 3 | ```bash 4 | modal run download_llama.py 5 | modal deploy modal_web_server.py 6 | ``` 7 | 8 | ```bash 9 | python3 quick_setup_test.py 10 | ``` 11 | -------------------------------------------------------------------------------- /structured_rag/mock_gfl/modal_vllm_outlines/setup.sh: -------------------------------------------------------------------------------- 1 | modal run download_llama.py 2 | modal deploy modal_web_server.py 3 | -------------------------------------------------------------------------------- /structured_rag/mock_gfl/modal_vllm_outlines/vllm_outlines_setup.py: -------------------------------------------------------------------------------- 1 | import modal 2 | 3 | vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install( 4 | "vllm==0.5.3post1", "outlines==0.0.46" 5 | ) 6 | 7 | MODELS_DIR = "/llamas" 8 | # Model IDs 9 | Llama_3_1_8B_Instruct_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct" 10 | Llama_3_2_1B_Instruct_MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" 11 | Llama_3_2_3B_Instruct_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct" 12 | 13 | # Model Revisions 14 | Llama_3_1_8B_Instruct_MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16" # pin model revisions to prevent unexpected changes! 
15 | Llama_3_2_1B_Instruct_MODEL_REVISION = "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14" 16 | Llama_3_2_3B_Instruct_MODEL_REVISION = "392a143b624368100f77a3eafaa4a2468ba50a72" 17 | 18 | try: 19 | volume = modal.Volume.lookup("llamas", create_if_missing=False) 20 | except modal.exception.NotFoundError: 21 | raise Exception("Download models first with modal run download_llama.py") 22 | 23 | # Test this 24 | N_GPUS=1 25 | GPU_CONFIG = modal.gpu.A100(count=N_GPUS) 26 | MINUTES = 60 27 | DTYPE = "float16" 28 | MAX_INPUT_LEN = 2048 29 | MAX_OUTPUT_LEN = 512 30 | 31 | app = modal.App("example-vllm-outlines", image=vllm_image) 32 | 33 | from pydantic import BaseModel 34 | class Answer(BaseModel): 35 | answer: str 36 | confidence_rating: float 37 | 38 | @app.cls( 39 | gpu=GPU_CONFIG, container_idle_timeout=1 * MINUTES, volumes={MODELS_DIR: volume} 40 | ) 41 | class Model: 42 | @modal.enter() 43 | def load(self): 44 | """Loads the VLLM engine and configures our tokenizer.""" 45 | 46 | from vllm import EngineArgs, LLMEngine, SamplingParams 47 | from outlines.integrations.vllm import JSONLogitsProcessor 48 | import vllm 49 | 50 | volume.reload() 51 | 52 | engine_args = EngineArgs( 53 | model=MODELS_DIR, 54 | tensor_parallel_size=N_GPUS, 55 | gpu_memory_utilization=0.9, 56 | max_model_len=8096, 57 | enforce_eager=False, 58 | dtype=DTYPE, 59 | ) 60 | 61 | self.engine = LLMEngine.from_engine_args(engine_args) 62 | 63 | @modal.method(is_generator=True) 64 | def generate(self, prompts: list[str], settings=None): 65 | """Generate responses to a batch of prompts, optionally with custom inference settings.""" 66 | from vllm import SamplingParams 67 | 68 | request_id = 0 69 | 70 | # Add all prompts to the engine 71 | for prompt in prompts: 72 | sampling_params = SamplingParams( 73 | max_tokens=MAX_OUTPUT_LEN, 74 | temperature=0 75 | ) 76 | self.engine.add_request(str(request_id), prompt, sampling_params) 77 | request_id += 1 78 | 79 | # Process requests and yield results 80 | while self.engine.has_unfinished_requests(): 81 | request_outputs = self.engine.step() 82 | for request_output in request_outputs: 83 | if request_output.finished: 84 | yield request_output.outputs[0].text 85 | 86 | # TODO: Add the generator back in 87 | @modal.method() 88 | def generate_with_outlines(self, prompts: list[str], output_model: BaseModel, settings=None): 89 | """Generate responses to a batch of prompts using Outlines structured outputs according to the provided Pydantic model.""" 90 | 91 | from vllm import SamplingParams 92 | from outlines.integrations.vllm import JSONLogitsProcessor 93 | 94 | request_id = 0 95 | results = [] 96 | 97 | logits_processor = JSONLogitsProcessor(schema=output_model, llm=self.engine) 98 | 99 | # Add all prompts to the engine 100 | for prompt in prompts: 101 | sampling_params = SamplingParams( 102 | max_tokens=MAX_OUTPUT_LEN, 103 | temperature=0, 104 | logits_processors=[logits_processor] 105 | ) 106 | self.engine.add_request(str(request_id), prompt, sampling_params) 107 | request_id += 1 108 | 109 | # Process requests and collect results 110 | while self.engine.has_unfinished_requests(): 111 | request_outputs = self.engine.step() 112 | for request_output in request_outputs: 113 | if request_output.finished: 114 | # fix this, `answer` is a terribly confusing key -- `response` is better 115 | results.append({ 116 | "id": request_output.request_id, 117 | "answer": request_output.outputs[0].text 118 | }) 119 | 120 | return results 
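# -----------------------------------------------------------------------------
# Hedged usage sketch (an addition for illustration, not part of the original
# file): a minimal local smoke test for the Model class above. It mirrors the
# .remote / .remote_gen calls used in modal_web_server.py and would be run with:
#   modal run vllm_outlines_setup.py
@app.local_entrypoint()
def smoke_test():
    prompts = ["What is the capital of France?"]
    # Unconstrained generation: results are yielded as each request finishes.
    for text in Model.generate.remote_gen(prompts):
        print(text)
    # Constrained generation: Outlines restricts decoding to the Answer schema.
    print(Model.generate_with_outlines.remote(prompts, Answer.schema()))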
-------------------------------------------------------------------------------- /structured_rag/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, create_model 2 | from enum import Enum 3 | from typing import Type, List 4 | 5 | class PromptWithResponse(BaseModel): 6 | prompt: str 7 | response: str 8 | 9 | class PromptingMethod(str, Enum): 10 | dspy = "dspy" 11 | fstring = "fstring" 12 | 13 | # ToDo rename to `JSON_success_rate` 14 | class Experiment(BaseModel): 15 | test_name: str 16 | model_name: str 17 | prompting_method: PromptingMethod 18 | num_successes: int 19 | total_task_performance: int 20 | num_attempts: int 21 | success_rate: float 22 | average_task_performance: float 23 | total_time: int 24 | all_responses: list[PromptWithResponse] 25 | failed_responses: list[PromptWithResponse] 26 | 27 | class Config: 28 | protected_namespaces = () 29 | 30 | class SingleTestResult(BaseModel): 31 | prompt_with_response: PromptWithResponse 32 | is_valid: bool 33 | task_metric: int 34 | 35 | class GenerateAnswer(BaseModel): 36 | answer: str 37 | 38 | class RateContext(BaseModel): 39 | context_score: int 40 | 41 | class AssessAnswerability(BaseModel): 42 | answerable_question: bool 43 | 44 | class ParaphraseQuestions(BaseModel): 45 | paraphrased_questions: List[str] 46 | 47 | class RAGAS(BaseModel): 48 | faithfulness_score: float 49 | answer_relevance_score: float 50 | context_relevance_score: float 51 | 52 | class GenerateAnswerWithConfidence(BaseModel): 53 | answer: str 54 | confidence: int 55 | 56 | class GenerateAnswersWithConfidence(BaseModel): 57 | answers: List[GenerateAnswerWithConfidence] 58 | 59 | class ClassifyDocument(BaseModel): 60 | category: Enum 61 | 62 | class Config: 63 | arbitrary_types_allowed = True 64 | 65 | class ClassifyDocumentWithRationale(BaseModel): 66 | rationale: str 67 | category: Enum 68 | 69 | class Config: 70 | arbitrary_types_allowed = True 71 | 72 | def create_enum(enum_name: str, enum_values: List[str]) -> Type[Enum]: 73 | """Dynamically create an Enum class with given values.""" 74 | return Enum(enum_name, {value: value for value in enum_values}) 75 | 76 | def _ClassifyDocument(categories: List[str]) -> Type[BaseModel]: 77 | # Dynamically create the Enum for categories 78 | CategoriesEnum = create_enum("CategoriesEnum", categories) 79 | 80 | # Dynamically create the Pydantic model with the Enum field 81 | ClassifyDocument = create_model( 82 | 'ClassifyDocument', 83 | category=(CategoriesEnum, ...) 84 | ) 85 | 86 | return ClassifyDocument 87 | 88 | def _ClassifyDocumentWithRationale(categories: List[str]) -> Type[BaseModel]: 89 | # Dynamically create the Enum for categories 90 | CategoriesEnum = create_enum("CategoriesEnum", categories) 91 | 92 | # Dynamically create the Pydantic model with the Enum field and rationale 93 | ClassifyDocumentWithRationale = create_model( 94 | 'ClassifyDocumentWithRationale', 95 | rationale=(str, ...), 96 | category=(CategoriesEnum, ...) 97 | ) 98 | 99 | return ClassifyDocumentWithRationale 100 | 101 | 102 | 103 | # ToDo, get `test_params` from here instead of hardcoded in `run_test.py` 104 | test_params = { 105 | "GenerateAnswer": { 106 | "task_instructions": "Assess the context and answer the question. 
If the context does not contain sufficient information to answer the question, respond with \"NOT ENOUGH CONTEXT\".", 107 | "response_format": '{"answer": "string"}' 108 | }, 109 | "RateContext": { 110 | "task_instructions": "Assess how well the context helps answer the question.", 111 | "response_format": '{"context_score": "int (0-5)"}' 112 | }, 113 | "AssessAnswerability": { 114 | "task_instructions": "Determine if the question is answerable based on the context.", 115 | "response_format": '{"answerable_question": "bool"}' 116 | }, 117 | "ParaphraseQuestions": { 118 | "task_instructions": "Generate 3 paraphrased versions of the given question.", 119 | "response_format": '{"paraphrased_questions": ["string", "string", "string"]}' 120 | }, 121 | "RAGAS": { 122 | "task_instructions": "Assess the faithfulness, answer relevance, and context relevance given a question, context, and answer.", 123 | "response_format": '{"faithfulness_score": "float (0-5)", "answer_relevance_score": "float (0-5)", "context_relevance_score": "float (0-5)"}' 124 | }, 125 | "GenerateAnswerWithConfidence": { 126 | "task_instructions": "Generate an answer with a confidence score.", 127 | "response_format": '{"Answer": "string", "Confidence": "int (0-5)"}' 128 | }, 129 | "GenerateAnswersWithConfidence": { 130 | "task_instructions": "Generate multiple answers with confidence scores.", 131 | "response_format": '[{"Answer": "string", "Confidence": "int (0-5)"}, ...]' 132 | }, 133 | "ClassifyDocument": { 134 | "task_instructions": "Classify the document into one of the provided classes.", 135 | "response_format": '{"classification": "Enum"}' 136 | }, 137 | "ClassifyDocumentWithRationale": { 138 | "task_instructions": "Classify the document into one of the provided classes and provide a rationale explaining why the document belongs in this class.", 139 | "response_format": '{"rationale": "string", "classification": "Enum"}' 140 | } 141 | } 142 | 143 | test_to_output_model = { 144 | "GenerateAnswer": GenerateAnswer, 145 | "RateContext": RateContext, 146 | "AssessAnswerability": AssessAnswerability, 147 | "ParaphraseQuestions": ParaphraseQuestions, 148 | "RAGAS": RAGAS, 149 | "GenerateAnswerWithConfidence": GenerateAnswerWithConfidence, 150 | "GenerateAnswersWithConfidence": GenerateAnswersWithConfidence, 151 | "ClassifyDocument": ClassifyDocument, 152 | "ClassifyDocumentWithRationale": ClassifyDocumentWithRationale 153 | } -------------------------------------------------------------------------------- /structured_rag/readme.md: -------------------------------------------------------------------------------- 1 | # StructuredRAG 2 | 3 | ## `mock_gfl` 4 | Contains the mock GFL system for StructuredRAG testing 5 | 6 | ## `run_test` 7 | Contains the code to run tests and visualize results 8 | 9 | ## `models.py` 10 | Contains the models used for StructuredRAG tests and result serialization 11 | -------------------------------------------------------------------------------- /structured_rag/run_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/__init__.py -------------------------------------------------------------------------------- /structured_rag/run_test/readme.md: -------------------------------------------------------------------------------- 1 | # StructuredRAG Tests 2 | 3 | StructuredRAG contains 6 tests for JSON Structured Output testing with LLMs. 
4 | 5 | - `GenerateAnswer` -> str 6 | - `RateContext` -> int 7 | - `AssessAnswerability` -> bool 8 | - `ParaphraseQuestions` -> List[str] 9 | - `AnswerWithConfidence` -> AnswerWithConfidence 10 | - `AnswersWithConfidences` -> List[AnswerWithConfidence] 11 | 12 | ```python 13 | from pydantic import BaseModel 14 | 15 | class AnswerWithConfidence(BaseModel): 16 | answer: str 17 | confidence: float 18 | ``` -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/aggregate_result_jsons.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Dict, List 4 | from collections import defaultdict 5 | import matplotlib.pyplot as plt 6 | import matplotlib.colors as mcolors 7 | import numpy as np 8 | import argparse 9 | 10 | def read_json_files(base_dir: str) -> List[Dict]: 11 | results = [] 12 | for trial_dir in os.listdir(base_dir): 13 | if trial_dir.startswith("trial-"): 14 | trial_path = os.path.join(base_dir, trial_dir) 15 | for filename in os.listdir(trial_path): 16 | if filename.endswith(".json"): 17 | with open(os.path.join(trial_path, filename), "r") as f: 18 | data = json.load(f) 19 | data['file_path'] = os.path.join(trial_path, filename) 20 | results.append(data) 21 | return results 22 | 23 | def aggregate_results(results: List[Dict]) -> Dict: 24 | summary = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: { 25 | "dspy_total": 0, 26 | "fstring_total": 0, 27 | "total_questions": 0, 28 | "runs": 0 29 | })))) 30 | 31 | for result in results: 32 | test_type = result["test_type"] 33 | model_name = result["model_name"] 34 | model_provider = result["model_provider"] 35 | trial = os.path.basename(os.path.dirname(result["file_path"])) 36 | 37 | summary[test_type][model_name][model_provider][trial]["dspy_total"] += result["dspy_score"] 38 | summary[test_type][model_name][model_provider][trial]["fstring_total"] += result["fstring_score"] 39 | 40 | # Handle both old and new JSON formats 41 | if "total_questions" in result: 42 | total_questions = result["total_questions"] 43 | else: 44 | total_questions = max(result.get("dspy_total_attempts", 0), result.get("fstring_total_attempts", 0)) 45 | 46 | summary[test_type][model_name][model_provider][trial]["total_questions"] += total_questions 47 | summary[test_type][model_name][model_provider][trial]["runs"] += 1 48 | 49 | for test_type, models in summary.items(): 50 | for model_name, providers in models.items(): 51 | for provider, trials in providers.items(): 52 | for trial, data in trials.items(): 53 | data["dspy_average"] = data["dspy_total"] / data["runs"] if data["runs"] > 0 else 0 54 | data["fstring_average"] = data["fstring_total"] / data["runs"] if data["runs"] > 0 else 0 55 | data["average_questions"] = data["total_questions"] / data["runs"] if data["runs"] > 0 else 0 56 | 57 | return dict(summary) 58 | 59 | def print_summary(summary: Dict): 60 | print("Experiment Results Summary:") 61 | print("===========================") 62 | for test_type, models in summary.items(): 63 | print(f"\nTest: {test_type}") 64 | for model_name, providers in models.items(): 65 | for provider, trials in providers.items(): 66 | print(f"\nModel: {model_name} (Provider: {provider})") 67 | for trial, data in trials.items(): 68 | print(f" Trial: {trial}") 69 | print(f" Number of runs: {data['runs']}") 70 | print(f" Average questions per run: {data['average_questions']:.2f}") 71 | if 
data['average_questions'] > 0: 72 | print(f" DSPy average score: {data['dspy_average']:.2f} ({data['dspy_average']/data['average_questions']:.2%})") 73 | print(f" f-string average score: {data['fstring_average']:.2f} ({data['fstring_average']/data['average_questions']:.2%})") 74 | else: 75 | print(f" DSPy average score: {data['dspy_average']:.2f} (N/A)") 76 | print(f" f-string average score: {data['fstring_average']:.2f} (N/A)") 77 | 78 | def create_bar_chart(summary: Dict, trial: str = None): 79 | tests = list(summary.keys()) 80 | models = list(set(model for test in summary.values() for model in test.keys())) 81 | 82 | fig, ax = plt.subplots(figsize=(20, 10)) # Increased figure size for better readability 83 | 84 | num_models = len(models) 85 | group_width = 0.8 86 | bar_width = group_width / (2 * num_models) # We have 2 bars per model (DSPy/FF and f-string) 87 | 88 | # Define the color scheme 89 | color_scheme = { 90 | 'gemini-1.5-pro f-String': 'forestgreen', 91 | 'gemini-1.5-pro FF': 'dodgerblue', 92 | 'llama3:instruct f-String': 'red', 93 | 'llama3:instruct FF': 'darkorange' 94 | } 95 | 96 | for i, test in enumerate(tests): 97 | for j, model in enumerate(models): 98 | if model in summary[test]: 99 | providers = summary[test][model] 100 | if trial: 101 | provider_data = next((provider[trial] for provider in providers.values() if trial in provider), None) 102 | if provider_data and provider_data["average_questions"] > 0: 103 | dspy_avg = provider_data["dspy_average"] / provider_data["average_questions"] 104 | fstring_avg = provider_data["fstring_average"] / provider_data["average_questions"] 105 | else: 106 | dspy_avg = fstring_avg = 0 107 | else: 108 | # For overall average, we need to aggregate across all trials 109 | all_trials_data = [t for provider in providers.values() for t in provider.values()] 110 | if all_trials_data: 111 | total_questions = sum(t["average_questions"] for t in all_trials_data) 112 | if total_questions > 0: 113 | dspy_avg = sum(t["dspy_average"] for t in all_trials_data) / total_questions 114 | fstring_avg = sum(t["fstring_average"] for t in all_trials_data) / total_questions 115 | else: 116 | dspy_avg = fstring_avg = 0 117 | else: 118 | dspy_avg = fstring_avg = 0 119 | 120 | # Calculate positions for the bars 121 | base_position = i + (j - num_models/2 + 0.5) * group_width / num_models 122 | dspy_position = base_position - bar_width/2 123 | fstring_position = base_position + bar_width/2 124 | 125 | # Plot the bars with specified coloring 126 | ax.bar(dspy_position, dspy_avg, bar_width, color=color_scheme[f'{model} FF'], alpha=0.8) 127 | ax.bar(fstring_position, fstring_avg, bar_width, color=color_scheme[f'{model} f-String'], alpha=0.8) 128 | 129 | ax.set_ylabel('Average Score (as percentage)', fontsize=16) # Increased font size 130 | title = 'Model Performance Comparison by Test Type' 131 | if trial: 132 | title += f' - {trial}' 133 | else: 134 | title += ' - Average Across All Trials' 135 | ax.set_title(title, fontsize=18) # Increased font size 136 | 137 | ax.set_xticks(range(len(tests))) 138 | ax.set_xticklabels(tests, rotation=45, ha='right', fontsize=12) # Increased font size 139 | 140 | # Adjust y-axis to start from 0 and end at 1 (100%) 141 | ax.set_ylim(0, 1) 142 | ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 143 | ax.tick_params(axis='y', labelsize=14) # Increased y-axis tick label font size 144 | 145 | # Add gridlines for better readability 146 | ax.grid(True, axis='y', linestyle='--', alpha=0.7) 147 | 148 | # 
Create custom legend 149 | legend_elements = [plt.Rectangle((0,0),1,1, facecolor=color, edgecolor='none', alpha=0.8) 150 | for color in color_scheme.values()] 151 | legend_labels = list(color_scheme.keys()) 152 | 153 | ax.legend(legend_elements, legend_labels, bbox_to_anchor=(1.05, 1), loc='upper left', 154 | borderaxespad=0., fontsize=14) # Increased legend font size 155 | 156 | plt.tight_layout() 157 | filename = 'model_comparison.png' if not trial else f'model_comparison_{trial}.png' 158 | plt.savefig(filename, bbox_inches='tight', dpi=300) # Increased DPI for better quality 159 | plt.close() 160 | 161 | print(f"Bar chart saved as '{filename}'") 162 | 163 | def main(): 164 | parser = argparse.ArgumentParser(description="Aggregate JSON results and create visualizations.") 165 | parser.add_argument("results_dir", help="Directory containing the experimental results") 166 | args = parser.parse_args() 167 | 168 | results = read_json_files(args.results_dir) 169 | summary = aggregate_results(results) 170 | 171 | print_summary(summary) 172 | 173 | # Create a plot for each trial 174 | trials = set() 175 | for test in summary.values(): 176 | for model in test.values(): 177 | for provider in model.values(): 178 | trials.update(provider.keys()) 179 | 180 | for trial in sorted(trials): 181 | create_bar_chart(summary, trial) 182 | 183 | # Create a plot for the average across all trials 184 | create_bar_chart(summary) 185 | 186 | # Save aggregated results 187 | with open(f"./{args.results_dir}/aggregated_results.json", "w") as f: 188 | json.dump(summary, f, indent=2) 189 | 190 | print("\nAggregated results saved to aggregated_results.json") 191 | 192 | 193 | if __name__ == "__main__": 194 | main() -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/boxplot_success_rates_per_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/result_visualization/boxplot_success_rates_per_model.png -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/boxplot_success_rates_per_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/result_visualization/boxplot_success_rates_per_task.png -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/compute_averages.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | from models import PromptWithResponse, PromptingMethod, Experiment 5 | 6 | def load_experiments(directory: str) -> pd.DataFrame: 7 | experiments = [] 8 | for filename in os.listdir(directory): 9 | if filename.endswith(".json"): 10 | with open(os.path.join(directory, filename), 'r') as f: 11 | data = json.load(f) 12 | experiment = Experiment(**data) 13 | experiments.append({ 14 | 'test_name': experiment.test_name, 15 | 'model_name': experiment.model_name, 16 | 'prompting_method': experiment.prompting_method, 17 | 'num_successes': experiment.num_successes, 18 | 'num_attempts': experiment.num_attempts, 19 | 'success_rate': experiment.success_rate, 20 | 'total_time': experiment.total_time, 21 | 
'avg_response_time': experiment.total_time / experiment.num_attempts 22 | }) 23 | return pd.DataFrame(experiments) 24 | 25 | def calculate_avg_accuracy_per_prompting_method(experiments: pd.DataFrame) -> pd.DataFrame: 26 | return experiments.groupby(['prompting_method'])['success_rate'].mean().reset_index() 27 | 28 | def calculate_avg_accuracy_per_model(experiments: pd.DataFrame) -> pd.DataFrame: 29 | return experiments.groupby(['model_name'])['success_rate'].mean().reset_index() 30 | 31 | def calculate_avg_accuracy_per_test(experiments: pd.DataFrame) -> pd.DataFrame: 32 | return experiments.groupby(['test_name'])['success_rate'].mean().reset_index() 33 | 34 | def calculate_avg_accuracy_per_prompting_method_per_model(experiments: pd.DataFrame) -> pd.DataFrame: 35 | return experiments.groupby(['model_name', 'prompting_method'])['success_rate'].mean().reset_index() 36 | 37 | def calculate_avg_accuracy_per_prompting_method_per_test(experiments: pd.DataFrame) -> pd.DataFrame: 38 | return experiments.groupby(['test_name', 'prompting_method'])['success_rate'].mean().reset_index() 39 | 40 | def calculate_avg_accuracy_per_model_per_test(experiments: pd.DataFrame) -> pd.DataFrame: 41 | return experiments.groupby(['model_name', 'test_name'])['success_rate'].mean().reset_index() 42 | 43 | def calculate_overall_average(experiments: pd.DataFrame) -> pd.DataFrame: 44 | return experiments['success_rate'].mean() 45 | 46 | def calculate_avg_response_time_per_model(experiments: pd.DataFrame) -> pd.DataFrame: 47 | return experiments.groupby(['model_name'])['avg_response_time'].mean().reset_index() 48 | 49 | def list_all_results(experiments: pd.DataFrame) -> pd.DataFrame: 50 | for index, row in experiments.iterrows(): 51 | print(row["test_name"], row["model_name"], row["prompting_method"]) 52 | print(f"\033[92mSuccess rate: {row['success_rate']}\033[0m") 53 | 54 | if __name__ == "__main__": 55 | experiments = load_experiments("experimental-results-9-11-24") 56 | print("\033[92m\nAverage accuracy per prompting method:\n\033[0m") 57 | print(calculate_avg_accuracy_per_prompting_method(experiments)) 58 | 59 | print("\033[92m\nAverage accuracy per model:\n\033[0m") 60 | print(calculate_avg_accuracy_per_model(experiments)) 61 | 62 | print("\033[92m\nAverage accuracy per test:\n\033[0m") 63 | print(calculate_avg_accuracy_per_test(experiments)) 64 | 65 | print("\033[92m\nAverage accuracy per prompting method per model:\n\033[0m") 66 | print(calculate_avg_accuracy_per_prompting_method_per_model(experiments)) 67 | 68 | print("\033[92m\nAverage accuracy per prompting method per test:\n\033[0m") 69 | print(calculate_avg_accuracy_per_prompting_method_per_test(experiments)) 70 | 71 | print("\033[92m\nAverage accuracy per model per test:\n\033[0m") 72 | print(calculate_avg_accuracy_per_model_per_test(experiments)) 73 | 74 | print("\033[92m\nOverall average accuracy:\n\033[0m") 75 | print(calculate_overall_average(experiments)) 76 | 77 | print("\033[92m\nAverage response time per model:\n\033[0m") 78 | print(calculate_avg_response_time_per_model(experiments)) 79 | 80 | print("\033[92m\nList all results:\n\033[0m") 81 | list_all_results(experiments) 82 | -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/dspy_error_analysis.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | 3 | from structured_rag.run_test.utils_and_metrics.helpers import load_experiments 4 | from 
structured_rag.run_test.utils_and_metrics.helpers import Colors 5 | 6 | import openai 7 | openai.api_key = "sk-foobar" 8 | 9 | gpt4 = dspy.OpenAI(model="gpt-4o-mini") 10 | 11 | dspy.settings.configure(lm=gpt4) 12 | 13 | class ErrorAnalyzer(dspy.Signature): 14 | """An AI system has been tasked with following a particular response format in its output. 15 | Please review the example and output why the response failed to follow the response format.""" 16 | 17 | system_output = dspy.InputField(description="The output from the AI system.") 18 | why_it_failed = dspy.OutputField(description="Why the AI system failed to follow the response format.") 19 | 20 | class SummarizeErrors(dspy.Signature): 21 | """An AI system has been tasked with following a particular response format in its output. 22 | Another AI system has reviewed the AI system's output and provided an error analysis for each error. 23 | Please summarize the provided list of error analyses into a single error analysis.""" 24 | 25 | error_analyses = dspy.InputField(description="The list of errors.") 26 | error_analysis_report = dspy.OutputField(description="The summary of the errors.") 27 | 28 | error_analyzer = dspy.Predict(ErrorAnalyzer) 29 | 30 | # loop through failed_responses 31 | 32 | # ToDo reorganize results to move gemini results 33 | experiments = load_experiments("../results/Gemini-1.5-Pro-9-11-24") 34 | 35 | print(experiments.info()) 36 | 37 | # Need to add the task_instruction and response_format to these Results in order to parse it here 38 | 39 | failed_responses_per_experiments = experiments["failed_responses"].tolist() 40 | test_names = experiments["test_name"].tolist() 41 | 42 | for idx, failed_responses in enumerate(failed_responses_per_experiments): 43 | print(f"{Colors.GREEN}Analyzing Failures for Experiment: {test_names[idx]}\n{Colors.ENDC}") 44 | error_analyses = [] 45 | for failure_idx, failed_response in enumerate(failed_responses): 46 | print(f"{Colors.BOLD}Analyzing Failure {failure_idx}: {failed_response.response}\n{Colors.ENDC}") 47 | error_analysis = error_analyzer(system_output=failed_response.response).why_it_failed 48 | error_analyses.append(error_analysis) 49 | print(f"{Colors.GREEN}Error analysis: {error_analysis}\n{Colors.ENDC}") 50 | 51 | error_analyses = "\n".join([f"[{i+1}] {item}" for i, item in enumerate(error_analyses)]) 52 | summary = dspy.Predict(SummarizeErrors)(error_analyses=error_analyses).error_analysis_report 53 | 54 | print(f"{Colors.BOLD}Summary of Errors:{Colors.ENDC}\n{summary}") 55 | -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/new_aggregate_result_jsons.py: -------------------------------------------------------------------------------- 1 | # This script loops over a folder `save-dir` (CLI argument) and aggregates the results from each trial into a single JSON file. 2 | 3 | import json 4 | import os 5 | import argparse 6 | 7 | def aggregate_results(save_dir): 8 | # Loops through all the files in the `save_dir` directory and aggregates the results into a single JSON file. 
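    # Sketch (hypothetical, not the script's current behavior): the loop below only collects
    # and prints each result dict. To actually produce the single aggregated JSON file
    # described above, the collected `results` list could be written out after the loop, e.g.:
    #     with open(os.path.join(save_dir, "aggregated_results.json"), "w") as f:
    #         json.dump(results, f, indent=2)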
9 | results = [] 10 | for filename in os.listdir(save_dir): 11 | if filename.endswith(".json"): 12 | with open(os.path.join(save_dir, filename), "r") as f: 13 | data = json.load(f) 14 | results.append(data) 15 | print(data) 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser(description="Aggregate results from a save directory") 19 | parser.add_argument("--save-dir", type=str, required=True, help="Directory to save the results") 20 | args = parser.parse_args() 21 | aggregate_results(args.save_dir) -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/success_rate_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/result_visualization/success_rate_heatmap.png -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/success_rates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/result_visualization/success_rates.png -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/success_rates_per_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/result_visualization/success_rates_per_test.png -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/visualize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import matplotlib.colors as mcolors 6 | import seaborn as sns 7 | from pydantic import BaseModel 8 | from enum import Enum 9 | from typing import List 10 | 11 | from structured_rag.models import PromptWithResponse, PromptingMethod, Experiment 12 | from structured_rag.run_test.utils_and_metrics.helpers import load_experiments 13 | 14 | def barplot_success_rates(df: pd.DataFrame): 15 | """ 16 | This function plots the success rates of the models, averaged over the two prompting methods. 17 | """ 18 | # compute the average success rate over the two prompting methods 19 | df_avg = df.groupby(['model_name', 'prompting_method'])['success_rate'].mean().reset_index() 20 | plt.figure(figsize=(12, 6)) 21 | sns.barplot(x='model_name', y='success_rate', data=df_avg) 22 | plt.title('Success Rates by Model and Prompting Method') 23 | plt.xlabel('Model Name') 24 | plt.ylabel('Success Rate') 25 | plt.savefig('success_rates.png') 26 | plt.close() 27 | 28 | def barplot_success_rates_per_test(df: pd.DataFrame): 29 | """ 30 | This function plots the success rates of the models for each test. 
31 | """ 32 | df_avg = df.groupby(['model_name', 'test_name'])['success_rate'].mean().reset_index() 33 | plt.figure(figsize=(15, 8)) 34 | sns.barplot(x='test_name', y='success_rate', hue='model_name', data=df_avg) 35 | plt.title('Success Rates by Model and Test') 36 | plt.xlabel('Test Name') 37 | plt.ylabel('Success Rate') 38 | plt.xticks(rotation=45, ha='right') 39 | plt.legend(title='Model Name', bbox_to_anchor=(1.05, 1), loc='upper left') 40 | plt.tight_layout() 41 | plt.savefig('success_rates_per_test.png') 42 | plt.close() 43 | 44 | def plot_success_rate_heatmap(df: pd.DataFrame, models: List[str]): 45 | # Filter the dataframe for the specified models 46 | df_filtered = df[df['model_name'].isin(models)] 47 | 48 | # Create a new column combining model_name and prompting_method 49 | df_filtered['model_method'] = df_filtered['model_name'].apply(lambda x: 'claude-3-5-sonnet' if x == 'claude-3-5-sonnet-20240620' else x) + '_' + df_filtered['prompting_method'] 50 | 51 | # Pivot the table 52 | pivot_df = df_filtered.pivot_table(values='success_rate', index='test_name', columns='model_method', aggfunc='mean') 53 | 54 | # Reorder columns to group by model 55 | column_order = [f"{model if model != 'claude-3-5-sonnet-20240620' else 'claude-3-5-sonnet'}_{method}" for model in models for method in ['dspy', 'fstring']] 56 | pivot_df = pivot_df[column_order] 57 | 58 | plt.figure(figsize=(12, 10)) 59 | # Create a custom colormap from red (0%) to green (100%) 60 | cmap = mcolors.LinearSegmentedColormap.from_list("", ["red", "yellow", "green"]) 61 | sns.heatmap(pivot_df, annot=True, cmap=cmap, fmt='.2f', vmin=0, vmax=1) 62 | plt.title('Success Rate Heatmap') 63 | plt.xlabel('Model and Prompting Method') 64 | plt.ylabel('Test Name') 65 | plt.xticks(rotation=45, ha='right') 66 | 67 | # Rename 'claude-3-5-sonnet-20240620' to 'claude-3-5-sonnet' in x-axis labels 68 | x_labels = [label.get_text().replace('claude-3-5-sonnet-20240620', 'claude-3-5-sonnet') for label in plt.gca().get_xticklabels()] 69 | # replace 'dspy' with 'FF' 70 | x_labels = [label.get_text().replace('dspy', 'FF') for label in plt.gca().get_xticklabels()] 71 | plt.gca().set_xticklabels(x_labels) 72 | 73 | plt.tight_layout() 74 | plt.savefig('success_rate_heatmap.png') 75 | plt.close() 76 | 77 | def boxplot_success_rate_per_task(df: pd.DataFrame): 78 | """ 79 | This function plots the success rates for each test, averaged across all models. 80 | """ 81 | plt.figure(figsize=(12, 10)) 82 | 83 | # Create the box plot 84 | sns.boxplot(x='success_rate', y='test_name', data=df, orient='h', color='lightblue', width=0.5) 85 | 86 | # Customize the plot 87 | plt.title('Success Rates by Test', fontsize=16) 88 | plt.xlabel('Success Rate', fontsize=12) 89 | plt.ylabel('Test Name', fontsize=12) 90 | plt.xlim(0, 1) # Set x-axis limits from 0 to 1 for success rate 91 | plt.grid(axis='x', linestyle='--', alpha=0.7) 92 | 93 | # Improve readability 94 | plt.tight_layout() 95 | plt.savefig('boxplot_success_rates_per_task.png') 96 | plt.close() 97 | 98 | def boxplot_success_rate_per_model(df: pd.DataFrame): 99 | """ 100 | This function plots the success rates for each model, averaged across all tests. 
101 | """ 102 | plt.figure(figsize=(12, 10)) 103 | 104 | # Create the box plot 105 | sns.boxplot(x='model_name', y='success_rate', data=df, color='lightblue', width=0.5) 106 | 107 | # Customize the plot 108 | plt.title('Success Rates by Model', fontsize=16) 109 | plt.xlabel('Model Name', fontsize=12) 110 | plt.ylabel('Success Rate', fontsize=12) 111 | plt.ylim(0, 1) # Set y-axis limits from 0 to 1 for success rate 112 | plt.grid(axis='y', linestyle='--', alpha=0.7) 113 | 114 | # Rotate x-axis labels for better readability 115 | plt.xticks(rotation=45, ha='right') 116 | 117 | # Improve readability 118 | plt.tight_layout() 119 | plt.savefig('boxplot_success_rates_per_model.png') 120 | plt.close() 121 | 122 | def visualize_experiments(df: pd.DataFrame): 123 | # Set the style for all plots 124 | plt.style.use('ggplot') 125 | 126 | barplot_success_rates(df) 127 | barplot_success_rates_per_test(df) 128 | plot_success_rate_heatmap(df, models=["claude-3-5-sonnet-20240620", "llama3:instruct"]) 129 | boxplot_success_rate_per_task(df) 130 | boxplot_success_rate_per_model(df) 131 | 132 | if __name__ == "__main__": 133 | # Load experiments from the 'experiments' directory 134 | df = load_experiments('experimental-results-9-11-24') 135 | 136 | # Generate visualizations 137 | visualize_experiments(df) 138 | 139 | print("Visualizations have been generated and saved as PNG files.") -------------------------------------------------------------------------------- /structured_rag/run_test/result_visualization/visualize_single_result.py: -------------------------------------------------------------------------------- 1 | # This code reads one result json file and visualizes it. 2 | 3 | import json 4 | import os 5 | import argparse 6 | import matplotlib.pyplot as plt 7 | 8 | from models import Experiment 9 | 10 | def visualize_single_result(result_file): 11 | with open(result_file, "r") as f: 12 | data = json.load(f) 13 | experiment = Experiment(**data) 14 | print(experiment) 15 | return experiment 16 | 17 | def pretty_print_experiment(experiment): 18 | print(f"Test: {experiment.test_name}") 19 | print(f"Model: {experiment.model_name}") 20 | print(f"Prompting Method: {experiment.prompting_method}") 21 | print(f"Number of Successes: {experiment.num_successes}") 22 | print(f"Number of Attempts: {experiment.num_attempts}") 23 | print(f"Success Rate: {experiment.num_successes/experiment.num_attempts:.2%}") 24 | print(f"Total Time: {experiment.total_time}") 25 | 26 | # The data should be in the format of a list of dictionaries, where each dictionary represents a single experiment. 
27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser(description="Visualize a single result JSON file") 30 | parser.add_argument("--result-file", type=str, required=True, help="Path to the result JSON file") 31 | args = parser.parse_args() 32 | visualize_single_result(args.result_file) -------------------------------------------------------------------------------- /structured_rag/run_test/results/batch-9-13-24/AssessAnswerability-BATCH-llama3-8b-instruct-Modal.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "AssessAnswerability", 3 | "model_name": "llama3-8b-instruct-Modal", 4 | "prompting_method": "fstring", 5 | "num_successes": 112, 6 | "num_attempts": 112, 7 | "success_rate": 1.0, 8 | "total_time": 3, 9 | "all_responses": [ 10 | { 11 | "prompt": "placeholder", 12 | "response": "{\"answerable_question\": true}" 13 | }, 14 | { 15 | "prompt": "placeholder", 16 | "response": "{\"answerable_question\": false}" 17 | }, 18 | { 19 | "prompt": "placeholder", 20 | "response": "{\"answerable_question\": true}" 21 | }, 22 | { 23 | "prompt": "placeholder", 24 | "response": "{\"answerable_question\": true}" 25 | }, 26 | { 27 | "prompt": "placeholder", 28 | "response": "{\"answerable_question\": true}" 29 | }, 30 | { 31 | "prompt": "placeholder", 32 | "response": "{\"answerable_question\": false}" 33 | }, 34 | { 35 | "prompt": "placeholder", 36 | "response": "{\"answerable_question\": true}" 37 | }, 38 | { 39 | "prompt": "placeholder", 40 | "response": "{\"answerable_question\": false}" 41 | }, 42 | { 43 | "prompt": "placeholder", 44 | "response": "{\"answerable_question\": true}" 45 | }, 46 | { 47 | "prompt": "placeholder", 48 | "response": "{\"answerable_question\": false}" 49 | }, 50 | { 51 | "prompt": "placeholder", 52 | "response": "{\"answerable_question\": true}" 53 | }, 54 | { 55 | "prompt": "placeholder", 56 | "response": "{\"answerable_question\": false}" 57 | }, 58 | { 59 | "prompt": "placeholder", 60 | "response": "{\"answerable_question\": true}" 61 | }, 62 | { 63 | "prompt": "placeholder", 64 | "response": "{\"answerable_question\": false}" 65 | }, 66 | { 67 | "prompt": "placeholder", 68 | "response": "{\"answerable_question\": true}" 69 | }, 70 | { 71 | "prompt": "placeholder", 72 | "response": "{\"answerable_question\": true}" 73 | }, 74 | { 75 | "prompt": "placeholder", 76 | "response": "{\"answerable_question\": true}" 77 | }, 78 | { 79 | "prompt": "placeholder", 80 | "response": "{\"answerable_question\": false}" 81 | }, 82 | { 83 | "prompt": "placeholder", 84 | "response": "{\"answerable_question\": true}" 85 | }, 86 | { 87 | "prompt": "placeholder", 88 | "response": "{\"answerable_question\": true}" 89 | }, 90 | { 91 | "prompt": "placeholder", 92 | "response": "{\"answerable_question\": true}" 93 | }, 94 | { 95 | "prompt": "placeholder", 96 | "response": "{\"answerable_question\": false}" 97 | }, 98 | { 99 | "prompt": "placeholder", 100 | "response": "{\"answerable_question\": true}" 101 | }, 102 | { 103 | "prompt": "placeholder", 104 | "response": "{\"answerable_question\": true}" 105 | }, 106 | { 107 | "prompt": "placeholder", 108 | "response": "{\"answerable_question\": true}" 109 | }, 110 | { 111 | "prompt": "placeholder", 112 | "response": "{\"answerable_question\": false}" 113 | }, 114 | { 115 | "prompt": "placeholder", 116 | "response": "{\"answerable_question\": true}" 117 | }, 118 | { 119 | "prompt": "placeholder", 120 | "response": "{\"answerable_question\": true}" 121 | }, 122 | { 123 | 
"prompt": "placeholder", 124 | "response": "{\"answerable_question\": true}" 125 | }, 126 | { 127 | "prompt": "placeholder", 128 | "response": "{\"answerable_question\": false}" 129 | }, 130 | { 131 | "prompt": "placeholder", 132 | "response": "{\"answerable_question\": true}" 133 | }, 134 | { 135 | "prompt": "placeholder", 136 | "response": "{\"answerable_question\": true}" 137 | }, 138 | { 139 | "prompt": "placeholder", 140 | "response": "{\"answerable_question\": true}" 141 | }, 142 | { 143 | "prompt": "placeholder", 144 | "response": "{\"answerable_question\": false}" 145 | }, 146 | { 147 | "prompt": "placeholder", 148 | "response": "{\"answerable_question\": true}" 149 | }, 150 | { 151 | "prompt": "placeholder", 152 | "response": "{\"answerable_question\": false}" 153 | }, 154 | { 155 | "prompt": "placeholder", 156 | "response": "{\"answerable_question\": true}" 157 | }, 158 | { 159 | "prompt": "placeholder", 160 | "response": "{\"answerable_question\": false}" 161 | }, 162 | { 163 | "prompt": "placeholder", 164 | "response": "{\"answerable_question\": true}" 165 | }, 166 | { 167 | "prompt": "placeholder", 168 | "response": "{\"answerable_question\": true}" 169 | }, 170 | { 171 | "prompt": "placeholder", 172 | "response": "{\"answerable_question\": true}" 173 | }, 174 | { 175 | "prompt": "placeholder", 176 | "response": "{\"answerable_question\": false}" 177 | }, 178 | { 179 | "prompt": "placeholder", 180 | "response": "{\"answerable_question\": true}" 181 | }, 182 | { 183 | "prompt": "placeholder", 184 | "response": "{\"answerable_question\": false}" 185 | }, 186 | { 187 | "prompt": "placeholder", 188 | "response": "{\"answerable_question\": true}" 189 | }, 190 | { 191 | "prompt": "placeholder", 192 | "response": "{\"answerable_question\": false}" 193 | }, 194 | { 195 | "prompt": "placeholder", 196 | "response": "{\"answerable_question\": true}" 197 | }, 198 | { 199 | "prompt": "placeholder", 200 | "response": "{\"answerable_question\": false}" 201 | }, 202 | { 203 | "prompt": "placeholder", 204 | "response": "{\"answerable_question\": true}" 205 | }, 206 | { 207 | "prompt": "placeholder", 208 | "response": "{\"answerable_question\": true}" 209 | }, 210 | { 211 | "prompt": "placeholder", 212 | "response": "{\"answerable_question\": true}" 213 | }, 214 | { 215 | "prompt": "placeholder", 216 | "response": "{\"answerable_question\": true}" 217 | }, 218 | { 219 | "prompt": "placeholder", 220 | "response": "{\"answerable_question\": true}" 221 | }, 222 | { 223 | "prompt": "placeholder", 224 | "response": "{\"answerable_question\": true}" 225 | }, 226 | { 227 | "prompt": "placeholder", 228 | "response": "{\"answerable_question\": true}" 229 | }, 230 | { 231 | "prompt": "placeholder", 232 | "response": "{\"answerable_question\": true}" 233 | }, 234 | { 235 | "prompt": "placeholder", 236 | "response": "{\"answerable_question\": true}" 237 | }, 238 | { 239 | "prompt": "placeholder", 240 | "response": "{\"answerable_question\": false}" 241 | }, 242 | { 243 | "prompt": "placeholder", 244 | "response": "{\"answerable_question\": true}" 245 | }, 246 | { 247 | "prompt": "placeholder", 248 | "response": "{\"answerable_question\": true}" 249 | }, 250 | { 251 | "prompt": "placeholder", 252 | "response": "{\"answerable_question\": true}" 253 | }, 254 | { 255 | "prompt": "placeholder", 256 | "response": "{\"answerable_question\": true}" 257 | }, 258 | { 259 | "prompt": "placeholder", 260 | "response": "{\"answerable_question\": true}" 261 | }, 262 | { 263 | "prompt": "placeholder", 264 | "response": 
"{\"answerable_question\": true}" 265 | }, 266 | { 267 | "prompt": "placeholder", 268 | "response": "{\"answerable_question\": true}" 269 | }, 270 | { 271 | "prompt": "placeholder", 272 | "response": "{\"answerable_question\": true}" 273 | }, 274 | { 275 | "prompt": "placeholder", 276 | "response": "{\"answerable_question\": true}" 277 | }, 278 | { 279 | "prompt": "placeholder", 280 | "response": "{\"answerable_question\": false}" 281 | }, 282 | { 283 | "prompt": "placeholder", 284 | "response": "{\"answerable_question\": true}" 285 | }, 286 | { 287 | "prompt": "placeholder", 288 | "response": "{\"answerable_question\": true}" 289 | }, 290 | { 291 | "prompt": "placeholder", 292 | "response": "{\"answerable_question\": true}" 293 | }, 294 | { 295 | "prompt": "placeholder", 296 | "response": "{\"answerable_question\": true}" 297 | }, 298 | { 299 | "prompt": "placeholder", 300 | "response": "{\"answerable_question\": true}" 301 | }, 302 | { 303 | "prompt": "placeholder", 304 | "response": "{\"answerable_question\": true}" 305 | }, 306 | { 307 | "prompt": "placeholder", 308 | "response": "{\"answerable_question\": true}" 309 | }, 310 | { 311 | "prompt": "placeholder", 312 | "response": "{\"answerable_question\": true}" 313 | }, 314 | { 315 | "prompt": "placeholder", 316 | "response": "{\"answerable_question\": true}" 317 | }, 318 | { 319 | "prompt": "placeholder", 320 | "response": "{\"answerable_question\": true}" 321 | }, 322 | { 323 | "prompt": "placeholder", 324 | "response": "{\"answerable_question\": true}" 325 | }, 326 | { 327 | "prompt": "placeholder", 328 | "response": "{\"answerable_question\": false}" 329 | }, 330 | { 331 | "prompt": "placeholder", 332 | "response": "{\"answerable_question\": true}" 333 | }, 334 | { 335 | "prompt": "placeholder", 336 | "response": "{\"answerable_question\": true}" 337 | }, 338 | { 339 | "prompt": "placeholder", 340 | "response": "{\"answerable_question\": true}" 341 | }, 342 | { 343 | "prompt": "placeholder", 344 | "response": "{\"answerable_question\": false}" 345 | }, 346 | { 347 | "prompt": "placeholder", 348 | "response": "{\"answerable_question\": true}" 349 | }, 350 | { 351 | "prompt": "placeholder", 352 | "response": "{\"answerable_question\": false}" 353 | }, 354 | { 355 | "prompt": "placeholder", 356 | "response": "{\"answerable_question\": true}" 357 | }, 358 | { 359 | "prompt": "placeholder", 360 | "response": "{\"answerable_question\": true}" 361 | }, 362 | { 363 | "prompt": "placeholder", 364 | "response": "{\"answerable_question\": true}" 365 | }, 366 | { 367 | "prompt": "placeholder", 368 | "response": "{\"answerable_question\": true}" 369 | }, 370 | { 371 | "prompt": "placeholder", 372 | "response": "{\"answerable_question\": true}" 373 | }, 374 | { 375 | "prompt": "placeholder", 376 | "response": "{\"answerable_question\": true}" 377 | }, 378 | { 379 | "prompt": "placeholder", 380 | "response": "{\"answerable_question\": true}" 381 | }, 382 | { 383 | "prompt": "placeholder", 384 | "response": "{\"answerable_question\": true}" 385 | }, 386 | { 387 | "prompt": "placeholder", 388 | "response": "{\"answerable_question\": true}" 389 | }, 390 | { 391 | "prompt": "placeholder", 392 | "response": "{\"answerable_question\": true}" 393 | }, 394 | { 395 | "prompt": "placeholder", 396 | "response": "{\"answerable_question\": true}" 397 | }, 398 | { 399 | "prompt": "placeholder", 400 | "response": "{\"answerable_question\": true}" 401 | }, 402 | { 403 | "prompt": "placeholder", 404 | "response": "{\"answerable_question\": true}" 405 | }, 406 | { 
407 | "prompt": "placeholder", 408 | "response": "{\"answerable_question\": false}" 409 | }, 410 | { 411 | "prompt": "placeholder", 412 | "response": "{\"answerable_question\": true}" 413 | }, 414 | { 415 | "prompt": "placeholder", 416 | "response": "{\"answerable_question\": true}" 417 | }, 418 | { 419 | "prompt": "placeholder", 420 | "response": "{\"answerable_question\": true}" 421 | }, 422 | { 423 | "prompt": "placeholder", 424 | "response": "{\"answerable_question\": true}" 425 | }, 426 | { 427 | "prompt": "placeholder", 428 | "response": "{\"answerable_question\": true}" 429 | }, 430 | { 431 | "prompt": "placeholder", 432 | "response": "{\"answerable_question\": false}" 433 | }, 434 | { 435 | "prompt": "placeholder", 436 | "response": "{\"answerable_question\": true}" 437 | }, 438 | { 439 | "prompt": "placeholder", 440 | "response": "{\"answerable_question\": true}" 441 | }, 442 | { 443 | "prompt": "placeholder", 444 | "response": "{\"answerable_question\": true}" 445 | }, 446 | { 447 | "prompt": "placeholder", 448 | "response": "{\"answerable_question\": false}" 449 | }, 450 | { 451 | "prompt": "placeholder", 452 | "response": "{\"answerable_question\": true}" 453 | }, 454 | { 455 | "prompt": "placeholder", 456 | "response": "{\"answerable_question\": false}" 457 | } 458 | ], 459 | "failed_responses": [] 460 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/batch-9-13-24/GenerateAnswer-BATCH-llama3-8b-instruct-Modal.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "GenerateAnswer", 3 | "model_name": "llama3-8b-instruct-Modal", 4 | "prompting_method": "fstring", 5 | "num_successes": 112, 6 | "num_attempts": 112, 7 | "success_rate": 1.0, 8 | "total_time": 71, 9 | "all_responses": [ 10 | { 11 | "prompt": "placeholder", 12 | "response": "{\"answer\": \"To conduct both long-term studies of the ionosphere from space and in-situ measurements of ion concentrations and temperatures.\"}" 13 | }, 14 | { 15 | "prompt": "placeholder", 16 | "response": "{\"answer\": \"Unfortunately, the information provided does not allow us to determine the exact year Explorer 20 was launched.\"}" 17 | }, 18 | { 19 | "prompt": "placeholder", 20 | "response": "{\"answer\": \"a protein in humans\"}" 21 | }, 22 | { 23 | "prompt": "placeholder", 24 | "response": "{\"answer\": \"The specific functions of the MAP4K3 protein include regulating the activity of various enzymes involved in cell signaling pathways, particularly those related to mitogen-activated protein kinases (MAPKs).\" }" 25 | }, 26 | { 27 | "prompt": "placeholder", 28 | "response": "{\"answer\": \"1972\"}" 29 | }, 30 | { 31 | "prompt": "placeholder", 32 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the height of the Heggholmen Lighthouse. If you have more context or a reliable source, please provide it!\"}" 33 | }, 34 | { 35 | "prompt": "placeholder", 36 | "response": "{\"answer\": \"June 13, 2011\"}" 37 | }, 38 | { 39 | "prompt": "placeholder", 40 | "response": "{\"answer\": \"Unfortunately, there is no available information on the specific cases that Antonio Nachura presided over during his time as Associate Justice.\"}" 41 | }, 42 | { 43 | "prompt": "placeholder", 44 | "response": "{\"answer\": \"forward\"}" 45 | }, 46 | { 47 | "prompt": "placeholder", 48 | "response": "{\"answer\": \"Unfortunately, the provided context does not specify which teams Roman Gergel has played for. 
To provide an accurate answer, additional information would be needed.\"}" 49 | }, 50 | { 51 | "prompt": "placeholder", 52 | "response": "{\"answer\": \"Lord Hanuman, Lord Rama, Goddess Sita, Lord Lakshmana, and Lord Shiva\"}" 53 | }, 54 | { 55 | "prompt": "placeholder", 56 | "response": "{\"answer\": \"The answer is not explicitly stated in the given context.\"}" 57 | }, 58 | { 59 | "prompt": "placeholder", 60 | "response": "{\"answer\": \"The National Gallery of Art, Washington D.C., and the Pinacoteca di Brera, Milan, Italy.\"}" 61 | }, 62 | { 63 | "prompt": "placeholder", 64 | "response": "{\"answer\": \"Unfortunately, without further information or a specific mention of his most famous work, it is difficult to pinpoint Vincenzo Civerchio's most famous painting.\"}" 65 | }, 66 | { 67 | "prompt": "placeholder", 68 | "response": "{\"answer\": \"Australia\"}" 69 | }, 70 | { 71 | "prompt": "placeholder", 72 | "response": "{\"answer\": \"around 150-200 species\"}" 73 | }, 74 | { 75 | "prompt": "placeholder", 76 | "response": "{\"answer\": \"65 km/h (40 mph)\" }" 77 | }, 78 | { 79 | "prompt": "placeholder", 80 | "response": "{\"answer\": \"NOT ENOUGH CONTEXT\"}" 81 | }, 82 | { 83 | "prompt": "placeholder", 84 | "response": "{\"answer\": \"The southern half of South America\"}" 85 | }, 86 | { 87 | "prompt": "placeholder", 88 | "response": "{\"answer\": \"a tall upright marigold plant from the genus Tagetes, with small flowers\"}" 89 | }, 90 | { 91 | "prompt": "placeholder", 92 | "response": "{\"answer\": \"Rugby Union and Rugby League\"}" 93 | }, 94 | { 95 | "prompt": "placeholder", 96 | "response": "{\"answer\": \"We cannot determine the exact number of international matches George Spencer played based on the provided information.\"}" 97 | }, 98 | { 99 | "prompt": "placeholder", 100 | "response": "{\"answer\": \"Kuopio\"}" 101 | }, 102 | { 103 | "prompt": "placeholder", 104 | "response": "{\"answer\": \"Mikko Kuparinen, not Kuivonen\"}" 105 | }, 106 | { 107 | "prompt": "placeholder", 108 | "response": "{\"answer\": \"Minister of Finance\"}" 109 | }, 110 | { 111 | "prompt": "placeholder", 112 | "response": "{\"answer\": \"The text does not mention any specific policies implemented by Fons van der Stee as Minister of Finance.\"}" 113 | }, 114 | { 115 | "prompt": "placeholder", 116 | "response": "{\"answer\": \"Arctiinae\"}" 117 | }, 118 | { 119 | "prompt": "placeholder", 120 | "response": "{\"answer\": \"There are approximately 20-30 species in the Areva genus.\"}" 121 | }, 122 | { 123 | "prompt": "placeholder", 124 | "response": "{\"answer\": \"the 2004 Summer Olympics\"}" 125 | }, 126 | { 127 | "prompt": "placeholder", 128 | "response": "{\"answer\": \"Unfortunately, the provided context does not explicitly state Amanda Doman's position in softball. However, it does mention that she won a silver medal at the 2004 Summer Olympics, which suggests that she was part of the Australian softball team. 
Without further information, we cannot determine her specific position in softball.\"}" 129 | }, 130 | { 131 | "prompt": "placeholder", 132 | "response": "{\"answer\": \"melodramatic family dramas\"}" 133 | }, 134 | { 135 | "prompt": "placeholder", 136 | "response": "{\"answer\": \"4\"}" 137 | }, 138 | { 139 | "prompt": "placeholder", 140 | "response": "{\"answer\": \"Eocene epoch\"}" 141 | }, 142 | { 143 | "prompt": "placeholder", 144 | "response": "{\"answer\": \"Unfortunately, it is not possible to determine the average size of Erismatopterus levatus based on the available contexts.\"}" 145 | }, 146 | { 147 | "prompt": "placeholder", 148 | "response": "{\"answer\": \"John Henderson founded Ocean Kinetics in 1992.\"}" 149 | }, 150 | { 151 | "prompt": "placeholder", 152 | "response": "{\"answer\": \"The answer is unknown, as the provided contexts do not contain the necessary information.\"}" 153 | }, 154 | { 155 | "prompt": "placeholder", 156 | "response": "{\"answer\": \"Leslie Phillips\"}" 157 | }, 158 | { 159 | "prompt": "placeholder", 160 | "response": "{\"answer\": \"The exact box office performance of Doctor in Clover is unknown, but it likely performed reasonably well given its popularity as part of the Doctor series and the fame of Kiki Dee.\"}" 161 | }, 162 | { 163 | "prompt": "placeholder", 164 | "response": "{\"answer\": \"Julia Levy-Boeken is an actress.\"}" 165 | }, 166 | { 167 | "prompt": "placeholder", 168 | "response": "{\"answer\": \"At least one\"}" 169 | }, 170 | { 171 | "prompt": "placeholder", 172 | "response": "{\"answer\": \"May 16, 1864\"}" 173 | }, 174 | { 175 | "prompt": "placeholder", 176 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the number of casualties in the Battle of Mansura. If you have more context or a reliable source, I'd be happy to help you estimate the answer!\"}" 177 | }, 178 | { 179 | "prompt": "placeholder", 180 | "response": "{\"answer\": \"Giuseppe Patania\"}" 181 | }, 182 | { 183 | "prompt": "placeholder", 184 | "response": "{\"answer\": \"Unfortunately, there is no specific information available that answers this question.\"}" 185 | }, 186 | { 187 | "prompt": "placeholder", 188 | "response": "{\"answer\": \"IRNSS-1G\"}" 189 | }, 190 | { 191 | "prompt": "placeholder", 192 | "response": "{\"answer\": \"Unfortunately, the provided contexts do not mention the expected lifespan of the NVS-01 satellite. It only provides information about its purpose and position within the NavIC constellation. To answer this question, we would need additional context or specific data from ISRO or other reliable sources.\"}" 193 | }, 194 | { 195 | "prompt": "placeholder", 196 | "response": "{\"answer\": \"The Meridian Mets played in the Mississippi State League (1921) and the Cotton States League (1922\u20131923; 1925\u20131929).\"}" 197 | }, 198 | { 199 | "prompt": "placeholder", 200 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the most famous player to play for the Meridian Mets. 
However, it's likely that one or more of the players who went on to have successful careers in Major League Baseball played for the team at some point.\"}" 201 | }, 202 | { 203 | "prompt": "placeholder", 204 | "response": "{\"answer\": \"The National Wrestling Alliance (NWA)\"}" 205 | }, 206 | { 207 | "prompt": "placeholder", 208 | "response": "{\"answer\": \"NOT ENOUGH CONTEXT\"}" 209 | }, 210 | { 211 | "prompt": "placeholder", 212 | "response": "{\"answer\": \"HC Slovan Bratislava\"}" 213 | }, 214 | { 215 | "prompt": "placeholder", 216 | "response": "{\"answer\": \"14\"}" 217 | }, 218 | { 219 | "prompt": "placeholder", 220 | "response": "{\"answer\": \"Maui and Hawaii\"}" 221 | }, 222 | { 223 | "prompt": "placeholder", 224 | "response": "{\"answer\": \"several months to a year or more\"}" 225 | }, 226 | { 227 | "prompt": "placeholder", 228 | "response": "{\"answer\": \"Albert Tullgren\"}" 229 | }, 230 | { 231 | "prompt": "placeholder", 232 | "response": "{\"answer\": \"14\"}" 233 | }, 234 | { 235 | "prompt": "placeholder", 236 | "response": "{\"answer\": \"Natural Gas\"}" 237 | }, 238 | { 239 | "prompt": "placeholder", 240 | "response": "{\"answer\": \"Unfortunately, I couldn't find the exact number of albums George Olliver released as a solo artist.\"}" 241 | }, 242 | { 243 | "prompt": "placeholder", 244 | "response": "{\"answer\": \"Adenylyl cyclase 10\"}" 245 | }, 246 | { 247 | "prompt": "placeholder", 248 | "response": "{\"answer\": \"The specific function of the ADCY10 enzyme is to catalyze the conversion of ATP into cyclic AMP (cAMP), which is an important second messenger molecule involved in various cellular signaling pathways.\"}" 249 | }, 250 | { 251 | "prompt": "placeholder", 252 | "response": "{\"answer\": \"Les Ferdinand\"}" 253 | }, 254 | { 255 | "prompt": "placeholder", 256 | "response": "{\"answer\": \"Besiktas 1, Opponent 0\"}" 257 | }, 258 | { 259 | "prompt": "placeholder", 260 | "response": "{\"answer\": \"Lamont Dozier\"}" 261 | }, 262 | { 263 | "prompt": "placeholder", 264 | "response": "{\"answer\": \"It seems that Lamont Dozier was inspired to write \" }" 265 | }, 266 | { 267 | "prompt": "placeholder", 268 | "response": "{\"answer\": \"Steve Grimmett was the longest-running member of Grim Reaper.\"}" 269 | }, 270 | { 271 | "prompt": "placeholder", 272 | "response": "{\"answer\": \"Rock You to Hell\"}" 273 | }, 274 | { 275 | "prompt": "placeholder", 276 | "response": "{\"answer\": \"The Baltimore Orioles\"}" 277 | }, 278 | { 279 | "prompt": "placeholder", 280 | "response": "{\"answer\": \"Unfortunately, we cannot determine the exact number of years Rick Adair played in the minor leagues based on the provided context.\"}" 281 | }, 282 | { 283 | "prompt": "placeholder", 284 | "response": "{\"answer\": \"He was the Spokesperson (scientific head) of the ATLAS Collaboration.\"}" 285 | }, 286 | { 287 | "prompt": "placeholder", 288 | "response": "{\"answer\": \"Karl Jakobs' specific research contributions in particle physics likely include his work as the Spokesperson of the ATLAS Collaboration, where he oversaw the analysis of data collected by the Large Hadron Collider. 
His expertise may have been focused on Higgs boson research, dark matter detection, or advancements in collider technology.\"}" 289 | }, 290 | { 291 | "prompt": "placeholder", 292 | "response": "{\"answer\": \"The 2009 Rexall Edmonton Indy was held on July 26, 2009 at the Rexall Speedway in Edmonton, Alberta, Canada.\"}" 293 | }, 294 | { 295 | "prompt": "placeholder", 296 | "response": "{\"answer\": \"Dario Franchitti\"}" 297 | }, 298 | { 299 | "prompt": "placeholder", 300 | "response": "{\"answer\": \"Starbase General Manager\"}" 301 | }, 302 | { 303 | "prompt": "placeholder", 304 | "response": "{\"answer\": \"Developing and implementing commercial crew vehicles, such as SpaceX's Dragon spacecraft, and managing the integration of these vehicles into NASA's overall human spaceflight program.\"}" 305 | }, 306 | { 307 | "prompt": "placeholder", 308 | "response": "{\"answer\": \"AIB1, SRC-3, TRAM-1\"}" 309 | }, 310 | { 311 | "prompt": "placeholder", 312 | "response": "{\"answer\": \"The specific function of the NCOA3 protein in human cells is to act as a coactivator for various transcription factors, enhancing their activity and regulating gene expression.\"}" 313 | }, 314 | { 315 | "prompt": "placeholder", 316 | "response": "{\"answer\": \"76,517\"}" 317 | }, 318 | { 319 | "prompt": "placeholder", 320 | "response": "{\"answer\": \"The Gereja Kayu Church, built in 1640 in Jakarta.\"}" 321 | }, 322 | { 323 | "prompt": "placeholder", 324 | "response": "{\"answer\": \"1996\"}" 325 | }, 326 | { 327 | "prompt": "placeholder", 328 | "response": "{\"answer\": \"Not available with the given context\"}" 329 | }, 330 | { 331 | "prompt": "placeholder", 332 | "response": "{\"answer\": \"Justin Hayward and John Lodge\"}" 333 | }, 334 | { 335 | "prompt": "placeholder", 336 | "response": "{\"answer\": \"The lyrics of 'Gemini Dream' were likely inspired by the songwriters' personal experiences, emotions, and observations, as well as their creative vision for the song.\"}" 337 | }, 338 | { 339 | "prompt": "placeholder", 340 | "response": "{\"answer\": \"two\"}" 341 | }, 342 | { 343 | "prompt": "placeholder", 344 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the legislation sponsored by E. S. Johnny Walker during his time in Congress. 
It's possible that this information is not publicly available or has been lost over time.\"}" 345 | }, 346 | { 347 | "prompt": "placeholder", 348 | "response": "{\"answer\": \"January 15, 2013\"}" 349 | }, 350 | { 351 | "prompt": "placeholder", 352 | "response": "{\"answer\": \"Unfortunately, I couldn't find any information about the current number of active contributors for Wikivoyage.\"}" 353 | }, 354 | { 355 | "prompt": "placeholder", 356 | "response": "{\"answer\": \"Dawson Walker became manager of the Scotland national football team in 1958 due to the injury of the official manager, Matt Busby, in the Munich air disaster.\"}" 357 | }, 358 | { 359 | "prompt": "placeholder", 360 | "response": "{\"answer\": \"4-1\"}" 361 | }, 362 | { 363 | "prompt": "placeholder", 364 | "response": "{\"answer\": \"The San Diego Padres\"}" 365 | }, 366 | { 367 | "prompt": "placeholder", 368 | "response": "{\"answer\": \"Unknown\"}" 369 | }, 370 | { 371 | "prompt": "placeholder", 372 | "response": "{\"answer\": \"Yasmine Bleeth and Richard Grieco were the stars of the 1996 movie Heaven or Vegas.\"}" 373 | }, 374 | { 375 | "prompt": "placeholder", 376 | "response": "{\"answer\": \"Unfortunately, I couldn't find any reliable sources that provide the exact box office performance of Heaven or Vegas. The movie received mixed reviews from critics and audiences, but it seems to be a relatively obscure film with limited information available about its commercial success.\"}" 377 | }, 378 | { 379 | "prompt": "placeholder", 380 | "response": "{\"answer\": \"approximately three hundred golf courses\"}" 381 | }, 382 | { 383 | "prompt": "placeholder", 384 | "response": "{\"answer\": \"Northwood Club in Dallas, Texas\"}" 385 | }, 386 | { 387 | "prompt": "placeholder", 388 | "response": "{\"answer\": \"The Mandalay Bay Events Center on the Las Vegas Strip in Nevada.\"}" 389 | }, 390 | { 391 | "prompt": "placeholder", 392 | "response": "{\"answer\": \"Randy Couture won the main event of UFC 58: USA vs. Canada\"}" 393 | }, 394 | { 395 | "prompt": "placeholder", 396 | "response": "{\"answer\": \"Timothy Zahn\"}" 397 | }, 398 | { 399 | "prompt": "placeholder", 400 | "response": "{\"answer\": \"The plot of Survivor's Quest follows Mara Jade, a former assassin and wife of Luke Skywalker, as she navigates her new role as a Jedi Master while dealing with the aftermath of the Yuuzhan Vong War.\"}" 401 | }, 402 | { 403 | "prompt": "placeholder", 404 | "response": "{\"answer\": \"William Elford Leach\"}" 405 | }, 406 | { 407 | "prompt": "placeholder", 408 | "response": "{\"answer\": \"Unknown\"}" 409 | }, 410 | { 411 | "prompt": "placeholder", 412 | "response": "{\"answer\": \"The carnelian cowrie.\"}" 413 | }, 414 | { 415 | "prompt": "placeholder", 416 | "response": "{\"answer\": \"The average size of a Lyncina carneola shell is approximately 2-3 centimeters (0.8-1.2 inches) in length.\"}" 417 | }, 418 | { 419 | "prompt": "placeholder", 420 | "response": "{\"answer\": \"Rod Laver Arena in Melbourne, Australia\"}" 421 | }, 422 | { 423 | "prompt": "placeholder", 424 | "response": "{\"answer\": \"Grant Hackett\"}" 425 | }, 426 | { 427 | "prompt": "placeholder", 428 | "response": "{\"answer\": \"54 years\"}" 429 | }, 430 | { 431 | "prompt": "placeholder", 432 | "response": "{\"answer\": \"Unfortunately, there is not enough information provided to determine Frederick E. 
Goodrich's most notable article during his career at The Boston Post.\"}" 433 | }, 434 | { 435 | "prompt": "placeholder", 436 | "response": "{\"answer\": \"Eight\"}" 437 | }, 438 | { 439 | "prompt": "placeholder", 440 | "response": "{\"answer\": \"The evolutionary history of the Wunderlichioideae subfamily is thought to involve an origin in South America, with subsequent dispersal events to other regions. The distinctive morphological features and genetic characteristics of its members suggest a complex evolutionary history involving significant adaptations and diversification events.\"}" 441 | }, 442 | { 443 | "prompt": "placeholder", 444 | "response": "{\"answer\": \"a miniature gold scimitar on a chain around their necks\"}" 445 | }, 446 | { 447 | "prompt": "placeholder", 448 | "response": "{\"answer\": \"Unable to determine the number of active members due to lack of relevant data.\"}" 449 | }, 450 | { 451 | "prompt": "placeholder", 452 | "response": "{\"answer\": \"Kemar Donaldson\"}" 453 | }, 454 | { 455 | "prompt": "placeholder", 456 | "response": "{\"answer\": \"Unfortunately, I couldn't find the exact number of albums Kranium has released, as this information is not readily available.\"}" 457 | } 458 | ], 459 | "failed_responses": [] 460 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/batch-9-13-24/RAGAS-BATCH-llama3-8b-instruct-Modal.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "RAGAS", 3 | "model_name": "llama3-8b-instruct-Modal", 4 | "prompting_method": "fstring", 5 | "num_successes": 108, 6 | "num_attempts": 112, 7 | "success_rate": 0.9642857142857143, 8 | "total_time": 7, 9 | "all_responses": [ 10 | { 11 | "prompt": "placeholder", 12 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 13 | }, 14 | { 15 | "prompt": "placeholder", 16 | "response": "{\"faithfulness_score\": 3.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 17 | }, 18 | { 19 | "prompt": "placeholder", 20 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 21 | }, 22 | { 23 | "prompt": "placeholder", 24 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 25 | }, 26 | { 27 | "prompt": "placeholder", 28 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 29 | }, 30 | { 31 | "prompt": "placeholder", 32 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 33 | }, 34 | { 35 | "prompt": "placeholder", 36 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 37 | }, 38 | { 39 | "prompt": "placeholder", 40 | "response": "{\"faithfulness_score\": 0.0, \"answer_relevance_score\": 0.0, \"context_relevance_score\": 5.0}" 41 | }, 42 | { 43 | "prompt": "placeholder", 44 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 45 | }, 46 | { 47 | "prompt": "placeholder", 48 | "response": "{\"faithfulness_score\": 1.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 49 | }, 50 | { 51 | "prompt": "placeholder", 52 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 53 | }, 54 | { 55 | 
"prompt": "placeholder", 56 | "response": "{\"faithfulness_score\": 0.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 57 | }, 58 | { 59 | "prompt": "placeholder", 60 | "response": "{\"faithfulness_score\": 5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 61 | }, 62 | { 63 | "prompt": "placeholder", 64 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 65 | }, 66 | { 67 | "prompt": "placeholder", 68 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 69 | }, 70 | { 71 | "prompt": "placeholder", 72 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 73 | }, 74 | { 75 | "prompt": "placeholder", 76 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 77 | }, 78 | { 79 | "prompt": "placeholder", 80 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 4.0, \"context_relevance_score\": 5.0}" 81 | }, 82 | { 83 | "prompt": "placeholder", 84 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 85 | }, 86 | { 87 | "prompt": "placeholder", 88 | "response": "{\"faithfulness_score\": 3.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 89 | }, 90 | { 91 | "prompt": "placeholder", 92 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 93 | }, 94 | { 95 | "prompt": "placeholder", 96 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 97 | }, 98 | { 99 | "prompt": "placeholder", 100 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 101 | }, 102 | { 103 | "prompt": "placeholder", 104 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 105 | }, 106 | { 107 | "prompt": "placeholder", 108 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 109 | }, 110 | { 111 | "prompt": "placeholder", 112 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 113 | }, 114 | { 115 | "prompt": "placeholder", 116 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 117 | }, 118 | { 119 | "prompt": "placeholder", 120 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 121 | }, 122 | { 123 | "prompt": "placeholder", 124 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 125 | }, 126 | { 127 | "prompt": "placeholder", 128 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 129 | }, 130 | { 131 | "prompt": "placeholder", 132 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 133 | }, 134 | { 135 | "prompt": "placeholder", 136 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 137 | }, 138 | { 139 | "prompt": "placeholder", 140 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 
5.0}" 141 | }, 142 | { 143 | "prompt": "placeholder", 144 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 4.0, \"context_relevance_score\": 5.0}" 145 | }, 146 | { 147 | "prompt": "placeholder", 148 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 149 | }, 150 | { 151 | "prompt": "placeholder", 152 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 153 | }, 154 | { 155 | "prompt": "placeholder", 156 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 157 | }, 158 | { 159 | "prompt": "placeholder", 160 | "response": "{\"faithfulness_score\": 3.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 161 | }, 162 | { 163 | "prompt": "placeholder", 164 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 165 | }, 166 | { 167 | "prompt": "placeholder", 168 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 169 | }, 170 | { 171 | "prompt": "placeholder", 172 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 173 | }, 174 | { 175 | "prompt": "placeholder", 176 | "response": "{\"faithfulness_score\": 3.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 177 | }, 178 | { 179 | "prompt": "placeholder", 180 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 181 | }, 182 | { 183 | "prompt": "placeholder", 184 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 2.0, \"context_relevance_score\": 5.0}" 185 | }, 186 | { 187 | "prompt": "placeholder", 188 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 189 | }, 190 | { 191 | "prompt": "placeholder", 192 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 2.0, \"context_relevance_score\": 5.0}" 193 | }, 194 | { 195 | "prompt": "placeholder", 196 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 197 | }, 198 | { 199 | "prompt": "placeholder", 200 | "response": "{\"faithfulness_score\": 3.0, \"answer_relevance_score\": 2.0, \"context_relevance_score\": 5.0}" 201 | }, 202 | { 203 | "prompt": "placeholder", 204 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 205 | }, 206 | { 207 | "prompt": "placeholder", 208 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 209 | }, 210 | { 211 | "prompt": "placeholder", 212 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 213 | }, 214 | { 215 | "prompt": "placeholder", 216 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 217 | }, 218 | { 219 | "prompt": "placeholder", 220 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 221 | }, 222 | { 223 | "prompt": "placeholder", 224 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 1.0, \"context_relevance_score\": 5.0}" 225 | }, 226 | { 227 | "prompt": "placeholder", 228 | "response": 
"{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 229 | }, 230 | { 231 | "prompt": "placeholder", 232 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 233 | }, 234 | { 235 | "prompt": "placeholder", 236 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 237 | }, 238 | { 239 | "prompt": "placeholder", 240 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 4.0, \"context_relevance_score\": 5.0}" 241 | }, 242 | { 243 | "prompt": "placeholder", 244 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 245 | }, 246 | { 247 | "prompt": "placeholder", 248 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 249 | }, 250 | { 251 | "prompt": "placeholder", 252 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 253 | }, 254 | { 255 | "prompt": "placeholder", 256 | "response": "{\"faithfulness_score\": 3.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 257 | }, 258 | { 259 | "prompt": "placeholder", 260 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 261 | }, 262 | { 263 | "prompt": "placeholder", 264 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 265 | }, 266 | { 267 | "prompt": "placeholder", 268 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 269 | }, 270 | { 271 | "prompt": "placeholder", 272 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 273 | }, 274 | { 275 | "prompt": "placeholder", 276 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 277 | }, 278 | { 279 | "prompt": "placeholder", 280 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 281 | }, 282 | { 283 | "prompt": "placeholder", 284 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 285 | }, 286 | { 287 | "prompt": "placeholder", 288 | "response": "{\"faithfulness_score\": 4.5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 289 | }, 290 | { 291 | "prompt": "placeholder", 292 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 293 | }, 294 | { 295 | "prompt": "placeholder", 296 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 297 | }, 298 | { 299 | "prompt": "placeholder", 300 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 301 | }, 302 | { 303 | "prompt": "placeholder", 304 | "response": "{\"faithfulness_score\": 4.8, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 305 | }, 306 | { 307 | "prompt": "placeholder", 308 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 309 | }, 310 | { 311 | "prompt": "placeholder", 312 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 
5.0}" 313 | }, 314 | { 315 | "prompt": "placeholder", 316 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 317 | }, 318 | { 319 | "prompt": "placeholder", 320 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 321 | }, 322 | { 323 | "prompt": "placeholder", 324 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 325 | }, 326 | { 327 | "prompt": "placeholder", 328 | "response": "{\"faithfulness_score\": 0.0, \"answer_relevance_score\": 0.0, \"context_relevance_score\": 1.0}" 329 | }, 330 | { 331 | "prompt": "placeholder", 332 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 333 | }, 334 | { 335 | "prompt": "placeholder", 336 | "response": "{\"faithfulness_score\": 2.5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 337 | }, 338 | { 339 | "prompt": "placeholder", 340 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 341 | }, 342 | { 343 | "prompt": "placeholder", 344 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 345 | }, 346 | { 347 | "prompt": "placeholder", 348 | "response": "{\"faithfulness_score\": 5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 349 | }, 350 | { 351 | "prompt": "placeholder", 352 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 353 | }, 354 | { 355 | "prompt": "placeholder", 356 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 357 | }, 358 | { 359 | "prompt": "placeholder", 360 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 361 | }, 362 | { 363 | "prompt": "placeholder", 364 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 365 | }, 366 | { 367 | "prompt": "placeholder", 368 | "response": "{\"faithfulness_score\": 0.0, \"answer_relevance_score\": 0.0, \"context_relevance_score\": 5.0}" 369 | }, 370 | { 371 | "prompt": "placeholder", 372 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 0.0}" 373 | }, 374 | { 375 | "prompt": "placeholder", 376 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 377 | }, 378 | { 379 | "prompt": "placeholder", 380 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 381 | }, 382 | { 383 | "prompt": "placeholder", 384 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 385 | }, 386 | { 387 | "prompt": "placeholder", 388 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 389 | }, 390 | { 391 | "prompt": "placeholder", 392 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 393 | }, 394 | { 395 | "prompt": "placeholder", 396 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 397 | }, 398 | { 399 | "prompt": "placeholder", 400 | "response": "{\"faithfulness_score\": 
5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 401 | }, 402 | { 403 | "prompt": "placeholder", 404 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 405 | }, 406 | { 407 | "prompt": "placeholder", 408 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 409 | }, 410 | { 411 | "prompt": "placeholder", 412 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 413 | }, 414 | { 415 | "prompt": "placeholder", 416 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 417 | }, 418 | { 419 | "prompt": "placeholder", 420 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 421 | }, 422 | { 423 | "prompt": "placeholder", 424 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 425 | }, 426 | { 427 | "prompt": "placeholder", 428 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 429 | }, 430 | { 431 | "prompt": "placeholder", 432 | "response": "{\"faithfulness_score\": 1.0, \"answer_relevance_score\": 4.0, \"context_relevance_score\": 5.0}" 433 | }, 434 | { 435 | "prompt": "placeholder", 436 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 437 | }, 438 | { 439 | "prompt": "placeholder", 440 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 441 | }, 442 | { 443 | "prompt": "placeholder", 444 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 445 | }, 446 | { 447 | "prompt": "placeholder", 448 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 449 | }, 450 | { 451 | "prompt": "placeholder", 452 | "response": "{\"faithfulness_score\": 5.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 453 | }, 454 | { 455 | "prompt": "placeholder", 456 | "response": "{\"faithfulness_score\": 4.0, \"answer_relevance_score\": 5.0, \"context_relevance_score\": 5.0}" 457 | } 458 | ], 459 | "failed_responses": [ 460 | { 461 | "prompt": "placeholder", 462 | "response": "{\"faithfulness_score\": 5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 463 | }, 464 | { 465 | "prompt": "placeholder", 466 | "response": "{\"faithfulness_score\": 4.5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 467 | }, 468 | { 469 | "prompt": "placeholder", 470 | "response": "{\"faithfulness_score\": 2.5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 471 | }, 472 | { 473 | "prompt": "placeholder", 474 | "response": "{\"faithfulness_score\": 5, \"answer_relevance_score\": 5, \"context_relevance_score\": 5}" 475 | } 476 | ] 477 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/batch-9-13-24/RateContext-BATCH-llama3-8b-instruct-Modal.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "RateContext", 3 | "model_name": "llama3-8b-instruct-Modal", 4 | "prompting_method": "fstring", 5 | "num_successes": 112, 6 | "num_attempts": 112, 7 | "success_rate": 1.0, 8 | "total_time": 
4, 9 | "all_responses": [ 10 | { 11 | "prompt": "placeholder", 12 | "response": "{\"context_score\": 5}" 13 | }, 14 | { 15 | "prompt": "placeholder", 16 | "response": "{\"context_score\": 3}" 17 | }, 18 | { 19 | "prompt": "placeholder", 20 | "response": "{\"context_score\": 5}" 21 | }, 22 | { 23 | "prompt": "placeholder", 24 | "response": "{\"context_score\": 5}" 25 | }, 26 | { 27 | "prompt": "placeholder", 28 | "response": "{\"context_score\": 5}" 29 | }, 30 | { 31 | "prompt": "placeholder", 32 | "response": "{\"context_score\": 2}" 33 | }, 34 | { 35 | "prompt": "placeholder", 36 | "response": "{\"context_score\": 5}" 37 | }, 38 | { 39 | "prompt": "placeholder", 40 | "response": "{\"context_score\": 2}" 41 | }, 42 | { 43 | "prompt": "placeholder", 44 | "response": "{\"context_score\": 5}" 45 | }, 46 | { 47 | "prompt": "placeholder", 48 | "response": "{\"context_score\": 0}" 49 | }, 50 | { 51 | "prompt": "placeholder", 52 | "response": "{\"context_score\": 5}" 53 | }, 54 | { 55 | "prompt": "placeholder", 56 | "response": "{\"context_score\": 0}" 57 | }, 58 | { 59 | "prompt": "placeholder", 60 | "response": "{\"context_score\": 5}" 61 | }, 62 | { 63 | "prompt": "placeholder", 64 | "response": "{\"context_score\": 2}" 65 | }, 66 | { 67 | "prompt": "placeholder", 68 | "response": "{\"context_score\": 5}" 69 | }, 70 | { 71 | "prompt": "placeholder", 72 | "response": "{\"context_score\": 5}" 73 | }, 74 | { 75 | "prompt": "placeholder", 76 | "response": "{\"context_score\": 5}" 77 | }, 78 | { 79 | "prompt": "placeholder", 80 | "response": "{\"context_score\": 0}" 81 | }, 82 | { 83 | "prompt": "placeholder", 84 | "response": "{\"context_score\": 5}" 85 | }, 86 | { 87 | "prompt": "placeholder", 88 | "response": "{\"context_score\": 1}" 89 | }, 90 | { 91 | "prompt": "placeholder", 92 | "response": "{\"context_score\": 5}" 93 | }, 94 | { 95 | "prompt": "placeholder", 96 | "response": "{\"context_score\": 0}" 97 | }, 98 | { 99 | "prompt": "placeholder", 100 | "response": "{\"context_score\": 5}" 101 | }, 102 | { 103 | "prompt": "placeholder", 104 | "response": "{\"context_score\": 4}" 105 | }, 106 | { 107 | "prompt": "placeholder", 108 | "response": "{\"context_score\": 5}" 109 | }, 110 | { 111 | "prompt": "placeholder", 112 | "response": "{\"context_score\": 2}" 113 | }, 114 | { 115 | "prompt": "placeholder", 116 | "response": "{\"context_score\": 5}" 117 | }, 118 | { 119 | "prompt": "placeholder", 120 | "response": "{\"context_score\": 5}" 121 | }, 122 | { 123 | "prompt": "placeholder", 124 | "response": "{\"context_score\": 5}" 125 | }, 126 | { 127 | "prompt": "placeholder", 128 | "response": "{\"context_score\": 2}" 129 | }, 130 | { 131 | "prompt": "placeholder", 132 | "response": "{\"context_score\": 5}" 133 | }, 134 | { 135 | "prompt": "placeholder", 136 | "response": "{\"context_score\": 5}" 137 | }, 138 | { 139 | "prompt": "placeholder", 140 | "response": "{\"context_score\": 5}" 141 | }, 142 | { 143 | "prompt": "placeholder", 144 | "response": "{\"context_score\": 2}" 145 | }, 146 | { 147 | "prompt": "placeholder", 148 | "response": "{\"context_score\": 5}" 149 | }, 150 | { 151 | "prompt": "placeholder", 152 | "response": "{\"context_score\": 2}" 153 | }, 154 | { 155 | "prompt": "placeholder", 156 | "response": "{\"context_score\": 5}" 157 | }, 158 | { 159 | "prompt": "placeholder", 160 | "response": "{\"context_score\": 4}" 161 | }, 162 | { 163 | "prompt": "placeholder", 164 | "response": "{\"context_score\": 5}" 165 | }, 166 | { 167 | "prompt": "placeholder", 168 | "response": 
"{\"context_score\": 5}" 169 | }, 170 | { 171 | "prompt": "placeholder", 172 | "response": "{\"context_score\": 5}" 173 | }, 174 | { 175 | "prompt": "placeholder", 176 | "response": "{\"context_score\": 2}" 177 | }, 178 | { 179 | "prompt": "placeholder", 180 | "response": "{\"context_score\": 5}" 181 | }, 182 | { 183 | "prompt": "placeholder", 184 | "response": "{\"context_score\": 1}" 185 | }, 186 | { 187 | "prompt": "placeholder", 188 | "response": "{\"context_score\": 5}" 189 | }, 190 | { 191 | "prompt": "placeholder", 192 | "response": "{\"context_score\": 2}" 193 | }, 194 | { 195 | "prompt": "placeholder", 196 | "response": "{\"context_score\": 5}" 197 | }, 198 | { 199 | "prompt": "placeholder", 200 | "response": "{\"context_score\": 2}" 201 | }, 202 | { 203 | "prompt": "placeholder", 204 | "response": "{\"context_score\": 5}" 205 | }, 206 | { 207 | "prompt": "placeholder", 208 | "response": "{\"context_score\": 5}" 209 | }, 210 | { 211 | "prompt": "placeholder", 212 | "response": "{\"context_score\": 5}" 213 | }, 214 | { 215 | "prompt": "placeholder", 216 | "response": "{\"context_score\": 5}" 217 | }, 218 | { 219 | "prompt": "placeholder", 220 | "response": "{\"context_score\": 5}" 221 | }, 222 | { 223 | "prompt": "placeholder", 224 | "response": "{\"context_score\": 2}" 225 | }, 226 | { 227 | "prompt": "placeholder", 228 | "response": "{\"context_score\": 5}" 229 | }, 230 | { 231 | "prompt": "placeholder", 232 | "response": "{\"context_score\": 5}" 233 | }, 234 | { 235 | "prompt": "placeholder", 236 | "response": "{\"context_score\": 5}" 237 | }, 238 | { 239 | "prompt": "placeholder", 240 | "response": "{\"context_score\": 2}" 241 | }, 242 | { 243 | "prompt": "placeholder", 244 | "response": "{\"context_score\": 5}" 245 | }, 246 | { 247 | "prompt": "placeholder", 248 | "response": "{\"context_score\": 5}" 249 | }, 250 | { 251 | "prompt": "placeholder", 252 | "response": "{\"context_score\": 5}" 253 | }, 254 | { 255 | "prompt": "placeholder", 256 | "response": "{\"context_score\": 1}" 257 | }, 258 | { 259 | "prompt": "placeholder", 260 | "response": "{\"context_score\": 5}" 261 | }, 262 | { 263 | "prompt": "placeholder", 264 | "response": "{\"context_score\": 4}" 265 | }, 266 | { 267 | "prompt": "placeholder", 268 | "response": "{\"context_score\": 5}" 269 | }, 270 | { 271 | "prompt": "placeholder", 272 | "response": "{\"context_score\": 5}" 273 | }, 274 | { 275 | "prompt": "placeholder", 276 | "response": "{\"context_score\": 5}" 277 | }, 278 | { 279 | "prompt": "placeholder", 280 | "response": "{\"context_score\": 0}" 281 | }, 282 | { 283 | "prompt": "placeholder", 284 | "response": "{\"context_score\": 5}" 285 | }, 286 | { 287 | "prompt": "placeholder", 288 | "response": "{\"context_score\": 5}" 289 | }, 290 | { 291 | "prompt": "placeholder", 292 | "response": "{\"context_score\": 5}" 293 | }, 294 | { 295 | "prompt": "placeholder", 296 | "response": "{\"context_score\": 5}" 297 | }, 298 | { 299 | "prompt": "placeholder", 300 | "response": "{\"context_score\": 5}" 301 | }, 302 | { 303 | "prompt": "placeholder", 304 | "response": "{\"context_score\": 5}" 305 | }, 306 | { 307 | "prompt": "placeholder", 308 | "response": "{\"context_score\": 5}" 309 | }, 310 | { 311 | "prompt": "placeholder", 312 | "response": "{\"context_score\": 5}" 313 | }, 314 | { 315 | "prompt": "placeholder", 316 | "response": "{\"context_score\": 5}" 317 | }, 318 | { 319 | "prompt": "placeholder", 320 | "response": "{\"context_score\": 5}" 321 | }, 322 | { 323 | "prompt": "placeholder", 324 | "response": 
"{\"context_score\": 5}" 325 | }, 326 | { 327 | "prompt": "placeholder", 328 | "response": "{\"context_score\": 1}" 329 | }, 330 | { 331 | "prompt": "placeholder", 332 | "response": "{\"context_score\": 5}" 333 | }, 334 | { 335 | "prompt": "placeholder", 336 | "response": "{\"context_score\": 3}" 337 | }, 338 | { 339 | "prompt": "placeholder", 340 | "response": "{\"context_score\": 5}" 341 | }, 342 | { 343 | "prompt": "placeholder", 344 | "response": "{\"context_score\": 2}" 345 | }, 346 | { 347 | "prompt": "placeholder", 348 | "response": "{\"context_score\": 5}" 349 | }, 350 | { 351 | "prompt": "placeholder", 352 | "response": "{\"context_score\": 0}" 353 | }, 354 | { 355 | "prompt": "placeholder", 356 | "response": "{\"context_score\": 4}" 357 | }, 358 | { 359 | "prompt": "placeholder", 360 | "response": "{\"context_score\": 3}" 361 | }, 362 | { 363 | "prompt": "placeholder", 364 | "response": "{\"context_score\": 5}" 365 | }, 366 | { 367 | "prompt": "placeholder", 368 | "response": "{\"context_score\": 0}" 369 | }, 370 | { 371 | "prompt": "placeholder", 372 | "response": "{\"context_score\": 5}" 373 | }, 374 | { 375 | "prompt": "placeholder", 376 | "response": "{\"context_score\": 4}" 377 | }, 378 | { 379 | "prompt": "placeholder", 380 | "response": "{\"context_score\": 5}" 381 | }, 382 | { 383 | "prompt": "placeholder", 384 | "response": "{\"context_score\": 4}" 385 | }, 386 | { 387 | "prompt": "placeholder", 388 | "response": "{\"context_score\": 5}" 389 | }, 390 | { 391 | "prompt": "placeholder", 392 | "response": "{\"context_score\": 5}" 393 | }, 394 | { 395 | "prompt": "placeholder", 396 | "response": "{\"context_score\": 5}" 397 | }, 398 | { 399 | "prompt": "placeholder", 400 | "response": "{\"context_score\": 5}" 401 | }, 402 | { 403 | "prompt": "placeholder", 404 | "response": "{\"context_score\": 5}" 405 | }, 406 | { 407 | "prompt": "placeholder", 408 | "response": "{\"context_score\": 1}" 409 | }, 410 | { 411 | "prompt": "placeholder", 412 | "response": "{\"context_score\": 5}" 413 | }, 414 | { 415 | "prompt": "placeholder", 416 | "response": "{\"context_score\": 5}" 417 | }, 418 | { 419 | "prompt": "placeholder", 420 | "response": "{\"context_score\": 5}" 421 | }, 422 | { 423 | "prompt": "placeholder", 424 | "response": "{\"context_score\": 5}" 425 | }, 426 | { 427 | "prompt": "placeholder", 428 | "response": "{\"context_score\": 5}" 429 | }, 430 | { 431 | "prompt": "placeholder", 432 | "response": "{\"context_score\": 2}" 433 | }, 434 | { 435 | "prompt": "placeholder", 436 | "response": "{\"context_score\": 5}" 437 | }, 438 | { 439 | "prompt": "placeholder", 440 | "response": "{\"context_score\": 5}" 441 | }, 442 | { 443 | "prompt": "placeholder", 444 | "response": "{\"context_score\": 5}" 445 | }, 446 | { 447 | "prompt": "placeholder", 448 | "response": "{\"context_score\": 2}" 449 | }, 450 | { 451 | "prompt": "placeholder", 452 | "response": "{\"context_score\": 5}" 453 | }, 454 | { 455 | "prompt": "placeholder", 456 | "response": "{\"context_score\": 3}" 457 | } 458 | ], 459 | "failed_responses": [] 460 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/aggregated_results-8-26-24.json: -------------------------------------------------------------------------------- 1 | { 2 | "GenerateAnswer": { 3 | "llama3:instruct": { 4 | "ollama": { 5 | "results-trial-1": { 6 | "dspy_total": 96, 7 | "fstring_total": 110, 8 | "total_questions": 112, 9 | "runs": 1, 10 | "dspy_average": 96.0, 
11 | "fstring_average": 110.0, 12 | "average_questions": 112.0 13 | }, 14 | "results-trial-2": { 15 | "dspy_total": 96, 16 | "fstring_total": 109, 17 | "total_questions": 112, 18 | "runs": 1, 19 | "dspy_average": 96.0, 20 | "fstring_average": 109.0, 21 | "average_questions": 112.0 22 | } 23 | } 24 | }, 25 | "gemini-1.5-pro": { 26 | "google": { 27 | "results-trial-1": { 28 | "dspy_total": 112, 29 | "fstring_total": 112, 30 | "total_questions": 112, 31 | "runs": 1, 32 | "dspy_average": 112.0, 33 | "fstring_average": 112.0, 34 | "average_questions": 112.0 35 | }, 36 | "results-trial-2": { 37 | "dspy_total": 112, 38 | "fstring_total": 112, 39 | "total_questions": 112, 40 | "runs": 1, 41 | "dspy_average": 112.0, 42 | "fstring_average": 112.0, 43 | "average_questions": 112.0 44 | } 45 | } 46 | } 47 | }, 48 | "AssessAnswerability": { 49 | "llama3:instruct": { 50 | "ollama": { 51 | "results-trial-1": { 52 | "dspy_total": 107, 53 | "fstring_total": 97, 54 | "total_questions": 112, 55 | "runs": 1, 56 | "dspy_average": 107.0, 57 | "fstring_average": 97.0, 58 | "average_questions": 112.0 59 | }, 60 | "results-trial-2": { 61 | "dspy_total": 107, 62 | "fstring_total": 103, 63 | "total_questions": 112, 64 | "runs": 1, 65 | "dspy_average": 107.0, 66 | "fstring_average": 103.0, 67 | "average_questions": 112.0 68 | } 69 | } 70 | }, 71 | "gemini-1.5-pro": { 72 | "google": { 73 | "results-trial-1": { 74 | "dspy_total": 101, 75 | "fstring_total": 101, 76 | "total_questions": 112, 77 | "runs": 1, 78 | "dspy_average": 101.0, 79 | "fstring_average": 101.0, 80 | "average_questions": 112.0 81 | }, 82 | "results-trial-2": { 83 | "dspy_total": 105, 84 | "fstring_total": 103, 85 | "total_questions": 112, 86 | "runs": 1, 87 | "dspy_average": 105.0, 88 | "fstring_average": 103.0, 89 | "average_questions": 112.0 90 | } 91 | } 92 | } 93 | }, 94 | "RateContext": { 95 | "gemini-1.5-pro": { 96 | "google": { 97 | "results-trial-1": { 98 | "dspy_total": 102, 99 | "fstring_total": 107, 100 | "total_questions": 112, 101 | "runs": 1, 102 | "dspy_average": 102.0, 103 | "fstring_average": 107.0, 104 | "average_questions": 112.0 105 | }, 106 | "results-trial-2": { 107 | "dspy_total": 100, 108 | "fstring_total": 109, 109 | "total_questions": 112, 110 | "runs": 1, 111 | "dspy_average": 100.0, 112 | "fstring_average": 109.0, 113 | "average_questions": 112.0 114 | } 115 | } 116 | }, 117 | "llama3:instruct": { 118 | "ollama": { 119 | "results-trial-1": { 120 | "dspy_total": 100, 121 | "fstring_total": 89, 122 | "total_questions": 112, 123 | "runs": 1, 124 | "dspy_average": 100.0, 125 | "fstring_average": 89.0, 126 | "average_questions": 112.0 127 | }, 128 | "results-trial-2": { 129 | "dspy_total": 100, 130 | "fstring_total": 96, 131 | "total_questions": 112, 132 | "runs": 1, 133 | "dspy_average": 100.0, 134 | "fstring_average": 96.0, 135 | "average_questions": 112.0 136 | } 137 | } 138 | } 139 | }, 140 | "ParaphraseQuestions": { 141 | "gemini-1.5-pro": { 142 | "google": { 143 | "results-trial-1": { 144 | "dspy_total": 70, 145 | "fstring_total": 112, 146 | "total_questions": 112, 147 | "runs": 1, 148 | "dspy_average": 70.0, 149 | "fstring_average": 112.0, 150 | "average_questions": 112.0 151 | }, 152 | "results-trial-2": { 153 | "dspy_total": 82, 154 | "fstring_total": 112, 155 | "total_questions": 112, 156 | "runs": 1, 157 | "dspy_average": 82.0, 158 | "fstring_average": 112.0, 159 | "average_questions": 112.0 160 | } 161 | } 162 | }, 163 | "llama3:instruct": { 164 | "ollama": { 165 | "results-trial-1": { 166 | "dspy_total": 112, 167 | 
"fstring_total": 6, 168 | "total_questions": 112, 169 | "runs": 1, 170 | "dspy_average": 112.0, 171 | "fstring_average": 6.0, 172 | "average_questions": 112.0 173 | }, 174 | "results-trial-2": { 175 | "dspy_total": 112, 176 | "fstring_total": 10, 177 | "total_questions": 112, 178 | "runs": 1, 179 | "dspy_average": 112.0, 180 | "fstring_average": 10.0, 181 | "average_questions": 112.0 182 | } 183 | } 184 | } 185 | }, 186 | "GenerateAnswersWithConfidence": { 187 | "llama3:instruct": { 188 | "ollama": { 189 | "results-trial-1": { 190 | "dspy_total": 28, 191 | "fstring_total": 104, 192 | "total_questions": 112, 193 | "runs": 1, 194 | "dspy_average": 28.0, 195 | "fstring_average": 104.0, 196 | "average_questions": 112.0 197 | }, 198 | "results-trial-2": { 199 | "dspy_total": 28, 200 | "fstring_total": 104, 201 | "total_questions": 112, 202 | "runs": 1, 203 | "dspy_average": 28.0, 204 | "fstring_average": 104.0, 205 | "average_questions": 112.0 206 | } 207 | } 208 | }, 209 | "gemini-1.5-pro": { 210 | "google": { 211 | "results-trial-1": { 212 | "dspy_total": 107, 213 | "fstring_total": 110, 214 | "total_questions": 112, 215 | "runs": 1, 216 | "dspy_average": 107.0, 217 | "fstring_average": 110.0, 218 | "average_questions": 112.0 219 | }, 220 | "results-trial-2": { 221 | "dspy_total": 102, 222 | "fstring_total": 110, 223 | "total_questions": 112, 224 | "runs": 1, 225 | "dspy_average": 102.0, 226 | "fstring_average": 110.0, 227 | "average_questions": 112.0 228 | } 229 | } 230 | } 231 | }, 232 | "GenerateAnswerWithConfidence": { 233 | "llama3:instruct": { 234 | "ollama": { 235 | "results-trial-1": { 236 | "dspy_total": 91, 237 | "fstring_total": 108, 238 | "total_questions": 112, 239 | "runs": 1, 240 | "dspy_average": 91.0, 241 | "fstring_average": 108.0, 242 | "average_questions": 112.0 243 | }, 244 | "results-trial-2": { 245 | "dspy_total": 91, 246 | "fstring_total": 111, 247 | "total_questions": 112, 248 | "runs": 1, 249 | "dspy_average": 91.0, 250 | "fstring_average": 111.0, 251 | "average_questions": 112.0 252 | } 253 | } 254 | }, 255 | "gemini-1.5-pro": { 256 | "google": { 257 | "results-trial-1": { 258 | "dspy_total": 77, 259 | "fstring_total": 87, 260 | "total_questions": 112, 261 | "runs": 1, 262 | "dspy_average": 77.0, 263 | "fstring_average": 87.0, 264 | "average_questions": 112.0 265 | }, 266 | "results-trial-2": { 267 | "dspy_total": 75, 268 | "fstring_total": 85, 269 | "total_questions": 112, 270 | "runs": 1, 271 | "dspy_average": 75.0, 272 | "fstring_average": 85.0, 273 | "average_questions": 112.0 274 | } 275 | } 276 | } 277 | } 278 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/model_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/results/experimental-results-8-26-24/model_comparison.png -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/model_comparison_results-trial-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/results/experimental-results-8-26-24/model_comparison_results-trial-1.png 
-------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/model_comparison_results-trial-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/results/experimental-results-8-26-24/model_comparison_results-trial-2.png -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/AssessAnswerability-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "AssessAnswerability", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 101, 6 | "fstring_score": 101, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/AssessAnswerability-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "AssessAnswerability", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 107, 6 | "fstring_score": 97, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/GenerateAnswer-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswer", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 112, 6 | "fstring_score": 112, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/GenerateAnswer-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswer", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 96, 6 | "fstring_score": 110, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/GenerateAnswerWithConfidence-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswerWithConfidence", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 77, 6 | "fstring_score": 87, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/GenerateAnswerWithConfidence-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswerWithConfidence", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 91, 6 | "fstring_score": 108, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/GenerateAnswersWithConfidence-gemini-1.5-pro.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswersWithConfidence", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 107, 6 | "fstring_score": 110, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/GenerateAnswersWithConfidence-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswersWithConfidence", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 28, 6 | "fstring_score": 104, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/ParaphraseQuestions-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "ParaphraseQuestions", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 70, 6 | "fstring_score": 112, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/ParaphraseQuestions-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "ParaphraseQuestions", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 112, 6 | "fstring_score": 6, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/RateContext-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "RateContext", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 102, 6 | "fstring_score": 107, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 110 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-1/RateContext-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "RateContext", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 100, 6 | "fstring_score": 89, 7 | "total_questions": 112 8 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/AssessAnswerability-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "AssessAnswerability", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 105, 6 | "fstring_score": 103, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/AssessAnswerability-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": 
"AssessAnswerability", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 107, 6 | "fstring_score": 103, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/GenerateAnswer-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswer", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 112, 6 | "fstring_score": 112, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/GenerateAnswer-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswer", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 96, 6 | "fstring_score": 109, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/GenerateAnswerWithConfidence-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswerWithConfidence", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 75, 6 | "fstring_score": 85, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/GenerateAnswerWithConfidence-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswerWithConfidence", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 91, 6 | "fstring_score": 111, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/GenerateAnswersWithConfidence-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswersWithConfidence", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 102, 6 | "fstring_score": 110, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/GenerateAnswersWithConfidence-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "GenerateAnswersWithConfidence", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 28, 6 | "fstring_score": 104, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/ParaphraseQuestions-gemini-1.5-pro.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "test_type": "ParaphraseQuestions", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 82, 6 | "fstring_score": 112, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/ParaphraseQuestions-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "ParaphraseQuestions", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 112, 6 | "fstring_score": 10, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/RateContext-gemini-1.5-pro.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "RateContext", 3 | "model_name": "gemini-1.5-pro", 4 | "model_provider": "google", 5 | "dspy_score": 100, 6 | "fstring_score": 109, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 110 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/results/experimental-results-8-26-24/trial-2/RateContext-llama3:instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_type": "RateContext", 3 | "model_name": "llama3:instruct", 4 | "model_provider": "ollama", 5 | "dspy_score": 100, 6 | "fstring_score": 96, 7 | "dspy_total_attempts": 112, 8 | "fstring_total_attempts": 112 9 | } -------------------------------------------------------------------------------- /structured_rag/run_test/run_scripts/experiment-log.md: -------------------------------------------------------------------------------- 1 | # Experiment Log 2 | 3 | | Model Name | Success Rate | Task | Prompting Method | Date Tested | 4 | |------------|-------------|------|-----------------|-------------| -------------------------------------------------------------------------------- /structured_rag/run_test/run_scripts/results/AssessAnswerability-BATCH-llama3-8b-instruct-Modal.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_name": "AssessAnswerability", 3 | "model_name": "llama3-8b-instruct-Modal", 4 | "prompting_method": "fstring", 5 | "num_successes": 112, 6 | "total_task_performance": 83, 7 | "num_attempts": 112, 8 | "success_rate": 1.0, 9 | "average_task_performance": 0.7410714285714286, 10 | "total_time": 3, 11 | "all_responses": [ 12 | { 13 | "prompt": "placeholder", 14 | "response": "{\"answerable_question\": true}" 15 | }, 16 | { 17 | "prompt": "placeholder", 18 | "response": "{\"answerable_question\": false}" 19 | }, 20 | { 21 | "prompt": "placeholder", 22 | "response": "{\"answerable_question\": true}" 23 | }, 24 | { 25 | "prompt": "placeholder", 26 | "response": "{\"answerable_question\": true}" 27 | }, 28 | { 29 | "prompt": "placeholder", 30 | "response": "{\"answerable_question\": true}" 31 | }, 32 | { 33 | "prompt": "placeholder", 34 | "response": "{\"answerable_question\": false}" 35 | }, 36 | { 37 | "prompt": "placeholder", 38 | "response": "{\"answerable_question\": true}" 39 | }, 40 | { 41 | "prompt": "placeholder", 42 | "response": "{\"answerable_question\": 
false}" 43 | }, 44 | { 45 | "prompt": "placeholder", 46 | "response": "{\"answerable_question\": true}" 47 | }, 48 | { 49 | "prompt": "placeholder", 50 | "response": "{\"answerable_question\": false}" 51 | }, 52 | { 53 | "prompt": "placeholder", 54 | "response": "{\"answerable_question\": true}" 55 | }, 56 | { 57 | "prompt": "placeholder", 58 | "response": "{\"answerable_question\": false}" 59 | }, 60 | { 61 | "prompt": "placeholder", 62 | "response": "{\"answerable_question\": true}" 63 | }, 64 | { 65 | "prompt": "placeholder", 66 | "response": "{\"answerable_question\": false}" 67 | }, 68 | { 69 | "prompt": "placeholder", 70 | "response": "{\"answerable_question\": true}" 71 | }, 72 | { 73 | "prompt": "placeholder", 74 | "response": "{\"answerable_question\": true}" 75 | }, 76 | { 77 | "prompt": "placeholder", 78 | "response": "{\"answerable_question\": true}" 79 | }, 80 | { 81 | "prompt": "placeholder", 82 | "response": "{\"answerable_question\": false}" 83 | }, 84 | { 85 | "prompt": "placeholder", 86 | "response": "{\"answerable_question\": true}" 87 | }, 88 | { 89 | "prompt": "placeholder", 90 | "response": "{\"answerable_question\": true}" 91 | }, 92 | { 93 | "prompt": "placeholder", 94 | "response": "{\"answerable_question\": true}" 95 | }, 96 | { 97 | "prompt": "placeholder", 98 | "response": "{\"answerable_question\": false}" 99 | }, 100 | { 101 | "prompt": "placeholder", 102 | "response": "{\"answerable_question\": true}" 103 | }, 104 | { 105 | "prompt": "placeholder", 106 | "response": "{\"answerable_question\": true}" 107 | }, 108 | { 109 | "prompt": "placeholder", 110 | "response": "{\"answerable_question\": true}" 111 | }, 112 | { 113 | "prompt": "placeholder", 114 | "response": "{\"answerable_question\": false}" 115 | }, 116 | { 117 | "prompt": "placeholder", 118 | "response": "{\"answerable_question\": true}" 119 | }, 120 | { 121 | "prompt": "placeholder", 122 | "response": "{\"answerable_question\": true}" 123 | }, 124 | { 125 | "prompt": "placeholder", 126 | "response": "{\"answerable_question\": true}" 127 | }, 128 | { 129 | "prompt": "placeholder", 130 | "response": "{\"answerable_question\": false}" 131 | }, 132 | { 133 | "prompt": "placeholder", 134 | "response": "{\"answerable_question\": true}" 135 | }, 136 | { 137 | "prompt": "placeholder", 138 | "response": "{\"answerable_question\": true}" 139 | }, 140 | { 141 | "prompt": "placeholder", 142 | "response": "{\"answerable_question\": true}" 143 | }, 144 | { 145 | "prompt": "placeholder", 146 | "response": "{\"answerable_question\": false}" 147 | }, 148 | { 149 | "prompt": "placeholder", 150 | "response": "{\"answerable_question\": true}" 151 | }, 152 | { 153 | "prompt": "placeholder", 154 | "response": "{\"answerable_question\": false}" 155 | }, 156 | { 157 | "prompt": "placeholder", 158 | "response": "{\"answerable_question\": true}" 159 | }, 160 | { 161 | "prompt": "placeholder", 162 | "response": "{\"answerable_question\": true}" 163 | }, 164 | { 165 | "prompt": "placeholder", 166 | "response": "{\"answerable_question\": true}" 167 | }, 168 | { 169 | "prompt": "placeholder", 170 | "response": "{\"answerable_question\": true}" 171 | }, 172 | { 173 | "prompt": "placeholder", 174 | "response": "{\"answerable_question\": true}" 175 | }, 176 | { 177 | "prompt": "placeholder", 178 | "response": "{\"answerable_question\": false}" 179 | }, 180 | { 181 | "prompt": "placeholder", 182 | "response": "{\"answerable_question\": true}" 183 | }, 184 | { 185 | "prompt": "placeholder", 186 | "response": "{\"answerable_question\": 
false}" 187 | }, 188 | { 189 | "prompt": "placeholder", 190 | "response": "{\"answerable_question\": true}" 191 | }, 192 | { 193 | "prompt": "placeholder", 194 | "response": "{\"answerable_question\": false}" 195 | }, 196 | { 197 | "prompt": "placeholder", 198 | "response": "{\"answerable_question\": true}" 199 | }, 200 | { 201 | "prompt": "placeholder", 202 | "response": "{\"answerable_question\": false}" 203 | }, 204 | { 205 | "prompt": "placeholder", 206 | "response": "{\"answerable_question\": true}" 207 | }, 208 | { 209 | "prompt": "placeholder", 210 | "response": "{\"answerable_question\": true}" 211 | }, 212 | { 213 | "prompt": "placeholder", 214 | "response": "{\"answerable_question\": true}" 215 | }, 216 | { 217 | "prompt": "placeholder", 218 | "response": "{\"answerable_question\": true}" 219 | }, 220 | { 221 | "prompt": "placeholder", 222 | "response": "{\"answerable_question\": true}" 223 | }, 224 | { 225 | "prompt": "placeholder", 226 | "response": "{\"answerable_question\": true}" 227 | }, 228 | { 229 | "prompt": "placeholder", 230 | "response": "{\"answerable_question\": true}" 231 | }, 232 | { 233 | "prompt": "placeholder", 234 | "response": "{\"answerable_question\": true}" 235 | }, 236 | { 237 | "prompt": "placeholder", 238 | "response": "{\"answerable_question\": true}" 239 | }, 240 | { 241 | "prompt": "placeholder", 242 | "response": "{\"answerable_question\": false}" 243 | }, 244 | { 245 | "prompt": "placeholder", 246 | "response": "{\"answerable_question\": true}" 247 | }, 248 | { 249 | "prompt": "placeholder", 250 | "response": "{\"answerable_question\": true}" 251 | }, 252 | { 253 | "prompt": "placeholder", 254 | "response": "{\"answerable_question\": true}" 255 | }, 256 | { 257 | "prompt": "placeholder", 258 | "response": "{\"answerable_question\": true}" 259 | }, 260 | { 261 | "prompt": "placeholder", 262 | "response": "{\"answerable_question\": true}" 263 | }, 264 | { 265 | "prompt": "placeholder", 266 | "response": "{\"answerable_question\": true}" 267 | }, 268 | { 269 | "prompt": "placeholder", 270 | "response": "{\"answerable_question\": true}" 271 | }, 272 | { 273 | "prompt": "placeholder", 274 | "response": "{\"answerable_question\": true}" 275 | }, 276 | { 277 | "prompt": "placeholder", 278 | "response": "{\"answerable_question\": true}" 279 | }, 280 | { 281 | "prompt": "placeholder", 282 | "response": "{\"answerable_question\": false}" 283 | }, 284 | { 285 | "prompt": "placeholder", 286 | "response": "{\"answerable_question\": true}" 287 | }, 288 | { 289 | "prompt": "placeholder", 290 | "response": "{\"answerable_question\": true}" 291 | }, 292 | { 293 | "prompt": "placeholder", 294 | "response": "{\"answerable_question\": true}" 295 | }, 296 | { 297 | "prompt": "placeholder", 298 | "response": "{\"answerable_question\": true}" 299 | }, 300 | { 301 | "prompt": "placeholder", 302 | "response": "{\"answerable_question\": true}" 303 | }, 304 | { 305 | "prompt": "placeholder", 306 | "response": "{\"answerable_question\": true}" 307 | }, 308 | { 309 | "prompt": "placeholder", 310 | "response": "{\"answerable_question\": true}" 311 | }, 312 | { 313 | "prompt": "placeholder", 314 | "response": "{\"answerable_question\": true}" 315 | }, 316 | { 317 | "prompt": "placeholder", 318 | "response": "{\"answerable_question\": true}" 319 | }, 320 | { 321 | "prompt": "placeholder", 322 | "response": "{\"answerable_question\": true}" 323 | }, 324 | { 325 | "prompt": "placeholder", 326 | "response": "{\"answerable_question\": true}" 327 | }, 328 | { 329 | "prompt": 
"placeholder", 330 | "response": "{\"answerable_question\": false}" 331 | }, 332 | { 333 | "prompt": "placeholder", 334 | "response": "{\"answerable_question\": true}" 335 | }, 336 | { 337 | "prompt": "placeholder", 338 | "response": "{\"answerable_question\": true}" 339 | }, 340 | { 341 | "prompt": "placeholder", 342 | "response": "{\"answerable_question\": true}" 343 | }, 344 | { 345 | "prompt": "placeholder", 346 | "response": "{\"answerable_question\": false}" 347 | }, 348 | { 349 | "prompt": "placeholder", 350 | "response": "{\"answerable_question\": true}" 351 | }, 352 | { 353 | "prompt": "placeholder", 354 | "response": "{\"answerable_question\": false}" 355 | }, 356 | { 357 | "prompt": "placeholder", 358 | "response": "{\"answerable_question\": true}" 359 | }, 360 | { 361 | "prompt": "placeholder", 362 | "response": "{\"answerable_question\": true}" 363 | }, 364 | { 365 | "prompt": "placeholder", 366 | "response": "{\"answerable_question\": true}" 367 | }, 368 | { 369 | "prompt": "placeholder", 370 | "response": "{\"answerable_question\": false}" 371 | }, 372 | { 373 | "prompt": "placeholder", 374 | "response": "{\"answerable_question\": true}" 375 | }, 376 | { 377 | "prompt": "placeholder", 378 | "response": "{\"answerable_question\": false}" 379 | }, 380 | { 381 | "prompt": "placeholder", 382 | "response": "{\"answerable_question\": true}" 383 | }, 384 | { 385 | "prompt": "placeholder", 386 | "response": "{\"answerable_question\": true}" 387 | }, 388 | { 389 | "prompt": "placeholder", 390 | "response": "{\"answerable_question\": true}" 391 | }, 392 | { 393 | "prompt": "placeholder", 394 | "response": "{\"answerable_question\": true}" 395 | }, 396 | { 397 | "prompt": "placeholder", 398 | "response": "{\"answerable_question\": true}" 399 | }, 400 | { 401 | "prompt": "placeholder", 402 | "response": "{\"answerable_question\": true}" 403 | }, 404 | { 405 | "prompt": "placeholder", 406 | "response": "{\"answerable_question\": true}" 407 | }, 408 | { 409 | "prompt": "placeholder", 410 | "response": "{\"answerable_question\": false}" 411 | }, 412 | { 413 | "prompt": "placeholder", 414 | "response": "{\"answerable_question\": true}" 415 | }, 416 | { 417 | "prompt": "placeholder", 418 | "response": "{\"answerable_question\": true}" 419 | }, 420 | { 421 | "prompt": "placeholder", 422 | "response": "{\"answerable_question\": true}" 423 | }, 424 | { 425 | "prompt": "placeholder", 426 | "response": "{\"answerable_question\": true}" 427 | }, 428 | { 429 | "prompt": "placeholder", 430 | "response": "{\"answerable_question\": true}" 431 | }, 432 | { 433 | "prompt": "placeholder", 434 | "response": "{\"answerable_question\": false}" 435 | }, 436 | { 437 | "prompt": "placeholder", 438 | "response": "{\"answerable_question\": true}" 439 | }, 440 | { 441 | "prompt": "placeholder", 442 | "response": "{\"answerable_question\": true}" 443 | }, 444 | { 445 | "prompt": "placeholder", 446 | "response": "{\"answerable_question\": true}" 447 | }, 448 | { 449 | "prompt": "placeholder", 450 | "response": "{\"answerable_question\": false}" 451 | }, 452 | { 453 | "prompt": "placeholder", 454 | "response": "{\"answerable_question\": true}" 455 | }, 456 | { 457 | "prompt": "placeholder", 458 | "response": "{\"answerable_question\": false}" 459 | } 460 | ], 461 | "failed_responses": [] 462 | } -------------------------------------------------------------------------------- /structured_rag/run_test/run_scripts/results/GenerateAnswer-BATCH-llama3.2-1b-instruct-Modal.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "test_name": "GenerateAnswer", 3 | "model_name": "llama3.2-3B-Instruct-Modal", 4 | "prompting_method": "fstring", 5 | "num_successes": 112, 6 | "total_task_performance": 547, 7 | "num_attempts": 112, 8 | "success_rate": 1.0, 9 | "average_task_performance": 4.883928571428571, 10 | "total_time": 4, 11 | "all_responses": [ 12 | { 13 | "prompt": "placeholder", 14 | "response": "{\"answer\": \"To conduct both long-term studies of the ionosphere from space and in-situ measurements of ion concentrations and temperatures.\"}" 15 | }, 16 | { 17 | "prompt": "placeholder", 18 | "response": "{\"answer\": \"Unfortunately, the information provided does not allow us to determine the exact year Explorer 20 was launched.\"}" 19 | }, 20 | { 21 | "prompt": "placeholder", 22 | "response": "{\"answer\": \"a protein\"}" 23 | }, 24 | { 25 | "prompt": "placeholder", 26 | "response": "{\"answer\": \"The specific functions of the MAP4K3 protein include regulating the activity of various enzymes involved in cell signaling pathways, particularly those related to mitogen-activated protein kinases (MAPKs).\"}" 27 | }, 28 | { 29 | "prompt": "placeholder", 30 | "response": "{\"answer\": \"1972\"}" 31 | }, 32 | { 33 | "prompt": "placeholder", 34 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the height of the Heggholmen Lighthouse. If you have more context or a reliable source, please provide it!\"}" 35 | }, 36 | { 37 | "prompt": "placeholder", 38 | "response": "{\"answer\": \"June 13, 2011\"}" 39 | }, 40 | { 41 | "prompt": "placeholder", 42 | "response": "{\"answer\": \"Unfortunately, there is no available information on the specific cases that Antonio Nachura presided over during his time as Associate Justice.\"}" 43 | }, 44 | { 45 | "prompt": "placeholder", 46 | "response": "{\"answer\": \"forward\"}" 47 | }, 48 | { 49 | "prompt": "placeholder", 50 | "response": "{\"answer\": \"Unfortunately, the provided context does not specify which teams Roman Gergel has played for. 
To provide an accurate answer, additional information would be needed.\"}" 51 | }, 52 | { 53 | "prompt": "placeholder", 54 | "response": "{\"answer\": \"Lord Hanuman, Lord Rama, Goddess Sita, Lord Lakshmana, and Lord Shiva\"}" 55 | }, 56 | { 57 | "prompt": "placeholder", 58 | "response": "{\"answer\": \"The answer is not explicitly stated in the given context.\"}" 59 | }, 60 | { 61 | "prompt": "placeholder", 62 | "response": "{\"answer\": \"The National Gallery of Art, Washington D.C., and the Pinacoteca di Brera, Milan, Italy.\"}" 63 | }, 64 | { 65 | "prompt": "placeholder", 66 | "response": "{\"answer\": \"Unfortunately, without further information or a specific mention of his most famous work, it is difficult to pinpoint Vincenzo Civerchio's most famous painting.\"}" 67 | }, 68 | { 69 | "prompt": "placeholder", 70 | "response": "{\"answer\": \"Australia\"}" 71 | }, 72 | { 73 | "prompt": "placeholder", 74 | "response": "{\"answer\": \"The Kanieae tribe includes around 150-200 species.\"}" 75 | }, 76 | { 77 | "prompt": "placeholder", 78 | "response": "{\"answer\": \"65 km/h (40 mph)\"}" 79 | }, 80 | { 81 | "prompt": "placeholder", 82 | "response": "{\"answer\": \"NOT ENOUGH CONTEXT\"}" 83 | }, 84 | { 85 | "prompt": "placeholder", 86 | "response": "{\"answer\": \"The southern half of South America.\"}" 87 | }, 88 | { 89 | "prompt": "placeholder", 90 | "response": "{\"answer\": \"Tagetes minuta is a tall upright marigold plant from the genus Tagetes, with small flowers, native to the southern half of South America.\"}" 91 | }, 92 | { 93 | "prompt": "placeholder", 94 | "response": "{\"answer\": \"Rugby Union and Rugby League\"}" 95 | }, 96 | { 97 | "prompt": "placeholder", 98 | "response": "{\"answer\": \"We cannot determine the exact number of international matches George Spencer played based on the provided information.\"}" 99 | }, 100 | { 101 | "prompt": "placeholder", 102 | "response": "{\"answer\": \"Kuopio\"}" 103 | }, 104 | { 105 | "prompt": "placeholder", 106 | "response": "{\"answer\": \"Mikko Kuivonen\"}" 107 | }, 108 | { 109 | "prompt": "placeholder", 110 | "response": "{\"answer\": \"Minister of Finance\"}" 111 | }, 112 | { 113 | "prompt": "placeholder", 114 | "response": "{\"answer\": \"None\"}" 115 | }, 116 | { 117 | "prompt": "placeholder", 118 | "response": "{\"answer\": \"Arctiinae\"}" 119 | }, 120 | { 121 | "prompt": "placeholder", 122 | "response": "{\"answer\": \"There are approximately 20-30 species in the Areva genus.\"}" 123 | }, 124 | { 125 | "prompt": "placeholder", 126 | "response": "{\"answer\": \"The Olympic Games where Amanda Doman won a silver medal were the 2004 Summer Olympics.\"}" 127 | }, 128 | { 129 | "prompt": "placeholder", 130 | "response": "{\"answer\": \"Unfortunately, the provided context does not explicitly state Amanda Doman's position in softball. However, it does mention that she won a silver medal at the 2004 Summer Olympics, which suggests that she was part of the Australian softball team. 
Without further information, we cannot determine her specific position in softball.\"}" 131 | }, 132 | { 133 | "prompt": "placeholder", 134 | "response": "{\"answer\": \"Melodramatic family dramas.\"}" 135 | }, 136 | { 137 | "prompt": "placeholder", 138 | "response": "{\"answer\": \"4\"}" 139 | }, 140 | { 141 | "prompt": "placeholder", 142 | "response": "{\"answer\": \"Eocene epoch\"}" 143 | }, 144 | { 145 | "prompt": "placeholder", 146 | "response": "{\"answer\": \"Unfortunately, it is not possible to determine the average size of Erismatopterus levatus based on the available contexts.\"}" 147 | }, 148 | { 149 | "prompt": "placeholder", 150 | "response": "{\"answer\": \"John Henderson founded Ocean Kinetics in 1992.\"}" 151 | }, 152 | { 153 | "prompt": "placeholder", 154 | "response": "{\"answer\": \"The answer is unknown, as the provided contexts do not contain the necessary information.\"}" 155 | }, 156 | { 157 | "prompt": "placeholder", 158 | "response": "{\"answer\": \"Leslie Phillips\"}" 159 | }, 160 | { 161 | "prompt": "placeholder", 162 | "response": "{\"answer\": \"The exact box office performance of Doctor in Clover is unknown, but it likely performed reasonably well given its popularity as part of the Doctor series and the fame of Kiki Dee.\"}" 163 | }, 164 | { 165 | "prompt": "placeholder", 166 | "response": "{\"answer\": \"Julia Levy-Boeken is an actress.\"}" 167 | }, 168 | { 169 | "prompt": "placeholder", 170 | "response": "{\"answer\": \"At least one\"}" 171 | }, 172 | { 173 | "prompt": "placeholder", 174 | "response": "{\"answer\": \"May 16, 1864\"}" 175 | }, 176 | { 177 | "prompt": "placeholder", 178 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the number of casualties in the Battle of Mansura. If you have more context or a reliable source, I'd be happy to help you estimate the answer!\"}" 179 | }, 180 | { 181 | "prompt": "placeholder", 182 | "response": "{\"answer\": \"Giuseppe Patania\"}" 183 | }, 184 | { 185 | "prompt": "placeholder", 186 | "response": "{\"answer\": \"Unfortunately, there is no specific information available that answers this question.\"}" 187 | }, 188 | { 189 | "prompt": "placeholder", 190 | "response": "{\"answer\": \"IRNSS-1G\"}" 191 | }, 192 | { 193 | "prompt": "placeholder", 194 | "response": "{\"answer\": \"Unfortunately, the provided contexts do not mention the expected lifespan of the NVS-01 satellite. It only provides information about its purpose and position within the NavIC constellation. To answer this question, we would need additional context or specific data from ISRO or other reliable sources.\"}" 195 | }, 196 | { 197 | "prompt": "placeholder", 198 | "response": "{\"answer\": \"The Meridian Mets played in the Mississippi State League (1921) and the Cotton States League (1922\u20131923; 1925\u20131929).\"}" 199 | }, 200 | { 201 | "prompt": "placeholder", 202 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the most famous player to play for the Meridian Mets. 
However, it's likely that one or more of the players who went on to have successful careers in Major League Baseball played for the team at some point.\"}" 203 | }, 204 | { 205 | "prompt": "placeholder", 206 | "response": "{\"answer\": \"The National Wrestling Alliance (NWA)\"}" 207 | }, 208 | { 209 | "prompt": "placeholder", 210 | "response": "{\"answer\": \"NOT ENOUGH CONTEXT\"}" 211 | }, 212 | { 213 | "prompt": "placeholder", 214 | "response": "{\"answer\": \"HC Slovan Bratislava\"}" 215 | }, 216 | { 217 | "prompt": "placeholder", 218 | "response": "{\"answer\": \"NOT ENOUGH CONTEXT\"}" 219 | }, 220 | { 221 | "prompt": "placeholder", 222 | "response": "{\"answer\": \"Maui and Hawaii\"}" 223 | }, 224 | { 225 | "prompt": "placeholder", 226 | "response": "{\"answer\": \"The lifespan of a Thyrocopa alterna moth is likely several months to a year or more.\"}" 227 | }, 228 | { 229 | "prompt": "placeholder", 230 | "response": "{\"answer\": \"Albert Tullgren\"}" 231 | }, 232 | { 233 | "prompt": "placeholder", 234 | "response": "{\"answer\": \"14 species\"}" 235 | }, 236 | { 237 | "prompt": "placeholder", 238 | "response": "{\"answer\": \"Natural Gas\"}" 239 | }, 240 | { 241 | "prompt": "placeholder", 242 | "response": "{\"answer\": \"I couldn't find the exact number of albums George Olliver released as a solo artist.\"}" 243 | }, 244 | { 245 | "prompt": "placeholder", 246 | "response": "{\"answer\": \"Adenylyl cyclase 10\"}" 247 | }, 248 | { 249 | "prompt": "placeholder", 250 | "response": "{\"answer\": \"The specific function of the ADCY10 enzyme is to catalyze the conversion of ATP into cyclic AMP (cAMP), which is an important second messenger molecule involved in various cellular signaling pathways.\"}" 251 | }, 252 | { 253 | "prompt": "placeholder", 254 | "response": "{\"answer\": \"Les Ferdinand\"}" 255 | }, 256 | { 257 | "prompt": "placeholder", 258 | "response": "{\"answer\": \"Besiktas 1, Opponent 0\"}" 259 | }, 260 | { 261 | "prompt": "placeholder", 262 | "response": "{\"answer\": \"Lamont Dozier\"}" 263 | }, 264 | { 265 | "prompt": "placeholder", 266 | "response": "{\"answer\": \"It seems that Lamont Dozier was inspired to write 'Invisible' as part of his efforts to create a successful single for Alison Moyet's debut album Alf, which was released in November 1984.\"}" 267 | }, 268 | { 269 | "prompt": "placeholder", 270 | "response": "{\"answer\": \"Steve Grimmett was the longest-running member of Grim Reaper.\"}" 271 | }, 272 | { 273 | "prompt": "placeholder", 274 | "response": "{\"answer\": \"Rock You to Hell\"}" 275 | }, 276 | { 277 | "prompt": "placeholder", 278 | "response": "{\"answer\": \"The Baltimore Orioles\"}" 279 | }, 280 | { 281 | "prompt": "placeholder", 282 | "response": "{\"answer\": \"Unfortunately, we cannot determine the exact number of years Rick Adair played in the minor leagues based on the provided context.\"}" 283 | }, 284 | { 285 | "prompt": "placeholder", 286 | "response": "{\"answer\": \"He was the Spokesperson (scientific head) of the ATLAS Collaboration\"}" 287 | }, 288 | { 289 | "prompt": "placeholder", 290 | "response": "{\"answer\": \"His work as the Spokesperson of the ATLAS Collaboration, where he oversaw the analysis of data collected by the Large Hadron Collider, and his likely focus on Higgs boson research, dark matter detection, or advancements in collider technology.\"}" 291 | }, 292 | { 293 | "prompt": "placeholder", 294 | "response": "{\"answer\": \"The 2009 Rexall Edmonton Indy was held on July 26, 2009 at the Rexall Speedway in Edmonton, 
Alberta, Canada.\"}" 295 | }, 296 | { 297 | "prompt": "placeholder", 298 | "response": "{\"answer\": \"Dario Franchitti\"}" 299 | }, 300 | { 301 | "prompt": "placeholder", 302 | "response": "{\"answer\": \"Starbase General Manager\"}" 303 | }, 304 | { 305 | "prompt": "placeholder", 306 | "response": "{\"answer\": \"Developing and implementing commercial crew vehicles, such as SpaceX's Dragon spacecraft, and managing the integration of these vehicles into NASA's overall human spaceflight program.\"}" 307 | }, 308 | { 309 | "prompt": "placeholder", 310 | "response": "{\"answer\": \"AIB1, SRC-3, TRAM-1\"}" 311 | }, 312 | { 313 | "prompt": "placeholder", 314 | "response": "{\"answer\": \"acts as a coactivator for various transcription factors, enhancing their activity and regulating gene expression\"}" 315 | }, 316 | { 317 | "prompt": "placeholder", 318 | "response": "{\"answer\": \"76,517\"}" 319 | }, 320 | { 321 | "prompt": "placeholder", 322 | "response": "{\"answer\": \"The Gereja Kayu Church, built in 1640 in Jakarta.\"}" 323 | }, 324 | { 325 | "prompt": "placeholder", 326 | "response": "{\"answer\": \"The St. Michael's Indian Residential School in Duck Lake closed in 1996.\"}" 327 | }, 328 | { 329 | "prompt": "placeholder", 330 | "response": "{\"answer\": \"Not available with the given context\"}" 331 | }, 332 | { 333 | "prompt": "placeholder", 334 | "response": "{\"answer\": \"Justin Hayward and John Lodge\"}" 335 | }, 336 | { 337 | "prompt": "placeholder", 338 | "response": "{\"answer\": \"The lyrics of 'Gemini Dream' were likely inspired by the songwriters' personal experiences, emotions, and observations, as well as their creative vision for the song.\"}" 339 | }, 340 | { 341 | "prompt": "placeholder", 342 | "response": "{\"answer\": \"Two\"}" 343 | }, 344 | { 345 | "prompt": "placeholder", 346 | "response": "{\"answer\": \"Unfortunately, I couldn't find any specific information on the legislation sponsored by E. S. Johnny Walker during his time in Congress. It's possible that this information is not publicly available or has been lost over time.\"}" 347 | }, 348 | { 349 | "prompt": "placeholder", 350 | "response": "{\"answer\": \"January 15, 2013\"}" 351 | }, 352 | { 353 | "prompt": "placeholder", 354 | "response": "{\"answer\": \"Unfortunately, I couldn't find any information about the current number of active contributors for Wikivoyage.\"}" 355 | }, 356 | { 357 | "prompt": "placeholder", 358 | "response": "{\"answer\": \"Dawson Walker became manager of the Scotland national football team in 1958 due to the injury of the official manager, Matt Busby, in the Munich air disaster.\"}" 359 | }, 360 | { 361 | "prompt": "placeholder", 362 | "response": "{\"answer\": \"Scotland had a win-loss record of 4-1 under Dawson Walker's management.\"}" 363 | }, 364 | { 365 | "prompt": "placeholder", 366 | "response": "{\"answer\": \"The San Diego Padres\"}" 367 | }, 368 | { 369 | "prompt": "placeholder", 370 | "response": "{\"answer\": \"Unknown\"}" 371 | }, 372 | { 373 | "prompt": "placeholder", 374 | "response": "{\"answer\": \"Yasmine Bleeth and Richard Grieco were the stars of the 1996 movie Heaven or Vegas.\"}" 375 | }, 376 | { 377 | "prompt": "placeholder", 378 | "response": "{\"answer\": \"Unfortunately, I couldn't find any reliable sources that provide the exact box office performance of Heaven or Vegas. 
The movie received mixed reviews from critics and audiences, but it seems to be a relatively obscure film with limited information available about its commercial success.\"}" 379 | }, 380 | { 381 | "prompt": "placeholder", 382 | "response": "{\"answer\": \"approximately three hundred\"}" 383 | }, 384 | { 385 | "prompt": "placeholder", 386 | "response": "{\"answer\": \"The Northwood Club in Dallas, Texas\"}" 387 | }, 388 | { 389 | "prompt": "placeholder", 390 | "response": "{\"answer\": \"The Mandalay Bay Events Center on the Las Vegas Strip in Nevada\"}" 391 | }, 392 | { 393 | "prompt": "placeholder", 394 | "response": "{\"answer\": \"Randy Couture won the main event of UFC 58: USA vs. Canada\"}" 395 | }, 396 | { 397 | "prompt": "placeholder", 398 | "response": "{\"answer\": \"Timothy Zahn\"}" 399 | }, 400 | { 401 | "prompt": "placeholder", 402 | "response": "{\"answer\": \"The plot of Survivor's Quest follows Mara Jade, a former assassin and wife of Luke Skywalker, as she navigates her new role as a Jedi Master while dealing with the aftermath of the Yuuzhan Vong War.\"}" 403 | }, 404 | { 405 | "prompt": "placeholder", 406 | "response": "{\"answer\": \"William Elford Leach\"}" 407 | }, 408 | { 409 | "prompt": "placeholder", 410 | "response": "{\"answer\": \"Unknown\"}" 411 | }, 412 | { 413 | "prompt": "placeholder", 414 | "response": "{\"answer\": \"The carnelian cowrie\"}" 415 | }, 416 | { 417 | "prompt": "placeholder", 418 | "response": "{\"answer\": \"The average size of a Lyncina carneola shell is approximately 2-3 centimeters (0.8-1.2 inches) in length.\"}" 419 | }, 420 | { 421 | "prompt": "placeholder", 422 | "response": "{\"answer\": \"Rod Laver Arena in Melbourne, Australia.\"}" 423 | }, 424 | { 425 | "prompt": "placeholder", 426 | "response": "{\"answer\": \"Grant Hackett\"}" 427 | }, 428 | { 429 | "prompt": "placeholder", 430 | "response": "{\"answer\": \"54 years\"}" 431 | }, 432 | { 433 | "prompt": "placeholder", 434 | "response": "{\"answer\": \"Unfortunately, there is not enough information provided to determine Frederick E. Goodrich's most notable article during his career at The Boston Post.\"}" 435 | }, 436 | { 437 | "prompt": "placeholder", 438 | "response": "{\"answer\": \"eight\"}" 439 | }, 440 | { 441 | "prompt": "placeholder", 442 | "response": "{\"answer\": \"The evolutionary history of the Wunderlichioideae subfamily is thought to involve an origin in South America, with subsequent dispersal events to other regions. 
The distinctive morphological features and genetic characteristics of its members suggest a complex evolutionary history involving significant adaptations and diversification events.\"}" 443 | }, 444 | { 445 | "prompt": "placeholder", 446 | "response": "{\"answer\": \"A miniature gold scimitar on a chain around their necks.\"}" 447 | }, 448 | { 449 | "prompt": "placeholder", 450 | "response": "{\"answer\": \"Unable to determine the number of active members due to lack of relevant data.\"}" 451 | }, 452 | { 453 | "prompt": "placeholder", 454 | "response": "{\"answer\": \"Kemar Donaldson\"}" 455 | }, 456 | { 457 | "prompt": "placeholder", 458 | "response": "{\"answer\": \"Unfortunately, I couldn't find the exact number of albums Kranium has released, as this information is not readily available.\"}" 459 | } 460 | ], 461 | "failed_responses": [] 462 | } -------------------------------------------------------------------------------- /structured_rag/run_test/run_scripts/run_batch_test.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | import os 4 | import requests 5 | import time 6 | from pydantic import BaseModel 7 | 8 | from structured_rag.run_test.utils_and_metrics.helpers import Colors, load_json_from_file 9 | from structured_rag.run_test.utils_and_metrics.metrics import is_valid_json_output, assess_answerability_metric, classification_metric 10 | from structured_rag.run_test.utils_and_metrics.metrics import GenerateAnswerTaskMetric 11 | 12 | from typing import List 13 | from pydantic import BaseModel 14 | 15 | from structured_rag.mock_gfl.fstring_prompts import get_prompt 16 | from structured_rag.models import GenerateAnswer, RateContext, AssessAnswerability, ParaphraseQuestions, RAGAS, GenerateAnswerWithConfidence, GenerateAnswersWithConfidence, ClassifyDocument 17 | from structured_rag.models import test_params 18 | from structured_rag.models import create_enum, _ClassifyDocument, _ClassifyDocumentWithRationale 19 | 20 | from structured_rag.models import Experiment, PromptWithResponse, PromptingMethod 21 | 22 | # Configuration variables 23 | url = "YOUR_MODAL_URL" 24 | openai_api_key = "sk-foobar" 25 | test_type = "AssessAnswerability" 26 | save_dir = "results" 27 | dataset_filepath = "SuperBEIR" 28 | 29 | headers = { 30 | "Content-Type": "application/json", 31 | "Authorization": "Bearer YOUR_MODAL_API_KEY", # replace with your Modal API Key 32 | } 33 | 34 | def prepare_prompts_for_llama3(prompts: List[str]) -> List[str]: 35 | prompt_preface = """<|begin_of_text|> 36 | <|start_header_id|>system<|end_header_id|> 37 | 38 | Cutting Knowledge Date: December 2023 39 | Today Date: 23 Jul 2024 40 | 41 | You are a helpful assistant<|eot_id|> 42 | <|start_header_id|>user<|end_header_id|> 43 | """ 44 | 45 | prompt_ending = """<|eot_id|> 46 | <|start_header_id|>assistant<|end_header_id|>""" 47 | 48 | # Preface each prompt and append the ending 49 | return [prompt_preface + prompt + prompt_ending for prompt in prompts] 50 | 51 | # currently doing nearly everything in this single function 52 | def run_batch_test(dataset_filepath, test_type, save_dir, with_outlines): 53 | # fix this with a CLI argument `dataset` 54 | # Leaving the hardcoded filepath 55 | if dataset_filepath == "../../../data/WikiQuestions.json": 56 | dataset = load_json_from_file(dataset_filepath) 57 | else: 58 | #dataset = load_superbeir() 59 | dataset = load_json_from_file("../../../data/SuperBEIR/SuperBEIR-small-balanced.json")[:340] 60 | 61 | # Load SuperBEIR 
categories and their descriptions 62 | with open('../../../data/SuperBEIR/SuperBEIR-categories-with-rationales.json', 'r') as file: 63 | data = json.load(file) 64 | 65 | # Create a list of dictionaries with category name and description 66 | categories = [{category: info['category_description']} for category, info in data.items()] 67 | 68 | formatted_categories = "" 69 | for category_dict in categories: 70 | for category_name, category_description in category_dict.items(): 71 | formatted_categories += f"{category_name}: {category_description}\n" 72 | 73 | # Remove the trailing newline 74 | formatted_categories = formatted_categories.rstrip() 75 | categories = list(data.keys()) 76 | 77 | 78 | # ToD, update to ablate `with_outlines` 79 | payload = { 80 | "with_outlines": True 81 | } 82 | 83 | # Get Pydantic Model to send to vLLM / Outlines 84 | if with_outlines: 85 | if test_type == "GenerateAnswer": 86 | payload["output_model"] = GenerateAnswer.schema() 87 | generate_answer_task_metric = GenerateAnswerTaskMetric(api_key=openai_api_key) 88 | elif test_type == "RateContext": 89 | payload["output_model"] = RateContext.schema() 90 | elif test_type == "AssessAnswerability": 91 | payload["output_model"] = AssessAnswerability.schema() 92 | elif test_type == "ParaphraseQuestions": 93 | payload["output_model"] = ParaphraseQuestions.schema() 94 | elif test_type == "RAGAS": 95 | payload["output_model"] = RAGAS.schema() 96 | elif test_type == "GenerateAnswerWithConfidence": 97 | payload["output_model"] = GenerateAnswerWithConfidence.schema() 98 | elif test_type == "GenerateAnswersWithConfidence": 99 | payload["output_model"] = GenerateAnswersWithConfidence.schema() 100 | elif test_type == "ClassifyDocument": 101 | ClassifyDocumentModel = _ClassifyDocument(categories) 102 | payload["output_model"] = ClassifyDocumentModel.schema() 103 | elif test_type == "ClassifyDocumentWithRationale": 104 | ClassifyDocumentWithRationale = _ClassifyDocumentWithRationale(categories) 105 | payload["output_model"] = ClassifyDocumentWithRationale.schema() 106 | 107 | # ToDo, ablate interfacing the response_format instructions with structured decoding? 
108 | 109 | prompts = [] 110 | for item in dataset: 111 | # ToDo, fix this 112 | if test_type == "ClassifyDocument" or test_type == "ClassifyDocumentWithRationale": 113 | references = {"document": item["document"], 114 | "label": item["label"], 115 | "classes_with_descriptions": formatted_categories} 116 | else: 117 | references = {"context": item["context"], 118 | "question": item["question"], 119 | "answer": item["answer"]} 120 | formatted_prompt = get_prompt(test_type, references, test_params[test_type]) 121 | prompts.append(formatted_prompt) 122 | 123 | prompts_for_llama3 = prepare_prompts_for_llama3(prompts) 124 | 125 | payload["prompts"] = prompts_for_llama3 126 | 127 | start_time = time.time() 128 | # Run all inferences 129 | response = requests.post(url, headers=headers, json=payload, timeout=3000) # Increased timeout to 5 minutes 130 | total_time = time.time() - start_time 131 | print(f"Total time taken: {total_time} seconds") 132 | print(f"Average time per task: {(total_time) / len(prompts):.2f} seconds") 133 | 134 | # check the `int` valued total_time, I don't think that's right 135 | batch_experiment = Experiment( 136 | test_name=test_type, 137 | model_name="llama3.2-3B-Instruct-Modal", 138 | prompting_method=PromptingMethod.fstring, 139 | num_successes=0, 140 | total_task_performance=0, 141 | num_attempts=0, 142 | success_rate=0, 143 | average_task_performance=0, 144 | total_time=int(total_time), 145 | all_responses=[], 146 | failed_responses=[] 147 | ) 148 | 149 | if response.status_code == 200: 150 | response_list = ast.literal_eval(response.text) 151 | results_dict = {int(result["id"]): result["answer"] for result in response_list} 152 | sorted_results = dict(sorted(results_dict.items())) 153 | for id, output in sorted_results.items(): 154 | if is_valid_json_output(output, test_type): 155 | print(f"{Colors.GREEN}Valid output:\n{output}{Colors.ENDC}") 156 | batch_experiment.num_successes += 1 157 | if test_type == "AssessAnswerability": 158 | assess_answerability_response = json.loads(output)["answerable_question"] 159 | print(f"{Colors.BOLD}Assess Answerability Response: {assess_answerability_response}{Colors.ENDC}") 160 | task_metric = assess_answerability_metric(assess_answerability_response, dataset[id]["answerable"]) 161 | print(f"{Colors.BOLD}Task Metric: {task_metric}{Colors.ENDC}") 162 | batch_experiment.total_task_performance += task_metric 163 | if test_type == "GenerateAnswer": 164 | answer_response = json.loads(output)["answer"] 165 | print(f"{Colors.BOLD}Answer Response: {answer_response}{Colors.ENDC}") 166 | print(f"{Colors.RED}Ground Truth: {dataset[id]['answer']}{Colors.ENDC}") 167 | task_metric, rationale = generate_answer_task_metric.assess_answer_metric(context=dataset[id]["context"], 168 | question=dataset[id]["question"], 169 | system_answer=answer_response, 170 | ground_truth=dataset[id]["answer"]) 171 | print(f"{Colors.BOLD}Task Metric: {task_metric}{Colors.ENDC}\n") 172 | print(f"{Colors.CYAN}Rationale: {rationale}{Colors.ENDC}") 173 | batch_experiment.total_task_performance += task_metric 174 | if test_type == "ClassifyDocument": 175 | classification_response = json.loads(output)["category"] # extend to return classification and rationale 176 | print(f"{Colors.BOLD}Classification Response: {classification_response}{Colors.ENDC}") 177 | ground_truth = dataset[id]["label"] 178 | print(f"{Colors.CYAN}Ground Truth: {ground_truth}{Colors.ENDC}") 179 | task_metric = classification_metric(classification_response, ground_truth) 180 | 
print(f"{Colors.BOLD}Task Metric: {task_metric}{Colors.ENDC}") 181 | batch_experiment.total_task_performance += task_metric 182 | if test_type == "ClassifyDocumentWithRationale": 183 | # ToDo, extend to do something with the rationale as well 184 | classification_response = json.loads(output)["category"] # extend to return classification and rationale 185 | print(f"{Colors.BOLD}Classification Response: {classification_response}{Colors.ENDC}") 186 | ground_truth = dataset[id]["label"] 187 | print(f"{Colors.CYAN}Ground Truth: {ground_truth}{Colors.ENDC}") 188 | task_metric = classification_metric(classification_response, ground_truth) 189 | print(f"{Colors.BOLD}Task Metric: {task_metric}{Colors.ENDC}") 190 | batch_experiment.total_task_performance += task_metric 191 | else: 192 | print(f"{Colors.RED}Invalid output:\n{output}{Colors.ENDC}") 193 | batch_experiment.failed_responses.append(PromptWithResponse( 194 | prompt="placeholder", 195 | response=output 196 | )) 197 | batch_experiment.num_attempts += 1 198 | batch_experiment.all_responses.append(PromptWithResponse( 199 | prompt="placeholder", 200 | response=output 201 | )) 202 | 203 | batch_experiment.success_rate = batch_experiment.num_successes / batch_experiment.num_attempts 204 | batch_experiment.average_task_performance = batch_experiment.total_task_performance / batch_experiment.num_attempts 205 | print(f"{Colors.GREEN}JSON Success rate: {batch_experiment.success_rate:.2f}{Colors.ENDC}") 206 | print(f"{Colors.GREEN}Average task performance: {batch_experiment.average_task_performance:.2f}{Colors.ENDC}") 207 | print(f"{Colors.GREEN}Time to run experiment: {total_time} seconds{Colors.ENDC}") 208 | 209 | # serialize experiment to JSON 210 | os.makedirs(save_dir, exist_ok=True) 211 | 212 | # Fix this save path 213 | batch_result_file = os.path.join(save_dir, f"{test_type}-Modal-vLLM.json") 214 | 215 | with open(batch_result_file, "w") as f: 216 | json.dump(batch_experiment.dict(), f, indent=2) 217 | 218 | print(f"\nResults saved in {batch_result_file}.") 219 | 220 | else: 221 | print(f"Error: {response.status_code}") 222 | print(response.text) 223 | 224 | if __name__ == "__main__": 225 | run_batch_test(dataset_filepath, test_type, save_dir, with_outlines=True) -------------------------------------------------------------------------------- /structured_rag/run_test/run_scripts/run_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import datetime 4 | import time 5 | 6 | from typing import Optional 7 | from pydantic import BaseModel 8 | 9 | from structured_rag.mock_gfl.dspy_program import dspy_Program 10 | from structured_rag.mock_gfl.fstring_program import fstring_Program 11 | 12 | from structured_rag.run_test.utils_and_metrics.helpers import Colors, load_json_from_file 13 | from structured_rag.run_test.utils_and_metrics.metrics import is_valid_json_output, assess_answerability_metric 14 | 15 | from structured_rag.models import Experiment, PromptWithResponse, PromptingMethod, SingleTestResult 16 | from structured_rag.models import test_params, test_to_output_model 17 | 18 | # Configuration variables 19 | MODEL_NAME = "gpt-4o" 20 | MODEL_PROVIDER = "openai" # one of: "ollama", "google", "openai", "anthropic" 21 | API_KEY = "" 22 | TEST_TYPE = "AssessAnswerability" # one of: "GenerateAnswer", "RateContext", "AssessAnswerability", "ParaphraseQuestions", "RAGAS", "RateMultipleAspects", "GenerateAnswerWithConfidence", "GenerateAnswersWithConfidence" 23 | SAVE_DIR = 
"results" 24 | 25 | def run_single_test(output_model: Optional[BaseModel], 26 | program, test_type, title, context, question, answer, task_specific_ground_truth) -> SingleTestResult: 27 | try: 28 | if test_type == "ParaphraseQuestions": 29 | output = program.forward(output_model, test_type, question=question) 30 | elif test_type == "RAGAS": 31 | output = program.forward(output_model, test_type, context, question, answer) 32 | else: 33 | output = program.forward(output_model, test_type, context, question) 34 | 35 | print(f"{Colors.CYAN}{program.__class__.__name__} Output: {output}{Colors.ENDC}\n") 36 | 37 | task_metric = 0 38 | 39 | parsed_output, is_valid = is_valid_json_output(output, test_type) 40 | 41 | if is_valid: 42 | print(f"{Colors.GREEN}Valid output for {test_type}{Colors.ENDC}") 43 | is_valid = True 44 | if test_type == "AssessAnswerability": 45 | answerable_question_response = parsed_output # not necessary, but lazy 46 | # print(f"{Colors.BOLD}Assess Answerability Response: {answerable_question_response}{Colors.ENDC}") 47 | # print(f"{Colors.CYAN}Ground truth answerability: {task_specific_ground_truth}{Colors.ENDC}\n") 48 | # print(f"Predicted type {type(answerable_question_response)}\n") 49 | # print(f"Ground truth type {type(task_specific_ground_truth)}\n") 50 | task_metric = assess_answerability_metric(answerable_question_response, task_specific_ground_truth) 51 | print(f"{Colors.BOLD}Task Metric: {task_metric}{Colors.ENDC}") 52 | else: 53 | print(f"{Colors.RED}Invalid output for {test_type}{Colors.ENDC}") 54 | 55 | return SingleTestResult(prompt_with_response=PromptWithResponse(prompt=f"Title: {title}\nContext: {context}\nQuestion: {question}", response=output), is_valid=is_valid, task_metric=task_metric) 56 | 57 | except Exception as e: 58 | print(f"{Colors.YELLOW}Error occurred: {str(e)}{Colors.ENDC}") 59 | print(f"{Colors.RED}Skipping this test due to error.{Colors.ENDC}") 60 | return SingleTestResult(prompt_with_response=PromptWithResponse(prompt=f"Title: {title}\nContext: {context}\nQuestion: {question}", response="Error"), is_valid=False, task_metric=0) 61 | 62 | def run_test(): 63 | filename = "../../../data/WikiQuestions.json" 64 | json_data = load_json_from_file(filename) 65 | 66 | print(f"{Colors.BOLD}Number of samples in the dataset: {len(json_data)}{Colors.ENDC}") 67 | 68 | if TEST_TYPE not in test_params: 69 | raise ValueError(f"Unsupported test: {TEST_TYPE}") 70 | 71 | test_to_run = test_params[TEST_TYPE] 72 | output_model = test_to_output_model[TEST_TYPE] 73 | 74 | # Define program configurations 75 | program_configs = [ 76 | # DSPy Programs 77 | { 78 | 'name': 'dspy_NO_OPRO_JSON', 79 | 'type': 'dspy', 80 | 'params': { 81 | 'use_OPRO_JSON': False, 82 | 'test_params': test_to_run, 83 | 'model_name': MODEL_NAME, 84 | 'model_provider': MODEL_PROVIDER, 85 | 'api_key': API_KEY 86 | } 87 | }, 88 | { 89 | 'name': 'dspy_WITH_OPRO_JSON', 90 | 'type': 'dspy', 91 | 'params': { 92 | 'use_OPRO_JSON': True, 93 | 'test_params': test_to_run, 94 | 'model_name': MODEL_NAME, 95 | 'model_provider': MODEL_PROVIDER, 96 | 'api_key': API_KEY 97 | } 98 | }, 99 | # f-string Programs 100 | { 101 | 'name': 'fstring_without_structured_outputs', 102 | 'type': 'fstring', 103 | 'params': { 104 | 'structured_outputs': False, 105 | 'test_params': test_to_run, 106 | 'model_name': MODEL_NAME, 107 | 'model_provider': MODEL_PROVIDER, 108 | 'api_key': API_KEY 109 | } 110 | }, 111 | { 112 | 'name': 'fstring_with_structured_outputs', 113 | 'type': 'fstring', 114 | 'params': { 115 | 
'structured_outputs': True, 116 | 'test_params': test_to_run, 117 | 'model_name': MODEL_NAME, 118 | 'model_provider': MODEL_PROVIDER, 119 | 'api_key': API_KEY 120 | } 121 | } 122 | ] 123 | 124 | total_inference_count = 0 # Total inferences across all programs 125 | 126 | # For each program configuration 127 | for program_config in program_configs: 128 | print(f"\n{Colors.BOLD}Running tests for program: {program_config['name']}{Colors.ENDC}") 129 | # Initialize the program 130 | if program_config['type'] == 'dspy': 131 | program = dspy_Program(**program_config['params']) 132 | prompting_method = PromptingMethod.dspy 133 | elif program_config['type'] == 'fstring': 134 | program = fstring_Program(**program_config['params']) 135 | prompting_method = PromptingMethod.fstring 136 | else: 137 | raise ValueError(f"Unknown program type: {program_config['type']}") 138 | 139 | # Initialize the Experiment 140 | experiment = Experiment( 141 | test_name=TEST_TYPE, 142 | model_name=MODEL_NAME, 143 | prompting_method=prompting_method, 144 | num_successes=0, 145 | total_task_performance=0, 146 | num_attempts=0, 147 | success_rate=0, 148 | average_task_performance=0, 149 | total_time=0, 150 | all_responses=[], 151 | failed_responses=[] 152 | ) 153 | 154 | total_start_time = time.time() 155 | inference_count = 0 # Inferences for this program 156 | 157 | # Loop over dataset entries 158 | for entry in json_data: 159 | title = entry.get('title', '') 160 | context = entry.get('context', '') 161 | question = entry.get('question', '') 162 | answer = entry.get('answer', '') 163 | answerable = entry.get('answerable', '') 164 | 165 | print(f"{Colors.UNDERLINE}Title: {title}{Colors.ENDC}") 166 | print(f"{Colors.UNDERLINE}Question: {question}{Colors.ENDC}\n") 167 | 168 | single_test_result = run_single_test( 169 | output_model=output_model, 170 | program=program, 171 | test_type=TEST_TYPE, 172 | title=title, 173 | context=context, 174 | question=question, 175 | answer=answer, 176 | task_specific_ground_truth=answerable 177 | ) 178 | inference_count += 1 179 | 180 | # Record the result 181 | if single_test_result: 182 | experiment.all_responses.append(single_test_result.prompt_with_response) 183 | experiment.num_attempts += 1 184 | if single_test_result.is_valid: 185 | experiment.num_successes += 1 186 | experiment.total_task_performance += single_test_result.task_metric 187 | else: 188 | experiment.failed_responses.append(single_test_result.prompt_with_response) 189 | 190 | print(f"\n{Colors.BOLD}==============={Colors.ENDC}\n") 191 | 192 | total_time = time.time() - total_start_time 193 | experiment.total_time = int(total_time) 194 | 195 | # Calculate success rate and average task performance 196 | if experiment.num_attempts > 0: 197 | experiment.success_rate = experiment.num_successes / experiment.num_attempts 198 | experiment.average_task_performance = experiment.total_task_performance / experiment.num_attempts 199 | else: 200 | experiment.success_rate = 0 201 | experiment.average_task_performance = 0 202 | 203 | # Print final scores 204 | print(f"{Colors.HEADER}Final Scores for {program_config['name']}:{Colors.ENDC}") 205 | print(f"{Colors.BOLD}JSON Success Rate: {Colors.GREEN}{experiment.num_successes}/{experiment.num_attempts} ({experiment.success_rate:.2%}){Colors.ENDC}") 206 | print(f"{Colors.BOLD}Average Task Performance: {Colors.GREEN}{experiment.average_task_performance:.2f}{Colors.ENDC}") 207 | 208 | # Save results to JSON file 209 | os.makedirs("../results/" + SAVE_DIR, exist_ok=True) 210 | current_date = 
datetime.datetime.now().strftime("%Y-%m-%d") 211 | result_file = os.path.join("../results/", f"{TEST_TYPE}-{MODEL_NAME}-{program_config['name']}-{current_date}.json") 212 | 213 | with open(result_file, "w") as f: 214 | json.dump(experiment.dict(), f, indent=2) 215 | 216 | print(f"\nResults saved in {result_file}") 217 | 218 | # Append results to experiment log 219 | with open("experiment-log.md", "a") as f: 220 | f.write(f"| {MODEL_NAME} | {experiment.success_rate:.2%} | {TEST_TYPE} | {program_config['type']} | {current_date} |\n") 221 | 222 | total_inference_count += inference_count 223 | 224 | # Print total number of inferences run 225 | print(f"{Colors.BOLD}Total number of inferences run: {total_inference_count}{Colors.ENDC}") 226 | 227 | if __name__ == "__main__": 228 | run_test() -------------------------------------------------------------------------------- /structured_rag/run_test/utils_and_metrics/__pycache__/helpers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/utils_and_metrics/__pycache__/helpers.cpython-310.pyc -------------------------------------------------------------------------------- /structured_rag/run_test/utils_and_metrics/__pycache__/metrics.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate/structured-rag/0c5174d0819db853adeeca59840a4eef957f5ac2/structured_rag/run_test/utils_and_metrics/__pycache__/metrics.cpython-310.pyc -------------------------------------------------------------------------------- /structured_rag/run_test/utils_and_metrics/count-tokens.py: -------------------------------------------------------------------------------- 1 | # Count input and output tokens per task from the `all_responses` key 2 | import json 3 | import tiktoken 4 | 5 | # Initialize tokenizer 6 | encoding = tiktoken.encoding_for_model("gpt-4") 7 | 8 | # Read results file 9 | with open("../results/results/AssessAnswerability-gpt-4o-fstring_with_structured_outputs-2024-11-29.json", "r") as f: 10 | results = json.load(f) 11 | 12 | total_input_tokens = 0 13 | total_output_tokens = 0 14 | 15 | # Process each response 16 | num_responses = len(results["all_responses"]) 17 | for response in results["all_responses"]: 18 | # Count input tokens from prompt 19 | input_tokens = len(encoding.encode(response["prompt"])) 20 | total_input_tokens += input_tokens 21 | 22 | # Count output tokens from response 23 | output_tokens = len(encoding.encode(response["response"])) 24 | total_output_tokens += output_tokens 25 | 26 | # Calculate averages 27 | avg_input_tokens = total_input_tokens / num_responses 28 | avg_output_tokens = total_output_tokens / num_responses 29 | 30 | print(f"Total input tokens: {total_input_tokens}") 31 | print(f"Total output tokens: {total_output_tokens}") 32 | print(f"Average input tokens per response: {avg_input_tokens:.1f}") 33 | print(f"Average output tokens per response: {avg_output_tokens:.1f}") 34 | -------------------------------------------------------------------------------- /structured_rag/run_test/utils_and_metrics/helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import datetime 4 | 5 | import pandas as pd 6 | 7 | from structured_rag.models import Experiment 8 | 9 | class Colors: 10 | HEADER = '\033[95m' 11 | BLUE = '\033[94m' 
12 | CYAN = '\033[96m' 13 | GREEN = '\033[92m' 14 | YELLOW = '\033[93m' 15 | RED = '\033[91m' 16 | ENDC = '\033[0m' 17 | BOLD = '\033[1m' 18 | UNDERLINE = '\033[4m' 19 | 20 | def load_json_from_file(filename): 21 | try: 22 | with open(filename, 'r') as json_file: 23 | data = json.load(json_file) 24 | return data 25 | except FileNotFoundError: 26 | print(f"{Colors.RED}Error: File '{filename}' not found.{Colors.ENDC}") 27 | return None 28 | except json.JSONDecodeError: 29 | print(f"{Colors.RED}Error: Invalid JSON format in '{filename}'.{Colors.ENDC}") 30 | return None 31 | 32 | def load_experiments(directory: str) -> pd.DataFrame: 33 | experiments = [] 34 | for filename in os.listdir(directory): 35 | if filename.endswith(".json"): 36 | with open(os.path.join(directory, filename), 'r') as f: 37 | data = json.load(f) 38 | experiment = Experiment(**data) 39 | experiments.append({ 40 | 'test_name': experiment.test_name, 41 | 'model_name': experiment.model_name, 42 | 'prompting_method': experiment.prompting_method, 43 | 'num_successes': experiment.num_successes, 44 | 'num_attempts': experiment.num_attempts, 45 | 'success_rate': experiment.success_rate, 46 | 'total_time': experiment.total_time, 47 | 'avg_response_time': experiment.total_time / experiment.num_attempts, 48 | 'failed_responses': experiment.failed_responses 49 | }) 50 | return pd.DataFrame(experiments) 51 | 52 | def count_objects_in_json_file(filename): 53 | """Loads JSON data from a file and returns the number of objects in the list.""" 54 | with open(filename, "r") as f: 55 | data = json.load(f) 56 | 57 | if isinstance(data, list): # Check if data is a list of objects 58 | return len(data) 59 | else: 60 | raise ValueError("The JSON file does not contain a list of objects.") -------------------------------------------------------------------------------- /structured_rag/run_test/utils_and_metrics/metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Tuple 3 | 4 | # This needs a refactor to validate based on the models 5 | 6 | def _validate_int_score(score, min_val=0, max_val=5): 7 | if isinstance(score, str): 8 | try: 9 | score = int(score) 10 | except ValueError: 11 | return None, False 12 | return score, isinstance(score, int) and min_val <= score <= max_val 13 | 14 | def _validate_float_score(score, min_val=0, max_val=5): 15 | if isinstance(score, str): 16 | try: 17 | score = float(score) 18 | except ValueError: 19 | return None, False 20 | return score, isinstance(score, float) and min_val <= score <= max_val 21 | 22 | def _validate_boolean(value): 23 | if isinstance(value, bool): 24 | return value, True 25 | if isinstance(value, str): 26 | lower_value = value.lower() 27 | if lower_value in ['true', 'false']: 28 | return lower_value == 'true', True 29 | return None, False 30 | 31 | def is_valid_json_output(output: Any, test_type: str) -> Tuple[Any, bool]: 32 | try: 33 | parsed = json.loads(output) 34 | if test_type == "GenerateAnswer": 35 | answer = parsed.get("answer") 36 | return parsed, isinstance(answer, str) 37 | elif test_type == "RateContext": 38 | score, is_valid = _validate_int_score(parsed.get("context_score")) 39 | return parsed if is_valid else None, is_valid 40 | elif test_type == "AssessAnswerability": 41 | answerable, is_valid = _validate_boolean(parsed.get("answerable_question")) 42 | return answerable if is_valid else None, is_valid 43 | elif test_type == "ParaphraseQuestions": 44 | questions = parsed.get("paraphrased_questions") 
45 | is_valid = isinstance(questions, list) and all(isinstance(q, str) for q in questions) 46 | return parsed if is_valid else None, is_valid 47 | elif test_type == "RAGAS": 48 | scores = ["faithfulness_score", "answer_relevance_score", "context_relevance_score"] 49 | is_valid = all(_validate_float_score(parsed.get(score))[1] for score in scores) 50 | return parsed if is_valid else None, is_valid 51 | elif test_type == "GenerateAnswerWithConfidence": 52 | answer_valid = isinstance(parsed.get("Answer"), str) 53 | confidence, confidence_valid = _validate_int_score(parsed.get("Confidence")) 54 | return parsed if answer_valid and confidence_valid else None, answer_valid and confidence_valid 55 | elif test_type == "GenerateAnswersWithConfidence": 56 | answers = parsed 57 | is_valid = isinstance(answers, list) and all( 58 | isinstance(a.get("Answer"), str) and _validate_int_score(a.get("Confidence"))[1] 59 | for a in answers 60 | ) 61 | return parsed if is_valid else None, is_valid 62 | elif test_type == "ClassifyDocument": 63 | is_valid = isinstance(parsed.get('category'), str) 64 | return parsed if is_valid else None, is_valid 65 | elif test_type == "ClassifyDocumentWithRationale": 66 | is_valid = isinstance(parsed.get('category'), str) and isinstance(parsed.get('rationale'), str) 67 | return parsed if is_valid else None, is_valid 68 | else: 69 | return None, False 70 | except json.JSONDecodeError: 71 | return None, False 72 | 73 | # Although assess_answerability_metric and classification_metric currently do the same thing, 74 | # ==> we want to extend classification_metric in the future to put probabilties on more than one class. 75 | # ==> and thus we will extend this later on as described. 76 | 77 | def assess_answerability_metric(answer: bool, ground_truth: bool) -> int: 78 | if answer == ground_truth: 79 | return 1 80 | else: 81 | return 0 82 | 83 | def classification_metric(predicted_class: str, ground_truth: str) -> int: 84 | if predicted_class == ground_truth: 85 | return 1 86 | else: 87 | return 0 88 | 89 | import dspy 90 | 91 | class AssessAnswerAlignment(dspy.Signature): 92 | """Assess the alignment between the system answer and the ground truth answer on a scale of 0 to 5.""" 93 | 94 | context: str = dspy.InputField(description="The context to use for answering the question.") 95 | question: str = dspy.InputField(description="The question to answer.") 96 | system_answer: str = dspy.InputField(description="The answer generated by the system.") 97 | ground_truth: str = dspy.InputField(description="The ground truth answer.") 98 | score_rationale: str = dspy.OutputField(description="The rationale for the alignment score. Please make it very clear why you chose this particular score and not the others.") 99 | alignment_score: int = dspy.OutputField(description="The alignment score on an integer scale of 0 to 5 between the system answer and the ground truth answer. 
0 meaning the system answer is not aligned with the ground truth answer, 5 meaning the system answer is fully aligned with the ground truth answer.") 100 | 101 | assess_answer_alignment = dspy.TypedPredictor(AssessAnswerAlignment) 102 | 103 | class GenerateAnswerTaskMetric: 104 | def __init__(self, api_key: str): 105 | self.gpt4 = dspy.OpenAI(model="gpt-4o", api_key=api_key) 106 | self.assess_answer_alignment = assess_answer_alignment 107 | 108 | def assess_answer_metric(self, context: str, question: str, system_answer: str, ground_truth: str) -> Tuple[int, str]: 109 | with dspy.context(lm=self.gpt4): 110 | metric_output = self.assess_answer_alignment(context=context, question=question, system_answer=system_answer, ground_truth=ground_truth) 111 | return metric_output.alignment_score, metric_output.score_rationale -------------------------------------------------------------------------------- /test-cost.md: -------------------------------------------------------------------------------- 1 | # Cost for running StructuredRAG tests 2 | 3 | StructuredRAG contains 112 inputs that slightly vary with the output format per task: 4 | 5 | | Task | Input Tokens | Output Tokens | 6 | |------|--------------|---------------| 7 | | `AssessAnswerability` | 14,170 | 784 | 8 | 9 | # Test Costs 10 | | Task | Model | Input Cost | Output Cost | Total Cost | 11 | |------|--------|------------|-------------|------------| 12 | | AssessAnswerability | gpt-4o | $0.35 | $0.08 | $0.43 | 13 | 14 | ## Model costs 15 | 16 | ### Per 1M Tokens 17 | 18 | | Model | Input Cost (per 1M tokens) | Output Cost (per 1M tokens) | 19 | |-------|---------------------------|----------------------------| 20 | | gpt-4o | $2.50 | $10.00 | 21 | 22 | ### Per 1K Tokens 23 | 24 | | Model | Input Cost (per 1K tokens) | Output Cost (per 1K tokens) | 25 | |-------|---------------------------|----------------------------| 26 | | gpt-4o | $0.0025 | $0.01 | 27 | 28 | --------------------------------------------------------------------------------
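The dollar figures in `test-cost.md` follow directly from multiplying token counts by the per-token rates. As a quick reference, below is a minimal sketch (not part of the repository) of that calculation in Python. The helper name is hypothetical, the rate defaults are the gpt-4o values listed in the tables above, and the token counts in the example call are purely illustrative — substitute totals measured with `count-tokens.py` for a real results file.

```python
# Hypothetical helper (not part of the repository): estimate the dollar cost of a
# test run from measured token counts and per-1M-token rates, defaulting to the
# gpt-4o rates listed above ($2.50 input / $10.00 output per 1M tokens).
def estimate_test_cost(input_tokens: int, output_tokens: int,
                       input_rate_per_1m: float = 2.50,
                       output_rate_per_1m: float = 10.00) -> float:
    """Return the estimated cost in USD for one test run."""
    input_cost = (input_tokens / 1_000_000) * input_rate_per_1m
    output_cost = (output_tokens / 1_000_000) * output_rate_per_1m
    return input_cost + output_cost

# Example with illustrative token counts (replace with totals from count-tokens.py).
print(f"Estimated cost: ${estimate_test_cost(100_000, 5_000):.2f}")  # -> $0.30
```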