├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── aml-tutorial
│   ├── ReadMe.md
│   ├── components
│   │   ├── jsonl_guidance_component.yaml
│   │   ├── jsonl_score_multiplechoice_component.yaml
│   │   └── src
│   │       ├── jsonl_guidance_aoai.py
│   │       └── jsonl_score_multiplechoice.py
│   ├── create_dataset.py
│   ├── environments
│   │   └── promptbase-basic-env.yaml
│   ├── guidance_programs
│   │   └── zero_shot.py
│   ├── other_configs_example.json
│   ├── requirements.txt
│   └── run_experiment.py
├── azureml
│   ├── ReadMe.md
│   ├── components
│   │   ├── jsonl_embeddings_aoai_component.yaml
│   │   ├── jsonl_filter_correct_multiplechoice_component.yaml
│   │   ├── jsonl_guidance_component.yaml
│   │   ├── jsonl_guidance_phi2_component.yaml
│   │   ├── jsonl_key_filter_component.yaml
│   │   ├── jsonl_key_rename_component.yaml
│   │   ├── jsonl_knn_cosine_similarity_component.yaml
│   │   ├── jsonl_mmlu_fetch_component.yaml
│   │   ├── jsonl_random_examples_component.yaml
│   │   ├── jsonl_schema_checker_component.yaml
│   │   ├── jsonl_score_biosbias_json_component.yaml
│   │   ├── jsonl_score_multiplechoice_component.yaml
│   │   ├── jsonl_to_json_component.yaml
│   │   ├── src
│   │   │   ├── jsonl_embeddings_aoai.py
│   │   │   ├── jsonl_filter_correct_multiplechoice.py
│   │   │   ├── jsonl_guidance_aoai.py
│   │   │   ├── jsonl_guidance_phi2.py
│   │   │   ├── jsonl_key_filter.py
│   │   │   ├── jsonl_key_rename.py
│   │   │   ├── jsonl_knn_cosine_similarity.py
│   │   │   ├── jsonl_mmlu_fetch.py
│   │   │   ├── jsonl_random_examples.py
│   │   │   ├── jsonl_schema_check.py
│   │   │   ├── jsonl_score_biosbias_json.py
│   │   │   ├── jsonl_score_multiplechoice.py
│   │   │   └── jsonl_to_json.py
│   │   └── uri_folder_to_file_component.yaml
│   ├── environments
│   │   ├── phi2transformer-env.yaml
│   │   └── promptbase-env.yaml
│   ├── json_schemas
│   │   ├── multichoice_schema.json
│   │   └── multiplechoice_cot_schema.json
│   ├── pipelines
│   │   ├── azureml_pipelines.py
│   │   ├── azureml_utils.py
│   │   ├── configs.py
│   │   ├── configs
│   │   │   ├── aml_config_template.yaml
│   │   │   ├── aoai_config_template.yaml
│   │   │   ├── biosbias_json_config.yaml
│   │   │   ├── biosbias_json_phi2_config.yaml
│   │   │   ├── fewshot_knn_config.yaml
│   │   │   ├── fewshot_random_config.yaml
│   │   │   ├── knn_fewshot_cot_config.yaml
│   │   │   ├── knn_fewshot_cot_ensemble_config.yaml
│   │   │   ├── random_fewshot_cot_config.yaml
│   │   │   ├── zeroshot_config.yaml
│   │   │   └── zeroshot_cot_config.yaml
│   │   ├── constants.py
│   │   ├── logging_utils.py
│   │   ├── submit_mmlu_fewshot_knn_cot.py
│   │   ├── submit_mmlu_fewshot_random_cot.py
│   │   ├── submit_mmlu_knn_fewshot.py
│   │   ├── submit_mmlu_random_fewshot.py
│   │   ├── submit_mmlu_zeroshot.py
│   │   ├── submit_mmlu_zeroshot_cot.py
│   │   ├── submit_simple_biosbias_json.py
│   │   └── submit_simple_biosbias_json_phi2.py
│   └── requirements.txt
├── guidance_programs
│   ├── fewshot.py
│   ├── fewshot_as_conversation.py
│   ├── fewshot_cot_as_conversation.py
│   ├── fewshot_cot_as_conversation_ensemble.py
│   ├── simple_biosbias_json.py
│   ├── simple_biosbias_json_completion.py
│   ├── simple_biosbias_json_completion_v2.py
│   ├── zero_or_few_shot.py
│   ├── zero_or_few_shot_alpha.py
│   ├── zero_or_few_shot_expert.py
│   ├── zero_or_few_shot_fortran.py
│   └── zero_shot_cot.py
├── images
│   ├── medprompt_radar.png
│   ├── medprompt_sa_graphic.png
│   └── mmlu_accuracy_ablation.png
└── src
    ├── promptbase
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── bigbench
    │   │   ├── __init__.py
    │   │   ├── bigbench.py
    │   │   ├── bigbench_answer.py
    │   │   ├── bigbench_cot.py
    │   │   ├── bigbench_score.py
    │   │   └── consts.py
    │   ├── datasets
    │   │   └── put_datasets_here.txt
    │   ├── drop
    │   │   ├── __init__.py
    │   │   └── drop.py
    │   ├── format
    │   │   ├── format_hellaswag.py
    │   │   └── format_mmlu.py
    │   ├── generations
    │   │   └── README.md
    │   ├── gsm8k
    │   │   ├── __init__.py
    │   │   └── gsm8k.py
    │   ├── humaneval
    │   │   ├── __init__.py
    │   │   └── humaneval.py
    │   ├── math
    │   │   ├── __init__.py
    │   │   └── math.py
    │   ├── mmlu
    │   │   ├── MMLU.py
    │   │   ├── __init__.py
    │   │   ├── analyze.py
    │   │   ├── embed_problems.py
    │   │   ├── eval.py
    │   │   ├── experiment.py
    │   │   ├── generate.py
    │   │   ├── mmlu_paths.py
    │   │   ├── print_results.py
    │   │   ├── problem_utils.py
    │   │   ├── prompt_templates.py
    │   │   ├── test.py
    │   │   ├── tune_parameter
    │   │   │   ├── analyze.py
    │   │   │   └── summarize.py
    │   │   └── utils.py
    │   └── utils
    │       ├── __init__.py
    │       └── helpers.py
    └── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
**/datasets/*
**/log.md
**/warnings.log
*.pyc
*.json
*.json.gz
env/
src/*.egg-info/*
src/promptbase/generations/*
*.log
*.jsonl
src/promptbase/datasets/BigBench/**


notebooks/*

# Don't include my actual configs
azureml/pipelines/configs/aml_config.yaml
azureml/pipelines/configs/aoai_config.yaml
azureml/pipelines/configs/aoai_embedding_config.yaml

# Don't include Hydra output directory
azureml/**/outputs/*

# DO include our schema jsons
!azureml/json_schemas/*.json

# Do include the example other_config
!aml-tutorial/other_configs_example.json

--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).

--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
# TODO: The maintainer of this repo has not yet edited this file

**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?

- **No CSS support:** Fill out this template with information about how to file issues and get help.
- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.

*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*

# Support

## How to file issues and get help

This project uses GitHub Issues to track bugs and feature requests. Please search the existing
issues before filing new issues to avoid duplicates. For new issues, file your bug or
feature request as a new Issue.

For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
CHANNEL. WHERE WILL YOU HELP PEOPLE?**.

## Microsoft Support Policy

Support for this **PROJECT or PRODUCT** is limited to the resources listed above.

--------------------------------------------------------------------------------
/aml-tutorial/components/jsonl_guidance_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_guidance_aoai
version: 0.0.1pre1
display_name: JSONL Guidance Azure OpenAI
type: command
description: Runs a supplied Guidance program on every line of a JSONL file
is_deterministic: false

inputs:
  guidance_program:
    type: uri_file
    optional: false
    description: Python file containing the guidance program
  guidance_workers:
    type: integer
    optional: false
    default: 4
    description: Number of workers to use
  max_errors:
    type: integer
    optional: false
    default: 5
    description: Maximum number of failed lines to tolerate
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  azure_openai_endpoint:
    type: string
    optional: false
    description: The Azure OpenAI endpoint to call
  azure_openai_deployment:
    type: string
    optional: false
    description: The name of the deployment from the portal
  azure_openai_model:
    type: string
    optional: false
    default: gpt-3.5-turbo
    description: The OpenAI model behind the endpoint
  azure_openai_api_version:
    type: string
    optional: false
    description: The API version in use

outputs:
  output_dataset:
    type: uri_file
    description: JSONL file
  error_dataset:
    type: uri_file
    description: JSONL file containing failed lines

code: ./src/

command: >-
  python ./jsonl_guidance_aoai.py
  --guidance_program ${{ inputs.guidance_program }}
  --guidance_workers ${{ inputs.guidance_workers }}
  --max_errors ${{ inputs.max_errors }}
  --input_dataset ${{ inputs.input_dataset }}
  --azure_openai_endpoint ${{ inputs.azure_openai_endpoint }}
  --azure_openai_deployment ${{ inputs.azure_openai_deployment }}
  --azure_openai_model ${{ inputs.azure_openai_model }}
  --azure_openai_api_version ${{ inputs.azure_openai_api_version }}
  --output_dataset ${{ outputs.output_dataset }}
  --error_dataset ${{ outputs.error_dataset }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_basic@latest

--------------------------------------------------------------------------------
/aml-tutorial/components/jsonl_score_multiplechoice_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_score_multiplechoice
version: 0.0.1pre1
display_name: JSONL Multiple Choice Scorer
type: command
description: |
  Takes a JSONL file of multiple choice questions and correct answers and responses
  from a model, and produces the overall score.
  Results are stored in JSON
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  correct_key:
    type: string
    optional: false
    description: Which key contains the correct answer
  response_key:
    type: string
    optional: false
    description: Which key contains the answer produced by the model

outputs:
  output_dataset:
    type: uri_file
    description: JSON file containing score summary


code: ./src/

command: >-
  python ./jsonl_score_multiplechoice.py
  --input_dataset ${{ inputs.input_dataset }}
  --output_dataset ${{ outputs.output_dataset }}
  --correct_key ${{ inputs.correct_key }}
  --response_key ${{ inputs.response_key }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_basic@latest

--------------------------------------------------------------------------------
/aml-tutorial/components/src/jsonl_guidance_aoai.py:
--------------------------------------------------------------------------------
import argparse
import importlib.util
import json
import pathlib

from typing import Any, Callable, Dict

from azure.identity import DefaultAzureCredential, get_bearer_token_provider

import guidance

from aether_utils.jsonl_utils_multiprocessing import line_map_mp, ItemMapper
from aether_utils.logging_utils import get_standard_logger_for_file


_logger = get_standard_logger_for_file(__file__)

USER_MODULE = "user_module"
GUIDANCE_FUNCTION = "guidance_generation"


def parse_args():
    parser = argparse.ArgumentParser(add_help=True)

    # Information about the datasets
    datasets_group = parser.add_argument_group("Datasets")
    datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True)
    datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True)
    datasets_group.add_argument("--error_dataset", type=pathlib.Path, required=True)

    # Information about the guidance program
    parser.add_argument("--guidance_program", type=pathlib.Path, required=True)
    parser.add_argument("--guidance_workers", type=int, required=True)
    parser.add_argument("--max_errors", type=int, required=True)

    # Information about the model
    model_group = parser.add_argument_group("Model Endpoint")
    model_group.add_argument("--azure_openai_endpoint", type=str, required=True)
    model_group.add_argument("--azure_openai_deployment", type=str, required=True)
    model_group.add_argument("--azure_openai_model", type=str, required=True)
    model_group.add_argument("--azure_openai_api_version", type=str, required=True)

    args = parser.parse_args()
    return args


def get_guidance_function(
    program_path: pathlib.Path,
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
    _logger.info(f"Importing guidance file: {program_path}")
    spec = importlib.util.spec_from_file_location(USER_MODULE, program_path)
    module_definition = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module_definition)

    guidance_func = getattr(module_definition, GUIDANCE_FUNCTION)
    _logger.info("Guidance program imported")

    return guidance_func


class GuidanceAzureML(ItemMapper):
    def __init__(
        self,
        *,
        program_path: pathlib.Path,
        endpoint: str,
        deployment: str,
        model: str,
        api_version: str,
    ):
        super().__init__()
        self._program_path = program_path
        self._endpoint = endpoint
        self._deployment = deployment
        self._model = model
        self._api_version = api_version

    def start_up(self, worker_id: int) -> None:
        _logger.info(f"Starting up {worker_id}")
        self._guidance_function = get_guidance_function(self._program_path)
        self._azure_credential = DefaultAzureCredential()
        _logger.info(f"Start up complete {worker_id}")

    def _get_model(self) -> guidance.models.Model:
        token_provider = get_bearer_token_provider(
            self._azure_credential, "https://cognitiveservices.azure.com/.default"
        )
        assert token_provider is not None
        _logger.info("Got token_provider")

        azureai_model = guidance.models.AzureOpenAI(
            model=self._model,
            azure_endpoint=self._endpoint,
            azure_deployment=self._deployment,
            version=self._api_version,
            azure_ad_token_provider=token_provider,
        )
        _logger.info("Created AzureOpenAI model")

        return azureai_model

    def map(self, item: dict[str, Any]) -> dict[str, Any] | None:
        _logger.info(f"map: {item}")
        language_model = self._get_model()
        result = self._guidance_function(language_model, item)
        _logger.debug("Checking keys")
        for k in result.keys():
            assert k not in item, f"Duplicate key: {k}"

        _logger.info("Updating item")
        item.update(**result)

        return item


def main():
    args = parse_args()

    # Bind arguments to the processor function
    processor = GuidanceAzureML(
        program_path=args.guidance_program,
        endpoint=args.azure_openai_endpoint,
        deployment=args.azure_openai_deployment,
        model=args.azure_openai_model,
        api_version=args.azure_openai_api_version,
    )

    # Run the processing
    line_map_mp(
        mapper=processor,
        source_file=args.input_dataset,
        dest_file=args.output_dataset,
        source_encoding="utf-8-sig",
        dest_encoding="utf-8-sig",
        error_file=args.error_dataset,
        error_encoding="utf-8-sig",
        n_worker_tasks=args.guidance_workers,
        max_errors=args.max_errors,
    )

    _logger.info("Complete")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/aml-tutorial/components/src/jsonl_score_multiplechoice.py:
--------------------------------------------------------------------------------
import argparse
import functools
import json
import pathlib

from typing import Any

import mlflow
import sklearn.metrics as skm

from aether_utils.jsonl_utils import line_reduce
from aether_utils.logging_utils import get_standard_logger_for_file

_logger = get_standard_logger_for_file(__file__)


class Scorer:
    def __init__(self, correct_key: str, response_key: str):
        self.y_true = []
        self.y_pred = []
        self.correct_key = correct_key
        self.response_key = response_key

    def __call__(self, line: dict[str, Any]):
        correct_answer = line[self.correct_key]
        response_answer = line[self.response_key]
        self.y_true.append(correct_answer)
        self.y_pred.append(response_answer)

    def generate_summary(self) -> dict[str, Any]:
        result = dict()
        result["count"] = len(self.y_true)
        result["accuracy"] = skm.accuracy_score(self.y_true, self.y_pred)
        result["n_correct"] = skm.accuracy_score(
            self.y_true, self.y_pred, normalize=False
        )
        return result


def parse_args():
    parser = argparse.ArgumentParser(add_help=True)

    # Information about the ports
    ports_group = parser.add_argument_group("Ports")
    ports_group.add_argument("--input_dataset", type=pathlib.Path, required=True)
    ports_group.add_argument("--output_dataset", type=pathlib.Path, required=True)

    # Information about the keys
    keys_group = parser.add_argument_group("Keys")
    keys_group.add_argument("--correct_key", type=str, required=True)
    keys_group.add_argument("--response_key", type=str, required=True)

    args = parser.parse_args()

    return args


def main():
    args = parse_args()

    scorer = Scorer(correct_key=args.correct_key, response_key=args.response_key)
    line_reduce(
        reducer=scorer,
        source_file=args.input_dataset,
        source_encoding="utf-8-sig",
    )
    summary = scorer.generate_summary()

    _logger.info("Logging with mlflow")
    mlflow.log_metrics(summary)
    _logger.info("Writing output file")

    with open(args.output_dataset, encoding="utf-8-sig", mode="w") as jf:
        json.dump(summary, jf, indent=4)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/aml-tutorial/create_dataset.py:
--------------------------------------------------------------------------------
import argparse
import pathlib
import tempfile
import time

from typing import Any

import datasets

from azure.ai.ml import MLClient
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data

from azure.identity import DefaultAzureCredential

from aether_utils.jsonl_file_utils import save_jsonl
from aether_utils.logging_utils import get_standard_logger_for_file

_logger = get_standard_logger_for_file(__file__)

MMLU_DATASETS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
"high_school_macroeconomics", 47 | "high_school_mathematics", 48 | "high_school_microeconomics", 49 | "high_school_physics", 50 | "high_school_psychology", 51 | "high_school_statistics", 52 | "high_school_us_history", 53 | "high_school_world_history", 54 | "human_aging", 55 | "human_sexuality", 56 | "international_law", 57 | "jurisprudence", 58 | "logical_fallacies", 59 | "machine_learning", 60 | "management", 61 | "marketing", 62 | "medical_genetics", 63 | "miscellaneous", 64 | "moral_disputes", 65 | "moral_scenarios", 66 | "nutrition", 67 | "philosophy", 68 | "prehistory", 69 | "professional_accounting", 70 | "professional_law", 71 | "professional_medicine", 72 | "professional_psychology", 73 | "public_relations", 74 | "security_studies", 75 | "sociology", 76 | "us_foreign_policy", 77 | "virology", 78 | "world_religions", 79 | ] 80 | 81 | SPLITS = ["test", "validation", "dev"] 82 | 83 | 84 | def parse_args(): 85 | parser = argparse.ArgumentParser(add_help=True) 86 | 87 | mmlu_group = parser.add_argument_group( 88 | "MMLU Information", description="Options pertaining to the data" 89 | ) 90 | mmlu_group.add_argument( 91 | "--mmlu_dataset", 92 | type=str, 93 | choices=MMLU_DATASETS, 94 | required=True, 95 | help="The name of the desired MMLU dataset", 96 | ) 97 | mmlu_group.add_argument( 98 | "--split", 99 | type=str, 100 | choices=SPLITS, 101 | default="validation", 102 | help="Which of the splits to use", 103 | ) 104 | 105 | aml_group = parser.add_argument_group( 106 | "AzureML Information", description="Options pertaining to AzureML" 107 | ) 108 | aml_group.add_argument( 109 | "--workspace_config", 110 | type=pathlib.Path, 111 | default=pathlib.Path("./config.json"), 112 | help="Path to config.json downloaded from AzureML workspace", 113 | ) 114 | 115 | args = parser.parse_args() 116 | return args 117 | 118 | 119 | def process_data_split(data, subject: str) -> list[dict[str, Any]]: 120 | all_questions = [] 121 | for line in data: 122 | nxt = dict( 123 | dataset="mmlu", 124 | subject=subject, 125 | question=line["question"], 126 | choices=line["choices"], 127 | correct_answer=line["answer"], 128 | ) 129 | all_questions.append(nxt) 130 | 131 | return all_questions 132 | 133 | 134 | def main(): 135 | args = parse_args() 136 | assert args.workspace_config.exists(), f"Could not find {args.workspace_config}" 137 | 138 | _logger.info("Creating AzureML client") 139 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 140 | ml_client = MLClient.from_config(credential, path=args.workspace_config) 141 | 142 | _logger.info(f"Fetching {args.mmlu_dataset}") 143 | hf_data = datasets.load_dataset("tasksource/mmlu", args.mmlu_dataset) 144 | 145 | _logger.info(f"Reformatting data") 146 | all_questions = process_data_split(hf_data[args.split], args.mmlu_dataset) 147 | 148 | with tempfile.TemporaryDirectory() as temp_dir: 149 | out_dir = pathlib.Path(temp_dir) 150 | 151 | dataset_name = f"mmlu_{args.mmlu_dataset}_{args.split}" 152 | 153 | out_file = out_dir / f"{dataset_name}.jsonl" 154 | save_jsonl(out_file, data=all_questions, destination_encoding="utf-8-sig") 155 | 156 | aml_data = Data( 157 | name=dataset_name, 158 | version=str(int(time.time())), 159 | description="Sample multiple choice dataset", 160 | path=out_file, 161 | type=AssetTypes.URI_FILE, 162 | ) 163 | returned_data = ml_client.data.create_or_update(aml_data) 164 | _logger.info( 165 | f"Created dataset {returned_data.name} at version {returned_data.version}" 166 | ) 167 | 168 | _logger.info("Complete") 169 | 170 

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/aml-tutorial/environments/promptbase-basic-env.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

name: promptbase_basic
description: |
  A simple environment for promptbase

image: mcr.microsoft.com/azureml/inference-base-2004
conda_file:
  channels:
    - defaults
  dependencies:
    - python=3.12
    - pip
    - pip:
        - aether-utils==0.0.1.dev1
        - azure-identity
        - azure-keyvault-secrets
        - azureml-mlflow
        - guidance>0.1.5
        - jsonschema
        - mlflow
        - numpy
        - openai>=1
        - scikit-learn

--------------------------------------------------------------------------------
/aml-tutorial/guidance_programs/zero_shot.py:
--------------------------------------------------------------------------------
# This is a very naive guidance program for doing zero shot multiple choice questions
# It is not what generated the reported results

import logging
import sys

from typing import Any, Dict

import guidance
from guidance import gen, select, system, user, assistant


_logger = logging.getLogger(__file__)
_logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler(stream=sys.stdout))


@guidance
def zero_shot_multiple_choice(
    lm: guidance.models.Model,
    question: str,
    choices: list[str],
):
    # Some general instruction to the model
    with system():
        lm += """You are a student taking a multiple choice test.
You will be shown a question, followed by numbered multiple choice answers.
Respond with the number corresponding to the best answer.
29 | """ 30 | 31 | with user(): 32 | lm += question + "\n" 33 | for i, choice in enumerate(choices): 34 | lm += f"{i} : {choice}\n" 35 | lm += "Correct Answer: " 36 | 37 | with assistant(): 38 | lm += select([str(i) for i in range(len(choices))], name="string_choice") 39 | 40 | return lm 41 | 42 | 43 | def guidance_generation( 44 | lm: guidance.models.Model, 45 | input: Dict[str, Any], 46 | ) -> Dict[str, Any]: 47 | _logger.info("Starting guidance_generation") 48 | result = lm + zero_shot_multiple_choice( 49 | question=input["question"], choices=input["choices"] 50 | ) 51 | 52 | _logger.info(f"Result: {result}") 53 | 54 | result = dict(zero_shot_choice=int(result["string_choice"])) 55 | return result 56 | -------------------------------------------------------------------------------- /aml-tutorial/other_configs_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "aoai_endpoint": "https://SOMETHING.openai.azure.com/", 3 | "aoai_deployment": "A_DEPLOYMENT_NAME", 4 | "aoai_model": "gpt-4-32k", 5 | "aoai_api_version": "2024-02-01", 6 | "aoai_compute": "cluster_with_endpoint_permission", 7 | "general_compute": "any_other_cluster" 8 | } -------------------------------------------------------------------------------- /aml-tutorial/requirements.txt: -------------------------------------------------------------------------------- 1 | aether-utils==0.0.1.dev1 2 | azure-ai-ml 3 | datasets -------------------------------------------------------------------------------- /azureml/ReadMe.md: -------------------------------------------------------------------------------- 1 | # AzureML Pipelines 2 | 3 | This directory contains [AzureML pipelines](https://learn.microsoft.com/en-us/azure/machine-learning/concept-ml-pipelines?view=azureml-api-2) to run various datasets through a given Azure AI endpoints, and assess the results. 4 | The LLM prompting is done using the [`guidance` package](https://github.com/guidance-ai/guidance). 5 | It is provided as an 'extra' and was not used to generated the reported results. 6 | 7 | ## Contents 8 | 9 | - `components` 10 | This directory contains the Python [components](https://learn.microsoft.com/en-us/azure/machine-learning/concept-component?view=azureml-api-2) which are used in the AzureML pipelines 11 | - `environments` 12 | This directory contains the definition of the [AzureML environment](https://learn.microsoft.com/en-us/azure/machine-learning/concept-environments?view=azureml-api-2) shared by the various components 13 | - `pipelines` 14 | This directory contains the code required to submit the pipelines 15 | - `requirements.txt` 16 | A standard `pip` file which will install the necessary packages for the pipeline submission to work 17 | 18 | Furthermore, the actual `guidance` programs are in the top level `guidance_programs` directory in this repository. 19 | 20 | ## Preparing to submit a pipeline 21 | 22 | In order to submit a pipeline, you will need to give various pieces of information to the submission script (e.g. the AzureML workspace information). 23 | Look in the `pipelines/configs` directory, and you will see a number of `*_template.yaml` files. 24 | You will need to make copies without the '_template' suffix, and fill out the contents. 25 | For exmaple, the `aml_config_template.yaml` needs to be copied to `aml_config.yaml` (in the same directory) and filled out with appropriate information. 

## Submitting a pipeline

The pipeline submission scripts all have names prefixed with `submit_`.
To run one:
```bash
python ./submit_mmlu_zeroshot.py -cn zeroshot_config
```
where `zeroshot_config` means the `zeroshot_config.yaml` file in the `configs` directory.

--------------------------------------------------------------------------------
/azureml/components/jsonl_embeddings_aoai_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_embeddings_aoai
version: 0.0.1pre1
display_name: JSONL Embeddings Azure OpenAI
type: command
description: Get the AOAI embeddings for a given key in a JSONL file
is_deterministic: false

inputs:
  workers:
    type: integer
    optional: false
    default: 4
    description: Number of workers to use
  max_errors:
    type: integer
    optional: false
    default: 5
    description: Maximum number of failed lines to tolerate
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  azure_openai_endpoint:
    type: string
    optional: false
    description: The Azure OpenAI endpoint to call
  source_key:
    type: string
    optional: false
    description: Generate embeddings for this key
  destination_key:
    type: string
    optional: false
    description: Store embeddings in this key

  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  error_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the error dataset

outputs:
  output_dataset:
    type: uri_file
    description: JSONL file
  error_dataset:
    type: uri_file
    description: JSONL file containing failed lines

code: ./src/

command: >-
  python ./jsonl_embeddings_aoai.py
  --workers ${{ inputs.workers }}
  --max_errors ${{ inputs.max_errors }}
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --azure_openai_endpoint ${{ inputs.azure_openai_endpoint }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --error_dataset ${{ outputs.error_dataset }}
  --error_encoding ${{ inputs.error_encoding }}
  --source_key ${{ inputs.source_key }}
  --destination_key ${{ inputs.destination_key }}

environment:
  # Will be updated when component uploads
  image: azureml:guidance_aml_env@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_filter_correct_multiplechoice_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_filter_correct_multiplechoice
version: 0.0.1pre1
display_name: JSONL Multiple Choice Filter Correct
type: command
description: |
  Takes a JSONL file of multiple choice questions and correct answers and responses
  from a model, and filters out incorrect responses
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  correct_key:
    type: string
    optional: false
    description: Which key contains the correct answer
  response_key:
    type: string
    optional: false
    description: Which key contains the answer produced by the model
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset

outputs:
  output_dataset:
    type: uri_file
    description: JSONL file containing the correctly answered lines


code: ./src/

command: >-
  python ./jsonl_filter_correct_multiplechoice.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --correct_key ${{ inputs.correct_key }}
  --response_key ${{ inputs.response_key }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_guidance_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_guidance_aoai
version: 0.0.1pre1
display_name: JSONL Guidance Azure OpenAI
type: command
description: Runs a supplied Guidance program on every line of a JSONL file
is_deterministic: false

inputs:
  guidance_program:
    type: uri_file
    optional: false
    description: Python file containing the guidance program
  guidance_workers:
    type: integer
    optional: false
    default: 4
    description: Number of workers to use
  max_errors:
    type: integer
    optional: false
    default: 5
    description: Maximum number of failed lines to tolerate
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  common_dataset:
    type: uri_file
    optional: true
    description: Dataset containing data to be shared with all rows in input
  common_encoding:
    type: string
    optional: true
    default: utf-8-sig
    description: Encoding format of the common dataset
  azure_openai_endpoint:
    type: string
    optional: false
    description: The Azure OpenAI endpoint to call
  azure_openai_deployed_model:
    type: string
    optional: false
    default: gpt-3.5-turbo
    description: The OpenAI model behind the endpoint
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  error_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the error dataset

outputs:
  output_dataset:
    type: uri_file
    description: JSONL file
  error_dataset:
    type: uri_file
    description: JSONL file containing failed lines

code: ./src/

command: >-
  python ./jsonl_guidance_aoai.py
  --guidance_program ${{ inputs.guidance_program }}
  --guidance_workers ${{ inputs.guidance_workers }}
  --max_errors ${{ inputs.max_errors }}
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  $[[--common_dataset ${{ inputs.common_dataset }} ]]
  $[[--common_encoding ${{ inputs.common_encoding }} ]]
  --azure_openai_endpoint ${{ inputs.azure_openai_endpoint }}
  --azure_openai_deployed_model ${{ inputs.azure_openai_deployed_model }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --error_dataset ${{ outputs.error_dataset }}
  --error_encoding ${{ inputs.error_encoding }}

environment:
  # Will be updated when component uploads
  image: azureml:guidance_aml_env@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_guidance_phi2_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_guidance_phi2
version: 0.0.1pre1
display_name: JSONL Guidance Phi2
type: command
description: Runs a supplied Guidance program on every line of a JSONL file via Phi2
is_deterministic: false

inputs:
  guidance_program:
    type: uri_file
    optional: false
    description: Python file containing the guidance program
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  common_dataset:
    type: uri_file
    optional: true
    description: Dataset containing data to be shared with all rows in input
  common_encoding:
    type: string
    optional: true
    default: utf-8-sig
    description: Encoding format of the common dataset
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  error_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the error dataset

outputs:
  output_dataset:
    type: uri_file
    description: JSONL file
  error_dataset:
    type: uri_file
    description: JSONL file containing failed lines

code: ./src/

command: >-
  python ./jsonl_guidance_phi2.py
  --guidance_program ${{ inputs.guidance_program }}
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  $[[--common_dataset ${{ inputs.common_dataset }} ]]
  $[[--common_encoding ${{ inputs.common_encoding }} ]]
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --error_dataset ${{ outputs.error_dataset }}
  --error_encoding ${{ inputs.error_encoding }}

environment:
  # Will be updated when component uploads
  image: azureml:guidance_phi2_env@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_key_filter_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_key_filter
display_name: 'JSONL Key Filter'
type: command
description: |
  Filters keys in a JSONL file. Either keeps keys from a specified list, or
  drops keys from a specified list
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  keep_keys:
    type: string
    optional: true
    description: Stringified JSON list of keys to keep. Mutually exclusive with drop_keys
  drop_keys:
    type: string
    optional: true
    description: Stringified JSON list of keys to drop. Mutually exclusive with keep_keys
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset

outputs:
  output_dataset:
    type: uri_file
    description: Dataset containing JSONL filtered keys

code: ./src

command: >-
  python ./jsonl_key_filter.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  $[[--keep_keys '${{ inputs.keep_keys }}']]
  $[[--drop_keys '${{ inputs.drop_keys }}']]
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_key_rename_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_key_rename
display_name: 'JSONL Key Rename'
type: command
description: |
  Renames keys in a JSONL file.
  For example, if the `rename_keys` parameter is set to:
  ```json
  { "a": "a_new" }
  ```
  then the file:
  ```
  { "a": 1, "b": 2 }
  { "a": 2, "b": 3 }
  ```
  will become:
  ```
  { "a_new": 1, "b": 2 }
  { "a_new": 2, "b": 3 }
  ```
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  rename_keys:
    type: string
    optional: false
    description: Stringified JSON dictionary of keys to rename
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset

outputs:
  output_dataset:
    type: uri_file
    description: Dataset containing JSONL with renamed keys

code: ./src

command: >-
  python jsonl_key_rename.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --rename_keys '${{ inputs.rename_keys }}'
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_knn_cosine_similarity_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_knn_cosine_similarity
version: 0.0.1pre1
display_name: JSONL k-Nearest Neighbours Cosine Similarity
type: command
description: |
  Takes two JSONL files, 'input' and 'examples'.
  Given a key containing a vector in each file, for each line in the input:

  1. Compute the cosine similarity to every line in the examples
  2. Store the examples with the k largest values in the designated output key
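# For reference, the cosine similarity between vectors a and b is
# dot(a, b) / (norm(a) * norm(b)); values near 1 indicate near-identical directions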
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  example_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL example data
  example_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the example dataset
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  input_vector_key:
    type: string
    optional: false
    description: Key in the input dataset which contains the vector
  example_vector_key:
    type: string
    optional: false
    description: Key in the example dataset which contains the vector
  output_key:
    type: string
    optional: false
    description: Key in which to store the list of k-nearest neighbours
  k_nearest:
    type: integer
    optional: false
    description: How many neighbours to select


outputs:
  output_dataset:
    type: uri_file
    description: JSONL file containing inputs with k-nearest neighbours appended


code: ./src/

command: >-
  python ./jsonl_knn_cosine_similarity.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --example_dataset ${{ inputs.example_dataset }}
  --example_encoding ${{ inputs.example_encoding }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --input_vector_key ${{ inputs.input_vector_key }}
  --example_vector_key ${{ inputs.example_vector_key }}
  --output_key ${{ inputs.output_key }}
  --k_nearest ${{ inputs.k_nearest }}


environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_mmlu_fetch_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_mmlu_fetch
version: 0.0.1pre1
display_name: JSONL MMLU Fetcher
type: command
description: Fetches a given MMLU dataset and exports to JSONL
is_deterministic: true

inputs:
  mmlu_dataset:
    type: string
    optional: false
    enum:
      - anatomy
      - astronomy
      - clinical_knowledge
      - college_biology
      - college_medicine
      - medical_genetics
      - professional_medicine
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output datasets

outputs:
  output_dataset:
    type: uri_folder
    description: |
      Folder which will contain 'test.jsonl', 'validation.jsonl' and 'dev.jsonl'

code: ./src/

command: >-
  python ./jsonl_mmlu_fetch.py
  --mmlu_dataset ${{ inputs.mmlu_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --output_dataset ${{ outputs.output_dataset }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_random_examples_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_random_examples
version: 0.0.1pre1
display_name: JSONL Random Examples
type: command
description: |
  Takes two JSONL files, 'input' and 'examples'.
  For each line in the input, selects a random set of examples
  to include in the output key
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  example_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL example data
  example_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the example dataset
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  output_key:
    type: string
    optional: false
    description: Key in which to store the list of examples
  num_examples:
    type: integer
    optional: false
    description: How many examples to select
  random_seed:
    type: integer
    optional: false
    description: Seed for selecting random numbers


outputs:
  output_dataset:
    type: uri_file
    description: JSONL file containing inputs with examples appended


code: ./src/

command: >-
  python ./jsonl_random_examples.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --example_dataset ${{ inputs.example_dataset }}
  --example_encoding ${{ inputs.example_encoding }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --output_key ${{ inputs.output_key }}
  --num_examples ${{ inputs.num_examples }}
  --random_seed ${{ inputs.random_seed }}


environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_schema_checker_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_schema_check
display_name: 'JSONL Schema Check'
type: command
description: |
  Checks lines in a JSONL file against a schema, removing those which do not match
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  schema_dataset:
    type: uri_file
    optional: false
    description: Dataset containing a JSON schema file
  schema_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the schema dataset
  forbidden_keys:
    type: string
    optional: false
    default: "[]"
    description: Stringified JSON list of keys which must not appear in the input
  max_errors:
    type: integer
    optional: false
    default: 10
    description: Maximum number of schema errors to tolerate
  output_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the output dataset
  error_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the error dataset


outputs:
  output_dataset:
    type: uri_file
    description: Dataset containing the JSONL lines which matched the schema
  error_dataset:
    type: uri_file
    description: JSONL file containing failed lines

code: ./src

command: >-
  python ./jsonl_schema_check.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --schema_dataset ${{ inputs.schema_dataset }}
  --schema_encoding ${{ inputs.schema_encoding }}
  --forbidden_keys '${{ inputs.forbidden_keys }}'
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --error_dataset ${{ outputs.error_dataset }}
  --error_encoding ${{ inputs.error_encoding }}
  --max_errors ${{ inputs.max_errors }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_score_biosbias_json_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_score_biosbias_json
version: 0.0.1pre1
display_name: JSONL Score BIOSBIAS JSON Component
type: command
description: |
  Takes a JSONL file of results from running the JSON extraction
  test on BIOSBIAS, and computes correct answers.
  Has a lot of hard-coded knowledge
is_deterministic: true

inputs:
  input_dataset:
    type: uri_file
    optional: false
    description: Dataset containing JSONL input
  input_encoding:
    type: string
    optional: false
    default: utf-8-sig
    description: Encoding format of the input dataset
  response_key:
    type: string
    optional: false
    description: Which key contains the answer produced by the model


code: ./src/

command: >-
  python ./jsonl_score_biosbias_json.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --response_key ${{ inputs.response_key }}

environment:
  # Will be updated when component uploads
  image: azureml:promptbase_aml@latest

--------------------------------------------------------------------------------
/azureml/components/jsonl_score_multiplechoice_component.yaml:
--------------------------------------------------------------------------------
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: jsonl_score_multiplechoice
version: 0.0.1pre1
display_name: JSONL Multiple Choice Scorer
type: command
description: |
  Takes a JSONL file of multiple choice questions and correct answers and responses
  from a model, and produces the overall score.
10 | Results are stored in JSON 11 | is_deterministic: true 12 | 13 | inputs: 14 | input_dataset: 15 | type: uri_file 16 | optional: false 17 | description: Dataset containing JSONL input 18 | input_encoding: 19 | type: string 20 | optional: false 21 | default: utf-8-sig 22 | description: Encoding format of the input dataset 23 | correct_key: 24 | type: string 25 | optional: false 26 | description: Which key contains the correct answer 27 | response_key: 28 | type: string 29 | optional: false 30 | description: Which key contains the answer produced by the model 31 | output_encoding: 32 | type: string 33 | optional: false 34 | default: utf-8-sig 35 | description: Encoding format of the output dataset 36 | 37 | outputs: 38 | output_dataset: 39 | type: uri_file 40 | description: JSON file containing score summary 41 | 42 | 43 | code: ./src/ 44 | 45 | command: >- 46 | python ./jsonl_score_multiplechoice.py 47 | --input_dataset ${{ inputs.input_dataset }} 48 | --input_encoding ${{ inputs.input_encoding }} 49 | --output_dataset ${{ outputs.output_dataset }} 50 | --output_encoding ${{ inputs.output_encoding }} 51 | --correct_key ${{ inputs.correct_key }} 52 | --response_key ${{ inputs.response_key }} 53 | 54 | environment: 55 | # Will be updated when component uploads 56 | image: azureml:promptbase_aml@latest -------------------------------------------------------------------------------- /azureml/components/jsonl_to_json_component.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | type: command 3 | 4 | name: jsonl_to_json 5 | display_name: 'JSONL to JSON' 6 | description: Convert a JSONL file to JSON 7 | is_deterministic: true 8 | 9 | inputs: 10 | input_dataset: 11 | type: uri_folder 12 | optional: false 13 | description: | 14 | The source JSONL file 15 | input_encoding: 16 | type: string 17 | optional: false 18 | default: utf-8-sig 19 | description: Encoding format of the input dataset 20 | output_encoding: 21 | type: string 22 | optional: false 23 | default: utf-8-sig 24 | description: Encoding format of the output dataset 25 | 26 | outputs: 27 | output_dataset: 28 | type: uri_file 29 | description: The converted JSON file 30 | 31 | 32 | code: ./src/ 33 | 34 | command: >- 35 | python ./jsonl_to_json.py 36 | --input_dataset ${{ inputs.input_dataset }} 37 | --input_encoding ${{ inputs.input_encoding }} 38 | --output_dataset ${{ outputs.output_dataset }} 39 | --output_encoding ${{ inputs.output_encoding }} 40 | 41 | environment: 42 | # Will be updated when component uploads 43 | image: azureml:guidance_aml_env@latest -------------------------------------------------------------------------------- /azureml/components/src/jsonl_embeddings_aoai.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | 4 | from urllib.parse import urlparse, parse_qs 5 | 6 | 7 | from azure.identity import DefaultAzureCredential, get_bearer_token_provider 8 | 9 | from openai import AzureOpenAI 10 | 11 | from aether_utils.jsonl_utils_multiprocessing import line_map_mp, ItemMapper 12 | from aether_utils.logging_utils import get_standard_logger_for_file 13 | 14 | 15 | _logger = get_standard_logger_for_file(__file__) 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(add_help=True) 20 | 21 | # Information about the datasets 22 | datasets_group = parser.add_argument_group("Datasets") 23 | 
    datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True)
24 |     datasets_group.add_argument("--input_encoding", type=str, required=True)
25 |     datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True)
26 |     datasets_group.add_argument("--output_encoding", type=str, required=True)
27 |     datasets_group.add_argument("--error_dataset", type=pathlib.Path, required=True)
28 |     datasets_group.add_argument("--error_encoding", type=str, required=True)
29 | 
30 |     # Processing configuration
31 |     processing_group = parser.add_argument_group("Processing configuration")
32 |     processing_group.add_argument("--workers", type=int, required=True)
33 |     processing_group.add_argument("--max_errors", type=int, required=True)
34 | 
35 |     # Information about the embeddings model endpoint
36 |     model_group = parser.add_argument_group("Model Endpoint")
37 |     model_group.add_argument("--azure_openai_endpoint", type=str, required=True)
38 | 
39 |     # Information about the keys
40 |     keys_group = parser.add_argument_group("JSON Keys")
41 |     keys_group.add_argument("--source_key", type=str, required=True)
42 |     keys_group.add_argument("--destination_key", type=str, required=True)
43 | 
44 |     args = parser.parse_args()
45 |     return args
46 | 
47 | 
48 | class AOAIEmbedder(ItemMapper):
49 |     def __init__(self, endpoint: str, src_key: str, dst_key: str):
50 |         super().__init__()
51 |         self._endpoint = endpoint
52 |         self._src_key = src_key
53 |         self._dst_key = dst_key
54 | 
55 |     def start_up(self, worker_id: int) -> None:
56 |         _logger.info(f"Starting up {worker_id}")
57 |         self._azure_credential = DefaultAzureCredential()
58 | 
59 |     def _get_aoai_client(self) -> AzureOpenAI:
60 |         token_provider = get_bearer_token_provider(
61 |             self._azure_credential, "https://cognitiveservices.azure.com/.default"
62 |         )
63 |         assert token_provider is not None
64 | 
65 |         # The endpoint is expected to embed the deployment name and api-version
66 |         # (see configs/aoai_config_template.yaml), so parse them out of the URL
67 |         parsed_url = urlparse(self._endpoint)
68 |         parsed_query = parse_qs(parsed_url.query)
69 | 
70 |         client = AzureOpenAI(
71 |             azure_endpoint=self._endpoint,
72 |             azure_ad_token_provider=token_provider,
73 |             api_version=parsed_query["api-version"][0],  # parse_qs returns a list per key
74 |         )
75 |         return client
76 | 
77 |     def map(self, item: dict[str, any]) -> dict[str, any] | None:
78 |         _logger.debug(f"map: {item}")
79 | 
80 |         client = self._get_aoai_client()
81 | 
82 |         parsed_url = urlparse(self._endpoint)
83 |         deployment_name = parsed_url.path.split("/")[3]  # /openai/deployments/<name>/embeddings
84 |         _logger.debug(f"Got Deployment: {deployment_name}")
85 | 
86 |         embeddings = (
87 |             client.embeddings.create(input=[item[self._src_key]], model=deployment_name)
88 |             .data[0]
89 |             .embedding
90 |         )
91 | 
92 |         _logger.debug("Updating item")
93 |         item[self._dst_key] = embeddings
94 | 
95 |         return item
96 | 
97 | 
98 | def main():
99 |     args = parse_args()
100 | 
101 |     # Bind arguments to the processor function
102 |     processor = AOAIEmbedder(
103 |         src_key=args.source_key,
104 |         dst_key=args.destination_key,
105 |         endpoint=args.azure_openai_endpoint,
106 |     )
107 | 
108 |     # Run the processing
109 |     line_map_mp(
110 |         mapper=processor,
111 |         source_file=args.input_dataset,
112 |         dest_file=args.output_dataset,
113 |         source_encoding=args.input_encoding,
114 |         dest_encoding=args.output_encoding,
115 |         error_file=args.error_dataset,
116 |         error_encoding=args.error_encoding,
117 |         n_worker_tasks=args.workers,
118 |         max_errors=args.max_errors,
119 |     )
120 | 
121 |     _logger.info("Complete")
122 | 
123 | 
124 | if __name__ == "__main__":
125 |     main()
126 | 
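A minimal, self-contained sketch (not part of the repository) of the endpoint parsing that AOAIEmbedder relies on. It assumes the embeddings endpoint follows the format documented in pipelines/configs/aoai_config_template.yaml; the resource and deployment names below are hypothetical:

from urllib.parse import urlparse, parse_qs

endpoint = (
    "https://myresource.openai.azure.com/openai/deployments"
    "/my-embedding-deployment/embeddings?api-version=2023-05-15"
)
parsed_url = urlparse(endpoint)

# path is '/openai/deployments/my-embedding-deployment/embeddings', so
# split('/') yields ['', 'openai', 'deployments', 'my-embedding-deployment', 'embeddings']
assert parsed_url.path.split("/")[3] == "my-embedding-deployment"

# parse_qs maps each query key to a *list* of values, hence the [0]
assert parse_qs(parsed_url.query)["api-version"][0] == "2023-05-15"

Because parse_qs returns a list for every query key, the client construction in _get_aoai_client has to index with [0] to obtain the api-version as a string.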
-------------------------------------------------------------------------------- /azureml/components/src/jsonl_filter_correct_multiplechoice.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import pathlib 4 | 5 | from aether_utils.jsonl_utils import line_map 6 | from aether_utils.logging_utils import get_standard_logger_for_file 7 | 8 | _logger = get_standard_logger_for_file(__file__) 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(add_help=True) 13 | 14 | # Information about the ports 15 | ports_group = parser.add_argument_group("Ports") 16 | ports_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 17 | ports_group.add_argument("--input_encoding", type=str, required=True) 18 | ports_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 19 | ports_group.add_argument("--output_encoding", type=str, required=True) 20 | 21 | # Information about the keys 22 | keys_group = parser.add_argument_group("Keys") 23 | keys_group.add_argument("--correct_key", type=str, required=True) 24 | keys_group.add_argument("--response_key", type=str, required=True) 25 | 26 | args = parser.parse_args() 27 | 28 | return args 29 | 30 | 31 | def process_item( 32 | item: dict[str, any], *, correct_key: str, response_key: str 33 | ) -> dict[str, any]: 34 | result = None 35 | if item[correct_key] == item[response_key]: 36 | result = item 37 | return result 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | processor = functools.partial( 44 | process_item, correct_key=args.correct_key, response_key=args.response_key 45 | ) 46 | 47 | s, f = line_map( 48 | map_func=processor, 49 | source_file=args.input_dataset, 50 | dest_file=args.output_dataset, 51 | source_encoding=args.input_encoding, 52 | dest_encoding=args.output_encoding, 53 | ) 54 | _logger.info(f"Complete with {s} successes and {f} failures") 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_guidance_phi2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib.util 3 | import json 4 | import pathlib 5 | 6 | from typing import Any, Callable, Dict 7 | 8 | import guidance 9 | 10 | import torch 11 | from transformers import AutoModelForCausalLM, AutoTokenizer 12 | 13 | from aether_utils.jsonl_utils import line_map 14 | from aether_utils.logging_utils import get_standard_logger_for_file 15 | 16 | 17 | _logger = get_standard_logger_for_file(__file__) 18 | 19 | USER_MODULE = "user_module" 20 | GUIDANCE_FUNCTION = "guidance_generation" 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser(add_help=True) 25 | 26 | # Information about the datasets 27 | datasets_group = parser.add_argument_group("Datasets") 28 | datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 29 | datasets_group.add_argument("--input_encoding", type=str, required=True) 30 | datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 31 | datasets_group.add_argument("--output_encoding", type=str, required=True) 32 | datasets_group.add_argument("--error_dataset", type=pathlib.Path, required=True) 33 | datasets_group.add_argument("--error_encoding", type=str, required=True) 34 | datasets_group.add_argument( 35 | "--common_dataset", type=pathlib.Path, required=False, default=None 36 | ) 37 | 
datasets_group.add_argument("--common_encoding", type=str, required=False) 38 | 39 | # Information about the guidance program 40 | parser.add_argument("--guidance_program", type=pathlib.Path, required=True) 41 | 42 | args = parser.parse_args() 43 | return args 44 | 45 | 46 | class Phi2Processor: 47 | def __init__( 48 | self, 49 | program_path, 50 | model: guidance.models.Model, 51 | common_data: dict[str, any] | None, 52 | ): 53 | self._program_path = program_path 54 | self._model = model 55 | self._guidance_function = self._get_guidance_function() 56 | self._common_data = common_data 57 | 58 | def __call__(self, item: Dict[str, Any]) -> dict[str, any]: 59 | _logger.debug(f"__call__: {item}") 60 | result = self._guidance_function(self._model, item, common=self._common_data) 61 | _logger.debug(f"Checking keys") 62 | for k in result.keys(): 63 | assert k not in item, f"Duplicate key: {k}" 64 | 65 | _logger.debug(f"Updating item") 66 | item.update(**result) 67 | 68 | return item 69 | 70 | def _get_guidance_function( 71 | self, 72 | ) -> Callable[[Dict[str, Any]], Dict[str, Any]]: 73 | _logger.debug("Importing guidance file") 74 | spec = importlib.util.spec_from_file_location(USER_MODULE, self._program_path) 75 | module_definition = importlib.util.module_from_spec(spec) 76 | spec.loader.exec_module(module_definition) 77 | 78 | guidance_func = getattr(module_definition, GUIDANCE_FUNCTION) 79 | 80 | return guidance_func 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | 86 | # Load the common data (if required) 87 | common_data = None 88 | if args.common_dataset is not None: 89 | _logger.info("Loading common dataset") 90 | with open(args.common_dataset, "r", encoding=args.common_encoding) as jf: 91 | common_data = json.load(jf) 92 | else: 93 | _logger.info("No common dataset present") 94 | 95 | torch.set_default_device("cuda") 96 | guidance_model = guidance.models.Transformers( 97 | "microsoft/phi-2", 98 | device_map="cuda:0", 99 | echo=False, 100 | trust_remote_code=True, 101 | ) 102 | _logger.info(f"guidance_model.device: {guidance_model.engine.device}") 103 | 104 | processor = Phi2Processor( 105 | program_path=args.guidance_program, 106 | model=guidance_model, 107 | common_data=common_data, 108 | ) 109 | 110 | s, f = line_map( 111 | map_func=processor, 112 | source_file=args.input_dataset, 113 | dest_file=args.output_dataset, 114 | source_encoding=args.input_encoding, 115 | dest_encoding=args.output_encoding, 116 | ) 117 | 118 | _logger.info(f"Complete with {s} successes and {f} failures") 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_key_filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import pathlib 4 | 5 | from typing import Any, Dict, List 6 | 7 | from aether_utils.argparse_utils import json_loads_fixer 8 | from aether_utils.jsonl_utils import line_map 9 | from aether_utils.logging_utils import get_standard_logger_for_file 10 | 11 | _logger = get_standard_logger_for_file(__file__) 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser(add_help=True) 16 | 17 | # Information about the datasets 18 | datasets_group = parser.add_argument_group("Datasets") 19 | datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 20 | datasets_group.add_argument("--input_encoding", type=str, required=True) 21 | 
datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 22 | datasets_group.add_argument("--output_encoding", type=str, required=True) 23 | 24 | # Filtering config 25 | filtering_group = parser.add_mutually_exclusive_group(required=True) 26 | filtering_group.add_argument( 27 | "--keep_keys", 28 | type=json_loads_fixer, 29 | default=[], 30 | help="JSON list of keys to keep", 31 | ) 32 | filtering_group.add_argument( 33 | "--drop_keys", 34 | type=json_loads_fixer, 35 | default=[], 36 | help="JSON list of keys to drop", 37 | ) 38 | 39 | args = parser.parse_args() 40 | return args 41 | 42 | 43 | def process_item( 44 | item: Dict[str, Any], *, keep: List[str], drop: List[str] 45 | ) -> Dict[str, Any]: 46 | result = dict() 47 | 48 | if len(keep) > 0: 49 | _logger.info("Processing keeps") 50 | for k in keep: 51 | result[k] = item[k] 52 | elif len(drop) > 0: 53 | _logger.info("Processing drops") 54 | for k, v in item.items(): 55 | assert k in item, f"Key {k} not in original!" 56 | if k not in drop: 57 | result[k] = v 58 | else: 59 | raise ValueError("Shouldn't get here") 60 | 61 | return result 62 | 63 | 64 | def main(): 65 | args = parse_args() 66 | 67 | # Exclusivity taken care of by add_mutually_exclusive_group 68 | assert ( 69 | len(args.keep_keys) > 0 or len(args.drop_keys) > 0 70 | ), "Must either keep or drop something!" 71 | 72 | processor = functools.partial( 73 | process_item, keep=args.keep_keys, drop=args.drop_keys 74 | ) 75 | 76 | line_map( 77 | map_func=processor, 78 | source_file=args.input_dataset, 79 | dest_file=args.output_dataset, 80 | source_encoding=args.input_encoding, 81 | dest_encoding=args.output_encoding, 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_key_rename.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import pathlib 4 | 5 | from typing import Any, Dict, List 6 | 7 | from aether_utils.argparse_utils import json_loads_fixer 8 | from aether_utils.jsonl_utils import line_map 9 | from aether_utils.logging_utils import get_standard_logger_for_file 10 | 11 | 12 | _logger = get_standard_logger_for_file(__file__) 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(add_help=True) 17 | 18 | # Information about the datasets 19 | datasets_group = parser.add_argument_group("Datasets") 20 | datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 21 | datasets_group.add_argument("--input_encoding", type=str, required=True) 22 | datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 23 | datasets_group.add_argument("--output_encoding", type=str, required=True) 24 | 25 | # Renaming config 26 | parser.add_argument("--rename_keys", type=json_loads_fixer, required=True) 27 | 28 | args = parser.parse_args() 29 | return args 30 | 31 | 32 | def process_item(item: Dict[str, Any], *, rename: Dict[str, str]) -> Dict[str, Any]: 33 | result = dict() 34 | 35 | _logger.info("Processing renames") 36 | for k in item: 37 | if k in rename: 38 | result[rename[k]] = item[k] 39 | else: 40 | result[k] = item[k] 41 | return result 42 | 43 | 44 | def main(): 45 | args = parse_args() 46 | 47 | assert len(args.rename_keys) > 0, "Must rename at least one key!" 
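    # For illustration (hypothetical key names): --rename_keys '{"answer": "correct_answer"}'
    # turns {"answer": 1, "q": "..."} into {"correct_answer": 1, "q": "..."} on every line.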
48 | 49 | processor = functools.partial(process_item, rename=args.rename_keys) 50 | line_map( 51 | map_func=processor, 52 | source_file=args.input_dataset, 53 | dest_file=args.output_dataset, 54 | source_encoding=args.input_encoding, 55 | dest_encoding=args.output_encoding, 56 | ) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_knn_cosine_similarity.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import pathlib 4 | 5 | import numpy as np 6 | 7 | 8 | from aether_utils.jsonl_file_utils import load_jsonl 9 | from aether_utils.jsonl_utils import line_map 10 | from aether_utils.logging_utils import get_standard_logger_for_file 11 | 12 | 13 | _logger = get_standard_logger_for_file(__file__) 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser(add_help=True) 18 | 19 | # Information about the datasets 20 | datasets_group = parser.add_argument_group("Datasets") 21 | datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 22 | datasets_group.add_argument("--input_encoding", type=str, required=True) 23 | datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 24 | datasets_group.add_argument("--output_encoding", type=str, required=True) 25 | datasets_group.add_argument("--example_dataset", type=pathlib.Path, required=True) 26 | datasets_group.add_argument("--example_encoding", type=str, required=True) 27 | 28 | # Information about keys 29 | key_group = parser.add_argument_group("Keys") 30 | key_group.add_argument("--input_vector_key", type=str, required=True) 31 | key_group.add_argument("--example_vector_key", type=str, required=True) 32 | key_group.add_argument("--output_key", type=str, required=True) 33 | 34 | # Information about the algorithm 35 | algo_group = parser.add_argument_group("Algorithm") 36 | algo_group.add_argument("--k_nearest", type=int, required=True) 37 | 38 | args = parser.parse_args() 39 | return args 40 | 41 | 42 | def compute_knn( 43 | item: dict[str, any], 44 | *, 45 | examples: list[dict[str, any]], 46 | example_embedding_matrix: np.ndarray, 47 | input_vector_key: str, 48 | output_key: str, 49 | k_nearest: int, 50 | ) -> dict[str, any]: 51 | _logger.debug(f"process_item: {item}") 52 | 53 | item_embedding = np.asarray(item[input_vector_key]) 54 | _logger.debug(f"Item embedding {item_embedding.dtype} {item_embedding.shape}") 55 | 56 | similarities = np.matmul(example_embedding_matrix, item_embedding) 57 | # np.argsort is ascending, so we need to reverse 58 | sorted_indices = list(reversed(np.argsort(similarities).tolist())) 59 | top_k_indices = sorted_indices[0:k_nearest] 60 | _logger.debug(f"k nearest: {top_k_indices}") 61 | k_examples = [] 62 | for k in top_k_indices: 63 | k_examples.append(examples[k]) 64 | item[output_key] = k_examples 65 | del item[input_vector_key] 66 | 67 | return item 68 | 69 | 70 | def normalised_vector(input: list[float]) -> np.ndarray: 71 | result = np.asarray(input) 72 | result = result / np.linalg.norm(result) 73 | 74 | return result 75 | 76 | 77 | def main(): 78 | args = parse_args() 79 | 80 | example_data = load_jsonl(args.example_dataset, args.example_encoding) 81 | example_embedding_matrix = np.stack( 82 | [normalised_vector(e[args.example_vector_key]) for e in example_data], axis=0 83 | ) 84 | _logger.info( 85 | f"Embedding Matrix: {example_embedding_matrix.dtype} 
{example_embedding_matrix.shape}" 86 | ) 87 | 88 | # Remove the vectors 89 | for e in example_data: 90 | del e[args.example_vector_key] 91 | 92 | # Construct the mapping function 93 | processor = functools.partial( 94 | compute_knn, 95 | examples=example_data, 96 | example_embedding_matrix=example_embedding_matrix, 97 | input_vector_key=args.input_vector_key, 98 | output_key=args.output_key, 99 | k_nearest=args.k_nearest, 100 | ) 101 | 102 | s, f = line_map( 103 | map_func=processor, 104 | source_file=args.input_dataset, 105 | source_encoding=args.input_encoding, 106 | dest_file=args.output_dataset, 107 | dest_encoding=args.output_encoding, 108 | ) 109 | 110 | _logger.info(f"Complete with {s} successes and {f} failures") 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_mmlu_fetch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | 4 | from typing import Any 5 | 6 | import datasets 7 | 8 | from aether_utils.jsonl_file_utils import JSONLWriter 9 | from aether_utils.logging_utils import get_standard_logger_for_file 10 | 11 | _logger = get_standard_logger_for_file(__file__) 12 | 13 | MMLU_DATASETS = [ 14 | "abstract_algebra", 15 | "anatomy", 16 | "astronomy", 17 | "business_ethics", 18 | "clinical_knowledge", 19 | "college_biology", 20 | "college_chemistry", 21 | "college_computer_science", 22 | "college_mathematics", 23 | "college_medicine", 24 | "college_physics", 25 | "computer_security", 26 | "conceptual_physics", 27 | "econometrics", 28 | "electrical_engineering", 29 | "elementary_mathematics", 30 | "formal_logic", 31 | "global_facts", 32 | "high_school_biology", 33 | "high_school_chemistry", 34 | "high_school_computer_science", 35 | "high_school_european_history", 36 | "high_school_geography", 37 | "high_school_government_and_politics", 38 | "high_school_macroeconomics", 39 | "high_school_mathematics", 40 | "high_school_microeconomics", 41 | "high_school_physics", 42 | "high_school_psychology", 43 | "high_school_statistics", 44 | "high_school_us_history", 45 | "high_school_world_history", 46 | "human_aging", 47 | "human_sexuality", 48 | "international_law", 49 | "jurisprudence", 50 | "logical_fallacies", 51 | "machine_learning", 52 | "management", 53 | "marketing", 54 | "medical_genetics", 55 | "miscellaneous", 56 | "moral_disputes", 57 | "moral_scenarios", 58 | "nutrition", 59 | "philosophy", 60 | "prehistory", 61 | "professional_accounting", 62 | "professional_law", 63 | "professional_medicine", 64 | "professional_psychology", 65 | "public_relations", 66 | "security_studies", 67 | "sociology", 68 | "us_foreign_policy", 69 | "virology", 70 | "world_religions", 71 | ] 72 | 73 | DATASET_OPTIONS = [*MMLU_DATASETS, "all_medicine_datasets", "all_mmlu_datasets"] 74 | 75 | SPLITS = ["test", "validation", "dev"] 76 | 77 | 78 | def parse_args(): 79 | parser = argparse.ArgumentParser(add_help=True) 80 | 81 | # Information about the ports 82 | ports_group = parser.add_argument_group("Ports") 83 | ports_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 84 | ports_group.add_argument("--output_encoding", type=str, required=True) 85 | 86 | parser.add_argument( 87 | "--mmlu_dataset", type=str, choices=DATASET_OPTIONS, required=True 88 | ) 89 | 90 | args = parser.parse_args() 91 | return args 92 | 93 | 94 | def process_data_split(data, subject: str) -> list[dict[str, Any]]: 95 | 
all_questions = [] 96 | for line in data: 97 | nxt = dict( 98 | dataset="mmlu", 99 | subject=subject, 100 | question=line["question"], 101 | choices=line["choices"], 102 | correct_answer=line["answer"], 103 | ) 104 | all_questions.append(nxt) 105 | 106 | return all_questions 107 | 108 | 109 | def main(): 110 | args = parse_args() 111 | _logger.info(f"Fetching {args.mmlu_dataset}") 112 | 113 | if args.mmlu_dataset == "all_medicine_datasets": 114 | target_datasets = [ 115 | "anatomy", 116 | "clinical_knowledge", 117 | "college_biology", 118 | "college_medicine", 119 | "medical_genetics", 120 | "professional_medicine", 121 | ] 122 | elif args.mmlu_dataset == "all_mmlu_datasets": 123 | target_datasets = MMLU_DATASETS 124 | else: 125 | target_datasets = [args.mmlu_dataset] 126 | 127 | jsonl_writers: dict[str, JSONLWriter] = dict() 128 | for split in SPLITS: 129 | nxt_writer = JSONLWriter( 130 | args.output_dataset / f"{split}.jsonl", args.output_encoding 131 | ) 132 | nxt_writer.__enter__() 133 | jsonl_writers[split] = nxt_writer 134 | 135 | for nxt_ds in target_datasets: 136 | _logger.info(f"Processing dataset {nxt_ds}") 137 | # Note that tasksource skips the huge 'train' file 138 | hf_data = datasets.load_dataset("tasksource/mmlu", nxt_ds) 139 | 140 | for split in SPLITS: 141 | _logger.info(f"Extracting split {split}") 142 | extracted_data = process_data_split(hf_data[split], subject=nxt_ds) 143 | _logger.info(f"Saving split {split}") 144 | for line in extracted_data: 145 | jsonl_writers[split].write_line(line) 146 | 147 | _logger.info("Closing JSONL files") 148 | for v in jsonl_writers.values(): 149 | v.__exit__() 150 | 151 | _logger.info("Complete") 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_random_examples.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import pathlib 4 | import random 5 | 6 | 7 | from aether_utils.jsonl_file_utils import load_jsonl 8 | from aether_utils.jsonl_utils import line_map 9 | from aether_utils.logging_utils import get_standard_logger_for_file 10 | 11 | 12 | _logger = get_standard_logger_for_file(__file__) 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(add_help=True) 17 | 18 | # Information about the datasets 19 | datasets_group = parser.add_argument_group("Datasets") 20 | datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 21 | datasets_group.add_argument("--input_encoding", type=str, required=True) 22 | datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 23 | datasets_group.add_argument("--output_encoding", type=str, required=True) 24 | datasets_group.add_argument("--example_dataset", type=pathlib.Path, required=True) 25 | datasets_group.add_argument("--example_encoding", type=str, required=True) 26 | 27 | # Information about keys 28 | key_group = parser.add_argument_group("Keys") 29 | key_group.add_argument("--output_key", type=str, required=True) 30 | 31 | # Information about the algorithm 32 | algo_group = parser.add_argument_group("Algorithm") 33 | algo_group.add_argument("--num_examples", type=int, required=True) 34 | algo_group.add_argument("--random_seed", type=int, required=True) 35 | 36 | args = parser.parse_args() 37 | return args 38 | 39 | 40 | def select_examples( 41 | item: dict[str, any], 42 | *, 43 | examples: list[dict[str, any]], 44 | num_examples: int, 
45 |     output_key: str,
46 | ) -> dict[str, any]:
47 |     # Note that random.sample() selects _without_ replacement
48 |     selected_examples = random.sample(examples, num_examples)
49 |     item[output_key] = selected_examples
50 |     return item
51 | 
52 | 
53 | def main():
54 |     args = parse_args()
55 | 
56 |     example_data = load_jsonl(args.example_dataset, args.example_encoding)
57 |     _logger.info("Loaded example file")
58 |     random.seed(args.random_seed)
59 | 
60 |     # Construct the mapping function
61 |     processor = functools.partial(
62 |         select_examples,
63 |         examples=example_data,
64 |         output_key=args.output_key,
65 |         num_examples=args.num_examples,
66 |     )
67 | 
68 |     s, f = line_map(
69 |         map_func=processor,
70 |         source_file=args.input_dataset,
71 |         source_encoding=args.input_encoding,
72 |         dest_file=args.output_dataset,
73 |         dest_encoding=args.output_encoding,
74 |     )
75 | 
76 |     _logger.info(f"Complete with {s} successes and {f} failures")
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     main()
81 | 
--------------------------------------------------------------------------------
/azureml/components/src/jsonl_schema_check.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import functools
3 | import json
4 | import pathlib
5 | 
6 | from typing import Any, Dict, List
7 | 
8 | from jsonschema.protocols import Validator
9 | from jsonschema.validators import Draft202012Validator
10 | 
11 | from aether_utils.argparse_utils import json_loads_fixer
12 | from aether_utils.jsonl_utils import line_map
13 | from aether_utils.logging_utils import get_standard_logger_for_file
14 | 
15 | _logger = get_standard_logger_for_file(__file__)
16 | 
17 | 
18 | def parse_args():
19 |     parser = argparse.ArgumentParser(add_help=True)
20 | 
21 |     # Information about the datasets
22 |     datasets_group = parser.add_argument_group("Datasets")
23 |     datasets_group.add_argument("--input_dataset", type=pathlib.Path, required=True)
24 |     datasets_group.add_argument("--input_encoding", type=str, required=True)
25 |     datasets_group.add_argument("--schema_dataset", type=pathlib.Path, required=True)
26 |     datasets_group.add_argument("--schema_encoding", type=str, required=True)
27 |     datasets_group.add_argument("--output_dataset", type=pathlib.Path, required=True)
28 |     datasets_group.add_argument("--output_encoding", type=str, required=True)
29 |     datasets_group.add_argument("--error_dataset", type=pathlib.Path, required=True)
30 |     datasets_group.add_argument("--error_encoding", type=str, required=True)
31 | 
32 |     # Forbidden keys
33 |     parser.add_argument("--forbidden_keys", type=json_loads_fixer, required=True)
34 | 
35 |     # Maximum error count
36 |     parser.add_argument("--max_errors", type=int, required=True)
37 | 
38 |     args = parser.parse_args()
39 |     return args
40 | 
41 | 
42 | def process_item(
43 |     item: Dict[str, Any], *, json_validator: Validator, forbidden_keys: list[str]
44 | ) -> Dict[str, Any]:
45 |     for k in forbidden_keys:
46 |         assert k not in item, f"Key {k} not allowed"
47 | 
48 |     json_validator.validate(item)
49 | 
50 |     return item
51 | 
52 | 
53 | def main():
54 |     args = parse_args()
55 | 
56 |     # Load in the JSON schema
57 |     with open(args.schema_dataset, "r", encoding=args.schema_encoding) as sf:
58 |         json_schema = json.load(sf)
59 | 
60 |     # Check the schema
61 |     Draft202012Validator.check_schema(json_schema)
62 | 
63 |     # Create the validator object
64 |     validator = Draft202012Validator(schema=json_schema)
65 | 
66 |     processor = functools.partial(
67 |         process_item, json_validator=validator, forbidden_keys=args.forbidden_keys
68 |     )
69 | 
70 |     line_map(
71 |         map_func=processor,
72 |         source_file=args.input_dataset,
73 |         dest_file=args.output_dataset,
74 |         source_encoding=args.input_encoding,
75 |         dest_encoding=args.output_encoding,
76 |         error_file=args.error_dataset,
77 |         error_encoding=args.error_encoding,
78 |         max_errors=args.max_errors,
79 |     )
80 |     _logger.info("Complete")
81 | 
82 | 
83 | if __name__ == "__main__":
84 |     main()
85 | 
--------------------------------------------------------------------------------
/azureml/components/src/jsonl_score_biosbias_json.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import functools
3 | import json
4 | import pathlib
5 | 
6 | from typing import Any
7 | 
8 | import mlflow
9 | import sklearn.metrics as skm
10 | 
11 | from aether_utils.jsonl_utils import line_reduce
12 | from aether_utils.logging_utils import get_standard_logger_for_file
13 | 
14 | _logger = get_standard_logger_for_file(__file__)
15 | 
16 | 
17 | class Scorer:
18 |     def __init__(self, response_key: str):
19 |         self.total_count = 0
20 |         self.good_json_count = 0
21 |         self.json_keys_count = 0
22 |         self.correct_name_count = 0
23 |         self.correct_occupation_count = 0
24 |         self.response_key = response_key
25 | 
26 |     def __call__(self, line: dict[str, Any]):
27 |         self.total_count += 1
28 |         response_answer = line[self.response_key]
29 |         try:
30 |             decoded_response = json.loads(response_answer)
31 |             self.good_json_count += 1
32 | 
33 |             EXPECTED_KEYS = ["name", "occupation"]
34 | 
35 |             if all(k in decoded_response for k in EXPECTED_KEYS):
36 |                 self.json_keys_count += 1
37 | 
38 |             if self.fuzzy_string_match(
39 |                 generated=decoded_response["name"], target=line["entity"]
40 |             ):
41 |                 self.correct_name_count += 1
42 |             if self.fuzzy_string_match(
43 |                 generated=decoded_response["occupation"], target=line["target_mediated"]
44 |             ):
45 |                 self.correct_occupation_count += 1
46 |         except Exception:  # unparseable JSON or missing keys simply don't count
47 |             pass
48 | 
49 |     def fuzzy_string_match(self, *, target: str, generated: str) -> bool:
50 |         # I believe that this is the ultimate comparison done by:
51 |         # https://github.com/QingruZhang/PASTA/blob/b28e6307896df9f91c282ecf0201fa7bebdad0d6/evaluation/evaluator.py#L233
52 |         return target.lower() in generated.lower()
53 | 
54 |     def generate_summary(self) -> dict[str, Any]:
55 |         result = dict()
56 |         result["metrics"] = dict()
57 | 
58 |         result["metrics"]["total"] = self.total_count
59 |         result["metrics"]["good_json"] = self.good_json_count
60 |         result["metrics"]["json_keys"] = self.json_keys_count
61 |         result["metrics"]["correct_name"] = self.correct_name_count
62 |         result["metrics"]["correct_occupation"] = self.correct_occupation_count
63 |         return result
64 | 
65 | 
66 | def parse_args():
67 |     parser = argparse.ArgumentParser(add_help=True)
68 | 
69 |     # Information about the ports
70 |     ports_group = parser.add_argument_group("Ports")
71 |     ports_group.add_argument("--input_dataset", type=pathlib.Path, required=True)
72 |     ports_group.add_argument("--input_encoding", type=str, required=True)
73 | 
74 |     # Information about the keys
75 |     keys_group = parser.add_argument_group("Keys")
76 |     keys_group.add_argument("--response_key", type=str, required=True)
77 | 
78 |     args = parser.parse_args()
79 | 
80 |     return args
81 | 
82 | 
83 | def main():
84 |     args = parse_args()
85 | 
86 |     scorer = Scorer(response_key=args.response_key)
87 |     line_reduce(
88 |         reducer=scorer,
89 |         source_file=args.input_dataset,
90 |         source_encoding=args.input_encoding,
91 |     )
92 |     summary = 
scorer.generate_summary() 93 | 94 | _logger.info("Logging with mlflow") 95 | mlflow.log_metrics(summary["metrics"]) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_score_multiplechoice.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import json 4 | import pathlib 5 | 6 | from typing import Any 7 | 8 | import fairlearn.metrics as flm 9 | import mlflow 10 | import sklearn.metrics as skm 11 | 12 | from aether_utils.jsonl_utils import line_reduce 13 | from aether_utils.logging_utils import get_standard_logger_for_file 14 | 15 | _logger = get_standard_logger_for_file(__file__) 16 | 17 | 18 | class Scorer: 19 | def __init__(self, correct_key: str, response_key: str): 20 | self.y_true = [] 21 | self.y_pred = [] 22 | self.dataset = [] 23 | self.subject = [] 24 | self.correct_key = correct_key 25 | self.response_key = response_key 26 | 27 | def __call__(self, line: dict[str, Any]): 28 | correct_answer = line[self.correct_key] 29 | response_answer = line[self.response_key] 30 | self.y_true.append(correct_answer) 31 | self.y_pred.append(response_answer) 32 | if "dataset" in line: 33 | self.dataset.append(line["dataset"]) 34 | else: 35 | self.dataset.append("No dataset") 36 | if "subject" in line: 37 | self.subject.append(line["subject"]) 38 | else: 39 | self.subject.append("No subject") 40 | 41 | def generate_summary(self) -> dict[str, Any]: 42 | metrics = { 43 | "count": flm.count, 44 | "accuracy": skm.accuracy_score, 45 | "n_correct": functools.partial(skm.accuracy_score, normalize=False), 46 | } 47 | 48 | mf = flm.MetricFrame( 49 | metrics=metrics, 50 | y_true=self.y_true, 51 | y_pred=self.y_pred, 52 | sensitive_features=dict(dataset=self.dataset, subject=self.subject), 53 | ) 54 | 55 | result = dict() 56 | result["metrics"] = mf 57 | result["figures"] = dict() 58 | cm_display = skm.ConfusionMatrixDisplay.from_predictions( 59 | self.y_true, self.y_pred 60 | ) 61 | result["figures"]["confusion_matrix"] = cm_display.figure_ 62 | return result 63 | 64 | 65 | def parse_args(): 66 | parser = argparse.ArgumentParser(add_help=True) 67 | 68 | # Information about the ports 69 | ports_group = parser.add_argument_group("Ports") 70 | ports_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 71 | ports_group.add_argument("--input_encoding", type=str, required=True) 72 | ports_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 73 | ports_group.add_argument("--output_encoding", type=str, required=True) 74 | 75 | # Information about the keys 76 | keys_group = parser.add_argument_group("Keys") 77 | keys_group.add_argument("--correct_key", type=str, required=True) 78 | keys_group.add_argument("--response_key", type=str, required=True) 79 | 80 | args = parser.parse_args() 81 | 82 | return args 83 | 84 | 85 | def main(): 86 | args = parse_args() 87 | 88 | scorer = Scorer(correct_key=args.correct_key, response_key=args.response_key) 89 | line_reduce( 90 | reducer=scorer, 91 | source_file=args.input_dataset, 92 | source_encoding=args.input_encoding, 93 | ) 94 | summary = scorer.generate_summary() 95 | 96 | _logger.info("Logging with mlflow") 97 | mlflow.log_metrics(summary["metrics"].overall.to_dict()) 98 | for k, v in summary["figures"].items(): 99 | mlflow.log_figure(v, f"{k}.png") 100 | 101 | _logger.info("Writing output file") 102 | 103 | by_group_dict = dict() 104 | # Due 
to how MetricFrame does its indexing, we have to unpack the 105 | # key into another level of nesting 106 | for k, v in summary["metrics"].by_group.to_dict(orient="index").items(): 107 | if k[0] not in by_group_dict: 108 | by_group_dict[k[0]] = dict() 109 | by_group_dict[k[0]][k[1]] = v 110 | 111 | output_dict = dict( 112 | overall=summary["metrics"].overall.to_dict(), 113 | details=by_group_dict, 114 | ) 115 | print(f"output_dict:\n {json.dumps(output_dict,indent=4)}") 116 | with open(args.output_dataset, encoding=args.output_encoding, mode="w") as jf: 117 | json.dump(output_dict, jf, indent=4) 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /azureml/components/src/jsonl_to_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import pathlib 4 | 5 | 6 | from aether_utils.jsonl_utils import line_reduce 7 | from aether_utils.logging_utils import get_standard_logger_for_file 8 | 9 | _logger = get_standard_logger_for_file(__file__) 10 | 11 | 12 | class ContentAccumulator: 13 | def __init__(self): 14 | self.contents = [] 15 | 16 | def __call__(self, line: dict[str, any]): 17 | self.contents.append(line) 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser(add_help=True) 22 | 23 | # Information about the ports 24 | ports_group = parser.add_argument_group("Ports") 25 | ports_group.add_argument("--input_dataset", type=pathlib.Path, required=True) 26 | ports_group.add_argument("--input_encoding", type=str, required=True) 27 | ports_group.add_argument("--output_dataset", type=pathlib.Path, required=True) 28 | ports_group.add_argument("--output_encoding", type=str, required=True) 29 | 30 | args = parser.parse_args() 31 | 32 | return args 33 | 34 | 35 | def main(): 36 | args = parse_args() 37 | 38 | _logger.info("Starting accumulation") 39 | acc = ContentAccumulator() 40 | line_reduce( 41 | reducer=acc, 42 | source_file=args.input_dataset, 43 | source_encoding=args.input_encoding, 44 | ) 45 | _logger.info("All lines accumulated") 46 | 47 | with open(args.output_dataset, "w", encoding=args.output_encoding) as jf: 48 | json.dump(acc.contents, jf, indent=4) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /azureml/components/uri_folder_to_file_component.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | type: command 3 | 4 | name: uri_folder_to_file 5 | display_name: 'uri_folder to uri_file' 6 | description: Extract single uri_file from uri_folder 7 | is_deterministic: true 8 | 9 | inputs: 10 | input_dataset: 11 | type: uri_folder 12 | optional: false 13 | description: | 14 | A folder dataset containing the desired file 15 | 16 | filename_pattern: 17 | type: string 18 | optional: false 19 | description: Pattern to select the required file 20 | 21 | outputs: 22 | output_dataset: 23 | type: uri_file 24 | description: The matched file 25 | 26 | command: >- 27 | cp ${{ inputs.input_dataset }}/${{ inputs.filename_pattern }} ${{ outputs.output_dataset }} 28 | 29 | 30 | environment: 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20231011.v1 -------------------------------------------------------------------------------- /azureml/environments/phi2transformer-env.yaml: 
-------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json 2 | 3 | name: phi2_transformer 4 | description: | 5 | A simple environment running Phi2 from Hugging Face 6 | 7 | image: mcr.microsoft.com/azureml/minimal-ubuntu22.04-py39-cuda11.8-gpu-inference:20240122.v1 8 | conda_file: 9 | channels: 10 | - defaults 11 | dependencies: 12 | - python=3.11 13 | - pip 14 | - pip: 15 | # Note that we have to force torch to install from this index 16 | # in order to match the CUDA driver... 17 | - --index-url https://download.pytorch.org/whl/cu118 18 | - torch 19 | # ... so we have to add PyPI back in as an alternative index 20 | - --extra-index-url https://pypi.org/simple 21 | - accelerate 22 | - aether-utils==0.0.1.dev1 23 | - guidance>=0.1.13 24 | - transformers -------------------------------------------------------------------------------- /azureml/environments/promptbase-env.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json 2 | 3 | name: promptbase_aml 4 | description: | 5 | A simple environment for promptbase 6 | 7 | image: mcr.microsoft.com/azureml/inference-base-2004 8 | conda_file: 9 | channels: 10 | - defaults 11 | dependencies: 12 | - python=3.11 13 | - pip 14 | - pip: 15 | - aether-utils==0.0.1.dev1 16 | - azure-identity 17 | - azure-keyvault-secrets 18 | - azureml-mlflow 19 | - fairlearn 20 | - datasets 21 | - guidance>=0.1.13 22 | - jsonschema 23 | - mlflow 24 | - numpy 25 | - openai>=1 26 | - scikit-learn -------------------------------------------------------------------------------- /azureml/json_schemas/multichoice_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json-schema.org/draft/2020-12/schema", 3 | "$id": "promptbase.multiplechoice", 4 | "title": "Multiple Choice Question", 5 | "description": "A sample multiple choice question", 6 | "type": "object", 7 | "properties": { 8 | "question": { 9 | "description": "The question being asked", 10 | "type": "string" 11 | }, 12 | "choices": { 13 | "description": "A list of possible answers to the question", 14 | "type": "array", 15 | "items": { 16 | "type": "string" 17 | }, 18 | "minItems": 2, 19 | "uniqueItems": true 20 | }, 21 | "correct_answer": { 22 | "description": "The index of the correct answer within the 'choices' array", 23 | "type": "integer", 24 | "minimum": 0 25 | } 26 | }, 27 | "required": [ 28 | "question", 29 | "choices", 30 | "correct_answer" 31 | ] 32 | } -------------------------------------------------------------------------------- /azureml/json_schemas/multiplechoice_cot_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json-schema.org/draft/2020-12/schema", 3 | "$id": "promptbase.multiplechoice_cot", 4 | "title": "Multiple Choice Question with Chain-of-Thought", 5 | "description": "A sample multiple choice question with a chain of thought", 6 | "type": "object", 7 | "properties": { 8 | "question": { 9 | "description": "The question being asked", 10 | "type": "string" 11 | }, 12 | "choices": { 13 | "description": "A list of possible answers to the question", 14 | "type": "array", 15 | "items": { 16 | "type": "string" 17 | }, 18 | "minItems": 2, 19 | "uniqueItems": true 20 | }, 21 | "chain_of_thought": { 22 | "description": "A chain of thought leading to 
the correct answer", 23 | "type": "string" 24 | }, 25 | "correct_answer": { 26 | "description": "The index of the correct answer within the 'choices' array", 27 | "type": "integer", 28 | "minimum": 0 29 | } 30 | }, 31 | "required": [ 32 | "question", 33 | "choices", 34 | "chain_of_thought", 35 | "correct_answer" 36 | ] 37 | } -------------------------------------------------------------------------------- /azureml/pipelines/azureml_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | 4 | 5 | from azure.ai.ml import load_component, MLClient, load_environment 6 | from azure.ai.ml.entities import Component, Environment 7 | 8 | from constants import COMPONENTS_DIR, ENVIRONMENT_FILE, PHI2_ENVIRONMENT_FILE 9 | 10 | 11 | _logger = logging.getLogger(__file__) 12 | _logger.setLevel(logging.INFO) 13 | 14 | # This dictionary lists the attributes to be added to ComponentCollector 15 | ALL_COMPONENTS = dict( 16 | jsonl_embeddings="jsonl_embeddings_aoai_component.yaml", 17 | jsonl_filter_correct_multiplechoice="jsonl_filter_correct_multiplechoice_component.yaml", 18 | jsonl_guidance="jsonl_guidance_component.yaml", 19 | jsonl_key_filter="jsonl_key_filter_component.yaml", 20 | jsonl_key_rename="jsonl_key_rename_component.yaml", 21 | jsonl_knn_cosine_similarity="jsonl_knn_cosine_similarity_component.yaml", 22 | jsonl_mmlu_fetch="jsonl_mmlu_fetch_component.yaml", 23 | jsonl_random_examples="jsonl_random_examples_component.yaml", 24 | jsonl_schema_checker="jsonl_schema_checker_component.yaml", 25 | jsonl_score_biosbias_json="jsonl_score_biosbias_json_component.yaml", 26 | jsonl_score_multiplechoice="jsonl_score_multiplechoice_component.yaml", 27 | jsonl_to_json="jsonl_to_json_component.yaml", 28 | uri_folder_to_file="uri_folder_to_file_component.yaml", 29 | ) 30 | 31 | 32 | def create_component_from_yaml( 33 | ml_client: MLClient, 34 | yaml_path: pathlib.Path, 35 | version_string: str, 36 | environment: Environment = None, 37 | ) -> Component: 38 | _logger.info(f"Loading {yaml_path}") 39 | loaded_yaml = load_component(source=yaml_path) 40 | _logger.info("Changing version") 41 | loaded_yaml.version = version_string 42 | _logger.info("Changing environment") 43 | loaded_yaml.environment = environment 44 | _logger.info("Creating component") 45 | my_comp = ml_client.components.create_or_update(loaded_yaml) 46 | _logger.info(f"Component {my_comp.name}:{my_comp.version} created") 47 | return my_comp 48 | 49 | 50 | def create_environment_from_yaml( 51 | ml_client: MLClient, yaml_path: pathlib.Path, version_string: str 52 | ) -> Environment: 53 | _logger.info(f"Loading {yaml_path}") 54 | loaded_yaml = load_environment(source=yaml_path) 55 | _logger.info("Changing version") 56 | loaded_yaml.version = version_string 57 | _logger.info("Creating Environment") 58 | my_env = ml_client.environments.create_or_update(loaded_yaml) 59 | _logger.info(f"Environment {my_env.name}:{my_env.version} created") 60 | return my_env 61 | 62 | 63 | class ComponentCollector: 64 | def __init__( 65 | self, 66 | ml_client: MLClient, 67 | component_base_dir: pathlib.Path, 68 | version_string: str, 69 | ): 70 | self._client = ml_client 71 | self._base_dir = component_base_dir 72 | self._version_string = version_string 73 | 74 | def prepare(self): 75 | _logger.info(f"Creating environment") 76 | component_environment = create_environment_from_yaml( 77 | self._client, ENVIRONMENT_FILE, self._version_string 78 | ) 79 | for attr_name, component_string in 
ALL_COMPONENTS.items(): 80 | assert not hasattr(self, attr_name) 81 | _logger.info(f"Creating {component_string} from YAML") 82 | component = create_component_from_yaml( 83 | self._client, 84 | self._base_dir / component_string, 85 | environment=component_environment, 86 | version_string=self._version_string, 87 | ) 88 | _logger.info(f"Adding attribute {attr_name}") 89 | setattr(self, attr_name, component) 90 | 91 | # Quickly put in the Phi2 environment 92 | _logger.info("Working on Phi2 component") 93 | phi2_environment = create_environment_from_yaml( 94 | self._client, PHI2_ENVIRONMENT_FILE, self._version_string 95 | ) 96 | self.jsonl_guidance_phi2 = create_component_from_yaml( 97 | self._client, 98 | self._base_dir / "jsonl_guidance_phi2_component.yaml", 99 | environment=phi2_environment, 100 | version_string=self._version_string, 101 | ) 102 | 103 | _logger.info("Added all components") 104 | 105 | 106 | def get_component_collector( 107 | ml_client: MLClient, version_string: str 108 | ) -> ComponentCollector: 109 | components = ComponentCollector(ml_client, COMPONENTS_DIR, version_string) 110 | components.prepare() 111 | 112 | return components 113 | -------------------------------------------------------------------------------- /azureml/pipelines/configs.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | from dataclasses import dataclass, field 4 | 5 | 6 | @dataclass 7 | class AMLConfig: 8 | workspace_name: str = str() 9 | resource_group: str = str() 10 | subscription_id: str = str() 11 | 12 | 13 | @dataclass 14 | class PipelineConfig: 15 | base_experiment_name: str = str() 16 | tags: Dict[str, str] = field(default_factory=dict) 17 | default_compute_target: str = str() 18 | 19 | 20 | @dataclass 21 | class AOAIConfig: 22 | endpoint: str = str() 23 | model: str = str() 24 | compute_target: str = str() 25 | max_errors: int = int() 26 | workers: int = int() 27 | 28 | 29 | @dataclass 30 | class Phi2Config: 31 | compute_target: str = str() 32 | 33 | 34 | @dataclass 35 | class ZeroShotRunConfig: 36 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 37 | mmlu_dataset: str = str() 38 | mmlu_split: str = str() 39 | guidance_programs: list[str] = field(default_factory=list) 40 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 41 | 42 | 43 | @dataclass 44 | class FewShotConfig: 45 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 46 | mmlu_dataset: str = str() 47 | mmlu_split: str = str() 48 | fewshot_split: str = str() 49 | guidance_program: str = str() 50 | guidance_workers: int = 4 51 | max_errors: int = 5 52 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 53 | 54 | 55 | @dataclass 56 | class KNNConfig: 57 | k_nearest: int = int() 58 | 59 | 60 | @dataclass 61 | class KNNFewshotConfig: 62 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 63 | mmlu_dataset: str = str() 64 | test_split: str = str() 65 | example_split: str = str() 66 | guidance_programs: list[str] = field(default_factory=list) 67 | answer_key: str = str() 68 | knn_config: KNNConfig = field(default_factory=KNNConfig) 69 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 70 | aoai_embedding_config: AOAIConfig = field(default_factory=AOAIConfig) 71 | 72 | 73 | @dataclass 74 | class RandomExamplesConfig: 75 | num_examples: int = int() 76 | random_seed: int = int() 77 | 78 | 79 | @dataclass 80 | class RandomFewshotPipelineConfig: 81 | pipeline: PipelineConfig = 
field(default_factory=PipelineConfig) 82 | mmlu_dataset: str = str() 83 | test_split: str = str() 84 | example_split: str = str() 85 | guidance_programs: list[str] = field(default_factory=list) 86 | answer_key: str = str() 87 | random_examples: RandomExamplesConfig = field(default_factory=RandomExamplesConfig) 88 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 89 | 90 | 91 | @dataclass 92 | class RandomFewshotCoTPipelineConfig: 93 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 94 | mmlu_dataset: str = str() 95 | test_split: str = str() 96 | example_split: str = str() 97 | zeroshot_cot_guidance_program: str = str() 98 | fewshot_cot_guidance_program: str = str() 99 | random_example_config: RandomExamplesConfig = field( 100 | default_factory=RandomExamplesConfig 101 | ) 102 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 103 | 104 | 105 | @dataclass 106 | class KNNFewshotCoTPipelineConfig: 107 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 108 | mmlu_dataset: str = str() 109 | test_split: str = str() 110 | example_split: str = str() 111 | zeroshot_cot_guidance_program: str = str() 112 | fewshot_cot_guidance_program: str = str() 113 | knn_config: KNNConfig = field(default_factory=KNNConfig) 114 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 115 | aoai_embedding_config: AOAIConfig = field(default_factory=AOAIConfig) 116 | 117 | 118 | @dataclass 119 | class BiosBiasJSONPipelineConfig: 120 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 121 | biosbias_dataset: str = str() 122 | json_guidance_program: str = str() 123 | aoai_config: AOAIConfig = field(default_factory=AOAIConfig) 124 | 125 | 126 | @dataclass 127 | class Phi2BiosBiasJSONPipelineConfig: 128 | pipeline: PipelineConfig = field(default_factory=PipelineConfig) 129 | biosbias_dataset: str = str() 130 | json_guidance_programs: list[str] = field(default_factory=list) 131 | phi2_config: Phi2Config = field(default_factory=Phi2Config) 132 | -------------------------------------------------------------------------------- /azureml/pipelines/configs/aml_config_template.yaml: -------------------------------------------------------------------------------- 1 | azureml_config: 2 | workspace_name: 3 | resource_group: 4 | subscription_id: -------------------------------------------------------------------------------- /azureml/pipelines/configs/aoai_config_template.yaml: -------------------------------------------------------------------------------- 1 | default_aoai_config: 2 | endpoint: 3 | model: 4 | compute_target: 5 | max_errors: 10 6 | workers: 10 7 | 8 | # If being used as an embedding config, then the endpoint will look like: 9 | # https://YOUR_RESOURCE_NAME.openai.azure.com/openai/deployments/YOUR_DEPLOYMENT_NAME/embeddings?api-version=2023-05-15 -------------------------------------------------------------------------------- /azureml/pipelines/configs/biosbias_json_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | 6 | zeroshot_config: 7 | pipeline: 8 | base_experiment_name: biosbias_json 9 | tags: 10 | default_compute_target: isolatedcompute 11 | biosbias_dataset: biosbias_small:1 12 | json_guidance_program: simple_biosbias_json.py 13 | aoai_config: ${ default_aoai_config } 14 | -------------------------------------------------------------------------------- /azureml/pipelines/configs/biosbias_json_phi2_config.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | 5 | zeroshot_config: 6 | pipeline: 7 | base_experiment_name: biosbias_json_phi2 8 | tags: 9 | default_compute_target: isolatedcompute 10 | biosbias_dataset: biosbias_small:1 11 | json_guidance_programs: 12 | - simple_biosbias_json_completion.py 13 | - simple_biosbias_json_completion_v2.py 14 | phi2_config: 15 | compute_target: gput4 16 | -------------------------------------------------------------------------------- /azureml/pipelines/configs/fewshot_knn_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | - aoai_embedding_config 6 | 7 | knn_fewshot_config: 8 | pipeline: 9 | base_experiment_name: fewshot_knn 10 | tags: 11 | default_compute_target: isolatedcompute 12 | mmlu_dataset: all_mmlu_datasets 13 | test_split: test 14 | example_split: validation 15 | guidance_programs: 16 | - fewshot.py 17 | - fewshot_as_conversation.py 18 | knn_config: 19 | k_nearest: 5 20 | answer_key: fewshot_answer 21 | aoai_config: ${ default_aoai_config } 22 | aoai_embedding_config: ${ default_aoai_embedding_config } -------------------------------------------------------------------------------- /azureml/pipelines/configs/fewshot_random_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | 6 | random_fewshot_config: 7 | pipeline: 8 | base_experiment_name: fewshot_random 9 | tags: 10 | default_compute_target: isolatedcompute 11 | mmlu_dataset: all_mmlu_datasets 12 | test_split: test 13 | example_split: validation 14 | guidance_programs: 15 | - fewshot.py 16 | - fewshot_as_conversation.py 17 | random_examples: 18 | num_examples: 5 19 | random_seed: 1234987 20 | answer_key: fewshot_answer 21 | aoai_config: ${ default_aoai_config } -------------------------------------------------------------------------------- /azureml/pipelines/configs/knn_fewshot_cot_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | - aoai_embedding_config 6 | 7 | knn_fewshot_cot_config: 8 | pipeline: 9 | base_experiment_name: fewshot_knn_cot 10 | tags: 11 | default_compute_target: isolatedcompute 12 | mmlu_dataset: all_mmlu_datasets 13 | test_split: test 14 | example_split: validation 15 | zeroshot_cot_guidance_program: zero_shot_cot.py 16 | fewshot_cot_guidance_program: fewshot_cot_as_conversation.py 17 | knn_config: 18 | k_nearest: 5 19 | aoai_config: ${ default_aoai_config } 20 | aoai_embedding_config: ${ default_aoai_embedding_config } -------------------------------------------------------------------------------- /azureml/pipelines/configs/knn_fewshot_cot_ensemble_config.yaml: -------------------------------------------------------------------------------- 1 | # This is also for the submit_mmlu_fewshot_knn_cot.py script 2 | 3 | defaults: 4 | - _self_ 5 | - aml_config 6 | - aoai_config 7 | - aoai_embedding_config 8 | 9 | knn_fewshot_cot_config: 10 | pipeline: 11 | base_experiment_name: fewshot_knn_cot_ensemble 12 | tags: 13 | default_compute_target: isolatedcompute 14 | mmlu_dataset: all_mmlu_datasets 15 | test_split: test 16 | example_split: validation 17 | zeroshot_cot_guidance_program: zero_shot_cot.py 18 | fewshot_cot_guidance_program: fewshot_cot_as_conversation_ensemble.py 19 | 
knn_config: 20 | k_nearest: 5 21 | aoai_config: ${ default_aoai_config } 22 | aoai_embedding_config: ${ default_aoai_embedding_config } -------------------------------------------------------------------------------- /azureml/pipelines/configs/random_fewshot_cot_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | 6 | random_fewshot_cot_config: 7 | pipeline: 8 | base_experiment_name: fewshot_random_cot 9 | tags: 10 | default_compute_target: isolatedcompute 11 | mmlu_dataset: all_mmlu_datasets 12 | test_split: test 13 | example_split: validation 14 | zeroshot_cot_guidance_program: zero_shot_cot.py 15 | fewshot_cot_guidance_program: fewshot_cot_as_conversation.py 16 | random_example_config: 17 | num_examples: 5 18 | random_seed: 1234987 19 | aoai_config: ${ default_aoai_config } -------------------------------------------------------------------------------- /azureml/pipelines/configs/zeroshot_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | 6 | zeroshot_config: 7 | pipeline: 8 | base_experiment_name: zeroshot 9 | tags: 10 | default_compute_target: isolatedcompute 11 | mmlu_dataset: all_mmlu_datasets 12 | mmlu_split: test 13 | guidance_programs: 14 | - zero_or_few_shot.py 15 | - zero_or_few_shot_fortran.py 16 | - zero_or_few_shot_alpha.py 17 | - zero_or_few_shot_expert.py 18 | aoai_config: ${ default_aoai_config } 19 | -------------------------------------------------------------------------------- /azureml/pipelines/configs/zeroshot_cot_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - aml_config 4 | - aoai_config 5 | 6 | zeroshot_config: 7 | pipeline: 8 | base_experiment_name: zeroshot_cot 9 | tags: 10 | default_compute_target: isolatedcompute 11 | mmlu_dataset: all_mmlu_datasets 12 | mmlu_split: test 13 | guidance_programs: 14 | - zero_shot_cot.py 15 | aoai_config: ${ default_aoai_config } 16 | -------------------------------------------------------------------------------- /azureml/pipelines/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | COMPONENTS_DIR = (Path(__file__).parent.parent / "components").absolute() 4 | 5 | ENVIRONMENTS_DIR = (Path(__file__).parent.parent / "environments").absolute() 6 | 7 | GUIDANCE_PROGRAMS_DIR = ( 8 | Path(__file__).parent.parent.parent / "guidance_programs" 9 | ).absolute() 10 | 11 | 12 | SCHEMA_DIR = (Path(__file__).parent.parent / "json_schemas").absolute() 13 | 14 | ENVIRONMENT_FILE = ENVIRONMENTS_DIR / "promptbase-env.yaml" 15 | 16 | PHI2_ENVIRONMENT_FILE = ENVIRONMENTS_DIR / "phi2transformer-env.yaml" 17 | 18 | assert COMPONENTS_DIR.exists(), f"Did not find {COMPONENTS_DIR}" 19 | assert ENVIRONMENT_FILE.exists(), f"Did not find {ENVIRONMENT_FILE}" 20 | assert GUIDANCE_PROGRAMS_DIR.exists(), f"Did not find {GUIDANCE_PROGRAMS_DIR}" 21 | assert SCHEMA_DIR.exists(), f"Did not find {SCHEMA_DIR}" 22 | -------------------------------------------------------------------------------- /azureml/pipelines/logging_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | 4 | 5 | def get_standard_logger_for_file(file_path: str) -> logging.Logger: 6 | _logger = logging.getLogger(pathlib.Path(file_path).name) 7 | 
_logger.setLevel(logging.INFO) 8 | return _logger 9 | -------------------------------------------------------------------------------- /azureml/pipelines/submit_mmlu_knn_fewshot.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from dataclasses import dataclass 4 | 5 | import hydra 6 | from hydra.core.config_store import ConfigStore 7 | 8 | import omegaconf 9 | 10 | from azure.identity import DefaultAzureCredential 11 | 12 | from azure.ai.ml import dsl, MLClient, Input 13 | from azure.ai.ml.entities import Pipeline 14 | 15 | from azureml_pipelines import create_knn_fewshot_pipeline 16 | from azureml_utils import get_component_collector 17 | from configs import AMLConfig, KNNFewshotConfig, AOAIConfig 18 | from constants import GUIDANCE_PROGRAMS_DIR 19 | from logging_utils import get_standard_logger_for_file 20 | 21 | _logger = get_standard_logger_for_file(__file__) 22 | 23 | 24 | @dataclass 25 | class PipelineConfig: 26 | knn_fewshot_config: KNNFewshotConfig = omegaconf.MISSING 27 | azureml_config: AMLConfig = omegaconf.MISSING 28 | aoai_config: AOAIConfig = omegaconf.MISSING 29 | aoai_embedding_config: AOAIConfig = omegaconf.MISSING 30 | 31 | 32 | cs = ConfigStore.instance() 33 | cs.store(name="config", node=PipelineConfig) 34 | 35 | 36 | def create_knn_fewshot_pipeline_mmlu( 37 | ml_client: MLClient, run_config: KNNFewshotConfig, version_string: str 38 | ): 39 | components = get_component_collector(ml_client, version_string) 40 | 41 | guidance_inputs = dict() 42 | for prog_filename in run_config.guidance_programs: 43 | k = prog_filename[0:-3] 44 | v = Input( 45 | type="uri_file", 46 | path=GUIDANCE_PROGRAMS_DIR / prog_filename, 47 | mode="download", 48 | ) 49 | guidance_inputs[k] = v 50 | _logger.info(f"Found {len(guidance_inputs)} guidance programs") 51 | 52 | @dsl.pipeline() 53 | def basic_pipeline() -> Pipeline: 54 | mmlu_fetch_job = components.jsonl_mmlu_fetch( 55 | mmlu_dataset=run_config.mmlu_dataset 56 | ) 57 | mmlu_fetch_job.name = f"fetch_mmlu_{run_config.mmlu_dataset}" 58 | 59 | split_outputs = dict() 60 | for k, v in dict( 61 | input=run_config.test_split, example=run_config.example_split 62 | ).items(): 63 | get_split_job = components.uri_folder_to_file( 64 | input_dataset=mmlu_fetch_job.outputs.output_dataset, 65 | filename_pattern=f"{v}.jsonl", 66 | ) 67 | get_split_job.name = f"extract_split_{k}" 68 | split_outputs[k] = get_split_job.outputs.output_dataset 69 | 70 | for progname, prog_input in guidance_inputs.items(): 71 | answer_ds = create_knn_fewshot_pipeline( 72 | components=components, 73 | embedding_config=run_config.aoai_embedding_config, 74 | inference_config=run_config.aoai_config, 75 | input_dataset=split_outputs["input"], 76 | example_dataset=split_outputs["example"], 77 | guidance_program=prog_input, 78 | num_examples=run_config.knn_config.k_nearest, 79 | output_key=run_config.answer_key, 80 | ) 81 | 82 | score_job = components.jsonl_score_multiplechoice( 83 | input_dataset=answer_ds, 84 | correct_key="correct_answer", # Set when MMLU fetching 85 | response_key=run_config.answer_key, 86 | ) 87 | score_job.name = f"score_fewshot_{progname}" 88 | 89 | pipeline = basic_pipeline() 90 | pipeline.experiment_name = ( 91 | f"{run_config.pipeline.base_experiment_name}_{run_config.mmlu_dataset}" 92 | ) 93 | pipeline.display_name = None 94 | pipeline.compute = run_config.pipeline.default_compute_target 95 | if run_config.pipeline.tags: 96 | pipeline.tags.update(run_config.pipeline.tags) 97 | _logger.info("Pipeline created")
created") 98 | 99 | return pipeline 100 | 101 | 102 | @hydra.main(config_path="configs", version_base="1.1") 103 | def main(config: PipelineConfig): 104 | version_string = str(int(time.time())) 105 | _logger.info(f"AzureML object version for this run: {version_string}") 106 | 107 | _logger.info(f"Azure Subscription: {config.azureml_config.subscription_id}") 108 | _logger.info(f"Resource Group: {config.azureml_config.resource_group}") 109 | _logger.info(f"Workspace : {config.azureml_config.workspace_name}") 110 | 111 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 112 | 113 | ws_client = MLClient( 114 | credential=credential, 115 | subscription_id=config.azureml_config.subscription_id, 116 | resource_group_name=config.azureml_config.resource_group, 117 | workspace_name=config.azureml_config.workspace_name, 118 | logging_enable=False, 119 | ) 120 | 121 | pipeline = create_knn_fewshot_pipeline_mmlu( 122 | ws_client, config.knn_fewshot_config, version_string 123 | ) 124 | _logger.info("Submitting pipeline") 125 | submitted_job = ws_client.jobs.create_or_update(pipeline) 126 | _logger.info(f"Submitted: {submitted_job.name}") 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /azureml/pipelines/submit_mmlu_random_fewshot.py: -------------------------------------------------------------------------------- 1 | # Submit a run using: 2 | # python .\submit_mmlu_random_fewshot.py -cn fewshot_random_config 3 | 4 | import time 5 | 6 | from dataclasses import dataclass 7 | 8 | import hydra 9 | from hydra.core.config_store import ConfigStore 10 | 11 | import omegaconf 12 | 13 | from azure.identity import DefaultAzureCredential 14 | from azure.ai.ml import MLClient 15 | 16 | from azure.ai.ml import dsl, Input, MLClient 17 | from azure.ai.ml.entities import Pipeline 18 | 19 | from azureml_pipelines import create_random_fewshot_pipeline 20 | from azureml_utils import get_component_collector 21 | from configs import AMLConfig, RandomFewshotPipelineConfig 22 | from constants import GUIDANCE_PROGRAMS_DIR 23 | from logging_utils import get_standard_logger_for_file 24 | 25 | _logger = get_standard_logger_for_file(__file__) 26 | 27 | 28 | @dataclass 29 | class PipelineConfig: 30 | random_fewshot_config: RandomFewshotPipelineConfig = omegaconf.MISSING 31 | azureml_config: AMLConfig = omegaconf.MISSING 32 | 33 | 34 | cs = ConfigStore.instance() 35 | cs.store(name="config", node=PipelineConfig) 36 | 37 | 38 | def create_fewshot_pipeline( 39 | ml_client: MLClient, run_config: RandomFewshotPipelineConfig, version_string: str 40 | ): 41 | components = get_component_collector(ml_client, version_string) 42 | 43 | guidance_inputs = dict() 44 | for prog_filename in run_config.guidance_programs: 45 | k = prog_filename[0:-3] 46 | v = Input( 47 | type="uri_file", 48 | path=GUIDANCE_PROGRAMS_DIR / prog_filename, 49 | model="download", 50 | ) 51 | guidance_inputs[k] = v 52 | _logger.info(f"Found {len(guidance_inputs)} guidance programs") 53 | 54 | @dsl.pipeline() 55 | def basic_pipeline() -> Pipeline: 56 | mmlu_fetch_job = components.jsonl_mmlu_fetch( 57 | mmlu_dataset=run_config.mmlu_dataset 58 | ) 59 | mmlu_fetch_job.name = f"fetch_mmlu_{run_config.mmlu_dataset}" 60 | 61 | split_outputs = dict() 62 | for k, v in dict( 63 | input=run_config.test_split, example=run_config.example_split 64 | ).items(): 65 | get_split_job = components.uri_folder_to_file( 66 | 
69 | get_split_job.name = f"extract_split_{k}" 70 | split_outputs[k] = get_split_job.outputs.output_dataset 71 | 72 | for progname, prog_input in guidance_inputs.items(): 73 | answer_ds = create_random_fewshot_pipeline( 74 | components=components, 75 | inference_config=run_config.aoai_config, 76 | input_dataset=split_outputs["input"], 77 | example_dataset=split_outputs["example"], 78 | guidance_program=prog_input, 79 | random_examples=run_config.random_examples, 80 | output_key=run_config.answer_key, 81 | ) 82 | 83 | score_job = components.jsonl_score_multiplechoice( 84 | input_dataset=answer_ds, 85 | correct_key="correct_answer", # Set when MMLU fetching 86 | response_key=run_config.answer_key, 87 | ) 88 | score_job.name = f"score_fewshot_{progname}" 89 | 90 | pipeline = basic_pipeline() 91 | pipeline.experiment_name = ( 92 | f"{run_config.pipeline.base_experiment_name}_{run_config.mmlu_dataset}" 93 | ) 94 | pipeline.display_name = None 95 | pipeline.compute = run_config.pipeline.default_compute_target 96 | if run_config.pipeline.tags: 97 | pipeline.tags.update(run_config.pipeline.tags) 98 | _logger.info("Pipeline created") 99 | 100 | return pipeline 101 | 102 | 103 | @hydra.main(config_path="configs", version_base="1.1") 104 | def main(config: PipelineConfig): 105 | version_string = str(int(time.time())) 106 | _logger.info(f"AzureML object version for this run: {version_string}") 107 | 108 | _logger.info(f"Azure Subscription: {config.azureml_config.subscription_id}") 109 | _logger.info(f"Resource Group: {config.azureml_config.resource_group}") 110 | _logger.info(f"Workspace : {config.azureml_config.workspace_name}") 111 | 112 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 113 | 114 | ws_client = MLClient( 115 | credential=credential, 116 | subscription_id=config.azureml_config.subscription_id, 117 | resource_group_name=config.azureml_config.resource_group, 118 | workspace_name=config.azureml_config.workspace_name, 119 | logging_enable=False, 120 | ) 121 | 122 | pipeline = create_fewshot_pipeline( 123 | ws_client, config.random_fewshot_config, version_string 124 | ) 125 | _logger.info("Submitting pipeline") 126 | submitted_job = ws_client.jobs.create_or_update(pipeline) 127 | _logger.info(f"Submitted: {submitted_job.name}") 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /azureml/pipelines/submit_mmlu_zeroshot.py: -------------------------------------------------------------------------------- 1 | # Submit a run using: 2 | # python .\submit_mmlu_zeroshot.py -cn zeroshot_config 3 | 4 | import time 5 | 6 | from dataclasses import dataclass 7 | 8 | import hydra 9 | from hydra.core.config_store import ConfigStore 10 | 11 | import omegaconf 12 | 13 | from azure.identity import DefaultAzureCredential 14 | from azure.ai.ml import MLClient 15 | 16 | from azure.ai.ml import dsl, Input 17 | from azure.ai.ml.entities import Pipeline 18 | 19 | from azureml_pipelines import create_zeroshot_pipeline 20 | from azureml_utils import get_component_collector 21 | from configs import AMLConfig, ZeroShotRunConfig 22 | from constants import GUIDANCE_PROGRAMS_DIR 23 | from logging_utils import get_standard_logger_for_file 24 | 25 | _logger = get_standard_logger_for_file(__file__) 26 | 27 | 28 | @dataclass 29 | class PipelineConfig: 30 | zeroshot_config: ZeroShotRunConfig = omegaconf.MISSING 31 | azureml_config: AMLConfig = omegaconf.MISSING 32 | 33 | 34 | cs = ConfigStore.instance() 35 | cs.store(name="config", node=PipelineConfig)
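# Hydra resolves the "-cn <name>" flag against the YAML files in configs/
# (e.g. zeroshot_config.yaml); the ConfigStore registration above makes the
# PipelineConfig dataclass available as a structured-config schema, which is
# presumably intended to validate the loaded YAML against these typed fields.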
36 | 37 | 38 | def create_mmlu_zeroshot_pipeline( 39 | ml_client: MLClient, run_config: ZeroShotRunConfig, version_string: str 40 | ): 41 | components = get_component_collector(ml_client, version_string) 42 | 43 | guidance_inputs = dict() 44 | for prog_filename in run_config.guidance_programs: 45 | k = prog_filename[0:-3] 46 | v = Input( 47 | type="uri_file", 48 | path=GUIDANCE_PROGRAMS_DIR / prog_filename, 49 | mode="download", 50 | ) 51 | guidance_inputs[k] = v 52 | _logger.info(f"Found {len(guidance_inputs)} guidance programs") 53 | 54 | answer_key = "zeroshot_answer" 55 | 56 | @dsl.pipeline() 57 | def basic_pipeline() -> Pipeline: 58 | mmlu_fetch_job = components.jsonl_mmlu_fetch( 59 | mmlu_dataset=run_config.mmlu_dataset 60 | ) 61 | mmlu_fetch_job.name = f"fetch_mmlu_{run_config.mmlu_dataset}" 62 | 63 | get_split_job = components.uri_folder_to_file( 64 | input_dataset=mmlu_fetch_job.outputs.output_dataset, 65 | filename_pattern=f"{run_config.mmlu_split}.jsonl", 66 | ) 67 | get_split_job.name = f"extract_split_{run_config.mmlu_split}" 68 | 69 | for progname, prog_input in guidance_inputs.items(): 70 | answer_ds = create_zeroshot_pipeline( 71 | pipeline_name=f"{progname}_zeroshot", 72 | pipeline_display_name=f"Zero Shot {progname}", 73 | components=components, 74 | inference_config=run_config.aoai_config, 75 | input_dataset=get_split_job.outputs.output_dataset, 76 | guidance_program=prog_input, 77 | output_key=answer_key, 78 | ) 79 | 80 | score_job = components.jsonl_score_multiplechoice( 81 | input_dataset=answer_ds, 82 | correct_key="correct_answer", # Set when MMLU fetching 83 | response_key=answer_key, 84 | ) 85 | score_job.name = f"zeroshot_score_{progname}" 86 | 87 | pipeline = basic_pipeline() 88 | pipeline.experiment_name = ( 89 | f"{run_config.pipeline.base_experiment_name}_{run_config.mmlu_dataset}" 90 | ) 91 | pipeline.display_name = None 92 | pipeline.compute = run_config.pipeline.default_compute_target 93 | if run_config.pipeline.tags: 94 | pipeline.tags.update(run_config.pipeline.tags) 95 | _logger.info("Pipeline created") 96 | 97 | return pipeline 98 | 99 | 100 | @hydra.main(config_path="configs", version_base="1.1") 101 | def main(config: PipelineConfig): 102 | version_string = str(int(time.time())) 103 | _logger.info(f"AzureML object version for this run: {version_string}") 104 | 105 | _logger.info(f"Azure Subscription: {config.azureml_config.subscription_id}") 106 | _logger.info(f"Resource Group: {config.azureml_config.resource_group}") 107 | _logger.info(f"Workspace : {config.azureml_config.workspace_name}") 108 | 109 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 110 | 111 | ws_client = MLClient( 112 | credential=credential, 113 | subscription_id=config.azureml_config.subscription_id, 114 | resource_group_name=config.azureml_config.resource_group, 115 | workspace_name=config.azureml_config.workspace_name, 116 | logging_enable=False, 117 | ) 118 | 119 | pipeline = create_mmlu_zeroshot_pipeline( 120 | ws_client, config.zeroshot_config, version_string 121 | ) 122 | _logger.info("Submitting pipeline") 123 | submitted_job = ws_client.jobs.create_or_update(pipeline) 124 | _logger.info(f"Submitted: {submitted_job.name}") 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | --------------------------------------------------------------------------------
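Each submit script derives a job key from a guidance program's filename by slicing off the last three characters (prog_filename[0:-3]). The sketch below, using a hypothetical program_key helper that is not part of this repository, shows the equivalent pathlib spelling of that convention:

from pathlib import Path

def program_key(prog_filename: str) -> str:
    # "zero_shot_cot.py" -> "zero_shot_cot"; Path(...).stem is equivalent to
    # prog_filename[0:-3] for filenames ending in ".py"
    return Path(prog_filename).stem

assert program_key("zero_shot_cot.py") == "zero_shot_cot"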
/azureml/pipelines/submit_mmlu_zeroshot_cot.py: -------------------------------------------------------------------------------- 1 | # Submit a run using: 2 | # python .\submit_mmlu_zeroshot_cot.py -cn zeroshot_cot_config 3 | 4 | import time 5 | 6 | from dataclasses import dataclass 7 | 8 | import hydra 9 | from hydra.core.config_store import ConfigStore 10 | 11 | import omegaconf 12 | 13 | from azure.identity import DefaultAzureCredential 14 | from azure.ai.ml import MLClient 15 | 16 | from azure.ai.ml import dsl, Input 17 | from azure.ai.ml.entities import Pipeline 18 | 19 | from azureml_pipelines import create_zeroshot_cot_pipeline 20 | from azureml_utils import get_component_collector 21 | from configs import AMLConfig, ZeroShotRunConfig 22 | from constants import GUIDANCE_PROGRAMS_DIR 23 | from logging_utils import get_standard_logger_for_file 24 | 25 | _logger = get_standard_logger_for_file(__file__) 26 | 27 | 28 | @dataclass 29 | class PipelineConfig: 30 | zeroshot_config: ZeroShotRunConfig = omegaconf.MISSING 31 | azureml_config: AMLConfig = omegaconf.MISSING 32 | 33 | 34 | cs = ConfigStore.instance() 35 | cs.store(name="config", node=PipelineConfig) 36 | 37 | 38 | def create_mmlu_zeroshot_cot_pipeline( 39 | ml_client: MLClient, run_config: ZeroShotRunConfig, version_string: str 40 | ): 41 | components = get_component_collector(ml_client, version_string) 42 | 43 | guidance_inputs = dict() 44 | for prog_filename in run_config.guidance_programs: 45 | k = prog_filename[0:-3] 46 | v = Input( 47 | type="uri_file", 48 | path=GUIDANCE_PROGRAMS_DIR / prog_filename, 49 | mode="download", 50 | ) 51 | guidance_inputs[k] = v 52 | _logger.info(f"Found {len(guidance_inputs)} guidance programs") 53 | 54 | answer_key = "zeroshot_cot_answer" 55 | cot_key = "zeroshot_chain_of_thought" 56 | 57 | @dsl.pipeline() 58 | def basic_pipeline() -> Pipeline: 59 | mmlu_fetch_job = components.jsonl_mmlu_fetch( 60 | mmlu_dataset=run_config.mmlu_dataset 61 | ) 62 | mmlu_fetch_job.name = f"fetch_mmlu_{run_config.mmlu_dataset}" 63 | 64 | get_split_job = components.uri_folder_to_file( 65 | input_dataset=mmlu_fetch_job.outputs.output_dataset, 66 | filename_pattern=f"{run_config.mmlu_split}.jsonl", 67 | ) 68 | get_split_job.name = f"extract_split_{run_config.mmlu_split}" 69 | 70 | for progname, prog_input in guidance_inputs.items(): 71 | answer_ds = create_zeroshot_cot_pipeline( 72 | pipeline_name=f"{progname}_zeroshot_cot", 73 | pipeline_display_name=f"Zero Shot CoT {progname}", 74 | components=components, 75 | inference_config=run_config.aoai_config, 76 | input_dataset=get_split_job.outputs.output_dataset, 77 | guidance_program=prog_input, 78 | output_key=answer_key, 79 | cot_key=cot_key, 80 | ) 81 | 82 | score_job = components.jsonl_score_multiplechoice( 83 | input_dataset=answer_ds, 84 | correct_key="correct_answer", # Set when MMLU fetching 85 | response_key=answer_key, 86 | ) 87 | score_job.name = f"zeroshot_cot_score_{progname}" 88 | 89 | pipeline = basic_pipeline() 90 | pipeline.experiment_name = ( 91 | f"{run_config.pipeline.base_experiment_name}_{run_config.mmlu_dataset}" 92 | ) 93 | pipeline.display_name = None 94 | pipeline.compute = run_config.pipeline.default_compute_target 95 | if run_config.pipeline.tags: 96 | pipeline.tags.update(run_config.pipeline.tags) 97 | _logger.info("Pipeline created") 98 | 99 | return pipeline 100 | 101 | 102 | @hydra.main(config_path="configs", version_base="1.1") 103 | def main(config: PipelineConfig): 104 | version_string = str(int(time.time()))
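    # Epoch seconds serve as a strictly increasing version label for the
    # AzureML assets created by this run (via get_component_collector above),
    # so each submission works against freshly versioned components.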
_logger.info(f"AzureML object version for this run: {version_string}") 106 | 107 | _logger.info(f"Azure Subscription: {config.azureml_config.subscription_id}") 108 | _logger.info(f"Resource Group: {config.azureml_config.resource_group}") 109 | _logger.info(f"Workspace : {config.azureml_config.workspace_name}") 110 | 111 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 112 | 113 | ws_client = MLClient( 114 | credential=credential, 115 | subscription_id=config.azureml_config.subscription_id, 116 | resource_group_name=config.azureml_config.resource_group, 117 | workspace_name=config.azureml_config.workspace_name, 118 | logging_enable=False, 119 | ) 120 | 121 | pipeline = create_mmlu_zeroshot_cot_pipeline( 122 | ws_client, config.zeroshot_config, version_string 123 | ) 124 | _logger.info("Submitting pipeline") 125 | submitted_job = ws_client.jobs.create_or_update(pipeline) 126 | _logger.info(f"Submitted: {submitted_job.name}") 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /azureml/pipelines/submit_simple_biosbias_json.py: -------------------------------------------------------------------------------- 1 | # Submit a run using: 2 | # python .\submit_mmlu_zeroshot.py -cn zeroshot_config 3 | 4 | import time 5 | 6 | from dataclasses import dataclass 7 | 8 | import hydra 9 | from hydra.core.config_store import ConfigStore 10 | 11 | import omegaconf 12 | 13 | from azure.identity import DefaultAzureCredential 14 | from azure.ai.ml import MLClient 15 | 16 | from azure.ai.ml import dsl, Input, MLClient 17 | from azure.ai.ml.entities import Pipeline 18 | 19 | from azureml_pipelines import create_zeroshot_pipeline 20 | from azureml_utils import get_component_collector 21 | from configs import AMLConfig, BiosBiasJSONPipelineConfig 22 | from constants import GUIDANCE_PROGRAMS_DIR 23 | from logging_utils import get_standard_logger_for_file 24 | 25 | _logger = get_standard_logger_for_file(__file__) 26 | 27 | 28 | @dataclass 29 | class PipelineConfig: 30 | zeroshot_config: BiosBiasJSONPipelineConfig = omegaconf.MISSING 31 | azureml_config: AMLConfig = omegaconf.MISSING 32 | 33 | 34 | cs = ConfigStore.instance() 35 | cs.store(name="config", node=PipelineConfig) 36 | 37 | 38 | def create_biosbias_simple_json_pipeline( 39 | ml_client: MLClient, run_config: BiosBiasJSONPipelineConfig, version_string: str 40 | ): 41 | components = get_component_collector(ml_client, version_string) 42 | 43 | guidance_input = Input( 44 | type="uri_file", 45 | path=GUIDANCE_PROGRAMS_DIR / run_config.json_guidance_program, 46 | model="download", 47 | ) 48 | 49 | ds_parts = run_config.biosbias_dataset.split(":") 50 | bios_ds = ml_client.data.get(ds_parts[0], version=ds_parts[1]) 51 | 52 | inference_config = run_config.aoai_config 53 | 54 | @dsl.pipeline() 55 | def basic_pipeline() -> Pipeline: 56 | guidance_job = components.jsonl_guidance( 57 | guidance_program=guidance_input, 58 | guidance_workers=inference_config.workers, 59 | max_errors=inference_config.max_errors, 60 | input_dataset=bios_ds, 61 | azure_openai_endpoint=inference_config.endpoint, 62 | azure_openai_deployed_model=inference_config.model, 63 | ) 64 | guidance_job.name = f"guidance_simple" 65 | guidance_job.compute = inference_config.compute_target 66 | 67 | score_job = components.jsonl_score_biosbias_json( 68 | input_dataset=guidance_job.outputs.output_dataset, 69 | response_key="model_answer", 70 | ) 71 | score_job.name = 
f"score_biosbias_json" 72 | 73 | pipeline = basic_pipeline() 74 | pipeline.experiment_name = ( 75 | f"{run_config.pipeline.base_experiment_name}_{ds_parts[0]}_{ds_parts[1]}" 76 | ) 77 | pipeline.display_name = None 78 | pipeline.compute = run_config.pipeline.default_compute_target 79 | if run_config.pipeline.tags: 80 | pipeline.tags.update(run_config.tags) 81 | _logger.info("Pipeline created") 82 | 83 | return pipeline 84 | 85 | 86 | @hydra.main(config_path="configs", version_base="1.1") 87 | def main(config: PipelineConfig): 88 | version_string = str(int(time.time())) 89 | _logger.info(f"AzureML object version for this run: {version_string}") 90 | 91 | _logger.info(f"Azure Subscription: {config.azureml_config.subscription_id}") 92 | _logger.info(f"Resource Group: {config.azureml_config.resource_group}") 93 | _logger.info(f"Workspace : {config.azureml_config.workspace_name}") 94 | 95 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 96 | 97 | ws_client = MLClient( 98 | credential=credential, 99 | subscription_id=config.azureml_config.subscription_id, 100 | resource_group_name=config.azureml_config.resource_group, 101 | workspace_name=config.azureml_config.workspace_name, 102 | logging_enable=False, 103 | ) 104 | 105 | pipeline = create_biosbias_simple_json_pipeline( 106 | ws_client, config.zeroshot_config, version_string 107 | ) 108 | _logger.info("Submitting pipeline") 109 | submitted_job = ws_client.jobs.create_or_update(pipeline) 110 | _logger.info(f"Submitted: {submitted_job.name}") 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /azureml/pipelines/submit_simple_biosbias_json_phi2.py: -------------------------------------------------------------------------------- 1 | # Submit a run using: 2 | # python .\submit_simple_biosbias_json_phi2.py -cn biosbias_json_phi2_config.yaml 3 | 4 | import time 5 | 6 | from dataclasses import dataclass 7 | 8 | import hydra 9 | from hydra.core.config_store import ConfigStore 10 | 11 | import omegaconf 12 | 13 | from azure.identity import DefaultAzureCredential 14 | from azure.ai.ml import MLClient 15 | 16 | from azure.ai.ml import dsl, Input, MLClient 17 | from azure.ai.ml.entities import Pipeline 18 | 19 | from azureml_utils import get_component_collector 20 | from configs import AMLConfig, Phi2BiosBiasJSONPipelineConfig 21 | from constants import GUIDANCE_PROGRAMS_DIR 22 | from logging_utils import get_standard_logger_for_file 23 | 24 | _logger = get_standard_logger_for_file(__file__) 25 | 26 | 27 | @dataclass 28 | class PipelineConfig: 29 | zeroshot_config: Phi2BiosBiasJSONPipelineConfig = omegaconf.MISSING 30 | azureml_config: AMLConfig = omegaconf.MISSING 31 | 32 | 33 | cs = ConfigStore.instance() 34 | cs.store(name="config", node=PipelineConfig) 35 | 36 | 37 | def create_biosbias_simple_json_pipeline( 38 | ml_client: MLClient, run_config: Phi2BiosBiasJSONPipelineConfig, version_string: str 39 | ): 40 | components = get_component_collector(ml_client, version_string) 41 | 42 | guidance_inputs = dict() 43 | for prog_filename in run_config.json_guidance_programs: 44 | k = prog_filename[0:-3] 45 | v = Input( 46 | type="uri_file", 47 | path=GUIDANCE_PROGRAMS_DIR / prog_filename, 48 | model="download", 49 | ) 50 | guidance_inputs[k] = v 51 | _logger.info(f"Found {len(guidance_inputs)} guidance programs") 52 | 53 | ds_parts = run_config.biosbias_dataset.split(":") 54 | bios_ds = ml_client.data.get(ds_parts[0], 
55 | 56 | @dsl.pipeline() 57 | def basic_pipeline() -> Pipeline: 58 | for progname, prog_input in guidance_inputs.items(): 59 | guidance_job = components.jsonl_guidance_phi2( 60 | guidance_program=prog_input, 61 | input_dataset=bios_ds, 62 | ) 63 | guidance_job.compute = run_config.phi2_config.compute_target 64 | guidance_job.name = f"guidance_simple_{progname}" 65 | 66 | score_job = components.jsonl_score_biosbias_json( 67 | input_dataset=guidance_job.outputs.output_dataset, 68 | response_key="model_answer", 69 | ) 70 | score_job.name = f"score_biosbias_json_{progname}" 71 | 72 | pipeline = basic_pipeline() 73 | pipeline.experiment_name = ( 74 | f"{run_config.pipeline.base_experiment_name}_{ds_parts[0]}_{ds_parts[1]}" 75 | ) 76 | pipeline.display_name = None 77 | pipeline.compute = run_config.pipeline.default_compute_target 78 | if run_config.pipeline.tags: 79 | pipeline.tags.update(run_config.pipeline.tags) 80 | _logger.info("Pipeline created") 81 | 82 | return pipeline 83 | 84 | 85 | @hydra.main(config_path="configs", version_base="1.1") 86 | def main(config: PipelineConfig): 87 | version_string = str(int(time.time())) 88 | _logger.info(f"AzureML object version for this run: {version_string}") 89 | 90 | _logger.info(f"Azure Subscription: {config.azureml_config.subscription_id}") 91 | _logger.info(f"Resource Group: {config.azureml_config.resource_group}") 92 | _logger.info(f"Workspace : {config.azureml_config.workspace_name}") 93 | 94 | credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) 95 | 96 | ws_client = MLClient( 97 | credential=credential, 98 | subscription_id=config.azureml_config.subscription_id, 99 | resource_group_name=config.azureml_config.resource_group, 100 | workspace_name=config.azureml_config.workspace_name, 101 | logging_enable=False, 102 | ) 103 | 104 | pipeline = create_biosbias_simple_json_pipeline( 105 | ws_client, config.zeroshot_config, version_string 106 | ) 107 | _logger.info("Submitting pipeline") 108 | submitted_job = ws_client.jobs.create_or_update(pipeline) 109 | _logger.info(f"Submitted: {submitted_job.name}") 110 | 111 | 112 | if __name__ == "__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /azureml/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-ai-ml 2 | hydra-core -------------------------------------------------------------------------------- /guidance_programs/fewshot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import textwrap 4 | 5 | from typing import Any, Dict 6 | 7 | import guidance 8 | from guidance import gen, select, system, user, assistant 9 | 10 | 11 | _logger = logging.getLogger(__file__) 12 | _logger.setLevel(logging.INFO) 13 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 14 | 15 | 16 | @guidance 17 | def few_shot_multiple_choice( 18 | lm: guidance.models.Chat, 19 | question: str, 20 | choices: list[str], 21 | fewshot_examples: list[dict[str, Any]], 22 | ): 23 | # Some general instruction to the model 24 | with system(): 25 | lm += textwrap.dedent( 26 | """You are a student taking a multiple choice test. 27 | You will be shown a question, followed by numbered multiple choice answers. 28 | Respond with the number corresponding to the best answer. 29 | """ 30 | )
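    # Note: the few-shot examples below are appended to lm outside of any
    # system/user/assistant block; some chat backends may reject raw text
    # outside a role context, which is presumably why the
    # fewshot_as_conversation.py variant (next file) renders the examples as
    # explicit user/assistant turns instead.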
29 | """ 30 | ) 31 | 32 | _logger.debug("Adding few shot examples") 33 | lm += "\nHere are some examples to help you:\n\n" 34 | for i, example in enumerate(fewshot_examples): 35 | lm += f"Example {i}\n" 36 | lm += example["question"] + "\n" 37 | for j, choice in enumerate(example["choices"]): 38 | lm += f"{j} : {choice}\n" 39 | lm += f"Correct Answer: {example['correct_answer']}\n\n" 40 | 41 | lm += "The question you need to answer will be shown next.\n\n" 42 | 43 | with user(): 44 | lm += question + "\n" 45 | for i, choice in enumerate(choices): 46 | lm += f"{i} : {choice}\n" 47 | lm += "Correct Answer: " 48 | 49 | with assistant(): 50 | lm += select([str(i) for i in range(len(choices))], name="string_choice") 51 | 52 | return lm 53 | 54 | 55 | def guidance_generation( 56 | lm: guidance.models.Chat, 57 | input: Dict[str, Any], 58 | common: list[dict[str, Any]] | None = None, 59 | ) -> Dict[str, Any]: 60 | _logger.debug("Starting guidance_generation") 61 | assert common is None, "Unexpected common data" 62 | result = lm + few_shot_multiple_choice( 63 | question=input["question"], 64 | choices=input["choices"], 65 | fewshot_examples=input["fewshot_examples"], 66 | ) 67 | 68 | _logger.debug(f"Result: {result}") 69 | 70 | result = dict(fewshot_choice=int(result["string_choice"])) 71 | return result 72 | -------------------------------------------------------------------------------- /guidance_programs/fewshot_as_conversation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import textwrap 4 | 5 | from typing import Any, Dict 6 | 7 | import guidance 8 | from guidance import gen, select, system, user, assistant 9 | 10 | 11 | _logger = logging.getLogger(__file__) 12 | _logger.setLevel(logging.INFO) 13 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 14 | 15 | 16 | @guidance 17 | def few_shot_multiple_choice( 18 | lm: guidance.models.Chat, 19 | question: str, 20 | choices: list[str], 21 | fewshot_examples: list[dict[str, any]], 22 | ): 23 | # Some general instruction to the model 24 | with system(): 25 | lm += textwrap.dedent( 26 | """You are a student taking a multiple choice test. 27 | You will be shown a question, followed by numbered multiple choice answers. 28 | Response with the number corresponding to the best answer. 
29 | """ 30 | ) 31 | 32 | for example in fewshot_examples: 33 | with user(): 34 | lm += example["question"] + "\n" 35 | for i, choice in enumerate(example["choices"]): 36 | lm += f"{i} : {choice}\n" 37 | lm += f"Correct Answer: " 38 | 39 | with assistant(): 40 | lm += str(example["correct_answer"]) 41 | 42 | with user(): 43 | lm += question + "\n" 44 | for i, choice in enumerate(choices): 45 | lm += f"{i} : {choice}\n" 46 | lm += "Correct Answer: " 47 | 48 | with assistant(): 49 | lm += select([str(i) for i in range(len(choices))], name="string_choice") 50 | 51 | return lm 52 | 53 | 54 | def guidance_generation( 55 | lm: guidance.models.Chat, 56 | input: Dict[str, Any], 57 | common: list[dict[str, Any]] | None = None, 58 | ) -> Dict[str, Any]: 59 | _logger.debug("Starting guidance_generation") 60 | assert common is None, "Unexpected common data" 61 | result = lm + few_shot_multiple_choice( 62 | question=input["question"], 63 | choices=input["choices"], 64 | fewshot_examples=input["fewshot_examples"], 65 | ) 66 | 67 | _logger.debug(f"Result: {result}") 68 | 69 | result = dict(fewshot_choice=int(result["string_choice"])) 70 | return result 71 | -------------------------------------------------------------------------------- /guidance_programs/fewshot_cot_as_conversation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import textwrap 4 | 5 | from typing import Any, Dict 6 | 7 | import guidance 8 | from guidance import gen, select, system, user, assistant 9 | 10 | 11 | _logger = logging.getLogger(__file__) 12 | _logger.setLevel(logging.INFO) 13 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 14 | 15 | 16 | ANSWER_KEY = "string_choice" 17 | COT_KEY = "explanation" 18 | 19 | 20 | @guidance 21 | def few_shot_cot_multiple_choice( 22 | lm: guidance.models.Chat, 23 | question: str, 24 | choices: list[str], 25 | fewshot_examples: list[dict[str, any]], 26 | ): 27 | # Some general instruction to the model 28 | with system(): 29 | lm += textwrap.dedent( 30 | """Answer the following multiple choice **Question**. 31 | First, think step by step and write an **Explanation** for reasoning through the question. 32 | Then, when prompted by the user for a **Final Answer**, analyze your explanation and write just the number of the correct answer. 
33 | Do not say the final answer until the user asks for it.""" 34 | ) 35 | 36 | for example in fewshot_examples: 37 | with user(): 38 | lm += "**Question**\n" 39 | lm += example["question"] + "\n" 40 | for i, choice in enumerate(example["choices"]): 41 | lm += f"{i} : {choice}\n" 42 | lm += "**Explanation**" 43 | 44 | with assistant(): 45 | lm += example["chain_of_thought"] 46 | 47 | with user(): 48 | lm += f"**Final Answer**" 49 | 50 | with assistant(): 51 | lm += str(example["correct_answer"]) 52 | 53 | with user(): 54 | lm += question + "\n" 55 | for i, choice in enumerate(choices): 56 | lm += f"{i} : {choice}\n" 57 | lm += "**Explanation**" 58 | 59 | with assistant(): 60 | lm += gen(name=COT_KEY) 61 | 62 | with user(): 63 | lm += f"**Final Answer**" 64 | 65 | with assistant(): 66 | lm += select([str(i) for i in range(len(choices))], name=ANSWER_KEY) 67 | 68 | return lm 69 | 70 | 71 | def guidance_generation( 72 | lm: guidance.models.Chat, 73 | input: Dict[str, Any], 74 | common: list[dict[str, Any]] | None = None, 75 | ) -> Dict[str, Any]: 76 | _logger.debug("Starting guidance_generation") 77 | assert common is None, "Unexpected common data" 78 | result = lm + few_shot_cot_multiple_choice( 79 | question=input["question"], 80 | choices=input["choices"], 81 | fewshot_examples=input["fewshot_examples"], 82 | ) 83 | 84 | _logger.debug(f"Result: {result}") 85 | 86 | result = dict(fewshot_choice=int(result[ANSWER_KEY]), fewshot_cot=result[COT_KEY]) 87 | return result 88 | -------------------------------------------------------------------------------- /guidance_programs/fewshot_cot_as_conversation_ensemble.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import textwrap 4 | 5 | from typing import Any, Dict, Iterator, TypeVar 6 | 7 | import guidance 8 | from guidance import gen, select, system, user, assistant 9 | 10 | 11 | _logger = logging.getLogger(__file__) 12 | _logger.setLevel(logging.INFO) 13 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 14 | 15 | 16 | ANSWER_KEY = "string_choice" 17 | COT_KEY = "explanation" 18 | 19 | 20 | def validate_and_sort_swaps(swaps: list[int], line_len: int) -> list[int]: 21 | swap_set = set(swaps) 22 | assert len(swap_set) == len(swaps), f"Swaps not unique: {swaps}" 23 | for s in swaps: 24 | assert s - 1 not in swap_set, f"Swaps too close: {s} {swaps}" 25 | assert s + 1 not in swap_set, f"Swaps too close: {s} {swaps}" 26 | assert s >= 0, f"Negative swap: {s}" 27 | assert s < (line_len - 1), f"Swap too large: {s}" 28 | return list(sorted(swaps)) 29 | 30 | 31 | T = TypeVar("T") 32 | 33 | 34 | def apply_swaps(line: list[T], swaps: list[int]) -> list[T]: 35 | sorted_swaps = validate_and_sort_swaps(swaps, len(line)) 36 | 37 | i_swap = 0 38 | result = [] 39 | for i in range(len(line)): 40 | if i_swap < len(sorted_swaps) and i == sorted_swaps[i_swap]: 41 | result.append(line[sorted_swaps[i_swap] + 1]) 42 | elif i_swap < len(sorted_swaps) and i == sorted_swaps[i_swap] + 1: 43 | result.append(line[sorted_swaps[i_swap]]) 44 | i_swap += 1 45 | else: 46 | result.append(line[i]) 47 | return result 48 | 49 | 50 | def plain_hunt_generator(starting_line: list[T]) -> Iterator[T]: 51 | first_element = starting_line[0] 52 | swaps_A = list(range(0, len(starting_line) - (len(starting_line) % 2), 2)) 53 | swaps_B = list(range(1, len(starting_line) - 1, 2)) 54 | all_swaps = [swaps_A, swaps_B] 55 | current = [x for x in starting_line] 56 | line_count = 0 57 | yield current 58 | while True: 59 
| current = apply_swaps(current, all_swaps[line_count % len(all_swaps)]) 60 | yield current 61 | line_count += 1 62 | if current[0] == first_element: 63 | break 64 | 65 | 66 | NUM_PERMUTATIONS = 5 67 | 68 | 69 | @guidance 70 | def few_shot_cot_multiple_choice( 71 | lm: guidance.models.Chat, 72 | question: str, 73 | choices: list[str], 74 | fewshot_examples: list[dict[str, any]], 75 | permutation: list[int], 76 | ): 77 | # Some general instruction to the model 78 | with system(): 79 | lm += textwrap.dedent( 80 | """Answer the following multiple choice **Question**. 81 | First, think step by step and write an **Explanation** for reasoning through the question. 82 | Then, when prompted by the user for a **Final Answer**, analyze your explanation and write just the number of the correct answer. 83 | Do not say the final answer until the user asks for it.""" 84 | ) 85 | 86 | for example in fewshot_examples: 87 | with user(): 88 | lm += "**Question**\n" 89 | lm += example["question"] + "\n" 90 | for i, choice in enumerate(example["choices"]): 91 | lm += f"{i} : {choice}\n" 92 | lm += "**Explanation**" 93 | 94 | with assistant(): 95 | lm += example["chain_of_thought"] 96 | 97 | with user(): 98 | lm += f"**Final Answer**" 99 | 100 | with assistant(): 101 | lm += str(example["correct_answer"]) 102 | 103 | with user(): 104 | lm += question + "\n" 105 | for i in range(len(choices)): 106 | lm += f"{i}: {choices[permutation[i]]}\n" 107 | lm += "**Explanation**" 108 | 109 | with assistant(): 110 | lm += gen(name=COT_KEY) 111 | 112 | with user(): 113 | lm += f"**Final Answer**" 114 | 115 | with assistant(): 116 | lm += select([str(i) for i in range(len(choices))], name=ANSWER_KEY) 117 | 118 | return lm 119 | 120 | 121 | def guidance_generation( 122 | lm: guidance.models.Chat, 123 | input: Dict[str, Any], 124 | common: list[dict[str, Any]] | None = None, 125 | ) -> Dict[str, Any]: 126 | _logger.debug("Starting guidance_generation") 127 | assert common is None, "Unexpected common data" 128 | 129 | num_choices = len(input["choices"]) 130 | 131 | votes = [0 for _ in range(num_choices)] 132 | cots = [] 133 | generator = plain_hunt_generator(list(range(num_choices))) 134 | for i in range(NUM_PERMUTATIONS): 135 | current_permutation = next(generator) 136 | result = lm + few_shot_cot_multiple_choice( 137 | question=input["question"], 138 | choices=input["choices"], 139 | fewshot_examples=input["fewshot_examples"], 140 | permutation=current_permutation, 141 | ) 142 | _logger.debug(f"Result: {result}") 143 | cots.append(result[COT_KEY]) 144 | selected = int(result[ANSWER_KEY]) 145 | actual = current_permutation[selected] 146 | votes[actual] += 1 147 | 148 | _logger.debug(f"Votes: {votes}") 149 | # Check the votes 150 | max_idx = -1 151 | curr_max = 0 152 | for i in range(len(votes)): 153 | if votes[i] > curr_max: 154 | curr_max = votes[i] 155 | max_idx = i 156 | 157 | final_result = dict(fewshot_choice=max_idx, fewshot_cot=cots) 158 | _logger.debug(f"final_result: {final_result}") 159 | return final_result 160 | -------------------------------------------------------------------------------- /guidance_programs/simple_biosbias_json.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for working on the "produce JSON" task 2 | # described by PASTA for the BIASBIOS dataset 3 | 4 | import logging 5 | import json 6 | import sys 7 | 8 | from textwrap import dedent 9 | from typing import Any, Dict 10 | 11 | import guidance 12 | from guidance 
import gen, select, system, user, assistant 13 | 14 | 15 | _logger = logging.getLogger(__file__) 16 | _logger.setLevel(logging.INFO) 17 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 18 | 19 | 20 | NAME_KEY = "given_name" 21 | OCCUPATION_KEY = "occupation" 22 | 23 | 24 | @guidance 25 | def zeroshot_biosbias_json(lm: guidance.models.Chat, short_biography: str): 26 | # Some general instruction to the model 27 | with system(): 28 | lm += dedent( 29 | """You will be shown a short biography of a person by the user. Answer their questions""" 30 | ) 31 | 32 | with user(): 33 | lm += short_biography 34 | 35 | with assistant(): 36 | lm += "OK" 37 | 38 | with user(): 39 | lm += f"What is the given name of the person? Only reply with their name and nothing else." 40 | 41 | with assistant(): 42 | lm += gen(name=NAME_KEY) 43 | 44 | with user(): 45 | lm += dedent( 46 | """Simply state the occupation of the person in lower case. 47 | For example, if a person were an orthodontist, you should state that they are a dentist. 48 | If the person were a freighter pilot, you should state that they are a pilot. 49 | Only reply with their occupation and nothing else.""" 50 | ) 51 | 52 | with assistant(): 53 | lm += gen(name=OCCUPATION_KEY) 54 | 55 | return lm 56 | 57 | 58 | def guidance_generation( 59 | lm: guidance.models.Chat, input: Dict[str, Any], common: Any = None 60 | ) -> Dict[str, Any]: 61 | _logger.debug("Starting guidance_generation") 62 | if common is not None: 63 | _logger.warn("Got unexpected 'common' argument") 64 | result = lm + zeroshot_biosbias_json(short_biography=input["context"]) 65 | 66 | result = dict(name=result[NAME_KEY], occupation=result[OCCUPATION_KEY]) 67 | return dict(model_answer=json.dumps(result)) 68 | -------------------------------------------------------------------------------- /guidance_programs/simple_biosbias_json_completion.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for working on the "produce JSON" task 2 | # described by PASTA for the BIASBIOS dataset 3 | # This version is for a completion model 4 | 5 | import logging 6 | import json 7 | import sys 8 | 9 | from textwrap import dedent 10 | from typing import Any, Dict 11 | 12 | import guidance 13 | from guidance import gen 14 | 15 | 16 | _logger = logging.getLogger(__file__) 17 | _logger.setLevel(logging.INFO) 18 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 19 | 20 | 21 | @guidance 22 | def zeroshot_biosbias_json(lm: guidance.models.Model, short_biography: str): 23 | lm += dedent( 24 | f"""Instruct: You will be shown a short biography of a person. Extract their name and occupation, and return 25 | a JSON object containing these two keys. 
26 | 27 | Output: {short_biography} 28 | """ 29 | ) 30 | lm += gen(name="model_answer") 31 | 32 | return lm 33 | 34 | 35 | def guidance_generation( 36 | lm: guidance.models.Chat, input: Dict[str, Any], common: Any = None 37 | ) -> Dict[str, Any]: 38 | _logger.debug("Starting guidance_generation") 39 | if common is not None: 40 | _logger.warn("Got unexpected 'common' argument") 41 | result = lm + zeroshot_biosbias_json(short_biography=input["context"]) 42 | 43 | result = dict(model_answer=result["model_answer"]) 44 | return result 45 | -------------------------------------------------------------------------------- /guidance_programs/simple_biosbias_json_completion_v2.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for working on the "produce JSON" task 2 | # described by PASTA for the BIASBIOS dataset 3 | # This version is for a completion model 4 | 5 | import logging 6 | import json 7 | import sys 8 | 9 | from textwrap import dedent 10 | from typing import Any, Dict 11 | 12 | import guidance 13 | from guidance import gen 14 | 15 | 16 | _logger = logging.getLogger(__file__) 17 | _logger.setLevel(logging.INFO) 18 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 19 | 20 | 21 | @guidance 22 | def zeroshot_biosbias_json(lm: guidance.models.Model, short_biography: str): 23 | lm += dedent( 24 | f"""Answer the occupation of {short_biography} and generate the answer as json format. 25 | Here is an example: {{"name": , "occupation": ,}}. 26 | Now generate the answer: 27 | """ 28 | ) 29 | _logger.info(f"lm: {lm}") 30 | lm += gen(name="model_answer") 31 | 32 | return lm 33 | 34 | 35 | def guidance_generation( 36 | lm: guidance.models.Chat, input: Dict[str, Any], common: Any = None 37 | ) -> Dict[str, Any]: 38 | _logger.debug("Starting guidance_generation") 39 | if common is not None: 40 | _logger.warn("Got unexpected 'common' argument") 41 | result = lm + zeroshot_biosbias_json(short_biography=input["context"]) 42 | 43 | result = dict(model_answer=result["model_answer"]) 44 | return result 45 | -------------------------------------------------------------------------------- /guidance_programs/zero_or_few_shot.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for doing zero shot multiple choice questions 2 | # It is not what generated the reported results 3 | 4 | import logging 5 | import sys 6 | 7 | from typing import Any, Dict 8 | 9 | import guidance 10 | from guidance import gen, select, system, user, assistant 11 | 12 | 13 | _logger = logging.getLogger(__file__) 14 | _logger.setLevel(logging.INFO) 15 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 16 | 17 | 18 | @guidance 19 | def zero_shot_multiple_choice( 20 | lm: guidance.models.Chat, 21 | question: str, 22 | choices: list[str], 23 | common: list[dict[str, Any]] | None, 24 | ): 25 | # Some general instruction to the model 26 | with system(): 27 | lm += """You are a student taking a multiple choice test. 28 | You will be shown a question, followed by numbered multiple choice answers. 29 | Response with the number corresponding to the best answer. 
30 | """ 31 | 32 | if common: 33 | _logger.debug("Adding few shot examples") 34 | lm += "\nHere are some examples to help you:\n\n" 35 | for i, example in enumerate(common): 36 | lm += f"Example {i}\n" 37 | lm += example["question"] + "\n" 38 | for j, choice in enumerate(example["choices"]): 39 | lm += f"{j} : {choice}\n" 40 | lm += f"Correct Answer: {example['correct_answer']}\n\n" 41 | 42 | lm += "The question you need to answer will be shown next.\n\n" 43 | 44 | with user(): 45 | lm += question + "\n" 46 | for i, choice in enumerate(choices): 47 | lm += f"{i} : {choice}\n" 48 | lm += "Correct Answer: " 49 | 50 | with assistant(): 51 | lm += select([str(i) for i in range(len(choices))], name="string_choice") 52 | 53 | return lm 54 | 55 | 56 | def guidance_generation( 57 | lm: guidance.models.Chat, 58 | input: Dict[str, Any], 59 | common: list[dict[str, Any]] | None = None, 60 | ) -> Dict[str, Any]: 61 | _logger.debug("Starting guidance_generation") 62 | result = lm + zero_shot_multiple_choice( 63 | question=input["question"], choices=input["choices"], common=common 64 | ) 65 | 66 | _logger.debug(f"Result: {result}") 67 | 68 | result = dict(zero_or_few_shot_choice=int(result["string_choice"])) 69 | return result 70 | -------------------------------------------------------------------------------- /guidance_programs/zero_or_few_shot_alpha.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for doing zero shot multiple choice questions 2 | # It is not what generated the reported results 3 | 4 | import logging 5 | import sys 6 | 7 | from typing import Any, Dict 8 | 9 | import guidance 10 | from guidance import select, system, user, assistant 11 | 12 | 13 | _logger = logging.getLogger(__file__) 14 | _logger.setLevel(logging.INFO) 15 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 16 | 17 | ASCII_OFFSET = ord("a") 18 | 19 | 20 | @guidance 21 | def zero_shot_multiple_choice( 22 | lm: guidance.models.Chat, 23 | question: str, 24 | choices: list[str], 25 | common: list[dict[str, Any]] | None, 26 | ): 27 | # Some general instruction to the model 28 | with system(): 29 | lm += """You are a student taking a multiple choice test. 30 | You will be shown a question, followed by numbered multiple choice answers. 31 | Response with the number corresponding to the best answer. 
32 | """ 33 | 34 | if common: 35 | _logger.debug("Adding few shot examples") 36 | lm += "\nHere are some examples to help you:\n\n" 37 | for i, example in enumerate(common): 38 | lm += f"Example {i}\n" 39 | lm += example["question"] + "\n" 40 | for j, choice in enumerate(example["choices"]): 41 | lm += f"{chr(j+ASCII_OFFSET)} : {choice}\n" 42 | lm += ( 43 | f"Correct Answer: {chr(example['correct_answer']+ASCII_OFFSET)}\n\n" 44 | ) 45 | 46 | lm += "The question you need to answer will be shown next.\n\n" 47 | 48 | with user(): 49 | lm += question + "\n" 50 | for i, choice in enumerate(choices): 51 | lm += f"{chr(i+ASCII_OFFSET)} : {choice}\n" 52 | lm += "Correct Answer: " 53 | 54 | with assistant(): 55 | lm += select( 56 | [chr(i + ASCII_OFFSET) for i in range(len(choices))], name="string_choice" 57 | ) 58 | 59 | return lm 60 | 61 | 62 | def guidance_generation( 63 | lm: guidance.models.Chat, 64 | input: Dict[str, Any], 65 | common: list[dict[str, Any]] | None = None, 66 | ) -> Dict[str, Any]: 67 | _logger.debug("Starting guidance_generation") 68 | result = lm + zero_shot_multiple_choice( 69 | question=input["question"], choices=input["choices"], common=common 70 | ) 71 | 72 | _logger.debug(f"Result: {result}") 73 | int_result = ord(result["string_choice"]) - ASCII_OFFSET 74 | 75 | result = dict(zero_or_few_shot_choice=int_result) 76 | return result 77 | -------------------------------------------------------------------------------- /guidance_programs/zero_or_few_shot_expert.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for doing zero shot multiple choice questions 2 | # It is not what generated the reported results 3 | 4 | import logging 5 | import sys 6 | 7 | from typing import Any, Dict 8 | 9 | import guidance 10 | from guidance import gen, select, system, user, assistant 11 | 12 | 13 | _logger = logging.getLogger(__file__) 14 | _logger.setLevel(logging.INFO) 15 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 16 | 17 | 18 | @guidance 19 | def zero_shot_multiple_choice( 20 | lm: guidance.models.Chat, 21 | question: str, 22 | choices: list[str], 23 | common: list[dict[str, Any]] | None, 24 | ): 25 | # Some general instruction to the model 26 | with system(): 27 | lm += """You are an expert validating a multiple choice test. 28 | You will be shown a question, followed by numbered multiple choice answers. 29 | Use your vast expertise to respond with the number corresponding to the best answer. 
30 | """ 31 | 32 | if common: 33 | _logger.debug("Adding few shot examples") 34 | lm += "\nHere are some examples to help you:\n\n" 35 | for i, example in enumerate(common): 36 | lm += f"Example {i}\n" 37 | lm += example["question"] + "\n" 38 | for j, choice in enumerate(example["choices"]): 39 | lm += f"{j} : {choice}\n" 40 | lm += f"Correct Answer: {example['correct_answer']}\n\n" 41 | 42 | lm += "The question you need to answer will be shown next.\n\n" 43 | 44 | with user(): 45 | lm += question + "\n" 46 | for i, choice in enumerate(choices): 47 | lm += f"{i} : {choice}\n" 48 | lm += "Correct Answer: " 49 | 50 | with assistant(): 51 | lm += select([str(i) for i in range(len(choices))], name="string_choice") 52 | 53 | return lm 54 | 55 | 56 | def guidance_generation( 57 | lm: guidance.models.Chat, 58 | input: Dict[str, Any], 59 | common: list[dict[str, Any]] | None = None, 60 | ) -> Dict[str, Any]: 61 | _logger.debug("Starting guidance_generation") 62 | result = lm + zero_shot_multiple_choice( 63 | question=input["question"], choices=input["choices"], common=common 64 | ) 65 | 66 | _logger.debug(f"Result: {result}") 67 | 68 | result = dict(zero_or_few_shot_choice=int(result["string_choice"])) 69 | return result 70 | -------------------------------------------------------------------------------- /guidance_programs/zero_or_few_shot_fortran.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for doing zero shot multiple choice questions 2 | # It is not what generated the reported results 3 | 4 | import logging 5 | import sys 6 | 7 | from typing import Any, Dict 8 | 9 | import guidance 10 | from guidance import select, system, user, assistant 11 | 12 | 13 | _logger = logging.getLogger(__file__) 14 | _logger.setLevel(logging.INFO) 15 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 16 | 17 | 18 | @guidance 19 | def zero_shot_multiple_choice( 20 | lm: guidance.models.Chat, 21 | question: str, 22 | choices: list[str], 23 | common: list[dict[str, Any]] | None, 24 | ): 25 | # Some general instruction to the model 26 | with system(): 27 | lm += """You are a student taking a multiple choice test. 28 | You will be shown a question, followed by numbered multiple choice answers. 29 | Response with the number corresponding to the best answer. 
30 | """ 31 | 32 | if common: 33 | _logger.debug("Adding few shot examples") 34 | lm += "\nHere are some examples to help you:\n\n" 35 | for i, example in enumerate(common): 36 | lm += f"Example {i}\n" 37 | lm += example["question"] + "\n" 38 | for j, choice in enumerate(example["choices"]): 39 | lm += f"{j+1} : {choice}\n" 40 | lm += f"Correct Answer: {example['correct_answer']+1}\n\n" 41 | 42 | lm += "The question you need to answer will be shown next.\n\n" 43 | 44 | with user(): 45 | lm += question + "\n" 46 | for i, choice in enumerate(choices): 47 | lm += f"{i+1} : {choice}\n" 48 | lm += "Correct Answer: " 49 | 50 | with assistant(): 51 | lm += select([str(i + 1) for i in range(len(choices))], name="string_choice") 52 | 53 | return lm 54 | 55 | 56 | def guidance_generation( 57 | lm: guidance.models.Chat, 58 | input: Dict[str, Any], 59 | common: list[dict[str, Any]] | None = None, 60 | ) -> Dict[str, Any]: 61 | _logger.debug("Starting guidance_generation") 62 | result = lm + zero_shot_multiple_choice( 63 | question=input["question"], choices=input["choices"], common=common 64 | ) 65 | 66 | _logger.debug(f"Result: {result}") 67 | int_result = int(result["string_choice"]) 68 | 69 | result = dict(zero_or_few_shot_choice=int_result - 1) 70 | return result 71 | -------------------------------------------------------------------------------- /guidance_programs/zero_shot_cot.py: -------------------------------------------------------------------------------- 1 | # This is a very naive guidance program for doing zero shot multiple choice questions 2 | # with chain-of-thought prompting 3 | # It is not what generated the reported results 4 | 5 | import logging 6 | import sys 7 | 8 | from textwrap import dedent 9 | from typing import Any, Dict 10 | 11 | import guidance 12 | from guidance import gen, select, system, user, assistant 13 | 14 | 15 | _logger = logging.getLogger(__file__) 16 | _logger.setLevel(logging.INFO) 17 | _logger.addHandler(logging.StreamHandler(stream=sys.stdout)) 18 | 19 | 20 | ANSWER_KEY = "string_choice" 21 | COT_KEY = "explanation" 22 | 23 | 24 | @guidance 25 | def zero_shot_cot_multiple_choice( 26 | lm: guidance.models.Chat, question: str, choices: list[str] 27 | ): 28 | # Some general instruction to the model 29 | with system(): 30 | lm += dedent( 31 | """Answer the following multiple choice **Question**. 32 | First, think step by step and write an **Explanation** for reasoning through the question. 33 | Then, when prompted by the user for a **Final Answer**, analyze your explanation and write just the number of the correct answer. 
34 | Do not say the final answer until the user asks for it.""" 35 | ) 36 | 37 | with user(): 38 | lm += "**Question**\n" 39 | lm += question + "\n" 40 | for i, choice in enumerate(choices): 41 | lm += f"{i} : {choice}\n" 42 | lm += "**Explanation**" 43 | 44 | with assistant(): 45 | lm += gen(name=COT_KEY) 46 | 47 | response_choices = [str(i) for i in range(len(choices))] 48 | with user(): 49 | lm += "**Final Answer**" 50 | 51 | with assistant(): 52 | lm += select(response_choices, name=ANSWER_KEY) 53 | 54 | return lm 55 | 56 | 57 | def guidance_generation( 58 | lm: guidance.models.Chat, input: Dict[str, Any], common: Any = None 59 | ) -> Dict[str, Any]: 60 | _logger.debug("Starting guidance_generation") 61 | if common is not None: 62 | _logger.warning("Got unexpected 'common' argument") 63 | result = lm + zero_shot_cot_multiple_choice( 64 | question=input["question"], choices=input["choices"] 65 | ) 66 | 67 | result = dict( 68 | zeroshot_cot_choice=int(result[ANSWER_KEY]), zeroshot_cot=result[COT_KEY] 69 | ) 70 | return result 71 | -------------------------------------------------------------------------------- /images/medprompt_radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/promptbase/bf5d0dcc7f92650e50f351bf3878efbeb6dae385/images/medprompt_radar.png -------------------------------------------------------------------------------- /images/medprompt_sa_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/promptbase/bf5d0dcc7f92650e50f351bf3878efbeb6dae385/images/medprompt_sa_graphic.png -------------------------------------------------------------------------------- /images/mmlu_accuracy_ablation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/promptbase/bf5d0dcc7f92650e50f351bf3878efbeb6dae385/images/mmlu_accuracy_ablation.png -------------------------------------------------------------------------------- /src/promptbase/__init__.py: -------------------------------------------------------------------------------- 1 | from .
import utils, gsm8k 2 | -------------------------------------------------------------------------------- /src/promptbase/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | from promptbase.gsm8k import gsm8k 5 | from promptbase.humaneval import humaneval 6 | from promptbase.math import math 7 | from promptbase.drop import drop 8 | from promptbase.bigbench import bigbench 9 | from promptbase.bigbench.consts import BIGBENCH_SUBJECTS 10 | 11 | import promptbase.mmlu as mmlu 12 | 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | VALID_DATASETS = ["gsm8k", "humaneval", "math", "drop", "bigbench", "mmlu"] 16 | 17 | 18 | def parse_arguments(): 19 | p = argparse.ArgumentParser() 20 | p.add_argument( 21 | "dataset", type=str, choices=VALID_DATASETS, help="Name of dataset to test" 22 | ) 23 | p.add_argument("--subject", type=str, help="Specify the subject for the dataset") 24 | p.add_argument( 25 | "--mode", 26 | type=str, 27 | default="chat", 28 | choices=["chat", "completion"], 29 | help="Prompting mode for the model (chat or completion)", 30 | ) 31 | p.add_argument( 32 | "--list_subjects", 33 | action="store_true", 34 | help="Lists the subjects available for the dataset", 35 | ) 36 | p.add_argument( 37 | "--overwrite", 38 | action="store_true", 39 | help="Overwrites the results of a previous run", 40 | ) 41 | return p.parse_args() 42 | 43 | 44 | def main(): 45 | args = parse_arguments() 46 | 47 | if args.list_subjects: 48 | if args.dataset == "bigbench": 49 | print(BIGBENCH_SUBJECTS) 50 | elif args.dataset == "mmlu": 51 | print("MMLU subjects correspond to the data files prepared by format_mmlu.py") 52 | else: 53 | print(f"Dataset {args.dataset} does not have subjects") 54 | return 55 | 56 | mode = args.mode 57 | 58 | if args.dataset == "gsm8k": 59 | gsm8k.generate() 60 | gsm8k.evaluate() 61 | elif args.dataset == "humaneval": 62 | humaneval.generate() 63 | humaneval.evaluate() 64 | elif args.dataset == "math": 65 | math.generate() 66 | math.evaluate() 67 | elif args.dataset == "drop": 68 | drop.generate() 69 | drop.evaluate() 70 | elif args.dataset == "bigbench": 71 | subject = args.subject if args.subject else "all" 72 | overwrite = args.overwrite 73 | bigbench.generate(subject, overwrite, mode) 74 | bigbench.evaluate(mode) 75 | elif args.dataset == "mmlu": 76 | # Note that to run the MMLU tests, you will need to download the 77 | # data, and then use the 'format_mmlu.py' script 78 | mmlu.generate(args.subject) 79 | mmlu.evaluate_all(args.subject) 80 | else: 81 | raise ValueError(f"Bad dataset: {args.dataset}") 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /src/promptbase/bigbench/__init__.py: -------------------------------------------------------------------------------- 1 | from .bigbench import generate, evaluate 2 | from .consts import * -------------------------------------------------------------------------------- /src/promptbase/bigbench/bigbench.py: -------------------------------------------------------------------------------- 1 | from .bigbench_cot import process_cot 2 | from .bigbench_score import score 3 | from .bigbench_answer import process_answers 4 | from promptbase.bigbench.consts import BIGBENCH_SUBJECTS 5 | 6 | def generate(subject: str, overwrite: bool, mode="chat"): 7 | if subject != "all" and subject not in BIGBENCH_SUBJECTS: 8 | print(f"Invalid subject: {subject}") 9 | return 10 | print(f"Running BigBench generation for subject {subject}") 11 | 
process_cot(subject, overwrite, mode) 12 | process_answers(subject, overwrite, mode) 13 | 14 | def evaluate(mode="chat"): 15 | score(mode) -------------------------------------------------------------------------------- /src/promptbase/bigbench/bigbench_score.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import pathlib 5 | 6 | from promptbase.utils.helpers import get_datasets_path, get_generations_path, get_standard_logger_for_file 7 | 8 | _logger = get_standard_logger_for_file(__file__) 9 | 10 | def score(api_type="chat"): 11 | ground_truth_dir = get_datasets_path() / "BigBench" / "bbh" 12 | if not ground_truth_dir.exists(): 13 | _logger.error(f"Ground truth directory {ground_truth_dir} does not exist") 14 | return 15 | answer_dir = get_generations_path() / "bigbench" / "answers" / api_type 16 | 17 | score_dict = {} 18 | 19 | # loop through json files in ground truth path 20 | for gt_filename in os.listdir(ground_truth_dir): 21 | if not gt_filename.endswith(".json"): 22 | _logger.warning("Skipping non-json file: " + gt_filename) 23 | continue 24 | _logger.info("Processing file: " + gt_filename) 25 | fname_base = gt_filename.split(".")[0] 26 | answer_path = answer_dir / f"{fname_base}_{api_type}_answers.json" 27 | if not os.path.exists(answer_path): 28 | _logger.warning("Answer file does not exist: %s", answer_path) 29 | continue 30 | with open(ground_truth_dir / gt_filename) as f: 31 | ground_truth_data = json.load(f) 32 | with open(answer_path) as f: 33 | answer_data = json.load(f) 34 | 35 | _logger.info("Number of ground truth examples: %s", str(len(ground_truth_data["examples"]))) 36 | _logger.info("Number of answer examples: %s", str(len(answer_data))) 37 | if len(ground_truth_data["examples"]) != len(answer_data): 38 | _logger.warning("Number of examples does not match for file: %s", gt_filename) 39 | continue 40 | 41 | correct_count = 0 42 | total_count = len(ground_truth_data["examples"]) 43 | 44 | for i, gt in enumerate(ground_truth_data["examples"]): 45 | if gt["target"] == answer_data[i]["completion"]: 46 | correct_count += 1 47 | 48 | score_dict[fname_base] = { 49 | "correct": correct_count, 50 | "total": total_count, 51 | "score": correct_count / total_count, 52 | } 53 | 54 | total_correct = 0 55 | total_overall = 0 56 | for k, v in score_dict.items(): 57 | total_correct += v["correct"] 58 | total_overall += v["total"] 59 | 60 | score_dict["overall"] = { 61 | "correct": total_correct, 62 | "total": total_overall, 63 | "score": total_correct / total_overall, 64 | } 65 | 66 | print("Final scores:", score_dict) 67 | 68 | # save as json file 69 | timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") 70 | score_dir = get_generations_path() / "bigbench" / "scores" 71 | score_dir.mkdir(parents=True, exist_ok=True) 72 | with open(score_dir / f"bigbench_scores_{api_type}_{timestamp}.json", "w") as f: 73 | json.dump(score_dict, f) 74 | -------------------------------------------------------------------------------- /src/promptbase/bigbench/consts.py: -------------------------------------------------------------------------------- 1 | BIGBENCH_SUBJECTS = [ 2 | "boolean_expressions", 3 | "causal_judgement", 4 | "date_understanding", 5 | "disambiguation_qa", 6 | "dyck_languages", 7 | "formal_fallacies", 8 | "geometric_shapes", 9 | "hyperbaton", 10 | "logical_deduction_five_objects", 11 | "logical_deduction_seven_objects", 12 | "logical_deduction_three_objects", 13 | "movie_recommendation", 14 | 
"multistep_arithmetic_two", 15 | "navigate", 16 | "object_counting", 17 | "penguins_in_a_table", 18 | "reasoning_about_colored_objects", 19 | "ruin_names", 20 | "salient_translation_error_detection", 21 | "snarks", 22 | "sports_understanding", 23 | "temporal_sequences", 24 | "tracking_shuffled_objects_five_objects", 25 | "tracking_shuffled_objects_seven_objects", 26 | "tracking_shuffled_objects_three_objects", 27 | "web_of_lies", 28 | "word_sorting", 29 | ] 30 | -------------------------------------------------------------------------------- /src/promptbase/datasets/put_datasets_here.txt: -------------------------------------------------------------------------------- 1 | Datasets will be loaded from this folder. Put your datasets here as instructed in the readme. -------------------------------------------------------------------------------- /src/promptbase/drop/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop import generate, evaluate 2 | -------------------------------------------------------------------------------- /src/promptbase/format/format_hellaswag.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import csv 4 | import uuid 5 | 6 | 7 | train_path = "../datasets/hellaswag_train.jsonl" 8 | test_path = "../datasets/hellaswag_test.jsonl" 9 | val_path = "../datasets/hellaswag_val.jsonl" 10 | 11 | 12 | def process_jsonl_file(file_path, split_name): 13 | questions = [] 14 | with open(file_path, "r", encoding="utf-8") as file: 15 | lines = file.readlines() 16 | for i, json_line in enumerate(lines): 17 | question_data = json.loads(json_line) 18 | answer_choices = { 19 | chr(65 + i): answer for i, answer in enumerate(question_data["endings"]) 20 | } 21 | 22 | question_dict = { 23 | "question_number": f"{question_data['ind']}", 24 | "question": question_data["ctx"], 25 | "correct_answer": chr(65 + question_data["label"]), 26 | "has_media": False, # Assuming no media in MMLU dataset 27 | "dataset": "hellaswag", 28 | "id": f"{uuid.uuid4()}", 29 | "split": split_name, 30 | "extra": question_data[ 31 | "activity_label" 32 | ], # Any extra information, if needed 33 | "answer_choices": answer_choices, 34 | } 35 | questions.append(question_dict) 36 | return questions 37 | 38 | 39 | train_questions = process_jsonl_file(train_path, "train") 40 | # test_questions = process_jsonl_file(test_path, "test") 41 | val_questions = process_jsonl_file(val_path, "val") 42 | 43 | print("Train questions: ", len(train_questions)) 44 | # print("Test questions: ", len(test_questions)) 45 | print("Val questions: ", len(val_questions)) 46 | 47 | # all_questions = train_questions + test_questions + val_questions 48 | all_questions = train_questions + val_questions 49 | 50 | with open("hellaswag.json", "w", encoding="utf-8") as json_file: 51 | json.dump(all_questions, json_file, ensure_ascii=False, indent=4) 52 | -------------------------------------------------------------------------------- /src/promptbase/format/format_mmlu.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import json 4 | import pathlib 5 | import uuid 6 | 7 | 8 | ALL_QUESTIONS = "all_questions.json" 9 | ALL_FILENAME_FORMAT = "mmlu_all_{0}.json" 10 | 11 | 12 | def parse_arguments(): 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument("--mmlu_csv_dir", type=pathlib.Path, required=True) 16 | parser.add_argument("--output_path", 
type=pathlib.Path, required=True) 17 | 18 | args = parser.parse_args() 19 | 20 | return args 21 | 22 | 23 | # Function to process a single CSV file and return a list of question dictionaries 24 | def process_csv_file(file_path: pathlib.Path, split_name: str): 25 | questions = [] 26 | with open(file_path, "r", encoding="utf-8") as file: 27 | csv_reader = csv.reader(file) 28 | for i, row in enumerate(csv_reader): 29 | question_text, *answers, correct_answer = row 30 | answer_choices = {chr(65 + i): answer for i, answer in enumerate(answers)} 31 | test_name = file_path.stem 32 | 33 | question_dict = { 34 | "question_number": f"{test_name}_{i}", 35 | "question": question_text, 36 | "correct_answer": correct_answer, 37 | "has_media": False, # Assuming no media in MMLU dataset 38 | "dataset": "MMLU", 39 | "id": f"{uuid.uuid4()}", 40 | "split": split_name, 41 | "extra": test_name, # Any extra information, if needed 42 | "answer_choices": answer_choices, 43 | } 44 | questions.append(question_dict) 45 | return questions 46 | 47 | 48 | def main(mmlu_csv_dir: pathlib.Path, output_path: pathlib.Path): 49 | assert mmlu_csv_dir.is_dir() 50 | assert output_path.is_dir() 51 | all_questions = [] 52 | 53 | splits = dict( 54 | train=mmlu_csv_dir / "auxiliary_train", 55 | dev=mmlu_csv_dir / "dev", 56 | test=mmlu_csv_dir / "test", 57 | val=mmlu_csv_dir / "val", 58 | ) 59 | all_questions_split = dict(train=[], dev=[], test=[], val=[]) 60 | 61 | for split_name, split_path in splits.items(): 62 | for csv_file in split_path.iterdir(): 63 | questions = process_csv_file(csv_file, split_name) 64 | print(json.dumps(questions[3], indent=4, ensure_ascii=False)) 65 | file_path = output_path / f"mmlu_{csv_file.stem}.json" 66 | print(f"Writing {file_path}") 67 | with open( 68 | file_path, 69 | "w", 70 | encoding="utf-8", 71 | ) as json_file: 72 | json.dump(questions, json_file, ensure_ascii=False, indent=4) 73 | all_questions.extend(questions) 74 | all_questions_split[split_name].extend(questions) 75 | 76 | print("Writing all questions") 77 | with open(output_path / ALL_QUESTIONS, "w", encoding="utf-8") as json_file: 78 | json.dump(all_questions, json_file, ensure_ascii=False, indent=4) 79 | 80 | print("Writing all question splits") 81 | for split_name, split_questions in all_questions_split.items(): 82 | file_path = output_path / ALL_FILENAME_FORMAT.format(split_name) 83 | print(f"Writing out all questions for split {split_name} to {file_path}") 84 | with open(file_path, "w", encoding="utf-8") as json_file: 85 | json.dump(split_questions, json_file, ensure_ascii=False, indent=4) 86 | 87 | 88 | if __name__ == "__main__": 89 | args = parse_arguments() 90 | main(args.mmlu_csv_dir, args.output_path) 91 | -------------------------------------------------------------------------------- /src/promptbase/generations/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This directory is used to store any generated output from language models. For example, intermediate results from chain-of-thought prompting could be stored here. 
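4 | 5 | For example, the gsm8k runner appends one JSON record per solved problem to `gsm8k.jsonl` in this directory. A minimal sketch (not part of the pipeline; it assumes that file has already been produced by `gsm8k.generate()`) of loading the records back: 6 | 7 | ```python 8 | import json 9 | from pathlib import Path 10 | 11 | # Each line holds one record of the form {"idx": ..., "answer": ..., "proof": ...} 12 | gen_file = Path("src/promptbase/generations/gsm8k.jsonl") 13 | with gen_file.open() as f: 14 | rows = [json.loads(line) for line in f] 15 | print(f"Loaded {len(rows)} generations") 16 | ```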
-------------------------------------------------------------------------------- /src/promptbase/gsm8k/__init__.py: -------------------------------------------------------------------------------- 1 | from .gsm8k import generate, evaluate 2 | -------------------------------------------------------------------------------- /src/promptbase/gsm8k/gsm8k.py: -------------------------------------------------------------------------------- 1 | # generate.py 2 | import json 3 | import pathlib 4 | 5 | from promptbase.utils.helpers import text_completion, run_batch_jobs 6 | from datasets import load_dataset 7 | 8 | 9 | my_path = pathlib.Path(__file__).parent.resolve() 10 | 11 | 12 | def extract_substrings(text): 13 | parts = text.split(r"\boxed") 14 | matches = [] 15 | 16 | for part in parts[1:]: # Skip the first part as it does not start with \boxed 17 | if part.startswith("{"): 18 | brace_level = 0 19 | for i, char in enumerate(part): 20 | if char == "{": 21 | brace_level += 1 22 | elif char == "}": 23 | brace_level -= 1 24 | if brace_level == 0: 25 | matches.append( 26 | part[1:i] 27 | ) # Extract the content inside the braces 28 | break 29 | 30 | if len(matches) == 0: 31 | return None 32 | 33 | return matches[0] 34 | 35 | 36 | def solve(task): 37 | idx, prompt = task 38 | 39 | for retry in range(5): 40 | response = text_completion( 41 | prompt=prompt, 42 | max_tokens=1200 + retry * 500, 43 | log_file="gsm8k.log", 44 | max_trial=5, 45 | temperature=retry * 0.5, 46 | model="gpt-4-1106-preview", 47 | ) 48 | 49 | if not response["success"]: 50 | answer = None 51 | text = None 52 | else: 53 | text = response["text"] 54 | answer = extract_substrings(text) 55 | 56 | if answer: 57 | break 58 | 59 | if answer: 60 | with open(my_path.parent / "generations" / "gsm8k.jsonl", "a") as f: 61 | f.write(json.dumps({"idx": idx, "answer": answer, "proof": text}) + "\n") 62 | 63 | 64 | def generate(): 65 | ds = load_dataset("gsm8k", "main")["test"] 66 | tasks = [] 67 | for idx, row in enumerate(ds): 68 | prompt = ( 69 | row["question"] 70 | + "\nPlease end your solution with Answer: $\\boxed{number}$ where number is the numerical answer without unit.\nSolution:" 71 | ) 72 | tasks.append((idx, prompt)) 73 | run_batch_jobs(solve, tasks, max_thread=20) 74 | 75 | 76 | def evaluate(): 77 | rows = [] 78 | ds = load_dataset("gsm8k", "main")["test"] 79 | with open(my_path.parent / "generations" / "gsm8k.jsonl", "r") as f: 80 | for line in f: 81 | row = json.loads(line) 82 | row["answer"] = extract_substrings(row["proof"]) 83 | rows.append(row) 84 | 85 | def check_answer(official, student): 86 | return abs(official - student) < (abs(official) + 1e-6) * 1e-6 87 | 88 | n_correct = 0 89 | for i, row in enumerate(rows): 90 | idx = row["idx"] 91 | gpt_answer = None 92 | official_answer = None 93 | official_answer = ds[idx]["answer"].split("####")[1].replace(",", "") 94 | 95 | try: 96 | gpt_answer = ( 97 | row["answer"].replace(",", "").split("\n## ")[0].replace("\\%", "") 98 | ) 99 | 100 | if gpt_answer == official_answer: 101 | n_correct += 1 102 | continue 103 | 104 | official_float = float(official_answer) 105 | gpt_float = float(gpt_answer) 106 | n_correct += check_answer(official_float, gpt_float) 107 | continue 108 | except Exception: 109 | with open("parse.txt", "a") as f: 110 | f.write("=" * 80 + "\n") 111 | f.write(f"idx:{idx}\n") 112 | f.write("official_answer:" + str(official_answer) + "\n") 113 | f.write("gpt_answer:" + str(gpt_answer) + "\n") 114 | f.write("-" * 40 + "\n") 115 | f.write(ds[idx]["answer"] + "\n") 116 | 
f.write("-" * 40 + "\n") 117 | f.write(row["proof"] + "\n") 118 | 119 | print( 120 | "n_correct:", 121 | n_correct, 122 | "n_total:", 123 | len(rows), 124 | "accuracy:", 125 | n_correct / len(rows), 126 | ) 127 | -------------------------------------------------------------------------------- /src/promptbase/humaneval/__init__.py: -------------------------------------------------------------------------------- 1 | from .humaneval import generate, evaluate 2 | -------------------------------------------------------------------------------- /src/promptbase/humaneval/humaneval.py: -------------------------------------------------------------------------------- 1 | # Generate 2 | import hashlib 3 | import json 4 | import math 5 | import re 6 | import traceback 7 | from promptbase import utils 8 | from datasets import load_dataset 9 | from collections import Counter 10 | 11 | _logger = utils.helpers.get_standard_logger_for_file(__file__) 12 | 13 | prompts = [] 14 | chat_mode = False 15 | ds = None 16 | 17 | 18 | def fetch_data(): 19 | _logger.info("Starting fetch_data") 20 | global prompts 21 | global ds 22 | # data_file = utils.fetch_dataset_blob("humaneval") 23 | ds = load_dataset("openai_humaneval") # Dataset.from_file(data_file 24 | _logger.info("Dataset downloaded; starting processing of test split") 25 | for row in ds["test"]: 26 | if chat_mode: 27 | prompt = ( 28 | row["prompt"] 29 | + "\n\nPlease complete the function above together with the function header." 30 | ) 31 | else: 32 | prompt = ( 33 | "## Here is the official solution of one python exercise via only one function:\n" 34 | + row["prompt"] 35 | ) # 118 36 | # prompt = f"## Solution of the coding exercise `{row['entry_point']}`:\n" + row["prompt"] 37 | # prompt = f"## Official solution of the coding exercise `{row['entry_point']}`:\n" + row["prompt"] 38 | prompts.append(prompt) 39 | _logger.info("Completed fetch_data") 40 | 41 | 42 | def extract_substrings(text): 43 | return re.findall(r"```(.*?)```", text, re.DOTALL) 44 | 45 | 46 | def solve(idx): 47 | global prompts 48 | _logger.info(f"Starting solve for index {idx}") 49 | 50 | for retry in range(5): 51 | response = utils.helpers.text_completion( 52 | prompt=prompts[idx], 53 | max_tokens=600, 54 | log_file="human_eval.log", 55 | max_trial=5, 56 | temperature=retry * 0.05, 57 | model="gpt-4-1106-preview", 58 | stop=["##"], 59 | ) 60 | 61 | if not response["success"]: 62 | code = None 63 | else: 64 | if chat_mode: 65 | text = response["text"] 66 | substrings = extract_substrings(text) 67 | substrings = [s for s in substrings if "def " in s] 68 | code = max(substrings, key=len, default="") if substrings else None 69 | else: 70 | code = prompts[idx] + response["text"] 71 | 72 | if code: 73 | break 74 | 75 | if code: 76 | with open("gpt4.jsonl", "a") as f: 77 | f.write(json.dumps({"idx": idx, "code": code}) + "\n") 78 | 79 | 80 | def generate(): 81 | fetch_data() 82 | _logger.info("Running bach jobs") 83 | utils.helpers.run_batch_jobs(solve, range(len(prompts)), max_thread=20) 84 | 85 | 86 | def evaluate(): 87 | _logger.info("Starting evaluate") 88 | # open gpt4.jsonl 89 | rows = [] 90 | with open("gpt4.jsonl") as f: 91 | for line in f: 92 | rows.append(json.loads(line)) 93 | 94 | env = { 95 | "hashlib": hashlib, 96 | "re": re, 97 | "Counter": Counter, 98 | "factorial": math.factorial, 99 | } 100 | n_success = 0 101 | for row in rows: 102 | code = row["code"] 103 | if code.startswith("python"): 104 | code = code[6:] 105 | code = ( 106 | code.split("# Test")[0] 107 | .split("# 
test")[0] 108 | .split("\nprint")[0] 109 | .split("\nassert")[0] 110 | .split("# END")[0] 111 | .split("<|ipynb_marker|>")[0] 112 | .split("\n# Check your answer")[0] 113 | ) 114 | code += ( 115 | "\n" 116 | + ds["test"][row["idx"]]["test"] 117 | + "\ncheck(" 118 | + ds["test"][row["idx"]]["entry_point"] 119 | + ")" 120 | ) 121 | 122 | try: 123 | exec(code, env, env) 124 | n_success += 1 125 | except Exception as e: 126 | err = traceback.format_exc() 127 | if "AssertionError" not in err: 128 | print(traceback.format_exc()) 129 | print(code) 130 | print("=" * 100) 131 | n_success += 0 132 | 133 | _logger.info(f"Number of successes: {n_success}") 134 | _logger.info(f"Number of rows: {len(rows)}") 135 | _logger.info(f"Success rate: {n_success / len(rows)}") 136 | -------------------------------------------------------------------------------- /src/promptbase/math/__init__.py: -------------------------------------------------------------------------------- 1 | from .math import generate, evaluate 2 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/__init__.py: -------------------------------------------------------------------------------- 1 | # from .problem_utils import * 2 | 3 | from .generate import generate 4 | from .eval import evaluate_all 5 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/analyze.py: -------------------------------------------------------------------------------- 1 | from .problem_utils import * 2 | 3 | test_problem = "MMLU_test_chemistry" 4 | 5 | subjects = ( 6 | """Astronomy 7 | College Biology 8 | College Chemistry 9 | College Mathematics 10 | College Medicine 11 | College Physics 12 | Conceptual Physics 13 | Econometrics 14 | Electrical Engineering 15 | Elementary Mathematics 16 | High School Biology 17 | High School Chemistry 18 | High School Macroeconomics 19 | High School Mathematics 20 | High School Microeconomics 21 | High School Physics 22 | High School Statistics 23 | Machine Learning 24 | Professional Accounting 25 | Professional Medicine""".replace( 26 | " ", "_" 27 | ) 28 | .lower() 29 | .split("\n") 30 | ) 31 | 32 | 33 | # Load problems 34 | cot_rows_list = [ 35 | load_problems(f"expt/{test_problem}/cot_knn/result"), 36 | load_problems(f"expt/{test_problem}/cot_via_knn/result"), 37 | ] 38 | 39 | 40 | def merge_ds(dataset_list): 41 | cot_rows = {} 42 | for rows_set in dataset_list: 43 | for row in rows_set: 44 | if row["question_number"] not in cot_rows: 45 | cot_rows[row["question_number"]] = copy.copy(row) 46 | cot_rows[row["question_number"]]["expt"] = {} 47 | if "expt" in row and row["expt"]: 48 | for key in row["expt"]: 49 | cot_rows[row["question_number"]]["expt"][key] = row["expt"][key] 50 | return list(cot_rows.values()) 51 | 52 | 53 | cot_rows = merge_ds(cot_rows_list) 54 | logprobs_rows = load_problems(f"expt/{test_problem}/logprobs5/result") 55 | 56 | if cot_rows: 57 | print("Number of COT:", len(cot_rows[42]["expt"].keys())) 58 | if logprobs_rows: 59 | print("Number of logprobs:", len(logprobs_rows[42]["expt"].keys())) 60 | 61 | # Merge datasets 62 | rows = {} 63 | for row in cot_rows: 64 | key = row["question_number"] 65 | if key not in rows: 66 | rows[key] = {} 67 | rows[key]["question"] = row["question"] 68 | rows[key]["subject"] = row["extra"].replace("_test", "").replace("_dev", "") 69 | rows[key]["answer"] = row["correct_answer"] 70 | expts = row["expt"] 71 | rows[key]["cot"] = [ 72 | expts[expt]["answer"] 73 | for expt in expts 
74 | if expts[expt].get("answer", None) is not None 75 | ] 76 | 77 | for row in logprobs_rows: 78 | key = row["question_number"] 79 | if key not in rows: 80 | rows[key] = {} 81 | rows[key]["question"] = row["question"] 82 | rows[key]["subject"] = row["extra"].replace("_test", "").replace("_dev", "") 83 | rows[key]["answer"] = row["correct_answer"] 84 | expts = row["expt"] 85 | rows[key]["logprobs"] = [ 86 | expts[expt]["scores"] 87 | for expt in expts 88 | if expts[expt].get("scores", None) is not None 89 | ] 90 | 91 | rows = list(rows.values()) 92 | 93 | n_correct = 0 94 | for row in rows: 95 | if "cot" in row: 96 | x = Counter(row["cot"]) 97 | for k in x: 98 | x[k] /= len(row["cot"]) 99 | else: 100 | x = {} 101 | 102 | if "logprobs" in row: 103 | for e in row["logprobs"]: 104 | for k in e: 105 | if k not in x: 106 | x[k] = 0 107 | if row["subject"] in subjects: 108 | x[k] += 0.5 * e[k] / len(row["logprobs"]) 109 | else: 110 | x[k] += 2.0 * e[k] / len(row["logprobs"]) 111 | 112 | if x: 113 | selected_answer = max(x, key=x.get) 114 | if row["answer"] == selected_answer: 115 | n_correct += 1 116 | else: 117 | n_correct += 1 / 4 118 | 119 | print("Number of questions:", len(rows)) 120 | print("Number of correct answers:", n_correct) 121 | print("Accuracy:", n_correct / len(rows)) 122 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/embed_problems.py: -------------------------------------------------------------------------------- 1 | import argparse, gzip, json 2 | from tqdm import tqdm 3 | from .eval import * 4 | from .utils import * 5 | 6 | 7 | def embed_file(file_name): 8 | ds = load_json_file(file_name) 9 | questions = [row["question"] for row in ds] 10 | embeddings = embed_batch(questions) 11 | for row, embedding in tqdm(zip(ds, embeddings)): 12 | row["embedding"] = embedding 13 | 14 | with gzip.open(file_name + ".gz", "wt") as f: 15 | json.dump(ds, f) 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("question_file", help="The JSON file containing user answers") 21 | args = parser.parse_args() 22 | 23 | embed_file(args.question_file) 24 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/eval.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import pathlib 4 | from typing import Any 5 | import sklearn.metrics as skm 6 | 7 | from .mmlu_paths import mmlu_data_dir, mmlu_generations_dir 8 | 9 | API_DATA_KEYS = ["api_calls", "tokens_used_prompt", "tokens_used_completion"] 10 | 11 | 12 | def load_json_file(file_path): 13 | if type(file_path) is str: 14 | file_path = pathlib.Path(file_path) 15 | 16 | gz_path = file_path.with_suffix(file_path.suffix + ".gz") 17 | print(f"Looking for: {gz_path}") 18 | if gz_path.exists(): 19 | print("Found zip file") 20 | with gzip.open(gz_path, "rt") as f: 21 | return json.load(f) 22 | else: 23 | print("Found regular file") 24 | with open(file_path, "r", encoding="utf-8") as f: 25 | return json.load(f) 26 | 27 | 28 | def eval_answers(all_questions) -> dict[str, Any]: 29 | y_true = [] 30 | y_pred = [] 31 | answer_counts = [] 32 | skipped = 0 33 | for item in all_questions: 34 | answer_voting = dict() 35 | for response in item["expt"].values(): 36 | if response["answer"] in answer_voting: 37 | answer_voting[response["answer"]] += 1 38 | else: 39 | answer_voting[response["answer"]] = 1 40 | best_answer = "" 41 | best_count = 0 42 | for k, v in answer_voting.items(): 43 | 
if v > best_count: 44 | best_answer, best_count = k, v 45 | if not best_answer: 46 | skipped += 1 47 | continue 48 | y_true.append(item["correct_answer"]) 49 | answer_counts.append(len(answer_voting)) 50 | y_pred.append(best_answer) 51 | 52 | result = dict() 53 | result["count"] = len(y_true) 54 | result["accuracy"] = skm.accuracy_score(y_true, y_pred) 55 | result["skipped"] = skipped 56 | result["mean_different_answers"] = sum(answer_counts) / len(answer_counts) 57 | 58 | return result 59 | 60 | 61 | def evaluate_all(dataset_name: str): 62 | dev_problem = f"mmlu_{dataset_name}_val" 63 | test_problem = f"mmlu_{dataset_name}_test" 64 | 65 | print(f"Starting evaluation of {dataset_name}") 66 | 67 | variants = { 68 | "cot": dev_problem, 69 | "cot_knn": test_problem, 70 | "cot_via_knn": test_problem, 71 | } 72 | 73 | for k, v in variants.items(): 74 | print(f"Evaluating {v}") 75 | # Note that output we have in the directory appears to be a gzip 76 | all_generated_data = load_json_file( 77 | mmlu_generations_dir / "expt" / v / k / "result.json" 78 | ) 79 | stats = eval_answers(all_generated_data) 80 | print(f"{json.dumps(stats, indent=4)}") 81 | print("Evaluations complete") 82 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | from . import MMLU 5 | from .embed_problems import embed_file 6 | from .mmlu_paths import mmlu_data_dir, mmlu_generations_dir 7 | 8 | model_name = "gpt-4-1106-preview" 9 | 10 | 11 | def generate(dataset_name: str): 12 | dev_problem = f"mmlu_{dataset_name}_val" 13 | test_problem = f"mmlu_{dataset_name}_test" 14 | 15 | if not os.path.exists(str(mmlu_data_dir / dev_problem) + ".json.gz"): 16 | embed_file(str(mmlu_data_dir / dev_problem) + ".json") 17 | 18 | if not os.path.exists(str(mmlu_data_dir / test_problem) + ".json.gz"): 19 | embed_file(str(mmlu_data_dir / test_problem) + ".json") 20 | 21 | MMLU.generate_solutions_without_rank( 22 | dev_problem, run_name=f"{dev_problem}/cot", model=model_name 23 | ) 24 | MMLU.run_cot_without_rank( 25 | test_problem, 26 | run_name=f"{test_problem}/cot_knn", 27 | examples=str( 28 | mmlu_generations_dir / "expt" / dev_problem / "cot" / "result" 29 | ), 30 | mode="knn", 31 | num_examples=5, 32 | num_repeat=5, 33 | max_thread=50, 34 | model=model_name, 35 | ) 36 | MMLU.run_cot_without_rank( 37 | test_problem, 38 | run_name=f"{test_problem}/cot_via_knn", 39 | examples=str( 40 | mmlu_generations_dir / "expt" / test_problem / "cot_knn" / "result" 41 | ), 42 | mode="knn", 43 | num_examples=5, 44 | num_repeat=15, 45 | max_thread=50, 46 | model=model_name, 47 | ) 48 | if False: 49 | # Logprobs not currently available in OpenAI API 50 | MMLU.run_logprobs( 51 | test_problem, 52 | run_name=f"{test_problem}/logprobs5", 53 | num_examples=5, 54 | num_repeat=10, 55 | max_thread=50, 56 | model=model_name, 57 | ) 58 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/mmlu_paths.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | _my_path = pathlib.Path(__file__).parent.resolve() 4 | 5 | mmlu_data_dir = _my_path.parent / "datasets" / "mmlu" 6 | 7 | mmlu_generations_dir = _my_path.parent / "generations" 8 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/print_results.py: 
-------------------------------------------------------------------------------- 1 | from .problem_utils import * 2 | import gzip 3 | 4 | 5 | def load_problems(file_name): 6 | with gzip.open(file_name + ".json.gz", "rt") as f: 7 | problems = json.loads(f.read()) 8 | return problems 9 | 10 | 11 | # Load problems from the file 12 | problems = load_problems(f"expt/final/MMLU_medical_genetics/logits0/result") 13 | 14 | # Compute statistics on the loaded problems 15 | summary = compute_statistics(problems) 16 | print(summary) 17 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/test.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | 4 | from . import MMLU 5 | from .embed_problems import * 6 | from .problem_utils import * 7 | 8 | dev_name = "MMLU_dev" 9 | test_name = "MMLU_test" 10 | dev_name = "MMLU_chemistry" 11 | test_name = "MMLU_chemistry" 12 | 13 | # embed questions 14 | if not os.path.exists(problem_files[dev_name] + ".json.gz"): 15 | embed_file(problem_files[dev_name] + ".json") 16 | 17 | if not os.path.exists(problem_files[test_name] + ".json.gz"): 18 | embed_file(problem_files[test_name] + ".json") 19 | 20 | # generate cot solutions on dev set 21 | if not os.path.exists(f"mmlu/expt/{dev_name}/cot/result.json.gz"): 22 | MMLU.run_cot(dev_name, example_selector="random", max_thread=50) 23 | 24 | # generate cot solutions on test set via dev set 25 | if not os.path.exists(f"mmlu/expt/{test_name}/cot_merged.json.gz"): 26 | 27 | def generate_test_cot_initial(index): 28 | MMLU.run_cot( 29 | test_name, 30 | run_name=f"{test_name}/cot_{index}", 31 | examples=f"expt/{dev_name}/cot/result", 32 | num_repeat=1, 33 | max_thread=30, 34 | num_examples=5, 35 | example_selector="knn", 36 | model="gpt-4-1106-preview", 37 | ) 38 | return "Done!" 39 | 40 | with multiprocessing.Pool(processes=5) as pool: 41 | results = pool.map(generate_test_cot_initial, range(5)) 42 | 43 | cot_rows1 = load_problems(f"expt/{test_name}/cot_0/result") 44 | cot_rows2 = load_problems(f"expt/{test_name}/cot_1/result") 45 | cot_rows3 = load_problems(f"expt/{test_name}/cot_2/result") 46 | cot_rows4 = load_problems(f"expt/{test_name}/cot_3/result") 47 | cot_rows5 = load_problems(f"expt/{test_name}/cot_4/result") 48 | 49 | def merge_ds(dataset_list): 50 | cot_rows = {} 51 | for rows_set in dataset_list: 52 | for row in rows_set: 53 | if row["question_number"] not in cot_rows: 54 | cot_rows[row["question_number"]] = copy.copy(row) 55 | cot_rows[row["question_number"]]["expt"] = {} 56 | for key in row["expt"]: 57 | cot_rows[row["question_number"]]["expt"][key] = row["expt"][key] 58 | return list(cot_rows.values()) 59 | 60 | cot_rows = merge_ds([cot_rows1, cot_rows2, cot_rows3, cot_rows4, cot_rows5]) 61 | save_problems(f"expt/{test_name}/cot_merged", cot_rows) 62 | 63 | 64 | # solutions on test set 65 | 66 | 67 | ## generate cot solutions on test set via test set 68 | def generate_test_cot(index): 69 | MMLU.run_cot_without_rank( 70 | test_name, 71 | run_name=f"{test_name}/cot_via_test_{index}_v8", 72 | examples=f"mmlu/expt/{test_name}/cot_merged", 73 | num_repeat=1, 74 | max_thread=30, 75 | num_examples=5, 76 | mode="knn", 77 | model="gpt-4-1106-preview", 78 | ) 79 | return "Done!" 
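# The pool below mirrors the earlier block: it runs five generate_test_cot passes in parallel, each writing an independent chain-of-thought run (cot_via_test_{index}_v8) over the test set; the per-run answers can then be combined by majority voting downstream.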
80 | 81 | 82 | with multiprocessing.Pool(processes=15) as pool: 83 | results = pool.map(generate_test_cot, range(5)) 84 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/tune_parameter/analyze.py: -------------------------------------------------------------------------------- 1 | import json, copy 2 | import numpy as np 3 | from tqdm import tqdm 4 | from collections import Counter 5 | 6 | with open("summary.json") as f: 7 | data = json.load(f) 8 | 9 | 10 | def calculate_result(rows): 11 | best_weight = 0 12 | best_acc = 0 13 | for weight in np.arange(0, 2, 0.01): 14 | n_correct = 0 15 | n_cnt = 0 16 | for row in rows: 17 | x = copy.deepcopy(row["cot"]) 18 | for k in row["logprob"]: 19 | x[k] = x.get(k, 0) + weight * row["logprob"][k] 20 | 21 | selected_answer = max(x, key=x.get) 22 | n_cnt += 1 23 | if row["answer"] == selected_answer: 24 | n_correct += 1 25 | acc = n_correct / len(rows) 26 | if acc > best_acc: 27 | best_acc = acc 28 | best_weight = weight 29 | return best_acc, best_weight 30 | 31 | 32 | # 89.93 33 | subject_weight = 0.5 34 | non_subject_weight = 1.2 35 | subject_list = [] 36 | total_correct = 0 37 | total_count = 0 38 | if 1: 39 | for subject in data: 40 | print(subject) 41 | rows = data[subject] 42 | 43 | # use best threshold to process each row 44 | for i, row in tqdm(enumerate(rows)): 45 | rows_i = [item for index, item in enumerate(rows) if index != i] 46 | acc, weight = calculate_result(rows_i) 47 | x = row["cot"] 48 | 49 | for k in row["logprob"]: 50 | x[k] = x.get(k, 0) + weight * row["logprob"][k] 51 | selected_answer = max(x, key=x.get) 52 | total_count += 1 53 | if row["answer"] == selected_answer: 54 | total_correct += 1 55 | 56 | if 0: 57 | for subject in tqdm(data): 58 | rows = data[subject] 59 | subject_acc = calculate_result(rows, subject_weight) 60 | non_subject_acc = calculate_result(rows, non_subject_weight) 61 | if subject_acc > non_subject_acc: 62 | weight = subject_weight 63 | else: 64 | weight = non_subject_weight 65 | 66 | # use best threshold to process each row 67 | for i, row in enumerate(rows): 68 | x = row["cot"] 69 | for k in row["logprob"]: 70 | x[k] = x.get(k, 0) + weight * row["logprob"][k] 71 | selected_answer = max(x, key=x.get) 72 | total_count += 1 73 | if row["answer"] == selected_answer: 74 | total_correct += 1 75 | 76 | print(f"total_correct: {total_correct}") 77 | print(f"total_count: {total_count}") 78 | print(f"accuracy: {total_correct / total_count}") 79 | # save best_thresholds to best_thresholds.json 80 | with open("best_thresholds.json", "w") as f: 81 | json.dump(subject_list, f, indent=4) 82 | -------------------------------------------------------------------------------- /src/promptbase/mmlu/tune_parameter/summarize.py: -------------------------------------------------------------------------------- 1 | import json 2 | import copy 3 | 4 | from .problem_utils import * 5 | 6 | cot_rows1 = load_problems("mmlu/expt/final/MMLU_test/cot_without_rank_knn_5_v0/result") 7 | cot_rows2 = load_problems( 8 | "mmlu/expt/final/MMLU_test/cot_without_rank_knn_5_gpt-4-1106-preview/result" 9 | ) 10 | cot_rows3 = load_problems( 11 | "mmlu/expt/final/MMLU_test/cot_without_rank_knn_5_gpt-4-1106-preview/result" 12 | ) 13 | logprobs_rows1 = load_problems("mmlu/expt/final/MMLU_test/logprobs5_MMLU_dev/result") 14 | logprobs_rows2 = load_problems("mmlu/expt/final/MMLU_test/logprobs5_MMLU_test/result") 15 | 16 | 17 | 18 | 19 | def merge_ds(dataset_list): 20 | cot_rows
= {} 21 | for rows_set in dataset_list: 22 | for row in rows_set: 23 | if row["question_number"] not in cot_rows: 24 | cot_rows[row["question_number"]] = copy.deepcopy(row) 25 | else: 26 | cot_rows[row["question_number"]]["expt"].update(row["expt"]) 27 | return list(cot_rows.values()) 28 | 29 | 30 | cot_rows = merge_ds([cot_rows1, cot_rows2, cot_rows3]) 31 | logprobs_rows = merge_ds([logprobs_rows1, logprobs_rows2]) 32 | 33 | rows = {} 34 | for row in cot_rows: 35 | key = row["question_number"] 36 | if key not in rows: 37 | rows[key] = {} 38 | rows[key]["question"] = row["question"] 39 | rows[key]["subject"] = row["extra"].replace("_test", "") 40 | rows[key]["answer"] = row["correct_answer"] 41 | expts = row["expt"] 42 | rows[key]["cot"] = [ 43 | expts[expt]["answer"] 44 | for expt in expts 45 | if expts[expt].get("answer", None) is not None 46 | ] 47 | 48 | for row in logprobs_rows: 49 | key = row["question_number"] 50 | if key not in rows: 51 | rows[key] = {} 52 | rows[key]["question"], rows[key]["answer"] = row["question"], row["correct_answer"] 53 | rows[key]["subject"] = row["extra"].replace("_test", "") 54 | expts = row["expt"] 55 | rows[key]["logprobs"] = [ 56 | expts[expt]["scores"] 57 | for expt in expts 58 | if expts[expt].get("scores", None) is not None 59 | ] 60 | 61 | rows = list(rows.values()) 62 | 63 | data = {} 64 | for row in rows: 65 | if row["subject"] not in data: 66 | data[row["subject"]] = [] 67 | 68 | scores_logprob = {} 69 | for e in row.get("logprobs", []): 70 | for k in e: 71 | scores_logprob[k] = scores_logprob.get(k, 0) + e[k] / len(row["logprobs"]) 72 | 73 | scores_cot = Counter(row.get("cot", [])) 74 | for k in scores_cot: 75 | scores_cot[k] /= len(row["cot"]) 76 | 77 | data[row["subject"]].append( 78 | {"logprob": scores_logprob, "cot": scores_cot, "answer": row["answer"]} 79 | ) 80 | 81 | # save data to summary.json 82 | with open("summary.json", "w") as f: 83 | json.dump(data, f, indent=4) 84 | -------------------------------------------------------------------------------- /src/promptbase/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import helpers 2 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="promptbase", 5 | version="0.1.0", 6 | author="Microsoft", 7 | description="Advanced prompting for advanced intelligence", 8 | # url="https://github.com/repo", # Replace with the URL of your project 9 | packages=find_packages(), 10 | install_requires=[ 11 | "datasets", 12 | "tqdm", 13 | "openai", 14 | "python-liquid", 15 | "GitPython", 16 | "torch", 17 | "scikit-learn", 18 | ], 19 | python_requires=">=3.10", # The X | Y annotations in the guidance programs require 3.10 20 | ) 21 | --------------------------------------------------------------------------------
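As a closing usage sketch (hypothetical; it assumes the package has been installed from `src/` and that the OpenAI credentials used by `promptbase.utils.helpers` are configured), the command-line entry point in `__main__.py` is equivalent to:

```python
# Roughly what `python -m promptbase gsm8k` does:
from promptbase.gsm8k import gsm8k

gsm8k.generate()  # prompts gpt-4-1106-preview, appending records to generations/gsm8k.jsonl
gsm8k.evaluate()  # scores the extracted \boxed{...} answers against the GSM8K test split
```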