├── .gitignore ├── .vscode └── settings.json ├── Examples ├── FormRecognizer │ ├── Balance_sheet_analysis.ipynb │ ├── FormRecognizerExamples.ipynb │ ├── Life_application_example.png │ ├── Life_application_example.png.json │ ├── income_table.png │ └── ms_fy23_q1_html.htm ├── Language │ ├── CustomerServiceCall.ipynb │ ├── Loan_call.ipynb │ ├── Loan_call.json │ ├── Pharmacy_call.ipynb │ └── Pharmacy_call.json ├── OpenSource │ ├── ConvFinQA-benchmark │ │ ├── README.md │ │ ├── convfinqa-chat-turns.ipynb │ │ ├── convfinqa-chatcompatible-test_private.ipynb │ │ ├── convfinqa-chatcompatible.ipynb │ │ ├── convfinqa.ipynb │ │ ├── data │ │ │ ├── SVAMP.json │ │ │ ├── aqua_test.jsonl │ │ │ ├── convfinqa_dev.json │ │ │ ├── convfinqa_test_turn.json │ │ │ ├── dev.json │ │ │ ├── finqa_test.json │ │ │ ├── gsm8K.json │ │ │ ├── tabmwp_test.json │ │ │ ├── tatqa_dev.json │ │ │ └── test_private.json │ │ ├── eval_tatqa │ │ │ ├── __init__.py │ │ │ ├── tatqa_eval.py │ │ │ ├── tatqa_metric.py │ │ │ ├── tatqa_metric_test.py │ │ │ ├── tatqa_utils.py │ │ │ └── tatqa_utils_test.py │ │ ├── finqa-chatcompatible.ipynb │ │ ├── outputs │ │ │ ├── benchmark-score-finQA.ipynb │ │ │ ├── benchmark-score.ipynb │ │ │ ├── convfinqa_direct_gpt-35-turbo_04_10_12_30.jsonl │ │ │ ├── convfinqa_direct_gpt-35-turbo_04_13_07_36.jsonl │ │ │ ├── convfinqa_direct_gpt-4_04_10_13_02.jsonl │ │ │ ├── convfinqa_direct_gpt-4_04_12_20_53.jsonl │ │ │ ├── convfinqa_direct_gpt-4_04_13_07_21.jsonl │ │ │ ├── convfinqa_direct_gpt3_04_10_07_53.jsonl │ │ │ ├── convfinqa_test_private_gpt-4_04_18_10_22.jsonl │ │ │ ├── convfinqa_test_private_gpt-4_04_18_10_38.jsonl │ │ │ ├── finqa_gpt-35-turbo_04_10_14_54.jsonl │ │ │ └── score-finQA-reformat.ipynb │ │ └── tool.py │ ├── LangChain │ │ ├── CustomAPIMAzureOpenAI.py │ │ ├── LangChainSummarizationExample.ipynb │ │ └── stateoftheunion_20230207.txt │ └── LlamaIndex │ │ ├── 10k_Analysis.ipynb │ │ ├── UBER │ │ ├── UBER_2019.html │ │ ├── UBER_2020.html │ │ ├── UBER_2021.html │ │ └── UBER_2022.html │ │ └── sample_10k_chain.pdf ├── Readme.md ├── Speech │ ├── Conversation_SSML OpenAI.ipynb │ ├── Conversation_SSML.ipynb │ └── Conversation_SSML.xml ├── aml_examples │ └── 1a_read_example-copy │ │ ├── README.md │ │ ├── adls_src │ │ └── read_folder.py │ │ ├── job_example-py-adls.ipynb │ │ ├── pipeline_with_components_from_yaml.ipynb │ │ └── read_adls.yml ├── aml_foundationmodels │ ├── deploy_flask_falcon.ipynb │ ├── dockerfile │ │ ├── Dockerfile │ │ ├── requirements.txt │ │ └── score.py │ └── src │ │ └── server.py ├── config.cfg ├── demo │ ├── demo.ipynb │ ├── earnings_example.ipynb │ └── stock_prompt.csv ├── metrics │ ├── aoai_metrics.ipynb │ └── totalcalls.json ├── promptflow │ ├── ag-convfinqa-pf │ │ ├── .promptflow │ │ │ ├── flow.detail.json │ │ │ ├── flow.layout.json │ │ │ ├── flow.log │ │ │ ├── flow.output.json │ │ │ ├── flow.tools.json │ │ │ └── flow.uihint.json │ │ ├── autogen_step.py │ │ ├── chat.jinja2 │ │ ├── convfinqa_dev.jsonl │ │ ├── convfinqa_dev_sample.jsonl │ │ ├── flow.dag.yaml │ │ ├── flow.meta.yaml │ │ └── requirements.txt │ ├── autogen-flow │ │ ├── OAI_CONFIG_LIST │ │ ├── ag_test.py │ │ ├── chat.jinja2 │ │ ├── flow.dag.yaml │ │ └── requirements.txt │ ├── csv_example │ │ ├── .amlignore │ │ ├── Mock_Count_index_Data_20230928G.csv │ │ ├── analyze_df.py │ │ ├── chat.jinja2 │ │ ├── concat_result.jinja2 │ │ ├── flow.dag.yaml │ │ ├── generate_insights.jinja2 │ │ ├── output_format.jinja2 │ │ ├── output_result.jinja2 │ │ ├── readme.md │ │ └── requirements.txt │ ├── databricks_example │ │ ├── chat_csv_model │ │ │ └── 
promptflow │ │ │ │ ├── Mock_Count_index_Data_20230928G.csv │ │ │ │ ├── analyze_df.py │ │ │ │ ├── chat.jinja2 │ │ │ │ ├── concat_result.jinja2 │ │ │ │ ├── flow.dag.yaml │ │ │ │ ├── generate_insights.jinja2 │ │ │ │ ├── output_format.jinja2 │ │ │ │ ├── output_result.jinja2 │ │ │ │ └── requirements.txt │ │ ├── deploy_pf │ │ │ ├── 1_pf_register_model.ipynb │ │ │ └── 2_pf_test_model.ipynb │ │ └── readme.md │ ├── dmv_copilot_flow │ │ ├── .promptflow │ │ │ ├── RetrieveDocuments.inputs.jsonl │ │ │ ├── RetrieveDocuments.node.log │ │ │ ├── chat.detail.json │ │ │ ├── flow.detail.json │ │ │ ├── flow.layout.json │ │ │ ├── flow.log │ │ │ └── flow.tools.json │ │ ├── DetermineIntent.jinja2 │ │ ├── DetermineReply.jinja2 │ │ ├── ExtractIntent.py │ │ ├── FormatConversation.py │ │ ├── FormatReply.py │ │ ├── FormatRetrievedDocuments.py │ │ ├── RetrieveDocuments.py │ │ ├── concat_reply.py │ │ ├── dmv_sample_qs.csv │ │ ├── eval │ │ │ └── rag_eval.ipynb │ │ ├── flow.dag.yaml │ │ ├── output_prompt.jinja2 │ │ └── requirements.txt │ ├── finance_assistant_pf │ │ ├── .promptflow │ │ │ ├── flow.detail.json │ │ │ ├── flow.layout.json │ │ │ ├── flow.log │ │ │ ├── flow.output.json │ │ │ └── flow.tools.json │ │ ├── ag_test.py │ │ ├── chat.jinja2 │ │ ├── data │ │ │ └── portfolio.csv │ │ └── flow.dag.yaml │ ├── model_as_judge_evaluator │ │ ├── .promptflow │ │ │ ├── flow.detail.json │ │ │ ├── flow.layout.json │ │ │ ├── flow.log │ │ │ ├── flow.metrics.json │ │ │ ├── flow.output.json │ │ │ ├── flow.tools.json │ │ │ ├── flow.uihint.json │ │ │ ├── lkg_sources │ │ │ │ ├── README.md │ │ │ │ ├── ada_cosine_similarity_score.py │ │ │ │ ├── aggregate_variants_results.py │ │ │ │ ├── concat_scores.py │ │ │ │ ├── f1_score.py │ │ │ │ ├── flow.meta.yaml │ │ │ │ ├── gpt_coherence_prompt.jinja2 │ │ │ │ ├── gpt_fluency_prompt.jinja2 │ │ │ │ ├── gpt_groundedness_prompt.jinja2 │ │ │ │ ├── gpt_relevance_prompt.jinja2 │ │ │ │ ├── gpt_similarity_prompt.jinja2 │ │ │ │ ├── requirements.txt │ │ │ │ ├── samples.json │ │ │ │ ├── select_metrics.py │ │ │ │ └── validate_input.py │ │ │ └── ux.inputs.json │ │ ├── .runs │ │ │ ├── 9ccd06e6-71ee-4b65-a04e-d9a6b525c0c7 │ │ │ │ └── flow.dag.yaml │ │ │ └── f4279cf9-cbaf-4632-ab29-7efa6806192e │ │ │ │ └── flow.dag.yaml │ │ ├── README.md │ │ ├── ada_cosine_similarity_score.py │ │ ├── aggregate_variants_results.py │ │ ├── concat_scores.py │ │ ├── f1_score.py │ │ ├── flow.dag.yaml │ │ ├── flow.meta.yaml │ │ ├── gpt_coherence_prompt.jinja2 │ │ ├── gpt_fluency_prompt.jinja2 │ │ ├── gpt_groundedness_prompt.jinja2 │ │ ├── gpt_relevance_prompt.jinja2 │ │ ├── gpt_similarity_prompt.jinja2 │ │ ├── requirements.txt │ │ ├── samples.json │ │ ├── select_metrics.py │ │ └── validate_input.py │ ├── new_chat_flow │ │ └── .promptflow │ │ │ ├── chat.detail.json │ │ │ ├── chat.output.json │ │ │ ├── flow.detail.json │ │ │ ├── flow.layout.json │ │ │ ├── flow.log │ │ │ └── flow.output.json │ └── prompt_test │ │ └── .promptflow │ │ ├── chat.detail.json │ │ ├── chat.output.json │ │ ├── flow.detail.json │ │ ├── flow.layout.json │ │ ├── flow.log │ │ ├── flow.output.json │ │ └── flow.uihint.json └── requirements.txt ├── README.md └── Validation └── OutputQuality.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.cfg 3 | .ipynb_checkpoints 4 | *.pyc 5 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.terminal.activateEnvironment": true 
3 | } -------------------------------------------------------------------------------- /Examples/FormRecognizer/FormRecognizerExamples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "30829697-ea9d-4e11-a530-2776b6c0e752", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "import io\n", 12 | "from configparser import ConfigParser" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "id": "e98c08a4-337c-4c84-b030-a94ce50e60a2", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "form_json = ''\n", 23 | "# Form Recognizer results generated and saved to json \n", 24 | "with open('./Life_application_example.png.json', 'r') as form_file:\n", 25 | " form_json = json.loads(form_file.read())" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "id": "71f0fdfc-4858-411c-adac-22719dab96d2", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "form_kvp = []\n", 36 | "for kvp in form_json['analyzeResult']['keyValuePairs']:\n", 37 | " if(kvp.get('key') is not None and kvp.get('value') is not None ):\n", 38 | " form_kvp.append((kvp['key']['content'],kvp['value']['content']))\n", 39 | " #print(\"{} - {}\".format(kvp['key']['content'],kvp['value']['content']))\n", 40 | "\n", 41 | " " 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "c628663f-1ee2-41d0-86f7-8fcf0a22b80a", 47 | "metadata": {}, 48 | "source": [ 49 | "# Validate Form Recognizer Outputs with OpenAI" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 6, 55 | "id": "7fef45bb-045a-473a-b7f8-5bca7b870a2c", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import os\n", 60 | "import openai\n", 61 | "from configparser import ConfigParser\n", 62 | "\n", 63 | "parser=ConfigParser()\n", 64 | "_=parser.read('../config.cfg')\n", 65 | "openai.api_type = \"azure\"\n", 66 | "openai.api_base = parser.get('openai_api','api_ep')\n", 67 | "openai.api_version = \"2022-06-01-preview\"\n", 68 | "openai.api_key = parser.get('openai_api','api_key')\n", 69 | "model = parser.get('openai_api','api_model')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 7, 75 | "id": "cdcb3062-9e23-40db-9a31-7a818143731d", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Name (First, MI, Last) - Mark B. 
Cuban: True\n", 83 | "S.SN / T - 444-82-6666: True\n", 84 | "Address - 91 Richmond St.: True\n", 85 | "City - Dallas: False\n", 86 | "Zip Code - 75201: True\n", 87 | "Former Name - Michael Jackson: False\n", 88 | "M - :selected:: False\n", 89 | "OF - :unselected:: False\n", 90 | "Date of Birth (mm/dd/yyyy) - 1/1/70: True\n", 91 | "State of Birth - TX: True\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "for e in form_kvp[:10]:\n", 97 | " #print(e)\n", 98 | " prompt = \"Validate following text is type of {} with True or False:\\n'''{}'''\\n\".format(e[0], e[1])\n", 99 | " response = openai.Completion.create( engine=model, prompt=prompt, temperature=.5, max_tokens=400, top_p=0.5, frequency_penalty=0, presence_penalty=0, stop=None)\n", 100 | " #print('Response:')\n", 101 | " print(\"{} - {}: {}\".format(e[0],e[1],response['choices'][0]['text'].strip()))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "f7f1fb1e-1f56-429c-8116-be4844e097ff", 107 | "metadata": {}, 108 | "source": [ 109 | "# Correct Form Recognizer Results with OpenAI" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 8, 115 | "id": "dc4a6d7a-84cc-49c1-881c-d14f28e9d772", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import os\n", 120 | "import openai\n", 121 | "from configparser import ConfigParser\n", 122 | "\n", 123 | "parser=ConfigParser()\n", 124 | "_=parser.read('../config.cfg')\n", 125 | "openai.api_type = \"azure\"\n", 126 | "openai.api_base = parser.get('openai_api','api_ep')\n", 127 | "openai.api_version = \"2022-06-01-preview\"\n", 128 | "openai.api_key = parser.get('openai_api','api_key')\n", 129 | "model = parser.get('openai_api','api_model')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "id": "8d45ce48-64df-45ae-8d85-fdb2bfd0c326", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "Name (First, MI, Last) - Mark B. 
Cuban: Mark Cuban\n", 143 | "S.SN / T - 444-82-6666: S.SN: 444-82-6666\n", 144 | "Address - 91 Richmond St.: 91 Richmond Street\n", 145 | "City - Dallas: Dallas\n", 146 | "Zip Code - 75201: 75201\n", 147 | "Former Name - Michael Jackson: Michael Jackson\n", 148 | "M - :selected:: :selected:\n", 149 | "OF - :unselected:: :unselected:\n", 150 | "Date of Birth (mm/dd/yyyy) - 1/1/70: 01/01/1970\n", 151 | "State of Birth - TX: Texas\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# Corrects the first 10 form key value pairs\n", 157 | "for e in form_kvp[:10]:\n", 158 | " #print(e)\n", 159 | " prompt = \"Reformat following text to type of {}:\\n'''{}'''\\n\".format(e[0], e[1])\n", 160 | " response = openai.Completion.create( engine=model, prompt=prompt, temperature=.5, max_tokens=400, top_p=0.5, frequency_penalty=0, presence_penalty=0, stop=None)\n", 161 | " #print('Response:')\n", 162 | " print(\"{} - {}: {}\".format(e[0],e[1],response['choices'][0]['text'].strip()))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "d9eded5e-6701-449f-b855-58a588c9dfaa", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "py38", 177 | "language": "python", 178 | "name": "py38" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.8.10" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 5 195 | } 196 | -------------------------------------------------------------------------------- /Examples/FormRecognizer/Life_application_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/FormRecognizer/Life_application_example.png -------------------------------------------------------------------------------- /Examples/FormRecognizer/income_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/FormRecognizer/income_table.png -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking of GPT-X models (base model w/few shot) using PoT output on Finance QnA Datasets 2 | 3 | ## Results: 4 | | | dv3 | turbo35 | gpt4 | 5 | |-------|-------|---------|-------| 6 | | ConvFinQA |62.5 | 69.3 | 80.0 | 7 | | FinQA | - |61.2 | - | 8 | | AQuA | - |- | 72.8 | 9 | | tatqa | - |- | - | 10 | 11 | ## Program of Thought Example: 12 | 13 | ## Prompt 14 | 15 | Read the following text and table, and then write code to answer a question: 16 | five-year performance comparison 2013 the following graph provides an indicator of cumulative total shareholder returns for the corporation as compared to the peer group index ( described above ) , the dow jones , and the s&p 500 . the graph assumes that the value of the investment in the common stock of union pacific corporation and each index was $ 100 on december 31 , 2005 and that all dividends were reinvested . 
purchases of equity securities 2013 during 2010 , we repurchased 17556522 shares of our common stock at an average price of $ 75.51 . the following table presents common stock repurchases during each month for the fourth quarter of 2010 : period total number of shares purchased [a] average price paid per share total number of shares purchased as part of a publicly announced plan or program [b] maximum number of shares that may yet be purchased under the plan or program [b] . [a] total number of shares purchased during the quarter includes approximately 563220 shares delivered or attested to upc by employees to pay stock option exercise prices , satisfy excess tax withholding obligations for stock option exercises or vesting of retention units , and pay withholding obligations for vesting of retention shares . [b] on may 1 , 2008 , our board of directors authorized us to repurchase up to 40 million shares of our common stock through march 31 , 2011 . we may make these repurchases on the open market or through other transactions . our management has sole discretion with respect to determining the timing and amount of these transactions . on february 3 , 2011 , our board of directors authorized us to repurchase up to 40 million additional shares of our common stock under a new program effective from april 1 , 2011 through march 31 , 2014. . 17 | period | total number ofsharespurchased [a] | averageprice paidper share | total number of sharespurchased as part of apublicly announced planor program [b] | maximum number ofshares that may yetbe purchased under the planor program [b] 18 | oct . 1 through oct . 31 | 725450 | 84.65 | 519554 | 17917736 19 | nov . 1 through nov . 30 | 1205260 | 89.92 | 1106042 | 16811694 20 | dec . 1 through dec . 31 | 1133106 | 92.59 | 875000 | 15936694 21 | total | 3063816 | $ 89.66 | 2500596 | n/a 22 | Question: how much more was spent on shares in nov 2010 than dec 2010? 
23 | 24 | 25 | ## Completion 26 | 27 | #Python 28 | shares_purchased_nov_2010 = 1205260 29 | shares_purchased_dec_2010 = 1133106 30 | difference_in_shares_purchased = shares_purchased_nov_2010 - shares_purchased_dec_2010 31 | average_price_paid_per_share = 89.66 32 | amount_spent_on_shares = difference_in_shares_purchased * average_price_paid_per_share 33 | ans = amount_spent_on_shares 34 | 35 | 36 | 37 | 38 | Credit to @wenhuchen - https://github.com/wenhuchen/Program-of-Thoughts -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/__init__.py -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import argparse 3 | import json 4 | from .tatqa_metric import * 5 | from typing import Any, Dict, Tuple 6 | 7 | 8 | def evaluate_json(golden_answers: Dict[str, Any], predicted_answers: Dict[str, Any]) -> Tuple[float, float]: 9 | 10 | em_and_f1 = TaTQAEmAndF1() 11 | for qas in golden_answers: 12 | for qa in qas["questions"]: 13 | query_id = qa["uid"] 14 | pred_answer, pred_scale = None, None 15 | if query_id in predicted_answers: 16 | pred_answer, pred_scale = predicted_answers[query_id] 17 | em_and_f1(ground_truth=qa, prediction=pred_answer, pred_scale=pred_scale) 18 | 19 | global_em, global_f1, global_scale, global_op = em_and_f1.get_overall_metric() 20 | print("----") 21 | print("Exact-match accuracy {0:.2f}".format(global_em * 100)) 22 | print("F1 score {0:.2f}".format(global_f1 * 100)) 23 | print("Scale score {0:.2f}".format(global_scale * 100)) 24 | print("{0:.2f} & {1:.2f}".format(global_em * 100, global_f1 * 100)) 25 | print("----") 26 | 27 | detail_raw = em_and_f1.get_raw_pivot_table() 28 | print("---- raw detail ---") 29 | print(detail_raw) 30 | detail_em, detail_f1 = em_and_f1.get_detail_metric() 31 | print("---- em detail ---") 32 | print(detail_em) 33 | print("---- f1 detail ---") 34 | print(detail_f1) 35 | 36 | 37 | def evaluate_prediction_file(gold_path: str, 38 | pred_path: str): 39 | 40 | golden_answers = json.load(open(gold_path, encoding='utf-8')) 41 | predicted_answers = json.load(open(pred_path, encoding='utf-8')) 42 | evaluate_json(golden_answers, predicted_answers) 43 | 44 | 45 | if __name__ == "__main__": 46 | # pylint: disable=invalid-name 47 | parser = argparse.ArgumentParser(description='evaluation on TAT-QA dataset') 48 | parser.add_argument("--gold_path", 49 | type=str, 50 | required=True, 51 | default="tatqa_dataset_test_gold.json", 52 | help='The path of the gold file') 53 | parser.add_argument("--pred_path", 54 | type=str, 55 | required=True, 56 | default="sample_predictions.json", 57 | help='The path of the prediction file') 58 | 59 | args = parser.parse_args() 60 | evaluate_prediction_file(args.gold_path, args.pred_path) 61 | -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_metric_test.py: -------------------------------------------------------------------------------- 1 | 2 | from .tatqa_metric import TaTQAEmAndF1 3 | 4 | def test_em_and_f1(): 5 | 6 | 
mode1_test_data = [ 7 | ({'answer_type':'span', 'answer': ['here is, a test'], 'scale':''}, 'here is, a test', '', 1, 1), 8 | ({'answer_type': 'span', 'answer': ['here is, a test'], 'scale': ''}, 'here is, a test', '', 1, 1), 9 | ({'answer_type': 'span', 'answer': ['1234.1'], 'scale': 'million'}, '1234.1', 'thousand', 0, 0), # scale mismatch 10 | ({'answer_type': 'span', 'answer': ['1234.1'], 'scale': 'million'}, '123', 'thousand', 0, 0), # scale mismatch 11 | ({'answer_type': 'span', 'answer': ['12314.1'], 'scale': 'million'}, '12314.1', 'million', 1, 1), 12 | 13 | ({'answer_type': 'multi-span', 'answer': ['singapore', 'china', 'usa'], 'scale': ''}, ['singapore', 'china', 'usa'], '', 1, 1), 14 | ({'answer_type': 'multi-span', 'answer': ['singapore', 'china', 'usa'], 'scale': ''}, ['china', 'singapore', 'usa'], '', 1, 1), 15 | ({'answer_type': 'multi-span', 'answer': ['singapore', 'china', 'usa'], 'scale': ''}, ['china', 'singapore'], '',0, 0.8), 16 | 17 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': 'million'}, 123.2, '', 0, 0), # scale mismatch, f1 = 0 18 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': 'million'}, 123200000, '', 1, 1), # 19 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': 'million'}, 123.2, 'thousand', 0, 0), # scale mismatch 20 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': ''}, 123.2, '', 1, 1), 21 | ({'answer_type': 'arithmetic', 'answer': 123.22, 'scale': ''}, 123.2, '', 0, 0), 22 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': ''}, 123.2010, '', 1, 1), 23 | ({'answer_type': 'count', 'answer': 5, 'scale': ''}, 5, '', 1, 1), 24 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 0.2212, '', 1, 1), 25 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 0.22121, 'percent', 0, 0), 26 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 22.1231, '', 0, 0), 27 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 22.1231, 'percent', 1, 1), 28 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'million'}, '22.12', 'million', 1, 1), 29 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'million'}, '22.12', '', 0, 0), 30 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'million'}, 'test', '', 0, 0), 31 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'million'}, ["1","2"], '', 0, 0),# span is calcuated by word f1 32 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'percent'},"-22.12", '', 0, 0), 33 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'percent'},"22.12%", '', 1, 1), 34 | ({'answer_type': 'span', 'answer': [22.12], 'scale': ''}, "22.12%", '', 0, 0), 35 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'million'}, "$22.12", '', 0, 0), 36 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'million'}, "$22.12", '', 0, 0), 37 | ({'answer_type': 'span', 'answer': ["22.12"], 'scale': 'percent'}, ["-22.12"], '', 0, 0), 38 | ({'answer_type': 'span', 'answer': ['$1.0 million'], 'scale': ''}, ["['$1.0 million']"], '', 1, 1), 39 | 40 | ({'answer_type': 'span', 'answer': [22.12], 'scale': ''}, "$22.12", '', 1, 1), 41 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'percent'}, "22.12%", 'percent', 1, 1), 42 | ({'answer_type': 'count', 'answer': 5, 'scale': ''}, 'abcd 5', '1', 0, 0), 43 | 44 | ({'answer_type': 'multi-span', 'answer': ['$23,234', '$234.12'], 'scale': ''}, ['234.12', '23,234'], '', 45 | 1, 1), 46 | ({'answer_type': 'multi-span', 'answer': ['$35,120', '$24,159'], 'scale': ''}, 
['$24,159', '$35,120'], '', 1, 1), 47 | ({'answer_type': 'arithmetic', 'answer': ['34.12'], 'scale': 'percent'}, ['0.3412'], '', 1, 1), 48 | ({'answer_type': 'span', 'answer': [ 49 | 'wages and salaries, social security costs, pension and other costs and share-based payments, see note 10 of the Financial Statements'], 50 | 'scale': ''}, 51 | ['wages and salaries, social security costs, pension and other costs and share - based payments,'], '', 0, 52 | 0.67), 53 | 54 | ] 55 | metrics = TaTQAEmAndF1() 56 | 57 | for ans, pred, pred_scale, em, f1 in mode1_test_data: 58 | metrics(ans, pred, pred_scale) 59 | pred_em, pred_f1, scale_score, op_score = metrics.get_overall_metric(reset=True) 60 | assert pred_em == em, f'mode2 - pred_em: {pred_em}, em:{em}, pred:{pred}, ans:{ans}' 61 | assert pred_f1 == f1, f'mode2 - pred_f1: {pred_f1}, f1:{f1}, pred:{pred}, ans:{ans}' 62 | 63 | 64 | def test_one(): 65 | mode_test_data = [ 66 | ({'answer_type': 'arithmetic', 'answer': ['34.12%'], 'scale': 'percent'}, ['0.3412'], '', 1, 1), 67 | ({'answer_type': 'arithmetic', 'answer': ['34.12%'], 'scale': ''}, ['0.3412'], '', 1, 1), 68 | ] 69 | metrics = TaTQAEmAndF1() 70 | for ans, pred, pred_scale, em, f1 in mode_test_data: 71 | metrics(ans, pred, pred_scale) 72 | pred_em, pred_f1, scale_score, op_score = metrics.get_overall_metric(reset=True) 73 | assert pred_f1 == f1, f'mode2 - pred_f1: {pred_f1}, f1:{f1}, pred:{pred}, ans:{ans}' 74 | -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | from typing import List 4 | import numpy as np 5 | 6 | def scale_to_num(scale): 7 | scale = scale.lower() 8 | num = 1 9 | if 'hundred' in scale: # hundred 10 | num = 100 11 | elif 'thousand' in scale: # thousand 12 | num = 1000 13 | elif 'million' in scale: # million 14 | num = 1000000 15 | elif 'billion' in scale: # billion 16 | num = 1000000000 17 | elif 'percent' in scale: # percent 18 | num = 0.01 19 | return num 20 | 21 | def extract_one_num_from_str(s): 22 | s = _clean_num(s) 23 | r_num = r"([+-]?\d+(\.\d+)?)|([+-]?\.\d+)" 24 | groups = re.findall(r_num, s) 25 | if len(groups) == 0: 26 | return None 27 | num = groups[-1][0] 28 | if num == '': 29 | return None 30 | if '.' 
in num: 31 | return float(num) 32 | return int(num) 33 | 34 | EXCLUDE_IN_NUM = "'\"\\$€£¥%(),[]" 35 | def _clean_num(text:str): 36 | return "".join([ch for ch in str(text) if ch not in EXCLUDE_IN_NUM]) 37 | 38 | 39 | def is_number(text: str) -> bool: 40 | try: 41 | words = " ".join([_clean_num(w) for w in text.split()]).split() 42 | if len(words) == 0: 43 | """1023 or 1 million""" 44 | return False 45 | num = float(words[0]) 46 | if np.isnan(num): 47 | return False 48 | if len(words) >= 2: 49 | if scale_to_num(words[1]) == 1: 50 | return False 51 | return True 52 | except ValueError: 53 | return False 54 | # except AttributeError: 55 | # return False 56 | 57 | def negative_num_handle(x): 58 | """ 59 | :param x: transform (134) -> -134 60 | :return: 61 | """ 62 | all = re.findall('(\([\d.\s]+\))', x.strip()) 63 | if len(all) > 0: 64 | return -1 65 | return 1 66 | 67 | def percent_num_handle(x): 68 | """ 69 | :param x: transform 12% -> 12/100 70 | :return: 71 | """ 72 | all = re.findall('([\d.\s]+%)', x.strip()) 73 | if len(all) > 0: 74 | return 0.01 75 | return 1 76 | 77 | def word_scale_handle(x): 78 | """ 79 | :param x: 1 million = 1,000,000 80 | :return: 81 | """ 82 | iter = re.finditer('([\d.]+\s?[a-zA-Z]+)', x) 83 | for one in iter: 84 | text = one.group(0).lower() 85 | scale_val = scale_to_num(text) 86 | return scale_val 87 | return 1 88 | 89 | def to_number(text:str) -> float: 90 | num = extract_one_num_from_str(text) 91 | scale_val = word_scale_handle(text) 92 | negative_flag = negative_num_handle(text) 93 | percent_flag = percent_num_handle(text) 94 | if num is not None: 95 | return round(num * scale_val * negative_flag * percent_flag, 4) 96 | return None 97 | 98 | def remove_articles(text: str) -> str: 99 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 100 | return re.sub(regex, ' ', text) 101 | 102 | def white_space_fix(text: str) -> str: 103 | return ' '.join(text.split()) 104 | 105 | EXCLUDE = set(string.punctuation) 106 | def remove_punc(text: str) -> str: 107 | if not is_number(text): 108 | return ''.join(ch for ch in text if ch not in EXCLUDE) 109 | else: 110 | return text 111 | 112 | def lower(text: str) -> str: 113 | return text.lower() 114 | 115 | def tokenize(text: str) -> List[str]: 116 | return re.split(" ", text) 117 | 118 | 119 | def normalize_number(text: str) -> str: 120 | if is_number(text): 121 | return str(to_number(text)) 122 | else: 123 | return text 124 | 125 | def normalize_answer(text: str) -> str: 126 | """Lower text and remove punctuation, articles and extra whitespace.""" 127 | parts = [white_space_fix(remove_articles(normalize_number(remove_punc(lower(token))))) 128 | for token in tokenize(text)] 129 | parts = [part for part in parts if part.strip()] 130 | normalized = ' '.join(parts).strip() 131 | return normalized 132 | 133 | 134 | STRIPPED_CHARACTERS = string.punctuation + ''.join([u"‘", u"’", u"´", u"`", "_"]) 135 | def ws_tokenize(text): 136 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 137 | text = text.strip().lower() 138 | if not text: 139 | return [] 140 | text = white_space_fix(text) 141 | tokens = text.split() 142 | tokens = [token.strip(STRIPPED_CHARACTERS) for token in tokens] 143 | return tokens 144 | 145 | -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_utils_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .tatqa_utils import * 3 | 4 | 5 | def 
test_extract_first_num_from_text(): 6 | text = '2.3 million' 7 | assert extract_one_num_from_str(text) == 2.3 8 | text = '-2.3 million' 9 | assert extract_one_num_from_str(text) == -2.3 10 | text = '205 million' 11 | assert extract_one_num_from_str(text) == 205 12 | text = '-1,210 million' 13 | assert extract_one_num_from_str(text) == -1210 14 | 15 | 16 | def test_to_num(): 17 | text = '2.3 million' 18 | assert to_number(text) == 2300000 19 | text = '-2.3 thousand' 20 | assert to_number(text) == -2300 21 | text = '205 billion' 22 | assert to_number(text) == 205000000000 23 | text = '-1,210 million' 24 | assert to_number(text) == -1210000000 25 | 26 | 27 | 28 | def test_ws_tokenize(): 29 | text = '2.3 million' 30 | assert ws_tokenize(text) == ['2.3', 'million'] 31 | text = '2.3 \nmillion' 32 | assert ws_tokenize(text) == ['2.3', 'million'] 33 | text = '2.3\n\tmillion' 34 | assert ws_tokenize(text) == ['2.3', 'million'] 35 | 36 | def test_normalize_answer(): 37 | assert normalize_answer('-134.12') == '-134.12' 38 | assert normalize_answer('134.12') == '134.12' 39 | assert normalize_answer('(134.12)') == '-134.12' 40 | assert normalize_answer('18.3%') == '0.183' 41 | 42 | 43 | 44 | def test_is_num(): 45 | assert is_number('$124') 46 | -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/outputs/convfinqa_direct_gpt-4_04_12_20_53.jsonl: -------------------------------------------------------------------------------- 1 | {"questions": ["what was the change in the unamortized debt issuance costs associated with the senior notes between 2016 and 2017?", "so what was the percentage change during this time?", "what was the change associated with credit facilities during that time?", "so what was the percentage change?"], "answer": 0.375, "text": "as of december 31 , 2017 , the company had gross state income tax credit carry-forwards of approximately $ 20 million , which expire from 2018 through 2020 . a deferred tax asset of approximately $ 16 million ( net of federal benefit ) has been established related to these state income tax credit carry-forwards , with a valuation allowance of $ 7 million against such deferred tax asset as of december 31 , 2017 . the company had a gross state net operating loss carry-forward of $ 39 million , which expires in 2027 . a deferred tax asset of approximately $ 3 million ( net of federal benefit ) has been established for the net operating loss carry-forward , with a full valuation allowance as of december 31 , 2017 . other state and foreign net operating loss carry-forwards are separately and cumulatively immaterial to the company 2019s deferred tax balances and expire between 2026 and 2036 . debt long-term debt consisted of the following: . credit facility - in november 2017 , the company terminated its second amended and restated credit agreement and entered into a new credit agreement ( the \"credit facility\" ) with third-party lenders . the credit facility includes a revolving credit facility of $ 1250 million , which may be drawn upon during a period of five years from november 22 , 2017 . the revolving credit facility includes a letter of credit subfacility of $ 500 million . the revolving credit facility has a variable interest rate on outstanding borrowings based on the london interbank offered rate ( \"libor\" ) plus a spread based upon the company's credit rating , which may vary between 1.125% ( 1.125 % ) and 1.500% ( 1.500 % ) . 
the revolving credit facility also has a commitment fee rate on the unutilized balance based on the company 2019s leverage ratio . the commitment fee rate as of december 31 , 2017 was 0.25% ( 0.25 % ) and may vary between 0.20% ( 0.20 % ) and 0.30% ( 0.30 % ) . the credit facility contains customary affirmative and negative covenants , as well as a financial covenant based on a maximum total leverage ratio . each of the company's existing and future material wholly owned domestic subsidiaries , except those that are specifically designated as unrestricted subsidiaries , are and will be guarantors under the credit facility . in july 2015 , the company used cash on hand to repay all amounts outstanding under a prior credit facility , including $ 345 million in principal amount of outstanding term loans . as of december 31 , 2017 , $ 15 million in letters of credit were issued but undrawn , and the remaining $ 1235 million of the revolving credit facility was unutilized . the company had unamortized debt issuance costs associated with its credit facilities of $ 11 million and $ 8 million as of december 31 , 2017 and 2016 , respectively . senior notes - in december 2017 , the company issued $ 600 million aggregate principal amount of unregistered 3.483% ( 3.483 % ) senior notes with registration rights due december 2027 , the net proceeds of which were used to repurchase the company's 5.000% ( 5.000 % ) senior notes due in 2021 in connection with the 2017 redemption described below . in november 2015 , the company issued $ 600 million aggregate principal amount of unregistered 5.000% ( 5.000 % ) senior notes due november 2025 , the net proceeds of which were used to repurchase the company's 7.125% ( 7.125 % ) senior notes due in 2021 in connection with the 2015 tender offer and redemption described below . interest on the company's senior notes is payable semi-annually . the terms of the 5.000% ( 5.000 % ) and 3.483% ( 3.483 % ) senior notes limit the company 2019s ability and the ability of certain of its subsidiaries to create liens , enter into sale and leaseback transactions , sell assets , and effect consolidations or mergers . the company had unamortized debt issuance costs associated with the senior notes of $ 15 million and $ 19 million as of december 31 , 2017 and 2016 , respectively. .", "table": "( $ in millions ) | december 31 2017 | december 31 2016\nsenior notes due december 15 2021 5.000% ( 5.000 % ) | 2014 | 600\nsenior notes due november 15 2025 5.000% ( 5.000 % ) | 600 | 600\nsenior notes due december 1 2027 3.483% ( 3.483 % ) | 600 | 2014\nmississippi economic development revenue bonds due may 1 2024 7.81% ( 7.81 % ) | 84 | 84\ngulf opportunity zone industrial development revenue bonds due december 1 2028 4.55% ( 4.55 % ) | 21 | 21\nless unamortized debt issuance costs | -26 ( 26 ) | -27 ( 27 )\ntotal long-term debt | 1279 | 1278\n", "golden_text": "the company had unamortized debt issuance costs associated with its credit facilities of $ 11 million and $ 8 million as of december 31 , 2017 and 2016 , respectively . the company had unamortized debt issuance costs associated with the senior notes of $ 15 million and $ 19 million as of december 31 , 2017 and 2016 , respectively. . 
", "golden_table": "( $ in millions ) | december 31 2017 | december 31 2016\nsenior notes due november 15 2025 5.000% ( 5.000 % ) | 600 | 600\n", "id": "Double_HII/2017/page_104.pdf", "programs": ["ans = 15 - 19", "x0 = 15 - 19; ans = x0 / 19", "ans = 11 - 8", "x0 = 11 - 8; ans = x0 / 8"], "generated": ["#Python\nunamortized_debt_issuance_costs_senior_notes_2017 = 15\nunamortized_debt_issuance_costs_senior_notes_2016 = 19\nunamortized_debt_issuance_costs_credit_facilities_2017 = 11\nunamortized_debt_issuance_costs_credit_facilities_2016 = 8"], "executed": 16.849999999999994} 2 | -------------------------------------------------------------------------------- /Examples/OpenSource/ConvFinQA-benchmark/tool.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Any 2 | from math import isclose 3 | import func_timeout 4 | from sympy.solvers import solve 5 | from sympy import Symbol, Eq 6 | import math 7 | from sympy import simplify 8 | import numpy as np 9 | import cvxpy as cp 10 | import statistics 11 | 12 | 13 | def get_precision(gt_ans: float) -> int: 14 | precision = 5 15 | if '.' in str(gt_ans): 16 | precision = len(str(gt_ans).split('.')[-1]) 17 | return precision 18 | 19 | 20 | def finqa_equal(prediction: Union[bool, float, str], 21 | reference: Union[float, str], 22 | include_percentage: bool = False, 23 | is_close: float = False) -> bool: 24 | if prediction is None: 25 | return False 26 | elif type(prediction) == bool: 27 | # bool questions 28 | if prediction: 29 | return reference == 'yes' 30 | else: 31 | return reference == 'no' 32 | elif type(reference) == str or type(prediction) == str: 33 | # string questions 34 | return prediction == reference 35 | else: 36 | # number questions 37 | if include_percentage: 38 | gt_result = [reference / 100, reference, reference * 100] 39 | else: 40 | gt_result = [reference] 41 | for item in gt_result: 42 | try: 43 | if is_close: 44 | if isclose(item, prediction, rel_tol=0.001): 45 | return True 46 | precision = min(get_precision(prediction), get_precision(item)) 47 | if round(prediction, precision) == round(item, precision): 48 | return True 49 | except Exception: 50 | continue 51 | return False 52 | 53 | 54 | def simplify_ans(ans, convert_to_str: bool = True): 55 | if 'relational' in str(type(ans)): 56 | return str(ans) 57 | elif 'numpy' in str(type(ans)): 58 | if ans.shape == (): 59 | # scalar value 60 | ans = round(float(ans), 2) 61 | else: 62 | # array value 63 | ans = round(float(ans[0]), 2) 64 | if convert_to_str: 65 | return str(ans) 66 | else: 67 | return ans 68 | elif not ans: 69 | return None 70 | else: 71 | if type(ans) in [list, tuple]: 72 | if 'sympy' in str(type(ans[0])): 73 | try: 74 | ans = [round(float(x), 2) for x in ans] 75 | except Exception: 76 | ans = [str(x) for x in ans] 77 | if len(ans) == 1: 78 | ans = ans[0] 79 | else: 80 | if 'sympy' in str(type(ans)): 81 | try: 82 | ans = round(float(ans), 2) 83 | except Exception: 84 | ans = str(ans) 85 | if convert_to_str: 86 | return str(ans) 87 | else: 88 | return ans 89 | 90 | 91 | def floatify_ans(ans): 92 | if ans is None: 93 | return None 94 | elif type(ans) == dict: 95 | ans = list(ans.values())[0] 96 | elif type(ans) == bool: 97 | ans = ans 98 | elif type(ans) in [list, tuple]: 99 | if not ans: 100 | return None 101 | else: 102 | try: 103 | ans = float(ans[0]) 104 | except Exception: 105 | ans = str(ans[0]) 106 | else: 107 | try: 108 | ans = float(ans) 109 | except Exception: 110 | ans = str(ans) 111 | return ans 112 
| 113 | 114 | def parse_api_result(result): 115 | to_return = [] 116 | for idx, g in enumerate(result['choices']): 117 | text = g['text'] 118 | logprob = sum(g['logprobs']['token_logprobs']) 119 | to_return.append((text, logprob)) 120 | to_return = sorted(to_return, key=lambda tup: tup[1], reverse=True) 121 | to_return = [r[0] for r in to_return] 122 | return to_return 123 | 124 | 125 | def solve_it(equation, variable): 126 | solution = solve(equation, variable, dict=True) 127 | if not solution: 128 | if isinstance(variable, list): 129 | solution = {v: None for v in variable} 130 | else: 131 | solution = {variable: None} 132 | return solution 133 | else: 134 | solution = solution[0] 135 | return solution 136 | 137 | 138 | def safe_execute(code_string: str, keys=None): 139 | def execute(x): 140 | try: 141 | exec(x) 142 | locals_ = locals() 143 | if keys is None: 144 | return locals_.get('ans', None) 145 | else: 146 | return [locals_.get(k, None) for k in keys] 147 | except Exception: 148 | return None 149 | try: 150 | ans = func_timeout.func_timeout(5, execute, args=(code_string,)) 151 | except func_timeout.FunctionTimedOut: 152 | ans = None 153 | 154 | return ans 155 | 156 | 157 | def synthesize_program(result: str, prefix: str) -> str: 158 | program = prefix 159 | for i, line in enumerate(result.split('\n')): 160 | if i == 0: 161 | program += line + '\n' 162 | else: 163 | if line.startswith(' '): 164 | program += line + '\n' 165 | else: 166 | break 167 | program += 'ans = solver()' 168 | return program -------------------------------------------------------------------------------- /Examples/OpenSource/LangChain/CustomAPIMAzureOpenAI.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from langchain.llms import OpenAI 4 | from langchain import PromptTemplate, LLMChain 5 | from langchain.llms.openai import * 6 | 7 | class CustomAPIMAzureOpenAI(AzureOpenAI): 8 | """Azure specific OpenAI class that uses deployment name.""" 9 | 10 | deployment_name: str = "" 11 | """Deployment name to use.""" 12 | subscription_key: str = "" 13 | 14 | @property 15 | def _identifying_params(self) -> Mapping[str, Any]: 16 | return { 17 | **{"deployment_name": self.deployment_name, "subscription_key":self.subscription_key}, 18 | **super()._identifying_params, 19 | } 20 | 21 | @property 22 | def _invocation_params(self) -> Dict[str, Any]: 23 | return {**{"engine": self.deployment_name}, **super()._invocation_params} 24 | 25 | def _generate( 26 | self, prompts: List[str], stop: Optional[List[str]] = None 27 | ) -> LLMResult: 28 | """Call out to OpenAI's endpoint with k unique prompts. 29 | Args: 30 | prompts: The prompts to pass into the model. 31 | stop: Optional list of stop words to use when generating. 32 | Returns: 33 | The full LLM output. 34 | Example: 35 | .. code-block:: python 36 | response = openai.generate(["Tell me a joke."]) 37 | """ 38 | # TODO: write a unit test for this 39 | params = self._invocation_params 40 | sub_prompts = self.get_sub_prompts(params, prompts, stop) 41 | choices = [] 42 | token_usage: Dict[str, int] = {} 43 | # Get the token usage from the response. 44 | # Includes prompt, completion, and total tokens used. 
45 | _keys = {"completion_tokens", "prompt_tokens", "total_tokens"} 46 | for _prompts in sub_prompts: 47 | if self.streaming: 48 | if len(_prompts) > 1: 49 | raise ValueError("Cannot stream results with multiple prompts.") 50 | params["stream"] = True 51 | response = _streaming_response_template() 52 | for stream_resp in completion_with_retry( 53 | self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params 54 | ): 55 | self.callback_manager.on_llm_new_token( 56 | stream_resp["choices"][0]["text"], 57 | verbose=self.verbose, 58 | logprobs=stream_resp["choices"][0]["logprobs"], 59 | ) 60 | _update_response(response, stream_resp) 61 | choices.extend(response["choices"]) 62 | else: 63 | response = completion_with_retry(self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params) 64 | choices.extend(response["choices"]) 65 | if not self.streaming: 66 | # Can't update token usage if streaming 67 | update_token_usage(_keys, response, token_usage) 68 | return self.create_llm_result(choices, prompts, token_usage) 69 | 70 | async def _agenerate(self, prompts: List[str], stop: Optional[List[str]] = None 71 | ) -> LLMResult: 72 | """Call out to OpenAI's endpoint async with k unique prompts.""" 73 | params = self._invocation_params 74 | sub_prompts = self.get_sub_prompts(params, prompts, stop) 75 | choices = [] 76 | token_usage: Dict[str, int] = {} 77 | # Get the token usage from the response. 78 | # Includes prompt, completion, and total tokens used. 79 | _keys = {"completion_tokens", "prompt_tokens", "total_tokens"} 80 | for _prompts in sub_prompts: 81 | if self.streaming: 82 | if len(_prompts) > 1: 83 | raise ValueError("Cannot stream results with multiple prompts.") 84 | params["stream"] = True 85 | response = _streaming_response_template() 86 | async for stream_resp in await acompletion_with_retry( 87 | self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params 88 | ): 89 | if self.callback_manager.is_async: 90 | await self.callback_manager.on_llm_new_token( 91 | stream_resp["choices"][0]["text"], 92 | verbose=self.verbose, 93 | logprobs=stream_resp["choices"][0]["logprobs"], 94 | ) 95 | else: 96 | self.callback_manager.on_llm_new_token( 97 | stream_resp["choices"][0]["text"], 98 | verbose=self.verbose, 99 | logprobs=stream_resp["choices"][0]["logprobs"], 100 | ) 101 | _update_response(response, stream_resp) 102 | choices.extend(response["choices"]) 103 | else: 104 | response = await acompletion_with_retry(self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params) 105 | choices.extend(response["choices"]) 106 | if not self.streaming: 107 | # Can't update token usage if streaming 108 | update_token_usage(_keys, response, token_usage) 109 | return self.create_llm_result(choices, prompts, token_usage) 110 | 111 | -------------------------------------------------------------------------------- /Examples/OpenSource/LangChain/LangChainSummarizationExample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "6bcafd95-00c2-4b7f-b1b4-614a23c1f255", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from langchain import OpenAI, PromptTemplate, LLMChain\n", 13 | "from langchain.text_splitter import CharacterTextSplitter\n", 14 | "from langchain.chains.mapreduce import MapReduceChain\n", 15 | "from langchain.prompts 
import PromptTemplate\n", 16 | "from CustomAPIMAzureOpenAI import CustomAPIMAzureOpenAI\n", 17 | "import os, openai\n", 18 | "\n", 19 | "os.environ[\"OPENAI_API_KEY\"] = \"na\"\n", 20 | "os.environ[\"OPENAI_API_TYPE\"] = openai.api_type = \"azure\"\n", 21 | "os.environ[\"OPENAI_API_VERSION\"] = openai.api_version = \"2022-12-01\"\n", 22 | "os.environ[\"OPENAI_API_BASE\"] = openai.api_base = \"https://[APIM_ENDPOINT].azure-api.net/\"\n", 23 | "deployment_name = \"deployment_name\"\n", 24 | "model_name = \"model_name\"\n", 25 | "\n", 26 | "llm = CustomAPIMAzureOpenAI(deployment_name=deployment_name, model_name=model_name, subscription_key = 'SUBSCRIPTION_KEY' )\n", 27 | "\n", 28 | "\n", 29 | "text_splitter = CharacterTextSplitter()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 7, 35 | "id": "32dbb04f-9dab-420a-979e-ca4defb2092a", 36 | "metadata": { 37 | "tags": [] 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stderr", 42 | "output_type": "stream", 43 | "text": [ 44 | "Created a chunk of size 8297, which is longer than the specified 4000\n", 45 | "Created a chunk of size 8410, which is longer than the specified 4000\n", 46 | "Created a chunk of size 8271, which is longer than the specified 4000\n", 47 | "Created a chunk of size 8217, which is longer than the specified 4000\n", 48 | "Created a chunk of size 6170, which is longer than the specified 4000\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "with open('./stateoftheunion_20230207.txt') as f:\n", 54 | " state_of_the_union = f.read()\n", 55 | "texts = text_splitter.split_text(state_of_the_union)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 8, 61 | "id": "644dda7e-650c-4cbb-9d29-609aeccbb46a", 62 | "metadata": { 63 | "tags": [] 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "from langchain.docstore.document import Document\n", 68 | "\n", 69 | "docs = [Document(page_content=t) for t in texts[:3]]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 9, 75 | "id": "977e6220-9398-4aae-a3a2-202172b84aee", 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "from langchain.chains.summarize import load_summarize_chain\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 10, 87 | "id": "b1d82b77-6323-4ebe-88e9-1ca23b653916", 88 | "metadata": { 89 | "tags": [] 90 | }, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "'\\n\\nThis message from the President of the United States celebrates the accomplishments of the past two years, including the creation of 12 million jobs, the passage of 300 bipartisan laws, and the passage of the CHIPS and Science Act and the Bipartisan Infrastructure Law. 
It also outlines plans for the future, such as investing in infrastructure, providing clean water and high-speed internet access, and capping the cost of insulin for seniors on Medicare.'" 96 | ] 97 | }, 98 | "execution_count": 10, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n", 105 | "chain.run(docs)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "c851d245-cc1c-467f-80c0-bb5b53d81f31", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "gradio", 120 | "language": "python", 121 | "name": "gradio" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.10.9" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 5 138 | } 139 | -------------------------------------------------------------------------------- /Examples/OpenSource/LlamaIndex/sample_10k_chain.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/OpenSource/LlamaIndex/sample_10k_chain.pdf -------------------------------------------------------------------------------- /Examples/Readme.md: -------------------------------------------------------------------------------- 1 | # How to use the repository examples 2 | 3 | ## Pre-requisites 4 | - Azure subscription 5 | - https://azure.microsoft.com/en-us/ 6 | - Azure Cognitive Services Instance 7 | - https://azure.microsoft.com/en-us/products/cognitive-services/#overview 8 | - Azure OpenAI Service Instance 9 | - https://azure.microsoft.com/en-us/products/cognitive-services/openai-service/ 10 | 11 | ## Deploy Azure OpenAI model 12 | - https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal 13 |
14 | 1. Go to the Azure OpenAI Studio
15 | 2. Login with the resource you want to use
16 | 3. Select the Go to Deployments button under Manage deployments in your resource to navigate to the Deployments page
17 | 4. Create a new deployment called text-davinci-002 and choose the text-davinci-002 model from the drop-down. (A quick smoke test of the deployment is sketched below.)
18 |
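Once the deployment exists, it is worth confirming it is reachable before running the notebooks. The snippet below is only a sketch: it assumes the legacy `openai` (pre-1.0) SDK used throughout these notebooks, the `config.cfg` keys shown in the next section, and that `api_model` holds the deployment name (e.g. `text-davinci-002`).

```python
import openai
from configparser import ConfigParser

# Read endpoint, key, and deployment name from config.cfg (see the next section);
# adjust the path if running from a subfolder, e.g. '../config.cfg'
parser = ConfigParser()
parser.read('config.cfg')

openai.api_type = "azure"
openai.api_base = parser.get('openai_api', 'api_ep')
openai.api_version = "2022-06-01-preview"
openai.api_key = parser.get('openai_api', 'api_key')

# Send a tiny completion request against the deployment created above
response = openai.Completion.create(
    engine=parser.get('openai_api', 'api_model'),
    prompt="Say hello to confirm the deployment is reachable.",
    max_tokens=20,
)
print(response['choices'][0]['text'].strip())
```

If the call prints a completion, the deployment and config values are working and the example notebooks should run against them.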
19 | 20 | ## Fill in config parameters 21 | - Open the config.cfg file 22 | - Replace the values in the file with the apikeys and model names of deployed services: 23 | - Example config: 24 | ``` 25 | [openai_api] 26 | api_key:33XXXXXXXXXXXXXXXXXXXX2e 27 | api_ep:https://XXXXX.openai.azure.com/ 28 | api_model:model_name 29 | cog_svc_key:33XXXXXXXXXXXXXXXXXXXX2e 30 | cog_svc_ep:https://XXXXX.cognitiveservices.azure.com 31 | 32 | ``` 33 | ## Install requirements 34 | - Install python packages in the [requirements.txt](requirements.txt) file. 35 | ## Navigate to example notebooks 36 | - Open the sample notebooks using Jupyter to run in local or cloud environment. 37 | 38 | 39 | -------------------------------------------------------------------------------- /Examples/Speech/Conversation_SSML.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Hello, how may I help you today? 5 | I really need help with my credit card, it's not working at all 6 | May I please have your first and last name? 7 | sure it's John, J O H N, Doh, D O E 8 | Thank you Mr Doh, can you confirm the last four digits of your account number? 9 | Which number? Is that the card number or the number on my statement, I don't have a statement in front of me. 10 | It should be the last four digits printed on your credit card. 11 | Ok, let me get it, my wallet is in the other room. 12 | I have it now, the number is 4 3 2 1 13 | Thank you again Mr Doh. 14 | It looks like there is suspected fraud on your credit card.  Can you confirm the last purchase you made? 15 | I tried to use it to book an Air Bee En Bee for my daughter. 16 | Can you confirm the charge amount? 17 | I don't know. it was about two thousand dollars for a stay in December in Florida. 18 | Ok I can confirm the amount now, our system detected it as fraud but since you have confirmed it we will mark it as approved.  Please proceed with your booking. 19 | I hope I can get the same house. bookings were hard to find in that area.  I'm going to try now.ok it looks like the booking went through thank you 20 | Is there anything else I can help you with? 21 | Yes, as a matter of fact.  I want to order another card for my daughter to use. 22 | Sure, I can help you with that, can I have her first and last name? 23 | Jane, J A N E, Doh, D O E. 24 | What address can I mail the card to? 25 | You can mail it to the default address on Pine Wood Ave. 26 | Ok you can expect the card in 1 to 2 business days.Is there anything else? 27 | No thank you for your help. -------------------------------------------------------------------------------- /Examples/aml_examples/1a_read_example-copy/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | - python 5 | products: 6 | - azure-machine-learning 7 | description: This sample shows how to run build pipeline with component. 8 | --- 9 | 10 | # Running a Pipeline job with components 11 | This example shows how to use component to build a pipeline: [pipeline_with_components_from_yaml.ipynb](pipeline_with_components_from_yaml.ipynb). 
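The notebooks above walk through this end to end; as a quick orientation, wiring the `read_adls.yml` component into a pipeline with the Azure ML v2 SDK looks roughly like the sketch below. Subscription, workspace, compute, and datastore names are placeholders, not values from this repo.

```python
from azure.ai.ml import MLClient, Input, load_component
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential

# Load the command component defined in read_adls.yml
read_component = load_component(source="./read_adls.yml")

@pipeline(default_compute="cpu-cluster")  # placeholder compute target
def read_adls_pipeline(training_data):
    # The component lists and prints the files found in the input folder
    read_step = read_component(training_data=training_data)
    return {"model_output": read_step.outputs.model_output}

# Placeholder workspace details; replace with your own
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace-name>",
)

pipeline_job = read_adls_pipeline(
    training_data=Input(type="uri_folder", path="azureml://datastores/<datastore-name>/paths/<folder>/")
)
job = ml_client.jobs.create_or_update(pipeline_job)
print(job.studio_url)
```

The component's `training_data` input is a `uri_folder`, so any registered datastore path (for example an ADLS folder) can be passed as the pipeline input.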
-------------------------------------------------------------------------------- /Examples/aml_examples/1a_read_example-copy/adls_src/read_folder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from uuid import uuid4 4 | from datetime import datetime 5 | import os 6 | 7 | parser = argparse.ArgumentParser("train") 8 | parser.add_argument("--training_data", type=str, help="Path to training data") 9 | parser.add_argument("--model_output", type=str, help="Path of output model", default="" ) 10 | 11 | args = parser.parse_args() 12 | 13 | print("hello training world...") 14 | 15 | lines = [ 16 | f"Training data path: {args.training_data}", 17 | ] 18 | 19 | for line in lines: 20 | print(line) 21 | 22 | print("mounted_path files: ") 23 | arr = os.listdir(args.training_data) 24 | print(arr) 25 | 26 | for filename in arr: 27 | print("reading file: %s ..." % filename) 28 | with open(os.path.join(args.training_data, filename), "r") as handle: 29 | print(handle.read()) 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /Examples/aml_examples/1a_read_example-copy/read_adls.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json 2 | type: command 3 | 4 | name: train_model 5 | display_name: Train Model 6 | description: A dummy training component 7 | version: 0.0.1 8 | inputs: 9 | training_data: 10 | type: uri_folder 11 | outputs: 12 | model_output: 13 | type: uri_folder 14 | code: ./adls_src 15 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1 16 | command: >- 17 | python read_folder.py 18 | --training_data ${{inputs.training_data}} 19 | -------------------------------------------------------------------------------- /Examples/aml_foundationmodels/dockerfile/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu117-py38-torch201:biweekly.202309.2 2 | 3 | WORKDIR / 4 | 5 | # support Deepspeed launcher requirement of passwordless ssh login 6 | RUN apt-get update && apt-get -y upgrade 7 | RUN apt-get install -y openssh-server openssh-client 8 | 9 | COPY requirements.txt . 
10 | RUN pip install -r requirements.txt --no-cache-dir 11 | 12 | # List installed packages 13 | RUN pip list 14 | 15 | ## Delete 16 | RUN rm requirements.txt 17 | 18 | # Copy scoring file 19 | COPY score.py /var/mlflow_resources/mlflow_score_script.py 20 | ENV AZUREML_ENTRY_SCRIPT="mlflow_score_script.py" 21 | ENV AML_APP_ROOT="/var/mlflow_resources" 22 | 23 | # Inference requirements 24 | COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/ 25 | RUN /var/requirements/install_system_requirements.sh && \ 26 | cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \ 27 | cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \ 28 | ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \ 29 | rm -f /etc/nginx/sites-enabled/default 30 | ENV SVDIR=/var/runit 31 | ENV WORKER_TIMEOUT=3600 32 | EXPOSE 5001 8883 8888 33 | 34 | CMD [ "runsvdir", "/var/runit" ] -------------------------------------------------------------------------------- /Examples/aml_foundationmodels/dockerfile/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed~=0.10.0 2 | deepspeed-mii~=0.0.6 3 | aiolimiter~=1.1.0 4 | torch~=2.0.1 5 | protobuf~=4.24.0 6 | psutil~=5.9.5 7 | transformers~=4.33.0 8 | sentencepiece~=0.1.99 9 | xformers~=0.0.21 10 | pandas~=2.0.3 11 | azure-ai-contentsafety~=1.0.0b1 12 | azure-identity==1.14.0 13 | azure-mgmt-cognitiveservices==13.5.0 14 | azureml-inference-server-http==0.8.4.2 15 | azureml-core==1.53.0 16 | azureml-mlflow==1.53.0 17 | cryptography~=41.0.3 18 | certifi==2023.07.22 19 | requests~=2.31.0 20 | aiohttp~=3.8.5 21 | einops~=0.6.0 22 | accelerate 23 | langchain 24 | bitsandbytes 25 | flask 26 | 27 | -------------------------------------------------------------------------------- /Examples/aml_foundationmodels/src/server.py: -------------------------------------------------------------------------------- 1 | from langchain import HuggingFacePipeline, PromptTemplate, LLMChain 2 | from transformers import AutoTokenizer, pipeline 3 | import torch 4 | from flask import Flask, render_template, request 5 | import re 6 | 7 | # choose model based on your hardware 8 | model = 'tiiuae/falcon-7b-instruct' 9 | # model = 'tiiuae/falcon-40b-instruct' 10 | 11 | # load a tokenizer from a pretrained model using the Hugging Face AutoTokenizer class 12 | # and the from_pretrained method is used to retrieve the tokenizer associated with the specified model 13 | # to which the tokenizer is responsible for processing text inputs and converting them into numerical 14 | # representations suitable for input to the model 15 | print('loading model') 16 | tokenizer = AutoTokenizer.from_pretrained(model) 17 | 18 | # the pipeline function returns a callable object that can be used to generate text using 19 | # the specified model and parameters 20 | print('loading pipeline') 21 | pipeline = pipeline( 22 | 'text-generation', # the task for the pipeline 23 | model=model, # the pretrained model to use 24 | tokenizer=tokenizer, # the tokenizer for preprocessing inputs 25 | torch_dtype=torch.bfloat16, # the data type for torch tensors 26 | trust_remote_code=True, # flag to trust remote code (e.g., when using remote models) 27 | device_map='auto', # the device to run the pipeline on (GPU or CPU) 28 | max_length=20000, # the maximum length of generated text 29 | do_sample=True, # flag indicating whether to use sampling for text generation 30 | top_k=10, # the number of highest probability tokens to 
consider for sampling 31 | num_return_sequences=1, # the number of sequences to generate 32 | eos_token_id=tokenizer.eos_token_id # the token ID representing the end of a text sequence 33 | ) 34 | 35 | # the HuggingFacePipeline instance llm is created with the specified pipeline and model_kwargs and 36 | # the llm object can then be used to generate text based on the configured pipeline and model parameters 37 | # create an instance of the HuggingFacePipeline class 38 | print('loading llm') 39 | llm = HuggingFacePipeline( 40 | pipeline=pipeline, # the text generation pipeline to use 41 | model_kwargs={'temperature': 0} # temperature is a common parameter used in text generation models to 42 | # control the randomness of the generated output and the higher 43 | # temperature values (e.g., 1.0) lead to more diverse and creative 44 | # output, while lower values (e.g., 0.5) make the output more 45 | # focused and deterministic 46 | ) 47 | 48 | # define the template for the prompt 49 | template = """ 50 | You are an intelligent chatbot. Take careful consideration to context of the question and answer appropriately. 51 | Question: {question} 52 | Answer:""" 53 | 54 | # define a template for the prompt to be used in the LLMChain instance and the prompt template allows for 55 | # customization of the prompt message and dynamic insertion of input variables and 56 | # the template variable stores a multi-line string that serves as the template for the prompt and it provides a general 57 | # message for the chatbot and defines the format for presenting the question and answer and the PromptTemplate class is 58 | # instantiated with two arguments which are template, the template string defined earlier, which serves as the base 59 | # structure for the prompt and input_variables, a list of input variables used in the template and in this case, we 60 | # have only one variable, 'question', which represents the user's input question to which the PromptTemplate object 61 | # prompt is created, which can be used within the LLMChain instance to generate prompts dynamically based on user input 62 | # and by using prompt templates, you can create flexible and customizable prompts that adapt to the user's specific 63 | # input, making the conversation more engaging and interactive 64 | # create a prompt template 65 | prompt = PromptTemplate( 66 | template=template, # the template string for the prompt 67 | input_variables=['question'] # the list of input variables used in the template 68 | ) 69 | 70 | # create an instance of the LLMChain class 71 | llm_chain = LLMChain( 72 | prompt=prompt, # the prompt template for generating prompts 73 | llm=llm # the HuggingFacePipeline instance for text generation 74 | ) 75 | 76 | 77 | def remove_angle_brackets(text): 78 | """ 79 | Removes angle brackets and their contents from the given text. 80 | 81 | Args: 82 | text (str): The input text from which angle brackets and their contents need to be removed. 83 | 84 | Returns: 85 | str: The modified text with angle brackets and their contents removed. 86 | """ 87 | return re.sub(r'<[^>]*>', '', text) 88 | 89 | 90 | # init the Flask app 91 | app = Flask(__name__) 92 | 93 | 94 | @app.route('/', methods=['GET', 'POST']) 95 | def home(): 96 | """ 97 | Renders the home page and handles form submission. 
98 | 99 | If the request method is POST, it retrieves the question from the form, 100 | generates a response using the LLMChain, removes angle brackets from the response, 101 | and renders the updated index.html template with the response. 102 | 103 | If the request method is GET, it renders the index.html template. 104 | 105 | Returns: 106 | str: The rendered HTML template for the home page. 107 | """ 108 | if request.method == 'POST': 109 | question = request.form['question'] 110 | response = llm_chain.run(question) 111 | response = remove_angle_brackets(response) 112 | return response 113 | return "POST form[question]" 114 | 115 | 116 | if __name__ == '__main__': 117 | # check if CUDA is available and being used 118 | if torch.cuda.is_available() and torch.cuda.current_device() != -1: 119 | print('CUDA is being used.') 120 | else: 121 | print('CUDA is not being used.') 122 | print('running server') 123 | # run app 124 | app.run(host='0.0.0.0', port=5000, debug=False) -------------------------------------------------------------------------------- /Examples/config.cfg: -------------------------------------------------------------------------------- 1 | [openai_api] 2 | api_key:{openai_apikey} 3 | api_ep:{openai_endpoint} 4 | api_model:{deployment_name} 5 | cog_svc_key:{cogsvc_apikey} 6 | cog_svc_ep:{cogsvc_endpoint} 7 | -------------------------------------------------------------------------------- /Examples/demo/stock_prompt.csv: -------------------------------------------------------------------------------- 1 | Date,Symbol,Adj Close,Close,High,Low,Open,Volume 2 | 2009-12-31,MMM,,,,,, 3 | 2010-01-04,MMM,59.318885803222656,83.0199966430664,83.44999694824219,82.66999816894531,83.08999633789062,3043700.0 4 | 2010-01-05,MMM,58.94734191894531,82.5,83.2300033569336,81.69999694824219,82.80000305175781,2847000.0 5 | 2010-01-06,MMM,59.783294677734375,83.66999816894531,84.5999984741211,83.51000213623047,83.87999725341797,5268500.0 6 | 2010-01-07,MMM,59.826175689697266,83.7300033569336,83.76000213623047,82.12000274658203,83.31999969482422,4470100.0 7 | 2010-01-08,MMM,60.24774932861328,84.31999969482422,84.31999969482422,83.30000305175781,83.69000244140625,3405800.0 8 | 2010-01-11,MMM,60.004825592041016,83.9800033569336,84.5999984741211,83.41000366210938,84.37999725341797,2927100.0 9 | 2010-01-12,MMM,60.05484390258789,84.05000305175781,84.18000030517578,83.30000305175781,83.58000183105469,3031800.0 10 | 2010-01-13,MMM,59.861942291259766,83.77999877929688,84.11000061035156,83.19999694824219,84.11000061035156,3102000.0 11 | 2010-01-14,MMM,59.661865234375,83.5,83.93000030517578,83.41999816894531,83.73999786376953,2634100.0 12 | 2010-01-15,MMM,59.56898880004883,83.37000274658203,84.08999633789062,82.87000274658203,83.5199966430664,3955000.0 13 | 2010-01-19,MMM,60.819358825683594,85.12000274658203,85.16999816894531,83.5,83.81999969482422,4500400.0 14 | 2010-01-20,MMM,60.5335578918457,84.72000122070312,85.12999725341797,83.58999633789062,84.83000183105469,3671200.0 15 | 2010-01-21,MMM,59.090240478515625,82.69999694824219,84.5999984741211,82.56999969482422,84.5999984741211,4783200.0 16 | 2010-01-22,MMM,58.218544006347656,81.4800033569336,82.83000183105469,81.30000305175781,82.4000015258789,4809000.0 17 | 2010-01-25,MMM,58.4686164855957,81.83000183105469,82.88999938964844,81.4800033569336,82.33000183105469,3386600.0 18 | 2010-01-26,MMM,58.32571792602539,81.62999725341797,82.7300033569336,81.04000091552734,81.45999908447266,3138000.0 19 | 
2010-01-27,MMM,58.80441665649414,82.30000305175781,82.83999633789062,81.01000213623047,81.33000183105469,5066900.0 20 | 2010-01-28,MMM,57.69694519042969,80.75,82.63999938964844,79.11000061035156,82.62000274658203,6820700.0 21 | 2010-01-29,MMM,57.51118087768555,80.48999786376953,81.87999725341797,80.18000030517578,81.3499984741211,4347000.0 22 | 2010-02-01,MMM,57.47545623779297,80.44000244140625,80.88999938964844,79.94000244140625,80.83999633789062,3632700.0 23 | 2010-02-02,MMM,57.46830368041992,80.43000030517578,80.94000244140625,79.69999694824219,80.69999694824219,4690000.0 24 | 2010-02-03,MMM,58.23284912109375,81.5,81.68000030517578,79.83000183105469,79.83000183105469,3401300.0 25 | 2010-02-04,MMM,56.59660720825195,79.20999908447266,81.12999725341797,78.83000183105469,81.12999725341797,5312600.0 26 | 2010-02-05,MMM,56.11787796020508,78.54000091552734,79.5,77.26000213623047,79.16000366210938,5408000.0 27 | 2010-02-08,MMM,55.39621353149414,77.52999877929688,78.4800033569336,77.25,78.4800033569336,4407300.0 28 | 2010-02-09,MMM,56.246490478515625,78.72000122070312,79.41000366210938,77.9000015258789,78.27999877929688,4252500.0 29 | 2010-02-10,MMM,56.175025939941406,78.62000274658203,79.29000091552734,78.0,78.80000305175781,2445200.0 30 | 2010-02-11,MMM,57.35396194458008,80.2699966430664,80.38999938964844,78.83999633789062,79.22000122070312,5524500.0 31 | 2010-02-12,MMM,56.575157165527344,79.18000030517578,79.2300033569336,78.06999969482422,79.13999938964844,5443000.0 32 | 2010-02-16,MMM,57.496891021728516,80.47000122070312,80.63999938964844,79.33999633789062,80.08000183105469,3619100.0 33 | 2010-02-17,MMM,57.63711929321289,80.13999938964844,80.69000244140625,79.73999786376953,80.41999816894531,3095200 -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.layout.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeLayouts": { 3 | "inputs": { 4 | "x": 290, 5 | "y": 42, 6 | "index": -1 7 | }, 8 | "outputs": { 9 | "x": 220, 10 | "y": 310, 11 | "index": -1 12 | }, 13 | "autogen_step": { 14 | "x": 140, 15 | "y": 176, 16 | "index": 0 17 | } 18 | }, 19 | "orientation": "Vertical" 20 | } -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "answer": "Sure! 
Let's continue solving the problem.\n\nSince we have already determined the projected exercise price per share in 2040, we can move on to the next question.\n\nQuestion: If the exercise price per share continues to grow at the same average annual growth rate, what would be the projected exercise price per share in 2050?\n\nTo calculate the projected exercise price per share in 2050, we can use the same formula as before:\n\n\\(\\text{{Projected Price}} = \\text{{Initial Price}} \\times (1 + \\text{{Average Annual Growth Rate}})^{\\text{{Number of Years}}}\\)\n\nIn this case, the initial price is the weighted average exercise price per share in 2007, the average annual growth rate is 0.5569, and the number of years is 43 (from 2007 to 2050).\n\nHere's the code to calculate the projected exercise price per share in 2050:\n\n```python\nnumber_of_years = 43\n\nprojected_price_2050 = weighted_average_2007 * (1 + average_annual_growth_rate) ** number_of_years\n\nprint(\"The projected exercise price per share in 2050 would be $\", projected_price_2050)\n```\n\nPlease execute the code to get the answer." 3 | } -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.tools.json: -------------------------------------------------------------------------------- 1 | { 2 | "package": {}, 3 | "code": { 4 | "autogen_step.py": { 5 | "type": "python", 6 | "inputs": { 7 | "input1": { 8 | "type": [ 9 | "string" 10 | ] 11 | }, 12 | "connection": { 13 | "type": [ 14 | "AzureOpenAIConnection" 15 | ] 16 | }, 17 | "modelname": { 18 | "type": [ 19 | "string" 20 | ] 21 | } 22 | }, 23 | "source": "autogen_step.py", 24 | "function": "my_python_tool" 25 | } 26 | } 27 | } -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.uihint.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": {} 3 | } -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/autogen_step.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | from promptflow.connections import AzureOpenAIConnection 3 | 4 | import autogen 5 | import json, re 6 | from autogen.agentchat.contrib.math_user_proxy_agent import MathUserProxyAgent 7 | #from autogen import Cache 8 | 9 | # create a UserProxyAgent instance named "user_proxy" 10 | def has_boxed(string): 11 | return '\\boxed' in string 12 | 13 | def extract_last_boxed_to_newline(s): 14 | matches = re.findall(r'(\\boxed\{.*?\}.*?)(?=\n|$)', s, re.DOTALL) 15 | return matches[-1] if matches else None 16 | 17 | 18 | # The inputs section will change based on the arguments of the tool function, after you save the code 19 | # Adding type to arguments and return value will help the system show the types properly 20 | # Please update the function name/signature per need 21 | @tool 22 | def my_python_tool(input1: str, connection: AzureOpenAIConnection, modelname: str) -> str: 23 | # config_list = autogen.config_list_from_json( 24 | # "OAI_CONFIG_LIST", 25 | # filter_dict={ 26 | # "model": ["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"], 27 | # }, 28 | # ) 29 | config_list = [ 30 | { 31 | "model": modelname, 32 | "api_key": connection.api_key, 33 | "base_url": connection.api_base, 34 | "api_type": "azure", 35 | "api_version": "2023-07-01-preview" 36 | }, 
37 | ] 38 | # create an AssistantAgent named "assistant" 39 | assistant = autogen.AssistantAgent( 40 | name="assistant", 41 | llm_config={ 42 | "cache_seed": None, # disable 43 | "seed": None, # disable 44 | "config_list": config_list, # a list of OpenAI API configurations 45 | "temperature": 0, # temperature for sampling 46 | }, # configuration for autogen's enhanced inference API which is compatible with OpenAI API 47 | ) 48 | 49 | 50 | # 2. create the MathUserProxyAgent instance named "mathproxyagent" 51 | # By default, the human_input_mode is "NEVER", which means the agent will not ask for human input. 52 | mathproxyagent = MathUserProxyAgent( 53 | name="mathproxyagent", 54 | human_input_mode="NEVER", 55 | is_termination_msg = lambda msg: has_boxed(msg['content']), 56 | code_execution_config={"use_docker": False}, 57 | ) 58 | 59 | #autogen.ChatCompletion.start_logging() 60 | 61 | math_problem = input1 62 | mathproxyagent.initiate_chat(assistant, problem=math_problem+'' , silent=True,) 63 | 64 | last_response = assistant.last_message(agent=mathproxyagent) 65 | last_number = last_response['content'] 66 | #last_number = extract_last_boxed_to_newline(last_response['content']) 67 | return f'{last_number}' 68 | -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/chat.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant. 3 | 4 | {% for item in chat_history %} 5 | user: 6 | {{item.inputs.question}} 7 | assistant: 8 | {{item.outputs.answer}} 9 | {% endfor %} 10 | 11 | user: 12 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/convfinqa_dev_sample.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\n- | 2007 | 2006 | 2005\nweighted average exercise price per share | $ 60.94 | $ 37.84 | $ 25.14\nQuestions: what was the weighted average exercise price per share in 2007? and what was it in 2005? what was, then, the change over the years? what was the weighted average exercise price per share in 2005?\nQuestion: and how much does that change represent in relation to this 2005 weighted average exercise price?","answer":"1.42403"} 2 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nthe company had unamortized debt issuance costs associated with its credit facilities of $ 11 million and $ 8 million as of december 31 , 2017 and 2016 , respectively . the company had unamortized debt issuance costs associated with the senior notes of $ 15 million and $ 19 million as of december 31 , 2017 and 2016 , respectively. .\n( $ in millions ) | december 31 2017 | december 31 2016\nsenior notes due november 15 2025 5.000% ( 5.000 % ) | 600 | 600\nQuestions: what was the change in the unamortized debt issuance costs associated with the senior notes between 2016 and 2017? so what was the percentage change during this time? what was the change associated with credit facilities during that time?\nQuestion: so what was the percentage change?","answer":"0.375"} 3 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\namounts expensed for the savings plans for 2009 , 2008 and 2007 were $ 35.1 , $ 29.6 and $ 31.4 , respectively . 
expense includes a discretionary company contribution of $ 3.8 , $ 4.0 and $ 4.9 offset by participant forfeitures of $ 2.7 , $ 7.8 , $ 6.0 in 2009 , 2008 and 2007 , respectively .\nQuestions: what is the ratio of discretionary company contributions to total expensed amounts for savings plans in 2009?\nQuestion: what is that times 100?","answer":"10.82621"} 4 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nmillions of dollars | dec . 31 2008 | dec . 31 2007\nequipment rents payable | 93 | 103\nQuestions: what was the equipment rents payable in 2008? and in 2007? so what was the difference between the two years? and the value for 2007 again?\nQuestion: so what was the percentage change during this time?","answer":"-0.09709"} 5 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nthe aggregate fair values of our outstanding fuel hedges as of december 31 , 2016 and 2015 were current liabilities of $ 2.7 million and $ 37.8 million , respectively , and have been recorded in other accrued liabilities in our consolidated balance sheets .\nyear | gallons hedged | weighted average contractprice per gallon\n2017 | 12000000 | $ 2.92\n2018 | 3000000 | 2.61\nQuestions: how much did the gallons hedged in 2018 represent in relation to the ones hedged in 2017? and in the previous year of this period, what was the aggregate fair value of the outstanding fuel hedges? what was it in 2015?\nQuestion: how much, then, did the 2016 fair value represent in relation to this 2015 one?","answer":"14.0"} 6 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\n( in thousands ) | net undeveloped acres expiring year ended december 31 , 2016 | net undeveloped acres expiring year ended december 31 , 2017 | net undeveloped acres expiring year ended december 31 , 2018\nu.s . | 68 | 89 | 128\ntotal africa | 189 | 4444 | 890\ntotal | 257 | 4533 | 1018\nQuestions: what was the total african and us net undeveloped acres expiring in 2016?\nQuestion: what percentage of undeveloped acres were in the us in 2018?","answer":"0.12574"} 7 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\ncash flowsmillions | 2014 | 2013 | 2012\ncash provided by operating activities | $ 7385 | $ 6823 | $ 6161\nQuestions: what was the cash provided by operating activities in 2013? and in 2012? so what was the difference in this value between the years? and the value for 2012 again?\nQuestion: so what was the percentage change during this time?","answer":"0.10745"} 8 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\noil gas ngls total ( mmbbls ) ( bcf ) ( mmbbls ) ( mmboe ) .\n- | oil ( mmbbls ) | gas ( bcf ) | ngls ( mmbbls ) | total ( mmboe )\ncanada | 23 | 198 | 4 | 60\ntotal | 66 | 894 | 28 | 243\nQuestions: what is the amount of oil and gas mmboe from canada divided by the total?\nQuestion: what is that times 100?","answer":"24.69136"} 9 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nyears ended december 31, | 2009 | 2008 | 2007\nsegment revenue | $ 6305 | $ 6197 | $ 5918\nQuestions: what was the total of risk and insurance brokerage services segment revenue in 2009? and what was that in 2008? 
what was, then, the change over the year?\nQuestion: and how much does this change represent in relation to the 2008 total, in percentage?","answer":"0.01743"} 10 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\n- | 12\/28\/2013 | 1\/3\/2015 | 1\/2\/2016 | 12\/31\/2016 | 12\/30\/2017 | 12\/29\/2018\ns&p 500 | 100.00 | 110.28 | 109.54 | 129.05 | 157.22 | 150.33\nQuestions: what is the change in price of the s&p 500 from 2015 to 2016? what is 100000 divided by 100?\nQuestion: what is the product of the change by the quotient?","answer":"18770.0"} 11 | -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/flow.dag.yaml: -------------------------------------------------------------------------------- 1 | id: template_chat_flow 2 | name: Template Chat Flow 3 | environment: 4 | python_requirements_txt: requirements.txt 5 | inputs: 6 | chat_history: 7 | type: list 8 | is_chat_input: false 9 | is_chat_history: true 10 | question: 11 | type: string 12 | is_chat_input: true 13 | default: "Read the following text and table, and then answer the last question 14 | in a series of questions:\\n- | 2007 | 2006 | 2005\\nweighted average 15 | exercise price per share | $ 60.94 | $ 37.84 | $ 25.14\\nQuestions: what 16 | was the weighted average exercise price per share in 2007?" 17 | outputs: 18 | answer: 19 | type: string 20 | reference: ${autogen_step.output} 21 | is_chat_output: true 22 | nodes: 23 | - name: autogen_step 24 | type: python 25 | source: 26 | type: code 27 | path: autogen_step.py 28 | inputs: 29 | connection: Default_AzureOpenAI 30 | input1: ${inputs.question} 31 | modelname: gpt-35-turbo 32 | use_variants: false 33 | -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/flow.meta.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json 2 | name: template_chat_flow 3 | display_name: Template Chat Flow 4 | type: chat 5 | path: ./flow.dag.yaml 6 | description: Template Chat Flow 7 | properties: 8 | promptflow.stage: prod 9 | promptflow.section: template 10 | -------------------------------------------------------------------------------- /Examples/promptflow/ag-convfinqa-pf/requirements.txt: -------------------------------------------------------------------------------- 1 | pyautogen[mathchat] 2 | -------------------------------------------------------------------------------- /Examples/promptflow/autogen-flow/OAI_CONFIG_LIST: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "model": "gpt-35-turbo", 5 | "api_key": "*****", 6 | "base_url": "https://*****.openai.azure.com/", 7 | "api_type": "azure", 8 | "api_version": "2023-05-15" 9 | }, 10 | { 11 | "model": "gpt-4", 12 | "api_key": "*****", 13 | "base_url": "https://*****.openai.azure.com/", 14 | "api_type": "azure", 15 | "api_version": "2023-05-15" 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /Examples/promptflow/autogen-flow/ag_test.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import autogen 3 | #from autogen import Cache 4 | 5 | 6 | 7 | # The inputs section will change based on the arguments of the tool function, after you save the code 8 | # Adding type to arguments and 
return value will help the system show the types properly 9 | # Please update the function name/signature per need 10 | @tool 11 | def my_python_tool(input1: str) -> str: 12 | config_list = autogen.config_list_from_json( 13 | "OAI_CONFIG_LIST", 14 | filter_dict={ 15 | "model": ["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"], 16 | }, 17 | ) 18 | # create an AssistantAgent named "assistant" 19 | assistant = autogen.AssistantAgent( 20 | name="assistant", 21 | llm_config={ 22 | "cache_seed": None, # disable 23 | "seed": None, # disable 24 | "config_list": config_list, # a list of OpenAI API configurations 25 | "temperature": 0, # temperature for sampling 26 | }, # configuration for autogen's enhanced inference API which is compatible with OpenAI API 27 | ) 28 | # create a UserProxyAgent instance named "user_proxy" 29 | user_proxy = autogen.UserProxyAgent( 30 | name="user_proxy", 31 | human_input_mode="NEVER", 32 | max_consecutive_auto_reply=10, 33 | is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"), 34 | code_execution_config={ 35 | "work_dir": "coding", 36 | "use_docker": False, # set to True or image name like "python:3" to use docker 37 | }, 38 | ) 39 | # the assistant receives a message from the user_proxy, which contains the task description 40 | user_proxy.initiate_chat( 41 | assistant, 42 | message=f"""Answer the following: {input1}""", 43 | ) 44 | 45 | output = assistant.last_message(agent=user_proxy)['content'] 46 | #output = assistant.chat_messages() 47 | 48 | return f'{output}' -------------------------------------------------------------------------------- /Examples/promptflow/autogen-flow/chat.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant. 3 | 4 | {% for item in chat_history %} 5 | user: 6 | {{item.inputs.question}} 7 | assistant: 8 | {{item.outputs.answer}} 9 | {% endfor %} 10 | 11 | user: 12 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/autogen-flow/flow.dag.yaml: -------------------------------------------------------------------------------- 1 | id: template_chat_flow 2 | name: Template Chat Flow 3 | inputs: 4 | chat_history: 5 | type: list 6 | default: [] 7 | is_chat_input: false 8 | is_chat_history: true 9 | question: 10 | type: string 11 | default: what is the date today? 12 | is_chat_input: true 13 | outputs: 14 | answer: 15 | type: string 16 | reference: ${ag_test.output} 17 | is_chat_output: true 18 | nodes: 19 | - name: ag_test 20 | type: python 21 | source: 22 | type: code 23 | path: ag_test.py 24 | inputs: 25 | input1: ${inputs.question} 26 | use_variants: false 27 | node_variants: {} 28 | environment: 29 | python_requirements_txt: requirements.txt 30 | -------------------------------------------------------------------------------- /Examples/promptflow/autogen-flow/requirements.txt: -------------------------------------------------------------------------------- 1 | pyautogen -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/.amlignore: -------------------------------------------------------------------------------- 1 | ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
2 | ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots 3 | 4 | .ipynb_aml_checkpoints/ 5 | *.amltmp 6 | *.amltemp -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/analyze_df.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import pandas as pd 3 | from promptflow import tool 4 | import subprocess 5 | import re 6 | 7 | def score(csvfile:str, program: str) -> str: 8 | program = "import pandas as pd\ndf = pd.read_csv('{}')".format(csvfile)+program 9 | program += '\nprint(ans)' 10 | program = program.replace('\\n', '\n') 11 | print(program) 12 | result = subprocess.run(['python', '-c', program], capture_output=True) 13 | print(result) 14 | ans =str(result.stdout) 15 | return ans 16 | # The inputs section will change based on the arguments of the tool function, after you save the code 17 | # Adding type to arguments and return value will help the system show the types properly 18 | # Please update the function name/signature per need 19 | @tool 20 | def my_python_tool(csvfile:str, input_program: str) -> str: 21 | result = score(csvfile, input_program) 22 | #return_result = re.sub(r'[^\d.]+', '', result) 23 | return result 24 | -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/chat.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | That helps in generating python code to assist in analyzing tabular data. 3 | Please provide data in pandas code, only provide valid pandas code, do not provide explanations. 4 | assume the dataframe is loaded to variable df. Assign answer to variable name "ans". 5 | 6 | dataframe columns: 7 | {{df_columns}} 8 | 9 | user: 10 | 11 | give me month over month sales differences by geo 12 | 13 | assistant: 14 | # Assuming your dataframe is named 'df' 15 | sales_diff_mom = df.groupby('geo')['sales_mom_diff'].sum() 16 | ans = sales_diff_mom 17 | 18 | user: 19 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/concat_result.jinja2: -------------------------------------------------------------------------------- 1 | Program: 2 | ``` 3 | {{gen_program}} 4 | ``` 5 | 6 | Result: 7 | ``` 8 | {{format_out}} 9 | ``` 10 | 11 | Insights: 12 | ``` 13 | {{insights_out}} 14 | ``` -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/generate_insights.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant. That can generate insights from tabular data provided by the user. If there is no output please respond, "There is not enough information" 3 | 4 | user: 5 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/output_format.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant, that reformats text as markdown. Remove comments starting with # character from the final output. 
3 | 4 | user: 5 | b'geo US 728.761693 GA 595.842102 AL 119.118714 TX 111.349607 Name: sales_mom_diff, dtype: float64' 6 | 7 | assistant: 8 | geo US | 728.761693 9 | GA | 595.842102 10 | AL | 119.118714 11 | TX | 111.349607 12 | 13 | Name: sales_mom_diff, dtype: float64 14 | 15 | user: 16 | {{exec_output}} -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/output_result.jinja2: -------------------------------------------------------------------------------- 1 | ``` 2 | {{format_out}} 3 | ``` 4 | {{insights}} -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/readme.md: -------------------------------------------------------------------------------- 1 | # Promptflow instructions 2 | 3 | ### Begin by installing promptflow extension in VSCode 4 | image 5 | 6 | ### Follow instructions to install dependencies 7 | image 8 | 9 | Ensure dependencies are installed: 10 | image 11 | 12 | ### Open example flow in the "Flows" window from PromptFlow extension 13 | image 14 | 15 | ### Add connection "aoai-connection" 16 | image 17 | 18 | ### Assign Deployment name for all LLM steps 19 | image 20 | 21 | ### Run flow in interactive mode to see output. 22 | Try the sample question: "give me the sales index by period for CA" 23 | 24 | image 25 | -------------------------------------------------------------------------------- /Examples/promptflow/csv_example/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/promptflow/csv_example/requirements.txt -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/analyze_df.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import pandas as pd 3 | from promptflow import tool 4 | import subprocess 5 | import re 6 | 7 | def score(csvfile:str, program: str) -> str: 8 | program = "import pandas as pd\ndf = pd.read_csv('{}')\n".format(csvfile)+program 9 | program += '\nprint(ans)' 10 | program = program.replace('\\n', '\n') 11 | print(program) 12 | result = subprocess.run(['python', '-c', program], capture_output=True) 13 | print(result) 14 | ans =str(result.stdout) 15 | return ans 16 | # The inputs section will change based on the arguments of the tool function, after you save the code 17 | # Adding type to arguments and return value will help the system show the types properly 18 | # Please update the function name/signature per need 19 | @tool 20 | def my_python_tool(csvfile:str, input_program: str) -> str: 21 | result = score(csvfile, input_program) 22 | #return_result = re.sub(r'[^\d.]+', '', result) 23 | return result 24 | -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/chat.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | That helps in generating python code to assist in analyzing tabular data. 3 | Please provide data in pandas code, only provide valid pandas code, do not provide explanations. 4 | assume the dataframe is loaded to variable df. Assign answer to variable name "ans". 
5 | 6 | dataframe columns: 7 | {{df_columns}} 8 | 9 | file: 10 | {{file_name}} 11 | 12 | user: 13 | 14 | give me month over month sales differences by geo 15 | 16 | assistant: 17 | # Assuming your dataframe is named 'df' 18 | sales_diff_mom = df.groupby('geo')['sales_mom_diff'].sum() 19 | ans = sales_diff_mom 20 | 21 | user: 22 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/concat_result.jinja2: -------------------------------------------------------------------------------- 1 | Program: 2 | ``` 3 | {{gen_program}} 4 | ``` 5 | 6 | Result: 7 | ``` 8 | {{format_out}} 9 | ``` 10 | 11 | Insights: 12 | ``` 13 | {{insights_out}} 14 | ``` -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/generate_insights.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant. That can generate insights from tabular data provided by the user. If there is no output please respond, "There is not enough information" 3 | 4 | user: 5 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/output_format.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant, that reformats text as markdown. Remove comments starting with # character from the final output. 3 | 4 | user: 5 | b'geo US 728.761693 GA 595.842102 AL 119.118714 TX 111.349607 Name: sales_mom_diff, dtype: float64' 6 | 7 | assistant: 8 | geo US | 728.761693 9 | GA | 595.842102 10 | AL | 119.118714 11 | TX | 111.349607 12 | 13 | Name: sales_mom_diff, dtype: float64 14 | 15 | user: 16 | {{exec_output}} -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/output_result.jinja2: -------------------------------------------------------------------------------- 1 | ``` 2 | {{format_out}} 3 | ``` 4 | {{insights}} -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/chat_csv_model/promptflow/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/promptflow/databricks_example/chat_csv_model/promptflow/requirements.txt -------------------------------------------------------------------------------- /Examples/promptflow/databricks_example/readme.md: -------------------------------------------------------------------------------- 1 | #### Note: Make sure you are running on Databricks ML compute, to avoid dependency issues with MLflow. 2 | 3 | Step 1: Upload the "chat_csv_model" folder to your databricks workspace, along with the deploy_pf notebooks. 4 | 5 | image 6 |
7 | Step 2: Open the pf_register_model.ipynb notebook and run through it to test the promptflow model and register it to your Databricks model registry (a rough sketch of the registration code follows the screenshots below). 8 | Be sure to replace all connection string info with your AOAI config. 9 | image 10 | 11 | After executing all of the steps, you should see the model registered: 12 |
13 | image 14 |
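For orientation only, the registration performed in the notebook looks roughly like the sketch below. This is not the exact code from the notebook; the wrapper class, model name, and artifact paths are illustrative assumptions.

```python
# Rough, illustrative outline of registering the flow as an MLflow pyfunc model on Databricks.
# The actual, tested steps live in the deploy_pf notebooks.
import mlflow
import mlflow.pyfunc


class ChatCsvFlowWrapper(mlflow.pyfunc.PythonModel):
    """Hypothetical wrapper that loads the promptflow folder and answers questions."""

    def load_context(self, context):
        # Assumes the promptflow package from requirements.txt is installed on the cluster.
        from promptflow import load_flow
        self.flow = load_flow(context.artifacts["flow_dir"])

    def predict(self, context, model_input):
        # Assumes model_input is a pandas DataFrame with a "question" column.
        return [self.flow(question=q) for q in model_input["question"]]


with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        artifact_path="chat_csv_model",
        python_model=ChatCsvFlowWrapper(),
        artifacts={"flow_dir": "chat_csv_model/promptflow"},  # folder uploaded in Step 1
        pip_requirements="chat_csv_model/promptflow/requirements.txt",
    )

# Register the logged model so it appears in the Databricks model registry.
mlflow.register_model(model_info.model_uri, "chat_csv_model")
```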
15 | Step 3: Open the pf_test_model.ipynb to load the model from the registry and ensure you can execute it successfully. 16 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/.promptflow/RetrieveDocuments.inputs.jsonl: -------------------------------------------------------------------------------- 1 | {"searchConnection":"AzureAISearch","embeddingModelConnection":"Default_AzureOpenAI","vectorFields":"contentVector","embeddingModelName":"text-embedding-ada-002","indexName":"dmv-index-full","ExtractIntent.output.search_intents":{"current_message_intent":"how old do I need to be to drive?","search_intents":"[\"how old do I need to be to drive?\"]"},"queryType":"vectorSimpleHybrid","semanticConfiguration":"None","topK":3} -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/.promptflow/RetrieveDocuments.node.log: -------------------------------------------------------------------------------- 1 | 2024-04-18 07:53:04 -0700 22736 execution.flow INFO Executing node RetrieveDocuments. node run id: b568b6e8-108a-4743-98e6-7ac371fb25d0_RetrieveDocuments_3ea5220e-f7d3-412a-b883-cd9b303ea478 2 | 2024-04-18 07:53:11 -0700 22736 execution.flow INFO Node RetrieveDocuments completes. 3 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/.promptflow/flow.layout.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeLayouts": { 3 | "inputs": { 4 | "x": 552.5, 5 | "y": 42, 6 | "index": -1 7 | }, 8 | "outputs": { 9 | "x": 395, 10 | "y": 1114, 11 | "index": -1 12 | }, 13 | "DetermineIntent": { 14 | "x": 480, 15 | "y": 176, 16 | "index": 0 17 | }, 18 | "ExtractIntent": { 19 | "x": 645, 20 | "y": 310, 21 | "index": 1 22 | }, 23 | "RetrieveDocuments": { 24 | "x": 820, 25 | "y": 444, 26 | "index": 2 27 | }, 28 | "FormatRetrievedDocuments": { 29 | "x": 470, 30 | "y": 578, 31 | "index": 3 32 | }, 33 | "FormatConversation": { 34 | "x": 140, 35 | "y": 578, 36 | "index": 4 37 | }, 38 | "DetermineReply": { 39 | "x": 470, 40 | "y": 712, 41 | "index": 5 42 | }, 43 | "FormatReply": { 44 | "x": 470, 45 | "y": 846, 46 | "index": 6 47 | }, 48 | "output_prompt": { 49 | "x": 917.5, 50 | "y": 980, 51 | "index": 7 52 | } 53 | }, 54 | "orientation": "Vertical" 55 | } -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/.promptflow/flow.log: -------------------------------------------------------------------------------- 1 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Start executing nodes in thread pool mode. 2 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Start to run 8 nodes with concurrency level 16. 3 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Executing node DetermineIntent. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_DetermineIntent_0 4 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Executing node FormatConversation. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_FormatConversation_0 5 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Node FormatConversation completes. 6 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Node DetermineIntent completes. 7 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Executing node ExtractIntent. 
node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_ExtractIntent_0 8 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Node ExtractIntent completes. 9 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Executing node RetrieveDocuments. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_RetrieveDocuments_0 10 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Node RetrieveDocuments completes. 11 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Executing node FormatRetrievedDocuments. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_FormatRetrievedDocuments_0 12 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Node FormatRetrievedDocuments completes. 13 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Executing node DetermineReply. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_DetermineReply_0 14 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Node DetermineReply completes. 15 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Executing node FormatReply. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_FormatReply_0 16 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Node FormatReply completes. 17 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Executing node output_prompt. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_output_prompt_0 18 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Node output_prompt completes. 19 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/.promptflow/flow.tools.json: -------------------------------------------------------------------------------- 1 | { 2 | "package": {}, 3 | "code": { 4 | "DetermineIntent.jinja2": { 5 | "type": "llm", 6 | "inputs": { 7 | "query": { 8 | "type": [ 9 | "string" 10 | ] 11 | }, 12 | "chat_history": { 13 | "type": [ 14 | "string" 15 | ] 16 | } 17 | }, 18 | "source": "DetermineIntent.jinja2" 19 | }, 20 | "ExtractIntent.py": { 21 | "type": "python", 22 | "inputs": { 23 | "input": { 24 | "type": [ 25 | "string" 26 | ] 27 | }, 28 | "query": { 29 | "type": [ 30 | "string" 31 | ] 32 | } 33 | }, 34 | "source": "ExtractIntent.py", 35 | "function": "extract_intent" 36 | }, 37 | "RetrieveDocuments.py": { 38 | "type": "python", 39 | "inputs": { 40 | "queries": { 41 | "type": [ 42 | "string" 43 | ] 44 | }, 45 | "searchConnection": { 46 | "type": [ 47 | "CognitiveSearchConnection" 48 | ] 49 | }, 50 | "indexName": { 51 | "type": [ 52 | "string" 53 | ] 54 | }, 55 | "queryType": { 56 | "type": [ 57 | "string" 58 | ] 59 | }, 60 | "topK": { 61 | "type": [ 62 | "int" 63 | ] 64 | }, 65 | "semanticConfiguration": { 66 | "type": [ 67 | "string" 68 | ] 69 | }, 70 | "vectorFields": { 71 | "type": [ 72 | "string" 73 | ] 74 | }, 75 | "embeddingModelConnection": { 76 | "type": [ 77 | "AzureOpenAIConnection" 78 | ] 79 | }, 80 | "embeddingModelName": { 81 | "type": [ 82 | "string" 83 | ] 84 | } 85 | }, 86 | "source": "RetrieveDocuments.py", 87 | "function": "search" 88 | }, 89 | "FormatRetrievedDocuments.py": { 90 | "type": "python", 91 | "inputs": { 92 | "docs": { 93 | "type": [ 94 | "object" 95 | ] 96 | }, 97 | "maxTokens": { 98 | "type": [ 99 | "int" 100 | ] 101 | } 102 | }, 103 | "source": "FormatRetrievedDocuments.py", 104 | "function": "format_retrieved_documents" 105 | }, 106 | "FormatConversation.py": { 107 | "type": "python", 108 | "inputs": { 109 | "query": { 110 | "type": [ 111 | "string" 112 | ] 113 | }, 114 | "history": { 115 | "type": [ 116 | "list" 117 | ] 118 | }, 119 | "maxTokens": { 120 | "type": [ 121 | "int" 122 | ] 123 | } 124 
| }, 125 | "source": "FormatConversation.py", 126 | "function": "format_conversation" 127 | }, 128 | "DetermineReply.jinja2": { 129 | "type": "llm", 130 | "inputs": { 131 | "conversation": { 132 | "type": [ 133 | "string" 134 | ] 135 | }, 136 | "documentation": { 137 | "type": [ 138 | "string" 139 | ] 140 | }, 141 | "user_query": { 142 | "type": [ 143 | "string" 144 | ] 145 | } 146 | }, 147 | "source": "DetermineReply.jinja2" 148 | }, 149 | "FormatReply.py": { 150 | "type": "python", 151 | "inputs": { 152 | "reply": { 153 | "type": [ 154 | "string" 155 | ] 156 | } 157 | }, 158 | "source": "FormatReply.py", 159 | "function": "format_reply" 160 | }, 161 | "output_prompt.jinja2": { 162 | "type": "prompt", 163 | "inputs": { 164 | "reply": { 165 | "type": [ 166 | "string" 167 | ] 168 | }, 169 | "retrieveddocs": { 170 | "type": [ 171 | "string" 172 | ] 173 | } 174 | }, 175 | "source": "output_prompt.jinja2" 176 | } 177 | } 178 | } -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/DetermineIntent.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are provided with two sentences, first line is the intent of user's previous request, second line is the user's current message. 3 | You must generate the intent of the current message. 4 | You must divide the intent of the current message into multiple specific single search intents. The single intents should not include greeting. 5 | Each single intent should have as less overlap with other single_intents as possible. 6 | You must generate single intents like below examples, except for the user question does not have a specific intent like "hello", "how are you?"" 7 | If you cannot understand the single intent of the current message, you must use the latest message as the single intent. 8 | You must use the following examples as a guidance to understand the user's current intent. 9 | Your response format must adhere to the examples below. 10 | You don't need to include the previous intent in your response when user shift to a new topic. 11 | 12 | #Very important instruction 13 | When the user's current query shifts to a new topic, your response must change to the corresponding new topic. 14 | You must generate all single intents for the current message. If no single intent is generated, you must generate an empty list of single intents like []. 15 | You must keep the current message intent in the same language as user's input current query. 16 | - You must keep the single intents in the same language as user's input current query. 17 | 18 | [EXAMPLES] 19 | user: 20 | previous intent: what is OpenAI? 21 | current query: list the result in a table. 22 | assistant: 23 | Current Message Intent: what is OpenAI and list the result in a table? 24 | Single Intents: ["what is OpenAI?"] 25 | user: 26 | previous intent: what is OpenAI? 27 | current query: summarize the result into 2 sentences. 28 | assistant: 29 | Current Message Intent: what is OpenAI and summarize the result into 2 sentences. 30 | Single Intents: ["what is OpenAI?"] 31 | user: 32 | previous intent: how to query a database with C# 33 | current query: how about Python 34 | assistant: 35 | Current Message Intent: how to query a database with Python 36 | Single Intents: ["how to query a database with Python"] 37 | user: 38 | previous intent: Tell me about vm. 39 | current query: What is the price of it, office 365 and azure? 
40 | assistant: 41 | Current Message Intent: What is the price of virtual machine, office 365 and azure? 42 | Single Intents: ["what is the price of virtual machine?", "what is the price of office 365?", "what is the price of azure?"] 43 | user: 44 | previous intent: None 45 | current query: aoai? 46 | assistant: 47 | Current Message Intent: What is aoai? 48 | Single Intents: ["what is Azure OpenAI?"] 49 | user: 50 | previous intent: what is IKEA? 51 | current query: hello 52 | assistant: 53 | Current Message Intent: hello 54 | Single Intents: [] 55 | user: 56 | previous intent: what is IKEA? 57 | current query: What is azure ml? how can i create a new workspace? 58 | assistant: 59 | Current Message Intent: What is azure ml? How can i create a new workspace? 60 | Single Intents: ["what is azure ml?", "how can i create a new workspace in azure ml?"] 61 | [END EXAMPLES] 62 | 63 | user: 64 | previous intent: {{ chat_history[-1]["outputs"]["current_query_intent"] if chat_history else 'None' }} 65 | current query: {{query}} 66 | assistant: 67 | Current Message Intent: 68 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/ExtractIntent.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | @tool 4 | def extract_intent(input: str, query: str) -> str: 5 | entries = None 6 | if 'Single Intents:' in input: 7 | entries = input.split('Single Intents:', 2) 8 | elif 'Single Intent:' in input: 9 | entries = input.split('Single Intent:', 2) 10 | 11 | if entries and len(entries) == 2: 12 | return { 13 | "current_message_intent": entries[0].strip(), 14 | "search_intents": entries[1].strip() 15 | } 16 | return { 17 | "current_message_intent": query, 18 | "search_intents": query 19 | } 20 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/FormatConversation.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | @tool 4 | def format_conversation(query: str, history: list, maxTokens: int) -> str: 5 | result = "" 6 | conversation_history = [] 7 | for history_item in history: 8 | conversation_history.append({ 9 | "speaker": "user", 10 | "message": history_item["inputs"]["query"] 11 | }) 12 | conversation_history.append({ 13 | "speaker": "assistant", 14 | "message": history_item["outputs"]["reply"] 15 | }) 16 | 17 | # Start using context from history, starting from most recent, until token limit is reached. 
18 | for turn in reversed(conversation_history): 19 | turnStr = format_turn(turn["speaker"], turn["message"]) 20 | newResult = turnStr + result 21 | if estimate_tokens(newResult) > maxTokens: 22 | break 23 | result = newResult 24 | return result 25 | 26 | def format_turn(speaker: str, message: str) -> str: 27 | return f"{speaker}:\n{message}\n" 28 | 29 | def estimate_tokens(text: str) -> int: 30 | return (len(text) + 2) / 3 31 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/FormatReply.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | @tool 4 | def format_reply(reply: str) -> str: 5 | reply = clean_markdown(reply) 6 | return reply 7 | 8 | def clean_markdown(input: str) -> str: 9 | start = 0 10 | inBlock = False 11 | result = "" 12 | while True: 13 | nextStart = input.find("```", start) 14 | if nextStart == -1: 15 | break 16 | result += input[start:nextStart] 17 | if inBlock: 18 | if nextStart > 0 and input[nextStart - 1] != '\n': 19 | result += "\n" 20 | result += "```\n" 21 | inBlock = False 22 | else: 23 | result += "```" 24 | inBlock = True 25 | start = nextStart + 3 26 | result += input[start:] 27 | if inBlock: 28 | result += "```" 29 | return result 30 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/FormatRetrievedDocuments.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | @tool 4 | def format_retrieved_documents(docs: object, maxTokens: int) -> str: 5 | formattedDocs = [] 6 | strResult = "" 7 | for index, doc in enumerate(docs): 8 | formattedDocs.append({ 9 | f"[doc{index}]": { 10 | "title": doc['title'], 11 | "content": doc['content'] 12 | } 13 | }) 14 | formattedResult = { "retrieved_documents": formattedDocs } 15 | nextStrResult = str(formattedResult) 16 | if (estimate_tokens(nextStrResult) > maxTokens): 17 | break 18 | strResult = nextStrResult 19 | 20 | return strResult 21 | 22 | def estimate_tokens(text: str) -> int: 23 | return (len(text) + 2) / 3 24 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/RetrieveDocuments.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | from promptflow import tool 5 | from promptflow.connections import AzureOpenAIConnection 6 | from promptflow.connections import CognitiveSearchConnection 7 | 8 | fieldMap = { 9 | "id": ["id"], 10 | "url": ["url", "uri", "link", "document_link"], 11 | "filepath": ["filepath", "filename"], 12 | "content": ["content"] 13 | } 14 | titleRegex = re.compile(r"title: (.*)\n") 15 | 16 | def getIfString(doc, fieldName): 17 | try: 18 | value = doc.get(fieldName) 19 | if isinstance(value, str) and len(value) > 0: 20 | return value 21 | return None 22 | except: 23 | return None 24 | 25 | def get_truncated_string(string_value, max_length): 26 | return string_value[:max_length] 27 | 28 | def getTitle(doc): 29 | max_title_length = 150 30 | title = getIfString(doc, 'title') 31 | if title: 32 | return get_truncated_string(title, max_title_length) 33 | else: 34 | title = getIfString(doc, 'content') 35 | if title: 36 | titleMatch = titleRegex.search(title) 37 | if titleMatch: 38 | return get_truncated_string(titleMatch.group(1), max_title_length) 39 | else: 40 | return None 41 | else: 42 
| return None 43 | 44 | def getChunkId(doc): 45 | chunk_id = getIfString(doc, 'chunk_id') 46 | return chunk_id 47 | 48 | def getSearchScore(doc): 49 | try: 50 | return doc['@search.score'] 51 | except: 52 | return None 53 | 54 | def getQueryList(query): 55 | try: 56 | config = json.loads(query) 57 | return config 58 | except Exception: 59 | return [query] 60 | 61 | def process_search_docs_response(docs): 62 | outputs = [] 63 | for doc in docs: 64 | formattedDoc = {} 65 | for fieldName in fieldMap.keys(): 66 | for fromFieldName in fieldMap[fieldName]: 67 | fieldValue = getIfString(doc, fromFieldName) 68 | if fieldValue: 69 | formattedDoc[fieldName] = doc[fromFieldName] 70 | break 71 | formattedDoc['title'] = getTitle(doc) 72 | formattedDoc['chunk_id'] = getChunkId(doc) 73 | formattedDoc['search_score'] = getSearchScore(doc) 74 | outputs.append(formattedDoc) 75 | return outputs 76 | 77 | def get_query_embedding(query, endpoint, api_key, api_version, embedding_model_deployment): 78 | request_url = f"{endpoint}/openai/deployments/{embedding_model_deployment}/embeddings?api-version={api_version}" 79 | headers = { 80 | "Content-Type": "application/json", 81 | "api-key": api_key 82 | } 83 | request_payload = { 84 | 'input': query 85 | } 86 | embedding_response = requests.post(request_url, json = request_payload, headers = headers, timeout=None) 87 | if embedding_response.status_code == 200: 88 | data_values = embedding_response.json()["data"] 89 | embeddings_vectors = [data_value["embedding"] for data_value in data_values] 90 | return embeddings_vectors 91 | else: 92 | raise Exception(f"failed to get embedding: {embedding_response.json()}") 93 | 94 | def search_query_api( 95 | endpoint, 96 | api_key, 97 | api_version, 98 | index_name, 99 | query_type, 100 | query, 101 | top_k, 102 | embeddingModelConnection, 103 | embeddingModelName = None, 104 | semantic_configuration_name=None, 105 | vectorFields=None): 106 | request_url = f"{endpoint}/indexes/{index_name}/docs/search?api-version={api_version}" 107 | request_payload = { 108 | 'top': top_k, 109 | 'queryLanguage': 'en-us' 110 | } 111 | if query_type == 'simple': 112 | request_payload['search'] = query 113 | request_payload['queryType'] = query_type 114 | elif query_type == 'semantic': 115 | request_payload['search'] = query 116 | request_payload['queryType'] = query_type 117 | request_payload['semanticConfiguration'] = semantic_configuration_name 118 | elif query_type in ('vector', 'vectorSimpleHybrid', 'vectorSemanticHybrid'): 119 | if vectorFields and embeddingModelName: 120 | query_vectors = get_query_embedding( 121 | query, 122 | embeddingModelConnection["api_base"], 123 | embeddingModelConnection["api_key"], 124 | embeddingModelConnection["api_version"], 125 | embeddingModelName 126 | ) 127 | payload_vectors = [{"value": query_vector, "fields": vectorFields, "k": top_k } for query_vector in query_vectors] 128 | request_payload['vectors'] = payload_vectors 129 | 130 | if query_type == 'vectorSimpleHybrid': 131 | request_payload['search'] = query 132 | elif query_type == 'vectorSemanticHybrid': 133 | request_payload['search'] = query 134 | request_payload['queryType'] = 'semantic' 135 | request_payload['semanticConfiguration'] = semantic_configuration_name 136 | else: 137 | raise Exception(f"unsupported query type: {query_type}") 138 | 139 | headers = { 140 | "Content-Type": "application/json", 141 | "api-key": api_key 142 | } 143 | retrieved_docs = requests.post(request_url, json = request_payload, headers = headers, timeout=None) 144 | if 
retrieved_docs.status_code == 200: 145 | return process_search_docs_response(retrieved_docs.json()["value"]) 146 | else: 147 | raise Exception(f"failed to query search index : {retrieved_docs.json()}") 148 | 149 | @tool 150 | def search(queries: str, searchConnection: CognitiveSearchConnection, indexName: str, queryType: str, topK: int, semanticConfiguration: str, vectorFields: str, embeddingModelConnection: AzureOpenAIConnection, embeddingModelName: str): 151 | semanticConfiguration = semanticConfiguration if semanticConfiguration != "None" else None 152 | vectorFields = vectorFields if vectorFields != "None" else None 153 | embeddingModelName = embeddingModelName if embeddingModelName != None else None 154 | 155 | # Do search. 156 | allOutputs = [search_query_api( 157 | searchConnection['api_base'], 158 | searchConnection['api_key'], 159 | searchConnection['api_version'], 160 | indexName, 161 | queryType, 162 | query, 163 | topK, 164 | embeddingModelConnection, 165 | embeddingModelName, 166 | semanticConfiguration, 167 | vectorFields) for query in getQueryList(queries)] 168 | 169 | includedOutputs = [] 170 | while allOutputs and len(includedOutputs) < topK: 171 | for output in list(allOutputs): 172 | if len(output) == 0: 173 | allOutputs.remove(output) 174 | continue 175 | value = output.pop(0) 176 | if value not in includedOutputs: 177 | includedOutputs.append(value) 178 | if len(includedOutputs) >= topK: 179 | break 180 | return includedOutputs 181 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/concat_reply.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | 4 | # The inputs section will change based on the arguments of the tool function, after you save the code 5 | # Adding type to arguments and return value will help the system show the types properly 6 | # Please update the function name/signature per need 7 | @tool 8 | def my_python_tool(reply: str, docs: str) -> str: 9 | return 'hello ' + input1 -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/dmv_sample_qs.csv: -------------------------------------------------------------------------------- 1 | chat_history,question,answer,output 2 | [],What documents do I need to bring to renew my driver's license?,"To renew your driver's license, you need to bring one proof of identity and two proofs of residency.", 3 | [],How long does it take to get a new driver's license?,The context does not provide a specific answer to this question., 4 | [],Can I schedule an appointment for my DMV visit?,"Yes, you can schedule an appointment for your DMV visit. You can make an appointment by calling 1-800-777-0133.", 5 | [],Can I take a driving test without an appointment?,"No, you cannot take a driving test without an appointment. You may make an appointment by calling 1-800-777-0133. Online appointments are not available for this type of driving test. ", 6 | [],What is the fee for a driver's license renewal?,The fee for a driver's license renewal is $48. , 7 | [],Can I renew my registration online?,"Yes, you can renew your vehicle registration online. 
", 8 | [],How do I transfer a vehicle title?,"To transfer a vehicle title in California, you must complete the process within 10 days of buying the vehicle and submit the necessary paperwork to the DMV", 9 | [],What is the process for getting a handicap placard?,"To get a handicap placard, you need to apply for one through your state’s Department of Motor Vehicles (DMV). Each state has its own forms and criteria for handicapped parking permits. You can get a handicapped parking application from the DMV office or online1. After completing and signing the form, you need to ask your health-care provider to fill out and sign the portion that certifies disability. Then, you can submit the application by mail or in person", 10 | [],How do I get a commercial driver's license?,The given context does not provide any information about how to get a commercial driver's license. It only provides information about the requirements and examinations for obtaining a noncommercial Class A license in California., 11 | [],Can I renew my vehicle registration without a smog check?,"No, all vehicles registered in California are required to meet California requirements including vehicle emission controls in support of California’s clean air standards. DMV cannot register a vehicle if it does not qualify.", 12 | [],What is the process for getting a REAL ID?,"To get a REAL ID, one needs to bring the following documentation to their DMV office visit: 1) Identity/birth document, 2) Social Security number document, and 3) California Residency document(s) (two different documents). If the name on the identity document is different from the current legal name, a name change document(s) is also required. One can start their application online, upload the required documents, make a DMV appointment, and bring the confirmation code and original documents to the appointment.", 13 | [],Can I change my address on my driver's license online?,"Yes, you can change your address on your driver's license online by submitting a change of address online at dmv.ca.gov/addresschange ", 14 | [],How do I get a duplicate registration card?,"To get a duplicate registration card, you can use the DMV kiosks available at various locations such as grocery stores, DMV offices, and auto clubs. These kiosks allow you to complete your vehicle registration renewal, receive a replacement registration card or sticker, and submit proof of insurance, among other transactions ", 15 | [],Can I order personalized license plates?,"Yes, you can order personalized license plates. ", 16 | [],How do I apply for a driver's permit?,"To apply for a driver's permit, you need to follow these steps: 17 | 1. Complete a Driver's License & ID Card Application. 18 | 2. Provide the necessary documents, including proof of identity, two proofs of residency, a legal full name document (if the name on your identity document and application do not match), and your social security number (exceptions may apply). 19 | 3. Pay a non-refundable application fee. 20 | 4. Pass the knowledge test(s). 21 | 5. Pass a vision test. 22 | ", 23 | [],Can I take a motorcycle riding test at the DMV?,"Yes, you can take a motorcycle riding test at the DMV (Department of Motor Vehicles) to obtain a motorcycle only (Class M1/M2) license. Basic vehicle control skills are included in the motorcycle skills test to determine your ability to maneuver in normal traffic situations. 
The skills test may be waived for a person presenting a valid DL 389; however, you may be required to perform an observation test.", 24 | [],How do I get a non-driver identification card?,"To get a non-driver identification card, you must provide your identity document, residency documents, and social security number. You can complete an ID card application and get information on current acceptable documents and reduced-fee, no-fee, or senior ID cards by visiting dmv.ca.gov/id-cards", 25 | [],What is the process for surrendering my license plates?,The provided context does not contain information about the process for surrendering license plates., 26 | [],Can I pay my fees with a credit card?,"Yes, it is possible to pay fees with a credit card. However, the source does not specify whether DMV kiosks accept credit cards as a form of payment. It is recommended to check with the specific DMV office or kiosk to confirm their accepted forms of payment. ", 27 | [],How do I report a lost or stolen driver's license?,Information about reporting a lost or stolen driver's license is not provided in the given context., 28 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/output_prompt.jinja2: -------------------------------------------------------------------------------- 1 | {{reply}} 2 | 3 | Sources : 4 | 5 | {% for item in retrieveddocs %} 6 | doc_content: {{ item['content'] }} 7 | doc: {{ item['url'] }} 8 | 9 | {% endfor %} 10 | -------------------------------------------------------------------------------- /Examples/promptflow/dmv_copilot_flow/requirements.txt: -------------------------------------------------------------------------------- 1 | promptflow==1.5.0 2 | promptflow-tools==1.2.0 -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/.promptflow/flow.layout.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeLayouts": { 3 | "inputs": { 4 | "x": 290, 5 | "y": 42, 6 | "index": -1 7 | }, 8 | "outputs": { 9 | "x": 220, 10 | "y": 310, 11 | "index": -1 12 | }, 13 | "ag_test": { 14 | "x": 140, 15 | "y": 176, 16 | "index": 0 17 | } 18 | }, 19 | "orientation": "Vertical" 20 | } -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/.promptflow/flow.log: -------------------------------------------------------------------------------- 1 | 2024-02-07 15:15:09 -0800 2468 execution.flow INFO Start executing nodes in thread pool mode. 2 | 2024-02-07 15:15:09 -0800 2468 execution.flow INFO Start to run 1 nodes with concurrency level 16. 3 | 2024-02-07 15:15:09 -0800 2468 execution.flow INFO Executing node ag_test. node run id: d8062490-4000-46ee-950f-dbb9eea7dd00_ag_test_0 4 | 2024-02-07 15:15:21 -0800 2468 execution ERROR Node ag_test in line 0 failed. Exception: Execution failure in 'ag_test': (NotFoundError) Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}. 
5 | Traceback (most recent call last): 6 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\flow_execution_context.py", line 194, in _invoke_tool_with_timer 7 | return f(**kwargs) 8 | ^^^^^^^^^^^ 9 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\tracer.py", line 220, in wrapped 10 | output = func(*args, **kwargs) 11 | ^^^^^^^^^^^^^^^^^^^^^ 12 | File "C:\repo\AzureOpenAIExamples\Examples\promptflow\finance_assistant_pf\ag_test.py", line 90, in my_python_tool 13 | assistant = client.beta.assistants.retrieve(assistant_id) 14 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 15 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\resources\beta\assistants\assistants.py", line 140, in retrieve 16 | return self._get( 17 | ^^^^^^^^^^ 18 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\_base_client.py", line 1034, in get 19 | return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)) 20 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 21 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\_base_client.py", line 852, in request 22 | return self._request( 23 | ^^^^^^^^^^^^^^ 24 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\_base_client.py", line 933, in _request 25 | raise self._make_status_error_from_response(err.response) from None 26 | openai.NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}} 27 | 28 | The above exception was the direct cause of the following exception: 29 | 30 | Traceback (most recent call last): 31 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\flow_execution_context.py", line 90, in invoke_tool 32 | result = self._invoke_tool_with_timer(node, f, kwargs) 33 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 34 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\flow_execution_context.py", line 205, in _invoke_tool_with_timer 35 | raise ToolExecutionError(node_name=node_name, module=module) from e 36 | promptflow._core._errors.ToolExecutionError: Execution failure in 'ag_test': (NotFoundError) Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}} 37 | 2024-02-07 15:15:21 -0800 2468 execution ERROR Execution of one node has failed. Cancelling all running nodes: ag_test. 38 | -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/.promptflow/flow.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "answer": [ 3 | "user:\nwhat is the value of my portfolio\n", 4 | "assistant:\nTo determine the value of your portfolio, I will need a few key pieces of information:\n\n1. **Contents of the File**: I am not currently aware of the specific contents of the file you uploaded. Could you clarify whether it contains a list of investments (like stocks, bonds, etc.) with their respective quantities and the ticker symbols for any stocks?\n2. **Current Market Values**: To accurately assess the portfolio value, we will need the current market prices for each investment.\n\nOnce I understand the contents of the file, I can help retrieve the latest closing prices for any stocks or other traded assets within your portfolio. 
Then, I can calculate the total value of the portfolio.\n\nPlease provide the necessary details or give me permission to open and inspect the file to get the required information.\n" 5 | ] 6 | } -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/.promptflow/flow.tools.json: -------------------------------------------------------------------------------- 1 | { 2 | "package": {}, 3 | "code": { 4 | "ag_test.py": { 5 | "type": "python", 6 | "inputs": { 7 | "connection": { 8 | "type": [ 9 | "AzureOpenAIConnection" 10 | ] 11 | }, 12 | "input1": { 13 | "type": [ 14 | "string" 15 | ] 16 | }, 17 | "assistant_id": { 18 | "type": [ 19 | "string" 20 | ] 21 | } 22 | }, 23 | "source": "ag_test.py", 24 | "function": "my_python_tool" 25 | } 26 | } 27 | } -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/ag_test.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | from promptflow.connections import AzureOpenAIConnection 3 | 4 | import html 5 | import io 6 | import os 7 | import time 8 | from datetime import datetime 9 | from pathlib import Path 10 | from typing import Iterable 11 | 12 | import requests 13 | import yfinance as yf 14 | 15 | from openai import AzureOpenAI 16 | from openai.types import FileObject 17 | from openai.types.beta import Thread 18 | from openai.types.beta.threads import Run 19 | from openai.types.beta.threads.message_content_image_file import MessageContentImageFile 20 | from openai.types.beta.threads.message_content_text import MessageContentText 21 | from openai.types.beta.threads.messages import MessageFile 22 | from PIL import Image 23 | 24 | 25 | 26 | # The inputs section will change based on the arguments of the tool function, after you save the code 27 | # Adding type to arguments and return value will help the system show the types properly 28 | # Please update the function name/signature per need 29 | @tool 30 | def my_python_tool(connection: AzureOpenAIConnection, input1: str, assistant_id: str) -> str: 31 | def get_stock_price(symbol: str) -> float: 32 | stock = yf.Ticker(symbol) 33 | return stock.history(period="1d")["Close"].iloc[-1] 34 | 35 | 36 | tools_list = [ 37 | {"type": "code_interpreter"}, 38 | { 39 | "type": "function", 40 | "function": { 41 | "name": "get_stock_price", 42 | "description": "Retrieve the latest closing price of a stock using its ticker symbol.", 43 | "parameters": { 44 | "type": "object", 45 | "properties": {"symbol": {"type": "string", "description": "The ticker symbol of the stock"}}, 46 | "required": ["symbol"], 47 | }, 48 | }, 49 | }, 50 | ] 51 | 52 | # DATA_FOLDER = "data/" 53 | 54 | # def upload_file(client: AzureOpenAI, path: str) -> FileObject: 55 | # with Path(path).open("rb") as f: 56 | # return client.files.create(file=f, purpose="assistants") 57 | 58 | def call_functions(client: AzureOpenAI, thread: Thread, run: Run) -> None: 59 | print("Function Calling") 60 | required_actions = run.required_action.submit_tool_outputs.model_dump() 61 | print(required_actions) 62 | tool_outputs = [] 63 | import json 64 | 65 | for action in required_actions["tool_calls"]: 66 | func_name = action["function"]["name"] 67 | arguments = json.loads(action["function"]["arguments"]) 68 | 69 | if func_name == "get_stock_price": 70 | output = get_stock_price(symbol=arguments["symbol"]) 71 | tool_outputs.append({"tool_call_id": action["id"], "output": 
output}) 72 | else: 73 | raise ValueError(f"Unknown function: {func_name}") 74 | 75 | print("Submitting outputs back to the Assistant...") 76 | client.beta.threads.runs.submit_tool_outputs(thread_id=thread.id, run_id=run.id, tool_outputs=tool_outputs) 77 | 78 | 79 | client = AzureOpenAI(api_key=connection.api_key, api_version='2024-01-01-preview', azure_endpoint=connection.api_base) 80 | 81 | # arr = os.listdir(DATA_FOLDER) 82 | # assistant_files = [] 83 | # for file in arr: 84 | # filePath = DATA_FOLDER + file 85 | # assistant_files.append(upload_file(client, filePath)) 86 | 87 | # file_ids = [file.id for file in assistant_files] 88 | 89 | 90 | assistant = client.beta.assistants.retrieve(assistant_id) 91 | #assistant = client.beta.assistants.create( 92 | # name="Portfolio Management Assistant", 93 | # instructions="You are a personal securities trading assistant. Please be polite, professional, helpful, and friendly. " 94 | # + "Use the provided portfolio CSV file to answer the questions. If question is not related to the portfolio or you cannot answer the question, say, 'contact a representative for more assistance.'" 95 | # + "If the user asks for help or says 'help', provide a list of sample questions that you can answer.", 96 | # tools=tools_list, 97 | # model='gpt-4', 98 | # file_ids=file_ids, 99 | #) 100 | 101 | thread = client.beta.threads.create() 102 | 103 | def process_message(content: str) -> None: 104 | client.beta.threads.messages.create(thread_id=thread.id, role="user", content=content) 105 | 106 | run = client.beta.threads.runs.create( 107 | thread_id=thread.id, 108 | assistant_id=assistant.id, 109 | instructions="The current date and time is: " + datetime.now().strftime("%x %X") + ".", 110 | ) 111 | 112 | print("processing...") 113 | while True: 114 | run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id) 115 | if run.status == "completed": 116 | messages = client.beta.threads.messages.list(thread_id=thread.id) 117 | return format_messages(messages) 118 | break 119 | if run.status == "failed": 120 | messages = client.beta.threads.messages.list(thread_id=thread.id) 121 | return format_messages(messages) 122 | # Handle failed 123 | break 124 | if run.status == "expired": 125 | # Handle expired 126 | break 127 | if run.status == "cancelled": 128 | # Handle cancelled 129 | break 130 | if run.status == "requires_action": 131 | call_functions(client, thread, run) 132 | else: 133 | time.sleep(5) 134 | 135 | def format_messages(messages: Iterable[MessageFile]) -> None: 136 | message_list = [] 137 | 138 | # Get all the messages till the last user message 139 | for message in messages: 140 | message_list.append(message) 141 | if message.role == "user": 142 | break 143 | 144 | # Reverse the messages to show the last user message first 145 | message_list.reverse() 146 | 147 | # Print the user or Assistant messages or images 148 | return_msg = [] 149 | for message in message_list: 150 | for item in message.content: 151 | # Determine the content type 152 | #if isinstance(item, MessageContentText): 153 | return_msg.append(f"{message.role}:\n{item.text.value}\n") 154 | # elif isinstance(item, MessageContentImageFile): 155 | # # Retrieve image from file id 156 | # response_content = client.files.content(item.image_file.file_id) 157 | # data_in_bytes = response_content.read() 158 | # # Convert bytes to image 159 | # readable_buffer = io.BytesIO(data_in_bytes) 160 | # image = Image.open(readable_buffer) 161 | # # Resize image to fit in terminal 162 | # width, height = 
image.size 163 | # image = image.resize((width // 2, height // 2), Image.LANCZOS) 164 | # # Display image 165 | # image.show() 166 | return message_list 167 | 168 | return process_message(input1) -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/chat.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are a helpful assistant. 3 | 4 | {% for item in chat_history %} 5 | user: 6 | {{item.inputs.question}} 7 | assistant: 8 | {{item.outputs.answer}} 9 | {% endfor %} 10 | 11 | user: 12 | {{question}} -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/data/portfolio.csv: -------------------------------------------------------------------------------- 1 | Symbol,Average_Cost,QTY 2 | MSFT,200,300 3 | AAPL,114,200 4 | AMZN,125,50 5 | TSLA,900,100 6 | NFLX,540,80 7 | NVDA,450,50 -------------------------------------------------------------------------------- /Examples/promptflow/finance_assistant_pf/flow.dag.yaml: -------------------------------------------------------------------------------- 1 | id: template_chat_flow 2 | name: Template Chat Flow 3 | environment: 4 | python_requirements_txt: requirements.txt 5 | inputs: 6 | chat_history: 7 | type: list 8 | default: [] 9 | is_chat_input: false 10 | is_chat_history: true 11 | question: 12 | type: string 13 | default: what is the value of my portfolio 14 | is_chat_input: true 15 | outputs: 16 | answer: 17 | type: string 18 | reference: ${ag_test.output} 19 | is_chat_output: true 20 | nodes: 21 | - name: ag_test 22 | type: python 23 | source: 24 | type: code 25 | path: ag_test.py 26 | inputs: 27 | connection: Default_AzureOpenAI 28 | input1: ${inputs.question} 29 | assistant_id: asst_9t1k8YqEXYdsk6V565grn89e 30 | use_variants: false 31 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.layout.json: -------------------------------------------------------------------------------- 1 | { 2 | "orientation": "Vertical", 3 | "nodeLayouts": { 4 | "inputs": { 5 | "x": 1407.5, 6 | "y": 42, 7 | "index": -1 8 | }, 9 | "outputs": { 10 | "x": 1832.5, 11 | "y": 846, 12 | "index": -1 13 | }, 14 | "gpt_coherence": { 15 | "x": 1315, 16 | "y": 444, 17 | "height": 75, 18 | "index": 0 19 | }, 20 | "concat_scores": { 21 | "x": 1742.5, 22 | "y": 712, 23 | "height": 75, 24 | "index": 1 25 | }, 26 | "gpt_similarity": { 27 | "x": 2732.5, 28 | "y": 444, 29 | "index": 2 30 | }, 31 | "gpt_relevance": { 32 | "x": 2402.5, 33 | "y": 444, 34 | "height": 75, 35 | "index": 3 36 | }, 37 | "gpt_fluency": { 38 | "x": 1742.5, 39 | "y": 444, 40 | "height": 75, 41 | "index": 4 42 | }, 43 | "f1_score": { 44 | "x": 985, 45 | "y": 444, 46 | "height": 75, 47 | "index": 5 48 | }, 49 | "gpt_groundedness": { 50 | "x": 2072.5, 51 | "y": 444, 52 | "height": 75, 53 | "index": 6 54 | }, 55 | "aggregate_variants_results": { 56 | "x": 140, 57 | "y": 846, 58 | "index": 7 59 | }, 60 | "select_metrics": { 61 | "x": 1490, 62 | "y": 176, 63 | "height": 75, 64 | "index": 8 65 | }, 66 | "embeded_ground_truth": { 67 | "x": 305, 68 | "y": 444, 69 | "index": 9 70 | }, 71 | "embeded_answer": { 72 | "x": 635, 73 | "y": 444, 74 | "height": 75, 75 | "index": 10 76 | }, 77 | "ada_similarity": { 78 | "x": 635, 79 | "y": 578, 80 | "height": 75, 81 | "index": 11 82 | }, 83 | "validate_input": { 84 | "x": 1402.5, 85 | "y": 310, 86 | 
"height": 75, 87 | "index": 12 88 | } 89 | }, 90 | "viewport": { 91 | "transformMatrix": [ 92 | 1, 93 | 0, 94 | 0, 95 | 1, 96 | -1117.46875, 97 | -76.83331888914108 98 | ] 99 | } 100 | } -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.log: -------------------------------------------------------------------------------- 1 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Start executing nodes in thread pool mode. 2 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Start to run 12 nodes with concurrency level 16. 3 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node select_metrics. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_select_metrics_0 4 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Node select_metrics completes. 5 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node validate_input. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_validate_input_0 6 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Node validate_input completes. 7 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_coherence' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_coherence}' is equal to 'True'. 8 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_similarity' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_similarity}' is equal to 'True'. 9 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_relevance' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_relevance}' is equal to 'True'. 10 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_fluency' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_fluency}' is equal to 'True'. 11 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'f1_score' will be executed because the activate condition is met, i.e. '${validate_input.output.f1_score}' is equal to 'True'. 12 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_groundedness' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_groundedness}' is equal to 'True'. 13 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'embeded_ground_truth' will be executed because the activate condition is met, i.e. '${validate_input.output.ada_similarity}' is equal to 'True'. 14 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'embeded_answer' will be executed because the activate condition is met, i.e. '${validate_input.output.ada_similarity}' is equal to 'True'. 15 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_coherence. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_coherence_0 16 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_similarity. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_similarity_0 17 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_relevance. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_relevance_0 18 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_fluency. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_fluency_0 19 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node f1_score. 
node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_f1_score_0 20 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_groundedness. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_groundedness_0 21 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Node f1_score completes. 22 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node embeded_ground_truth. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_embeded_ground_truth_0 23 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node embeded_answer. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_embeded_answer_0 24 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_relevance completes. 25 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_similarity completes. 26 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_groundedness completes. 27 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_coherence completes. 28 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_fluency completes. 29 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node embeded_ground_truth completes. 30 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node embeded_answer completes. 31 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO The node 'ada_similarity' will be executed because the activate condition is met, i.e. '${validate_input.output.ada_similarity}' is equal to 'True'. 32 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Executing node ada_similarity. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_ada_similarity_0 33 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node ada_similarity completes. 34 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Executing node concat_scores. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_concat_scores_0 35 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node concat_scores completes. 36 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Start to run 1 nodes with concurrency level 16. 37 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Executing node aggregate_variants_results. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_aggregate_variants_results_reduce 38 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node aggregate_variants_results completes. 
39 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_coherence": 3.0, 3 | "gpt_similarity": 5.0, 4 | "gpt_fluency": 5.0, 5 | "gpt_relevance": 5.0, 6 | "gpt_groundedness": 5.0, 7 | "f1_score": 0.5, 8 | "ada_similarity": 0.93 9 | } -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "f1_score": 0.5, 3 | "gpt_coherence": 3.0, 4 | "gpt_similarity": 5.0, 5 | "gpt_fluency": 5.0, 6 | "gpt_relevance": 5.0, 7 | "gpt_groundedness": 5.0, 8 | "ada_similarity": 0.9313747004677263 9 | } -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.uihint.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "select_metrics": { 4 | "variant_0": { 5 | "inputs": {}, 6 | "inputsValue": {} 7 | } 8 | }, 9 | "gpt_coherence": { 10 | "variant_0": { 11 | "inputs": {}, 12 | "inputsValue": {} 13 | } 14 | }, 15 | "validate_input": { 16 | "variant_0": { 17 | "inputs": {}, 18 | "inputsValue": {} 19 | } 20 | }, 21 | "ada_similarity": { 22 | "variant_0": { 23 | "inputs": {}, 24 | "inputsValue": {} 25 | } 26 | }, 27 | "embeded_answer": { 28 | "variant_0": { 29 | "inputs": {}, 30 | "inputsValue": {} 31 | } 32 | }, 33 | "gpt_relevance": { 34 | "variant_0": { 35 | "inputs": {}, 36 | "inputsValue": {} 37 | } 38 | }, 39 | "aggregate_variants_results": { 40 | "variant_0": { 41 | "inputs": {}, 42 | "inputsValue": {} 43 | } 44 | }, 45 | "concat_scores": { 46 | "variant_0": { 47 | "inputs": {}, 48 | "inputsValue": {} 49 | } 50 | }, 51 | "gpt_groundedness": { 52 | "variant_0": { 53 | "inputs": {}, 54 | "inputsValue": {} 55 | } 56 | }, 57 | "gpt_similarity": { 58 | "variant_0": { 59 | "inputs": {}, 60 | "inputsValue": {} 61 | } 62 | }, 63 | "gpt_fluency": { 64 | "variant_0": { 65 | "inputs": {}, 66 | "inputsValue": {} 67 | } 68 | }, 69 | "embeded_ground_truth": { 70 | "variant_0": { 71 | "inputs": {}, 72 | "inputsValue": {} 73 | } 74 | }, 75 | "f1_score": { 76 | "variant_0": { 77 | "inputs": {}, 78 | "inputsValue": {} 79 | } 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/README.md: -------------------------------------------------------------------------------- 1 | # Q&A Evaluation: 2 | 3 | The Q&A evaluation flow will evaluate the Q&A systems by leveraging the state-of-the-art Large Language Models (LLM) to measure the quality and safety of your responses. Utilizing GPT and GPT embedding model to assist with measurements aims to achieve a high agreement with human evaluations compared to traditional mathematical measurements. 4 | 5 | ## What you will learn 6 | 7 | The Q&A evaluation flow allows you to assess and evaluate your model with the LLM-assisted metrics and f1_score: 8 | 9 | 10 | * __gpt_coherence__: Measures the quality of all sentences in a model's predicted answer and how they fit together naturally. 11 | 12 | Coherence is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 
13 | 14 | * __gpt_relevance__: Measures how relevant the model's predicted answers are to the questions asked. 15 | 16 | Relevance metric is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 17 | 18 | * __gpt_fluency__: Measures how grammatically and linguistically correct the model's predicted answer is. 19 | 20 | Fluency is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best 21 | 22 | * __gpt_similarity__: Measures similarity between user-provided ground truth answers and the model predicted answer. 23 | 24 | Similarity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 25 | 26 | * __gpt_groundedness__ (against context)**: Measures how grounded the model's predicted answers are against the context. Even if LLM’s responses are true, if not verifiable against context, then such responses are considered ungrounded. 27 | 28 | Groundedness metric is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 29 | 30 | * __ada_similarity__: Measures the cosine similarity of ada embeddings of the model prediction and the ground truth. 31 | 32 | ada_similarity is a value in the range [0, 1]. 33 | 34 | * __F1-score__: Compute the f1-Score based on the tokens in the predicted answer and the ground truth. 35 | 36 | The f1-score evaluation flow allows you to determine the f1-score metric using number of common tokens between the normalized version of the ground truth and the predicted answer. 37 | 38 | F1-score is a value in the range [0, 1]. 39 | 40 | 41 | ## Prerequisites 42 | 43 | - Connection: Azure OpenAI or OpenAI connection. 44 | - Data input: Evaluating the Coherence metric requires you to provide data inputs including a question, an answer, a ground truth, and a context. 
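As a quick illustration of the F1-score metric described above, here is a minimal, self-contained sketch of token-overlap F1 (the flow's actual implementation lives in `f1_score.py` and additionally normalizes text by lower-casing and removing punctuation and articles; the example strings below are made up):

```python
from collections import Counter

def token_f1(ground_truth: str, answer: str) -> float:
    # Token-overlap F1: count tokens shared between the prediction and the reference.
    gt_tokens = ground_truth.lower().split()
    pred_tokens = answer.lower().split()
    common = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if common == 0:
        return 0.0
    precision = common / len(pred_tokens)
    recall = common / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

# Hypothetical example pair (not taken from the flow's sample data):
print(round(token_f1("the alpine explorer tent is the most waterproof",
                     "the alpine tent is waterproof"), 2))  # 0.77
```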
45 | 46 | ## Tools used in this flow 47 | - LLM tool 48 | - Python tool 49 | - Embedding tool -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/ada_cosine_similarity_score.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import numpy as np 3 | from numpy.linalg import norm 4 | 5 | 6 | @tool 7 | def compute_ada_cosine_similarity(a, b) -> float: 8 | return np.dot(a, b)/(norm(a)*norm(b)) 9 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/aggregate_variants_results.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from promptflow import tool, log_metric 3 | import numpy as np 4 | 5 | 6 | @tool 7 | def aggregate_variants_results(results: List[dict], metrics: List[str]): 8 | aggregate_results = {} 9 | for result in results: 10 | for name, value in result.items(): 11 | if name in metrics[0]: 12 | if name not in aggregate_results.keys(): 13 | aggregate_results[name] = [] 14 | try: 15 | float_val = float(value) 16 | except Exception: 17 | float_val = np.nan 18 | aggregate_results[name].append(float_val) 19 | 20 | for name, value in aggregate_results.items(): 21 | if name in metrics[0]: 22 | aggregate_results[name] = np.nanmean(value) 23 | aggregate_results[name] = round(aggregate_results[name], 2) 24 | log_metric(name, aggregate_results[name]) 25 | return aggregate_results 26 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/concat_scores.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import numpy as np 3 | import re 4 | 5 | 6 | @tool 7 | def concat_results(gpt_coherence_score: str = None, 8 | gpt_similarity_score: str = None, 9 | gpt_fluency_score: str = None, 10 | gpt_relevance_score: str = None, 11 | gpt_groundedness_score: str = None, 12 | f1_score: float = None, 13 | ada_cosine_similarity: float = None): 14 | 15 | load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score}, 16 | {'name': 'gpt_similarity', 'score': gpt_similarity_score}, 17 | {'name': 'gpt_fluency', 'score': gpt_fluency_score}, 18 | {'name': 'gpt_relevance', 'score': gpt_relevance_score}, 19 | {'name': 'gpt_groundedness', 'score': gpt_groundedness_score}, 20 | {'name': 'f1_score', 'score': f1_score}, 21 | {'name': 'ada_similarity', 'score': ada_cosine_similarity}] 22 | 23 | scalar_metrics = ["f1_score", "ada_similarity"] 24 | score_list = [] 25 | errors = [] 26 | for item in load_list: 27 | if item["name"] in scalar_metrics: 28 | try: 29 | score = float(item["score"]) 30 | except Exception as e: 31 | score = np.nan 32 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]}) 33 | else: 34 | if item['score']: 35 | try: 36 | score = item["score"] 37 | match = re.search(r'\d', score) 38 | if match: 39 | score = float(match.group()) 40 | else: 41 | score = np.nan 42 | except Exception as e: 43 | score = np.nan 44 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]}) 45 | else: 46 | score = np.nan 47 | score_list.append({"name": item["name"], "score": score}) 48 | 49 | variant_level_result = {} 50 | for item in score_list: 51 | item_name = str(item["name"]) 52 | 
variant_level_result[item_name] = item["score"] 53 | if 'gpt' in item_name: 54 | variant_level_result[item_name + '_pass_rate'] = 1 if item["score"] > 3 else 0 55 | return variant_level_result 56 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/f1_score.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | from collections import Counter 3 | 4 | 5 | @tool 6 | def compute_f1_score(ground_truth: str, answer: str) -> str: 7 | import string 8 | import re 9 | 10 | class QASplitTokenizer: 11 | def __call__(self, line): 12 | """Tokenizes an input line using split() on whitespace 13 | 14 | :param line: a segment to tokenize 15 | :return: the tokenized line 16 | """ 17 | 18 | return line.split() 19 | 20 | def normalize_text(text) -> str: 21 | """Lower text and remove punctuation, articles and extra whitespace.""" 22 | 23 | def remove_articles(text): 24 | return re.sub(r"\b(a|an|the)\b", " ", text) 25 | 26 | def white_space_fix(text): 27 | return " ".join(text.split()) 28 | 29 | def remove_punctuation(text): 30 | exclude = set(string.punctuation) 31 | return "".join(ch for ch in text if ch not in exclude) 32 | 33 | def lower(text): 34 | return text.lower() 35 | 36 | return white_space_fix(remove_articles(remove_punctuation(lower(text)))) 37 | prediction_tokens = normalize_text(answer) 38 | reference_tokens = normalize_text(ground_truth) 39 | tokenizer = QASplitTokenizer() 40 | prediction_tokens = tokenizer(prediction_tokens) 41 | reference_tokens = tokenizer(reference_tokens) 42 | 43 | common_tokens = Counter(prediction_tokens) & Counter(reference_tokens) 44 | num_common_tokens = sum(common_tokens.values()) 45 | 46 | if num_common_tokens == 0: 47 | f1 = 0.0 48 | else: 49 | precision = 1.0 * num_common_tokens / len(prediction_tokens) 50 | recall = 1.0 * num_common_tokens / len(reference_tokens) 51 | 52 | f1 = (2.0 * precision * recall) / (precision + recall) 53 | 54 | return f1 55 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/flow.meta.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json 2 | name: qna_non_rag_eval 3 | display_name: QnA Evaluation 4 | type: evaluate 5 | path: ./flow.dag.yaml 6 | description: Compute the quality of the answer for the given question based on the ground_truth and the context 7 | properties: 8 | promptflow.stage: prod 9 | promptflow.details.type: markdown 10 | promptflow.details.source: README.md 11 | promptflow.batch_inputs: samples.json -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_coherence_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | 4 | user: 5 | Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. 
Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: 6 | One star: the answer completely lacks coherence 7 | Two stars: the answer mostly lacks coherence 8 | Three stars: the answer is partially coherent 9 | Four stars: the answer is mostly coherent 10 | Five stars: the answer has perfect coherency 11 | 12 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 13 | 14 | question: What is your favorite indoor activity and why do you enjoy it? 15 | answer: I like pizza. The sun is shining. 16 | stars: 1 17 | 18 | question: Can you describe your favorite movie without giving away any spoilers? 19 | answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. 20 | stars: 2 21 | 22 | question: What are some benefits of regular exercise? 23 | answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. 24 | stars: 3 25 | 26 | question: How do you cope with stress in your daily life? 27 | answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. 28 | stars: 4 29 | 30 | question: What can you tell me about climate change and its effects on the environment? 31 | answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. 32 | stars: 5 33 | 34 | question: {{question}} 35 | answer: {{answer}} 36 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_fluency_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: 5 | One star: the answer completely lacks fluency 6 | Two stars: the answer mostly lacks fluency 7 | Three stars: the answer is partially fluent 8 | Four stars: the answer is mostly fluent 9 | Five stars: the answer has perfect fluency 10 | 11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 12 | 13 | question: What did you have for breakfast today? 14 | answer: Breakfast today, me eating cereal and orange juice very good. 15 | stars: 1 16 | 17 | question: How do you feel when you travel alone? 18 | answer: Alone travel, nervous, but excited also. I feel adventure and like its time. 19 | stars: 2 20 | 21 | question: When was the last time you went on a family vacation? 22 | answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun. 
23 | stars: 3 24 | 25 | question: What is your favorite thing about your job? 26 | answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories. 27 | stars: 4 28 | 29 | question: Can you describe your morning routine? 30 | answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am. 31 | stars: 5 32 | 33 | question: {{question}} 34 | answer: {{answer}} 35 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_groundedness_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: 5 | 1. 5: The ANSWER follows logically from the information contained in the CONTEXT. 6 | 2. 1: The ANSWER is logically false from the information contained in the CONTEXT. 7 | 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 8 | Independent Examples: 9 | ## Example Task #1 Input: 10 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 11 | ## Example Task #1 Output: 12 | 1 13 | ## Example Task #2 Input: 14 | {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} 15 | ## Example Task #2 Output: 16 | 5 17 | ## Example Task #3 Input: 18 | {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} 19 | ## Example Task #3 Output: 20 | 5 21 | ## Example Task #4 Input: 22 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 23 | ## Example Task #4 Output: 24 | 1 25 | ## Actual Task Input: 26 | {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}} 27 | Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. 
Do not repeat the context and question. 28 | Actual Task Output: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_relevance_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: 5 | One star: the answer completely lacks relevance 6 | Two stars: the answer mostly lacks relevance 7 | Three stars: the answer is partially relevant 8 | Four stars: the answer is mostly relevant 9 | Five stars: the answer has perfect relevance 10 | 11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 12 | 13 | context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. 14 | question: What field did Marie Curie excel in? 15 | answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. 16 | stars: 1 17 | 18 | context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. 19 | question: Where were The Beatles formed? 20 | answer: The band The Beatles began their journey in London, England, and they changed the history of music. 21 | stars: 2 22 | 23 | context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 24 | question: What are the main goals of Perseverance Mars rover mission? 25 | answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. 26 | stars: 3 27 | 28 | context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. 29 | question: What are the main components of the Mediterranean diet? 30 | answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. 31 | stars: 4 32 | 33 | context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. 34 | question: What are the main attractions of the Queen's Royal Castle? 35 | answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. 
36 | stars: 5 37 | 38 | context: {{context}} 39 | question: {{question}} 40 | answer: {{answer}} 41 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_similarity_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: 5 | One star: the predicted answer is not at all similar to the correct answer 6 | Two stars: the predicted answer is mostly not similar to the correct answer 7 | Three stars: the predicted answer is somewhat similar to the correct answer 8 | Four stars: the predicted answer is mostly similar to the correct answer 9 | Five stars: the predicted answer is completely similar to the correct answer 10 | 11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 12 | 13 | The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. 14 | 15 | question: What is the role of ribosomes? 16 | correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. 17 | predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. 18 | stars: 1 19 | 20 | question: Why did the Titanic sink? 21 | correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. 22 | predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. 23 | stars: 2 24 | 25 | question: What causes seasons on Earth? 26 | correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. 27 | predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. 28 | stars: 3 29 | 30 | question: How does photosynthesis work? 31 | correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. 
This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. 32 | predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. 33 | stars: 4 34 | 35 | question: What are the health benefits of regular exercise? 36 | correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. 37 | predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. 38 | stars: 5 39 | 40 | question: {{question}} 41 | correct answer:{{ground_truth}} 42 | predicted answer: {{answer}} 43 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/requirements.txt: -------------------------------------------------------------------------------- 1 | promptflow 2 | promptflow-tools -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/samples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "question": "Which tent is the most waterproof?", 4 | "context": "From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", 5 | "answer": "The Alpine Explorer Tent is the most waterproof.", 6 | "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m" 7 | } 8 | ] -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/select_metrics.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | 4 | @tool 5 | def select_metrics(metrics: str) -> str: 6 | supported_metrics = ('gpt_coherence', 'gpt_similarity', 'gpt_fluency', 'gpt_relevance', 'gpt_groundedness', 7 | 'f1_score', 'ada_similarity') 8 | user_selected_metrics = [metric.strip() for metric in metrics.split(',') if metric] 9 | metric_selection_dict = {} 10 | for metric in supported_metrics: 11 | if metric in user_selected_metrics: 12 | metric_selection_dict[metric] = True 13 | else: 14 | metric_selection_dict[metric] = False 15 | return metric_selection_dict 16 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/validate_input.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | 4 | @tool 5 | def validate_input(question: str, answer: str, context: str, ground_truth: str, selected_metrics: dict) -> dict: 6 | input_data = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth} 7 | expected_input_cols = set(input_data.keys()) 8 | dict_metric_required_fields = {"gpt_groundedness": set(["answer", "context"]), 9 | "gpt_relevance": set(["question", "answer", "context"]), 10 | "gpt_coherence": set(["question", "answer"]), 11 | "gpt_similarity": set(["question", "answer", "ground_truth"]), 12 | "gpt_fluency": set(["question", "answer"]), 13 | "f1_score": set(["answer", "ground_truth"]), 14 | "ada_similarity": set(["answer", "ground_truth"])} 15 | actual_input_cols = set() 16 | for col in expected_input_cols: 17 | if input_data[col] and input_data[col].strip(): 18 | actual_input_cols.add(col) 19 | data_validation = selected_metrics 20 | for metric in selected_metrics: 21 | if selected_metrics[metric]: 22 | metric_required_fields = dict_metric_required_fields[metric] 23 | if metric_required_fields <= actual_input_cols: 24 | data_validation[metric] = True 25 | else: 26 | data_validation[metric] = False 27 | return data_validation 28 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.promptflow/ux.inputs.json: -------------------------------------------------------------------------------- 1 | {"chat_list": []} -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/.runs/9ccd06e6-71ee-4b65-a04e-d9a6b525c0c7/flow.dag.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | question: 3 | type: string 4 | default: Which tent is the most waterproof? 5 | is_chat_input: false 6 | answer: 7 | type: string 8 | default: The Alpine Explorer Tent is the most waterproof. 9 | is_chat_input: false 10 | context: 11 | type: string 12 | default: From the our product list, the alpine explorer tent is the most 13 | waterproof. The Adventure Dining Tabbe has higher weight. 
14 | is_chat_input: false 15 | ground_truth: 16 | type: string 17 | default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m 18 | is_chat_input: false 19 | metrics: 20 | type: string 21 | default: gpt_groundedness,f1_score,gpt_fluency,gpt_coherence,gpt_similarity,gpt_relevance 22 | is_chat_input: false 23 | outputs: 24 | f1_score: 25 | type: string 26 | reference: ${concat_scores.output.f1_score} 27 | evaluation_only: false 28 | is_chat_output: false 29 | gpt_coherence: 30 | type: string 31 | reference: ${concat_scores.output.gpt_coherence} 32 | evaluation_only: false 33 | is_chat_output: false 34 | gpt_similarity: 35 | type: string 36 | reference: ${concat_scores.output.gpt_similarity} 37 | evaluation_only: false 38 | is_chat_output: false 39 | gpt_fluency: 40 | type: string 41 | reference: ${concat_scores.output.gpt_fluency} 42 | evaluation_only: false 43 | is_chat_output: false 44 | gpt_relevance: 45 | type: string 46 | reference: ${concat_scores.output.gpt_relevance} 47 | evaluation_only: false 48 | is_chat_output: false 49 | gpt_groundedness: 50 | type: string 51 | reference: ${concat_scores.output.gpt_groundedness} 52 | evaluation_only: false 53 | is_chat_output: false 54 | ada_similarity: 55 | type: string 56 | reference: ${concat_scores.output.ada_similarity} 57 | evaluation_only: false 58 | is_chat_output: false 59 | nodes: 60 | - name: gpt_coherence 61 | type: llm 62 | source: 63 | type: code 64 | path: gpt_coherence_prompt.jinja2 65 | inputs: 66 | deployment_name: gpt-35-turbo 67 | temperature: 0 68 | top_p: 1 69 | max_tokens: 1 70 | presence_penalty: 0 71 | frequency_penalty: 0 72 | answer: ${inputs.answer} 73 | question: ${inputs.question} 74 | provider: AzureOpenAI 75 | connection: Default_AzureOpenAI 76 | api: chat 77 | module: promptflow.tools.aoai 78 | aggregation: false 79 | activate: 80 | when: ${validate_input.output.gpt_coherence} 81 | is: true 82 | use_variants: false 83 | - name: concat_scores 84 | type: python 85 | source: 86 | type: code 87 | path: concat_scores.py 88 | inputs: 89 | ada_cosine_similarity: ${ada_similarity.output} 90 | f1_score: ${f1_score.output} 91 | gpt_coherence_score: ${gpt_coherence.output} 92 | gpt_fluency_score: ${gpt_fluency.output} 93 | gpt_groundedness_score: ${gpt_groundedness.output} 94 | gpt_relevance_score: ${gpt_relevance.output} 95 | gpt_similarity_score: ${gpt_similarity.output} 96 | aggregation: false 97 | use_variants: false 98 | - name: gpt_similarity 99 | type: llm 100 | source: 101 | type: code 102 | path: gpt_similarity_prompt.jinja2 103 | inputs: 104 | deployment_name: gpt-35-turbo 105 | temperature: 0 106 | top_p: 1 107 | max_tokens: 1 108 | presence_penalty: 0 109 | frequency_penalty: 0 110 | answer: ${inputs.answer} 111 | ground_truth: ${inputs.ground_truth} 112 | question: ${inputs.question} 113 | provider: AzureOpenAI 114 | connection: Default_AzureOpenAI 115 | api: chat 116 | module: promptflow.tools.aoai 117 | aggregation: false 118 | activate: 119 | when: ${validate_input.output.gpt_similarity} 120 | is: true 121 | use_variants: false 122 | - name: gpt_relevance 123 | type: llm 124 | source: 125 | type: code 126 | path: gpt_relevance_prompt.jinja2 127 | inputs: 128 | deployment_name: gpt-35-turbo 129 | temperature: 0 130 | top_p: 1 131 | max_tokens: 1 132 | presence_penalty: 0 133 | frequency_penalty: 0 134 | answer: ${inputs.answer} 135 | context: ${inputs.context} 136 | question: ${inputs.question} 137 | provider: AzureOpenAI 138 | connection: Default_AzureOpenAI 139 | api: chat 140 | 
module: promptflow.tools.aoai 141 | aggregation: false 142 | activate: 143 | when: ${validate_input.output.gpt_relevance} 144 | is: true 145 | use_variants: false 146 | - name: gpt_fluency 147 | type: llm 148 | source: 149 | type: code 150 | path: gpt_fluency_prompt.jinja2 151 | inputs: 152 | deployment_name: gpt-35-turbo 153 | temperature: 0 154 | top_p: 1 155 | max_tokens: 1 156 | presence_penalty: 0 157 | frequency_penalty: 0 158 | answer: ${inputs.answer} 159 | question: ${inputs.question} 160 | provider: AzureOpenAI 161 | connection: Default_AzureOpenAI 162 | api: chat 163 | module: promptflow.tools.aoai 164 | aggregation: false 165 | activate: 166 | when: ${validate_input.output.gpt_fluency} 167 | is: true 168 | use_variants: false 169 | - name: f1_score 170 | type: python 171 | source: 172 | type: code 173 | path: f1_score.py 174 | inputs: 175 | answer: ${inputs.answer} 176 | ground_truth: ${inputs.ground_truth} 177 | aggregation: false 178 | activate: 179 | when: ${validate_input.output.f1_score} 180 | is: true 181 | use_variants: false 182 | - name: gpt_groundedness 183 | type: llm 184 | source: 185 | type: code 186 | path: gpt_groundedness_prompt.jinja2 187 | inputs: 188 | deployment_name: gpt-35-turbo 189 | temperature: 0 190 | top_p: 1 191 | max_tokens: 1 192 | presence_penalty: 0 193 | frequency_penalty: 0 194 | answer: ${inputs.answer} 195 | context: ${inputs.context} 196 | provider: AzureOpenAI 197 | connection: Default_AzureOpenAI 198 | api: chat 199 | module: promptflow.tools.aoai 200 | aggregation: false 201 | activate: 202 | when: ${validate_input.output.gpt_groundedness} 203 | is: true 204 | use_variants: false 205 | - name: aggregate_variants_results 206 | type: python 207 | source: 208 | type: code 209 | path: aggregate_variants_results.py 210 | inputs: 211 | metrics: ${inputs.metrics} 212 | results: ${concat_scores.output} 213 | aggregation: true 214 | use_variants: false 215 | - name: select_metrics 216 | type: python 217 | source: 218 | type: code 219 | path: select_metrics.py 220 | inputs: 221 | metrics: ${inputs.metrics} 222 | aggregation: false 223 | use_variants: false 224 | - name: embeded_ground_truth 225 | type: python 226 | source: 227 | type: package 228 | tool: promptflow.tools.embedding.embedding 229 | inputs: 230 | connection: Default_AzureOpenAI 231 | deployment_name: text-embedding-ada-002 232 | input: ${inputs.ground_truth} 233 | aggregation: false 234 | activate: 235 | when: ${validate_input.output.ada_similarity} 236 | is: true 237 | use_variants: false 238 | - name: embeded_answer 239 | type: python 240 | source: 241 | type: package 242 | tool: promptflow.tools.embedding.embedding 243 | inputs: 244 | connection: Default_AzureOpenAI 245 | deployment_name: text-embedding-ada-002 246 | input: ${inputs.answer} 247 | aggregation: false 248 | activate: 249 | when: ${validate_input.output.ada_similarity} 250 | is: true 251 | use_variants: false 252 | - name: ada_similarity 253 | type: python 254 | source: 255 | type: code 256 | path: ada_cosine_similarity_score.py 257 | inputs: 258 | a: ${embeded_ground_truth.output} 259 | b: ${embeded_answer.output} 260 | aggregation: false 261 | activate: 262 | when: ${validate_input.output.ada_similarity} 263 | is: true 264 | use_variants: false 265 | - name: validate_input 266 | type: python 267 | source: 268 | type: code 269 | path: validate_input.py 270 | inputs: 271 | answer: ${inputs.answer} 272 | context: ${inputs.context} 273 | ground_truth: ${inputs.ground_truth} 274 | question: ${inputs.question} 275 | 
selected_metrics: ${select_metrics.output} 276 | aggregation: false 277 | use_variants: false 278 | node_variants: {} 279 | environment: 280 | python_requirements_txt: requirements.txt 281 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/README.md: -------------------------------------------------------------------------------- 1 | # Q&A Evaluation: 2 | 3 | The Q&A evaluation flow evaluates Q&A systems by leveraging state-of-the-art large language models (LLMs) to measure the quality and safety of your responses. Using a GPT model and a GPT embedding model for these measurements aims to achieve higher agreement with human evaluation than traditional mathematical measurements. 4 | 5 | ## What you will learn 6 | 7 | The Q&A evaluation flow allows you to assess and evaluate your model with LLM-assisted metrics and f1_score: 8 | 9 | 10 | * __gpt_coherence__: Measures the quality of all sentences in a model's predicted answer and how they fit together naturally. 11 | 12 | Coherence is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 13 | 14 | * __gpt_relevance__: Measures how relevant the model's predicted answers are to the questions asked. 15 | 16 | Relevance is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 17 | 18 | * __gpt_fluency__: Measures how grammatically and linguistically correct the model's predicted answer is. 19 | 20 | Fluency is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 21 | 22 | * __gpt_similarity__: Measures the similarity between the user-provided ground truth answer and the model's predicted answer. 23 | 24 | Similarity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 25 | 26 | * __gpt_groundedness__ (against context): Measures how grounded the model's predicted answers are in the provided context. Even if an LLM's responses are factually true, they are considered ungrounded if they cannot be verified against the context. 27 | 28 | Groundedness is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best. 29 | 30 | * __ada_similarity__: Measures the cosine similarity between the ada embeddings of the model prediction and the ground truth. 31 | 32 | ada_similarity is a value in the range [0, 1]. 33 | 34 | * __F1-score__: Computes the F1 score based on the tokens in the predicted answer and the ground truth. 35 | 36 | The F1 score is derived from the number of common tokens between the normalized versions of the ground truth and the predicted answer. 37 | 38 | F1-score is a value in the range [0, 1]. 39 | 40 | 41 | ## Prerequisites 42 | 43 | - Connection: Azure OpenAI or OpenAI connection. 44 | - Data input: Evaluating these metrics requires you to provide data inputs including a question, an answer, a ground truth, and a context (a minimal local test is sketched below).
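As a quick smoke test, the flow can be invoked locally with the promptflow SDK before wiring it into an Azure ML evaluation run. The sketch below is a minimal example under a few assumptions: promptflow and promptflow-tools are installed (see requirements.txt), an Azure OpenAI connection named `Default_AzureOpenAI` exists with a `gpt-35-turbo` chat deployment and a `text-embedding-ada-002` embedding deployment (the defaults referenced in flow.dag.yaml), and the `PFClient.test` interface is available (its import path varies between promptflow versions). The input values are the flow's own defaults.

```python
# Minimal local smoke test for the Q&A evaluation flow (a sketch, not the official runner).
# Assumes: `pip install promptflow promptflow-tools`, plus an Azure OpenAI connection named
# "Default_AzureOpenAI" with "gpt-35-turbo" and "text-embedding-ada-002" deployments,
# matching the defaults in flow.dag.yaml.
from promptflow import PFClient  # newer releases: from promptflow.client import PFClient

pf = PFClient()

result = pf.test(
    flow="Examples/promptflow/model_as_judge_evaluator",
    inputs={
        "question": "Which tent is the most waterproof?",
        "answer": "The Alpine Explorer Tent is the most waterproof.",
        "context": "From our product list, the Alpine Explorer Tent is the most waterproof.",
        "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m",
        "metrics": "gpt_groundedness,f1_score,gpt_fluency,gpt_coherence,gpt_similarity,"
                   "gpt_relevance,ada_similarity",
    },
)

# `result` holds the declared flow outputs assembled by concat_scores: one score per selected
# metric (f1_score, gpt_coherence, gpt_similarity, gpt_fluency, gpt_relevance,
# gpt_groundedness, ada_similarity).
print(result)
```

For batch evaluation over a dataset, the same flow can be driven row by row with a promptflow run (for example `pf run create --flow . --data <your_dataset.jsonl>`), which is when the aggregate_variants_results node averages the per-row scores and logs them as metrics.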
45 | 46 | ## Tools used in this flow 47 | - LLM tool 48 | - Python tool 49 | - Embedding tool -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/ada_cosine_similarity_score.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import numpy as np 3 | from numpy.linalg import norm 4 | 5 | 6 | @tool 7 | def compute_ada_cosine_similarity(a, b) -> float: 8 | return np.dot(a, b)/(norm(a)*norm(b)) 9 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/aggregate_variants_results.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from promptflow import tool, log_metric 3 | import numpy as np 4 | 5 | 6 | @tool 7 | def aggregate_variants_results(results: List[dict], metrics: List[str]): 8 | aggregate_results = {} 9 | for result in results: 10 | for name, value in result.items(): 11 | if name in metrics[0]: 12 | if name not in aggregate_results.keys(): 13 | aggregate_results[name] = [] 14 | try: 15 | float_val = float(value) 16 | except Exception: 17 | float_val = np.nan 18 | aggregate_results[name].append(float_val) 19 | 20 | for name, value in aggregate_results.items(): 21 | if name in metrics[0]: 22 | aggregate_results[name] = np.nanmean(value) 23 | aggregate_results[name] = round(aggregate_results[name], 2) 24 | log_metric(name, aggregate_results[name]) 25 | return aggregate_results 26 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/concat_scores.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | import numpy as np 3 | import re 4 | 5 | 6 | @tool 7 | def concat_results(gpt_coherence_score: str = None, 8 | gpt_similarity_score: str = None, 9 | gpt_fluency_score: str = None, 10 | gpt_relevance_score: str = None, 11 | gpt_groundedness_score: str = None, 12 | f1_score: float = None, 13 | ada_cosine_similarity: float = None): 14 | 15 | load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score}, 16 | {'name': 'gpt_similarity', 'score': gpt_similarity_score}, 17 | {'name': 'gpt_fluency', 'score': gpt_fluency_score}, 18 | {'name': 'gpt_relevance', 'score': gpt_relevance_score}, 19 | {'name': 'gpt_groundedness', 'score': gpt_groundedness_score}, 20 | {'name': 'f1_score', 'score': f1_score}, 21 | {'name': 'ada_similarity', 'score': ada_cosine_similarity}] 22 | 23 | scalar_metrics = ["f1_score", "ada_similarity"] 24 | score_list = [] 25 | errors = [] 26 | for item in load_list: 27 | if item["name"] in scalar_metrics: 28 | try: 29 | score = float(item["score"]) 30 | except Exception as e: 31 | score = np.nan 32 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]}) 33 | else: 34 | if item['score']: 35 | try: 36 | score = item["score"] 37 | match = re.search(r'\d', score) 38 | if match: 39 | score = float(match.group()) 40 | else: 41 | score = np.nan 42 | except Exception as e: 43 | score = np.nan 44 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]}) 45 | else: 46 | score = np.nan 47 | score_list.append({"name": item["name"], "score": score}) 48 | 49 | variant_level_result = {} 50 | for item in score_list: 51 | item_name = str(item["name"]) 52 | variant_level_result[item_name] = item["score"] 53 | if 'gpt' in item_name: 54 
| variant_level_result[item_name + '_pass_rate'] = 1 if item["score"] > 3 else 0 55 | return variant_level_result 56 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/f1_score.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | from collections import Counter 3 | 4 | 5 | @tool 6 | def compute_f1_score(ground_truth: str, answer: str) -> str: 7 | import string 8 | import re 9 | 10 | class QASplitTokenizer: 11 | def __call__(self, line): 12 | """Tokenizes an input line using split() on whitespace 13 | 14 | :param line: a segment to tokenize 15 | :return: the tokenized line 16 | """ 17 | 18 | return line.split() 19 | 20 | def normalize_text(text) -> str: 21 | """Lower text and remove punctuation, articles and extra whitespace.""" 22 | 23 | def remove_articles(text): 24 | return re.sub(r"\b(a|an|the)\b", " ", text) 25 | 26 | def white_space_fix(text): 27 | return " ".join(text.split()) 28 | 29 | def remove_punctuation(text): 30 | exclude = set(string.punctuation) 31 | return "".join(ch for ch in text if ch not in exclude) 32 | 33 | def lower(text): 34 | return text.lower() 35 | 36 | return white_space_fix(remove_articles(remove_punctuation(lower(text)))) 37 | prediction_tokens = normalize_text(answer) 38 | reference_tokens = normalize_text(ground_truth) 39 | tokenizer = QASplitTokenizer() 40 | prediction_tokens = tokenizer(prediction_tokens) 41 | reference_tokens = tokenizer(reference_tokens) 42 | 43 | common_tokens = Counter(prediction_tokens) & Counter(reference_tokens) 44 | num_common_tokens = sum(common_tokens.values()) 45 | 46 | if num_common_tokens == 0: 47 | f1 = 0.0 48 | else: 49 | precision = 1.0 * num_common_tokens / len(prediction_tokens) 50 | recall = 1.0 * num_common_tokens / len(reference_tokens) 51 | 52 | f1 = (2.0 * precision * recall) / (precision + recall) 53 | 54 | return f1 55 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/flow.dag.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | question: 3 | type: string 4 | default: Which tent is the most waterproof? 5 | is_chat_input: false 6 | answer: 7 | type: string 8 | default: The Alpine Explorer Tent is the most waterproof. 9 | is_chat_input: false 10 | context: 11 | type: string 12 | default: From the our product list, the alpine explorer tent is the most 13 | waterproof. The Adventure Dining Tabbe has higher weight. 
14 | is_chat_input: false 15 | ground_truth: 16 | type: string 17 | default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m 18 | is_chat_input: false 19 | metrics: 20 | type: string 21 | default: gpt_groundedness,f1_score,gpt_fluency,gpt_coherence,gpt_similarity,gpt_relevance,ada_similarity 22 | is_chat_input: false 23 | outputs: 24 | f1_score: 25 | type: string 26 | reference: ${concat_scores.output.f1_score} 27 | evaluation_only: false 28 | gpt_coherence: 29 | type: string 30 | reference: ${concat_scores.output.gpt_coherence} 31 | evaluation_only: false 32 | gpt_similarity: 33 | type: string 34 | reference: ${concat_scores.output.gpt_similarity} 35 | evaluation_only: false 36 | gpt_fluency: 37 | type: string 38 | reference: ${concat_scores.output.gpt_fluency} 39 | evaluation_only: false 40 | gpt_relevance: 41 | type: string 42 | reference: ${concat_scores.output.gpt_relevance} 43 | evaluation_only: false 44 | gpt_groundedness: 45 | type: string 46 | reference: ${concat_scores.output.gpt_groundedness} 47 | evaluation_only: false 48 | ada_similarity: 49 | type: string 50 | reference: ${concat_scores.output.ada_similarity} 51 | evaluation_only: false 52 | nodes: 53 | - name: gpt_coherence 54 | type: llm 55 | source: 56 | type: code 57 | path: gpt_coherence_prompt.jinja2 58 | inputs: 59 | deployment_name: gpt-35-turbo 60 | temperature: 0 61 | top_p: 1 62 | max_tokens: 1 63 | presence_penalty: 0 64 | frequency_penalty: 0 65 | answer: ${inputs.answer} 66 | question: ${inputs.question} 67 | provider: AzureOpenAI 68 | connection: Default_AzureOpenAI 69 | api: chat 70 | module: promptflow.tools.aoai 71 | aggregation: false 72 | activate: 73 | when: ${validate_input.output.gpt_coherence} 74 | is: true 75 | use_variants: false 76 | - name: concat_scores 77 | type: python 78 | source: 79 | type: code 80 | path: concat_scores.py 81 | inputs: 82 | gpt_coherence_score: ${gpt_coherence.output} 83 | gpt_similarity_score: ${gpt_similarity.output} 84 | gpt_fluency_score: ${gpt_fluency.output} 85 | gpt_relevance_score: ${gpt_relevance.output} 86 | gpt_groundedness_score: ${gpt_groundedness.output} 87 | f1_score: ${f1_score.output} 88 | ada_cosine_similarity: ${ada_similarity.output} 89 | aggregation: false 90 | use_variants: false 91 | - name: gpt_similarity 92 | type: llm 93 | source: 94 | type: code 95 | path: gpt_similarity_prompt.jinja2 96 | inputs: 97 | deployment_name: gpt-35-turbo 98 | temperature: 0 99 | top_p: 1 100 | max_tokens: 1 101 | presence_penalty: 0 102 | frequency_penalty: 0 103 | answer: ${inputs.answer} 104 | question: ${inputs.question} 105 | ground_truth: ${inputs.ground_truth} 106 | provider: AzureOpenAI 107 | connection: Default_AzureOpenAI 108 | api: chat 109 | module: promptflow.tools.aoai 110 | aggregation: false 111 | activate: 112 | when: ${validate_input.output.gpt_similarity} 113 | is: true 114 | use_variants: false 115 | - name: gpt_relevance 116 | type: llm 117 | source: 118 | type: code 119 | path: gpt_relevance_prompt.jinja2 120 | inputs: 121 | deployment_name: gpt-35-turbo 122 | temperature: 0 123 | top_p: 1 124 | max_tokens: 1 125 | presence_penalty: 0 126 | frequency_penalty: 0 127 | answer: ${inputs.answer} 128 | question: ${inputs.question} 129 | context: ${inputs.context} 130 | provider: AzureOpenAI 131 | connection: Default_AzureOpenAI 132 | api: chat 133 | module: promptflow.tools.aoai 134 | aggregation: false 135 | activate: 136 | when: ${validate_input.output.gpt_relevance} 137 | is: true 138 | use_variants: false 139 | - name: 
gpt_fluency 140 | type: llm 141 | source: 142 | type: code 143 | path: gpt_fluency_prompt.jinja2 144 | inputs: 145 | deployment_name: gpt-35-turbo 146 | temperature: 0 147 | top_p: 1 148 | max_tokens: 1 149 | presence_penalty: 0 150 | frequency_penalty: 0 151 | answer: ${inputs.answer} 152 | question: ${inputs.question} 153 | provider: AzureOpenAI 154 | connection: Default_AzureOpenAI 155 | api: chat 156 | module: promptflow.tools.aoai 157 | aggregation: false 158 | activate: 159 | when: ${validate_input.output.gpt_fluency} 160 | is: true 161 | use_variants: false 162 | - name: f1_score 163 | type: python 164 | source: 165 | type: code 166 | path: f1_score.py 167 | inputs: 168 | ground_truth: ${inputs.ground_truth} 169 | answer: ${inputs.answer} 170 | aggregation: false 171 | activate: 172 | when: ${validate_input.output.f1_score} 173 | is: true 174 | use_variants: false 175 | - name: gpt_groundedness 176 | type: llm 177 | source: 178 | type: code 179 | path: gpt_groundedness_prompt.jinja2 180 | inputs: 181 | deployment_name: gpt-35-turbo 182 | temperature: 0 183 | top_p: 1 184 | max_tokens: 1 185 | presence_penalty: 0 186 | frequency_penalty: 0 187 | answer: ${inputs.answer} 188 | context: ${inputs.context} 189 | provider: AzureOpenAI 190 | connection: Default_AzureOpenAI 191 | api: chat 192 | module: promptflow.tools.aoai 193 | aggregation: false 194 | activate: 195 | when: ${validate_input.output.gpt_groundedness} 196 | is: true 197 | use_variants: false 198 | - name: aggregate_variants_results 199 | type: python 200 | source: 201 | type: code 202 | path: aggregate_variants_results.py 203 | inputs: 204 | results: ${concat_scores.output} 205 | metrics: ${inputs.metrics} 206 | aggregation: true 207 | use_variants: false 208 | - name: select_metrics 209 | type: python 210 | source: 211 | type: code 212 | path: select_metrics.py 213 | inputs: 214 | metrics: ${inputs.metrics} 215 | aggregation: false 216 | use_variants: false 217 | - name: embeded_ground_truth 218 | type: python 219 | source: 220 | type: package 221 | tool: promptflow.tools.embedding.embedding 222 | inputs: 223 | connection: Default_AzureOpenAI 224 | deployment_name: text-embedding-ada-002 225 | input: ${inputs.ground_truth} 226 | aggregation: false 227 | activate: 228 | when: ${validate_input.output.ada_similarity} 229 | is: true 230 | use_variants: false 231 | - name: embeded_answer 232 | type: python 233 | source: 234 | type: package 235 | tool: promptflow.tools.embedding.embedding 236 | inputs: 237 | connection: Default_AzureOpenAI 238 | deployment_name: text-embedding-ada-002 239 | input: ${inputs.answer} 240 | aggregation: false 241 | activate: 242 | when: ${validate_input.output.ada_similarity} 243 | is: true 244 | use_variants: false 245 | - name: ada_similarity 246 | type: python 247 | source: 248 | type: code 249 | path: ada_cosine_similarity_score.py 250 | inputs: 251 | a: ${embeded_ground_truth.output} 252 | b: ${embeded_answer.output} 253 | aggregation: false 254 | activate: 255 | when: ${validate_input.output.ada_similarity} 256 | is: true 257 | use_variants: false 258 | - name: validate_input 259 | type: python 260 | source: 261 | type: code 262 | path: validate_input.py 263 | inputs: 264 | question: ${inputs.question} 265 | answer: ${inputs.answer} 266 | context: ${inputs.context} 267 | ground_truth: ${inputs.ground_truth} 268 | selected_metrics: ${select_metrics.output} 269 | aggregation: false 270 | use_variants: false 271 | node_variants: {} 272 | environment: 273 | python_requirements_txt: 
requirements.txt 274 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/flow.meta.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json 2 | name: qna_non_rag_eval 3 | display_name: QnA Evaluation 4 | type: evaluate 5 | path: ./flow.dag.yaml 6 | description: Compute the quality of the answer for the given question based on the ground_truth and the context 7 | properties: 8 | promptflow.stage: prod 9 | promptflow.details.type: markdown 10 | promptflow.details.source: README.md 11 | promptflow.batch_inputs: samples.json -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/gpt_coherence_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | 4 | user: 5 | Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: 6 | One star: the answer completely lacks coherence 7 | Two stars: the answer mostly lacks coherence 8 | Three stars: the answer is partially coherent 9 | Four stars: the answer is mostly coherent 10 | Five stars: the answer has perfect coherency 11 | 12 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 13 | 14 | question: What is your favorite indoor activity and why do you enjoy it? 15 | answer: I like pizza. The sun is shining. 16 | stars: 1 17 | 18 | question: Can you describe your favorite movie without giving away any spoilers? 19 | answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. 20 | stars: 2 21 | 22 | question: What are some benefits of regular exercise? 23 | answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. 24 | stars: 3 25 | 26 | question: How do you cope with stress in your daily life? 27 | answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. 28 | stars: 4 29 | 30 | question: What can you tell me about climate change and its effects on the environment? 31 | answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. 32 | stars: 5 33 | 34 | question: {{question}} 35 | answer: {{answer}} 36 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/gpt_fluency_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. 
You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: 5 | One star: the answer completely lacks fluency 6 | Two stars: the answer mostly lacks fluency 7 | Three stars: the answer is partially fluent 8 | Four stars: the answer is mostly fluent 9 | Five stars: the answer has perfect fluency 10 | 11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 12 | 13 | question: What did you have for breakfast today? 14 | answer: Breakfast today, me eating cereal and orange juice very good. 15 | stars: 1 16 | 17 | question: How do you feel when you travel alone? 18 | answer: Alone travel, nervous, but excited also. I feel adventure and like its time. 19 | stars: 2 20 | 21 | question: When was the last time you went on a family vacation? 22 | answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun. 23 | stars: 3 24 | 25 | question: What is your favorite thing about your job? 26 | answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories. 27 | stars: 4 28 | 29 | question: Can you describe your morning routine? 30 | answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am. 31 | stars: 5 32 | 33 | question: {{question}} 34 | answer: {{answer}} 35 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/gpt_groundedness_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: 5 | 1. 5: The ANSWER follows logically from the information contained in the CONTEXT. 6 | 2. 1: The ANSWER is logically false from the information contained in the CONTEXT. 7 | 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 
8 | Independent Examples: 9 | ## Example Task #1 Input: 10 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 11 | ## Example Task #1 Output: 12 | 1 13 | ## Example Task #2 Input: 14 | {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} 15 | ## Example Task #2 Output: 16 | 5 17 | ## Example Task #3 Input: 18 | {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} 19 | ## Example Task #3 Output: 20 | 5 21 | ## Example Task #4 Input: 22 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 23 | ## Example Task #4 Output: 24 | 1 25 | ## Actual Task Input: 26 | {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}} 27 | Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. 28 | Actual Task Output: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/gpt_relevance_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: 5 | One star: the answer completely lacks relevance 6 | Two stars: the answer mostly lacks relevance 7 | Three stars: the answer is partially relevant 8 | Four stars: the answer is mostly relevant 9 | Five stars: the answer has perfect relevance 10 | 11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 12 | 13 | context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. 14 | question: What field did Marie Curie excel in? 15 | answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. 16 | stars: 1 17 | 18 | context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. 19 | question: Where were The Beatles formed? 20 | answer: The band The Beatles began their journey in London, England, and they changed the history of music. 
21 | stars: 2 22 | 23 | context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 24 | question: What are the main goals of Perseverance Mars rover mission? 25 | answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. 26 | stars: 3 27 | 28 | context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. 29 | question: What are the main components of the Mediterranean diet? 30 | answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. 31 | stars: 4 32 | 33 | context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. 34 | question: What are the main attractions of the Queen's Royal Castle? 35 | answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. 36 | stars: 5 37 | 38 | context: {{context}} 39 | question: {{question}} 40 | answer: {{answer}} 41 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/gpt_similarity_prompt.jinja2: -------------------------------------------------------------------------------- 1 | system: 2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. 3 | user: 4 | Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: 5 | One star: the predicted answer is not at all similar to the correct answer 6 | Two stars: the predicted answer is mostly not similar to the correct answer 7 | Three stars: the predicted answer is somewhat similar to the correct answer 8 | Four stars: the predicted answer is mostly similar to the correct answer 9 | Five stars: the predicted answer is completely similar to the correct answer 10 | 11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 12 | 13 | The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. 14 | 15 | question: What is the role of ribosomes? 16 | correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. 
17 | predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. 18 | stars: 1 19 | 20 | question: Why did the Titanic sink? 21 | correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. 22 | predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. 23 | stars: 2 24 | 25 | question: What causes seasons on Earth? 26 | correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. 27 | predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. 28 | stars: 3 29 | 30 | question: How does photosynthesis work? 31 | correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. 32 | predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. 33 | stars: 4 34 | 35 | question: What are the health benefits of regular exercise? 36 | correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. 37 | predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. 38 | stars: 5 39 | 40 | question: {{question}} 41 | correct answer:{{ground_truth}} 42 | predicted answer: {{answer}} 43 | stars: -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/requirements.txt: -------------------------------------------------------------------------------- 1 | promptflow 2 | promptflow-tools -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/samples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "question": "Which tent is the most waterproof?", 4 | "context": "From the our product list, the alpine explorer tent is the most waterproof. 
The Adventure Dining Table has higher weight.", 5 | "answer": "The Alpine Explorer Tent is the most waterproof.", 6 | "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m" 7 | } 8 | ] -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/select_metrics.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | 4 | @tool 5 | def select_metrics(metrics: str) -> str: 6 | supported_metrics = ('gpt_coherence', 'gpt_similarity', 'gpt_fluency', 'gpt_relevance', 'gpt_groundedness', 7 | 'f1_score', 'ada_similarity') 8 | user_selected_metrics = [metric.strip() for metric in metrics.split(',') if metric] 9 | metric_selection_dict = {} 10 | for metric in supported_metrics: 11 | if metric in user_selected_metrics: 12 | metric_selection_dict[metric] = True 13 | else: 14 | metric_selection_dict[metric] = False 15 | return metric_selection_dict 16 | -------------------------------------------------------------------------------- /Examples/promptflow/model_as_judge_evaluator/validate_input.py: -------------------------------------------------------------------------------- 1 | from promptflow import tool 2 | 3 | 4 | @tool 5 | def validate_input(question: str, answer: str, context: str, ground_truth: str, selected_metrics: dict) -> dict: 6 | input_data = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth} 7 | expected_input_cols = set(input_data.keys()) 8 | dict_metric_required_fields = {"gpt_groundedness": set(["answer", "context"]), 9 | "gpt_relevance": set(["question", "answer", "context"]), 10 | "gpt_coherence": set(["question", "answer"]), 11 | "gpt_similarity": set(["question", "answer", "ground_truth"]), 12 | "gpt_fluency": set(["question", "answer"]), 13 | "f1_score": set(["answer", "ground_truth"]), 14 | "ada_similarity": set(["answer", "ground_truth"])} 15 | actual_input_cols = set() 16 | for col in expected_input_cols: 17 | if input_data[col] and input_data[col].strip(): 18 | actual_input_cols.add(col) 19 | data_validation = selected_metrics 20 | for metric in selected_metrics: 21 | if selected_metrics[metric]: 22 | metric_required_fields = dict_metric_required_fields[metric] 23 | if metric_required_fields <= actual_input_cols: 24 | data_validation[metric] = True 25 | else: 26 | data_validation[metric] = False 27 | return data_validation 28 | -------------------------------------------------------------------------------- /Examples/promptflow/new_chat_flow/.promptflow/chat.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "answer": "Of course! Hello world! How can I help you today?" 
3 | } -------------------------------------------------------------------------------- /Examples/promptflow/new_chat_flow/.promptflow/flow.detail.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /Examples/promptflow/new_chat_flow/.promptflow/flow.layout.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeLayouts": { 3 | "inputs": { 4 | "x": 290, 5 | "y": 42, 6 | "index": -1 7 | }, 8 | "outputs": { 9 | "x": 220, 10 | "y": 310, 11 | "index": -1 12 | }, 13 | "chat": { 14 | "x": 140, 15 | "y": 176, 16 | "index": 0 17 | } 18 | }, 19 | "orientation": "Vertical" 20 | } -------------------------------------------------------------------------------- /Examples/promptflow/new_chat_flow/.promptflow/flow.log: -------------------------------------------------------------------------------- 1 | 2024-04-18 07:31:22 -0700 20828 execution.flow INFO Start executing nodes in thread pool mode. 2 | 2024-04-18 07:31:22 -0700 20828 execution.flow INFO Start to run 1 nodes with concurrency level 16. 3 | 2024-04-18 07:31:22 -0700 20828 execution.flow INFO Executing node chat. node run id: d517789a-3106-4079-b5ec-0b752ab42dac_chat_0 4 | 2024-04-18 07:31:24 -0700 20828 execution.flow WARNING Output of chat is not json serializable, use str to store it. 5 | 2024-04-18 07:31:24 -0700 20828 execution.flow INFO Node chat completes. 6 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Start executing nodes in thread pool mode. 7 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Start to run 1 nodes with concurrency level 16. 8 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Executing node chat. node run id: 7d66c026-7f12-4bd1-b992-82cb60a1274e_chat_0 9 | 2024-04-18 07:31:38 -0700 20828 execution.flow WARNING Output of chat is not json serializable, use str to store it. 10 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Node chat completes. 11 | -------------------------------------------------------------------------------- /Examples/promptflow/new_chat_flow/.promptflow/flow.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "answer": "Hello! I'm an AI assistant, so I don't have feelings, but I'm here to help you with any questions or tasks you have. How can I assist you today?" 3 | } -------------------------------------------------------------------------------- /Examples/promptflow/prompt_test/.promptflow/chat.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "unformatted_meta_reply": "You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!", 3 | "formatted_meta_reply": "You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!" 
4 | } -------------------------------------------------------------------------------- /Examples/promptflow/prompt_test/.promptflow/flow.layout.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodeLayouts": { 3 | "inputs": { 4 | "x": 455, 5 | "y": 42, 6 | "index": -1 7 | }, 8 | "outputs": { 9 | "x": 385, 10 | "y": 310, 11 | "index": -1 12 | }, 13 | "unformatted_meta": { 14 | "x": 140, 15 | "y": 176, 16 | "index": 0 17 | }, 18 | "formatted_meta": { 19 | "x": 470, 20 | "y": 176, 21 | "index": 1 22 | } 23 | }, 24 | "orientation": "Vertical" 25 | } -------------------------------------------------------------------------------- /Examples/promptflow/prompt_test/.promptflow/flow.log: -------------------------------------------------------------------------------- 1 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Start executing nodes in thread pool mode. 2 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Start to run 2 nodes with concurrency level 16. 3 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Executing node unformatted_meta. node run id: 1b587084-d370-44e6-ba18-ffe4b42b321d_unformatted_meta_0 4 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Executing node formatted_meta. node run id: 1b587084-d370-44e6-ba18-ffe4b42b321d_formatted_meta_0 5 | 2024-04-17 13:17:08 -0700 33752 execution.flow INFO Node unformatted_meta completes. 6 | 2024-04-17 13:17:08 -0700 33752 execution.flow INFO Node formatted_meta completes. 7 | -------------------------------------------------------------------------------- /Examples/promptflow/prompt_test/.promptflow/flow.output.json: -------------------------------------------------------------------------------- 1 | { 2 | "unformatted_meta_reply": "You must be at least 16 years old to get a driver's license in California. [source1]", 3 | "formatted_meta_reply": "In California, you must be at least 16 years old to obtain a driver's license [source: California DMV website]." 4 | } -------------------------------------------------------------------------------- /Examples/promptflow/prompt_test/.promptflow/flow.uihint.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "formatted_meta": { 4 | "variant_0": { 5 | "inputs": { 6 | "question": [], 7 | "chat_history": [] 8 | } 9 | } 10 | }, 11 | "unformatted_meta": { 12 | "variant_0": { 13 | "inputs": { 14 | "question": [], 15 | "chat_history": [] 16 | } 17 | } 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /Examples/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | --------------------------------------------------------------------------------