├── .gitignore
├── .vscode
│ └── settings.json
├── Examples
├── FormRecognizer
│ ├── Balance_sheet_analysis.ipynb
│ ├── FormRecognizerExamples.ipynb
│ ├── Life_application_example.png
│ ├── Life_application_example.png.json
│ ├── income_table.png
│ └── ms_fy23_q1_html.htm
├── Language
│ ├── CustomerServiceCall.ipynb
│ ├── Loan_call.ipynb
│ ├── Loan_call.json
│ ├── Pharmacy_call.ipynb
│ └── Pharmacy_call.json
├── OpenSource
│ ├── ConvFinQA-benchmark
│ │ ├── README.md
│ │ ├── convfinqa-chat-turns.ipynb
│ │ ├── convfinqa-chatcompatible-test_private.ipynb
│ │ ├── convfinqa-chatcompatible.ipynb
│ │ ├── convfinqa.ipynb
│ │ ├── data
│ │ │ ├── SVAMP.json
│ │ │ ├── aqua_test.jsonl
│ │ │ ├── convfinqa_dev.json
│ │ │ ├── convfinqa_test_turn.json
│ │ │ ├── dev.json
│ │ │ ├── finqa_test.json
│ │ │ ├── gsm8K.json
│ │ │ ├── tabmwp_test.json
│ │ │ ├── tatqa_dev.json
│ │ │ └── test_private.json
│ │ ├── eval_tatqa
│ │ │ ├── __init__.py
│ │ │ ├── tatqa_eval.py
│ │ │ ├── tatqa_metric.py
│ │ │ ├── tatqa_metric_test.py
│ │ │ ├── tatqa_utils.py
│ │ │ └── tatqa_utils_test.py
│ │ ├── finqa-chatcompatible.ipynb
│ │ ├── outputs
│ │ │ ├── benchmark-score-finQA.ipynb
│ │ │ ├── benchmark-score.ipynb
│ │ │ ├── convfinqa_direct_gpt-35-turbo_04_10_12_30.jsonl
│ │ │ ├── convfinqa_direct_gpt-35-turbo_04_13_07_36.jsonl
│ │ │ ├── convfinqa_direct_gpt-4_04_10_13_02.jsonl
│ │ │ ├── convfinqa_direct_gpt-4_04_12_20_53.jsonl
│ │ │ ├── convfinqa_direct_gpt-4_04_13_07_21.jsonl
│ │ │ ├── convfinqa_direct_gpt3_04_10_07_53.jsonl
│ │ │ ├── convfinqa_test_private_gpt-4_04_18_10_22.jsonl
│ │ │ ├── convfinqa_test_private_gpt-4_04_18_10_38.jsonl
│ │ │ ├── finqa_gpt-35-turbo_04_10_14_54.jsonl
│ │ │ └── score-finQA-reformat.ipynb
│ │ └── tool.py
│ ├── LangChain
│ │ ├── CustomAPIMAzureOpenAI.py
│ │ ├── LangChainSummarizationExample.ipynb
│ │ └── stateoftheunion_20230207.txt
│ └── LlamaIndex
│ │ ├── 10k_Analysis.ipynb
│ │ ├── UBER
│ │ │ ├── UBER_2019.html
│ │ │ ├── UBER_2020.html
│ │ │ ├── UBER_2021.html
│ │ │ └── UBER_2022.html
│ │ └── sample_10k_chain.pdf
├── Readme.md
├── Speech
│ ├── Conversation_SSML OpenAI.ipynb
│ ├── Conversation_SSML.ipynb
│ └── Conversation_SSML.xml
├── aml_examples
│ └── 1a_read_example-copy
│ │ ├── README.md
│ │ ├── adls_src
│ │ │ └── read_folder.py
│ │ ├── job_example-py-adls.ipynb
│ │ ├── pipeline_with_components_from_yaml.ipynb
│ │ └── read_adls.yml
├── aml_foundationmodels
│ ├── deploy_flask_falcon.ipynb
│ ├── dockerfile
│ │ ├── Dockerfile
│ │ ├── requirements.txt
│ │ └── score.py
│ └── src
│ │ └── server.py
├── config.cfg
├── demo
│ ├── demo.ipynb
│ ├── earnings_example.ipynb
│ └── stock_prompt.csv
├── metrics
│ ├── aoai_metrics.ipynb
│ └── totalcalls.json
├── promptflow
│ ├── ag-convfinqa-pf
│ │ ├── .promptflow
│ │ │ ├── flow.detail.json
│ │ │ ├── flow.layout.json
│ │ │ ├── flow.log
│ │ │ ├── flow.output.json
│ │ │ ├── flow.tools.json
│ │ │ └── flow.uihint.json
│ │ ├── autogen_step.py
│ │ ├── chat.jinja2
│ │ ├── convfinqa_dev.jsonl
│ │ ├── convfinqa_dev_sample.jsonl
│ │ ├── flow.dag.yaml
│ │ ├── flow.meta.yaml
│ │ └── requirements.txt
│ ├── autogen-flow
│ │ ├── OAI_CONFIG_LIST
│ │ ├── ag_test.py
│ │ ├── chat.jinja2
│ │ ├── flow.dag.yaml
│ │ └── requirements.txt
│ ├── csv_example
│ │ ├── .amlignore
│ │ ├── Mock_Count_index_Data_20230928G.csv
│ │ ├── analyze_df.py
│ │ ├── chat.jinja2
│ │ ├── concat_result.jinja2
│ │ ├── flow.dag.yaml
│ │ ├── generate_insights.jinja2
│ │ ├── output_format.jinja2
│ │ ├── output_result.jinja2
│ │ ├── readme.md
│ │ └── requirements.txt
│ ├── databricks_example
│ │ ├── chat_csv_model
│ │ │ └── promptflow
│ │ │ │ ├── Mock_Count_index_Data_20230928G.csv
│ │ │ │ ├── analyze_df.py
│ │ │ │ ├── chat.jinja2
│ │ │ │ ├── concat_result.jinja2
│ │ │ │ ├── flow.dag.yaml
│ │ │ │ ├── generate_insights.jinja2
│ │ │ │ ├── output_format.jinja2
│ │ │ │ ├── output_result.jinja2
│ │ │ │ └── requirements.txt
│ │ ├── deploy_pf
│ │ │ ├── 1_pf_register_model.ipynb
│ │ │ └── 2_pf_test_model.ipynb
│ │ └── readme.md
│ ├── dmv_copilot_flow
│ │ ├── .promptflow
│ │ │ ├── RetrieveDocuments.inputs.jsonl
│ │ │ ├── RetrieveDocuments.node.log
│ │ │ ├── chat.detail.json
│ │ │ ├── flow.detail.json
│ │ │ ├── flow.layout.json
│ │ │ ├── flow.log
│ │ │ └── flow.tools.json
│ │ ├── DetermineIntent.jinja2
│ │ ├── DetermineReply.jinja2
│ │ ├── ExtractIntent.py
│ │ ├── FormatConversation.py
│ │ ├── FormatReply.py
│ │ ├── FormatRetrievedDocuments.py
│ │ ├── RetrieveDocuments.py
│ │ ├── concat_reply.py
│ │ ├── dmv_sample_qs.csv
│ │ ├── eval
│ │ │ └── rag_eval.ipynb
│ │ ├── flow.dag.yaml
│ │ ├── output_prompt.jinja2
│ │ └── requirements.txt
│ ├── finance_assistant_pf
│ │ ├── .promptflow
│ │ │ ├── flow.detail.json
│ │ │ ├── flow.layout.json
│ │ │ ├── flow.log
│ │ │ ├── flow.output.json
│ │ │ └── flow.tools.json
│ │ ├── ag_test.py
│ │ ├── chat.jinja2
│ │ ├── data
│ │ │ └── portfolio.csv
│ │ └── flow.dag.yaml
│ ├── model_as_judge_evaluator
│ │ ├── .promptflow
│ │ │ ├── flow.detail.json
│ │ │ ├── flow.layout.json
│ │ │ ├── flow.log
│ │ │ ├── flow.metrics.json
│ │ │ ├── flow.output.json
│ │ │ ├── flow.tools.json
│ │ │ ├── flow.uihint.json
│ │ │ ├── lkg_sources
│ │ │ │ ├── README.md
│ │ │ │ ├── ada_cosine_similarity_score.py
│ │ │ │ ├── aggregate_variants_results.py
│ │ │ │ ├── concat_scores.py
│ │ │ │ ├── f1_score.py
│ │ │ │ ├── flow.meta.yaml
│ │ │ │ ├── gpt_coherence_prompt.jinja2
│ │ │ │ ├── gpt_fluency_prompt.jinja2
│ │ │ │ ├── gpt_groundedness_prompt.jinja2
│ │ │ │ ├── gpt_relevance_prompt.jinja2
│ │ │ │ ├── gpt_similarity_prompt.jinja2
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── samples.json
│ │ │ │ ├── select_metrics.py
│ │ │ │ └── validate_input.py
│ │ │ └── ux.inputs.json
│ │ ├── .runs
│ │ │ ├── 9ccd06e6-71ee-4b65-a04e-d9a6b525c0c7
│ │ │ │ └── flow.dag.yaml
│ │ │ └── f4279cf9-cbaf-4632-ab29-7efa6806192e
│ │ │ │ └── flow.dag.yaml
│ │ ├── README.md
│ │ ├── ada_cosine_similarity_score.py
│ │ ├── aggregate_variants_results.py
│ │ ├── concat_scores.py
│ │ ├── f1_score.py
│ │ ├── flow.dag.yaml
│ │ ├── flow.meta.yaml
│ │ ├── gpt_coherence_prompt.jinja2
│ │ ├── gpt_fluency_prompt.jinja2
│ │ ├── gpt_groundedness_prompt.jinja2
│ │ ├── gpt_relevance_prompt.jinja2
│ │ ├── gpt_similarity_prompt.jinja2
│ │ ├── requirements.txt
│ │ ├── samples.json
│ │ ├── select_metrics.py
│ │ └── validate_input.py
│ ├── new_chat_flow
│ │ └── .promptflow
│ │ │ ├── chat.detail.json
│ │ │ ├── chat.output.json
│ │ │ ├── flow.detail.json
│ │ │ ├── flow.layout.json
│ │ │ ├── flow.log
│ │ │ └── flow.output.json
│ └── prompt_test
│ │ └── .promptflow
│ │ │ ├── chat.detail.json
│ │ │ ├── chat.output.json
│ │ │ ├── flow.detail.json
│ │ │ ├── flow.layout.json
│ │ │ ├── flow.log
│ │ │ ├── flow.output.json
│ │ │ └── flow.uihint.json
├── requirements.txt
├── README.md
└── Validation
│ └── OutputQuality.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | *.cfg
3 | .ipynb_checkpoints
4 | *.pyc
5 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.terminal.activateEnvironment": true
3 | }
--------------------------------------------------------------------------------
/Examples/FormRecognizer/FormRecognizerExamples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "30829697-ea9d-4e11-a530-2776b6c0e752",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import json\n",
11 | "import io\n",
12 | "from configparser import ConfigParser"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 4,
18 | "id": "e98c08a4-337c-4c84-b030-a94ce50e60a2",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "form_json = ''\n",
23 | "# Form Recognizer results generated and saved to json \n",
24 | "with open('./Life_application_example.png.json', 'r') as form_file:\n",
25 | " form_json = json.loads(form_file.read())"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 5,
31 | "id": "71f0fdfc-4858-411c-adac-22719dab96d2",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "form_kvp = []\n",
36 | "for kvp in form_json['analyzeResult']['keyValuePairs']:\n",
37 | " if(kvp.get('key') is not None and kvp.get('value') is not None ):\n",
38 | " form_kvp.append((kvp['key']['content'],kvp['value']['content']))\n",
39 | " #print(\"{} - {}\".format(kvp['key']['content'],kvp['value']['content']))\n",
40 | "\n",
41 | " "
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "id": "c628663f-1ee2-41d0-86f7-8fcf0a22b80a",
47 | "metadata": {},
48 | "source": [
49 | "# Validate Form Recognizer Outputs with OpenAI"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 6,
55 | "id": "7fef45bb-045a-473a-b7f8-5bca7b870a2c",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import os\n",
60 | "import openai\n",
61 | "from configparser import ConfigParser\n",
62 | "\n",
63 | "parser=ConfigParser()\n",
64 | "_=parser.read('../config.cfg')\n",
65 | "openai.api_type = \"azure\"\n",
66 | "openai.api_base = parser.get('openai_api','api_ep')\n",
67 | "openai.api_version = \"2022-06-01-preview\"\n",
68 | "openai.api_key = parser.get('openai_api','api_key')\n",
69 | "model = parser.get('openai_api','api_model')"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 7,
75 | "id": "cdcb3062-9e23-40db-9a31-7a818143731d",
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "Name (First, MI, Last) - Mark B. Cuban: True\n",
83 | "S.SN / T - 444-82-6666: True\n",
84 | "Address - 91 Richmond St.: True\n",
85 | "City - Dallas: False\n",
86 | "Zip Code - 75201: True\n",
87 | "Former Name - Michael Jackson: False\n",
88 | "M - :selected:: False\n",
89 | "OF - :unselected:: False\n",
90 | "Date of Birth (mm/dd/yyyy) - 1/1/70: True\n",
91 | "State of Birth - TX: True\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "for e in form_kvp[:10]:\n",
97 | " #print(e)\n",
98 | " prompt = \"Validate following text is type of {} with True or False:\\n'''{}'''\\n\".format(e[0], e[1])\n",
99 | " response = openai.Completion.create( engine=model, prompt=prompt, temperature=.5, max_tokens=400, top_p=0.5, frequency_penalty=0, presence_penalty=0, stop=None)\n",
100 | " #print('Response:')\n",
101 | " print(\"{} - {}: {}\".format(e[0],e[1],response['choices'][0]['text'].strip()))"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "id": "f7f1fb1e-1f56-429c-8116-be4844e097ff",
107 | "metadata": {},
108 | "source": [
109 | "# Correct Form Recognizer Results with OpenAI"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 8,
115 | "id": "dc4a6d7a-84cc-49c1-881c-d14f28e9d772",
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "import os\n",
120 | "import openai\n",
121 | "from configparser import ConfigParser\n",
122 | "\n",
123 | "parser=ConfigParser()\n",
124 | "_=parser.read('../config.cfg')\n",
125 | "openai.api_type = \"azure\"\n",
126 | "openai.api_base = parser.get('openai_api','api_ep')\n",
127 | "openai.api_version = \"2022-06-01-preview\"\n",
128 | "openai.api_key = parser.get('openai_api','api_key')\n",
129 | "model = parser.get('openai_api','api_model')"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 9,
135 | "id": "8d45ce48-64df-45ae-8d85-fdb2bfd0c326",
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "Name (First, MI, Last) - Mark B. Cuban: Mark Cuban\n",
143 | "S.SN / T - 444-82-6666: S.SN: 444-82-6666\n",
144 | "Address - 91 Richmond St.: 91 Richmond Street\n",
145 | "City - Dallas: Dallas\n",
146 | "Zip Code - 75201: 75201\n",
147 | "Former Name - Michael Jackson: Michael Jackson\n",
148 | "M - :selected:: :selected:\n",
149 | "OF - :unselected:: :unselected:\n",
150 | "Date of Birth (mm/dd/yyyy) - 1/1/70: 01/01/1970\n",
151 | "State of Birth - TX: Texas\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "# Corrects the first 10 form key value pairs\n",
157 | "for e in form_kvp[:10]:\n",
158 | " #print(e)\n",
159 | " prompt = \"Reformat following text to type of {}:\\n'''{}'''\\n\".format(e[0], e[1])\n",
160 | " response = openai.Completion.create( engine=model, prompt=prompt, temperature=.5, max_tokens=400, top_p=0.5, frequency_penalty=0, presence_penalty=0, stop=None)\n",
161 | " #print('Response:')\n",
162 | " print(\"{} - {}: {}\".format(e[0],e[1],response['choices'][0]['text'].strip()))"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "id": "d9eded5e-6701-449f-b855-58a588c9dfaa",
169 | "metadata": {},
170 | "outputs": [],
171 | "source": []
172 | }
173 | ],
174 | "metadata": {
175 | "kernelspec": {
176 | "display_name": "py38",
177 | "language": "python",
178 | "name": "py38"
179 | },
180 | "language_info": {
181 | "codemirror_mode": {
182 | "name": "ipython",
183 | "version": 3
184 | },
185 | "file_extension": ".py",
186 | "mimetype": "text/x-python",
187 | "name": "python",
188 | "nbconvert_exporter": "python",
189 | "pygments_lexer": "ipython3",
190 | "version": "3.8.10"
191 | }
192 | },
193 | "nbformat": 4,
194 | "nbformat_minor": 5
195 | }
196 |
--------------------------------------------------------------------------------
/Examples/FormRecognizer/Life_application_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/FormRecognizer/Life_application_example.png
--------------------------------------------------------------------------------
/Examples/FormRecognizer/income_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/FormRecognizer/income_table.png
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking GPT-X models (base models with few-shot prompting) using Program-of-Thought (PoT) output on finance QnA datasets
2 |
3 | ## Results:
4 | |  | dv3 | turbo35 | gpt4 |
5 | |-------|-------|---------|-------|
6 | | ConvFinQA | 62.5 | 69.3 | 80.0 |
7 | | FinQA | - | 61.2 | - |
8 | | AQuA | - | - | 72.8 |
9 | | TAT-QA | - | - | - |
10 |
11 | ## Program of Thought Example:
12 |
13 | ## Prompt
14 |
15 | Read the following text and table, and then write code to answer a question:
16 | five-year performance comparison 2013 the following graph provides an indicator of cumulative total shareholder returns for the corporation as compared to the peer group index ( described above ) , the dow jones , and the s&p 500 . the graph assumes that the value of the investment in the common stock of union pacific corporation and each index was $ 100 on december 31 , 2005 and that all dividends were reinvested . purchases of equity securities 2013 during 2010 , we repurchased 17556522 shares of our common stock at an average price of $ 75.51 . the following table presents common stock repurchases during each month for the fourth quarter of 2010 : period total number of shares purchased [a] average price paid per share total number of shares purchased as part of a publicly announced plan or program [b] maximum number of shares that may yet be purchased under the plan or program [b] . [a] total number of shares purchased during the quarter includes approximately 563220 shares delivered or attested to upc by employees to pay stock option exercise prices , satisfy excess tax withholding obligations for stock option exercises or vesting of retention units , and pay withholding obligations for vesting of retention shares . [b] on may 1 , 2008 , our board of directors authorized us to repurchase up to 40 million shares of our common stock through march 31 , 2011 . we may make these repurchases on the open market or through other transactions . our management has sole discretion with respect to determining the timing and amount of these transactions . on february 3 , 2011 , our board of directors authorized us to repurchase up to 40 million additional shares of our common stock under a new program effective from april 1 , 2011 through march 31 , 2014. .
17 | period | total number ofsharespurchased [a] | averageprice paidper share | total number of sharespurchased as part of apublicly announced planor program [b] | maximum number ofshares that may yetbe purchased under the planor program [b]
18 | oct . 1 through oct . 31 | 725450 | 84.65 | 519554 | 17917736
19 | nov . 1 through nov . 30 | 1205260 | 89.92 | 1106042 | 16811694
20 | dec . 1 through dec . 31 | 1133106 | 92.59 | 875000 | 15936694
21 | total | 3063816 | $ 89.66 | 2500596 | n/a
22 | Question: how much more was spent on shares in nov 2010 than dec 2010?
23 |
24 |
25 | ## Completion
26 |
27 | #Python
28 | shares_purchased_nov_2010 = 1205260
29 | shares_purchased_dec_2010 = 1133106
30 | difference_in_shares_purchased = shares_purchased_nov_2010 - shares_purchased_dec_2010
31 | average_price_paid_per_share = 89.66
32 | amount_spent_on_shares = difference_in_shares_purchased * average_price_paid_per_share
33 | ans = amount_spent_on_shares
34 |
35 |
36 |
37 |
38 | Credit to @wenhuchen - https://github.com/wenhuchen/Program-of-Thoughts
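39 | 
40 | ## Executing the completion
41 | 
42 | The generated program is ordinary Python that leaves its result in `ans`. A minimal sketch of running a completion like the one above in an isolated namespace (`tool.py` in this folder provides a hardened variant, `safe_execute`, which wraps the same idea in a 5-second timeout):
43 | 
44 | ```python
45 | completion = """
46 | shares_purchased_nov_2010 = 1205260
47 | shares_purchased_dec_2010 = 1133106
48 | difference_in_shares_purchased = shares_purchased_nov_2010 - shares_purchased_dec_2010
49 | average_price_paid_per_share = 89.66
50 | amount_spent_on_shares = difference_in_shares_purchased * average_price_paid_per_share
51 | ans = amount_spent_on_shares
52 | """
53 | 
54 | # Run the generated program in a fresh namespace and read back `ans`.
55 | namespace = {}
56 | exec(completion, namespace)
57 | print(namespace.get("ans"))  # ~6469327.64
58 | ```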
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/__init__.py
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_eval.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import argparse
3 | import json
4 | from .tatqa_metric import *
5 | from typing import Any, Dict, Tuple
6 |
7 |
8 | def evaluate_json(golden_answers: Dict[str, Any], predicted_answers: Dict[str, Any]) -> Tuple[float, float]:
9 |
10 | em_and_f1 = TaTQAEmAndF1()
11 | for qas in golden_answers:
12 | for qa in qas["questions"]:
13 | query_id = qa["uid"]
14 | pred_answer, pred_scale = None, None
15 | if query_id in predicted_answers:
16 | pred_answer, pred_scale = predicted_answers[query_id]
17 | em_and_f1(ground_truth=qa, prediction=pred_answer, pred_scale=pred_scale)
18 |
19 | global_em, global_f1, global_scale, global_op = em_and_f1.get_overall_metric()
20 | print("----")
21 | print("Exact-match accuracy {0:.2f}".format(global_em * 100))
22 | print("F1 score {0:.2f}".format(global_f1 * 100))
23 | print("Scale score {0:.2f}".format(global_scale * 100))
24 | print("{0:.2f} & {1:.2f}".format(global_em * 100, global_f1 * 100))
25 | print("----")
26 |
27 | detail_raw = em_and_f1.get_raw_pivot_table()
28 | print("---- raw detail ---")
29 | print(detail_raw)
30 | detail_em, detail_f1 = em_and_f1.get_detail_metric()
31 | print("---- em detail ---")
32 | print(detail_em)
33 | print("---- f1 detail ---")
34 | print(detail_f1)
35 |
36 |
37 | def evaluate_prediction_file(gold_path: str,
38 | pred_path: str):
39 |
40 | golden_answers = json.load(open(gold_path, encoding='utf-8'))
41 | predicted_answers = json.load(open(pred_path, encoding='utf-8'))
42 | evaluate_json(golden_answers, predicted_answers)
43 |
44 |
45 | if __name__ == "__main__":
46 | # pylint: disable=invalid-name
47 | parser = argparse.ArgumentParser(description='evaluation on TAT-QA dataset')
48 | parser.add_argument("--gold_path",
49 | type=str,
50 | required=True,
51 | default="tatqa_dataset_test_gold.json",
52 | help='The path of the gold file')
53 | parser.add_argument("--pred_path",
54 | type=str,
55 | required=True,
56 | default="sample_predictions.json",
57 | help='The path of the prediction file')
58 |
59 | args = parser.parse_args()
60 | evaluate_prediction_file(args.gold_path, args.pred_path)
61 |
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_metric_test.py:
--------------------------------------------------------------------------------
1 |
2 | from .tatqa_metric import TaTQAEmAndF1
3 |
4 | def test_em_and_f1():
5 |
6 | mode1_test_data = [
7 | ({'answer_type':'span', 'answer': ['here is, a test'], 'scale':''}, 'here is, a test', '', 1, 1),
8 | ({'answer_type': 'span', 'answer': ['here is, a test'], 'scale': ''}, 'here is, a test', '', 1, 1),
9 | ({'answer_type': 'span', 'answer': ['1234.1'], 'scale': 'million'}, '1234.1', 'thousand', 0, 0), # scale mismatch
10 | ({'answer_type': 'span', 'answer': ['1234.1'], 'scale': 'million'}, '123', 'thousand', 0, 0), # scale mismatch
11 | ({'answer_type': 'span', 'answer': ['12314.1'], 'scale': 'million'}, '12314.1', 'million', 1, 1),
12 |
13 | ({'answer_type': 'multi-span', 'answer': ['singapore', 'china', 'usa'], 'scale': ''}, ['singapore', 'china', 'usa'], '', 1, 1),
14 | ({'answer_type': 'multi-span', 'answer': ['singapore', 'china', 'usa'], 'scale': ''}, ['china', 'singapore', 'usa'], '', 1, 1),
15 | ({'answer_type': 'multi-span', 'answer': ['singapore', 'china', 'usa'], 'scale': ''}, ['china', 'singapore'], '',0, 0.8),
16 |
17 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': 'million'}, 123.2, '', 0, 0), # scale mismatch, f1 = 0
18 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': 'million'}, 123200000, '', 1, 1), #
19 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': 'million'}, 123.2, 'thousand', 0, 0), # scale mismatch
20 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': ''}, 123.2, '', 1, 1),
21 | ({'answer_type': 'arithmetic', 'answer': 123.22, 'scale': ''}, 123.2, '', 0, 0),
22 | ({'answer_type': 'arithmetic', 'answer': 123.2, 'scale': ''}, 123.2010, '', 1, 1),
23 | ({'answer_type': 'count', 'answer': 5, 'scale': ''}, 5, '', 1, 1),
24 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 0.2212, '', 1, 1),
25 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 0.22121, 'percent', 0, 0),
26 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 22.1231, '', 0, 0),
27 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'percent'}, 22.1231, 'percent', 1, 1),
28 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'million'}, '22.12', 'million', 1, 1),
29 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'million'}, '22.12', '', 0, 0),
30 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'million'}, 'test', '', 0, 0),
31 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'million'}, ["1","2"], '', 0, 0),# span is calcuated by word f1
32 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'percent'},"-22.12", '', 0, 0),
33 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'percent'},"22.12%", '', 1, 1),
34 | ({'answer_type': 'span', 'answer': [22.12], 'scale': ''}, "22.12%", '', 0, 0),
35 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'million'}, "$22.12", '', 0, 0),
36 | ({'answer_type': 'arithmetic', 'answer': 22.12, 'scale': 'million'}, "$22.12", '', 0, 0),
37 | ({'answer_type': 'span', 'answer': ["22.12"], 'scale': 'percent'}, ["-22.12"], '', 0, 0),
38 | ({'answer_type': 'span', 'answer': ['$1.0 million'], 'scale': ''}, ["['$1.0 million']"], '', 1, 1),
39 |
40 | ({'answer_type': 'span', 'answer': [22.12], 'scale': ''}, "$22.12", '', 1, 1),
41 | ({'answer_type': 'span', 'answer': [22.12], 'scale': 'percent'}, "22.12%", 'percent', 1, 1),
42 | ({'answer_type': 'count', 'answer': 5, 'scale': ''}, 'abcd 5', '1', 0, 0),
43 |
44 | ({'answer_type': 'multi-span', 'answer': ['$23,234', '$234.12'], 'scale': ''}, ['234.12', '23,234'], '',
45 | 1, 1),
46 | ({'answer_type': 'multi-span', 'answer': ['$35,120', '$24,159'], 'scale': ''}, ['$24,159', '$35,120'], '', 1, 1),
47 | ({'answer_type': 'arithmetic', 'answer': ['34.12'], 'scale': 'percent'}, ['0.3412'], '', 1, 1),
48 | ({'answer_type': 'span', 'answer': [
49 | 'wages and salaries, social security costs, pension and other costs and share-based payments, see note 10 of the Financial Statements'],
50 | 'scale': ''},
51 | ['wages and salaries, social security costs, pension and other costs and share - based payments,'], '', 0,
52 | 0.67),
53 |
54 | ]
55 | metrics = TaTQAEmAndF1()
56 |
57 | for ans, pred, pred_scale, em, f1 in mode1_test_data:
58 | metrics(ans, pred, pred_scale)
59 | pred_em, pred_f1, scale_score, op_score = metrics.get_overall_metric(reset=True)
60 | assert pred_em == em, f'mode2 - pred_em: {pred_em}, em:{em}, pred:{pred}, ans:{ans}'
61 | assert pred_f1 == f1, f'mode2 - pred_f1: {pred_f1}, f1:{f1}, pred:{pred}, ans:{ans}'
62 |
63 |
64 | def test_one():
65 | mode_test_data = [
66 | ({'answer_type': 'arithmetic', 'answer': ['34.12%'], 'scale': 'percent'}, ['0.3412'], '', 1, 1),
67 | ({'answer_type': 'arithmetic', 'answer': ['34.12%'], 'scale': ''}, ['0.3412'], '', 1, 1),
68 | ]
69 | metrics = TaTQAEmAndF1()
70 | for ans, pred, pred_scale, em, f1 in mode_test_data:
71 | metrics(ans, pred, pred_scale)
72 | pred_em, pred_f1, scale_score, op_score = metrics.get_overall_metric(reset=True)
73 | assert pred_f1 == f1, f'mode2 - pred_f1: {pred_f1}, f1:{f1}, pred:{pred}, ans:{ans}'
74 |
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 | from typing import List
4 | import numpy as np
5 |
6 | def scale_to_num(scale):
7 | scale = scale.lower()
8 | num = 1
9 | if 'hundred' in scale: # hundred
10 | num = 100
11 | elif 'thousand' in scale: # thousand
12 | num = 1000
13 | elif 'million' in scale: # million
14 | num = 1000000
15 | elif 'billion' in scale: # billion
16 | num = 1000000000
17 | elif 'percent' in scale: # percent
18 | num = 0.01
19 | return num
20 |
21 | def extract_one_num_from_str(s):
22 | s = _clean_num(s)
23 | r_num = r"([+-]?\d+(\.\d+)?)|([+-]?\.\d+)"
24 | groups = re.findall(r_num, s)
25 | if len(groups) == 0:
26 | return None
27 | num = groups[-1][0]
28 | if num == '':
29 | return None
30 | if '.' in num:
31 | return float(num)
32 | return int(num)
33 |
34 | EXCLUDE_IN_NUM = "'\"\\$€£¥%(),[]"
35 | def _clean_num(text:str):
36 | return "".join([ch for ch in str(text) if ch not in EXCLUDE_IN_NUM])
37 |
38 |
39 | def is_number(text: str) -> bool:
40 | try:
41 | words = " ".join([_clean_num(w) for w in text.split()]).split()
42 | if len(words) == 0:
43 | """1023 or 1 million"""
44 | return False
45 | num = float(words[0])
46 | if np.isnan(num):
47 | return False
48 | if len(words) >= 2:
49 | if scale_to_num(words[1]) == 1:
50 | return False
51 | return True
52 | except ValueError:
53 | return False
54 | # except AttributeError:
55 | # return False
56 |
57 | def negative_num_handle(x):
58 | """
59 | :param x: transform (134) -> -134
60 | :return:
61 | """
62 | all = re.findall('(\([\d.\s]+\))', x.strip())
63 | if len(all) > 0:
64 | return -1
65 | return 1
66 |
67 | def percent_num_handle(x):
68 | """
69 | :param x: transform 12% -> 12/100
70 | :return:
71 | """
72 | all = re.findall('([\d.\s]+%)', x.strip())
73 | if len(all) > 0:
74 | return 0.01
75 | return 1
76 |
77 | def word_scale_handle(x):
78 | """
79 | :param x: 1 million = 1,000,000
80 | :return:
81 | """
82 | iter = re.finditer('([\d.]+\s?[a-zA-Z]+)', x)
83 | for one in iter:
84 | text = one.group(0).lower()
85 | scale_val = scale_to_num(text)
86 | return scale_val
87 | return 1
88 |
89 | def to_number(text:str) -> float:
90 | num = extract_one_num_from_str(text)
91 | scale_val = word_scale_handle(text)
92 | negative_flag = negative_num_handle(text)
93 | percent_flag = percent_num_handle(text)
94 | if num is not None:
95 | return round(num * scale_val * negative_flag * percent_flag, 4)
96 | return None
97 |
98 | def remove_articles(text: str) -> str:
99 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
100 | return re.sub(regex, ' ', text)
101 |
102 | def white_space_fix(text: str) -> str:
103 | return ' '.join(text.split())
104 |
105 | EXCLUDE = set(string.punctuation)
106 | def remove_punc(text: str) -> str:
107 | if not is_number(text):
108 | return ''.join(ch for ch in text if ch not in EXCLUDE)
109 | else:
110 | return text
111 |
112 | def lower(text: str) -> str:
113 | return text.lower()
114 |
115 | def tokenize(text: str) -> List[str]:
116 | return re.split(" ", text)
117 |
118 |
119 | def normalize_number(text: str) -> str:
120 | if is_number(text):
121 | return str(to_number(text))
122 | else:
123 | return text
124 |
125 | def normalize_answer(text: str) -> str:
126 | """Lower text and remove punctuation, articles and extra whitespace."""
127 | parts = [white_space_fix(remove_articles(normalize_number(remove_punc(lower(token)))))
128 | for token in tokenize(text)]
129 | parts = [part for part in parts if part.strip()]
130 | normalized = ' '.join(parts).strip()
131 | return normalized
132 |
133 |
134 | STRIPPED_CHARACTERS = string.punctuation + ''.join([u"‘", u"’", u"´", u"`", "_"])
135 | def ws_tokenize(text):
136 | """Runs basic whitespace cleaning and splitting on a piece of text."""
137 | text = text.strip().lower()
138 | if not text:
139 | return []
140 | text = white_space_fix(text)
141 | tokens = text.split()
142 | tokens = [token.strip(STRIPPED_CHARACTERS) for token in tokens]
143 | return tokens
144 |
145 |
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/eval_tatqa/tatqa_utils_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from .tatqa_utils import *
3 |
4 |
5 | def test_extract_first_num_from_text():
6 | text = '2.3 million'
7 | assert extract_one_num_from_str(text) == 2.3
8 | text = '-2.3 million'
9 | assert extract_one_num_from_str(text) == -2.3
10 | text = '205 million'
11 | assert extract_one_num_from_str(text) == 205
12 | text = '-1,210 million'
13 | assert extract_one_num_from_str(text) == -1210
14 |
15 |
16 | def test_to_num():
17 | text = '2.3 million'
18 | assert to_number(text) == 2300000
19 | text = '-2.3 thousand'
20 | assert to_number(text) == -2300
21 | text = '205 billion'
22 | assert to_number(text) == 205000000000
23 | text = '-1,210 million'
24 | assert to_number(text) == -1210000000
25 |
26 |
27 |
28 | def test_ws_tokenize():
29 | text = '2.3 million'
30 | assert ws_tokenize(text) == ['2.3', 'million']
31 | text = '2.3 \nmillion'
32 | assert ws_tokenize(text) == ['2.3', 'million']
33 | text = '2.3\n\tmillion'
34 | assert ws_tokenize(text) == ['2.3', 'million']
35 |
36 | def test_normalize_answer():
37 | assert normalize_answer('-134.12') == '-134.12'
38 | assert normalize_answer('134.12') == '134.12'
39 | assert normalize_answer('(134.12)') == '-134.12'
40 | assert normalize_answer('18.3%') == '0.183'
41 |
42 |
43 |
44 | def test_is_num():
45 | assert is_number('$124')
46 |
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/outputs/convfinqa_direct_gpt-4_04_12_20_53.jsonl:
--------------------------------------------------------------------------------
1 | {"questions": ["what was the change in the unamortized debt issuance costs associated with the senior notes between 2016 and 2017?", "so what was the percentage change during this time?", "what was the change associated with credit facilities during that time?", "so what was the percentage change?"], "answer": 0.375, "text": "as of december 31 , 2017 , the company had gross state income tax credit carry-forwards of approximately $ 20 million , which expire from 2018 through 2020 . a deferred tax asset of approximately $ 16 million ( net of federal benefit ) has been established related to these state income tax credit carry-forwards , with a valuation allowance of $ 7 million against such deferred tax asset as of december 31 , 2017 . the company had a gross state net operating loss carry-forward of $ 39 million , which expires in 2027 . a deferred tax asset of approximately $ 3 million ( net of federal benefit ) has been established for the net operating loss carry-forward , with a full valuation allowance as of december 31 , 2017 . other state and foreign net operating loss carry-forwards are separately and cumulatively immaterial to the company 2019s deferred tax balances and expire between 2026 and 2036 . debt long-term debt consisted of the following: . credit facility - in november 2017 , the company terminated its second amended and restated credit agreement and entered into a new credit agreement ( the \"credit facility\" ) with third-party lenders . the credit facility includes a revolving credit facility of $ 1250 million , which may be drawn upon during a period of five years from november 22 , 2017 . the revolving credit facility includes a letter of credit subfacility of $ 500 million . the revolving credit facility has a variable interest rate on outstanding borrowings based on the london interbank offered rate ( \"libor\" ) plus a spread based upon the company's credit rating , which may vary between 1.125% ( 1.125 % ) and 1.500% ( 1.500 % ) . the revolving credit facility also has a commitment fee rate on the unutilized balance based on the company 2019s leverage ratio . the commitment fee rate as of december 31 , 2017 was 0.25% ( 0.25 % ) and may vary between 0.20% ( 0.20 % ) and 0.30% ( 0.30 % ) . the credit facility contains customary affirmative and negative covenants , as well as a financial covenant based on a maximum total leverage ratio . each of the company's existing and future material wholly owned domestic subsidiaries , except those that are specifically designated as unrestricted subsidiaries , are and will be guarantors under the credit facility . in july 2015 , the company used cash on hand to repay all amounts outstanding under a prior credit facility , including $ 345 million in principal amount of outstanding term loans . as of december 31 , 2017 , $ 15 million in letters of credit were issued but undrawn , and the remaining $ 1235 million of the revolving credit facility was unutilized . the company had unamortized debt issuance costs associated with its credit facilities of $ 11 million and $ 8 million as of december 31 , 2017 and 2016 , respectively . senior notes - in december 2017 , the company issued $ 600 million aggregate principal amount of unregistered 3.483% ( 3.483 % ) senior notes with registration rights due december 2027 , the net proceeds of which were used to repurchase the company's 5.000% ( 5.000 % ) senior notes due in 2021 in connection with the 2017 redemption described below . 
in november 2015 , the company issued $ 600 million aggregate principal amount of unregistered 5.000% ( 5.000 % ) senior notes due november 2025 , the net proceeds of which were used to repurchase the company's 7.125% ( 7.125 % ) senior notes due in 2021 in connection with the 2015 tender offer and redemption described below . interest on the company's senior notes is payable semi-annually . the terms of the 5.000% ( 5.000 % ) and 3.483% ( 3.483 % ) senior notes limit the company 2019s ability and the ability of certain of its subsidiaries to create liens , enter into sale and leaseback transactions , sell assets , and effect consolidations or mergers . the company had unamortized debt issuance costs associated with the senior notes of $ 15 million and $ 19 million as of december 31 , 2017 and 2016 , respectively. .", "table": "( $ in millions ) | december 31 2017 | december 31 2016\nsenior notes due december 15 2021 5.000% ( 5.000 % ) | 2014 | 600\nsenior notes due november 15 2025 5.000% ( 5.000 % ) | 600 | 600\nsenior notes due december 1 2027 3.483% ( 3.483 % ) | 600 | 2014\nmississippi economic development revenue bonds due may 1 2024 7.81% ( 7.81 % ) | 84 | 84\ngulf opportunity zone industrial development revenue bonds due december 1 2028 4.55% ( 4.55 % ) | 21 | 21\nless unamortized debt issuance costs | -26 ( 26 ) | -27 ( 27 )\ntotal long-term debt | 1279 | 1278\n", "golden_text": "the company had unamortized debt issuance costs associated with its credit facilities of $ 11 million and $ 8 million as of december 31 , 2017 and 2016 , respectively . the company had unamortized debt issuance costs associated with the senior notes of $ 15 million and $ 19 million as of december 31 , 2017 and 2016 , respectively. . ", "golden_table": "( $ in millions ) | december 31 2017 | december 31 2016\nsenior notes due november 15 2025 5.000% ( 5.000 % ) | 600 | 600\n", "id": "Double_HII/2017/page_104.pdf", "programs": ["ans = 15 - 19", "x0 = 15 - 19; ans = x0 / 19", "ans = 11 - 8", "x0 = 11 - 8; ans = x0 / 8"], "generated": ["#Python\nunamortized_debt_issuance_costs_senior_notes_2017 = 15\nunamortized_debt_issuance_costs_senior_notes_2016 = 19\nunamortized_debt_issuance_costs_credit_facilities_2017 = 11\nunamortized_debt_issuance_costs_credit_facilities_2016 = 8"], "executed": 16.849999999999994}
2 |
--------------------------------------------------------------------------------
/Examples/OpenSource/ConvFinQA-benchmark/tool.py:
--------------------------------------------------------------------------------
1 | from typing import Union, Any
2 | from math import isclose
3 | import func_timeout
4 | from sympy.solvers import solve
5 | from sympy import Symbol, Eq
6 | import math
7 | from sympy import simplify
8 | import numpy as np
9 | import cvxpy as cp
10 | import statistics
11 |
12 |
13 | def get_precision(gt_ans: float) -> int:
14 | precision = 5
15 | if '.' in str(gt_ans):
16 | precision = len(str(gt_ans).split('.')[-1])
17 | return precision
18 |
19 |
20 | def finqa_equal(prediction: Union[bool, float, str],
21 | reference: Union[float, str],
22 | include_percentage: bool = False,
23 | is_close: float = False) -> bool:
24 | if prediction is None:
25 | return False
26 | elif type(prediction) == bool:
27 | # bool questions
28 | if prediction:
29 | return reference == 'yes'
30 | else:
31 | return reference == 'no'
32 | elif type(reference) == str or type(prediction) == str:
33 | # string questions
34 | return prediction == reference
35 | else:
36 | # number questions
37 | if include_percentage:
38 | gt_result = [reference / 100, reference, reference * 100]
39 | else:
40 | gt_result = [reference]
41 | for item in gt_result:
42 | try:
43 | if is_close:
44 | if isclose(item, prediction, rel_tol=0.001):
45 | return True
46 | precision = min(get_precision(prediction), get_precision(item))
47 | if round(prediction, precision) == round(item, precision):
48 | return True
49 | except Exception:
50 | continue
51 | return False
52 |
53 |
54 | def simplify_ans(ans, convert_to_str: bool = True):
55 | if 'relational' in str(type(ans)):
56 | return str(ans)
57 | elif 'numpy' in str(type(ans)):
58 | if ans.shape == ():
59 | # scalar value
60 | ans = round(float(ans), 2)
61 | else:
62 | # array value
63 | ans = round(float(ans[0]), 2)
64 | if convert_to_str:
65 | return str(ans)
66 | else:
67 | return ans
68 | elif not ans:
69 | return None
70 | else:
71 | if type(ans) in [list, tuple]:
72 | if 'sympy' in str(type(ans[0])):
73 | try:
74 | ans = [round(float(x), 2) for x in ans]
75 | except Exception:
76 | ans = [str(x) for x in ans]
77 | if len(ans) == 1:
78 | ans = ans[0]
79 | else:
80 | if 'sympy' in str(type(ans)):
81 | try:
82 | ans = round(float(ans), 2)
83 | except Exception:
84 | ans = str(ans)
85 | if convert_to_str:
86 | return str(ans)
87 | else:
88 | return ans
89 |
90 |
91 | def floatify_ans(ans):
92 | if ans is None:
93 | return None
94 | elif type(ans) == dict:
95 | ans = list(ans.values())[0]
96 | elif type(ans) == bool:
97 | ans = ans
98 | elif type(ans) in [list, tuple]:
99 | if not ans:
100 | return None
101 | else:
102 | try:
103 | ans = float(ans[0])
104 | except Exception:
105 | ans = str(ans[0])
106 | else:
107 | try:
108 | ans = float(ans)
109 | except Exception:
110 | ans = str(ans)
111 | return ans
112 |
113 |
114 | def parse_api_result(result):
115 | to_return = []
116 | for idx, g in enumerate(result['choices']):
117 | text = g['text']
118 | logprob = sum(g['logprobs']['token_logprobs'])
119 | to_return.append((text, logprob))
120 | to_return = sorted(to_return, key=lambda tup: tup[1], reverse=True)
121 | to_return = [r[0] for r in to_return]
122 | return to_return
123 |
124 |
125 | def solve_it(equation, variable):
126 | solution = solve(equation, variable, dict=True)
127 | if not solution:
128 | if isinstance(variable, list):
129 | solution = {v: None for v in variable}
130 | else:
131 | solution = {variable: None}
132 | return solution
133 | else:
134 | solution = solution[0]
135 | return solution
136 |
137 |
138 | def safe_execute(code_string: str, keys=None):
139 | def execute(x):
140 | try:
141 | exec(x)
142 | locals_ = locals()
143 | if keys is None:
144 | return locals_.get('ans', None)
145 | else:
146 | return [locals_.get(k, None) for k in keys]
147 | except Exception:
148 | return None
149 | try:
150 | ans = func_timeout.func_timeout(5, execute, args=(code_string,))
151 | except func_timeout.FunctionTimedOut:
152 | ans = None
153 |
154 | return ans
155 |
156 |
157 | def synthesize_program(result: str, prefix: str) -> str:
158 | program = prefix
159 | for i, line in enumerate(result.split('\n')):
160 | if i == 0:
161 | program += line + '\n'
162 | else:
163 | if line.startswith(' '):
164 | program += line + '\n'
165 | else:
166 | break
167 | program += 'ans = solver()'
168 | return program
--------------------------------------------------------------------------------
/Examples/OpenSource/LangChain/CustomAPIMAzureOpenAI.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from langchain.llms import OpenAI
4 | from langchain import PromptTemplate, LLMChain
5 | from langchain.llms.openai import *
6 |
7 | class CustomAPIMAzureOpenAI(AzureOpenAI):
8 | """Azure specific OpenAI class that uses deployment name."""
9 |
10 | deployment_name: str = ""
11 | """Deployment name to use."""
12 | subscription_key: str = ""
13 |
14 | @property
15 | def _identifying_params(self) -> Mapping[str, Any]:
16 | return {
17 | **{"deployment_name": self.deployment_name, "subscription_key":self.subscription_key},
18 | **super()._identifying_params,
19 | }
20 |
21 | @property
22 | def _invocation_params(self) -> Dict[str, Any]:
23 | return {**{"engine": self.deployment_name}, **super()._invocation_params}
24 |
25 | def _generate(
26 | self, prompts: List[str], stop: Optional[List[str]] = None
27 | ) -> LLMResult:
28 | """Call out to OpenAI's endpoint with k unique prompts.
29 | Args:
30 | prompts: The prompts to pass into the model.
31 | stop: Optional list of stop words to use when generating.
32 | Returns:
33 | The full LLM output.
34 | Example:
35 | .. code-block:: python
36 | response = openai.generate(["Tell me a joke."])
37 | """
38 | # TODO: write a unit test for this
39 | params = self._invocation_params
40 | sub_prompts = self.get_sub_prompts(params, prompts, stop)
41 | choices = []
42 | token_usage: Dict[str, int] = {}
43 | # Get the token usage from the response.
44 | # Includes prompt, completion, and total tokens used.
45 | _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
46 | for _prompts in sub_prompts:
47 | if self.streaming:
48 | if len(_prompts) > 1:
49 | raise ValueError("Cannot stream results with multiple prompts.")
50 | params["stream"] = True
51 | response = _streaming_response_template()
52 | for stream_resp in completion_with_retry(
53 | self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params
54 | ):
55 | self.callback_manager.on_llm_new_token(
56 | stream_resp["choices"][0]["text"],
57 | verbose=self.verbose,
58 | logprobs=stream_resp["choices"][0]["logprobs"],
59 | )
60 | _update_response(response, stream_resp)
61 | choices.extend(response["choices"])
62 | else:
63 | response = completion_with_retry(self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params)
64 | choices.extend(response["choices"])
65 | if not self.streaming:
66 | # Can't update token usage if streaming
67 | update_token_usage(_keys, response, token_usage)
68 | return self.create_llm_result(choices, prompts, token_usage)
69 |
70 | async def _agenerate(self, prompts: List[str], stop: Optional[List[str]] = None
71 | ) -> LLMResult:
72 | """Call out to OpenAI's endpoint async with k unique prompts."""
73 | params = self._invocation_params
74 | sub_prompts = self.get_sub_prompts(params, prompts, stop)
75 | choices = []
76 | token_usage: Dict[str, int] = {}
77 | # Get the token usage from the response.
78 | # Includes prompt, completion, and total tokens used.
79 | _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
80 | for _prompts in sub_prompts:
81 | if self.streaming:
82 | if len(_prompts) > 1:
83 | raise ValueError("Cannot stream results with multiple prompts.")
84 | params["stream"] = True
85 | response = _streaming_response_template()
86 | async for stream_resp in await acompletion_with_retry(
87 | self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params
88 | ):
89 | if self.callback_manager.is_async:
90 | await self.callback_manager.on_llm_new_token(
91 | stream_resp["choices"][0]["text"],
92 | verbose=self.verbose,
93 | logprobs=stream_resp["choices"][0]["logprobs"],
94 | )
95 | else:
96 | self.callback_manager.on_llm_new_token(
97 | stream_resp["choices"][0]["text"],
98 | verbose=self.verbose,
99 | logprobs=stream_resp["choices"][0]["logprobs"],
100 | )
101 | _update_response(response, stream_resp)
102 | choices.extend(response["choices"])
103 | else:
104 | response = await acompletion_with_retry(self, prompt=_prompts, headers={'Ocp-Apim-Subscription-Key':self.subscription_key}, **params)
105 | choices.extend(response["choices"])
106 | if not self.streaming:
107 | # Can't update token usage if streaming
108 | update_token_usage(_keys, response, token_usage)
109 | return self.create_llm_result(choices, prompts, token_usage)
110 |
111 |
--------------------------------------------------------------------------------
/Examples/OpenSource/LangChain/LangChainSummarizationExample.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "id": "6bcafd95-00c2-4b7f-b1b4-614a23c1f255",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "from langchain import OpenAI, PromptTemplate, LLMChain\n",
13 | "from langchain.text_splitter import CharacterTextSplitter\n",
14 | "from langchain.chains.mapreduce import MapReduceChain\n",
15 | "from langchain.prompts import PromptTemplate\n",
16 | "from CustomAPIMAzureOpenAI import CustomAPIMAzureOpenAI\n",
17 | "import os, openai\n",
18 | "\n",
19 | "os.environ[\"OPENAI_API_KEY\"] = \"na\"\n",
20 | "os.environ[\"OPENAI_API_TYPE\"] = openai.api_type = \"azure\"\n",
21 | "os.environ[\"OPENAI_API_VERSION\"] = openai.api_version = \"2022-12-01\"\n",
22 | "os.environ[\"OPENAI_API_BASE\"] = openai.api_base = \"https://[APIM_ENDPOINT].azure-api.net/\"\n",
23 | "deployment_name = \"deployment_name\"\n",
24 | "model_name = \"model_name\"\n",
25 | "\n",
26 | "llm = CustomAPIMAzureOpenAI(deployment_name=deployment_name, model_name=model_name, subscription_key = 'SUBSCRIPTION_KEY' )\n",
27 | "\n",
28 | "\n",
29 | "text_splitter = CharacterTextSplitter()"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 7,
35 | "id": "32dbb04f-9dab-420a-979e-ca4defb2092a",
36 | "metadata": {
37 | "tags": []
38 | },
39 | "outputs": [
40 | {
41 | "name": "stderr",
42 | "output_type": "stream",
43 | "text": [
44 | "Created a chunk of size 8297, which is longer than the specified 4000\n",
45 | "Created a chunk of size 8410, which is longer than the specified 4000\n",
46 | "Created a chunk of size 8271, which is longer than the specified 4000\n",
47 | "Created a chunk of size 8217, which is longer than the specified 4000\n",
48 | "Created a chunk of size 6170, which is longer than the specified 4000\n"
49 | ]
50 | }
51 | ],
52 | "source": [
53 | "with open('./stateoftheunion_20230207.txt') as f:\n",
54 | " state_of_the_union = f.read()\n",
55 | "texts = text_splitter.split_text(state_of_the_union)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 8,
61 | "id": "644dda7e-650c-4cbb-9d29-609aeccbb46a",
62 | "metadata": {
63 | "tags": []
64 | },
65 | "outputs": [],
66 | "source": [
67 | "from langchain.docstore.document import Document\n",
68 | "\n",
69 | "docs = [Document(page_content=t) for t in texts[:3]]"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 9,
75 | "id": "977e6220-9398-4aae-a3a2-202172b84aee",
76 | "metadata": {
77 | "tags": []
78 | },
79 | "outputs": [],
80 | "source": [
81 | "from langchain.chains.summarize import load_summarize_chain\n"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 10,
87 | "id": "b1d82b77-6323-4ebe-88e9-1ca23b653916",
88 | "metadata": {
89 | "tags": []
90 | },
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/plain": [
95 | "'\\n\\nThis message from the President of the United States celebrates the accomplishments of the past two years, including the creation of 12 million jobs, the passage of 300 bipartisan laws, and the passage of the CHIPS and Science Act and the Bipartisan Infrastructure Law. It also outlines plans for the future, such as investing in infrastructure, providing clean water and high-speed internet access, and capping the cost of insulin for seniors on Medicare.'"
96 | ]
97 | },
98 | "execution_count": 10,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n",
105 | "chain.run(docs)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "c851d245-cc1c-467f-80c0-bb5b53d81f31",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": []
115 | }
116 | ],
117 | "metadata": {
118 | "kernelspec": {
119 | "display_name": "gradio",
120 | "language": "python",
121 | "name": "gradio"
122 | },
123 | "language_info": {
124 | "codemirror_mode": {
125 | "name": "ipython",
126 | "version": 3
127 | },
128 | "file_extension": ".py",
129 | "mimetype": "text/x-python",
130 | "name": "python",
131 | "nbconvert_exporter": "python",
132 | "pygments_lexer": "ipython3",
133 | "version": "3.10.9"
134 | }
135 | },
136 | "nbformat": 4,
137 | "nbformat_minor": 5
138 | }
139 |
--------------------------------------------------------------------------------
/Examples/OpenSource/LlamaIndex/sample_10k_chain.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/OpenSource/LlamaIndex/sample_10k_chain.pdf
--------------------------------------------------------------------------------
/Examples/Readme.md:
--------------------------------------------------------------------------------
1 | # How to use the repository examples
2 |
3 | ## Pre-requisites
4 | - Azure subscription
5 | - https://azure.microsoft.com/en-us/
6 | - Azure Cognitive Services Instance
7 | - https://azure.microsoft.com/en-us/products/cognitive-services/#overview
8 | - Azure OpenAI Service Instance
9 | - https://azure.microsoft.com/en-us/products/cognitive-services/openai-service/
10 |
11 | ## Deploy Azure OpenAI model
12 | - https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal
13 |
14 | - Go to the Azure OpenAI Studio
15 | - Login with the resource you want to use
16 | - Select the "Go to Deployments" button under "Manage deployments" in your resource to navigate to the Deployments page.
17 | - Create a new deployment called text-davinci-002 and choose the text-davinci-002 model from the drop-down.
18 |
19 |
20 | ## Fill in config parameters
21 | - Open the config.cfg file
22 | - Replace the values in the file with the API keys and model names of your deployed services; the example notebooks load these values with `ConfigParser` (see the snippet at the end of this file):
23 | - Example config:
24 | ```
25 | [openai_api]
26 | api_key:33XXXXXXXXXXXXXXXXXXXX2e
27 | api_ep:https://XXXXX.openai.azure.com/
28 | api_model:model_name
29 | cog_svc_key:33XXXXXXXXXXXXXXXXXXXX2e
30 | cog_svc_ep:https://XXXXX.cognitiveservices.azure.com
31 |
32 | ```
33 | ## Install requirements
34 | - Install the Python packages listed in the [requirements.txt](requirements.txt) file.
35 | ## Navigate to example notebooks
36 | - Open the sample notebooks in Jupyter to run them in a local or cloud environment.
37 |
38 |
39 |
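40 | 
41 | ## Example: reading the config in Python
42 | 
43 | A minimal sketch of how the notebooks read `config.cfg` with `ConfigParser`, mirroring the FormRecognizer example (the relative path to `config.cfg` depends on where your notebook lives, e.g. `'../config.cfg'` from a subfolder):
44 | 
45 | ```python
46 | import openai
47 | from configparser import ConfigParser
48 | 
49 | # Read the Azure OpenAI settings from config.cfg.
50 | parser = ConfigParser()
51 | _ = parser.read('config.cfg')
52 | 
53 | openai.api_type = "azure"
54 | openai.api_base = parser.get('openai_api', 'api_ep')
55 | openai.api_version = "2022-06-01-preview"
56 | openai.api_key = parser.get('openai_api', 'api_key')
57 | model = parser.get('openai_api', 'api_model')
58 | ```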
--------------------------------------------------------------------------------
/Examples/Speech/Conversation_SSML.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Hello, how may I help you today?
5 | I really need help with my credit card, it's not working at all
6 | May I please have your first and last name?
7 | sure it's John, J O H N, Doh, D O E
8 | Thank you Mr Doh, can you confirm the last four digits of your account number?
9 | Which number? Is that the card number or the number on my statement, I don't have a statement in front of me.
10 | It should be the last four digits printed on your credit card.
11 | Ok, let me get it, my wallet is in the other room.
12 | I have it now, the number is 4 3 2 1
13 | Thank you again Mr Doh.
14 | It looks like there is suspected fraud on your credit card. Can you confirm the last purchase you made?
15 | I tried to use it to book an Air Bee En Bee for my daughter.
16 | Can you confirm the charge amount?
17 | I don't know. it was about two thousand dollars for a stay in December in Florida.
18 | Ok I can confirm the amount now, our system detected it as fraud but since you have confirmed it we will mark it as approved. Please proceed with your booking.
19 | I hope I can get the same house. bookings were hard to find in that area. I'm going to try now.ok it looks like the booking went through thank you
20 | Is there anything else I can help you with?
21 | Yes, as a matter of fact. I want to order another card for my daughter to use.
22 | Sure, I can help you with that, can I have her first and last name?
23 | Jane, J A N E, Doh, D O E.
24 | What address can I mail the card to?
25 | You can mail it to the default address on Pine Wood Ave.
26 | Ok you can expect the card in 1 to 2 business days.Is there anything else?
27 | No thank you for your help.
--------------------------------------------------------------------------------
/Examples/aml_examples/1a_read_example-copy/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | page_type: sample
3 | languages:
4 | - python
5 | products:
6 | - azure-machine-learning
7 | description: This sample shows how to build and run a pipeline with components.
8 | ---
9 |
10 | # Running a Pipeline job with components
11 | This example shows how to use components to build a pipeline: [pipeline_with_components_from_yaml.ipynb](pipeline_with_components_from_yaml.ipynb).
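12 | 
13 | ## Minimal SDK v2 sketch
14 | 
15 | A rough sketch of loading the `read_adls.yml` component and submitting a one-step pipeline with the `azure-ai-ml` (SDK v2) package. The workspace identifiers, the `cpu-cluster` compute name, and the ADLS datastore path below are placeholders, not values from this repo:
16 | 
17 | ```python
18 | from azure.ai.ml import MLClient, Input, load_component
19 | from azure.ai.ml.dsl import pipeline
20 | from azure.identity import DefaultAzureCredential
21 | 
22 | # Connect to the workspace (placeholder identifiers).
23 | ml_client = MLClient(DefaultAzureCredential(), "<subscription-id>", "<resource-group>", "<workspace-name>")
24 | 
25 | # Load the command component defined in read_adls.yml.
26 | read_component = load_component(source="read_adls.yml")
27 | 
28 | @pipeline()
29 | def read_adls_pipeline(training_data):
30 |     # Single step: mount the folder and print the files it contains.
31 |     read_step = read_component(training_data=training_data)
32 |     return {"model_output": read_step.outputs.model_output}
33 | 
34 | job = read_adls_pipeline(training_data=Input(type="uri_folder", path="azureml://datastores/<adls-datastore>/paths/<folder>/"))
35 | job.settings.default_compute = "cpu-cluster"
36 | ml_client.jobs.create_or_update(job, experiment_name="read_adls_example")
37 | ```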
--------------------------------------------------------------------------------
/Examples/aml_examples/1a_read_example-copy/adls_src/read_folder.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 | from uuid import uuid4
4 | from datetime import datetime
5 | import os
6 |
7 | parser = argparse.ArgumentParser("train")
8 | parser.add_argument("--training_data", type=str, help="Path to training data")
9 | parser.add_argument("--model_output", type=str, help="Path of output model", default="" )
10 |
11 | args = parser.parse_args()
12 |
13 | print("hello training world...")
14 |
15 | lines = [
16 | f"Training data path: {args.training_data}",
17 | ]
18 |
19 | for line in lines:
20 | print(line)
21 |
22 | print("mounted_path files: ")
23 | arr = os.listdir(args.training_data)
24 | print(arr)
25 |
26 | for filename in arr:
27 | print("reading file: %s ..." % filename)
28 | with open(os.path.join(args.training_data, filename), "r") as handle:
29 | print(handle.read())
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/Examples/aml_examples/1a_read_example-copy/read_adls.yml:
--------------------------------------------------------------------------------
1 | $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
2 | type: command
3 |
4 | name: train_model
5 | display_name: Train Model
6 | description: A dummy training component
7 | version: 0.0.1
8 | inputs:
9 | training_data:
10 | type: uri_folder
11 | outputs:
12 | model_output:
13 | type: uri_folder
14 | code: ./adls_src
15 | environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
16 | command: >-
17 | python read_folder.py
18 | --training_data ${{inputs.training_data}}
19 |
--------------------------------------------------------------------------------
/Examples/aml_foundationmodels/dockerfile/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu117-py38-torch201:biweekly.202309.2
2 |
3 | WORKDIR /
4 |
5 | # support Deepspeed launcher requirement of passwordless ssh login
6 | RUN apt-get update && apt-get -y upgrade
7 | RUN apt-get install -y openssh-server openssh-client
8 |
9 | COPY requirements.txt .
10 | RUN pip install -r requirements.txt --no-cache-dir
11 |
12 | # List installed packages
13 | RUN pip list
14 |
15 | ## Delete the requirements file after installation
16 | RUN rm requirements.txt
17 |
18 | # Copy scoring file
19 | COPY score.py /var/mlflow_resources/mlflow_score_script.py
20 | ENV AZUREML_ENTRY_SCRIPT="mlflow_score_script.py"
21 | ENV AML_APP_ROOT="/var/mlflow_resources"
22 |
23 | # Inference requirements
24 | COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
25 | RUN /var/requirements/install_system_requirements.sh && \
26 | cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
27 | cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
28 | ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
29 | rm -f /etc/nginx/sites-enabled/default
30 | ENV SVDIR=/var/runit
31 | ENV WORKER_TIMEOUT=3600
32 | EXPOSE 5001 8883 8888
33 |
34 | CMD [ "runsvdir", "/var/runit" ]
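35 | 
36 | # Local build/run sketch (image tag and port mapping are illustrative only; Azure ML handles this during deployment):
37 | #   docker build -t falcon-inference .
38 | #   docker run --gpus all -p 5001:5001 falcon-inference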
--------------------------------------------------------------------------------
/Examples/aml_foundationmodels/dockerfile/requirements.txt:
--------------------------------------------------------------------------------
1 | deepspeed~=0.10.0
2 | deepspeed-mii~=0.0.6
3 | aiolimiter~=1.1.0
4 | torch~=2.0.1
5 | protobuf~=4.24.0
6 | psutil~=5.9.5
7 | transformers~=4.33.0
8 | sentencepiece~=0.1.99
9 | xformers~=0.0.21
10 | pandas~=2.0.3
11 | azure-ai-contentsafety~=1.0.0b1
12 | azure-identity==1.14.0
13 | azure-mgmt-cognitiveservices==13.5.0
14 | azureml-inference-server-http==0.8.4.2
15 | azureml-core==1.53.0
16 | azureml-mlflow==1.53.0
17 | cryptography~=41.0.3
18 | certifi==2023.07.22
19 | requests~=2.31.0
20 | aiohttp~=3.8.5
21 | einops~=0.6.0
22 | accelerate
23 | langchain
24 | bitsandbytes
25 | flask
26 |
27 |
--------------------------------------------------------------------------------
/Examples/aml_foundationmodels/src/server.py:
--------------------------------------------------------------------------------
1 | from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
2 | from transformers import AutoTokenizer, pipeline
3 | import torch
4 | from flask import Flask, render_template, request
5 | import re
6 |
7 | # choose model based on your hardware
8 | model = 'tiiuae/falcon-7b-instruct'
9 | # model = 'tiiuae/falcon-40b-instruct'
10 |
11 | # load a tokenizer from the pretrained model using the Hugging Face AutoTokenizer class;
12 | # the from_pretrained method retrieves the tokenizer associated with the specified model
13 | # the tokenizer is responsible for processing text inputs and converting them into numerical
14 | # representations suitable for input to the model
15 | print('loading model')
16 | tokenizer = AutoTokenizer.from_pretrained(model)
17 |
18 | # the pipeline function returns a callable object that can be used to generate text using
19 | # the specified model and parameters
20 | print('loading pipeline')
21 | pipeline = pipeline(
22 | 'text-generation', # the task for the pipeline
23 | model=model, # the pretrained model to use
24 | tokenizer=tokenizer, # the tokenizer for preprocessing inputs
25 | torch_dtype=torch.bfloat16, # the data type for torch tensors
26 | trust_remote_code=True, # flag to trust remote code (e.g., when using remote models)
27 | device_map='auto', # the device to run the pipeline on (GPU or CPU)
28 | max_length=20000, # the maximum length of generated text
29 | do_sample=True, # flag indicating whether to use sampling for text generation
30 | top_k=10, # the number of highest probability tokens to consider for sampling
31 | num_return_sequences=1, # the number of sequences to generate
32 | eos_token_id=tokenizer.eos_token_id # the token ID representing the end of a text sequence
33 | )
34 |
35 | # create an instance of the HuggingFacePipeline class with the specified pipeline and model_kwargs;
36 | # the resulting llm object can then be used to generate text based on the configured
37 | # pipeline and model parameters
38 | print('loading llm')
39 | llm = HuggingFacePipeline(
40 | pipeline=pipeline, # the text generation pipeline to use
41 | model_kwargs={'temperature': 0} # temperature is a common parameter used in text generation models to
42 | # control the randomness of the generated output: higher
43 | # temperature values (e.g., 1.0) lead to more diverse and creative
44 | # output, while lower values (e.g., 0.5) make the output more
45 | # focused and deterministic
46 | )
47 |
48 | # define the template for the prompt
49 | template = """
50 | You are an intelligent chatbot. Take careful consideration to context of the question and answer appropriately.
51 | Question: {question}
52 | Answer:"""
53 |
54 | # define a template for the prompt to be used in the LLMChain instance; the prompt template allows
55 | # customization of the prompt message and dynamic insertion of input variables
56 | # the template variable stores a multi-line string that serves as the base structure for the prompt:
57 | # it provides a general message for the chatbot and defines the format for presenting the question
58 | # and answer
59 | # the PromptTemplate class is instantiated with two arguments: template, the template string defined
60 | # earlier, and input_variables, the list of input variables used in the template (here only
61 | # 'question', which represents the user's input question)
62 | # by using prompt templates, you can create flexible and customizable prompts that adapt to the
63 | # user's specific input, making the conversation more engaging and interactive
64 | # create a prompt template
65 | prompt = PromptTemplate(
66 | template=template, # the template string for the prompt
67 | input_variables=['question'] # the list of input variables used in the template
68 | )
69 |
70 | # create an instance of the LLMChain class
71 | llm_chain = LLMChain(
72 | prompt=prompt, # the prompt template for generating prompts
73 | llm=llm # the HuggingFacePipeline instance for text generation
74 | )
75 |
76 |
77 | def remove_angle_brackets(text):
78 | """
79 | Removes angle brackets and their contents from the given text.
80 |
81 | Args:
82 | text (str): The input text from which angle brackets and their contents need to be removed.
83 |
84 | Returns:
85 | str: The modified text with angle brackets and their contents removed.
86 | """
87 | return re.sub(r'<[^>]*>', '', text)
88 |
89 |
90 | # init the Flask app
91 | app = Flask(__name__)
92 |
93 |
94 | @app.route('/', methods=['GET', 'POST'])
95 | def home():
96 | """
97 | Handles the home route and form submission.
98 | 
99 | If the request method is POST, it retrieves the question from the form,
100 | generates a response using the LLMChain, removes angle brackets from the response,
101 | and returns the cleaned response text.
102 | 
103 | If the request method is GET, it returns a short usage hint.
104 | 
105 | Returns:
106 | str: The response text (POST) or a usage hint (GET).
107 | """
108 | if request.method == 'POST':
109 | question = request.form['question']
110 | response = llm_chain.run(question)
111 | response = remove_angle_brackets(response)
112 | return response
113 | return "POST form[question]"
114 |
115 |
116 | if __name__ == '__main__':
117 | # check if CUDA is available and being used
118 | if torch.cuda.is_available() and torch.cuda.current_device() != -1:
119 | print('CUDA is being used.')
120 | else:
121 | print('CUDA is not being used.')
122 | print('running server')
123 | # run app
124 | app.run(host='0.0.0.0', port=5000, debug=False)
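125 | 
126 | # Example client call (a rough sketch, assuming the server runs locally on port 5000):
127 | #   import requests
128 | #   print(requests.post('http://localhost:5000/', data={'question': 'What is Azure ML?'}).text)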
--------------------------------------------------------------------------------
/Examples/config.cfg:
--------------------------------------------------------------------------------
1 | [openai_api]
2 | api_key:{openai_apikey}
3 | api_ep:{openai_endpoint}
4 | api_model:{deployment_name}
5 | cog_svc_key:{cogsvc_apikey}
6 | cog_svc_ep:{cogsvc_endpoint}
7 |
--------------------------------------------------------------------------------
/Examples/demo/stock_prompt.csv:
--------------------------------------------------------------------------------
1 | Date,Symbol,Adj Close,Close,High,Low,Open,Volume
2 | 2009-12-31,MMM,,,,,,
3 | 2010-01-04,MMM,59.318885803222656,83.0199966430664,83.44999694824219,82.66999816894531,83.08999633789062,3043700.0
4 | 2010-01-05,MMM,58.94734191894531,82.5,83.2300033569336,81.69999694824219,82.80000305175781,2847000.0
5 | 2010-01-06,MMM,59.783294677734375,83.66999816894531,84.5999984741211,83.51000213623047,83.87999725341797,5268500.0
6 | 2010-01-07,MMM,59.826175689697266,83.7300033569336,83.76000213623047,82.12000274658203,83.31999969482422,4470100.0
7 | 2010-01-08,MMM,60.24774932861328,84.31999969482422,84.31999969482422,83.30000305175781,83.69000244140625,3405800.0
8 | 2010-01-11,MMM,60.004825592041016,83.9800033569336,84.5999984741211,83.41000366210938,84.37999725341797,2927100.0
9 | 2010-01-12,MMM,60.05484390258789,84.05000305175781,84.18000030517578,83.30000305175781,83.58000183105469,3031800.0
10 | 2010-01-13,MMM,59.861942291259766,83.77999877929688,84.11000061035156,83.19999694824219,84.11000061035156,3102000.0
11 | 2010-01-14,MMM,59.661865234375,83.5,83.93000030517578,83.41999816894531,83.73999786376953,2634100.0
12 | 2010-01-15,MMM,59.56898880004883,83.37000274658203,84.08999633789062,82.87000274658203,83.5199966430664,3955000.0
13 | 2010-01-19,MMM,60.819358825683594,85.12000274658203,85.16999816894531,83.5,83.81999969482422,4500400.0
14 | 2010-01-20,MMM,60.5335578918457,84.72000122070312,85.12999725341797,83.58999633789062,84.83000183105469,3671200.0
15 | 2010-01-21,MMM,59.090240478515625,82.69999694824219,84.5999984741211,82.56999969482422,84.5999984741211,4783200.0
16 | 2010-01-22,MMM,58.218544006347656,81.4800033569336,82.83000183105469,81.30000305175781,82.4000015258789,4809000.0
17 | 2010-01-25,MMM,58.4686164855957,81.83000183105469,82.88999938964844,81.4800033569336,82.33000183105469,3386600.0
18 | 2010-01-26,MMM,58.32571792602539,81.62999725341797,82.7300033569336,81.04000091552734,81.45999908447266,3138000.0
19 | 2010-01-27,MMM,58.80441665649414,82.30000305175781,82.83999633789062,81.01000213623047,81.33000183105469,5066900.0
20 | 2010-01-28,MMM,57.69694519042969,80.75,82.63999938964844,79.11000061035156,82.62000274658203,6820700.0
21 | 2010-01-29,MMM,57.51118087768555,80.48999786376953,81.87999725341797,80.18000030517578,81.3499984741211,4347000.0
22 | 2010-02-01,MMM,57.47545623779297,80.44000244140625,80.88999938964844,79.94000244140625,80.83999633789062,3632700.0
23 | 2010-02-02,MMM,57.46830368041992,80.43000030517578,80.94000244140625,79.69999694824219,80.69999694824219,4690000.0
24 | 2010-02-03,MMM,58.23284912109375,81.5,81.68000030517578,79.83000183105469,79.83000183105469,3401300.0
25 | 2010-02-04,MMM,56.59660720825195,79.20999908447266,81.12999725341797,78.83000183105469,81.12999725341797,5312600.0
26 | 2010-02-05,MMM,56.11787796020508,78.54000091552734,79.5,77.26000213623047,79.16000366210938,5408000.0
27 | 2010-02-08,MMM,55.39621353149414,77.52999877929688,78.4800033569336,77.25,78.4800033569336,4407300.0
28 | 2010-02-09,MMM,56.246490478515625,78.72000122070312,79.41000366210938,77.9000015258789,78.27999877929688,4252500.0
29 | 2010-02-10,MMM,56.175025939941406,78.62000274658203,79.29000091552734,78.0,78.80000305175781,2445200.0
30 | 2010-02-11,MMM,57.35396194458008,80.2699966430664,80.38999938964844,78.83999633789062,79.22000122070312,5524500.0
31 | 2010-02-12,MMM,56.575157165527344,79.18000030517578,79.2300033569336,78.06999969482422,79.13999938964844,5443000.0
32 | 2010-02-16,MMM,57.496891021728516,80.47000122070312,80.63999938964844,79.33999633789062,80.08000183105469,3619100.0
33 | 2010-02-17,MMM,57.63711929321289,80.13999938964844,80.69000244140625,79.73999786376953,80.41999816894531,3095200
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.layout.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeLayouts": {
3 | "inputs": {
4 | "x": 290,
5 | "y": 42,
6 | "index": -1
7 | },
8 | "outputs": {
9 | "x": 220,
10 | "y": 310,
11 | "index": -1
12 | },
13 | "autogen_step": {
14 | "x": 140,
15 | "y": 176,
16 | "index": 0
17 | }
18 | },
19 | "orientation": "Vertical"
20 | }
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "answer": "Sure! Let's continue solving the problem.\n\nSince we have already determined the projected exercise price per share in 2040, we can move on to the next question.\n\nQuestion: If the exercise price per share continues to grow at the same average annual growth rate, what would be the projected exercise price per share in 2050?\n\nTo calculate the projected exercise price per share in 2050, we can use the same formula as before:\n\n\\(\\text{{Projected Price}} = \\text{{Initial Price}} \\times (1 + \\text{{Average Annual Growth Rate}})^{\\text{{Number of Years}}}\\)\n\nIn this case, the initial price is the weighted average exercise price per share in 2007, the average annual growth rate is 0.5569, and the number of years is 43 (from 2007 to 2050).\n\nHere's the code to calculate the projected exercise price per share in 2050:\n\n```python\nnumber_of_years = 43\n\nprojected_price_2050 = weighted_average_2007 * (1 + average_annual_growth_rate) ** number_of_years\n\nprint(\"The projected exercise price per share in 2050 would be $\", projected_price_2050)\n```\n\nPlease execute the code to get the answer."
3 | }
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.tools.json:
--------------------------------------------------------------------------------
1 | {
2 | "package": {},
3 | "code": {
4 | "autogen_step.py": {
5 | "type": "python",
6 | "inputs": {
7 | "input1": {
8 | "type": [
9 | "string"
10 | ]
11 | },
12 | "connection": {
13 | "type": [
14 | "AzureOpenAIConnection"
15 | ]
16 | },
17 | "modelname": {
18 | "type": [
19 | "string"
20 | ]
21 | }
22 | },
23 | "source": "autogen_step.py",
24 | "function": "my_python_tool"
25 | }
26 | }
27 | }
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/.promptflow/flow.uihint.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodes": {}
3 | }
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/autogen_step.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | from promptflow.connections import AzureOpenAIConnection
3 |
4 | import autogen
5 | import json, re
6 | from autogen.agentchat.contrib.math_user_proxy_agent import MathUserProxyAgent
7 | #from autogen import Cache
8 |
9 | # helper functions for detecting the final \boxed{} answer in the assistant's reply
10 | def has_boxed(string):
11 | return '\\boxed' in string
12 |
13 | def extract_last_boxed_to_newline(s):
14 | matches = re.findall(r'(\\boxed\{.*?\}.*?)(?=\n|$)', s, re.DOTALL)
15 | return matches[-1] if matches else None
16 |
17 |
18 | # The inputs section will change based on the arguments of the tool function, after you save the code
19 | # Adding type to arguments and return value will help the system show the types properly
20 | # Please update the function name/signature per need
21 | @tool
22 | def my_python_tool(input1: str, connection: AzureOpenAIConnection, modelname: str) -> str:
23 | # config_list = autogen.config_list_from_json(
24 | # "OAI_CONFIG_LIST",
25 | # filter_dict={
26 | # "model": ["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"],
27 | # },
28 | # )
29 | config_list = [
30 | {
31 | "model": modelname,
32 | "api_key": connection.api_key,
33 | "base_url": connection.api_base,
34 | "api_type": "azure",
35 | "api_version": "2023-07-01-preview"
36 | },
37 | ]
38 | # create an AssistantAgent named "assistant"
39 | assistant = autogen.AssistantAgent(
40 | name="assistant",
41 | llm_config={
42 | "cache_seed": None, # disable
43 | "seed": None, # disable
44 | "config_list": config_list, # a list of OpenAI API configurations
45 | "temperature": 0, # temperature for sampling
46 | }, # configuration for autogen's enhanced inference API which is compatible with OpenAI API
47 | )
48 |
49 |
50 | # 2. create the MathUserProxyAgent instance named "mathproxyagent"
51 | # By default, the human_input_mode is "NEVER", which means the agent will not ask for human input.
52 | mathproxyagent = MathUserProxyAgent(
53 | name="mathproxyagent",
54 | human_input_mode="NEVER",
55 | is_termination_msg = lambda msg: has_boxed(msg['content']),
56 | code_execution_config={"use_docker": False},
57 | )
58 |
59 | #autogen.ChatCompletion.start_logging()
60 |
61 | math_problem = input1
62 | mathproxyagent.initiate_chat(assistant, problem=math_problem, silent=True)
63 |
64 | last_response = assistant.last_message(agent=mathproxyagent)
65 | last_number = last_response['content']
66 | #last_number = extract_last_boxed_to_newline(last_response['content'])
67 | return f'{last_number}'
68 |
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/chat.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant.
3 |
4 | {% for item in chat_history %}
5 | user:
6 | {{item.inputs.question}}
7 | assistant:
8 | {{item.outputs.answer}}
9 | {% endfor %}
10 |
11 | user:
12 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/convfinqa_dev_sample.jsonl:
--------------------------------------------------------------------------------
1 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\n- | 2007 | 2006 | 2005\nweighted average exercise price per share | $ 60.94 | $ 37.84 | $ 25.14\nQuestions: what was the weighted average exercise price per share in 2007? and what was it in 2005? what was, then, the change over the years? what was the weighted average exercise price per share in 2005?\nQuestion: and how much does that change represent in relation to this 2005 weighted average exercise price?","answer":"1.42403"}
2 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nthe company had unamortized debt issuance costs associated with its credit facilities of $ 11 million and $ 8 million as of december 31 , 2017 and 2016 , respectively . the company had unamortized debt issuance costs associated with the senior notes of $ 15 million and $ 19 million as of december 31 , 2017 and 2016 , respectively. .\n( $ in millions ) | december 31 2017 | december 31 2016\nsenior notes due november 15 2025 5.000% ( 5.000 % ) | 600 | 600\nQuestions: what was the change in the unamortized debt issuance costs associated with the senior notes between 2016 and 2017? so what was the percentage change during this time? what was the change associated with credit facilities during that time?\nQuestion: so what was the percentage change?","answer":"0.375"}
3 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\namounts expensed for the savings plans for 2009 , 2008 and 2007 were $ 35.1 , $ 29.6 and $ 31.4 , respectively . expense includes a discretionary company contribution of $ 3.8 , $ 4.0 and $ 4.9 offset by participant forfeitures of $ 2.7 , $ 7.8 , $ 6.0 in 2009 , 2008 and 2007 , respectively .\nQuestions: what is the ratio of discretionary company contributions to total expensed amounts for savings plans in 2009?\nQuestion: what is that times 100?","answer":"10.82621"}
4 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nmillions of dollars | dec . 31 2008 | dec . 31 2007\nequipment rents payable | 93 | 103\nQuestions: what was the equipment rents payable in 2008? and in 2007? so what was the difference between the two years? and the value for 2007 again?\nQuestion: so what was the percentage change during this time?","answer":"-0.09709"}
5 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nthe aggregate fair values of our outstanding fuel hedges as of december 31 , 2016 and 2015 were current liabilities of $ 2.7 million and $ 37.8 million , respectively , and have been recorded in other accrued liabilities in our consolidated balance sheets .\nyear | gallons hedged | weighted average contractprice per gallon\n2017 | 12000000 | $ 2.92\n2018 | 3000000 | 2.61\nQuestions: how much did the gallons hedged in 2018 represent in relation to the ones hedged in 2017? and in the previous year of this period, what was the aggregate fair value of the outstanding fuel hedges? what was it in 2015?\nQuestion: how much, then, did the 2016 fair value represent in relation to this 2015 one?","answer":"14.0"}
6 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\n( in thousands ) | net undeveloped acres expiring year ended december 31 , 2016 | net undeveloped acres expiring year ended december 31 , 2017 | net undeveloped acres expiring year ended december 31 , 2018\nu.s . | 68 | 89 | 128\ntotal africa | 189 | 4444 | 890\ntotal | 257 | 4533 | 1018\nQuestions: what was the total african and us net undeveloped acres expiring in 2016?\nQuestion: what percentage of undeveloped acres were in the us in 2018?","answer":"0.12574"}
7 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\ncash flowsmillions | 2014 | 2013 | 2012\ncash provided by operating activities | $ 7385 | $ 6823 | $ 6161\nQuestions: what was the cash provided by operating activities in 2013? and in 2012? so what was the difference in this value between the years? and the value for 2012 again?\nQuestion: so what was the percentage change during this time?","answer":"0.10745"}
8 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\noil gas ngls total ( mmbbls ) ( bcf ) ( mmbbls ) ( mmboe ) .\n- | oil ( mmbbls ) | gas ( bcf ) | ngls ( mmbbls ) | total ( mmboe )\ncanada | 23 | 198 | 4 | 60\ntotal | 66 | 894 | 28 | 243\nQuestions: what is the amount of oil and gas mmboe from canada divided by the total?\nQuestion: what is that times 100?","answer":"24.69136"}
9 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\nyears ended december 31, | 2009 | 2008 | 2007\nsegment revenue | $ 6305 | $ 6197 | $ 5918\nQuestions: what was the total of risk and insurance brokerage services segment revenue in 2009? and what was that in 2008? what was, then, the change over the year?\nQuestion: and how much does this change represent in relation to the 2008 total, in percentage?","answer":"0.01743"}
10 | {"prompt":"Read the following text and table, and then answer the last question in a series of questions:\n- | 12\/28\/2013 | 1\/3\/2015 | 1\/2\/2016 | 12\/31\/2016 | 12\/30\/2017 | 12\/29\/2018\ns&p 500 | 100.00 | 110.28 | 109.54 | 129.05 | 157.22 | 150.33\nQuestions: what is the change in price of the s&p 500 from 2015 to 2016? what is 100000 divided by 100?\nQuestion: what is the product of the change by the quotient?","answer":"18770.0"}
11 |
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/flow.dag.yaml:
--------------------------------------------------------------------------------
1 | id: template_chat_flow
2 | name: Template Chat Flow
3 | environment:
4 | python_requirements_txt: requirements.txt
5 | inputs:
6 | chat_history:
7 | type: list
8 | is_chat_input: false
9 | is_chat_history: true
10 | question:
11 | type: string
12 | is_chat_input: true
13 | default: "Read the following text and table, and then answer the last question
14 | in a series of questions:\\n- | 2007 | 2006 | 2005\\nweighted average
15 | exercise price per share | $ 60.94 | $ 37.84 | $ 25.14\\nQuestions: what
16 | was the weighted average exercise price per share in 2007?"
17 | outputs:
18 | answer:
19 | type: string
20 | reference: ${autogen_step.output}
21 | is_chat_output: true
22 | nodes:
23 | - name: autogen_step
24 | type: python
25 | source:
26 | type: code
27 | path: autogen_step.py
28 | inputs:
29 | connection: Default_AzureOpenAI
30 | input1: ${inputs.question}
31 | modelname: gpt-35-turbo
32 | use_variants: false
33 |
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/flow.meta.yaml:
--------------------------------------------------------------------------------
1 | $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
2 | name: template_chat_flow
3 | display_name: Template Chat Flow
4 | type: chat
5 | path: ./flow.dag.yaml
6 | description: Template Chat Flow
7 | properties:
8 | promptflow.stage: prod
9 | promptflow.section: template
10 |
--------------------------------------------------------------------------------
/Examples/promptflow/ag-convfinqa-pf/requirements.txt:
--------------------------------------------------------------------------------
1 | pyautogen[mathchat]
2 |
--------------------------------------------------------------------------------
/Examples/promptflow/autogen-flow/OAI_CONFIG_LIST:
--------------------------------------------------------------------------------
1 | [
2 |
3 | {
4 | "model": "gpt-35-turbo",
5 | "api_key": "*****",
6 | "base_url": "https://*****.openai.azure.com/",
7 | "api_type": "azure",
8 | "api_version": "2023-05-15"
9 | },
10 | {
11 | "model": "gpt-4",
12 | "api_key": "*****",
13 | "base_url": "https://*****.openai.azure.com/",
14 | "api_type": "azure",
15 | "api_version": "2023-05-15"
16 | }
17 | ]
18 |
--------------------------------------------------------------------------------
/Examples/promptflow/autogen-flow/ag_test.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import autogen
3 | #from autogen import Cache
4 |
5 |
6 |
7 | # The inputs section will change based on the arguments of the tool function, after you save the code
8 | # Adding type to arguments and return value will help the system show the types properly
9 | # Please update the function name/signature per need
10 | @tool
11 | def my_python_tool(input1: str) -> str:
12 | config_list = autogen.config_list_from_json(
13 | "OAI_CONFIG_LIST",
14 | filter_dict={
15 | "model": ["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"],
16 | },
17 | )
18 | # create an AssistantAgent named "assistant"
19 | assistant = autogen.AssistantAgent(
20 | name="assistant",
21 | llm_config={
22 | "cache_seed": None, # disable
23 | "seed": None, # disable
24 | "config_list": config_list, # a list of OpenAI API configurations
25 | "temperature": 0, # temperature for sampling
26 | }, # configuration for autogen's enhanced inference API which is compatible with OpenAI API
27 | )
28 | # create a UserProxyAgent instance named "user_proxy"
29 | user_proxy = autogen.UserProxyAgent(
30 | name="user_proxy",
31 | human_input_mode="NEVER",
32 | max_consecutive_auto_reply=10,
33 | is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"),
34 | code_execution_config={
35 | "work_dir": "coding",
36 | "use_docker": False, # set to True or image name like "python:3" to use docker
37 | },
38 | )
39 | # the assistant receives a message from the user_proxy, which contains the task description
40 | user_proxy.initiate_chat(
41 | assistant,
42 | message=f"""Answer the following: {input1}""",
43 | )
44 |
45 | output = assistant.last_message(agent=user_proxy)['content']
46 | #output = assistant.chat_messages()
47 |
48 | return f'{output}'
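49 | 
50 | # local smoke test (a rough sketch; requires OAI_CONFIG_LIST in the working directory):
51 | # if __name__ == '__main__':
52 | #     print(my_python_tool('What is 2 + 2?'))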
--------------------------------------------------------------------------------
/Examples/promptflow/autogen-flow/chat.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant.
3 |
4 | {% for item in chat_history %}
5 | user:
6 | {{item.inputs.question}}
7 | assistant:
8 | {{item.outputs.answer}}
9 | {% endfor %}
10 |
11 | user:
12 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/autogen-flow/flow.dag.yaml:
--------------------------------------------------------------------------------
1 | id: template_chat_flow
2 | name: Template Chat Flow
3 | inputs:
4 | chat_history:
5 | type: list
6 | default: []
7 | is_chat_input: false
8 | is_chat_history: true
9 | question:
10 | type: string
11 | default: what is the date today?
12 | is_chat_input: true
13 | outputs:
14 | answer:
15 | type: string
16 | reference: ${ag_test.output}
17 | is_chat_output: true
18 | nodes:
19 | - name: ag_test
20 | type: python
21 | source:
22 | type: code
23 | path: ag_test.py
24 | inputs:
25 | input1: ${inputs.question}
26 | use_variants: false
27 | node_variants: {}
28 | environment:
29 | python_requirements_txt: requirements.txt
30 |
--------------------------------------------------------------------------------
/Examples/promptflow/autogen-flow/requirements.txt:
--------------------------------------------------------------------------------
1 | pyautogen
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/.amlignore:
--------------------------------------------------------------------------------
1 | ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
2 | ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
3 |
4 | .ipynb_aml_checkpoints/
5 | *.amltmp
6 | *.amltemp
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/analyze_df.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import pandas as pd
3 | import subprocess
4 | import re
5 | 
6 |
7 | def score(csvfile:str, program: str) -> str:
8 | program = "import pandas as pd\ndf = pd.read_csv('{}')\n".format(csvfile) + program  # newline needed before appending the generated code
9 | program += '\nprint(ans)'
10 | program = program.replace('\\n', '\n')
11 | print(program)
12 | result = subprocess.run(['python', '-c', program], capture_output=True)
13 | print(result)
14 | ans =str(result.stdout)
15 | return ans
16 | # The inputs section will change based on the arguments of the tool function, after you save the code
17 | # Adding type to arguments and return value will help the system show the types properly
18 | # Please update the function name/signature per need
19 | @tool
20 | def my_python_tool(csvfile:str, input_program: str) -> str:
21 | result = score(csvfile, input_program)
22 | #return_result = re.sub(r'[^\d.]+', '', result)
23 | return result
24 |
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/chat.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an assistant that helps generate Python code for analyzing tabular data.
3 | Please respond with pandas code; only provide valid pandas code, do not provide explanations.
4 | Assume the dataframe is loaded into the variable df. Assign the answer to the variable named "ans".
5 |
6 | dataframe columns:
7 | {{df_columns}}
8 |
9 | user:
10 |
11 | give me month over month sales differences by geo
12 |
13 | assistant:
14 | # Assuming your dataframe is named 'df'
15 | sales_diff_mom = df.groupby('geo')['sales_mom_diff'].sum()
16 | ans = sales_diff_mom
17 |
18 | user:
19 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/concat_result.jinja2:
--------------------------------------------------------------------------------
1 | Program:
2 | ```
3 | {{gen_program}}
4 | ```
5 |
6 | Result:
7 | ```
8 | {{format_out}}
9 | ```
10 |
11 | Insights:
12 | ```
13 | {{insights_out}}
14 | ```
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/generate_insights.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant that can generate insights from tabular data provided by the user. If there is no output, please respond, "There is not enough information".
3 |
4 | user:
5 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/output_format.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant that reformats text as markdown. Remove comments starting with the # character from the final output.
3 |
4 | user:
5 | b'geo US 728.761693 GA 595.842102 AL 119.118714 TX 111.349607 Name: sales_mom_diff, dtype: float64'
6 |
7 | assistant:
8 | geo US | 728.761693
9 | GA | 595.842102
10 | AL | 119.118714
11 | TX | 111.349607
12 |
13 | Name: sales_mom_diff, dtype: float64
14 |
15 | user:
16 | {{exec_output}}
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/output_result.jinja2:
--------------------------------------------------------------------------------
1 | ```
2 | {{format_out}}
3 | ```
4 | {{insights}}
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/readme.md:
--------------------------------------------------------------------------------
1 | # Promptflow instructions
2 |
3 | ### Begin by installing the promptflow extension in VS Code
4 |
5 |
6 | ### Follow instructions to install dependencies
7 |
8 |
9 | Ensure dependencies are installed:
10 |
11 |
12 | ### Open the example flow in the "Flows" window of the promptflow extension
13 |
14 |
15 | ### Add connection "aoai-connection"
16 |
17 |
18 | ### Assign a deployment name for all LLM steps
19 |
20 |
21 | ### Run the flow in interactive mode to see the output (an equivalent Python SDK sketch is included at the end of this readme).
22 | Try the sample question: "give me the sales index by period for CA"
23 |
24 |
25 |
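26 | ### (Optional) Run the flow from Python instead of VS Code
27 | As a rough sketch, the flow can also be tested with the promptflow Python SDK. The input name "question" below is an assumption based on the chat template; check flow.dag.yaml for the actual input names, and make sure the "aoai-connection" connection already exists.
28 | 
29 | ```python
30 | from promptflow import PFClient
31 | 
32 | pf = PFClient()
33 | # single test run of the flow in this folder with the sample question
34 | result = pf.test(flow=".", inputs={"question": "give me the sales index by period for CA"})
35 | print(result)
36 | ```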
--------------------------------------------------------------------------------
/Examples/promptflow/csv_example/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/promptflow/csv_example/requirements.txt
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/analyze_df.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import pandas as pd
3 | import subprocess
4 | import re
5 | 
6 |
7 | def score(csvfile:str, program: str) -> str:
8 | program = "import pandas as pd\ndf = pd.read_csv('{}')\n".format(csvfile)+program
9 | program += '\nprint(ans)'
10 | program = program.replace('\\n', '\n')
11 | print(program)
12 | result = subprocess.run(['python', '-c', program], capture_output=True)
13 | print(result)
14 | ans =str(result.stdout)
15 | return ans
16 | # The inputs section will change based on the arguments of the tool function, after you save the code
17 | # Adding type to arguments and return value will help the system show the types properly
18 | # Please update the function name/signature per need
19 | @tool
20 | def my_python_tool(csvfile:str, input_program: str) -> str:
21 | result = score(csvfile, input_program)
22 | #return_result = re.sub(r'[^\d.]+', '', result)
23 | return result
24 |
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/chat.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an assistant that helps generate Python code for analyzing tabular data.
3 | Please respond with pandas code; only provide valid pandas code, do not provide explanations.
4 | Assume the dataframe is loaded into the variable df. Assign the answer to the variable named "ans".
5 |
6 | dataframe columns:
7 | {{df_columns}}
8 |
9 | file:
10 | {{file_name}}
11 |
12 | user:
13 |
14 | give me month over month sales differences by geo
15 |
16 | assistant:
17 | # Assuming your dataframe is named 'df'
18 | sales_diff_mom = df.groupby('geo')['sales_mom_diff'].sum()
19 | ans = sales_diff_mom
20 |
21 | user:
22 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/concat_result.jinja2:
--------------------------------------------------------------------------------
1 | Program:
2 | ```
3 | {{gen_program}}
4 | ```
5 |
6 | Result:
7 | ```
8 | {{format_out}}
9 | ```
10 |
11 | Insights:
12 | ```
13 | {{insights_out}}
14 | ```
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/generate_insights.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant that can generate insights from tabular data provided by the user. If there is no output, please respond, "There is not enough information".
3 |
4 | user:
5 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/output_format.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant that reformats text as markdown. Remove comments starting with the # character from the final output.
3 |
4 | user:
5 | b'geo US 728.761693 GA 595.842102 AL 119.118714 TX 111.349607 Name: sales_mom_diff, dtype: float64'
6 |
7 | assistant:
8 | geo US | 728.761693
9 | GA | 595.842102
10 | AL | 119.118714
11 | TX | 111.349607
12 |
13 | Name: sales_mom_diff, dtype: float64
14 |
15 | user:
16 | {{exec_output}}
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/output_result.jinja2:
--------------------------------------------------------------------------------
1 | ```
2 | {{format_out}}
3 | ```
4 | {{insights}}
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/chat_csv_model/promptflow/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakeatmsft/AzureOpenAIExamples/222106ab4dc2e124e5ec16a5d8bf771eefd25ca8/Examples/promptflow/databricks_example/chat_csv_model/promptflow/requirements.txt
--------------------------------------------------------------------------------
/Examples/promptflow/databricks_example/readme.md:
--------------------------------------------------------------------------------
1 | #### Note: Make sure you are running on Databricks ML compute, to avoid dependency issues with MLflow.
2 |
3 | Step 1: Upload the "chat_csv_model" folder to your databricks workspace, along with the deploy_pf notebooks.
4 |
5 |
6 |
7 | Step 2: Open pf_register_model.ipynb. Run through the notebook to test the promptflow model and register it to your Databricks model registry.
8 | Be sure to replace all connection string info with your AOAI config.
9 |
10 |
11 | After execution of all steps you should see the model registered:
12 |
13 |
14 |
15 | Step 3: Open pf_test_model.ipynb to load the model from the registry and ensure you can execute it successfully. A rough loading sketch is included below.
16 |
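17 | As a rough illustration of Step 3 (the model name, version, and input column below are assumptions, not values taken from the notebooks):
18 | 
19 | ```python
20 | import mlflow
21 | import pandas as pd
22 | 
23 | # load the registered promptflow model from the Databricks model registry
24 | model = mlflow.pyfunc.load_model("models:/chat_csv_model/1")
25 | 
26 | # the expected input columns depend on the flow definition; "question" is assumed here
27 | print(model.predict(pd.DataFrame([{"question": "give me month over month sales differences by geo"}])))
28 | ```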
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/.promptflow/RetrieveDocuments.inputs.jsonl:
--------------------------------------------------------------------------------
1 | {"searchConnection":"AzureAISearch","embeddingModelConnection":"Default_AzureOpenAI","vectorFields":"contentVector","embeddingModelName":"text-embedding-ada-002","indexName":"dmv-index-full","ExtractIntent.output.search_intents":{"current_message_intent":"how old do I need to be to drive?","search_intents":"[\"how old do I need to be to drive?\"]"},"queryType":"vectorSimpleHybrid","semanticConfiguration":"None","topK":3}
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/.promptflow/RetrieveDocuments.node.log:
--------------------------------------------------------------------------------
1 | 2024-04-18 07:53:04 -0700 22736 execution.flow INFO Executing node RetrieveDocuments. node run id: b568b6e8-108a-4743-98e6-7ac371fb25d0_RetrieveDocuments_3ea5220e-f7d3-412a-b883-cd9b303ea478
2 | 2024-04-18 07:53:11 -0700 22736 execution.flow INFO Node RetrieveDocuments completes.
3 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/.promptflow/flow.layout.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeLayouts": {
3 | "inputs": {
4 | "x": 552.5,
5 | "y": 42,
6 | "index": -1
7 | },
8 | "outputs": {
9 | "x": 395,
10 | "y": 1114,
11 | "index": -1
12 | },
13 | "DetermineIntent": {
14 | "x": 480,
15 | "y": 176,
16 | "index": 0
17 | },
18 | "ExtractIntent": {
19 | "x": 645,
20 | "y": 310,
21 | "index": 1
22 | },
23 | "RetrieveDocuments": {
24 | "x": 820,
25 | "y": 444,
26 | "index": 2
27 | },
28 | "FormatRetrievedDocuments": {
29 | "x": 470,
30 | "y": 578,
31 | "index": 3
32 | },
33 | "FormatConversation": {
34 | "x": 140,
35 | "y": 578,
36 | "index": 4
37 | },
38 | "DetermineReply": {
39 | "x": 470,
40 | "y": 712,
41 | "index": 5
42 | },
43 | "FormatReply": {
44 | "x": 470,
45 | "y": 846,
46 | "index": 6
47 | },
48 | "output_prompt": {
49 | "x": 917.5,
50 | "y": 980,
51 | "index": 7
52 | }
53 | },
54 | "orientation": "Vertical"
55 | }
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/.promptflow/flow.log:
--------------------------------------------------------------------------------
1 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Start executing nodes in thread pool mode.
2 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Start to run 8 nodes with concurrency level 16.
3 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Executing node DetermineIntent. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_DetermineIntent_0
4 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Executing node FormatConversation. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_FormatConversation_0
5 | 2024-04-18 07:45:11 -0700 7572 execution.flow INFO Node FormatConversation completes.
6 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Node DetermineIntent completes.
7 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Executing node ExtractIntent. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_ExtractIntent_0
8 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Node ExtractIntent completes.
9 | 2024-04-18 07:45:12 -0700 7572 execution.flow INFO Executing node RetrieveDocuments. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_RetrieveDocuments_0
10 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Node RetrieveDocuments completes.
11 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Executing node FormatRetrievedDocuments. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_FormatRetrievedDocuments_0
12 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Node FormatRetrievedDocuments completes.
13 | 2024-04-18 07:45:16 -0700 7572 execution.flow INFO Executing node DetermineReply. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_DetermineReply_0
14 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Node DetermineReply completes.
15 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Executing node FormatReply. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_FormatReply_0
16 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Node FormatReply completes.
17 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Executing node output_prompt. node run id: f0dece6c-a97c-42c9-a938-a66b821d49f5_output_prompt_0
18 | 2024-04-18 07:45:38 -0700 7572 execution.flow INFO Node output_prompt completes.
19 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/.promptflow/flow.tools.json:
--------------------------------------------------------------------------------
1 | {
2 | "package": {},
3 | "code": {
4 | "DetermineIntent.jinja2": {
5 | "type": "llm",
6 | "inputs": {
7 | "query": {
8 | "type": [
9 | "string"
10 | ]
11 | },
12 | "chat_history": {
13 | "type": [
14 | "string"
15 | ]
16 | }
17 | },
18 | "source": "DetermineIntent.jinja2"
19 | },
20 | "ExtractIntent.py": {
21 | "type": "python",
22 | "inputs": {
23 | "input": {
24 | "type": [
25 | "string"
26 | ]
27 | },
28 | "query": {
29 | "type": [
30 | "string"
31 | ]
32 | }
33 | },
34 | "source": "ExtractIntent.py",
35 | "function": "extract_intent"
36 | },
37 | "RetrieveDocuments.py": {
38 | "type": "python",
39 | "inputs": {
40 | "queries": {
41 | "type": [
42 | "string"
43 | ]
44 | },
45 | "searchConnection": {
46 | "type": [
47 | "CognitiveSearchConnection"
48 | ]
49 | },
50 | "indexName": {
51 | "type": [
52 | "string"
53 | ]
54 | },
55 | "queryType": {
56 | "type": [
57 | "string"
58 | ]
59 | },
60 | "topK": {
61 | "type": [
62 | "int"
63 | ]
64 | },
65 | "semanticConfiguration": {
66 | "type": [
67 | "string"
68 | ]
69 | },
70 | "vectorFields": {
71 | "type": [
72 | "string"
73 | ]
74 | },
75 | "embeddingModelConnection": {
76 | "type": [
77 | "AzureOpenAIConnection"
78 | ]
79 | },
80 | "embeddingModelName": {
81 | "type": [
82 | "string"
83 | ]
84 | }
85 | },
86 | "source": "RetrieveDocuments.py",
87 | "function": "search"
88 | },
89 | "FormatRetrievedDocuments.py": {
90 | "type": "python",
91 | "inputs": {
92 | "docs": {
93 | "type": [
94 | "object"
95 | ]
96 | },
97 | "maxTokens": {
98 | "type": [
99 | "int"
100 | ]
101 | }
102 | },
103 | "source": "FormatRetrievedDocuments.py",
104 | "function": "format_retrieved_documents"
105 | },
106 | "FormatConversation.py": {
107 | "type": "python",
108 | "inputs": {
109 | "query": {
110 | "type": [
111 | "string"
112 | ]
113 | },
114 | "history": {
115 | "type": [
116 | "list"
117 | ]
118 | },
119 | "maxTokens": {
120 | "type": [
121 | "int"
122 | ]
123 | }
124 | },
125 | "source": "FormatConversation.py",
126 | "function": "format_conversation"
127 | },
128 | "DetermineReply.jinja2": {
129 | "type": "llm",
130 | "inputs": {
131 | "conversation": {
132 | "type": [
133 | "string"
134 | ]
135 | },
136 | "documentation": {
137 | "type": [
138 | "string"
139 | ]
140 | },
141 | "user_query": {
142 | "type": [
143 | "string"
144 | ]
145 | }
146 | },
147 | "source": "DetermineReply.jinja2"
148 | },
149 | "FormatReply.py": {
150 | "type": "python",
151 | "inputs": {
152 | "reply": {
153 | "type": [
154 | "string"
155 | ]
156 | }
157 | },
158 | "source": "FormatReply.py",
159 | "function": "format_reply"
160 | },
161 | "output_prompt.jinja2": {
162 | "type": "prompt",
163 | "inputs": {
164 | "reply": {
165 | "type": [
166 | "string"
167 | ]
168 | },
169 | "retrieveddocs": {
170 | "type": [
171 | "string"
172 | ]
173 | }
174 | },
175 | "source": "output_prompt.jinja2"
176 | }
177 | }
178 | }
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/DetermineIntent.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are provided with two sentences, first line is the intent of user's previous request, second line is the user's current message.
3 | You must generate the intent of the current message.
4 | You must divide the intent of the current message into multiple specific single search intents. The single intents should not include greetings.
5 | Each single intent should have as little overlap with other single intents as possible.
6 | You must generate single intents like the examples below, except when the user question does not have a specific intent, such as "hello" or "how are you?"
7 | If you cannot understand the single intent of the current message, you must use the latest message as the single intent.
8 | You must use the following examples as a guidance to understand the user's current intent.
9 | Your response format must adhere to the examples below.
10 | You don't need to include the previous intent in your response when the user shifts to a new topic.
11 |
12 | #Very important instruction
13 | When the user's current query shifts to a new topic, your response must change to the corresponding new topic.
14 | You must generate all single intents for the current message. If no single intent is generated, you must generate an empty list of single intents like [].
15 | You must keep the current message intent in the same language as the user's current query.
16 | - You must keep the single intents in the same language as the user's current query.
17 |
18 | [EXAMPLES]
19 | user:
20 | previous intent: what is OpenAI?
21 | current query: list the result in a table.
22 | assistant:
23 | Current Message Intent: what is OpenAI and list the result in a table?
24 | Single Intents: ["what is OpenAI?"]
25 | user:
26 | previous intent: what is OpenAI?
27 | current query: summarize the result into 2 sentences.
28 | assistant:
29 | Current Message Intent: what is OpenAI and summarize the result into 2 sentences.
30 | Single Intents: ["what is OpenAI?"]
31 | user:
32 | previous intent: how to query a database with C#
33 | current query: how about Python
34 | assistant:
35 | Current Message Intent: how to query a database with Python
36 | Single Intents: ["how to query a database with Python"]
37 | user:
38 | previous intent: Tell me about vm.
39 | current query: What is the price of it, office 365 and azure?
40 | assistant:
41 | Current Message Intent: What is the price of virtual machine, office 365 and azure?
42 | Single Intents: ["what is the price of virtual machine?", "what is the price of office 365?", "what is the price of azure?"]
43 | user:
44 | previous intent: None
45 | current query: aoai?
46 | assistant:
47 | Current Message Intent: What is aoai?
48 | Single Intents: ["what is Azure OpenAI?"]
49 | user:
50 | previous intent: what is IKEA?
51 | current query: hello
52 | assistant:
53 | Current Message Intent: hello
54 | Single Intents: []
55 | user:
56 | previous intent: what is IKEA?
57 | current query: What is azure ml? how can i create a new workspace?
58 | assistant:
59 | Current Message Intent: What is azure ml? How can i create a new workspace?
60 | Single Intents: ["what is azure ml?", "how can i create a new workspace in azure ml?"]
61 | [END EXAMPLES]
62 |
63 | user:
64 | previous intent: {{ chat_history[-1]["outputs"]["current_query_intent"] if chat_history else 'None' }}
65 | current query: {{query}}
66 | assistant:
67 | Current Message Intent:
68 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/ExtractIntent.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 | @tool
4 | def extract_intent(input: str, query: str) -> dict:
5 | entries = None
6 | if 'Single Intents:' in input:
7 | entries = input.split('Single Intents:', 2)
8 | elif 'Single Intent:' in input:
9 | entries = input.split('Single Intent:', 2)
10 |
11 | if entries and len(entries) == 2:
12 | return {
13 | "current_message_intent": entries[0].strip(),
14 | "search_intents": entries[1].strip()
15 | }
16 | return {
17 | "current_message_intent": query,
18 | "search_intents": query
19 | }
20 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/FormatConversation.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 | @tool
4 | def format_conversation(query: str, history: list, maxTokens: int) -> str:
5 | result = ""
6 | conversation_history = []
7 | for history_item in history:
8 | conversation_history.append({
9 | "speaker": "user",
10 | "message": history_item["inputs"]["query"]
11 | })
12 | conversation_history.append({
13 | "speaker": "assistant",
14 | "message": history_item["outputs"]["reply"]
15 | })
16 |
17 | # Start using context from history, starting from most recent, until token limit is reached.
18 | for turn in reversed(conversation_history):
19 | turnStr = format_turn(turn["speaker"], turn["message"])
20 | newResult = turnStr + result
21 | if estimate_tokens(newResult) > maxTokens:
22 | break
23 | result = newResult
24 | return result
25 |
26 | def format_turn(speaker: str, message: str) -> str:
27 | return f"{speaker}:\n{message}\n"
28 |
29 | def estimate_tokens(text: str) -> int:
30 | return (len(text) + 2) // 3  # rough estimate: about 3 characters per token
31 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/FormatReply.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 | @tool
4 | def format_reply(reply: str) -> str:
5 | reply = clean_markdown(reply)
6 | return reply
7 |
8 | def clean_markdown(input: str) -> str:
9 | start = 0
10 | inBlock = False
11 | result = ""
12 | while True:
13 | nextStart = input.find("```", start)
14 | if nextStart == -1:
15 | break
16 | result += input[start:nextStart]
17 | if inBlock:
18 | if nextStart > 0 and input[nextStart - 1] != '\n':
19 | result += "\n"
20 | result += "```\n"
21 | inBlock = False
22 | else:
23 | result += "```"
24 | inBlock = True
25 | start = nextStart + 3
26 | result += input[start:]
27 | if inBlock:
28 | result += "```"
29 | return result
30 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/FormatRetrievedDocuments.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 | @tool
4 | def format_retrieved_documents(docs: object, maxTokens: int) -> str:
5 | formattedDocs = []
6 | strResult = ""
7 | for index, doc in enumerate(docs):
8 | formattedDocs.append({
9 | f"[doc{index}]": {
10 | "title": doc['title'],
11 | "content": doc['content']
12 | }
13 | })
14 | formattedResult = { "retrieved_documents": formattedDocs }
15 | nextStrResult = str(formattedResult)
16 | if (estimate_tokens(nextStrResult) > maxTokens):
17 | break
18 | strResult = nextStrResult
19 |
20 | return strResult
21 |
22 | def estimate_tokens(text: str) -> int:
23 | return (len(text) + 2) // 3
24 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/RetrieveDocuments.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import requests
4 | from promptflow import tool
5 | from promptflow.connections import AzureOpenAIConnection
6 | from promptflow.connections import CognitiveSearchConnection
7 |
8 | fieldMap = {
9 | "id": ["id"],
10 | "url": ["url", "uri", "link", "document_link"],
11 | "filepath": ["filepath", "filename"],
12 | "content": ["content"]
13 | }
14 | titleRegex = re.compile(r"title: (.*)\n")
15 |
16 | def getIfString(doc, fieldName):
17 | try:
18 | value = doc.get(fieldName)
19 | if isinstance(value, str) and len(value) > 0:
20 | return value
21 | return None
22 | except:
23 | return None
24 |
25 | def get_truncated_string(string_value, max_length):
26 | return string_value[:max_length]
27 |
28 | def getTitle(doc):
29 | max_title_length = 150
30 | title = getIfString(doc, 'title')
31 | if title:
32 | return get_truncated_string(title, max_title_length)
33 | else:
34 | title = getIfString(doc, 'content')
35 | if title:
36 | titleMatch = titleRegex.search(title)
37 | if titleMatch:
38 | return get_truncated_string(titleMatch.group(1), max_title_length)
39 | else:
40 | return None
41 | else:
42 | return None
43 |
44 | def getChunkId(doc):
45 | chunk_id = getIfString(doc, 'chunk_id')
46 | return chunk_id
47 |
48 | def getSearchScore(doc):
49 | try:
50 | return doc['@search.score']
51 | except:
52 | return None
53 |
54 | def getQueryList(query):
55 | try:
56 | config = json.loads(query)
57 | return config
58 | except Exception:
59 | return [query]
60 |
61 | def process_search_docs_response(docs):
62 | outputs = []
63 | for doc in docs:
64 | formattedDoc = {}
65 | for fieldName in fieldMap.keys():
66 | for fromFieldName in fieldMap[fieldName]:
67 | fieldValue = getIfString(doc, fromFieldName)
68 | if fieldValue:
69 | formattedDoc[fieldName] = doc[fromFieldName]
70 | break
71 | formattedDoc['title'] = getTitle(doc)
72 | formattedDoc['chunk_id'] = getChunkId(doc)
73 | formattedDoc['search_score'] = getSearchScore(doc)
74 | outputs.append(formattedDoc)
75 | return outputs
76 |
77 | def get_query_embedding(query, endpoint, api_key, api_version, embedding_model_deployment):
78 | request_url = f"{endpoint}/openai/deployments/{embedding_model_deployment}/embeddings?api-version={api_version}"
79 | headers = {
80 | "Content-Type": "application/json",
81 | "api-key": api_key
82 | }
83 | request_payload = {
84 | 'input': query
85 | }
86 | embedding_response = requests.post(request_url, json = request_payload, headers = headers, timeout=None)
87 | if embedding_response.status_code == 200:
88 | data_values = embedding_response.json()["data"]
89 | embeddings_vectors = [data_value["embedding"] for data_value in data_values]
90 | return embeddings_vectors
91 | else:
92 | raise Exception(f"failed to get embedding: {embedding_response.json()}")
93 |
94 | def search_query_api(
95 | endpoint,
96 | api_key,
97 | api_version,
98 | index_name,
99 | query_type,
100 | query,
101 | top_k,
102 | embeddingModelConnection,
103 | embeddingModelName = None,
104 | semantic_configuration_name=None,
105 | vectorFields=None):
106 | request_url = f"{endpoint}/indexes/{index_name}/docs/search?api-version={api_version}"
107 | request_payload = {
108 | 'top': top_k,
109 | 'queryLanguage': 'en-us'
110 | }
111 | if query_type == 'simple':
112 | request_payload['search'] = query
113 | request_payload['queryType'] = query_type
114 | elif query_type == 'semantic':
115 | request_payload['search'] = query
116 | request_payload['queryType'] = query_type
117 | request_payload['semanticConfiguration'] = semantic_configuration_name
118 | elif query_type in ('vector', 'vectorSimpleHybrid', 'vectorSemanticHybrid'):
119 | if vectorFields and embeddingModelName:
120 | query_vectors = get_query_embedding(
121 | query,
122 | embeddingModelConnection["api_base"],
123 | embeddingModelConnection["api_key"],
124 | embeddingModelConnection["api_version"],
125 | embeddingModelName
126 | )
127 | payload_vectors = [{"value": query_vector, "fields": vectorFields, "k": top_k } for query_vector in query_vectors]
128 | request_payload['vectors'] = payload_vectors
129 |
130 | if query_type == 'vectorSimpleHybrid':
131 | request_payload['search'] = query
132 | elif query_type == 'vectorSemanticHybrid':
133 | request_payload['search'] = query
134 | request_payload['queryType'] = 'semantic'
135 | request_payload['semanticConfiguration'] = semantic_configuration_name
136 | else:
137 | raise Exception(f"unsupported query type: {query_type}")
138 |
139 | headers = {
140 | "Content-Type": "application/json",
141 | "api-key": api_key
142 | }
143 | retrieved_docs = requests.post(request_url, json = request_payload, headers = headers, timeout=None)
144 | if retrieved_docs.status_code == 200:
145 | return process_search_docs_response(retrieved_docs.json()["value"])
146 | else:
147 | raise Exception(f"failed to query search index : {retrieved_docs.json()}")
148 |
149 | @tool
150 | def search(queries: str, searchConnection: CognitiveSearchConnection, indexName: str, queryType: str, topK: int, semanticConfiguration: str, vectorFields: str, embeddingModelConnection: AzureOpenAIConnection, embeddingModelName: str):
151 | semanticConfiguration = semanticConfiguration if semanticConfiguration != "None" else None
152 | vectorFields = vectorFields if vectorFields != "None" else None
153 | embeddingModelName = embeddingModelName if embeddingModelName != "None" else None
154 |
155 | # Do search.
156 | allOutputs = [search_query_api(
157 | searchConnection['api_base'],
158 | searchConnection['api_key'],
159 | searchConnection['api_version'],
160 | indexName,
161 | queryType,
162 | query,
163 | topK,
164 | embeddingModelConnection,
165 | embeddingModelName,
166 | semanticConfiguration,
167 | vectorFields) for query in getQueryList(queries)]
168 |
169 | includedOutputs = []
170 | while allOutputs and len(includedOutputs) < topK:
171 | for output in list(allOutputs):
172 | if len(output) == 0:
173 | allOutputs.remove(output)
174 | continue
175 | value = output.pop(0)
176 | if value not in includedOutputs:
177 | includedOutputs.append(value)
178 | if len(includedOutputs) >= topK:
179 | break
180 | return includedOutputs
181 |
--------------------------------------------------------------------------------
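Two details of RetrieveDocuments.py worth noting: getQueryList lets the queries input be either a plain string or a JSON-encoded list of strings, and the search tool then runs one search per query and interleaves the results round-robin until topK documents are collected. A small illustration of the query parsing:

    # Illustration of getQueryList: a plain string is wrapped, a JSON list fans out.
    getQueryList('how do I renew my license')                 # -> ['how do I renew my license']
    getQueryList('["renew license", "license renewal fee"]')  # -> ['renew license', 'license renewal fee']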
/Examples/promptflow/dmv_copilot_flow/concat_reply.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 |
4 | # The inputs section will change based on the arguments of the tool function, after you save the code
5 | # Adding type to arguments and return value will help the system show the types properly
6 | # Please update the function name/signature per need
7 | @tool
8 | def my_python_tool(reply: str, docs: str) -> str:
9 | # NOTE: the original template returned 'hello ' + input1, which is undefined here;
10 | # returning the reply followed by the supporting docs is an assumed intent for this node.
11 | return reply + "\n\n" + docs
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/dmv_sample_qs.csv:
--------------------------------------------------------------------------------
1 | chat_history,question,answer,output
2 | [],What documents do I need to bring to renew my driver's license?,"To renew your driver's license, you need to bring one proof of identity and two proofs of residency.",
3 | [],How long does it take to get a new driver's license?,The context does not provide a specific answer to this question.,
4 | [],Can I schedule an appointment for my DMV visit?,"Yes, you can schedule an appointment for your DMV visit. You can make an appointment by calling 1-800-777-0133.",
5 | [],Can I take a driving test without an appointment?,"No, you cannot take a driving test without an appointment. You may make an appointment by calling 1-800-777-0133. Online appointments are not available for this type of driving test. ",
6 | [],What is the fee for a driver's license renewal?,The fee for a driver's license renewal is $48. ,
7 | [],Can I renew my registration online?,"Yes, you can renew your vehicle registration online. ",
8 | [],How do I transfer a vehicle title?,"To transfer a vehicle title in California, you must complete the process within 10 days of buying the vehicle and submit the necessary paperwork to the DMV",
9 | [],What is the process for getting a handicap placard?,"To get a handicap placard, you need to apply for one through your state’s Department of Motor Vehicles (DMV). Each state has its own forms and criteria for handicapped parking permits. You can get a handicapped parking application from the DMV office or online1. After completing and signing the form, you need to ask your health-care provider to fill out and sign the portion that certifies disability. Then, you can submit the application by mail or in person",
10 | [],How do I get a commercial driver's license?,The given context does not provide any information about how to get a commercial driver's license. It only provides information about the requirements and examinations for obtaining a noncommercial Class A license in California.,
11 | [],Can I renew my vehicle registration without a smog check?,"No, all vehicles registered in California are required to meet California requirements including vehicle emission controls in support of California’s clean air standards. DMV cannot register a vehicle if it does not qualify.",
12 | [],What is the process for getting a REAL ID?,"To get a REAL ID, one needs to bring the following documentation to their DMV office visit: 1) Identity/birth document, 2) Social Security number document, and 3) California Residency document(s) (two different documents). If the name on the identity document is different from the current legal name, a name change document(s) is also required. One can start their application online, upload the required documents, make a DMV appointment, and bring the confirmation code and original documents to the appointment.",
13 | [],Can I change my address on my driver's license online?,"Yes, you can change your address on your driver's license online by submitting a change of address online at dmv.ca.gov/addresschange ",
14 | [],How do I get a duplicate registration card?,"To get a duplicate registration card, you can use the DMV kiosks available at various locations such as grocery stores, DMV offices, and auto clubs. These kiosks allow you to complete your vehicle registration renewal, receive a replacement registration card or sticker, and submit proof of insurance, among other transactions ",
15 | [],Can I order personalized license plates?,"Yes, you can order personalized license plates. ",
16 | [],How do I apply for a driver's permit?,"To apply for a driver's permit, you need to follow these steps:
17 | 1. Complete a Driver's License & ID Card Application.
18 | 2. Provide the necessary documents, including proof of identity, two proofs of residency, a legal full name document (if the name on your identity document and application do not match), and your social security number (exceptions may apply).
19 | 3. Pay a non-refundable application fee.
20 | 4. Pass the knowledge test(s).
21 | 5. Pass a vision test.
22 | ",
23 | [],Can I take a motorcycle riding test at the DMV?,"Yes, you can take a motorcycle riding test at the DMV (Department of Motor Vehicles) to obtain a motorcycle only (Class M1/M2) license. Basic vehicle control skills are included in the motorcycle skills test to determine your ability to maneuver in normal traffic situations. The skills test may be waived for a person presenting a valid DL 389; however, you may be required to perform an observation test.",
24 | [],How do I get a non-driver identification card?,"To get a non-driver identification card, you must provide your identity document, residency documents, and social security number. You can complete an ID card application and get information on current acceptable documents and reduced-fee, no-fee, or senior ID cards by visiting dmv.ca.gov/id-cards",
25 | [],What is the process for surrendering my license plates?,The provided context does not contain information about the process for surrendering license plates.,
26 | [],Can I pay my fees with a credit card?,"Yes, it is possible to pay fees with a credit card. However, the source does not specify whether DMV kiosks accept credit cards as a form of payment. It is recommended to check with the specific DMV office or kiosk to confirm their accepted forms of payment. ",
27 | [],How do I report a lost or stolen driver's license?,Information about reporting a lost or stolen driver's license is not provided in the given context.,
28 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/output_prompt.jinja2:
--------------------------------------------------------------------------------
1 | {{reply}}
2 |
3 | Sources :
4 |
5 | {% for item in retrieveddocs %}
6 | doc_content: {{ item['content'] }}
7 | doc: {{ item['url'] }}
8 |
9 | {% endfor %}
10 |
--------------------------------------------------------------------------------
/Examples/promptflow/dmv_copilot_flow/requirements.txt:
--------------------------------------------------------------------------------
1 | promptflow==1.5.0
2 | promptflow-tools==1.2.0
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/.promptflow/flow.layout.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeLayouts": {
3 | "inputs": {
4 | "x": 290,
5 | "y": 42,
6 | "index": -1
7 | },
8 | "outputs": {
9 | "x": 220,
10 | "y": 310,
11 | "index": -1
12 | },
13 | "ag_test": {
14 | "x": 140,
15 | "y": 176,
16 | "index": 0
17 | }
18 | },
19 | "orientation": "Vertical"
20 | }
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/.promptflow/flow.log:
--------------------------------------------------------------------------------
1 | 2024-02-07 15:15:09 -0800 2468 execution.flow INFO Start executing nodes in thread pool mode.
2 | 2024-02-07 15:15:09 -0800 2468 execution.flow INFO Start to run 1 nodes with concurrency level 16.
3 | 2024-02-07 15:15:09 -0800 2468 execution.flow INFO Executing node ag_test. node run id: d8062490-4000-46ee-950f-dbb9eea7dd00_ag_test_0
4 | 2024-02-07 15:15:21 -0800 2468 execution ERROR Node ag_test in line 0 failed. Exception: Execution failure in 'ag_test': (NotFoundError) Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.
5 | Traceback (most recent call last):
6 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\flow_execution_context.py", line 194, in _invoke_tool_with_timer
7 | return f(**kwargs)
8 | ^^^^^^^^^^^
9 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\tracer.py", line 220, in wrapped
10 | output = func(*args, **kwargs)
11 | ^^^^^^^^^^^^^^^^^^^^^
12 | File "C:\repo\AzureOpenAIExamples\Examples\promptflow\finance_assistant_pf\ag_test.py", line 90, in my_python_tool
13 | assistant = client.beta.assistants.retrieve(assistant_id)
14 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
15 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\resources\beta\assistants\assistants.py", line 140, in retrieve
16 | return self._get(
17 | ^^^^^^^^^^
18 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\_base_client.py", line 1034, in get
19 | return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
20 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\_base_client.py", line 852, in request
22 | return self._request(
23 | ^^^^^^^^^^^^^^
24 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\openai\_base_client.py", line 933, in _request
25 | raise self._make_status_error_from_response(err.response) from None
26 | openai.NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}
27 |
28 | The above exception was the direct cause of the following exception:
29 |
30 | Traceback (most recent call last):
31 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\flow_execution_context.py", line 90, in invoke_tool
32 | result = self._invoke_tool_with_timer(node, f, kwargs)
33 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
34 | File "C:\Users\jacwang\AppData\Local\anaconda3\envs\autogen\Lib\site-packages\promptflow\_core\flow_execution_context.py", line 205, in _invoke_tool_with_timer
35 | raise ToolExecutionError(node_name=node_name, module=module) from e
36 | promptflow._core._errors.ToolExecutionError: Execution failure in 'ag_test': (NotFoundError) Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}
37 | 2024-02-07 15:15:21 -0800 2468 execution ERROR Execution of one node has failed. Cancelling all running nodes: ag_test.
38 |
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/.promptflow/flow.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "answer": [
3 | "user:\nwhat is the value of my portfolio\n",
4 | "assistant:\nTo determine the value of your portfolio, I will need a few key pieces of information:\n\n1. **Contents of the File**: I am not currently aware of the specific contents of the file you uploaded. Could you clarify whether it contains a list of investments (like stocks, bonds, etc.) with their respective quantities and the ticker symbols for any stocks?\n2. **Current Market Values**: To accurately assess the portfolio value, we will need the current market prices for each investment.\n\nOnce I understand the contents of the file, I can help retrieve the latest closing prices for any stocks or other traded assets within your portfolio. Then, I can calculate the total value of the portfolio.\n\nPlease provide the necessary details or give me permission to open and inspect the file to get the required information.\n"
5 | ]
6 | }
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/.promptflow/flow.tools.json:
--------------------------------------------------------------------------------
1 | {
2 | "package": {},
3 | "code": {
4 | "ag_test.py": {
5 | "type": "python",
6 | "inputs": {
7 | "connection": {
8 | "type": [
9 | "AzureOpenAIConnection"
10 | ]
11 | },
12 | "input1": {
13 | "type": [
14 | "string"
15 | ]
16 | },
17 | "assistant_id": {
18 | "type": [
19 | "string"
20 | ]
21 | }
22 | },
23 | "source": "ag_test.py",
24 | "function": "my_python_tool"
25 | }
26 | }
27 | }
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/ag_test.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | from promptflow.connections import AzureOpenAIConnection
3 |
4 | import html
5 | import io
6 | import os
7 | import time
8 | from datetime import datetime
9 | from pathlib import Path
10 | from typing import Iterable
11 |
12 | import requests
13 | import yfinance as yf
14 |
15 | from openai import AzureOpenAI
16 | from openai.types import FileObject
17 | from openai.types.beta import Thread
18 | from openai.types.beta.threads import Run
19 | from openai.types.beta.threads.message_content_image_file import MessageContentImageFile
20 | from openai.types.beta.threads.message_content_text import MessageContentText
21 | from openai.types.beta.threads.messages import MessageFile
22 | from PIL import Image
23 |
24 |
25 |
26 | # The inputs section will change based on the arguments of the tool function, after you save the code
27 | # Adding type to arguments and return value will help the system show the types properly
28 | # Please update the function name/signature per need
29 | @tool
30 | def my_python_tool(connection: AzureOpenAIConnection, input1: str, assistant_id: str) -> str:
31 | def get_stock_price(symbol: str) -> float:
32 | stock = yf.Ticker(symbol)
33 | return stock.history(period="1d")["Close"].iloc[-1]
34 |
35 |
36 | tools_list = [
37 | {"type": "code_interpreter"},
38 | {
39 | "type": "function",
40 | "function": {
41 | "name": "get_stock_price",
42 | "description": "Retrieve the latest closing price of a stock using its ticker symbol.",
43 | "parameters": {
44 | "type": "object",
45 | "properties": {"symbol": {"type": "string", "description": "The ticker symbol of the stock"}},
46 | "required": ["symbol"],
47 | },
48 | },
49 | },
50 | ]
51 |
52 | # DATA_FOLDER = "data/"
53 |
54 | # def upload_file(client: AzureOpenAI, path: str) -> FileObject:
55 | # with Path(path).open("rb") as f:
56 | # return client.files.create(file=f, purpose="assistants")
57 |
58 | def call_functions(client: AzureOpenAI, thread: Thread, run: Run) -> None:
59 | print("Function Calling")
60 | required_actions = run.required_action.submit_tool_outputs.model_dump()
61 | print(required_actions)
62 | tool_outputs = []
63 | import json
64 |
65 | for action in required_actions["tool_calls"]:
66 | func_name = action["function"]["name"]
67 | arguments = json.loads(action["function"]["arguments"])
68 |
69 | if func_name == "get_stock_price":
70 | output = get_stock_price(symbol=arguments["symbol"])
71 | tool_outputs.append({"tool_call_id": action["id"], "output": str(output)})
72 | else:
73 | raise ValueError(f"Unknown function: {func_name}")
74 |
75 | print("Submitting outputs back to the Assistant...")
76 | client.beta.threads.runs.submit_tool_outputs(thread_id=thread.id, run_id=run.id, tool_outputs=tool_outputs)
77 |
78 |
79 | client = AzureOpenAI(api_key=connection.api_key, api_version='2024-01-01-preview', azure_endpoint=connection.api_base)
80 |
81 | # arr = os.listdir(DATA_FOLDER)
82 | # assistant_files = []
83 | # for file in arr:
84 | # filePath = DATA_FOLDER + file
85 | # assistant_files.append(upload_file(client, filePath))
86 |
87 | # file_ids = [file.id for file in assistant_files]
88 |
89 |
90 | assistant = client.beta.assistants.retrieve(assistant_id)
91 | #assistant = client.beta.assistants.create(
92 | # name="Portfolio Management Assistant",
93 | # instructions="You are a personal securities trading assistant. Please be polite, professional, helpful, and friendly. "
94 | # + "Use the provided portfolio CSV file to answer the questions. If question is not related to the portfolio or you cannot answer the question, say, 'contact a representative for more assistance.'"
95 | # + "If the user asks for help or says 'help', provide a list of sample questions that you can answer.",
96 | # tools=tools_list,
97 | # model='gpt-4',
98 | # file_ids=file_ids,
99 | #)
100 |
101 | thread = client.beta.threads.create()
102 |
103 | def process_message(content: str) -> None:
104 | client.beta.threads.messages.create(thread_id=thread.id, role="user", content=content)
105 |
106 | run = client.beta.threads.runs.create(
107 | thread_id=thread.id,
108 | assistant_id=assistant.id,
109 | instructions="The current date and time is: " + datetime.now().strftime("%x %X") + ".",
110 | )
111 |
112 | print("processing...")
113 | while True:
114 | run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
115 | if run.status == "completed":
116 | messages = client.beta.threads.messages.list(thread_id=thread.id)
117 | return format_messages(messages)
118 | break
119 | if run.status == "failed":
120 | messages = client.beta.threads.messages.list(thread_id=thread.id)
121 | return format_messages(messages)
122 | # Handle failed
123 | break
124 | if run.status == "expired":
125 | # Handle expired
126 | break
127 | if run.status == "cancelled":
128 | # Handle cancelled
129 | break
130 | if run.status == "requires_action":
131 | call_functions(client, thread, run)
132 | else:
133 | time.sleep(5)
134 |
135 | def format_messages(messages: Iterable[MessageFile]) -> list:
136 | message_list = []
137 |
138 | # Get all the messages till the last user message
139 | for message in messages:
140 | message_list.append(message)
141 | if message.role == "user":
142 | break
143 |
144 | # Reverse the messages to show the last user message first
145 | message_list.reverse()
146 |
147 | # Print the user or Assistant messages or images
148 | return_msg = []
149 | for message in message_list:
150 | for item in message.content:
151 | # Determine the content type
152 | #if isinstance(item, MessageContentText):
153 | return_msg.append(f"{message.role}:\n{item.text.value}\n")
154 | # elif isinstance(item, MessageContentImageFile):
155 | # # Retrieve image from file id
156 | # response_content = client.files.content(item.image_file.file_id)
157 | # data_in_bytes = response_content.read()
158 | # # Convert bytes to image
159 | # readable_buffer = io.BytesIO(data_in_bytes)
160 | # image = Image.open(readable_buffer)
161 | # # Resize image to fit in terminal
162 | # width, height = image.size
163 | # image = image.resize((width // 2, height // 2), Image.LANCZOS)
164 | # # Display image
165 | # image.show()
166 | return return_msg
167 |
168 | return process_message(input1)
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/chat.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are a helpful assistant.
3 |
4 | {% for item in chat_history %}
5 | user:
6 | {{item.inputs.question}}
7 | assistant:
8 | {{item.outputs.answer}}
9 | {% endfor %}
10 |
11 | user:
12 | {{question}}
--------------------------------------------------------------------------------
/Examples/promptflow/finance_assistant_pf/data/portfolio.csv:
--------------------------------------------------------------------------------
1 | Symbol,Average_Cost,QTY
2 | MSFT,200,300
3 | AAPL,114,200
4 | AMZN,125,50
5 | TSLA,900,100
6 | NFLX,540,80
7 | NVDA,450,50
--------------------------------------------------------------------------------
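For reference, the question in flow.dag.yaml ("what is the value of my portfolio") can be answered directly from data/portfolio.csv using the same yfinance call that get_stock_price in ag_test.py wraps. A minimal sketch, assuming pandas and yfinance are installed and Yahoo Finance is reachable:

    # Sketch: mark the sample portfolio to market with the latest yfinance closes.
    import pandas as pd
    import yfinance as yf

    portfolio = pd.read_csv("data/portfolio.csv")
    portfolio["Last_Close"] = [
        yf.Ticker(symbol).history(period="1d")["Close"].iloc[-1]
        for symbol in portfolio["Symbol"]
    ]
    total_value = (portfolio["Last_Close"] * portfolio["QTY"]).sum()
    print(f"Portfolio value: ${total_value:,.2f}")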
/Examples/promptflow/finance_assistant_pf/flow.dag.yaml:
--------------------------------------------------------------------------------
1 | id: template_chat_flow
2 | name: Template Chat Flow
3 | environment:
4 | python_requirements_txt: requirements.txt
5 | inputs:
6 | chat_history:
7 | type: list
8 | default: []
9 | is_chat_input: false
10 | is_chat_history: true
11 | question:
12 | type: string
13 | default: what is the value of my portfolio
14 | is_chat_input: true
15 | outputs:
16 | answer:
17 | type: string
18 | reference: ${ag_test.output}
19 | is_chat_output: true
20 | nodes:
21 | - name: ag_test
22 | type: python
23 | source:
24 | type: code
25 | path: ag_test.py
26 | inputs:
27 | connection: Default_AzureOpenAI
28 | input1: ${inputs.question}
29 | assistant_id: asst_9t1k8YqEXYdsk6V565grn89e
30 | use_variants: false
31 |
--------------------------------------------------------------------------------
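With the inputs defined above (chat_history and question), the flow can be exercised locally before deployment. One way, assuming the promptflow CLI is installed and that the Default_AzureOpenAI connection and the assistant_id in flow.dag.yaml are valid in your environment:

    pf flow test --flow . --inputs question="what is the value of my portfolio"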
/Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.layout.json:
--------------------------------------------------------------------------------
1 | {
2 | "orientation": "Vertical",
3 | "nodeLayouts": {
4 | "inputs": {
5 | "x": 1407.5,
6 | "y": 42,
7 | "index": -1
8 | },
9 | "outputs": {
10 | "x": 1832.5,
11 | "y": 846,
12 | "index": -1
13 | },
14 | "gpt_coherence": {
15 | "x": 1315,
16 | "y": 444,
17 | "height": 75,
18 | "index": 0
19 | },
20 | "concat_scores": {
21 | "x": 1742.5,
22 | "y": 712,
23 | "height": 75,
24 | "index": 1
25 | },
26 | "gpt_similarity": {
27 | "x": 2732.5,
28 | "y": 444,
29 | "index": 2
30 | },
31 | "gpt_relevance": {
32 | "x": 2402.5,
33 | "y": 444,
34 | "height": 75,
35 | "index": 3
36 | },
37 | "gpt_fluency": {
38 | "x": 1742.5,
39 | "y": 444,
40 | "height": 75,
41 | "index": 4
42 | },
43 | "f1_score": {
44 | "x": 985,
45 | "y": 444,
46 | "height": 75,
47 | "index": 5
48 | },
49 | "gpt_groundedness": {
50 | "x": 2072.5,
51 | "y": 444,
52 | "height": 75,
53 | "index": 6
54 | },
55 | "aggregate_variants_results": {
56 | "x": 140,
57 | "y": 846,
58 | "index": 7
59 | },
60 | "select_metrics": {
61 | "x": 1490,
62 | "y": 176,
63 | "height": 75,
64 | "index": 8
65 | },
66 | "embeded_ground_truth": {
67 | "x": 305,
68 | "y": 444,
69 | "index": 9
70 | },
71 | "embeded_answer": {
72 | "x": 635,
73 | "y": 444,
74 | "height": 75,
75 | "index": 10
76 | },
77 | "ada_similarity": {
78 | "x": 635,
79 | "y": 578,
80 | "height": 75,
81 | "index": 11
82 | },
83 | "validate_input": {
84 | "x": 1402.5,
85 | "y": 310,
86 | "height": 75,
87 | "index": 12
88 | }
89 | },
90 | "viewport": {
91 | "transformMatrix": [
92 | 1,
93 | 0,
94 | 0,
95 | 1,
96 | -1117.46875,
97 | -76.83331888914108
98 | ]
99 | }
100 | }
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.log:
--------------------------------------------------------------------------------
1 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Start executing nodes in thread pool mode.
2 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Start to run 12 nodes with concurrency level 16.
3 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node select_metrics. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_select_metrics_0
4 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Node select_metrics completes.
5 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node validate_input. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_validate_input_0
6 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Node validate_input completes.
7 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_coherence' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_coherence}' is equal to 'True'.
8 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_similarity' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_similarity}' is equal to 'True'.
9 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_relevance' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_relevance}' is equal to 'True'.
10 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_fluency' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_fluency}' is equal to 'True'.
11 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'f1_score' will be executed because the activate condition is met, i.e. '${validate_input.output.f1_score}' is equal to 'True'.
12 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'gpt_groundedness' will be executed because the activate condition is met, i.e. '${validate_input.output.gpt_groundedness}' is equal to 'True'.
13 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'embeded_ground_truth' will be executed because the activate condition is met, i.e. '${validate_input.output.ada_similarity}' is equal to 'True'.
14 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO The node 'embeded_answer' will be executed because the activate condition is met, i.e. '${validate_input.output.ada_similarity}' is equal to 'True'.
15 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_coherence. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_coherence_0
16 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_similarity. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_similarity_0
17 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_relevance. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_relevance_0
18 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_fluency. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_fluency_0
19 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node f1_score. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_f1_score_0
20 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node gpt_groundedness. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_gpt_groundedness_0
21 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Node f1_score completes.
22 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node embeded_ground_truth. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_embeded_ground_truth_0
23 | 2024-04-08 21:13:05 +0000 40780 execution.flow INFO Executing node embeded_answer. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_embeded_answer_0
24 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_relevance completes.
25 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_similarity completes.
26 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_groundedness completes.
27 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_coherence completes.
28 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node gpt_fluency completes.
29 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node embeded_ground_truth completes.
30 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node embeded_answer completes.
31 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO The node 'ada_similarity' will be executed because the activate condition is met, i.e. '${validate_input.output.ada_similarity}' is equal to 'True'.
32 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Executing node ada_similarity. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_ada_similarity_0
33 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node ada_similarity completes.
34 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Executing node concat_scores. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_concat_scores_0
35 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node concat_scores completes.
36 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Start to run 1 nodes with concurrency level 16.
37 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Executing node aggregate_variants_results. node run id: ab9522af-1236-4fd9-aeab-e7bf848d2b59_aggregate_variants_results_reduce
38 | 2024-04-08 21:13:06 +0000 40780 execution.flow INFO Node aggregate_variants_results completes.
39 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.metrics.json:
--------------------------------------------------------------------------------
1 | {
2 | "gpt_coherence": 3.0,
3 | "gpt_similarity": 5.0,
4 | "gpt_fluency": 5.0,
5 | "gpt_relevance": 5.0,
6 | "gpt_groundedness": 5.0,
7 | "f1_score": 0.5,
8 | "ada_similarity": 0.93
9 | }
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "f1_score": 0.5,
3 | "gpt_coherence": 3.0,
4 | "gpt_similarity": 5.0,
5 | "gpt_fluency": 5.0,
6 | "gpt_relevance": 5.0,
7 | "gpt_groundedness": 5.0,
8 | "ada_similarity": 0.9313747004677263
9 | }
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/flow.uihint.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodes": {
3 | "select_metrics": {
4 | "variant_0": {
5 | "inputs": {},
6 | "inputsValue": {}
7 | }
8 | },
9 | "gpt_coherence": {
10 | "variant_0": {
11 | "inputs": {},
12 | "inputsValue": {}
13 | }
14 | },
15 | "validate_input": {
16 | "variant_0": {
17 | "inputs": {},
18 | "inputsValue": {}
19 | }
20 | },
21 | "ada_similarity": {
22 | "variant_0": {
23 | "inputs": {},
24 | "inputsValue": {}
25 | }
26 | },
27 | "embeded_answer": {
28 | "variant_0": {
29 | "inputs": {},
30 | "inputsValue": {}
31 | }
32 | },
33 | "gpt_relevance": {
34 | "variant_0": {
35 | "inputs": {},
36 | "inputsValue": {}
37 | }
38 | },
39 | "aggregate_variants_results": {
40 | "variant_0": {
41 | "inputs": {},
42 | "inputsValue": {}
43 | }
44 | },
45 | "concat_scores": {
46 | "variant_0": {
47 | "inputs": {},
48 | "inputsValue": {}
49 | }
50 | },
51 | "gpt_groundedness": {
52 | "variant_0": {
53 | "inputs": {},
54 | "inputsValue": {}
55 | }
56 | },
57 | "gpt_similarity": {
58 | "variant_0": {
59 | "inputs": {},
60 | "inputsValue": {}
61 | }
62 | },
63 | "gpt_fluency": {
64 | "variant_0": {
65 | "inputs": {},
66 | "inputsValue": {}
67 | }
68 | },
69 | "embeded_ground_truth": {
70 | "variant_0": {
71 | "inputs": {},
72 | "inputsValue": {}
73 | }
74 | },
75 | "f1_score": {
76 | "variant_0": {
77 | "inputs": {},
78 | "inputsValue": {}
79 | }
80 | }
81 | }
82 | }
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/README.md:
--------------------------------------------------------------------------------
1 | # Q&A Evaluation:
2 |
3 | The Q&A evaluation flow evaluates Q&A systems by leveraging state-of-the-art Large Language Models (LLMs) to measure the quality and safety of your responses. Using GPT and the GPT embedding model to assist with measurement aims to achieve higher agreement with human evaluation than traditional mathematical metrics.
4 |
5 | ## What you will learn
6 |
7 | The Q&A evaluation flow allows you to assess and evaluate your model with the LLM-assisted metrics and f1_score:
8 |
9 |
10 | * __gpt_coherence__: Measures the quality of all sentences in a model's predicted answer and how they fit together naturally.
11 |
12 | Coherence is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
13 |
14 | * __gpt_relevance__: Measures how relevant the model's predicted answers are to the questions asked.
15 |
16 | Relevance metric is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
17 |
18 | * __gpt_fluency__: Measures how grammatically and linguistically correct the model's predicted answer is.
19 |
20 | Fluency is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
21 |
22 | * __gpt_similarity__: Measures the similarity between the user-provided ground truth answer and the model's predicted answer.
23 |
24 | Similarity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
25 |
26 | * __gpt_groundedness__ (against context): Measures how grounded the model's predicted answers are in the provided context. Even if an LLM's responses are true, they are considered ungrounded if they cannot be verified against the context.
27 |
28 | Groundedness metric is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
29 |
30 | * __ada_similarity__: Measures the cosine similarity of ada embeddings of the model prediction and the ground truth.
31 |
32 | ada_similarity is a value in the range [0, 1].
33 |
34 | * __F1-score__: Computes the F1-score based on the tokens in the predicted answer and the ground truth.
35 |
36 | The f1-score evaluation flow allows you to determine the F1-score metric using the number of common tokens between the normalized versions of the ground truth and the predicted answer.
37 |
38 | F1-score is a value in the range [0, 1].
39 |
40 |
41 | ## Prerequisites
42 |
43 | - Connection: Azure OpenAI or OpenAI connection.
44 | - Data input: Evaluating the Coherence metric requires you to provide data inputs including a question, an answer, a ground truth, and a context.
45 |
46 | ## Tools used in this flow
47 | - LLM tool
48 | - Python tool
49 | - Embedding tool
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/ada_cosine_similarity_score.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import numpy as np
3 | from numpy.linalg import norm
4 |
5 |
6 | @tool
7 | def compute_ada_cosine_similarity(a, b) -> float:
8 | return np.dot(a, b)/(norm(a)*norm(b))
9 |
--------------------------------------------------------------------------------
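As a sanity check on the cosine-similarity tool above: identical embedding vectors score 1.0 and orthogonal vectors score 0.0. The toy vectors below are for illustration only; real inputs are ada embedding vectors.

    compute_ada_cosine_similarity([1.0, 0.0], [1.0, 0.0])  # -> 1.0
    compute_ada_cosine_similarity([1.0, 0.0], [0.0, 1.0])  # -> 0.0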
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/aggregate_variants_results.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from promptflow import tool, log_metric
3 | import numpy as np
4 |
5 |
6 | @tool
7 | def aggregate_variants_results(results: List[dict], metrics: List[str]):
8 | aggregate_results = {}
9 | for result in results:
10 | for name, value in result.items():
11 | if name in metrics[0]:
12 | if name not in aggregate_results.keys():
13 | aggregate_results[name] = []
14 | try:
15 | float_val = float(value)
16 | except Exception:
17 | float_val = np.nan
18 | aggregate_results[name].append(float_val)
19 |
20 | for name, value in aggregate_results.items():
21 | if name in metrics[0]:
22 | aggregate_results[name] = np.nanmean(value)
23 | aggregate_results[name] = round(aggregate_results[name], 2)
24 | log_metric(name, aggregate_results[name])
25 | return aggregate_results
26 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/concat_scores.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import numpy as np
3 | import re
4 |
5 |
6 | @tool
7 | def concat_results(gpt_coherence_score: str = None,
8 | gpt_similarity_score: str = None,
9 | gpt_fluency_score: str = None,
10 | gpt_relevance_score: str = None,
11 | gpt_groundedness_score: str = None,
12 | f1_score: float = None,
13 | ada_cosine_similarity: float = None):
14 |
15 | load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score},
16 | {'name': 'gpt_similarity', 'score': gpt_similarity_score},
17 | {'name': 'gpt_fluency', 'score': gpt_fluency_score},
18 | {'name': 'gpt_relevance', 'score': gpt_relevance_score},
19 | {'name': 'gpt_groundedness', 'score': gpt_groundedness_score},
20 | {'name': 'f1_score', 'score': f1_score},
21 | {'name': 'ada_similarity', 'score': ada_cosine_similarity}]
22 |
23 | scalar_metrics = ["f1_score", "ada_similarity"]
24 | score_list = []
25 | errors = []
26 | for item in load_list:
27 | if item["name"] in scalar_metrics:
28 | try:
29 | score = float(item["score"])
30 | except Exception as e:
31 | score = np.nan
32 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]})
33 | else:
34 | if item['score']:
35 | try:
36 | score = item["score"]
37 | match = re.search(r'\d', score)
38 | if match:
39 | score = float(match.group())
40 | else:
41 | score = np.nan
42 | except Exception as e:
43 | score = np.nan
44 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]})
45 | else:
46 | score = np.nan
47 | score_list.append({"name": item["name"], "score": score})
48 |
49 | variant_level_result = {}
50 | for item in score_list:
51 | item_name = str(item["name"])
52 | variant_level_result[item_name] = item["score"]
53 | if 'gpt' in item_name:
54 | variant_level_result[item_name + '_pass_rate'] = 1 if item["score"] > 3 else 0
55 | return variant_level_result
56 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/f1_score.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | from collections import Counter
3 |
4 |
5 | @tool
6 | def compute_f1_score(ground_truth: str, answer: str) -> float:
7 | import string
8 | import re
9 |
10 | class QASplitTokenizer:
11 | def __call__(self, line):
12 | """Tokenizes an input line using split() on whitespace
13 |
14 | :param line: a segment to tokenize
15 | :return: the tokenized line
16 | """
17 |
18 | return line.split()
19 |
20 | def normalize_text(text) -> str:
21 | """Lower text and remove punctuation, articles and extra whitespace."""
22 |
23 | def remove_articles(text):
24 | return re.sub(r"\b(a|an|the)\b", " ", text)
25 |
26 | def white_space_fix(text):
27 | return " ".join(text.split())
28 |
29 | def remove_punctuation(text):
30 | exclude = set(string.punctuation)
31 | return "".join(ch for ch in text if ch not in exclude)
32 |
33 | def lower(text):
34 | return text.lower()
35 |
36 | return white_space_fix(remove_articles(remove_punctuation(lower(text))))
37 | prediction_tokens = normalize_text(answer)
38 | reference_tokens = normalize_text(ground_truth)
39 | tokenizer = QASplitTokenizer()
40 | prediction_tokens = tokenizer(prediction_tokens)
41 | reference_tokens = tokenizer(reference_tokens)
42 |
43 | common_tokens = Counter(prediction_tokens) & Counter(reference_tokens)
44 | num_common_tokens = sum(common_tokens.values())
45 |
46 | if num_common_tokens == 0:
47 | f1 = 0.0
48 | else:
49 | precision = 1.0 * num_common_tokens / len(prediction_tokens)
50 | recall = 1.0 * num_common_tokens / len(reference_tokens)
51 |
52 | f1 = (2.0 * precision * recall) / (precision + recall)
53 |
54 | return f1
55 |
--------------------------------------------------------------------------------
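A worked example of the computation above: for ground truth "the cat sat" and answer "cat sat on the mat", normalization lowercases the text and drops the articles, the common tokens are "cat" and "sat", precision is 2/4, recall is 2/2, and F1 = 2 * (0.5 * 1.0) / (0.5 + 1.0) ≈ 0.67.

    # Worked example (values follow from the normalization and Counter overlap above)
    compute_f1_score(ground_truth="the cat sat", answer="cat sat on the mat")  # ≈ 0.667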
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/flow.meta.yaml:
--------------------------------------------------------------------------------
1 | $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
2 | name: qna_non_rag_eval
3 | display_name: QnA Evaluation
4 | type: evaluate
5 | path: ./flow.dag.yaml
6 | description: Compute the quality of the answer for the given question based on the ground_truth and the context
7 | properties:
8 | promptflow.stage: prod
9 | promptflow.details.type: markdown
10 | promptflow.details.source: README.md
11 | promptflow.batch_inputs: samples.json
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_coherence_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 |
4 | user:
5 | Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale:
6 | One star: the answer completely lacks coherence
7 | Two stars: the answer mostly lacks coherence
8 | Three stars: the answer is partially coherent
9 | Four stars: the answer is mostly coherent
10 | Five stars: the answer has perfect coherency
11 |
12 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
13 |
14 | question: What is your favorite indoor activity and why do you enjoy it?
15 | answer: I like pizza. The sun is shining.
16 | stars: 1
17 |
18 | question: Can you describe your favorite movie without giving away any spoilers?
19 | answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain.
20 | stars: 2
21 |
22 | question: What are some benefits of regular exercise?
23 | answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green.
24 | stars: 3
25 |
26 | question: How do you cope with stress in your daily life?
27 | answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities.
28 | stars: 4
29 |
30 | question: What can you tell me about climate change and its effects on the environment?
31 | answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
32 | stars: 5
33 |
34 | question: {{question}}
35 | answer: {{answer}}
36 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_fluency_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale:
5 | One star: the answer completely lacks fluency
6 | Two stars: the answer mostly lacks fluency
7 | Three stars: the answer is partially fluent
8 | Four stars: the answer is mostly fluent
9 | Five stars: the answer has perfect fluency
10 |
11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
12 |
13 | question: What did you have for breakfast today?
14 | answer: Breakfast today, me eating cereal and orange juice very good.
15 | stars: 1
16 |
17 | question: How do you feel when you travel alone?
18 | answer: Alone travel, nervous, but excited also. I feel adventure and like its time.
19 | stars: 2
20 |
21 | question: When was the last time you went on a family vacation?
22 | answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun.
23 | stars: 3
24 |
25 | question: What is your favorite thing about your job?
26 | answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories.
27 | stars: 4
28 |
29 | question: Can you describe your morning routine?
30 | answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am.
31 | stars: 5
32 |
33 | question: {{question}}
34 | answer: {{answer}}
35 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_groundedness_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating:
5 | 1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
6 | 2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
7 | 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation.
8 | Independent Examples:
9 | ## Example Task #1 Input:
10 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
11 | ## Example Task #1 Output:
12 | 1
13 | ## Example Task #2 Input:
14 | {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."}
15 | ## Example Task #2 Output:
16 | 5
17 | ## Example Task #3 Input:
18 | {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."}
19 | ## Example Task #3 Output:
20 | 5
21 | ## Example Task #4 Input:
22 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
23 | ## Example Task #4 Output:
24 | 1
25 | ## Actual Task Input:
26 | {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}}
27 | Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question.
28 | Actual Task Output:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_relevance_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale:
5 | One star: the answer completely lacks relevance
6 | Two stars: the answer mostly lacks relevance
7 | Three stars: the answer is partially relevant
8 | Four stars: the answer is mostly relevant
9 | Five stars: the answer has perfect relevance
10 |
11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
12 |
13 | context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize.
14 | question: What field did Marie Curie excel in?
15 | answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques.
16 | stars: 1
17 |
18 | context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history.
19 | question: Where were The Beatles formed?
20 | answer: The band The Beatles began their journey in London, England, and they changed the history of music.
21 | stars: 2
22 |
23 | context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere.
24 | question: What are the main goals of Perseverance Mars rover mission?
25 | answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars.
26 | stars: 3
27 |
28 | context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health.
29 | question: What are the main components of the Mediterranean diet?
30 | answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes.
31 | stars: 4
32 |
33 | context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty.
34 | question: What are the main attractions of the Queen's Royal Castle?
35 | answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty.
36 | stars: 5
37 |
38 | context: {{context}}
39 | question: {{question}}
40 | answer: {{answer}}
41 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/gpt_similarity_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale:
5 | One star: the predicted answer is not at all similar to the correct answer
6 | Two stars: the predicted answer is mostly not similar to the correct answer
7 | Three stars: the predicted answer is somewhat similar to the correct answer
8 | Four stars: the predicted answer is mostly similar to the correct answer
9 | Five stars: the predicted answer is completely similar to the correct answer
10 |
11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
12 |
13 | The examples below show the Equivalence score for a question, a correct answer, and a predicted answer.
14 |
15 | question: What is the role of ribosomes?
16 | correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins.
17 | predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules.
18 | stars: 1
19 |
20 | question: Why did the Titanic sink?
21 | correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life.
22 | predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts.
23 | stars: 2
24 |
25 | question: What causes seasons on Earth?
26 | correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns.
27 | predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions.
28 | stars: 3
29 |
30 | question: How does photosynthesis work?
31 | correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions.
32 | predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions.
33 | stars: 4
34 |
35 | question: What are the health benefits of regular exercise?
36 | correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood.
37 | predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood.
38 | stars: 5
39 |
40 | question: {{question}}
41 | correct answer:{{ground_truth}}
42 | predicted answer: {{answer}}
43 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/requirements.txt:
--------------------------------------------------------------------------------
1 | promptflow
2 | promptflow-tools
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/samples.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "question": "Which tent is the most waterproof?",
4 | "context": "From the our product list, the alpine explorer tent is the most waterproof. The Adventure Dining Table has higher weight.",
5 | "answer": "The Alpine Explorer Tent is the most waterproof.",
6 | "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m"
7 | }
8 | ]
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/select_metrics.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 |
4 | @tool
5 | def select_metrics(metrics: str) -> dict:
6 | supported_metrics = ('gpt_coherence', 'gpt_similarity', 'gpt_fluency', 'gpt_relevance', 'gpt_groundedness',
7 | 'f1_score', 'ada_similarity')
8 | user_selected_metrics = [metric.strip() for metric in metrics.split(',') if metric]
9 | metric_selection_dict = {}
10 | for metric in supported_metrics:
11 | if metric in user_selected_metrics:
12 | metric_selection_dict[metric] = True
13 | else:
14 | metric_selection_dict[metric] = False
15 | return metric_selection_dict
16 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/lkg_sources/validate_input.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 |
4 | @tool
5 | def validate_input(question: str, answer: str, context: str, ground_truth: str, selected_metrics: dict) -> dict:
6 | input_data = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth}
7 | expected_input_cols = set(input_data.keys())
8 | dict_metric_required_fields = {"gpt_groundedness": set(["answer", "context"]),
9 | "gpt_relevance": set(["question", "answer", "context"]),
10 | "gpt_coherence": set(["question", "answer"]),
11 | "gpt_similarity": set(["question", "answer", "ground_truth"]),
12 | "gpt_fluency": set(["question", "answer"]),
13 | "f1_score": set(["answer", "ground_truth"]),
14 | "ada_similarity": set(["answer", "ground_truth"])}
15 | actual_input_cols = set()
16 | for col in expected_input_cols:
17 | if input_data[col] and input_data[col].strip():
18 | actual_input_cols.add(col)
19 | data_validation = selected_metrics
20 | for metric in selected_metrics:
21 | if selected_metrics[metric]:
22 | metric_required_fields = dict_metric_required_fields[metric]
23 | if metric_required_fields <= actual_input_cols:
24 | data_validation[metric] = True
25 | else:
26 | data_validation[metric] = False
27 | return data_validation
28 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.promptflow/ux.inputs.json:
--------------------------------------------------------------------------------
1 | {"chat_list": []}
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/.runs/9ccd06e6-71ee-4b65-a04e-d9a6b525c0c7/flow.dag.yaml:
--------------------------------------------------------------------------------
1 | inputs:
2 | question:
3 | type: string
4 | default: Which tent is the most waterproof?
5 | is_chat_input: false
6 | answer:
7 | type: string
8 | default: The Alpine Explorer Tent is the most waterproof.
9 | is_chat_input: false
10 | context:
11 | type: string
12 | default: From our product list, the alpine explorer tent is the most
13 | waterproof. The Adventure Dining Table has higher weight.
14 | is_chat_input: false
15 | ground_truth:
16 | type: string
17 | default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m
18 | is_chat_input: false
19 | metrics:
20 | type: string
21 | default: gpt_groundedness,f1_score,gpt_fluency,gpt_coherence,gpt_similarity,gpt_relevance
22 | is_chat_input: false
23 | outputs:
24 | f1_score:
25 | type: string
26 | reference: ${concat_scores.output.f1_score}
27 | evaluation_only: false
28 | is_chat_output: false
29 | gpt_coherence:
30 | type: string
31 | reference: ${concat_scores.output.gpt_coherence}
32 | evaluation_only: false
33 | is_chat_output: false
34 | gpt_similarity:
35 | type: string
36 | reference: ${concat_scores.output.gpt_similarity}
37 | evaluation_only: false
38 | is_chat_output: false
39 | gpt_fluency:
40 | type: string
41 | reference: ${concat_scores.output.gpt_fluency}
42 | evaluation_only: false
43 | is_chat_output: false
44 | gpt_relevance:
45 | type: string
46 | reference: ${concat_scores.output.gpt_relevance}
47 | evaluation_only: false
48 | is_chat_output: false
49 | gpt_groundedness:
50 | type: string
51 | reference: ${concat_scores.output.gpt_groundedness}
52 | evaluation_only: false
53 | is_chat_output: false
54 | ada_similarity:
55 | type: string
56 | reference: ${concat_scores.output.ada_similarity}
57 | evaluation_only: false
58 | is_chat_output: false
59 | nodes:
60 | - name: gpt_coherence
61 | type: llm
62 | source:
63 | type: code
64 | path: gpt_coherence_prompt.jinja2
65 | inputs:
66 | deployment_name: gpt-35-turbo
67 | temperature: 0
68 | top_p: 1
69 | max_tokens: 1
70 | presence_penalty: 0
71 | frequency_penalty: 0
72 | answer: ${inputs.answer}
73 | question: ${inputs.question}
74 | provider: AzureOpenAI
75 | connection: Default_AzureOpenAI
76 | api: chat
77 | module: promptflow.tools.aoai
78 | aggregation: false
79 | activate:
80 | when: ${validate_input.output.gpt_coherence}
81 | is: true
82 | use_variants: false
83 | - name: concat_scores
84 | type: python
85 | source:
86 | type: code
87 | path: concat_scores.py
88 | inputs:
89 | ada_cosine_similarity: ${ada_similarity.output}
90 | f1_score: ${f1_score.output}
91 | gpt_coherence_score: ${gpt_coherence.output}
92 | gpt_fluency_score: ${gpt_fluency.output}
93 | gpt_groundedness_score: ${gpt_groundedness.output}
94 | gpt_relevance_score: ${gpt_relevance.output}
95 | gpt_similarity_score: ${gpt_similarity.output}
96 | aggregation: false
97 | use_variants: false
98 | - name: gpt_similarity
99 | type: llm
100 | source:
101 | type: code
102 | path: gpt_similarity_prompt.jinja2
103 | inputs:
104 | deployment_name: gpt-35-turbo
105 | temperature: 0
106 | top_p: 1
107 | max_tokens: 1
108 | presence_penalty: 0
109 | frequency_penalty: 0
110 | answer: ${inputs.answer}
111 | ground_truth: ${inputs.ground_truth}
112 | question: ${inputs.question}
113 | provider: AzureOpenAI
114 | connection: Default_AzureOpenAI
115 | api: chat
116 | module: promptflow.tools.aoai
117 | aggregation: false
118 | activate:
119 | when: ${validate_input.output.gpt_similarity}
120 | is: true
121 | use_variants: false
122 | - name: gpt_relevance
123 | type: llm
124 | source:
125 | type: code
126 | path: gpt_relevance_prompt.jinja2
127 | inputs:
128 | deployment_name: gpt-35-turbo
129 | temperature: 0
130 | top_p: 1
131 | max_tokens: 1
132 | presence_penalty: 0
133 | frequency_penalty: 0
134 | answer: ${inputs.answer}
135 | context: ${inputs.context}
136 | question: ${inputs.question}
137 | provider: AzureOpenAI
138 | connection: Default_AzureOpenAI
139 | api: chat
140 | module: promptflow.tools.aoai
141 | aggregation: false
142 | activate:
143 | when: ${validate_input.output.gpt_relevance}
144 | is: true
145 | use_variants: false
146 | - name: gpt_fluency
147 | type: llm
148 | source:
149 | type: code
150 | path: gpt_fluency_prompt.jinja2
151 | inputs:
152 | deployment_name: gpt-35-turbo
153 | temperature: 0
154 | top_p: 1
155 | max_tokens: 1
156 | presence_penalty: 0
157 | frequency_penalty: 0
158 | answer: ${inputs.answer}
159 | question: ${inputs.question}
160 | provider: AzureOpenAI
161 | connection: Default_AzureOpenAI
162 | api: chat
163 | module: promptflow.tools.aoai
164 | aggregation: false
165 | activate:
166 | when: ${validate_input.output.gpt_fluency}
167 | is: true
168 | use_variants: false
169 | - name: f1_score
170 | type: python
171 | source:
172 | type: code
173 | path: f1_score.py
174 | inputs:
175 | answer: ${inputs.answer}
176 | ground_truth: ${inputs.ground_truth}
177 | aggregation: false
178 | activate:
179 | when: ${validate_input.output.f1_score}
180 | is: true
181 | use_variants: false
182 | - name: gpt_groundedness
183 | type: llm
184 | source:
185 | type: code
186 | path: gpt_groundedness_prompt.jinja2
187 | inputs:
188 | deployment_name: gpt-35-turbo
189 | temperature: 0
190 | top_p: 1
191 | max_tokens: 1
192 | presence_penalty: 0
193 | frequency_penalty: 0
194 | answer: ${inputs.answer}
195 | context: ${inputs.context}
196 | provider: AzureOpenAI
197 | connection: Default_AzureOpenAI
198 | api: chat
199 | module: promptflow.tools.aoai
200 | aggregation: false
201 | activate:
202 | when: ${validate_input.output.gpt_groundedness}
203 | is: true
204 | use_variants: false
205 | - name: aggregate_variants_results
206 | type: python
207 | source:
208 | type: code
209 | path: aggregate_variants_results.py
210 | inputs:
211 | metrics: ${inputs.metrics}
212 | results: ${concat_scores.output}
213 | aggregation: true
214 | use_variants: false
215 | - name: select_metrics
216 | type: python
217 | source:
218 | type: code
219 | path: select_metrics.py
220 | inputs:
221 | metrics: ${inputs.metrics}
222 | aggregation: false
223 | use_variants: false
224 | - name: embeded_ground_truth
225 | type: python
226 | source:
227 | type: package
228 | tool: promptflow.tools.embedding.embedding
229 | inputs:
230 | connection: Default_AzureOpenAI
231 | deployment_name: text-embedding-ada-002
232 | input: ${inputs.ground_truth}
233 | aggregation: false
234 | activate:
235 | when: ${validate_input.output.ada_similarity}
236 | is: true
237 | use_variants: false
238 | - name: embeded_answer
239 | type: python
240 | source:
241 | type: package
242 | tool: promptflow.tools.embedding.embedding
243 | inputs:
244 | connection: Default_AzureOpenAI
245 | deployment_name: text-embedding-ada-002
246 | input: ${inputs.answer}
247 | aggregation: false
248 | activate:
249 | when: ${validate_input.output.ada_similarity}
250 | is: true
251 | use_variants: false
252 | - name: ada_similarity
253 | type: python
254 | source:
255 | type: code
256 | path: ada_cosine_similarity_score.py
257 | inputs:
258 | a: ${embeded_ground_truth.output}
259 | b: ${embeded_answer.output}
260 | aggregation: false
261 | activate:
262 | when: ${validate_input.output.ada_similarity}
263 | is: true
264 | use_variants: false
265 | - name: validate_input
266 | type: python
267 | source:
268 | type: code
269 | path: validate_input.py
270 | inputs:
271 | answer: ${inputs.answer}
272 | context: ${inputs.context}
273 | ground_truth: ${inputs.ground_truth}
274 | question: ${inputs.question}
275 | selected_metrics: ${select_metrics.output}
276 | aggregation: false
277 | use_variants: false
278 | node_variants: {}
279 | environment:
280 | python_requirements_txt: requirements.txt
281 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/README.md:
--------------------------------------------------------------------------------
1 | # Q&A Evaluation:
2 |
3 | The Q&A evaluation flow evaluates Q&A systems by leveraging state-of-the-art Large Language Models (LLMs) to measure the quality and safety of your responses. Using GPT and a GPT embedding model to assist with the measurements aims to achieve higher agreement with human evaluation than traditional mathematical measurements.
4 |
5 | ## What you will learn
6 |
7 | The Q&A evaluation flow allows you to assess your model with the following LLM-assisted metrics and f1_score:
8 |
9 |
10 | * __gpt_coherence__: Measures the quality of all sentences in a model's predicted answer and how they fit together naturally.
11 |
12 | Coherence is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
13 |
14 | * __gpt_relevance__: Measures how relevant the model's predicted answers are to the questions asked.
15 |
16 | The relevance metric is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
17 |
18 | * __gpt_fluency__: Measures how grammatically and linguistically correct the model's predicted answer is.
19 |
20 | Fluency is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
21 |
22 | * __gpt_similarity__: Measures the similarity between the user-provided ground truth answer and the model's predicted answer.
23 |
24 | Similarity is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
25 |
26 | * __gpt_groundedness__ (against context): Measures how grounded the model's predicted answers are in the given context. Even if the LLM's responses are factually true, they are considered ungrounded if they cannot be verified against the context.
27 |
28 | The groundedness metric is scored on a scale of 1 to 5, with 1 being the worst and 5 being the best.
29 |
30 | * __ada_similarity__: Measures the cosine similarity of ada embeddings of the model prediction and the ground truth.
31 |
32 | ada_similarity is a value in the range [0, 1].
33 |
34 | * __F1-score__: Computes the F1 score based on the tokens in the predicted answer and the ground truth.
35 |
36 | The F1 score is computed from the number of common tokens between the normalized ground truth and the normalized predicted answer.
37 |
38 | F1-score is a value in the range [0, 1].
39 |
40 |
41 | ## Prerequisites
42 |
43 | - Connection: Azure OpenAI or OpenAI connection.
44 | - Data input: Evaluating the full set of metrics requires you to provide a question, an answer, a ground truth, and a context; each individual metric only needs the subset of these inputs listed in validate_input.py.
45 |
46 | ## Tools used in this flow
47 | - LLM tool
48 | - Python tool
49 | - Embedding tool
--------------------------------------------------------------------------------
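The README above describes the flow end to end; as a quick way to exercise it, here is a minimal local-test sketch. It assumes the promptflow SDK's `PFClient.test` interface and reuses the default inputs from samples.json; the `Default_AzureOpenAI` connection and the `gpt-35-turbo` / `text-embedding-ada-002` deployments referenced in flow.dag.yaml must already exist in your environment.

```python
# Minimal local smoke test for the evaluation flow (sketch; assumes the promptflow
# SDK's PFClient.test interface and an existing "Default_AzureOpenAI" connection).
from promptflow import PFClient

pf_client = PFClient()

result = pf_client.test(
    flow="Examples/promptflow/model_as_judge_evaluator",  # folder containing flow.dag.yaml
    inputs={
        "question": "Which tent is the most waterproof?",
        "answer": "The Alpine Explorer Tent is the most waterproof.",
        "context": "From our product list, the alpine explorer tent is the most waterproof.",
        "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m",
        "metrics": "gpt_groundedness,f1_score,gpt_relevance",
    },
)
print(result)  # per-metric scores assembled by the concat_scores node
```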
/Examples/promptflow/model_as_judge_evaluator/ada_cosine_similarity_score.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import numpy as np
3 | from numpy.linalg import norm
4 |
5 |
6 | @tool
7 | def compute_ada_cosine_similarity(a, b) -> float:
8 | return np.dot(a, b)/(norm(a)*norm(b))
9 |
--------------------------------------------------------------------------------
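The tool above is plain cosine similarity over two embedding vectors. A toy illustration of the same formula follows (stand-in vectors only; the real inputs are the `text-embedding-ada-002` outputs wired in by the embeded_ground_truth and embeded_answer nodes):

```python
# Cosine similarity on stand-in vectors; real ada-002 embeddings are much longer.
import numpy as np
from numpy.linalg import norm

a = np.array([0.1, 0.3, 0.6])    # stand-in for the ground-truth embedding
b = np.array([0.1, 0.29, 0.61])  # stand-in for the answer embedding

# cos = (a . b) / (||a|| * ||b||), the same expression used by compute_ada_cosine_similarity
print(np.dot(a, b) / (norm(a) * norm(b)))  # ~0.9998 for these nearly identical vectors
```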
/Examples/promptflow/model_as_judge_evaluator/aggregate_variants_results.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from promptflow import tool, log_metric
3 | import numpy as np
4 |
5 |
6 | @tool
7 | def aggregate_variants_results(results: List[dict], metrics: List[str]):
8 | aggregate_results = {}
9 | for result in results:
10 | for name, value in result.items():
11 | if name in metrics[0]:
12 | if name not in aggregate_results.keys():
13 | aggregate_results[name] = []
14 | try:
15 | float_val = float(value)
16 | except Exception:
17 | float_val = np.nan
18 | aggregate_results[name].append(float_val)
19 |
20 | for name, value in aggregate_results.items():
21 | if name in metrics[0]:
22 | aggregate_results[name] = np.nanmean(value)
23 | aggregate_results[name] = round(aggregate_results[name], 2)
24 | log_metric(name, aggregate_results[name])
25 | return aggregate_results
26 |
--------------------------------------------------------------------------------
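To see what the aggregation node produces, here is a self-contained sketch with two fabricated per-line results. It assumes promptflow's aggregation mode passes list-valued inputs (one element per evaluated row), which is why the tool above indexes `metrics[0]`; unparseable scores become NaN and are ignored by the nanmean.

```python
import numpy as np

# Fabricated per-line outputs of concat_scores plus the repeated "metrics" input.
results = [
    {"gpt_coherence": 4.0, "f1_score": 0.5},
    {"gpt_coherence": "N/A", "f1_score": 0.75},  # unparseable score -> NaN
]
metrics = ["gpt_coherence,f1_score", "gpt_coherence,f1_score"]

aggregate = {}
for result in results:
    for name, value in result.items():
        if name in metrics[0]:
            try:
                val = float(value)
            except Exception:
                val = np.nan
            aggregate.setdefault(name, []).append(val)

print({name: round(float(np.nanmean(vals)), 2) for name, vals in aggregate.items()})
# {'gpt_coherence': 4.0, 'f1_score': 0.62}
```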
/Examples/promptflow/model_as_judge_evaluator/concat_scores.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | import numpy as np
3 | import re
4 |
5 |
6 | @tool
7 | def concat_results(gpt_coherence_score: str = None,
8 | gpt_similarity_score: str = None,
9 | gpt_fluency_score: str = None,
10 | gpt_relevance_score: str = None,
11 | gpt_groundedness_score: str = None,
12 | f1_score: float = None,
13 | ada_cosine_similarity: float = None):
14 |
15 | load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score},
16 | {'name': 'gpt_similarity', 'score': gpt_similarity_score},
17 | {'name': 'gpt_fluency', 'score': gpt_fluency_score},
18 | {'name': 'gpt_relevance', 'score': gpt_relevance_score},
19 | {'name': 'gpt_groundedness', 'score': gpt_groundedness_score},
20 | {'name': 'f1_score', 'score': f1_score},
21 | {'name': 'ada_similarity', 'score': ada_cosine_similarity}]
22 |
23 | scalar_metrics = ["f1_score", "ada_similarity"]
24 | score_list = []
25 | errors = []
26 | for item in load_list:
27 | if item["name"] in scalar_metrics:
28 | try:
29 | score = float(item["score"])
30 | except Exception as e:
31 | score = np.nan
32 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]})
33 | else:
34 | if item['score']:
35 | try:
36 | score = item["score"]
37 | match = re.search(r'\d', score)
38 | if match:
39 | score = float(match.group())
40 | else:
41 | score = np.nan
42 | except Exception as e:
43 | score = np.nan
44 | errors.append({"name": item["name"], "msg": str(e), "data": item["score"]})
45 | else:
46 | score = np.nan
47 | score_list.append({"name": item["name"], "score": score})
48 |
49 | variant_level_result = {}
50 | for item in score_list:
51 | item_name = str(item["name"])
52 | variant_level_result[item_name] = item["score"]
53 | if 'gpt' in item_name:
54 | variant_level_result[item_name + '_pass_rate'] = 1 if item["score"] > 3 else 0
55 | return variant_level_result
56 |
--------------------------------------------------------------------------------
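For a concrete picture of what concat_scores.py emits per row, a short sketch that replays its digit-extraction and pass-rate logic on fabricated LLM outputs (each judge node returns a single token because max_tokens is 1):

```python
import re
import numpy as np

# Fabricated raw outputs from the LLM judge nodes.
raw = {"gpt_coherence": "4", "gpt_relevance": "5", "gpt_groundedness": "oops"}

row = {}
for name, text in raw.items():
    match = re.search(r"\d", text or "")
    score = float(match.group()) if match else np.nan
    row[name] = score
    # Same convention as concat_scores.py: a gpt_* metric "passes" when its score is > 3.
    row[name + "_pass_rate"] = 1 if score > 3 else 0

print(row)
# {'gpt_coherence': 4.0, 'gpt_coherence_pass_rate': 1, 'gpt_relevance': 5.0,
#  'gpt_relevance_pass_rate': 1, 'gpt_groundedness': nan, 'gpt_groundedness_pass_rate': 0}
```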
/Examples/promptflow/model_as_judge_evaluator/f1_score.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 | from collections import Counter
3 |
4 |
5 | @tool
6 | def compute_f1_score(ground_truth: str, answer: str) -> float:
7 | import string
8 | import re
9 |
10 | class QASplitTokenizer:
11 | def __call__(self, line):
12 | """Tokenizes an input line using split() on whitespace
13 |
14 | :param line: a segment to tokenize
15 | :return: the tokenized line
16 | """
17 |
18 | return line.split()
19 |
20 | def normalize_text(text) -> str:
21 | """Lower text and remove punctuation, articles and extra whitespace."""
22 |
23 | def remove_articles(text):
24 | return re.sub(r"\b(a|an|the)\b", " ", text)
25 |
26 | def white_space_fix(text):
27 | return " ".join(text.split())
28 |
29 | def remove_punctuation(text):
30 | exclude = set(string.punctuation)
31 | return "".join(ch for ch in text if ch not in exclude)
32 |
33 | def lower(text):
34 | return text.lower()
35 |
36 | return white_space_fix(remove_articles(remove_punctuation(lower(text))))
37 | prediction_tokens = normalize_text(answer)
38 | reference_tokens = normalize_text(ground_truth)
39 | tokenizer = QASplitTokenizer()
40 | prediction_tokens = tokenizer(prediction_tokens)
41 | reference_tokens = tokenizer(reference_tokens)
42 |
43 | common_tokens = Counter(prediction_tokens) & Counter(reference_tokens)
44 | num_common_tokens = sum(common_tokens.values())
45 |
46 | if num_common_tokens == 0:
47 | f1 = 0.0
48 | else:
49 | precision = 1.0 * num_common_tokens / len(prediction_tokens)
50 | recall = 1.0 * num_common_tokens / len(reference_tokens)
51 |
52 | f1 = (2.0 * precision * recall) / (precision + recall)
53 |
54 | return f1
55 |
--------------------------------------------------------------------------------
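Worked numbers for the F1 computation above, using the default answer and ground truth from samples.json; the token lists are what the normalization (lowercase, strip punctuation and articles) yields.

```python
# Mirrors compute_f1_score on the sample inputs.
# answer tokens       -> ['alpine', 'explorer', 'tent', 'is', 'most', 'waterproof']          (6)
# ground-truth tokens -> ['alpine', 'explorer', 'tent', 'has', 'highest', 'rainfly',
#                         'waterproof', 'rating', 'at', '3000m']                             (10)
# common tokens: alpine, explorer, tent, waterproof -> 4
precision = 4 / 6
recall = 4 / 10
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 2))  # 0.5
```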
/Examples/promptflow/model_as_judge_evaluator/flow.dag.yaml:
--------------------------------------------------------------------------------
1 | inputs:
2 | question:
3 | type: string
4 | default: Which tent is the most waterproof?
5 | is_chat_input: false
6 | answer:
7 | type: string
8 | default: The Alpine Explorer Tent is the most waterproof.
9 | is_chat_input: false
10 | context:
11 | type: string
12 | default: From our product list, the alpine explorer tent is the most
13 | waterproof. The Adventure Dining Table has higher weight.
14 | is_chat_input: false
15 | ground_truth:
16 | type: string
17 | default: The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m
18 | is_chat_input: false
19 | metrics:
20 | type: string
21 | default: gpt_groundedness,f1_score,gpt_fluency,gpt_coherence,gpt_similarity,gpt_relevance,ada_similarity
22 | is_chat_input: false
23 | outputs:
24 | f1_score:
25 | type: string
26 | reference: ${concat_scores.output.f1_score}
27 | evaluation_only: false
28 | gpt_coherence:
29 | type: string
30 | reference: ${concat_scores.output.gpt_coherence}
31 | evaluation_only: false
32 | gpt_similarity:
33 | type: string
34 | reference: ${concat_scores.output.gpt_similarity}
35 | evaluation_only: false
36 | gpt_fluency:
37 | type: string
38 | reference: ${concat_scores.output.gpt_fluency}
39 | evaluation_only: false
40 | gpt_relevance:
41 | type: string
42 | reference: ${concat_scores.output.gpt_relevance}
43 | evaluation_only: false
44 | gpt_groundedness:
45 | type: string
46 | reference: ${concat_scores.output.gpt_groundedness}
47 | evaluation_only: false
48 | ada_similarity:
49 | type: string
50 | reference: ${concat_scores.output.ada_similarity}
51 | evaluation_only: false
52 | nodes:
53 | - name: gpt_coherence
54 | type: llm
55 | source:
56 | type: code
57 | path: gpt_coherence_prompt.jinja2
58 | inputs:
59 | deployment_name: gpt-35-turbo
60 | temperature: 0
61 | top_p: 1
62 | max_tokens: 1
63 | presence_penalty: 0
64 | frequency_penalty: 0
65 | answer: ${inputs.answer}
66 | question: ${inputs.question}
67 | provider: AzureOpenAI
68 | connection: Default_AzureOpenAI
69 | api: chat
70 | module: promptflow.tools.aoai
71 | aggregation: false
72 | activate:
73 | when: ${validate_input.output.gpt_coherence}
74 | is: true
75 | use_variants: false
76 | - name: concat_scores
77 | type: python
78 | source:
79 | type: code
80 | path: concat_scores.py
81 | inputs:
82 | gpt_coherence_score: ${gpt_coherence.output}
83 | gpt_similarity_score: ${gpt_similarity.output}
84 | gpt_fluency_score: ${gpt_fluency.output}
85 | gpt_relevance_score: ${gpt_relevance.output}
86 | gpt_groundedness_score: ${gpt_groundedness.output}
87 | f1_score: ${f1_score.output}
88 | ada_cosine_similarity: ${ada_similarity.output}
89 | aggregation: false
90 | use_variants: false
91 | - name: gpt_similarity
92 | type: llm
93 | source:
94 | type: code
95 | path: gpt_similarity_prompt.jinja2
96 | inputs:
97 | deployment_name: gpt-35-turbo
98 | temperature: 0
99 | top_p: 1
100 | max_tokens: 1
101 | presence_penalty: 0
102 | frequency_penalty: 0
103 | answer: ${inputs.answer}
104 | question: ${inputs.question}
105 | ground_truth: ${inputs.ground_truth}
106 | provider: AzureOpenAI
107 | connection: Default_AzureOpenAI
108 | api: chat
109 | module: promptflow.tools.aoai
110 | aggregation: false
111 | activate:
112 | when: ${validate_input.output.gpt_similarity}
113 | is: true
114 | use_variants: false
115 | - name: gpt_relevance
116 | type: llm
117 | source:
118 | type: code
119 | path: gpt_relevance_prompt.jinja2
120 | inputs:
121 | deployment_name: gpt-35-turbo
122 | temperature: 0
123 | top_p: 1
124 | max_tokens: 1
125 | presence_penalty: 0
126 | frequency_penalty: 0
127 | answer: ${inputs.answer}
128 | question: ${inputs.question}
129 | context: ${inputs.context}
130 | provider: AzureOpenAI
131 | connection: Default_AzureOpenAI
132 | api: chat
133 | module: promptflow.tools.aoai
134 | aggregation: false
135 | activate:
136 | when: ${validate_input.output.gpt_relevance}
137 | is: true
138 | use_variants: false
139 | - name: gpt_fluency
140 | type: llm
141 | source:
142 | type: code
143 | path: gpt_fluency_prompt.jinja2
144 | inputs:
145 | deployment_name: gpt-35-turbo
146 | temperature: 0
147 | top_p: 1
148 | max_tokens: 1
149 | presence_penalty: 0
150 | frequency_penalty: 0
151 | answer: ${inputs.answer}
152 | question: ${inputs.question}
153 | provider: AzureOpenAI
154 | connection: Default_AzureOpenAI
155 | api: chat
156 | module: promptflow.tools.aoai
157 | aggregation: false
158 | activate:
159 | when: ${validate_input.output.gpt_fluency}
160 | is: true
161 | use_variants: false
162 | - name: f1_score
163 | type: python
164 | source:
165 | type: code
166 | path: f1_score.py
167 | inputs:
168 | ground_truth: ${inputs.ground_truth}
169 | answer: ${inputs.answer}
170 | aggregation: false
171 | activate:
172 | when: ${validate_input.output.f1_score}
173 | is: true
174 | use_variants: false
175 | - name: gpt_groundedness
176 | type: llm
177 | source:
178 | type: code
179 | path: gpt_groundedness_prompt.jinja2
180 | inputs:
181 | deployment_name: gpt-35-turbo
182 | temperature: 0
183 | top_p: 1
184 | max_tokens: 1
185 | presence_penalty: 0
186 | frequency_penalty: 0
187 | answer: ${inputs.answer}
188 | context: ${inputs.context}
189 | provider: AzureOpenAI
190 | connection: Default_AzureOpenAI
191 | api: chat
192 | module: promptflow.tools.aoai
193 | aggregation: false
194 | activate:
195 | when: ${validate_input.output.gpt_groundedness}
196 | is: true
197 | use_variants: false
198 | - name: aggregate_variants_results
199 | type: python
200 | source:
201 | type: code
202 | path: aggregate_variants_results.py
203 | inputs:
204 | results: ${concat_scores.output}
205 | metrics: ${inputs.metrics}
206 | aggregation: true
207 | use_variants: false
208 | - name: select_metrics
209 | type: python
210 | source:
211 | type: code
212 | path: select_metrics.py
213 | inputs:
214 | metrics: ${inputs.metrics}
215 | aggregation: false
216 | use_variants: false
217 | - name: embeded_ground_truth
218 | type: python
219 | source:
220 | type: package
221 | tool: promptflow.tools.embedding.embedding
222 | inputs:
223 | connection: Default_AzureOpenAI
224 | deployment_name: text-embedding-ada-002
225 | input: ${inputs.ground_truth}
226 | aggregation: false
227 | activate:
228 | when: ${validate_input.output.ada_similarity}
229 | is: true
230 | use_variants: false
231 | - name: embeded_answer
232 | type: python
233 | source:
234 | type: package
235 | tool: promptflow.tools.embedding.embedding
236 | inputs:
237 | connection: Default_AzureOpenAI
238 | deployment_name: text-embedding-ada-002
239 | input: ${inputs.answer}
240 | aggregation: false
241 | activate:
242 | when: ${validate_input.output.ada_similarity}
243 | is: true
244 | use_variants: false
245 | - name: ada_similarity
246 | type: python
247 | source:
248 | type: code
249 | path: ada_cosine_similarity_score.py
250 | inputs:
251 | a: ${embeded_ground_truth.output}
252 | b: ${embeded_answer.output}
253 | aggregation: false
254 | activate:
255 | when: ${validate_input.output.ada_similarity}
256 | is: true
257 | use_variants: false
258 | - name: validate_input
259 | type: python
260 | source:
261 | type: code
262 | path: validate_input.py
263 | inputs:
264 | question: ${inputs.question}
265 | answer: ${inputs.answer}
266 | context: ${inputs.context}
267 | ground_truth: ${inputs.ground_truth}
268 | selected_metrics: ${select_metrics.output}
269 | aggregation: false
270 | use_variants: false
271 | node_variants: {}
272 | environment:
273 | python_requirements_txt: requirements.txt
274 |
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/flow.meta.yaml:
--------------------------------------------------------------------------------
1 | $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
2 | name: qna_non_rag_eval
3 | display_name: QnA Evaluation
4 | type: evaluate
5 | path: ./flow.dag.yaml
6 | description: Compute the quality of the answer for the given question based on the ground_truth and the context
7 | properties:
8 | promptflow.stage: prod
9 | promptflow.details.type: markdown
10 | promptflow.details.source: README.md
11 | promptflow.batch_inputs: samples.json
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/gpt_coherence_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 |
4 | user:
5 | Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale:
6 | One star: the answer completely lacks coherence
7 | Two stars: the answer mostly lacks coherence
8 | Three stars: the answer is partially coherent
9 | Four stars: the answer is mostly coherent
10 | Five stars: the answer has perfect coherency
11 |
12 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
13 |
14 | question: What is your favorite indoor activity and why do you enjoy it?
15 | answer: I like pizza. The sun is shining.
16 | stars: 1
17 |
18 | question: Can you describe your favorite movie without giving away any spoilers?
19 | answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain.
20 | stars: 2
21 |
22 | question: What are some benefits of regular exercise?
23 | answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green.
24 | stars: 3
25 |
26 | question: How do you cope with stress in your daily life?
27 | answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities.
28 | stars: 4
29 |
30 | question: What can you tell me about climate change and its effects on the environment?
31 | answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
32 | stars: 5
33 |
34 | question: {{question}}
35 | answer: {{answer}}
36 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/gpt_fluency_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale:
5 | One star: the answer completely lacks fluency
6 | Two stars: the answer mostly lacks fluency
7 | Three stars: the answer is partially fluent
8 | Four stars: the answer is mostly fluent
9 | Five stars: the answer has perfect fluency
10 |
11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
12 |
13 | question: What did you have for breakfast today?
14 | answer: Breakfast today, me eating cereal and orange juice very good.
15 | stars: 1
16 |
17 | question: How do you feel when you travel alone?
18 | answer: Alone travel, nervous, but excited also. I feel adventure and like its time.
19 | stars: 2
20 |
21 | question: When was the last time you went on a family vacation?
22 | answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun.
23 | stars: 3
24 |
25 | question: What is your favorite thing about your job?
26 | answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories.
27 | stars: 4
28 |
29 | question: Can you describe your morning routine?
30 | answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am.
31 | stars: 5
32 |
33 | question: {{question}}
34 | answer: {{answer}}
35 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/gpt_groundedness_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating:
5 | 1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
6 | 2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
7 | 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation.
8 | Independent Examples:
9 | ## Example Task #1 Input:
10 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
11 | ## Example Task #1 Output:
12 | 1
13 | ## Example Task #2 Input:
14 | {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."}
15 | ## Example Task #2 Output:
16 | 5
17 | ## Example Task #3 Input:
18 | {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."}
19 | ## Example Task #3 Output:
20 | 5
21 | ## Example Task #4 Input:
22 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
23 | ## Example Task #4 Output:
24 | 1
25 | ## Actual Task Input:
26 | {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}}
27 | Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question.
28 | Actual Task Output:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/gpt_relevance_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale:
5 | One star: the answer completely lacks relevance
6 | Two stars: the answer mostly lacks relevance
7 | Three stars: the answer is partially relevant
8 | Four stars: the answer is mostly relevant
9 | Five stars: the answer has perfect relevance
10 |
11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
12 |
13 | context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize.
14 | question: What field did Marie Curie excel in?
15 | answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques.
16 | stars: 1
17 |
18 | context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history.
19 | question: Where were The Beatles formed?
20 | answer: The band The Beatles began their journey in London, England, and they changed the history of music.
21 | stars: 2
22 |
23 | context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere.
24 | question: What are the main goals of Perseverance Mars rover mission?
25 | answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars.
26 | stars: 3
27 |
28 | context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health.
29 | question: What are the main components of the Mediterranean diet?
30 | answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes.
31 | stars: 4
32 |
33 | context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty.
34 | question: What are the main attractions of the Queen's Royal Castle?
35 | answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty.
36 | stars: 5
37 |
38 | context: {{context}}
39 | question: {{question}}
40 | answer: {{answer}}
41 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/gpt_similarity_prompt.jinja2:
--------------------------------------------------------------------------------
1 | system:
2 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
3 | user:
4 | Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale:
5 | One star: the predicted answer is not at all similar to the correct answer
6 | Two stars: the predicted answer is mostly not similar to the correct answer
7 | Three stars: the predicted answer is somewhat similar to the correct answer
8 | Four stars: the predicted answer is mostly similar to the correct answer
9 | Five stars: the predicted answer is completely similar to the correct answer
10 |
11 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
12 |
13 | The examples below show the Equivalence score for a question, a correct answer, and a predicted answer.
14 |
15 | question: What is the role of ribosomes?
16 | correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins.
17 | predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules.
18 | stars: 1
19 |
20 | question: Why did the Titanic sink?
21 | correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life.
22 | predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts.
23 | stars: 2
24 |
25 | question: What causes seasons on Earth?
26 | correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns.
27 | predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions.
28 | stars: 3
29 |
30 | question: How does photosynthesis work?
31 | correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions.
32 | predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions.
33 | stars: 4
34 |
35 | question: What are the health benefits of regular exercise?
36 | correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood.
37 | predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood.
38 | stars: 5
39 |
40 | question: {{question}}
41 | correct answer:{{ground_truth}}
42 | predicted answer: {{answer}}
43 | stars:
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/requirements.txt:
--------------------------------------------------------------------------------
1 | promptflow
2 | promptflow-tools
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/samples.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "question": "Which tent is the most waterproof?",
4 | "context": "From the our product list, the alpine explorer tent is the most waterproof. The Adventure Dining Table has higher weight.",
5 | "answer": "The Alpine Explorer Tent is the most waterproof.",
6 | "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m"
7 | }
8 | ]
--------------------------------------------------------------------------------
/Examples/promptflow/model_as_judge_evaluator/select_metrics.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 |
4 | @tool
5 | def select_metrics(metrics: str) -> dict:
6 | supported_metrics = ('gpt_coherence', 'gpt_similarity', 'gpt_fluency', 'gpt_relevance', 'gpt_groundedness',
7 | 'f1_score', 'ada_similarity')
8 | user_selected_metrics = [metric.strip() for metric in metrics.split(',') if metric]
9 | metric_selection_dict = {}
10 | for metric in supported_metrics:
11 | if metric in user_selected_metrics:
12 | metric_selection_dict[metric] = True
13 | else:
14 | metric_selection_dict[metric] = False
15 | return metric_selection_dict
16 |
--------------------------------------------------------------------------------
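A quick usage sketch for the metric-selection tool above, showing how the comma-separated metrics input becomes the boolean map that validate_input and the nodes' activate conditions consume:

```python
# Inline re-run of the select_metrics logic on a sample "metrics" flow input.
supported = ('gpt_coherence', 'gpt_similarity', 'gpt_fluency', 'gpt_relevance',
             'gpt_groundedness', 'f1_score', 'ada_similarity')
requested = "gpt_groundedness, f1_score"

selected = [m.strip() for m in requested.split(',') if m]
selection = {m: m in selected for m in supported}
print(selection)  # gpt_groundedness and f1_score are True; every other supported metric is False
```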
/Examples/promptflow/model_as_judge_evaluator/validate_input.py:
--------------------------------------------------------------------------------
1 | from promptflow import tool
2 |
3 |
4 | @tool
5 | def validate_input(question: str, answer: str, context: str, ground_truth: str, selected_metrics: dict) -> dict:
6 | input_data = {"question": question, "answer": answer, "context": context, "ground_truth": ground_truth}
7 | expected_input_cols = set(input_data.keys())
8 | dict_metric_required_fields = {"gpt_groundedness": set(["answer", "context"]),
9 | "gpt_relevance": set(["question", "answer", "context"]),
10 | "gpt_coherence": set(["question", "answer"]),
11 | "gpt_similarity": set(["question", "answer", "ground_truth"]),
12 | "gpt_fluency": set(["question", "answer"]),
13 | "f1_score": set(["answer", "ground_truth"]),
14 | "ada_similarity": set(["answer", "ground_truth"])}
15 | actual_input_cols = set()
16 | for col in expected_input_cols:
17 | if input_data[col] and input_data[col].strip():
18 | actual_input_cols.add(col)
19 | data_validation = selected_metrics
20 | for metric in selected_metrics:
21 | if selected_metrics[metric]:
22 | metric_required_fields = dict_metric_required_fields[metric]
23 | if metric_required_fields <= actual_input_cols:
24 | data_validation[metric] = True
25 | else:
26 | data_validation[metric] = False
27 | return data_validation
28 |
--------------------------------------------------------------------------------
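A sketch of how the validation above gates metrics when an input is missing: with an empty context, the context-dependent metrics are switched off even though they were requested.

```python
# Hypothetical row with no context; groundedness and relevance require context and are disabled.
selected = {"gpt_groundedness": True, "gpt_relevance": True, "gpt_fluency": True, "f1_score": True}
required = {
    "gpt_groundedness": {"answer", "context"},
    "gpt_relevance": {"question", "answer", "context"},
    "gpt_fluency": {"question", "answer"},
    "f1_score": {"answer", "ground_truth"},
}
inputs = {
    "question": "Which tent is the most waterproof?",
    "answer": "The Alpine Explorer Tent is the most waterproof.",
    "context": "",  # missing on this row
    "ground_truth": "The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m",
}

present = {name for name, value in inputs.items() if value and value.strip()}
print({m: required[m] <= present for m, on in selected.items() if on})
# {'gpt_groundedness': False, 'gpt_relevance': False, 'gpt_fluency': True, 'f1_score': True}
```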
/Examples/promptflow/new_chat_flow/.promptflow/chat.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "answer": "Of course! Hello world! How can I help you today?"
3 | }
--------------------------------------------------------------------------------
/Examples/promptflow/new_chat_flow/.promptflow/flow.detail.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/Examples/promptflow/new_chat_flow/.promptflow/flow.layout.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeLayouts": {
3 | "inputs": {
4 | "x": 290,
5 | "y": 42,
6 | "index": -1
7 | },
8 | "outputs": {
9 | "x": 220,
10 | "y": 310,
11 | "index": -1
12 | },
13 | "chat": {
14 | "x": 140,
15 | "y": 176,
16 | "index": 0
17 | }
18 | },
19 | "orientation": "Vertical"
20 | }
--------------------------------------------------------------------------------
/Examples/promptflow/new_chat_flow/.promptflow/flow.log:
--------------------------------------------------------------------------------
1 | 2024-04-18 07:31:22 -0700 20828 execution.flow INFO Start executing nodes in thread pool mode.
2 | 2024-04-18 07:31:22 -0700 20828 execution.flow INFO Start to run 1 nodes with concurrency level 16.
3 | 2024-04-18 07:31:22 -0700 20828 execution.flow INFO Executing node chat. node run id: d517789a-3106-4079-b5ec-0b752ab42dac_chat_0
4 | 2024-04-18 07:31:24 -0700 20828 execution.flow WARNING Output of chat is not json serializable, use str to store it.
5 | 2024-04-18 07:31:24 -0700 20828 execution.flow INFO Node chat completes.
6 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Start executing nodes in thread pool mode.
7 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Start to run 1 nodes with concurrency level 16.
8 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Executing node chat. node run id: 7d66c026-7f12-4bd1-b992-82cb60a1274e_chat_0
9 | 2024-04-18 07:31:38 -0700 20828 execution.flow WARNING Output of chat is not json serializable, use str to store it.
10 | 2024-04-18 07:31:38 -0700 20828 execution.flow INFO Node chat completes.
11 |
--------------------------------------------------------------------------------
/Examples/promptflow/new_chat_flow/.promptflow/flow.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "answer": "Hello! I'm an AI assistant, so I don't have feelings, but I'm here to help you with any questions or tasks you have. How can I assist you today?"
3 | }
--------------------------------------------------------------------------------
/Examples/promptflow/prompt_test/.promptflow/chat.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "unformatted_meta_reply": "You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!",
3 | "formatted_meta_reply": "You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!"
4 | }
--------------------------------------------------------------------------------
/Examples/promptflow/prompt_test/.promptflow/flow.layout.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodeLayouts": {
3 | "inputs": {
4 | "x": 455,
5 | "y": 42,
6 | "index": -1
7 | },
8 | "outputs": {
9 | "x": 385,
10 | "y": 310,
11 | "index": -1
12 | },
13 | "unformatted_meta": {
14 | "x": 140,
15 | "y": 176,
16 | "index": 0
17 | },
18 | "formatted_meta": {
19 | "x": 470,
20 | "y": 176,
21 | "index": 1
22 | }
23 | },
24 | "orientation": "Vertical"
25 | }
--------------------------------------------------------------------------------
/Examples/promptflow/prompt_test/.promptflow/flow.log:
--------------------------------------------------------------------------------
1 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Start executing nodes in thread pool mode.
2 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Start to run 2 nodes with concurrency level 16.
3 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Executing node unformatted_meta. node run id: 1b587084-d370-44e6-ba18-ffe4b42b321d_unformatted_meta_0
4 | 2024-04-17 13:17:07 -0700 33752 execution.flow INFO Executing node formatted_meta. node run id: 1b587084-d370-44e6-ba18-ffe4b42b321d_formatted_meta_0
5 | 2024-04-17 13:17:08 -0700 33752 execution.flow INFO Node unformatted_meta completes.
6 | 2024-04-17 13:17:08 -0700 33752 execution.flow INFO Node formatted_meta completes.
7 |
--------------------------------------------------------------------------------
/Examples/promptflow/prompt_test/.promptflow/flow.output.json:
--------------------------------------------------------------------------------
1 | {
2 | "unformatted_meta_reply": "You must be at least 16 years old to get a driver's license in California. [source1]",
3 | "formatted_meta_reply": "In California, you must be at least 16 years old to obtain a driver's license [source: California DMV website]."
4 | }
--------------------------------------------------------------------------------
/Examples/promptflow/prompt_test/.promptflow/flow.uihint.json:
--------------------------------------------------------------------------------
1 | {
2 | "nodes": {
3 | "formatted_meta": {
4 | "variant_0": {
5 | "inputs": {
6 | "question": [],
7 | "chat_history": []
8 | }
9 | }
10 | },
11 | "unformatted_meta": {
12 | "variant_0": {
13 | "inputs": {
14 | "question": [],
15 | "chat_history": []
16 | }
17 | }
18 | }
19 | }
20 | }
--------------------------------------------------------------------------------
/Examples/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 |
--------------------------------------------------------------------------------