├── FollowRAG
    ├── eval
    │   ├── if_eval.py
    │   ├── main_eval.py
    │   └── rag_eval.py
    ├── followRAG
    │   ├── atom_instruction.json
    │   ├── followRAG_full.json
    │   ├── followRAG_ifhq.json
    │   ├── followRAG_ifnq.json
    │   ├── followRAG_iftq.json
    │   └── followRAG_ifwebq.json
    ├── readme.md
    ├── requirements.txt
    ├── scripts
    │   └── eval.sh
    └── utils
    │   ├── call_llm.py
    │   ├── instruction_following_eval
    │       ├── instructions.py
    │       ├── instructions_registry.py
    │       └── instructions_util.py
    │   └── util.py
├── README.md
└── VIF-RAG
    ├── LICENSE
    ├── requirements.txt
    ├── sample_data
        ├── augment_instructions.txt
        ├── cross_validation.jsonl
        ├── dpo_query_eval_score_results.jsonl
        ├── dpo_query_w_funcs.jsonl
        ├── eval_func_rft.jsonl
        ├── multi_instruction.txt
        ├── multi_instruction_score.jsonl
        ├── query_rft.jsonl
        ├── query_rft_score.jsonl
        ├── query_score_filter.jsonl
        └── seed_instruction.txt
    └── vifrag.ipynb


/FollowRAG/eval/if_eval.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Evaluate the instruction following score of the response
  3 | """
  4 | import os
  5 | import sys
  6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  7 | from utils.util import read_jsonl
  8 | from copy import deepcopy
  9 | from utils.instruction_following_eval import instructions_registry
 10 | 
 11 | def eval_instruction_following_strict(dp):
 12 |     """Strictly evaluate the instruction following of a data point."""
 13 |     response=dp['response']
 14 |     instruction_list=dp['instruction_id_list']
 15 |     is_following_list = []
 16 | 
 17 |     for index, instruction_id in enumerate(instruction_list):
 18 |         instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
 19 |         instruction = instruction_cls(instruction_id)
 20 |         instruction.build_description(**dp['kwargs'][index])
 21 |         args = instruction.get_instruction_args()
 22 |         if args and "prompt" in args:
 23 |             instruction.build_description(prompt=dp['prompt'])
 24 | 
 25 |         if response.strip() and instruction.check_following(response):
 26 |             is_following_list.append(True)
 27 |         else:
 28 |             is_following_list.append(False)
 29 | 
 30 |     return is_following_list
 31 | 
 32 | def eval_instruction_following_loose(dp):
 33 |   """Loosely evaluate the instruction following of a data point."""
 34 |   response = dp['response']
 35 |   r = response.split("\n")
 36 |   response_remove_first = "\n".join(r[1:]).strip()
 37 |   response_remove_last = "\n".join(r[:-1]).strip()
 38 |   response_remove_both = "\n".join(r[1:-1]).strip()
 39 |   revised_response = response.replace("*", "")
 40 |   revised_response_remove_first = response_remove_first.replace("*", "")
 41 |   revised_response_remove_last = response_remove_last.replace("*", "")
 42 |   revised_response_remove_both = response_remove_both.replace("*", "")
 43 |   all_responses = [
 44 |       response,
 45 |       revised_response,
 46 |       response_remove_first,
 47 |       response_remove_last,
 48 |       response_remove_both,
 49 |       revised_response_remove_first,
 50 |       revised_response_remove_last,
 51 |       revised_response_remove_both,
 52 |   ]
 53 |   instruction_list = dp['instruction_id_list']
 54 |   is_following_list = []
 55 | 
 56 |   for index, instruction_id in enumerate(instruction_list):
 57 |     instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
 58 |     instruction = instruction_cls(instruction_id)
 59 | 
 60 |     instruction.build_description(**dp['kwargs'][index])
 61 |     args = instruction.get_instruction_args()
 62 |     if args and "prompt" in args:
 63 |       instruction.build_description(prompt=dp['prompt'])
 64 | 
 65 |     is_following = False
 66 |     for r in all_responses:
 67 |       if r.strip() and instruction.check_following(r):
 68 |         is_following = True
 69 |         break
 70 | 
 71 |     is_following_list.append(is_following)
 72 | 
 73 |   return is_following_list
 74 | 
 75 | def if_eval_main(data):
 76 |     data_if_evaled=[]
 77 |     for dp in data:
 78 |         # Evaluate each sample.
 79 |         strict_is_following_list=eval_instruction_following_strict(dp)
 80 |         loose_is_following_list=eval_instruction_following_loose(dp)
 81 |         dp_if_evaled=deepcopy(dp)
 82 |         dp_if_evaled['if_eval']={'strict_follow_all_instructions':all(strict_is_following_list),
 83 |                         'loose_follow_all_instructions':all(loose_is_following_list),
 84 |                         'strict_follow_instructions':strict_is_following_list,
 85 |                         'loose_follow_instructions':loose_is_following_list
 86 |                         }
 87 |         data_if_evaled.append(dp_if_evaled)
 88 |     # Calculate the total score.
 89 |     def calc_score(data_if_evaled):
 90 |         strict_prompt_score=round(sum([dp['if_eval']['strict_follow_all_instructions'] for dp in data_if_evaled])*100/len(data_if_evaled),2)
 91 |         loose_prompt_score=round(sum([dp['if_eval']['loose_follow_all_instructions'] for dp in data_if_evaled])*100/len(data_if_evaled),2)
 92 |         strict_follow_instructions,loose_follow_instructions=[],[]
 93 |         for dp in data_if_evaled:
 94 |             strict_follow_instructions+=dp['if_eval']['strict_follow_instructions']
 95 |             loose_follow_instructions+=dp['if_eval']['loose_follow_instructions']
 96 |         strict_instruction_score=round(sum(strict_follow_instructions)*100/len(strict_follow_instructions),2)
 97 |         loose_instruction_score=round(sum(loose_follow_instructions)*100/len(loose_follow_instructions),2)
 98 |         if_eval_score={'strict_prompt_score':strict_prompt_score,'loose_prompt_score':loose_prompt_score,'strict_instruction_score':strict_instruction_score,'loose_instruction_score':loose_instruction_score}
 99 |         return if_eval_score
100 |     # Calculate the score for each category of IFNQ.
101 |     if_eval_scores={"all":calc_score(data_if_evaled),
102 |                     "ifnq":calc_score([dp for dp in data_if_evaled if 'ifnq' in dp['type']]),
103 |                     "iftq":calc_score([dp for dp in data_if_evaled if 'iftq' in dp['type']]),
104 |                     "ifhq":calc_score([dp for dp in data_if_evaled if 'ifhq' in dp['type']]),
105 |                     "ifwebq":calc_score([dp for dp in data_if_evaled if 'ifwebq' in dp['type']])
106 |                     }
107 |     print("if_eval_scores:",if_eval_scores)
108 |     return data_if_evaled,if_eval_scores
109 | 
110 | if __name__=="__main__":
111 |     file_path="The path to the JSONL file that has already been inferred and needs to be evaluated, where the response field needs to be added to each field in `followRAG_full`."
112 |     data=read_jsonl(file_path)
113 |     if_eval_main(data)
114 | 


--------------------------------------------------------------------------------
/FollowRAG/eval/main_eval.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 4 | import argparse
 5 | from eval.if_eval import if_eval_main
 6 | from eval.rag_eval import rag_eval_main
 7 | from utils.util import read_jsonl,save_jsonl,get_current_time
 8 | 
 9 | def get_args():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument("--input_file_path", type=str, 
12 |                         help="The path to the JSONL file that has already been inferred and needs to be evaluated, where the response field needs to be added to each field in `followRAG_full`.")
13 |     parser.add_argument("--rag_eval_type", type=str, choices=["mini","all"],
14 |                         help="Since the evaluation of RAG scores requires calling GPT, this parameter is used to specify the type of RAG evaluation. 'mini' indicates that only a sample of the evaluation will be conducted.")
15 |     parser.add_argument("--output_file_path", type=str, help="The path to save the evaluation results")
16 |     parser.add_argument("--result_log_file_path", type=str, help="The path to save the result score log")
17 |     args = parser.parse_args()
18 |     return args
19 | 
20 | def main(args):
21 |     log={}
22 |     log["time"]=get_current_time()
23 |     log["eval_file"]=args.input_file_path
24 |     print("eval:",args.input_file_path)
25 |     # eval
26 |     data=read_jsonl(read_file_path=args.input_file_path)
27 |     data_if_evaled,if_eval_result=if_eval_main(data)
28 |     data_if_rag_evaled,rag_eval_result=rag_eval_main(data_if_evaled,eval_type=args.rag_eval_type)
29 |     # marge data_if_evaled and data_if_rag_evaled
30 |     data_evaled=[]
31 |     data_if_rag_evaled= {item['key']: item for item in data_if_rag_evaled}
32 |     for item in data_if_evaled:
33 |         if item['key'] in data_if_rag_evaled:
34 |             data_evaled.append(data_if_rag_evaled[item['key']])
35 |         else:
36 |             data_evaled.append(item)
37 |     # log
38 |     log["eval_score"]={"all":{"if":if_eval_result["all"]["loose_instruction_score"],
39 |                               "rag":rag_eval_result["score_all"],
40 |                               "avg":round((if_eval_result["all"]["loose_instruction_score"]+rag_eval_result["score_all"])/2,2)},
41 |                        "ifnq":{"if":if_eval_result["ifnq"]["loose_instruction_score"],
42 |                                "rag":rag_eval_result["score_ifnq"],
43 |                                "avg":round((if_eval_result["all"]["loose_instruction_score"]+rag_eval_result["score_ifnq"])/2,2)},
44 |                        "iftq":{"if":if_eval_result["iftq"]["loose_instruction_score"],
45 |                                "rag":rag_eval_result["score_iftq"],
46 |                                "avg":round((if_eval_result["iftq"]["loose_instruction_score"]+rag_eval_result["score_iftq"])/2,2)},
47 |                        "ifhq":{"if":if_eval_result["ifhq"]["loose_instruction_score"],
48 |                                "rag":rag_eval_result["score_ifhq"],
49 |                                "avg":round((if_eval_result["ifhq"]["loose_instruction_score"]+rag_eval_result["score_ifhq"])/2,2)},
50 |                        "ifwebq":{"if":if_eval_result["ifwebq"]["loose_instruction_score"],
51 |                                  "rag":rag_eval_result["score_ifwebq"],
52 |                                  "avg":round((if_eval_result["ifwebq"]["loose_instruction_score"]+rag_eval_result["score_ifwebq"])/2,2)},
53 |                        }
54 |     log["if_eval_result"]=if_eval_result
55 |     log["rag_eval_result"]=rag_eval_result
56 |     log["detailed_args"]=args.__dict__
57 |     print(log)
58 |     # save eval results
59 |     save_jsonl(args.output_file_path,data_evaled,mode='w')
60 |     save_jsonl(args.result_log_file_path,[log],mode='a')
61 | 
62 | if __name__=="__main__":
63 |     args=get_args()
64 |     main(args)


--------------------------------------------------------------------------------
/FollowRAG/eval/rag_eval.py:
--------------------------------------------------------------------------------
  1 | """
  2 | response RAG score based on RAG document Q&A
  3 | """
  4 | import os
  5 | import sys
  6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  7 | import re
  8 | from copy import deepcopy
  9 | from tqdm import tqdm
 10 | from utils.call_llm import call_gpt
 11 | from utils.util import read_jsonl
 12 | 
 13 | # Fixed mini test set ID to avoid different randomness
 14 | ids_eval_mini=[6, 25, 27, 30, 32, 44, 46, 57, 67, 71, 73, 80, 81, 89, 94, 95, 99, 103, 104, 114, 127, 142, 159, 163, 166, 167, 175, 196, 203, 214, 216, 217, 220, 223, 224, 225, 228, 233, 234, 238, 250, 270, 273, 274, 276, 281, 284, 296, 300, 322, 323, 332, 344, 348, 352, 363, 367, 370, 373, 379, 387, 388, 389, 405, 410, 429, 432, 459, 464, 469, 470, 473, 511, 517, 546, 549, 558, 565, 570, 574, 580, 591, 603, 604, 616, 618, 623, 633, 643, 654, 660, 665, 666, 671, 683, 687, 688, 691, 692, 695, 700, 703, 711, 719, 721, 748, 759, 765, 770, 780, 787, 793, 808, 812, 814, 817, 823, 828, 831, 841, 842, 846, 856, 861, 863, 865, 869, 882, 903, 905, 916, 924, 930, 945, 946, 952, 953, 957, 969, 971, 973, 1000, 1005, 1014, 1019, 1031, 1046, 1048, 1070, 1071, 1082, 1090, 1094, 1108, 1132, 1133, 1138, 1145, 1148, 1162, 1164, 1179, 1186, 1197, 1200, 1205, 1212, 1219, 1221, 1229, 1240, 1241, 1243, 1245, 1249, 1251, 1252, 1262, 1266, 1267, 1274, 1280, 1297, 1298, 1310, 1313, 1323, 1340, 1342, 1344, 1352, 1356, 1361, 1375, 1376, 1379, 1380, 1389, 1393, 1398, 1402, 1407, 1415, 1432, 1451, 1455, 1458, 1459, 1460, 1462, 1469, 1470, 1472, 1477, 1482, 1495, 1496, 1499, 1500, 1511, 1535, 1543, 1558, 1561, 1568, 1570, 1587, 1590, 1594, 1596, 1618, 1619, 1622, 1625, 1634, 1635, 1640, 1642, 1643, 1648, 1654, 1655, 1671, 1685, 1692, 1703, 1721, 1738, 1747, 1762, 1788, 1799, 1810, 1812, 1816, 1820, 1832, 1833, 1841, 1853, 1859, 1865, 1873, 1878, 1884, 1888, 1892, 1897, 1898, 1914, 1920, 1926, 1943, 1949, 1952, 1953, 1955, 1963, 1969, 1984, 1990, 1993, 1998, 2002, 2013, 2044, 2046, 2057, 2061, 2063, 2064, 2069, 2074, 2075, 2078, 2084, 2085, 2091, 2093, 2095, 2103, 2108, 2109, 2136, 2140, 2142, 2145, 2152, 2170, 2174, 2175, 2177, 2183, 2194, 2202, 2206, 2209, 2214, 2218, 2222, 2230, 2233, 2234, 2235, 2237, 2251, 2252, 2259, 2261, 2265, 2308, 2309, 2315, 2318, 2344, 2350, 2352, 2357, 2367, 2368, 2370, 2371, 2378, 2383, 2388, 2391, 2406, 2407, 2409, 2415, 2421, 2423, 2441, 2451, 2457, 2466, 2473, 2478, 2505, 2513, 2529, 2533, 2537, 2540, 2548, 2552, 2568, 2569, 2600, 2617, 2618, 2635, 2641, 2650, 2656, 2658, 2664, 2665, 2666, 2667, 2674, 2676, 2678, 2683, 2692, 2696, 2697, 2708, 2719, 2726, 2734, 2736, 2745, 2758, 2760, 2772, 2773, 2780, 2785, 2793]
 15 | 
 16 | eval_prompt=\
 17 | """
 18 | Please act as an impartial judge and perform the task: 
 19 | Given a [Question], you need to evaluate whether the [Response] correctly answers or hits the correct answer, and output your judgment after [Judge]. I will provide a correct answer [Reference] as a reference.
 20 | 
 21 | Scoring criteria:
 22 | - If the [Response] is completely correct and aligns with the correct answer, it scores 1 point; 
 23 | - If the [Response] partially answers correctly, it scores 0.5 point; 
 24 | - If the [response] is completely incorrect compared to the [Reference], it scores 0 point.
 25 | 
 26 | Note:
 27 | - Your only evaluation criterion is whether the [Response] correctly answered the answer, regardless of the format, language, case, length, etc., of the [Response]. Besides, providing more information than the [Reference] in the [Response] cannot be a reason for point deduction.
 28 | - Use the [Reference] as the correct answer reference rather than your own knowledge.
 29 | - The rating reply must strictly follow the format below: "Rating: [judge_score]\nReason: [judge_reason]", and do not output any other content. For example: "Rating: [0]\nReason: [Response and Reference are completely unrelated.]". Ensure that judge_score and judge_reason are enclosed in [].
 30 | 
 31 | [Question]
 32 | {question}
 33 | 
 34 | [Reference]
 35 | {answer_gold}
 36 | 
 37 | [Response]
 38 | {response}
 39 | 
 40 | [Judge]
 41 | """
 42 | 
 43 | def construct_eval_prompt(dp,eval_column):
 44 |     input_prompt=eval_prompt.format(
 45 |         question=dp['question'],
 46 |         answer_gold=dp['answer_gold'],
 47 |         #response=dp[eval_column][0],
 48 |         response=dp[eval_column],
 49 |     )
 50 |     return input_prompt
 51 | 
 52 | def extract_rating(text):
 53 |     """Extract the Rating score from the evaluation results."""
 54 |     # Use regular expressions to match values in Rating (can only be 0, 0.5, or 1)
 55 |     match = re.search(r'Rating: \[(0|0\.5|1)\]', text)
 56 |     
 57 |     # Judge and return matching results
 58 |     if match:
 59 |         return float(match.group(1))
 60 |     else:
 61 |         return False
 62 | 
 63 | def gpt_rating(dp,eval_column):
 64 |     max_try=10 # If the extraction format is incorrect, try again
 65 |     cur_try=0
 66 |     input_prompt=construct_eval_prompt(dp,eval_column)
 67 |     while cur_try < max_try:
 68 |         try:
 69 |             output=call_gpt(llm_model_name='gpt-4o-0513',
 70 |                             message=input_prompt)
 71 |             if output==None:
 72 |                 return (0,"judge error")
 73 |             rating_score=extract_rating(output)
 74 |             if rating_score is not False:
 75 |                 return (rating_score,output)
 76 |             cur_try += 1
 77 |         except:
 78 |             cur_try+=1
 79 |     return (0,"judge error")
 80 | 
 81 | def rag_eval_main(eval_data,eval_type="all"):
 82 |     assert eval_type in ["mini","all"]
 83 |     eval_column='response'
 84 |     # run
 85 |     if eval_type=="mini":  # Only sample a portion for evaluation to save GPT call times
 86 |         eval_data=[eval_data[i] for i in ids_eval_mini]
 87 |     data_rag_evaled=[]
 88 |     for i in tqdm(range(len(eval_data))):
 89 |         dp=eval_data[i]
 90 |         gpt_score=gpt_rating(dp,eval_column)
 91 |         dp_rag_evaled=deepcopy(dp)
 92 |         dp_rag_evaled['rag_eval']={'gpt_rating_score':gpt_score[0],'gpt_rating_details':gpt_score[1]}
 93 |         data_rag_evaled.append(dp_rag_evaled)
 94 |     # count
 95 |     score_all=round(sum([dp['rag_eval']['gpt_rating_score'] for dp in data_rag_evaled])*100/len(data_rag_evaled),2)
 96 | 
 97 |     data_ifnq=[dp for dp in data_rag_evaled if "ifnq" in dp["type"]]
 98 |     score_ifnq=round(sum([dp['rag_eval']['gpt_rating_score'] for dp in data_ifnq])*100/len(data_ifnq),2)
 99 |     data_iftq=[dp for dp in data_rag_evaled if "iftq" in dp["type"]]
100 |     score_iftq=round(sum([dp['rag_eval']['gpt_rating_score'] for dp in data_iftq])*100/len(data_iftq),2)
101 |     data_ifhq=[dp for dp in data_rag_evaled if "ifhq" in dp["type"]]
102 |     score_ifhq=round(sum([dp['rag_eval']['gpt_rating_score'] for dp in data_ifhq])*100/len(data_ifhq),2)
103 |     data_ifwebq=[dp for dp in data_rag_evaled if "ifwebq" in dp["type"]]
104 |     score_ifwebq=round(sum([dp['rag_eval']['gpt_rating_score'] for dp in data_ifwebq])*100/len(data_ifwebq),2)
105 |     rag_eval_result={"score_all":score_all,"score_ifnq":score_ifnq,"score_iftq":score_iftq,"score_ifhq":score_ifhq,"score_ifwebq":score_ifwebq,"eval_num":len(data_rag_evaled)}
106 |     print("rag_eval_result:",rag_eval_result)
107 |     return data_rag_evaled,rag_eval_result
108 | 
109 | if __name__=='__main__':
110 |     # test
111 |     import os
112 |     os.environ["OPENAI_API_KEY"]="your_openai_api_key"
113 |     os.environ["OPENAI_API_BASE"]="your_openai_api_base"  # eg. https://api.openai.com/v1
114 |     file_path="The path to the JSONL file that has already been inferred and needs to be evaluated, where the response field needs to be added to each field in `followRAG_full`."
115 |     data=read_jsonl(file_path)
116 |     rag_eval_main(data,eval_type="mini")
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/FollowRAG/followRAG/atom_instruction.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cases_capital_words": {
  3 |         "instruction_description": "In your response, words with all capital letters should appear at least / at most {N} times.",
  4 |         "kwarg_description": [
  5 |             {
  6 |                 "capital_relation": "Specifies whether the frequency constraint is 'at least' or 'at most'.",
  7 |                 "capital_frequency": "Specifies the number {N} indicating the minimum or maximum frequency of words with all capital letters."
  8 |             }
  9 |         ],
 10 |         "cases": [
 11 |             {
 12 |                 "input": {
 13 |                     "capital_relation": "at least",
 14 |                     "capital_frequency": 3
 15 |                 },
 16 |                 "output": "Use words in all capital letters at least 3 times to highlight key points."
 17 |             },
 18 |             {
 19 |                 "input": {
 20 |                     "capital_relation": "less than",
 21 |                     "capital_frequency": 10
 22 |                 },
 23 |                 "output": "Use words with all capital letters to highlight key contents, but make sure that words with all capital letters appear less than 10 times."
 24 |             },
 25 |             {
 26 |                 "input": {
 27 |                     "capital_relation": "at least",
 28 |                     "capital_frequency": 15
 29 |                 },
 30 |                 "output": "Include at least 15 words in all capital letters."
 31 |             },
 32 |             {
 33 |                 "input": {
 34 |                     "capital_relation": "at least",
 35 |                     "capital_frequency": 2
 36 |                 },
 37 |                 "output": "Use at least 2 words with all capital letters"
 38 |             },
 39 |             {
 40 |                 "input": {
 41 |                     "capital_relation": "less than",
 42 |                     "capital_frequency": 4
 43 |                 },
 44 |                 "output": "Please use words with all capital letters to make important claims. But words with capital letters should appear less than 4 times."
 45 |             }
 46 |         ]
 47 |     },
 48 |     "cases_uppercase": {
 49 |         "instruction_description": "Your entire response should be in English, capital letters only. (**You only need to output an instruction** regarding \"capitalization\" instead of performing the capitalization operation.)",
 50 |         "kwarg_description": {},
 51 |         "cases": [
 52 |             {
 53 |                 "input": {},
 54 |                 "output": "Your entire response should be in English and in all capital letters."
 55 |             },
 56 |             {
 57 |                 "input": {},
 58 |                 "output": "Make sure to only use capital letters in your entire response."
 59 |             },
 60 |             {
 61 |                 "input": {},
 62 |                 "output": "Please reply in English and capitalize all your words."
 63 |             },
 64 |             {
 65 |                 "input": {},
 66 |                 "output": "All letters in your entire response should be capitalized."
 67 |             },
 68 |             {
 69 |                 "input": {},
 70 |                 "output": "Your answer must be in all capital letters and in English."
 71 |             }
 72 |         ]
 73 |     },
 74 |     "cases_lowercase": {
 75 |         "instruction_description": "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed.",
 76 |         "kwarg_description": {},
 77 |         "cases": [
 78 |             {
 79 |                 "input": {},
 80 |                 "output": "It should be all in lowercase letters."
 81 |             },
 82 |             {
 83 |                 "input": {},
 84 |                 "output": "All in lowercase letters."
 85 |             },
 86 |             {
 87 |                 "input": {},
 88 |                 "output": "Your entire response should be in lowercase english."
 89 |             },
 90 |             {
 91 |                 "input": {},
 92 |                 "output": "It should be in English and have no capital letters."
 93 |             },
 94 |             {
 95 |                 "input": {},
 96 |                 "output": "Your entire response must be in English and contain only lowercase letters."
 97 |             }
 98 |         ]
 99 |     },
100 |     "format_repeat_question": {
101 |         "instruction_description": "First, repeat the request without change, then give your answer (do not say anything before repeating the request; the request you need to repeat does not include this sentence)",
102 |         "kwarg_description": {
103 |             "prompt_to_repeat": "The content that needs to be repeated as it is."
104 |         },
105 |         "cases": [
106 |             {
107 |                 "input": {
108 |                     "prompt_to_repeat": "Write a resume for a junior hardware engineer. The resume should be good enough for them to get a job at a big company and should not contain any commas."
109 |                 },
110 |                 "output": "Before you answer the following request, repeat it at the very beginning of your reply. Repeat the request as it is. Please do not change it."
111 |             },
112 |             {
113 |                 "input": {
114 |                     "prompt_to_repeat": "Write an email to my boss telling him that I am quitting. The email must contain a title wrapped in double angular brackets, i.e. <<title>>."
115 |                 },
116 |                 "output": "First repeat the request word for word without change, then give your answer (1. do not say any words or characters before repeating the request; 2. the request you need to repeat does not include this sentence)"
117 |             },
118 |             {
119 |                 "input": {
120 |                     "prompt_to_repeat": "Write a blog post about the most interesting things you have seen or ridden on public transportation."
121 |                 },
122 |                 "output": "First repeat the sentence above word for word without change, then give your answer. Do not say any words or characters before repeating the sentence."
123 |             },
124 |             {
125 |                 "input": {
126 |                     "prompt_to_repeat": "I am a software engineer with 7 years of experience, and I am looking for a new job. Can you create a resume for me and explain each section?"
127 |                 },
128 |                 "output": "First repeat the exact request above, then give your answer. Do not say any word before repeating the exact request."
129 |             },
130 |             {
131 |                 "input": {
132 |                     "prompt_to_repeat": "Write an acoustic song about the Korean peninsula without using any commas."
133 |                 },
134 |                 "output": "In this task, repeat the exact request first, then give your response. Do not say any word before repeating the exact request."
135 |             }
136 |         ]
137 |     },
138 |     "structure_placeholder": {
139 |         "instruction_description": "The response must contain at least {N} placeholders represented by square brackets, such as [address].",
140 |         "kwarg_description": {
141 |             "num_placeholders": "The minimum number of placeholders that should be included in the response, represented by square brackets."
142 |         },
143 |         "cases": [
144 |             {
145 |                 "input": {
146 |                     "num_placeholders": 1
147 |                 },
148 |                 "output": "includes at least one placeholder represented by square brackets"
149 |             },
150 |             {
151 |                 "input": {
152 |                     "num_placeholders": 6
153 |                 },
154 |                 "output": "In your response, it must contain at least six placeholders which should be represented by square brackets."
155 |             },
156 |             {
157 |                 "input": {
158 |                     "num_placeholders": 7
159 |                 },
160 |                 "output": "Include at least 7 placeholders with brackets like [xxx]."
161 |             },
162 |             {
163 |                 "input": {
164 |                     "num_placeholders": 3
165 |                 },
166 |                 "output": "It must contain at least three placeholders, such as [something]."
167 |             },
168 |             {
169 |                 "input": {
170 |                     "num_placeholders": 3
171 |                 },
172 |                 "output": "Your Response should contains at least 3 placeholders represented by square brackets."
173 |             }
174 |         ]
175 |     },
176 |     "position_postscript": {
177 |         "instruction_description": "At the end of your response, please explicitly add a postscript starting with {postscript marker}",
178 |         "kwarg_description": {
179 |             "postscript_marker": "The marker that should be used to start the postscript."
180 |         },
181 |         "cases": [
182 |             {
183 |                 "input": {
184 |                     "postscript_marker": "P.S."
185 |                 },
186 |                 "output": "At the end of your response, please explicitly add a postscript starting with P.S."
187 |             },
188 |             {
189 |                 "input": {
190 |                     "postscript_marker": "P.S."
191 |                 },
192 |                 "output": "At the end of your response, please explicitly add a postscript starting with P.S."
193 |             },
194 |             {
195 |                 "input": {
196 |                     "postscript_marker": "P.P.S"
197 |                 },
198 |                 "output": "End it with a post script starting with P.P.S"
199 |             },
200 |             {
201 |                 "input": {
202 |                     "postscript_marker": "Note."
203 |                 },
204 |                 "output": "please explicitly add a postscript starting with Note."
205 |             },
206 |             {
207 |                 "input": {
208 |                     "postscript_marker": "P.S."
209 |                 },
210 |                 "output": "Please include a postscript at the end of your response that starts with P.S."
211 |             }
212 |         ]
213 |     },
214 |     "format_json": {
215 |         "instruction_description": "Entire output should be wrapped in JSON format.",
216 |         "kwarg_description": {},
217 |         "cases": [
218 |             {
219 |                 "input": {},
220 |                 "output": "Please wrap your entire answer in JSON format. You can use markdown ticks such as ```. For example:\n```JSON\n{\n...\n}\n```"
221 |             },
222 |             {
223 |                 "input": {},
224 |                 "output": "Entire output should be wrapped in JSON format."
225 |             },
226 |             {
227 |                 "input": {},
228 |                 "output": "Wrap the entire output in JSON format. You can use markdown ticks such as ```."
229 |             },
230 |             {
231 |                 "input": {},
232 |                 "output": "Please wrap your entire response in JSON format. Markdown ticks (```) are acceptable."
233 |             },
234 |             {
235 |                 "input": {},
236 |                 "output": "Wrap the entire output in JSON format using markdown ticks."
237 |             }
238 |         ]
239 |     },
240 |     "structure_sections": {
241 |         "instruction_description": "Your response must have {N} sections. Mark the beginning of each section with {section splitter} X.",
242 |         "kwarg_description": {
243 |             "section_spliter": "A string that denotes the beginning of each section",
244 |             "num_sections": "The number of sections required in the response"
245 |         },
246 |         "cases": [
247 |             {
248 |                 "input": {
249 |                     "section_spliter": "SECTION",
250 |                     "num_sections": 4
251 |                 },
252 |                 "output": "The response should have four sections, with each section marked with SECTION X."
253 |             },
254 |             {
255 |                 "input": {
256 |                     "section_spliter": "PARAGRAPH",
257 |                     "num_sections": 2
258 |                 },
259 |                 "output": "Write 2 paragraphs and label each paragraph with PARAGRAPH X."
260 |             },
261 |             {
262 |                 "input": {
263 |                     "section_spliter": "SECTION",
264 |                     "num_sections": 4
265 |                 },
266 |                 "output": "Response with 4 sections marked with SECTION X."
267 |             },
268 |             {
269 |                 "input": {
270 |                     "section_spliter": "Section",
271 |                     "num_sections": 4
272 |                 },
273 |                 "output": "The response should contain 4 sections anf each section should be explicitly noted as Section X."
274 |             },
275 |             {
276 |                 "input": {
277 |                     "section_spliter": "SECTION",
278 |                     "num_sections": 3
279 |                 },
280 |                 "output": "Response in 3 sections. Mark the beginning of each section with SECTION X."
281 |             }
282 |         ]
283 |     },
284 |     "structure_bullets": {
285 |         "instruction_description": "Your answer must contain exactly {N} bullet points. Use the markdown bullet points such as: * This is a point.",
286 |         "kwarg_description": {
287 |             "num_bullets": "The number of bullet points required in the answer."
288 |         },
289 |         "cases": [
290 |             {
291 |                 "input": {
292 |                     "num_bullets": 6
293 |                 },
294 |                 "output": "Your answer must contain exactly 6 bullet point in Markdown using the following format:\n* Bullet point one.\n* Bullet point two.\n...\n* Bullet point fix."
295 |             },
296 |             {
297 |                 "input": {
298 |                     "num_bullets": 3
299 |                 },
300 |                 "output": "Your answer must contain exactly 3 bullet points in the markdown format (use \"* \" to indicate each bullet) such as:\n* This is the first point.\n* This is the second point."
301 |             },
302 |             {
303 |                 "input": {
304 |                     "num_bullets": 10
305 |                 },
306 |                 "output": "Your answer must contain exactly 10 bullet points. Use the markdown bullet points such as: * This is a point."
307 |             },
308 |             {
309 |                 "input": {
310 |                     "num_bullets": 8
311 |                 },
312 |                 "output": "Include exactly 8 bullet points in your response. The bullet points should be in the form of:\n* This is bullet 1\n* This is bullet 2\n..."
313 |             },
314 |             {
315 |                 "input": {
316 |                     "num_bullets": 2
317 |                 },
318 |                 "output": "In your entire response make sure to use exactly two bullet points in markdown format. Please use the following bullet point format:\n* Text for bullet 1\n* Text for bullet 2."
319 |             }
320 |         ]
321 |     },
322 |     "structure_highlights": {
323 |         "instruction_description": "Highlight at least {N} sections in your answer with markdown, i.e. *highlighted section*",
324 |         "kwarg_description": {
325 |             "num_highlights": "The minimum number of sections you need to highlight in your response."
326 |         },
327 |         "cases": [
328 |             {
329 |                 "input": {
330 |                     "num_highlights": 3
331 |                 },
332 |                 "output": "Highlight at least three sections of your response in markdown such as *highlighted section*."
333 |             },
334 |             {
335 |                 "input": {
336 |                     "num_highlights": 3
337 |                 },
338 |                 "output": "Highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*."
339 |             },
340 |             {
341 |                 "input": {
342 |                     "num_highlights": 2
343 |                 },
344 |                 "output": "Highlight at least 2 text sections, i.e. *highlighted section*."
345 |             },
346 |             {
347 |                 "input": {
348 |                     "num_highlights": 3
349 |                 },
350 |                 "output": "Please highlight at least 3 sections with markdown,  i.e *highlighted section*."
351 |             },
352 |             {
353 |                 "input": {
354 |                     "num_highlights": 1
355 |                 },
356 |                 "output": "Highlight some words or phrases in your response, like *highlighted phrase*."
357 |             }
358 |         ]
359 |     },
360 |     "structure_title": {
361 |         "instruction_description": "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>.",
362 |         "kwarg_description": {},
363 |         "cases": [
364 |             {
365 |                 "input": {},
366 |                 "output": "Your answer must contain a title, wrapped in double angular brackets, i.e. <<title>>."
367 |             },
368 |             {
369 |                 "input": {},
370 |                 "output": "Your answer must contain a title, wrapped in double angular brackets, such as <<Title>>."
371 |             },
372 |             {
373 |                 "input": {},
374 |                 "output": "Your answer should have a title in double angular brackets, i.e. <<title>>."
375 |             },
376 |             {
377 |                 "input": {},
378 |                 "output": "The response must have a title wrapped in double angular brackets, i.e. <<title>>."
379 |             },
380 |             {
381 |                 "input": {},
382 |                 "output": "Yprovide a title wrapped in double angular brackets, such as <<my title>>."
383 |             }
384 |         ]
385 |     },
386 |     "keywords_inclusion": {
387 |         "instruction_description": "Include keywords {keyword1}, {keyword2},…… in your response",
388 |         "kwarg_description": {
389 |             "keywords": "a list of keywords that should be included in the response"
390 |         },
391 |         "cases": [
392 |             {
393 |                 "input": {
394 |                     "keywords": [
395 |                         "talented",
396 |                         "tianjin"
397 |                     ]
398 |                 },
399 |                 "output": "Include the keywords talented and tianjin in your response."
400 |             },
401 |             {
402 |                 "input": {
403 |                     "keywords": [
404 |                         "remainder"
405 |                     ]
406 |                 },
407 |                 "output": " Include the keyword 'remainder'."
408 |             },
409 |             {
410 |                 "input": {
411 |                     "keywords": [
412 |                         "bill",
413 |                         "economist",
414 |                         "jurgen"
415 |                     ]
416 |                 },
417 |                 "output": "The answer should include keywords 'economist', 'bill', and 'jurgen'"
418 |             },
419 |             {
420 |                 "input": {
421 |                     "keywords": [
422 |                         "compensated",
423 |                         "immigrants"
424 |                     ]
425 |                 },
426 |                 "output": "Please also make sure to include keywords 'compensated' and 'immigrants' in the response."
427 |             },
428 |             {
429 |                 "input": {
430 |                     "keywords": [
431 |                         "Argentinian"
432 |                     ]
433 |                 },
434 |                 "output": "Mention \"Argentinian\" in the answer."
435 |             }
436 |         ]
437 |     },
438 |     "keywords_exclusion": {
439 |         "instruction_description": "Do not include keywords {forbidden words} in the response.",
440 |         "kwarg_description": {
441 |             "forbidden_words": "a list of words that should not be used in the response"
442 |         },
443 |         "cases": [
444 |             {
445 |                 "input": {
446 |                     "forbidden_words": [
447 |                         "reschedule",
448 |                         "free"
449 |                     ]
450 |                 },
451 |                 "output": "Do not use the words reschedule or free."
452 |             },
453 |             {
454 |                 "input": {
455 |                     "forbidden_words": [
456 |                         "nickname"
457 |                     ]
458 |                 },
459 |                 "output": "Do not say the word \"nickname\" in the response."
460 |             },
461 |             {
462 |                 "input": {
463 |                     "forbidden_words": [
464 |                         "sleep",
465 |                         "cook",
466 |                         "feed"
467 |                     ]
468 |                 },
469 |                 "output": "Avoid using the following keywords: sleep, cook, feed."
470 |             },
471 |             {
472 |                 "input": {
473 |                     "forbidden_words": [
474 |                         "coop",
475 |                         "killings",
476 |                         "dead",
477 |                         "night"
478 |                     ]
479 |                 },
480 |                 "output": "The answer should not include keywords \"coop\", \"killings\", \"dead\", \"night\"."
481 |             },
482 |             {
483 |                 "input": {
484 |                     "forbidden_words": [
485 |                         "taylor",
486 |                         "swift",
487 |                         "together"
488 |                     ]
489 |                 },
490 |                 "output": "Do not mention the word Taylor, Swift, or Together."
491 |             }
492 |         ]
493 |     },
494 |     "keywords_frequency": {
495 |         "instruction_description": "In your response, the word word should appear {N} times.",
496 |         "kwarg_description": {
497 |             "relation": "Specifies whether the frequency constraint is 'at least' or 'at most'. The 'relation' parameter must be either 'less than' or 'at least'.",
498 |             "keyword": "the word that should appear in the response",
499 |             "frequency": "Specifies the number {N} indicating the minimum or maximum frequency of the keyword."
500 |         },
501 |         "cases": [
502 |             {
503 |                 "input": {
504 |                     "relation": "less than",
505 |                     "keyword": "batted",
506 |                     "frequency": 2
507 |                 },
508 |                 "output": "The word batted should appear less than 2 times."
509 |             },
510 |             {
511 |                 "input": {
512 |                     "relation": "at least",
513 |                     "keyword": "synonyms",
514 |                     "frequency": 3
515 |                 },
516 |                 "output": "Make sure the word synonyms appears at least 3 times in the response."
517 |             },
518 |             {
519 |                 "input": {
520 |                     "relation": "at least",
521 |                     "keyword": "disappointed",
522 |                     "frequency": 2
523 |                 },
524 |                 "output": "The word disappointed should appear at least 2 times."
525 |             },
526 |             {
527 |                 "input": {
528 |                     "relation": "less than",
529 |                     "keyword": "flesh",
530 |                     "frequency": 3
531 |                 },
532 |                 "output": "The word flesh should appear less than 3 times in your response."
533 |             },
534 |             {
535 |                 "input": {
536 |                     "relation": "at least",
537 |                     "keyword": "dose",
538 |                     "frequency": 5
539 |                 },
540 |                 "output": "Mention the word \"dose\" for at least 5 times."
541 |             }
542 |         ]
543 |     },
544 |     "format_language": {
545 |         "instruction_description": "Your ENTIRE response should be in {language}, no other language is allowed.",
546 |         "kwarg_description": {
547 |             "language": "The language in which the entire response should be written. The mapping relationship between parameters and language is {'kn': 'Kannada', 'pa': 'Punjabi', 'mr': 'Marathi', 'fa': 'Persian', 'vi': 'Vietnamese', 'ko': 'Korean', 'sw': 'Swahili', 'ru': 'Russian', 'hi': 'Hindi', 'bg': 'Bulgarian', 'pt': 'Portuguese', 'gu': 'Gujarati', 'te': 'Telugu', 'it': 'Italian', 'ar': 'Arabic', 'ta': 'Tamil', 'de': 'German', 'fi': 'Finnish', 'ur': 'Urdu', 'th': 'Thai', 'ne': 'Nepali', 'bn': 'Bengali'}"
548 |         },
549 |         "cases": [
550 |             {
551 |                 "input": {
552 |                     "language": "hi"
553 |                 },
554 |                 "output": "using only Hindi, no other language is allowed"
555 |             },
556 |             {
557 |                 "input": {
558 |                     "language": "ne"
559 |                 },
560 |                 "output": "Using only the Nepali language throughout your entire response."
561 |             },
562 |             {
563 |                 "input": {
564 |                     "language": "bn"
565 |                 },
566 |                 "output": "The response must be entirely in the Bengali language, no other language is allowed."
567 |             },
568 |             {
569 |                 "input": {
570 |                     "language": "de"
571 |                 },
572 |                 "output": "Please make your response in only German, no other language is allowed."
573 |             },
574 |             {
575 |                 "input": {
576 |                     "language": "ko"
577 |                 },
578 |                 "output": "Use only Korean in your response."
579 |             }
580 |         ]
581 |     },
582 |     "position_first_word": {
583 |         "instruction_description": "There should be {N} paragraphs. Paragraphs and only paragraphs are separated with each other by two line breaks. The {i}-th paragraph must start with word {first word}.",
584 |         "kwarg_description": {
585 |             "first_word": "The first word of the specified paragraph which must appear at the beginning of that paragraph.",
586 |             "num_paragraphs": "The total number of paragraphs that the text should contain.",
587 |             "nth_paragraph": "The index (1-based) of the paragraph that should start with the given first word."
588 |         },
589 |         "cases": [
590 |             {
591 |                 "input": {
592 |                     "first_word": "president",
593 |                     "num_paragraphs": 6,
594 |                     "nth_paragraph": 2
595 |                 },
596 |                 "output": "The answer should contain exactly six paragraphs, and the second paragraph must start with the word \"President\". Separate paragraphs by double line breaks (\"\\n\\n\")"
597 |             },
598 |             {
599 |                 "input": {
600 |                     "first_word": "summary",
601 |                     "num_paragraphs": 7,
602 |                     "nth_paragraph": 7
603 |                 },
604 |                 "output": "Your answer should have exactly 7 paragraphs and the last paragraph must start with the word \"Summary\". Each paragraph should be separated by two new lines."
605 |             },
606 |             {
607 |                 "input": {
608 |                     "first_word": "realising",
609 |                     "num_paragraphs": 2,
610 |                     "nth_paragraph": 1
611 |                 },
612 |                 "output": "Can you please provide a two paragraph answer about me? Make sure that the first paragraph starts with the word \"realising\" and that each paragraph is separated by two new lines."
613 |             },
614 |             {
615 |                 "input": {
616 |                     "first_word": "elm",
617 |                     "num_paragraphs": 4,
618 |                     "nth_paragraph": 4
619 |                 },
620 |                 "output": "Write exactly 4 paragraphs about the question. Use 2 new lines to separate paragraphs. Start the 4th paragraph with the word \"elm\"."
621 |             },
622 |             {
623 |                 "input": {
624 |                     "first_word": "bonding",
625 |                     "num_paragraphs": 3,
626 |                     "nth_paragraph": 3
627 |                 },
628 |                 "output": "Your answer must be exactly 3 paragraphs where paragraphs and only paragraphs are separated by two new lines, as if they were '\\n\\n' in python. The third paragraph must start with the word bonding."
629 |             }
630 |         ]
631 |     },
632 |     "length_paragraph": {
633 |         "instruction_description": "Your response should contain {N} paragraphs. You separate paragraphs using the markdown divider: * * *",
634 |         "kwarg_description": {
635 |             "num_paragraphs": "The total number of paragraphs that the response should contain."
636 |         },
637 |         "cases": [
638 |             {
639 |                 "input": {
640 |                     "num_paragraphs": 3
641 |                 },
642 |                 "output": "Make sure to break the conversation down to 3 parts, separated by ***, such as:\n[conversation part 1]\n***\n[conversation part 2]\n***\n[conversation part 3]"
643 |             },
644 |             {
645 |                 "input": {
646 |                     "num_paragraphs": 3
647 |                 },
648 |                 "output": "Please reply in details, and include exactly 3 paragraphs. Separate the paragraphs with ***."
649 |             },
650 |             {
651 |                 "input": {
652 |                     "num_paragraphs": 4
653 |                 },
654 |                 "output": "There should be exactly 4 paragraphs. Separate the paragraphs with ***."
655 |             },
656 |             {
657 |                 "input": {
658 |                     "num_paragraphs": 3
659 |                 },
660 |                 "output": "Separate your answer into 3 parts, where each part is separated with ***."
661 |             },
662 |             {
663 |                 "input": {
664 |                     "num_paragraphs": 5
665 |                 },
666 |                 "output": "Put the response into at least 5 sections, separated using 3 asterisks ***."
667 |             }
668 |         ]
669 |     },
670 |     "length_sentence": {
671 |         "instruction_description": "Answer with at least / around / at most {N} sentences.",
672 |         "kwarg_description": {
673 |             "relation": "Specifies whether the frequency constraint is 'at least' or 'at most'. The 'relation' parameter must be either 'less than' or 'at least'.",
674 |             "num_sentences": "Specifies the number {N} indicating the minimum or maximum number of sentences."
675 |         },
676 |         "cases": [
677 |             {
678 |                 "input": {
679 |                     "relation": "less than",
680 |                     "num_sentences": 10
681 |                 },
682 |                 "output": "Please provide less than a total of 10 sentences in your entire answer."
683 |             },
684 |             {
685 |                 "input": {
686 |                     "relation": "less than",
687 |                     "num_sentences": 7
688 |                 },
689 |                 "output": "Your answer should be less than 7 sentences."
690 |             },
691 |             {
692 |                 "input": {
693 |                     "relation": "at least",
694 |                     "num_sentences": 40
695 |                 },
696 |                 "output": "The number of sentences in your response should be in the range of 40 to 60."
697 |             },
698 |             {
699 |                 "input": {
700 |                     "relation": "at least",
701 |                     "num_sentences": 30
702 |                 },
703 |                 "output": "I'd like your response to be at least 30 sentences long."
704 |             },
705 |             {
706 |                 "input": {
707 |                     "relation": "less than",
708 |                     "num_sentences": 8
709 |                 },
710 |                 "output": "Your response should contain less than 8 sentences."
711 |             }
712 |         ]
713 |     },
714 |     "length_words": {
715 |         "instruction_description": "Answer with at least / around / at most {N} words.",
716 |         "kwarg_description": {
717 |             "relation": "Specifies whether the frequency constraint is 'at least' or 'at most'. The 'relation' parameter must be either 'less than' or 'at least'.",
718 |             "num_words": "Specifies the number {N} indicating the minimum or maximum number of words."
719 |         },
720 |         "cases": [
721 |             {
722 |                 "input": {
723 |                     "relation": "less than",
724 |                     "num_words": 30
725 |                 },
726 |                 "output": "Make your entire response less than 30 words."
727 |             },
728 |             {
729 |                 "input": {
730 |                     "relation": "less than",
731 |                     "num_words": 200
732 |                 },
733 |                 "output": "Answer with less than 200 words."
734 |             },
735 |             {
736 |                 "input": {
737 |                     "relation": "at least",
738 |                     "num_words": 1200
739 |                 },
740 |                 "output": "Your entire response must contain at least 1200 words."
741 |             },
742 |             {
743 |                 "input": {
744 |                     "relation": "less than",
745 |                     "num_words": 40
746 |                 },
747 |                 "output": "Make your reply short -- the whole reply should contain less than 40 words."
748 |             },
749 |             {
750 |                 "input": {
751 |                     "relation": "at least",
752 |                     "num_words": 600
753 |                 },
754 |                 "output": "Write a 600+ word reseponse."
755 |             }
756 |         ]
757 |     },
758 |     "format_no_commas": {
759 |         "instruction_description": "In your entire response, refrain from the use of any commas.",
760 |         "kwarg_description": {},
761 |         "cases": [
762 |             {
763 |                 "input": {},
764 |                 "output": "Do not contain commas in your response."
765 |             },
766 |             {
767 |                 "input": {},
768 |                 "output": "Do not use any commas in your response."
769 |             },
770 |             {
771 |                 "input": {},
772 |                 "output": "In your entire response, refrain from the use of any commas."
773 |             },
774 |             {
775 |                 "input": {},
776 |                 "output": "You cannot use any commas in your entire response."
777 |             },
778 |             {
779 |                 "input": {},
780 |                 "output": "You are not allowed to use any commas in your answer."
781 |             }
782 |         ]
783 |     },
784 |     "position_end_with": {
785 |         "instruction_description": "Finish your response with this exact phrase {end phrase}. No other words should follow this phrase.",
786 |         "kwarg_description": {
787 |             "end_phrase": "The specific phrase that should appear at the end of the response."
788 |         },
789 |         "cases": [
790 |             {
791 |                 "input": {
792 |                     "end_phrase": "Is there anything else I can help with?"
793 |                 },
794 |                 "output": "Finish your response with \"Is there anything else I can help with?\". No other words should follow this phrase."
795 |             },
796 |             {
797 |                 "input": {
798 |                     "end_phrase": "Hope you agree with me."
799 |                 },
800 |                 "output": "Finish your entire response with this exact phrase: Hope you agree with me."
801 |             },
802 |             {
803 |                 "input": {
804 |                     "end_phrase": "Peace!"
805 |                 },
806 |                 "output": "Finish the answer with:\n\nPeace!\n\nNo additional words should follow \"Peace!\""
807 |             },
808 |             {
809 |                 "input": {
810 |                     "end_phrase": "Follow the 5 steps listed above, and you will be successful."
811 |                 },
812 |                 "output": "Finish your response with \"Follow the 5 steps listed above, and you will be successful.\" No other words should follow this."
813 |             },
814 |             {
815 |                 "input": {
816 |                     "end_phrase": "Which one you choose?"
817 |                 },
818 |                 "output": "I need you to end your response with \"Which one you choose?\"."
819 |             }
820 |         ]
821 |     },
822 |     "format_quotation": {
823 |         "instruction_description": "Wrap your entire response with double quotation marks.",
824 |         "kwarg_description": {},
825 |         "cases": [
826 |             {
827 |                 "input": {},
828 |                 "output": "Wrap your entire response in double quotation marks."
829 |             },
830 |             {
831 |                 "input": {},
832 |                 "output": "Put your whole response in double quotes."
833 |             },
834 |             {
835 |                 "input": {},
836 |                 "output": "Wrap your entire answer with double quotation marks."
837 |             },
838 |             {
839 |                 "input": {},
840 |                 "output": "Use double quotation marks to enclose your entire response."
841 |             },
842 |             {
843 |                 "input": {},
844 |                 "output": "Enclose your whole reply in double quote marks."
845 |             }
846 |         ]
847 |     }
848 | }


--------------------------------------------------------------------------------
/FollowRAG/readme.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ### inference
 3 | You first need to perform inference on followRAG, and the pseudocode is as follows:
 4 | ```python
 5 | followRAG_full=load_json('followRAG/followRAG_full.json')
 6 | data_inferenced=[]
 7 | for dp in followRAG_full:
 8 |     response=llm.inference(dp['prompt'])
 9 |     dp['response']=response
10 |     data_inferenced.append(dp)
11 | save_jsonl(data_inferenced,'results/finish_inference/data_inferenced.jsonl')
12 | ```
13 | ### eval
14 | After completing the inference, run the evaluation script:
15 | ```bash
16 | export OPENAI_API_KEY=your_openai_api_key
17 | export OPENAI_API_BASE=https://api.openai.com/v1
18 | python eval/main_eval.py \
19 |     --input_file_path results/finish_inference/data_inferenced.jsonl \
20 |     --output_file_path results/finish_eval/data_evaled.jsonl \
21 |     --rag_eval_type mini \
22 |     --result_log_file_path results/logs/results_log.jsonl
23 | ```
24 | Our evaluation of instruction-following part largely draws on the [IFEval code repository](https://github.com/google-research/google-research/tree/master/instruction_following_eval). We appreciate their excellent work!
25 | 


--------------------------------------------------------------------------------
/FollowRAG/requirements.txt:
--------------------------------------------------------------------------------
 1 | absl-py==2.1.0
 2 | aiohappyeyeballs==2.3.5
 3 | aiohttp==3.10.3
 4 | aiosignal==1.3.1
 5 | annotated-types==0.7.0
 6 | anyio==4.4.0
 7 | async-timeout==4.0.3
 8 | attrs==24.2.0
 9 | certifi==2024.7.4
10 | charset-normalizer==3.3.2
11 | click==8.1.7
12 | contourpy==1.2.1
13 | cycler==0.12.1
14 | distro==1.9.0
15 | exceptiongroup==1.2.2
16 | filelock==3.15.4
17 | fonttools==4.53.1
18 | frozenlist==1.4.1
19 | fsspec==2024.6.1
20 | gradio_client==1.2.0
21 | h11==0.14.0
22 | httpcore==1.0.5
23 | httpx==0.27.0
24 | huggingface-hub==0.24.5
25 | idna==3.7
26 | immutabledict==4.2.0
27 | importlib_resources==6.4.2
28 | jiter==0.5.0
29 | joblib==1.4.2
30 | kiwisolver==1.4.5
31 | langdetect==1.0.9
32 | matplotlib==3.9.2
33 | multidict==6.0.5
34 | nltk==3.8.1
35 | numpy==2.0.1
36 | openai==1.40.8
37 | packaging==24.1
38 | pandas==2.2.2
39 | pillow==10.4.0
40 | pyarrow==17.0.0
41 | pydantic==2.8.2
42 | pydantic_core==2.20.1
43 | pyparsing==3.1.2
44 | python-dateutil==2.9.0.post0
45 | pytz==2024.1
46 | PyYAML==6.0.1
47 | regex==2024.5.15
48 | requests==2.32.3
49 | scipy==1.13.1
50 | six==1.16.0
51 | sniffio==1.3.1
52 | tqdm==4.66.4
53 | typing_extensions==4.12.2
54 | tzdata==2024.1
55 | urllib3==2.2.2
56 | websockets==12.0
57 | yarl==1.9.4
58 | zipp==3.20.0
59 | 


--------------------------------------------------------------------------------
/FollowRAG/scripts/eval.sh:
--------------------------------------------------------------------------------
 1 | # gpt setting
 2 | export OPENAI_API_KEY=xxxxxx
 3 | export OPENAI_API_BASE=xxxxx
 4 | 
 5 | 
 6 | # eval
 7 | python eval/main_eval.py \
 8 |     --input_file_path results/finish_inference/data_inferenced.jsonl \
 9 |     --output_file_path results/finish_eval/data_evaled.jsonl \
10 |     --rag_eval_type mini \
11 |     --result_log_file_path results/logs/results_log.jsonl 
12 | 


--------------------------------------------------------------------------------
/FollowRAG/utils/call_llm.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import os
 3 | from openai import OpenAI
 4 | 
 5 | def call_gpt(llm_model_name,message,max_retries=10):
 6 |     client = OpenAI(
 7 |         api_key=os.environ['OPENAI_API_KEY'],
 8 |         base_url=os.environ['OPENAI_API_BASE'],
 9 |     )
10 |     retries = 0
11 |     while retries < max_retries:
12 |         try:
13 |             completion = client.chat.completions.create(
14 |                 model=llm_model_name,
15 |                 messages=[{"role": "user", "content": message}],
16 |             )
17 |             response = completion.choices[0].message.content
18 |             retries+=1
19 |             return response
20 |         except KeyboardInterrupt:
21 |             print("Operation canceled by user.")
22 |             return ''
23 |         except Exception as e:
24 |             print(f"Someting wrong:{e}. Retrying in 1 minute...")
25 |             time.sleep(60) # 等待1分钟
26 |             retries += 1
27 | 
28 |     print("Max retries reached. Unable to get a response.")
29 |     return None
30 | 
31 | if __name__=="__main__":
32 |     # test
33 |     os.environ["OPENAI_API_KEY"]="your_openai_api_key"
34 |     os.environ["OPENAI_API_BASE"]="https://api.openai.com/v1"
35 |     output=call_gpt(llm_model_name="gpt-4o-0513",
36 |                     message="hello,introduce yourself")
37 |     print(output)
38 | 


--------------------------------------------------------------------------------
/FollowRAG/utils/instruction_following_eval/instructions_registry.py:
--------------------------------------------------------------------------------
 1 | from utils.instruction_following_eval import instructions
 2 | 
 3 | 
 4 | INSTRUCTION_DICT = {
 5 |     'keywords_inclusion': instructions.KeywordChecker,
 6 |     'keywords_frequency': instructions.KeywordFrequencyChecker,
 7 |     'keywords_exclusion': instructions.ForbiddenWords,
 8 |     'format_language': instructions.ResponseLanguageChecker,
 9 |     'length_sentence': instructions.NumberOfSentences,
10 |     'length_paragraph': instructions.ParagraphChecker,
11 |     'length_words': instructions.NumberOfWords,
12 |     'position_first_word': instructions.ParagraphFirstWordCheck,
13 |     'structure_placeholder': instructions.PlaceholderChecker,
14 |     'position_postscript': instructions.PostscriptChecker,
15 |     'structure_bullets': instructions.BulletListChecker,
16 |     'structure_highlights': instructions.HighlightSectionChecker,
17 |     'structure_sections': instructions.SectionChecker,
18 |     'format_json': instructions.JsonFormat,
19 |     'structure_title': instructions.TitleChecker,
20 |     'format_repeat_question': instructions.RepeatPromptThenAnswer,
21 |     'position_end_with': instructions.EndChecker,
22 |     'cases_capital_words': instructions.CapitalWordFrequencyChecker,
23 |     'cases_uppercase': instructions.CapitalLettersEnglishChecker,
24 |     'cases_lowercase': instructions.LowercaseLettersEnglishChecker,
25 |     'format_no_commas': instructions.CommaChecker,
26 |     'format_quotation': instructions.QuotationChecker,
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/FollowRAG/utils/instruction_following_eval/instructions_util.py:
--------------------------------------------------------------------------------
  1 | """Utility library of instructions."""
  2 | 
  3 | import functools
  4 | import random
  5 | import re
  6 | from typing import List
  7 | 
  8 | import immutabledict
  9 | import nltk
 10 | 
 11 | WORD_LIST = ["western", "sentence", "signal", "dump", "spot", "opposite", "bottom", "potato", "administration", "working", "welcome", "morning", "good", "agency", "primary", "wish", "responsibility", "press", "problem", "president", "steal", "brush", "read", "type", "beat", "trainer", "growth", "lock", "bone", "case", "equal", "comfortable", "region", "replacement", "performance", "mate", "walk", "medicine", "film", "thing", "rock", "tap", "total", "competition", "ease", "south", "establishment", "gather", "parking", "world", "plenty", "breath", "claim", "alcohol", "trade", "dear", "highlight", "street", "matter", "decision", "mess", "agreement", "studio", "coach", "assist", "brain", "wing", "style", "private", "top", "brown", "leg", "buy", "procedure", "method", "speed", "high", "company", "valuable", "pie", "analyst", "session", "pattern", "district", "pleasure", "dinner", "swimming", "joke", "order", "plate", "department", "motor", "cell", "spend", "cabinet", "difference", "power", "examination", "engine", "horse", "dimension", "pay", "toe", "curve", "literature", "bother", "fire", "possibility", "debate", "activity", "passage", "hello", "cycle", "background", "quiet", "author", "effect", "actor", "page", "bicycle", "error", "throat", "attack", "character", "phone", "tea", "increase", "outcome", "file", "specific", "inspector", "internal", "potential", "staff", "building", "employer", "shoe", "hand", "direction", "garden", "purchase", "interview", "study", "recognition", "member", "spiritual", "oven", "sandwich", "weird", "passenger", "particular", "response", "reaction", "size", "variation", "a", "cancel", "candy", "exit", "guest", "condition", "fly", "price", "weakness", "convert", "hotel", "great", "mouth", "mind", "song", "sugar", "suspect", "telephone", "ear", "roof", "paint", "refrigerator", "organization", "jury", "reward", "engineering", "day", "possession", "crew", "bar", "road", "description", "celebration", "score", "mark", "letter", "shower", "suggestion", "sir", "luck", "national", "progress", "hall", "stroke", "theory", "offer", "story", "tax", "definition", "history", "ride", "medium", "opening", "glass", "elevator", "stomach", "question", "ability", "leading", "village", "computer", "city", "grand", "confidence", "candle", "priest", "recommendation", "point", "necessary", "body", "desk", "secret", "horror", "noise", "culture", "warning", "water", "round", "diet", "flower", "bus", "tough", "permission", "week", "prompt", "connection", "abuse", "height", "save", "corner", "border", "stress", "drive", "stop", "rip", "meal", "listen", "confusion", "girlfriend", "living", "relation", "significance", "plan", "creative", "atmosphere", "blame", "invite", "housing", "paper", "drink", "roll", "silver", "drunk", "age", "damage", "smoke", "environment", "pack", "savings", "influence", "tourist", "rain", "post", "sign", "grandmother", "run", "profit", "push", "clerk", "final", "wine", "swim", "pause", "stuff", "singer", "funeral", "average", "source", "scene", "tradition", "personal", "snow", "nobody", "distance", "sort", "sensitive", "animal", "major", "negotiation", "click", "mood", "period", "arrival", "expression", "holiday", "repeat", "dust", "closet", "gold", "bad", "sail", "combination", "clothes", "emphasis", "duty", "black", "step", "school", "jump", "document", "professional", "lip", "chemical", "front", "wake", "while", "inside", "watch", "row", "subject", "penalty", "balance", "possible", "adult", "aside", "sample", "appeal", "wedding", "depth", "king", "award", "wife", "blow", "site", "camp", "music", "safe", "gift", "fault", "guess", "act", "shame", "drama", "capital", "exam", "stupid", "record", "sound", "swing", "novel", "minimum", "ratio", "machine", "shape", "lead", "operation", "salary", "cloud", "affair", "hit", "chapter", "stage", "quantity", "access", "army", "chain", "traffic", "kick", "analysis", "airport", "time", "vacation", "philosophy", "ball", "chest", "thanks", "place", "mountain", "advertising", "red", "past", "rent", "return", "tour", "house", "construction", "net", "native", "war", "figure", "fee", "spray", "user", "dirt", "shot", "task", "stick", "friend", "software", "promotion", "interaction", "surround", "block", "purpose", "practice", "conflict", "routine", "requirement", "bonus", "hole", "state", "junior", "sweet", "catch", "tear", "fold", "wall", "editor", "life", "position", "pound", "respect", "bathroom", "coat", "script", "job", "teach", "birth", "view", "resolve", "theme", "employee", "doubt", "market", "education", "serve", "recover", "tone", "harm", "miss", "union", "understanding", "cow", "river", "association", "concept", "training", "recipe", "relationship", "reserve", "depression", "proof", "hair", "revenue", "independent", "lift", "assignment", "temporary", "amount", "loss", "edge", "track", "check", "rope", "estimate", "pollution", "stable", "message", "delivery", "perspective", "mirror", "assistant", "representative", "witness", "nature", "judge", "fruit", "tip", "devil", "town", "emergency", "upper", "drop", "stay", "human", "neck", "speaker", "network", "sing", "resist", "league", "trip", "signature", "lawyer", "importance", "gas", "choice", "engineer", "success", "part", "external", "worker", "simple", "quarter", "student", "heart", "pass", "spite", "shift", "rough", "lady", "grass", "community", "garage", "youth", "standard", "skirt", "promise", "blind", "television", "disease", "commission", "positive", "energy", "calm", "presence", "tune", "basis", "preference", "head", "common", "cut", "somewhere", "presentation", "current", "thought", "revolution", "effort", "master", "implement", "republic", "floor", "principle", "stranger", "shoulder", "grade", "button", "tennis", "police", "collection", "account", "register", "glove", "divide", "professor", "chair", "priority", "combine", "peace", "extension", "maybe", "evening", "frame", "sister", "wave", "code", "application", "mouse", "match", "counter", "bottle", "half", "cheek", "resolution", "back", "knowledge", "make", "discussion", "screw", "length", "accident", "battle", "dress", "knee", "log", "package", "it", "turn", "hearing", "newspaper", "layer", "wealth", "profile", "imagination", "answer", "weekend", "teacher", "appearance", "meet", "bike", "rise", "belt", "crash", "bowl", "equivalent", "support", "image", "poem", "risk", "excitement", "remote", "secretary", "public", "produce", "plane", "display", "money", "sand", "situation", "punch", "customer", "title", "shake", "mortgage", "option", "number", "pop", "window", "extent", "nothing", "experience", "opinion", "departure", "dance", "indication", "boy", "material", "band", "leader", "sun", "beautiful", "muscle", "farmer", "variety", "fat", "handle", "director", "opportunity", "calendar", "outside", "pace", "bath", "fish", "consequence", "put", "owner", "go", "doctor", "information", "share", "hurt", "protection", "career", "finance", "force", "golf", "garbage", "aspect", "kid", "food", "boot", "milk", "respond", "objective", "reality", "raw", "ring", "mall", "one", "impact", "area", "news", "international", "series", "impress", "mother", "shelter", "strike", "loan", "month", "seat", "anything", "entertainment", "familiar", "clue", "year", "glad", "supermarket", "natural", "god", "cost", "conversation", "tie", "ruin", "comfort", "earth", "storm", "percentage", "assistance", "budget", "strength", "beginning", "sleep", "other", "young", "unit", "fill", "store", "desire", "hide", "value", "cup", "maintenance", "nurse", "function", "tower", "role", "class", "camera", "database", "panic", "nation", "basket", "ice", "art", "spirit", "chart", "exchange", "feedback", "statement", "reputation", "search", "hunt", "exercise", "nasty", "notice", "male", "yard", "annual", "collar", "date", "platform", "plant", "fortune", "passion", "friendship", "spread", "cancer", "ticket", "attitude", "island", "active", "object", "service", "buyer", "bite", "card", "face", "steak", "proposal", "patient", "heat", "rule", "resident", "broad", "politics", "west", "knife", "expert", "girl", "design", "salt", "baseball", "grab", "inspection", "cousin", "couple", "magazine", "cook", "dependent", "security", "chicken", "version", "currency", "ladder", "scheme", "kitchen", "employment", "local", "attention", "manager", "fact", "cover", "sad", "guard", "relative", "county", "rate", "lunch", "program", "initiative", "gear", "bridge", "breast", "talk", "dish", "guarantee", "beer", "vehicle", "reception", "woman", "substance", "copy", "lecture", "advantage", "park", "cold", "death", "mix", "hold", "scale", "tomorrow", "blood", "request", "green", "cookie", "church", "strip", "forever", "beyond", "debt", "tackle", "wash", "following", "feel", "maximum", "sector", "sea", "property", "economics", "menu", "bench", "try", "language", "start", "call", "solid", "address", "income", "foot", "senior", "honey", "few", "mixture", "cash", "grocery", "link", "map", "form", "factor", "pot", "model", "writer", "farm", "winter", "skill", "anywhere", "birthday", "policy", "release", "husband", "lab", "hurry", "mail", "equipment", "sink", "pair", "driver", "consideration", "leather", "skin", "blue", "boat", "sale", "brick", "two", "feed", "square", "dot", "rush", "dream", "location", "afternoon", "manufacturer", "control", "occasion", "trouble", "introduction", "advice", "bet", "eat", "kill", "category", "manner", "office", "estate", "pride", "awareness", "slip", "crack", "client", "nail", "shoot", "membership", "soft", "anybody", "web", "official", "individual", "pizza", "interest", "bag", "spell", "profession", "queen", "deal", "resource", "ship", "guy", "chocolate", "joint", "formal", "upstairs", "car", "resort", "abroad", "dealer", "associate", "finger", "surgery", "comment", "team", "detail", "crazy", "path", "tale", "initial", "arm", "radio", "demand", "single", "draw", "yellow", "contest", "piece", "quote", "pull", "commercial", "shirt", "contribution", "cream", "channel", "suit", "discipline", "instruction", "concert", "speech", "low", "effective", "hang", "scratch", "industry", "breakfast", "lay", "join", "metal", "bedroom", "minute", "product", "rest", "temperature", "many", "give", "argument", "print", "purple", "laugh", "health", "credit", "investment", "sell", "setting", "lesson", "egg", "middle", "marriage", "level", "evidence", "phrase", "love", "self", "benefit", "guidance", "affect", "you", "dad", "anxiety", "special", "boyfriend", "test", "blank", "payment", "soup", "obligation", "reply", "smile", "deep", "complaint", "addition", "review", "box", "towel", "minor", "fun", "soil", "issue", "cigarette", "internet", "gain", "tell", "entry", "spare", "incident", "family", "refuse", "branch", "can", "pen", "grandfather", "constant", "tank", "uncle", "climate", "ground", "volume", "communication", "kind", "poet", "child", "screen", "mine", "quit", "gene", "lack", "charity", "memory", "tooth", "fear", "mention", "marketing", "reveal", "reason", "court", "season", "freedom", "land", "sport", "audience", "classroom", "law", "hook", "win", "carry", "eye", "smell", "distribution", "research", "country", "dare", "hope", "whereas", "stretch", "library", "if", "delay", "college", "plastic", "book", "present", "use", "worry", "champion", "goal", "economy", "march", "election", "reflection", "midnight", "slide", "inflation", "action", "challenge", "guitar", "coast", "apple", "campaign", "field", "jacket", "sense", "way", "visual", "remove", "weather", "trash", "cable", "regret", "buddy", "beach", "historian", "courage", "sympathy", "truck", "tension", "permit", "nose", "bed", "son", "person", "base", "meat", "usual", "air", "meeting", "worth", "game", "independence", "physical", "brief", "play", "raise", "board", "she", "key", "writing", "pick", "command", "party", "yesterday", "spring", "candidate", "physics", "university", "concern", "development", "change", "string", "target", "instance", "room", "bitter", "bird", "football", "normal", "split", "impression", "wood", "long", "meaning", "stock", "cap", "leadership", "media", "ambition", "fishing", "essay", "salad", "repair", "today", "designer", "night", "bank", "drawing", "inevitable", "phase", "vast", "chip", "anger", "switch", "cry", "twist", "personality", "attempt", "storage", "being", "preparation", "bat", "selection", "white", "technology", "contract", "side", "section", "station", "till", "structure", "tongue", "taste", "truth", "difficulty", "group", "limit", "main", "move", "feeling", "light", "example", "mission", "might", "wait", "wheel", "shop", "host", "classic", "alternative", "cause", "agent", "consist", "table", "airline", "text", "pool", "craft", "range", "fuel", "tool", "partner", "load", "entrance", "deposit", "hate", "article", "video", "summer", "feature", "extreme", "mobile", "hospital", "flight", "fall", "pension", "piano", "fail", "result", "rub", "gap", "system", "report", "suck", "ordinary", "wind", "nerve", "ask", "shine", "note", "line", "mom", "perception", "brother", "reference", "bend", "charge", "treat", "trick", "term", "homework", "bake", "bid", "status", "project", "strategy", "orange", "let", "enthusiasm", "parent", "concentrate", "device", "travel", "poetry", "business", "society", "kiss", "end", "vegetable", "employ", "schedule", "hour", "brave", "focus", "process", "movie", "illegal", "general", "coffee", "ad", "highway", "chemistry", "psychology", "hire", "bell", "conference", "relief", "show", "neat", "funny", "weight", "quality", "club", "daughter", "zone", "touch", "tonight", "shock", "burn", "excuse", "name", "survey", "landscape", "advance", "satisfaction", "bread", "disaster", "item", "hat", "prior", "shopping", "visit", "east", "photo", "home", "idea", "father", "comparison", "cat", "pipe", "winner", "count", "lake", "fight", "prize", "foundation", "dog", "keep", "ideal", "fan", "struggle", "peak", "safety", "solution", "hell", "conclusion", "population", "strain", "alarm", "measurement", "second", "train", "race", "due", "insurance", "boss", "tree", "monitor", "sick", "course", "drag", "appointment", "slice", "still", "care", "patience", "rich", "escape", "emotion", "royal", "female", "childhood", "government", "picture", "will", "sock", "big", "gate", "oil", "cross", "pin", "improvement", "championship", "silly", "help", "sky", "pitch", "man", "diamond", "most", "transition", "work", "science", "committee", "moment", "fix", "teaching", "dig", "specialist", "complex", "guide", "people", "dead", "voice", "original", "break", "topic", "data", "degree", "reading", "recording", "bunch", "reach", "judgment", "lie", "regular", "set", "painting", "mode", "list", "player", "bear", "north", "wonder", "carpet", "heavy", "officer", "negative", "clock", "unique", "baby", "pain", "assumption", "disk", "iron", "bill", "drawer", "look", "double", "mistake", "finish", "future", "brilliant", "contact", "math", "rice", "leave", "restaurant", "discount", "sex", "virus", "bit", "trust", "event", "wear", "juice", "failure", "bug", "context", "mud", "whole", "wrap", "intention", "draft", "pressure", "cake", "dark", "explanation", "space", "angle", "word", "efficiency", "management", "habit", "star", "chance", "finding", "transportation", "stand", "criticism", "flow", "door", "injury", "insect", "surprise", "apartment"]  # pylint: disable=line-too-long
 12 | 
 13 | # ISO 639-1 codes to language names.
 14 | LANGUAGE_CODES = immutabledict.immutabledict({
 15 |     "en": "English",
 16 |     "es": "Spanish",
 17 |     "pt": "Portuguese",
 18 |     "ar": "Arabic",
 19 |     "hi": "Hindi",
 20 |     "fr": "French",
 21 |     "ru": "Russian",
 22 |     "de": "German",
 23 |     "ja": "Japanese",
 24 |     "it": "Italian",
 25 |     "bn": "Bengali",
 26 |     "uk": "Ukrainian",
 27 |     "th": "Thai",
 28 |     "ur": "Urdu",
 29 |     "ta": "Tamil",
 30 |     "te": "Telugu",
 31 |     "bg": "Bulgarian",
 32 |     "ko": "Korean",
 33 |     "pl": "Polish",
 34 |     "he": "Hebrew",
 35 |     "fa": "Persian",
 36 |     "vi": "Vietnamese",
 37 |     "ne": "Nepali",
 38 |     "sw": "Swahili",
 39 |     "kn": "Kannada",
 40 |     "mr": "Marathi",
 41 |     "gu": "Gujarati",
 42 |     "pa": "Punjabi",
 43 |     "ml": "Malayalam",
 44 |     "fi": "Finnish",
 45 |     })
 46 | 
 47 | _ALPHABETS = "([A-Za-z])"
 48 | _PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
 49 | _SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
 50 | _STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
 51 | _ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
 52 | _WEBSITES = "[.](com|net|org|io|gov|edu|me)"
 53 | _DIGITS = "([0-9])"
 54 | _MULTIPLE_DOTS = r"\.{2,}"
 55 | 
 56 | 
 57 | def split_into_sentences(text):
 58 |   """Split the text into sentences.
 59 | 
 60 |   Args:
 61 |     text: A string that consists of more than or equal to one sentences.
 62 | 
 63 |   Returns:
 64 |     A list of strings where each string is a sentence.
 65 |   """
 66 |   text = " " + text + "  "
 67 |   text = text.replace("\n", " ")
 68 |   text = re.sub(_PREFIXES, "\\1<prd>", text)
 69 |   text = re.sub(_WEBSITES, "<prd>\\1", text)
 70 |   text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
 71 |   text = re.sub(
 72 |       _MULTIPLE_DOTS,
 73 |       lambda match: "<prd>" * len(match.group(0)) + "<stop>",
 74 |       text,
 75 |   )
 76 |   if "Ph.D" in text:
 77 |     text = text.replace("Ph.D.", "Ph<prd>D<prd>")
 78 |   text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
 79 |   text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
 80 |   text = re.sub(
 81 |       _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
 82 |       "\\1<prd>\\2<prd>\\3<prd>",
 83 |       text,
 84 |   )
 85 |   text = re.sub(
 86 |       _ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text
 87 |   )
 88 |   text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
 89 |   text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
 90 |   text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
 91 |   if "”" in text:
 92 |     text = text.replace(".”", "”.")
 93 |   if '"' in text:
 94 |     text = text.replace('."', '".')
 95 |   if "!" in text:
 96 |     text = text.replace('!"', '"!')
 97 |   if "?" in text:
 98 |     text = text.replace('?"', '"?')
 99 |   text = text.replace(".", ".<stop>")
100 |   text = text.replace("?", "?<stop>")
101 |   text = text.replace("!", "!<stop>")
102 |   text = text.replace("<prd>", ".")
103 |   sentences = text.split("<stop>")
104 |   sentences = [s.strip() for s in sentences]
105 |   if sentences and not sentences[-1]:
106 |     sentences = sentences[:-1]
107 |   return sentences
108 | 
109 | 
110 | def count_words(text):
111 |   """Counts the number of words."""
112 |   tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
113 |   tokens = tokenizer.tokenize(text)
114 |   num_words = len(tokens)
115 |   return num_words
116 | 
117 | 
118 | @functools.lru_cache(maxsize=None)
119 | def _get_sentence_tokenizer():
120 |   return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
121 | 
122 | 
123 | def count_sentences(text):
124 |   """Count the number of sentences."""
125 |   tokenizer = _get_sentence_tokenizer()
126 |   tokenized_sentences = tokenizer.tokenize(text)
127 |   return len(tokenized_sentences)
128 | 
129 | 
130 | def generate_keywords(num_keywords):
131 |   """Randomly generates a few keywords."""
132 |   return random.sample(WORD_LIST, k=num_keywords)
133 | 


--------------------------------------------------------------------------------
/FollowRAG/utils/util.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import datetime
 3 | 
 4 | def read_json(read_file_path):
 5 |     with open(read_file_path,encoding='utf-8') as file:
 6 |         raw_data=json.load(file)
 7 |     return raw_data
 8 | 
 9 | def save_json(save_file_path,data,mode='w'):
10 |     with open(save_file_path,mode,encoding='utf-8')as fp:
11 |         json.dump(data,fp,ensure_ascii=False)
12 | 
13 | def read_jsonl(read_file_path):
14 |     data = []
15 |     with open(read_file_path, 'r', encoding='utf-8') as f:
16 |         for line in f:
17 |             data.append(json.loads(line.strip()))
18 |     return data
19 | 
20 | def save_jsonl(save_file_path, data, mode='w'):
21 |     assert mode in ['w','a']
22 |     with open(save_file_path, mode, encoding='utf-8') as f:
23 |         for record in data:
24 |             f.write(json.dumps(record) + '\n')
25 | 
26 | def read_txt(read_file_path):
27 |     try:
28 |         with open(read_file_path, 'r', encoding='utf-8') as file:
29 |             content = file.read() 
30 |         return content
31 |     except FileNotFoundError:
32 |         return f"Error: The file '{read_file_path}' was not found."
33 |     except Exception as e:
34 |         return f"An error occurred: {e}"
35 |     
36 | def append_to_file(filename, content):
37 |     """
38 |     Append the specified string to the txt file and add a line break after the content
39 |     - param filename: filename (including path)
40 |     - param content: The string to be written
41 |     """
42 |     try:
43 |         with open(filename, 'a', encoding='utf-8') as file:
44 |             file.write(content + '\n')
45 |     except Exception as e:
46 |         print(f"Error occurred while appending to file: {e}")
47 | 
48 | def parse_str_to_dict(s):
49 |     s=s.replace("'", '"')
50 |     s_dict=json.loads(s)
51 |     return s_dict
52 | 
53 | def get_current_time():
54 |     current_time = datetime.datetime.now()
55 |     return current_time.strftime("%Y-%m-%d %H:%M")


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 |  # <div align="center">🔥Toward Verifiable Instruction-Following Alignment for Retrieval Augmented Generation<div>
  2 | 
  3 | 
  4 | <div align="center">
  5 | <a><img alt="Static Badge" src="https://img.shields.io/badge/Task-Instruction--Following-red"></a>
  6 | <a href="https://arxiv.org/pdf/2410.09584" target="_blank"><img src=https://img.shields.io/badge/arXiv-b5212f.svg?logo=arxiv></a>
  7 | <a href="https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-110K" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Datasets-27b3b4.svg></a>
  8 | <a href="https://github.com/dongguanting/IF-RAG/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/LICENSE-MIT-green"></a>
  9 | <a><img alt="Static Badge" src="https://img.shields.io/badge/made_with-Python-blue"></a>
 10 | </div>
 11 | 
 12 | 
 13 | 
 14 | We propose a instruction-following alignement pipline named **VIF-RAG framework** and auto-evaluation Benchmark named **FollowRAG**:
 15 | 
 16 | - **IF-RAG:** It is the first automated, scalable, and verifiable data synthesis pipeline for aligning complex instruction-following in RAG scenarios. VIF-RAG integrates a verification process at each step of data augmentation and combination. We begin by manually creating a minimal set of atomic instructions (<100) and then apply steps including instruction composition, quality verification, instruction-query combination, and dual-stage verification to generate a large-scale, high-quality VIF-RAG-QA dataset (>100K). 
 17 | 
 18 | - **FollowRAG:** To address the gap in instruction-following auto-evaluation for RAG systems, we introduce FollowRAG Benchmark, which includes approximately 3K test samples, covering 22 categories of general instruction constraints and 4 knowledge-intensive QA datasets. Due to its robust pipeline design, FollowRAG can seamlessly integrate with different RAG benchmarks
 19 | 
 20 | 
 21 | 
 22 | 
 23 | <p align="center">
 24 | 🤖️ <a href="https://followrag.github.io/" target="_blank">Website</a> • 🤗 <a href="https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-110K" target="_blank">VIF-RAG-QA-110K</a> • 👉 <a href="https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-20K" target="_blank">VIF-RAG-QA-20K</a> • 📖 <a href="https://arxiv.org/abs/2410.09584" target="_blank">Arxiv</a> • 🤗 <a href="https://huggingface.co/papers/2410.09584" target="_blank">HF-Paper</a> <br>
 25 | </p>
 26 | 
 27 | ---
 28 | 
 29 | ## 💥 News
 30 | - [12/2024] 🔥 Our paper has been accepted by **AAAI 2025**. 
 31 | 
 32 | - [10/2024] 🔥 We released an our SFT datasets named VIF-RAG-QA for deployments. Download [VIF-RAG-QA-110K](https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-110K) and [VIF-RAG-QA-20K](https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-20K) here. 
 33 | 
 34 | - [10/2024] 🔥 We released our instruction-following auto-evaluation benchmark named ***FollowRAG***. Please follow [outlines](#FollowRAG) for evaluation.
 35 | 
 36 | - [10/2024] 🔥 We introduced ***VIF-RAG***, the first automated, scalable, and verifiable data synthesis pipeline for aligning complex instruction-following in RAG scenarios. Check out the [paper](https://arxiv.org/abs/2410.09584). 
 37 | 
 38 | ---
 39 | 
 40 | ## Outlines
 41 | - [News](https://github.com/dongguanting/FollowRAG/blob/main/README.md#-news)
 42 | - [VIF-RAG](https://github.com/dongguanting/FollowRAG/blob/main/README.md#-vif-rag-)
 43 | - [FollowRAG](https://github.com/dongguanting/FollowRAG/blob/main/README.md#-followrag-)
 44 | - [Citation](https://github.com/dongguanting/FollowRAG/blob/main/README.md#-citation-)
 45 | 
 46 | 
 47 | ---
 48 | 
 49 | ## 🌠 VIF-RAG
 50 | 
 51 | 
 52 | We broke down the VIF-RAG data synthesis process into steps and provided 10-20 samples for each step to assist with your reproduction. Be sure to replace these with your own input.
 53 | 
 54 | <img width="1243" alt="image" src="https://github.com/user-attachments/assets/d38871d3-d29d-425b-a7d5-d8a7081a110d">
 55 | 
 56 | 
 57 | 
 58 | ### :wrench: Dependencies
 59 | General Setup Environment:
 60 | - Python 3.9
 61 | - [PyTorch](http://pytorch.org/) (currently tested on version 2.1.2+cu121)
 62 | - [Transformers](http://huggingface.co/transformers/) (version 4.41.2, unlikely to work lower than this version)
 63 | 
 64 | ```bash
 65 | cd ./VIF-RAG/
 66 | pip install -r requirements.txt
 67 | cd ./FollowRAG/
 68 | pip install -r requirements.txt
 69 | ```
 70 | 
 71 | ### :rocket: How to Perform *VIF-RAG* Data Synthesis?
 72 | 
 73 | 
 74 | Follow the interactive Jupyter notebook VIF-RAG on ``vifrag.ipynb`` to reproduce our synthesize dataset.
 75 | 
 76 | 
 77 | ### 🎯 Training
 78 | 
 79 | We use the version of [LlaMA-Factory v0.6.3](https://github.com/hiyouga/LLaMA-Factory/releases/tag/v0.6.3). Thanks for their excellent work.
 80 | 
 81 | we also release our SFT version dataset as strong baseline in Table1:
 82 | - **SFT Version:** To make a fair comparison with VIF-RAG, we use the same amount of [🤗ShareGPT](https://huggingface.co/datasets/dongguanting/ShareGPT-12K) and [🤗RAG-QA-40K](https://huggingface.co/datasets/dongguanting/RAG-QA-40K) as in VIF-RAG’s data synthesis process, mixing them together to fine-tune (SFT) different baseline models.
 83 | 
 84 | - **VIF-RAG-QA:** We release our SFT datasets, including [🤗VIF-RAG-QA-110K](https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-110K) and [🤗VIF-RAG-QA-20K](https://huggingface.co/datasets/dongguanting/VIF-RAG-QA-20K).
 85 | 
 86 | 
 87 | - **SFT bash:**
 88 |   
 89 | ```bash
 90 | deepspeed --num_gpus=8 train_bash.py \
 91 |         --deepspeed $deepspeed_zero3_config_path \
 92 |         --stage sft \
 93 |         --do_train \
 94 |         --use_fast_tokenizer \
 95 |         --flash_attn \
 96 |         --adam_beta1 0.9 \
 97 |         --adam_beta2 0.95 \
 98 |         --model_name_or_path $MODEL_PATH \
 99 |         --dataset $dataset \
100 |         --template $Template \
101 |         --finetuning_type full \
102 |         --output_dir $OUTPUT_PATH \
103 |         --overwrite_cache \
104 |         --overwrite_output_dir \
105 |         --warmup_steps 20 \
106 |         --weight_decay 0.1 \
107 |         --per_device_train_batch_size 4 \
108 |         --gradient_accumulation_steps 4 \
109 |         --ddp_timeout 9000 \
110 |         --learning_rate 7e-6 \
111 |         --lr_scheduler_type "linear" \
112 |         --logging_steps 1 \
113 |         --cutoff_len 8192 \
114 |         --save_steps 200 \
115 |         --num_train_epochs 3.0 \
116 |         --plot_loss \
117 |         --bf16 
118 | ```
119 | 
120 | ---
121 | 
122 | ## 🐋 FollowRAG 
123 | 
124 | FollowRAG is the first benchmark designed to comprehensively evaluate LLM’s complex instruction-following abilities in RAG. 
125 | 
126 | <img width="1070" alt="image" src="https://github.com/user-attachments/assets/91a5e7ac-d828-46f2-bcae-96886f7ef295">
127 | 
128 | 
129 | ### 📊 Test Cases
130 | 
131 | <details>
132 | <summary>🔍 Click here! if you are curious about FollowRAG‘s test cases.</summary>
133 | 
134 | **Key-Value Introduction:**
135 | 
136 | - **prompt:** The complete question for FollowRAG, including three parts: TopK Document + user query + instruction
137 | - **question:** QA question (sourced from NQ)
138 | - **answer_gold:** Reference answer (note that this is not the golden answer, as the answer needs to follow instruction constraints after adding instructions)
139 | - **question_with_instrs:** QA question + a series of instruction constraints
140 | - **instruction_id_list & kwargs:** Instruction types and parameters needed for evaluation calculation
141 | - **passages:** TopK documents retrieved from Wiki using DPR
142 | 
143 | 
144 | 
145 | ```json
146 | 
147 |     {
148 |         "key": 0,
149 |         "type": "ifnq",
150 |         "prompt": "Given the following information: \nPassage-0 Title: Gravity Content: and prevents further acceleration. The force of gravity on Earth is the resultant (vector sum) of two forces: (a) The gravitational attraction in accordance with Newton's universal law of gravitation, and (b) the centrifugal force, which results from the choice of an earthbound, rotating frame of reference. The force of gravity is the weakest at the equator because of the centrifugal force caused by the Earth's rotation and because points on the equator are furthest from the center of the Earth. The force of gravity varies with latitude and increases from about 9.780 m/s at the Equator to about 9.832\nPassage-1 Title: Gravitational acceleration Content: Gravitational acceleration In physics, gravitational acceleration is the acceleration on an object caused by the force of gravitation. Neglecting friction such as air resistance, all small bodies accelerate in a gravitational field at the same rate relative to the center of mass. This equality is true regardless of the masses or compositions of the bodies. At different points on Earth, objects fall with an acceleration between and depending on altitude and latitude, with a conventional standard value of exactly 9.80665 m/s (approximately 32.174 ft/s). This does not take into account other effects, such as buoyancy or drag. Newton's law of\nPassage-2 Title: Gravity Content: Gravity Gravity (), or gravitation, is a natural phenomenon by which all things with mass or energy—including planets, stars, galaxies, and even light—are brought toward (or \"gravitate\" toward) one another. On Earth, gravity gives weight to physical objects, and the Moon's gravity causes the ocean tides. The gravitational attraction of the original gaseous matter present in the Universe caused it to begin coalescing, forming starsand for the stars to group together into galaxiesso gravity is responsible for many of the large-scale structures in the Universe. Gravity has an infinite range, although its effects become increasingly weaker on farther objects. Gravity\n\nAnswer the following question based on the given information or your internal knowledge with one or few words without the source.\nQuestion: What is the common name for gravitational force? In this task, repeat the exact request first, then give your response. Do not say any word before repeating the exact request. Moreover, your answer must contain a title, wrapped in double angular brackets, i.e. <<title>>. Ensure the word disappointed appears at least twice. Finally, provide your answer with less than 200 words.",
151 |         "question": "what is the common name for gravitational force",
152 |         "answer_gold": "Gravity/Gravity, or gravitation",
153 |         "question_with_instrs": "What is the common name for gravitational force? In this task, repeat the exact request first, then give your response. Do not say any word before repeating the exact request. Moreover, your answer must contain a title, wrapped in double angular brackets, i.e. <<title>>. Ensure the word disappointed appears at least twice. Finally, provide your answer with less than 200 words.",
154 |         "instruction_id_list": [
155 |             "combination:repeat_prompt",
156 |             "detectable_format:title",
157 |             "keywords:frequency",
158 |             "length_constraints:number_words"
159 |         ],
160 |         "kwargs": [
161 |             {
162 |                 "prompt_to_repeat": "What is the common name for gravitational force?"
163 |             },
164 |             {},
165 |             {
166 |                 "relation": "at least",
167 |                 "keyword": "disappointed",
168 |                 "frequency": 2
169 |             },
170 |             {
171 |                 "relation": "less than",
172 |                 "num_words": 200
173 |             }
174 |         ],
175 |         "passages": [
176 |             {
177 |                 "title": "Gravity",
178 |                 "content": "and prevents further acceleration. The force of gravity on Earth is the resultant (vector sum) of two forces: (a) The gravitational attraction in accordance with Newton's universal law of gravitation, and (b) the centrifugal force, which results from the choice of an earthbound, rotating frame of reference. The force of gravity is the weakest at the equator because of the centrifugal force caused by the Earth's rotation and because points on the equator are furthest from the center of the Earth. The force of gravity varies with latitude and increases from about 9.780 m/s at the Equator to about 9.832"
179 |             },
180 |             {
181 |                 "title": "Gravitational acceleration",
182 |                 "content": "Gravitational acceleration In physics, gravitational acceleration is the acceleration on an object caused by the force of gravitation. Neglecting friction such as air resistance, all small bodies accelerate in a gravitational field at the same rate relative to the center of mass. This equality is true regardless of the masses or compositions of the bodies. At different points on Earth, objects fall with an acceleration between and depending on altitude and latitude, with a conventional standard value of exactly 9.80665 m/s (approximately 32.174 ft/s). This does not take into account other effects, such as buoyancy or drag. Newton's law of"
183 |             },
184 |             {
185 |                 "title": "Gravity",
186 |                 "content": "Gravity Gravity (), or gravitation, is a natural phenomenon by which all things with mass or energy—including planets, stars, galaxies, and even light—are brought toward (or \"gravitate\" toward) one another. On Earth, gravity gives weight to physical objects, and the Moon's gravity causes the ocean tides. The gravitational attraction of the original gaseous matter present in the Universe caused it to begin coalescing, forming starsand for the stars to group together into galaxiesso gravity is responsible for many of the large-scale structures in the Universe. Gravity has an infinite range, although its effects become increasingly weaker on farther objects. Gravity"
187 |             }
188 |         ]
189 |     }
190 | ```
191 | </details>
192 | 
193 | 
194 | 
195 | 
196 | ### 🔑 Inference
197 | You first need to perform inference on followRAG, and the pseudocode is as follows:
198 | ```python
199 | followRAG_full=load_json('followRAG/followRAG_full.json')
200 | data_inferenced=[]
201 | for dp in followRAG_full:
202 |     response=llm.inference(dp['prompt'])
203 |     dp['response']=response
204 |     data_inferenced.append(dp)
205 | save_jsonl(data_inferenced,'results/finish_inference/data_inferenced.jsonl')
206 | ```
207 | Please refer to the following template to prepare your result JSON files for subsequent evaluation. 
208 | The format of each sample in your data_inferenced.jsonl should be consistent with the following form:
209 | ```json
210 | 
211 | {
212 |     "key": 0,
213 |     "type": "ifnq",
214 |     "prompt": "Given the following information: \nPassage-0 Title: Gravity Content: and prevents further acceleration. The force of gravity on Earth is the resultant (vector sum) of two forces: (a) The gravitational attraction in accordance with Newton's universal law of gravitation, and (b) the centrifugal force, which results from the choice of an earthbound, rotating frame of reference. The force of gravity is the weakest at the equator because of the centrifugal force caused by the Earth's rotation and because points on the equator are furthest from the center of the Earth. The force of gravity varies with latitude and increases from about 9.780 m/s at the Equator to about 9.832\nPassage-1 Title: Gravitational acceleration Content: Gravitational acceleration In physics, gravitational acceleration is the acceleration on an object caused by the force of gravitation. Neglecting friction such as air resistance, all small bodies accelerate in a gravitational field at the same rate relative to the center of mass. This equality is true regardless of the masses or compositions of the bodies. At different points on Earth, objects fall with an acceleration between and depending on altitude and latitude, with a conventional standard value of exactly 9.80665 m/s (approximately 32.174 ft/s). This does not take into account other effects, such as buoyancy or drag. Newton's law of\nPassage-2 Title: Gravity Content: Gravity Gravity (), or gravitation, is a natural phenomenon by which all things with mass or energy—including planets, stars, galaxies, and even light—are brought toward (or \"gravitate\" toward) one another. On Earth, gravity gives weight to physical objects, and the Moon's gravity causes the ocean tides. The gravitational attraction of the original gaseous matter present in the Universe caused it to begin coalescing, forming starsand for the stars to group together into galaxiesso gravity is responsible for many of the large-scale structures in the Universe. Gravity has an infinite range, although its effects become increasingly weaker on farther objects. Gravity\n\nAnswer the following question based on the given information or your internal knowledge with one or few words without the source.\nQuestion: What is the common name for gravitational force? In this task, repeat the exact request first, then give your response. Do not say any word before repeating the exact request. Moreover, your answer must contain a title, wrapped in double angular brackets, i.e. <<title>>. Ensure the word disappointed appears at least twice. Finally, provide your answer with less than 200 words.",
215 |     "question": "what is the common name for gravitational force",
216 |     "answer_gold": "Gravity/Gravity, or gravitation",
217 |     "question_with_instrs": "What is the common name for gravitational force? In this task, repeat the exact request first, then give your response. Do not say any word before repeating the exact request. Moreover, your answer must contain a title, wrapped in double angular brackets, i.e. <<title>>. Ensure the word disappointed appears at least twice. Finally, provide your answer with less than 200 words.",
218 |     "instruction_id_list": [
219 |         "combination:repeat_prompt",
220 |         "detectable_format:title",
221 |         "keywords:frequency",
222 |         "length_constraints:number_words"
223 |     ],
224 |     "kwargs": [
225 |         {
226 |             "prompt_to_repeat": "What is the common name for gravitational force?"
227 |         },
228 |         {
229 | 
230 |         },
231 |         {
232 |             "relation": "at least",
233 |             "keyword": "disappointed",
234 |             "frequency": 2
235 |         },
236 |         {
237 |             "relation": "less than",
238 |             "num_words": 200
239 |         }
240 |     ],
241 |     "passages": [
242 |         {
243 |             "title": "Gravity",
244 |             "content": "and prevents further acceleration. The force of gravity on Earth is the resultant (vector sum) of two forces: (a) The gravitational attraction in accordance with Newton's universal law of gravitation, and (b) the centrifugal force, which results from the choice of an earthbound, rotating frame of reference. The force of gravity is the weakest at the equator because of the centrifugal force caused by the Earth's rotation and because points on the equator are furthest from the center of the Earth. The force of gravity varies with latitude and increases from about 9.780 m/s at the Equator to about 9.832"
245 |         },
246 |         {
247 |             "title": "Gravitational acceleration",
248 |             "content": "Gravitational acceleration In physics, gravitational acceleration is the acceleration on an object caused by the force of gravitation. Neglecting friction such as air resistance, all small bodies accelerate in a gravitational field at the same rate relative to the center of mass. This equality is true regardless of the masses or compositions of the bodies. At different points on Earth, objects fall with an acceleration between and depending on altitude and latitude, with a conventional standard value of exactly 9.80665 m/s (approximately 32.174 ft/s). This does not take into account other effects, such as buoyancy or drag. Newton's law of"
249 |         },
250 |         {
251 |             "title": "Gravity",
252 |             "content": "Gravity Gravity (), or gravitation, is a natural phenomenon by which all things with mass or energy—including planets, stars, galaxies, and even light—are brought toward (or \"gravitate\" toward) one another. On Earth, gravity gives weight to physical objects, and the Moon's gravity causes the ocean tides. The gravitational attraction of the original gaseous matter present in the Universe caused it to begin coalescing, forming starsand for the stars to group together into galaxiesso gravity is responsible for many of the large-scale structures in the Universe. Gravity has an infinite range, although its effects become increasingly weaker on farther objects. Gravity"
253 |         }
254 |     ],
255 |     "response": "What is the common name for gravitational force? \n\n<<Gravity>>\n\nThe common name for gravitational force is \"gravity.\" It is often disappointing to realize how misunderstood this fundamental force can be, as people frequently overlook its complexities. Many assume that gravity acts uniformly across the globe, but the actual gravitational acceleration varies depending on location—grounded by factors such as latitude and altitude. \n\nFor instance, a commonly cited value is 9.80665 m/s², yet we can be disappointed to discover that at the equator, gravity is slightly weaker due to centrifugal forces caused by the Earth's rotation. This leads to significant implications in both physics and everyday life, yet many remain uninformed about these nuances. \n\nUnderstanding gravity is essential not only for grasping the movement of objects on Earth but also for comprehending cosmic phenomena. It is disappointing when people take gravity for granted, missing the fascinating science behind how it governs numerous aspects of our universe."
256 | }
257 | 
258 | ```
259 | 
260 | 
261 | ### 📝 Evaluation
262 | After completing the inference, run the evaluation script:
263 | ```bash
264 | export OPENAI_API_KEY=your_openai_api_key
265 | export OPENAI_API_BASE=https://api.openai.com/v1
266 | python eval/main_eval.py \
267 |     --input_file_path results/finish_inference/data_inferenced.jsonl \
268 |     --output_file_path results/finish_eval/data_evaled.jsonl \
269 |     --rag_eval_type mini \
270 |     --result_log_file_path results/logs/results_log.jsonl
271 | ```
272 | Our evaluation of instruction-following part largely draws on the [IFEval code repository](https://github.com/google-research/google-research/tree/master/instruction_following_eval). We appreciate their excellent work!
273 | 
274 | 
275 | ## 📜 License
276 | 
277 | Our dataset are distributed under the [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) license.
278 | 
279 | 
280 | 
281 | ## 🎖 Citation 
282 | 
283 | Please cite our work if you find the repository helpful.
284 | 
285 | ```
286 | @article{dong2024general,
287 |   author       = {Guanting Dong and
288 |                   Xiaoshuai Song and
289 |                   Yutao Zhu and
290 |                   Runqi Qiao and
291 |                   Zhicheng Dou and
292 |                   Ji{-}Rong Wen},
293 |   title        = {Toward General Instruction-Following Alignment for Retrieval-Augmented
294 |                   Generation},
295 |   journal      = {CoRR},
296 |   volume       = {abs/2410.09584},
297 |   year         = {2024},
298 |   url          = {https://doi.org/10.48550/arXiv.2410.09584},
299 |   doi          = {10.48550/ARXIV.2410.09584},
300 |   eprinttype    = {arXiv},
301 |   eprint       = {2410.09584},
302 |   timestamp    = {Fri, 22 Nov 2024 21:38:25 +0100},
303 |   biburl       = {https://dblp.org/rec/journals/corr/abs-2410-09584.bib},
304 |   bibsource    = {dblp computer science bibliography, https://dblp.org}
305 | }
306 | ```
307 | 
308 | 
309 | 


--------------------------------------------------------------------------------
/VIF-RAG/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/VIF-RAG/requirements.txt:
--------------------------------------------------------------------------------
  1 | accelerate==0.31.0
  2 | addict==2.4.0
  3 | aiofiles==23.2.1
  4 | aiohttp==3.9.5
  5 | aiosignal==1.3.1
  6 | aliyun-python-sdk-core==2.15.1
  7 | aliyun-python-sdk-kms==2.16.3
  8 | altair==5.3.0
  9 | annotated-types==0.7.0
 10 | anyio==4.4.0
 11 | async-timeout==4.0.3
 12 | attrs==23.2.0
 13 | bitsandbytes==0.43.1
 14 | certifi==2022.12.7
 15 | cffi==1.16.0
 16 | charset-normalizer==2.1.1
 17 | click==8.1.7
 18 | cloudpickle==3.0.0
 19 | cmake==3.29.3
 20 | contourpy==1.2.1
 21 | crcmod==1.7
 22 | cryptography==42.0.8
 23 | cycler==0.12.1
 24 | datasets==2.18.0
 25 | deepspeed==0.13.0
 26 | dill==0.3.8
 27 | diskcache==5.6.3
 28 | distro==1.9.0
 29 | dnspython==2.6.1
 30 | docstring_parser==0.16
 31 | einops==0.8.0
 32 | email_validator==2.1.1
 33 | evaluate==0.4.2
 34 | exceptiongroup==1.2.1
 35 | fast_jl==0.1.3
 36 | fastapi==0.111.0
 37 | fastapi-cli==0.0.4
 38 | ffmpy==0.3.2
 39 | filelock==3.13.1
 40 | fire==0.6.0
 41 | flash-attn==2.5.8
 42 | fonttools==4.53.0
 43 | frozenlist==1.4.1
 44 | fsspec==2024.2.0
 45 | gast==0.5.4
 46 | gradio==4.36.1
 47 | gradio_client==1.0.1
 48 | h11==0.14.0
 49 | hjson==3.1.0
 50 | httpcore==1.0.5
 51 | httptools==0.6.1
 52 | httpx==0.27.0
 53 | huggingface-hub==0.23.3
 54 | idna==3.4
 55 | importlib_metadata==7.1.0
 56 | importlib_resources==6.4.0
 57 | interegular==0.3.3
 58 | jieba==0.42.1
 59 | Jinja2==3.1.3
 60 | jmespath==0.10.0
 61 | joblib==1.4.2
 62 | jsonlines==4.0.0
 63 | jsonschema==4.22.0
 64 | jsonschema-specifications==2023.12.1
 65 | kiwisolver==1.4.5
 66 | lark==1.1.9
 67 | linkify-it-py==2.0.3
 68 | llvmlite==0.42.0
 69 | markdown-it-py==2.2.0
 70 | MarkupSafe==2.1.5
 71 | matplotlib==3.9.0
 72 | mdit-py-plugins==0.3.3
 73 | mdurl==0.1.2
 74 | modelscope==1.15.0
 75 | mpi4py @ file:///croot/mpi4py_1671223370575/work
 76 | mpmath==1.3.0
 77 | msgpack==1.0.8
 78 | multidict==6.0.5
 79 | multiprocess==0.70.16
 80 | nest-asyncio==1.6.0
 81 | networkx==3.2.1
 82 | ninja==1.11.1.1
 83 | nltk==3.8.1
 84 | numba==0.59.1
 85 | numpy==1.26.3
 86 | openai==1.33.0
 87 | orjson==3.10.3
 88 | oss2==2.18.5
 89 | outlines==0.0.34
 90 | packaging==24.0
 91 | pandas==2.2.2
 92 | peft==0.11.1
 93 | pillow==10.2.0
 94 | platformdirs==4.2.2
 95 | prometheus_client==0.20.0
 96 | protobuf==5.27.1
 97 | psutil==5.9.8
 98 | py-cpuinfo==9.0.0
 99 | pyarrow==16.1.0
100 | pyarrow-hotfix==0.6
101 | pycparser==2.22
102 | pycryptodome==3.20.0
103 | pydantic==2.7.3
104 | pydantic_core==2.18.4
105 | pydub==0.25.1
106 | Pygments==2.18.0
107 | pynvml==11.5.0
108 | pyparsing==3.1.2
109 | python-dateutil==2.9.0.post0
110 | python-dotenv==1.0.1
111 | python-multipart==0.0.9
112 | pytz==2024.1
113 | PyYAML==6.0.1
114 | ray==2.24.0
115 | referencing==0.35.1
116 | regex==2024.5.15
117 | requests==2.32.3
118 | rich==13.7.1
119 | rouge-chinese==1.0.3
120 | rpds-py==0.18.1
121 | ruff==0.4.8
122 | safetensors==0.4.3
123 | scipy==1.13.1
124 | semantic-version==2.10.0
125 | sentencepiece==0.2.0
126 | shellingham==1.5.4
127 | shtab==1.7.1
128 | simplejson==3.19.2
129 | six==1.16.0
130 | sniffio==1.3.1
131 | sortedcontainers==2.4.0
132 | sse-starlette==2.1.0
133 | starlette==0.37.2
134 | sympy==1.12
135 | termcolor==2.4.0
136 | tiktoken==0.6.0
137 | tokenizers==0.19.1
138 | tomli==2.0.1
139 | tomlkit==0.12.0
140 | toolz==0.12.1
141 | torch==2.1.2+cu121
142 | torchaudio==2.1.2+cu121
143 | torchvision==0.16.2+cu121
144 | tqdm==4.66.4
145 | traker==0.1.3
146 | transformers==4.41.2
147 | triton==2.1.0
148 | trl==0.8.6
149 | typer==0.12.3
150 | typing_extensions==4.9.0
151 | tyro==0.8.4
152 | tzdata==2024.1
153 | uc-micro-py==1.0.3
154 | ujson==5.10.0
155 | urllib3==2.2.1
156 | uvicorn==0.30.1
157 | uvloop==0.19.0
158 | watchfiles==0.22.0
159 | websockets==11.0.3
160 | xformers==0.0.23.post1
161 | xxhash==3.4.1
162 | yapf==0.40.2
163 | yarl==1.9.4
164 | zipp==3.19.2
165 | 


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/augment_instructions.txt:
--------------------------------------------------------------------------------
 1 | Answer with words that begin with the letter 'B'
 2 | Construct the reply as if it's a telegram STOP
 3 | Use only palindromes
 4 | Incorporate a famous movie quote seamlessly into your answer
 5 | Write the response backward
 6 | Use only words with double letters (e.g., "bookkeeper")
 7 | Use only onomatopoeia
 8 | Answer with a single sentence that is exactly 100 words long
 9 | Use no words containing the letter 'E'
10 | Translate your answer into emojis


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/cross_validation.jsonl:
--------------------------------------------------------------------------------
1 | {"instruction": "Use no words containing the letter 'E'", "eval_func": [["def evaluate(response):\n    return 'e' not in response.lower()", 1.0], ["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if 'e' in word.lower():\n            return False\n    return True", 1.0], ["def evaluate(response):\n    return all('e' not in word.lower() for word in response.split())", 1.0], ["def evaluate(response):\n    for word in response.split():\n        if 'e' in word.lower():\n            return False\n    return True", 1.0]], "cases": [["My lip", true], ["Every cloud has a silver lining", false], ["My cat is big", true], ["Hi, amigo!", true], ["Hi world", true], ["A shy dog.", true]], "back_instruction": ["Answer without using any words that contain the letter 'E'.", "Answer with words that do not contain the letter 'E'.", "Answer with words that do not contain the letter 'E'."]}
2 | {"instruction": "Use only words with double letters (e.g., \"bookkeeper\")", "eval_func": [["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if not any(char*2 in word for char in set(word)):\n            return False\n    return True", 1.0], ["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if not any(c1 == c2 for c1, c2 in zip(word, word[1:])):\n            return False\n    return True", 1.0], ["def evaluate(response):\n    def has_double_letters(word):\n        for i in range(len(word) - 1):\n            if word[i] == word[i + 1]:\n                return True\n        return False\n    words = response.split()\n    for word in words:\n        if not has_double_letters(word):\n            return False\n    return True", 1.0], ["def evaluate(response):\n    def has_double_letters(word):\n        for i in range(len(word) - 1):\n            if word[i] == word[i + 1]:\n                return True\n        return False\n\n    words = response.split()\n    for word in words:\n        if not has_double_letters(word):\n            return False\n    return True", 1.0], ["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if not any(char * 2 in word for char in set(word)):\n            return False\n    return True", 1.0], ["def evaluate(response): \n    words = response.split() \n    for word in words: \n        if not any(char * 2 in word for char in set(word)): \n            return False \n    return True", 1.0], ["def evaluate(response):\n  words = response.split()\n  for word in words:\n    if not any(c1 == c2 for c1, c2 in zip(word, word[1:])):\n      return False\n  return True", 1.0]], "cases": [["Happy birthday to you!", false], ["All good men", false], ["bookkeeper needed immediately", true], ["happy balloon", true], ["The moon looked stunning.", false], ["bookkeeper unnecessary", true], ["add bee", true], ["bookkeeper happy coffee", true], ["bassoon committee", true], ["happy apple tree", true], ["bassoon addressee", true], ["an apple a day", false], ["The quick brown fox", false], ["The bookkeeper arranged the letters", false], ["address assessment", true], ["every letter", false], ["bookkeeper address", true], ["bookkeeper addressed", true], ["happy feet", true]], "back_instruction": ["Answer with words where every word has at least one letter repeated consecutively.", "Answer with words that contain at least one pair of consecutive letters that are the same.", "Ensure every word in your answer contains at least one instance of the same letter occurring back-to-back."]}
3 | {"instruction": "Use only palindromes", "eval_func": [["def evaluate(response):\n    def is_palindrome(s):\n        return s == s[::-1]\n\n    words = response.split()\n    return all(is_palindrome(word) for word in words)", 1.0], ["def evaluate(response):\n    def is_palindrome(s):\n        return s == s[::-1]\n\n    words = response.split()\n    for word in words:\n        if not is_palindrome(word):\n            return False\n    return True", 1.0], ["def evaluate(response):\n    def is_palindrome(s):\n        return s == s[::-1]\n    words = response.split()\n    return all(is_palindrome(word) for word in words)", 1.0]], "cases": [["madam racecar", true], ["deed deed", true], ["Python", false], ["civic level radar", true], ["python", false], ["madam civic rotator", true], ["level kayak radar", true], ["hello world", false], ["level racecar kayak", true], ["level civic radar", true], ["racecar", true], ["palindrome", false], ["madam anna kayak", true], ["madam civic racecar", true], ["racecar level civic", true], ["palindrome racecar", false]], "back_instruction": ["Answer with words that are palindromes.", "Answer with words that are palindromes.", "Answer with words that are palindromes."]}
4 | {"instruction": "Use words that end with '-ing'", "eval_func": [["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if not word.endswith('-ing'):\n            return False\n    return True", 0.9], ["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if not word.endswith('-ing'):\n            return False\n    return len(words) > 0", 1.0], ["def evaluate(response):\n    # Split the response into words to evaluate each one\n    words = response.split()\n    # Check if all words end with '-ing'\n    for word in words:\n        if not word.endswith('-ing'):\n            return False\n    return True", 0.9], ["def evaluate(response):\n    words = response.split()\n    for word in words:\n        if not word.endswith('-ing'):\n            return False\n    return True if words else False", 1.0]], "cases": [["Running jumping laughing", false], ["", false], ["I am enjoying the thrilling coding session.", false], ["Walking on planning meeting.", false], ["I am looking for something interesting", false], ["I am running and jumping", false], ["Running but stopped", false], ["This is a test", false], ["She is walking quickly.", false], ["Reading a book", false]], "back_instruction": ["Answer with all words ending in '-ing'.", "Answer with all words ending in '-ing', ensuring at least one word is present.", "Answer with all words ending in '-ing'."]}
5 | {"instruction": "Answer with words in alphabetical order", "eval_func": [["def evaluate(response):\n    words = response.split()\n    for i in range(len(words) - 1):\n        if words[i] > words[i + 1]:\n            return False\n    return True", 1.0], ["def evaluate(response):\n    words = response.split()\n    sorted_words = sorted(words, key=str.lower)\n    return words == sorted_words", 1.0], ["def evaluate(response):\n    words = response.split()\n    lowercase_words = [word.lower() for word in words]\n    sorted_words = sorted(lowercase_words)\n    return lowercase_words == sorted_words", 1.0], ["def evaluate(response):\n    words = response.split()\n    sorted_words = sorted(words)\n    return words == sorted_words", 1.0]], "cases": [["apple banana mango", true], ["The quick brown fox", false], ["a bee comes dandelion every flower garden", true], ["all bees can do", true], ["apple banana cherry", true], ["zoo goes by", false], ["a beautiful calm day", true], ["an apple grows on trees", true], ["banana apple cherry", false], ["coming dawn breaks", false], ["ant bee cat dog", true], ["a b c", true], ["banana apple", false], ["beautiful flowers bloom", false], ["an elephant jumps kindly", true]], "back_instruction": ["Answer with words in ascending alphabetical order.", "Answer with words in ascending alphabetical order, considering case-insensitive sorting.", "Answer with words in ascending alphabetical order, explicitly considering case-insensitive sorting."]}
6 | 


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/eval_func_rft.jsonl:
--------------------------------------------------------------------------------
1 | {"prompt": "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\nHere is the instruction: Respond with a series of mathematical equations that form a coherent answer. Answer by creating a recipe with each ingredient starting with the next letter of the alphabet\nPlease write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. If it follows, simply return True, otherwise return False.\nPlease response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, which includes an input in the key `input` and an expected output in the key `output` in (true, false).\nHere is an example of output JSON format: {\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{\"input\": str, \"output\": str}]}.", "instruction": "Respond with a series of mathematical equations that form a coherent answer. Answer by creating a recipe with each ingredient starting with the next letter of the alphabet", "gpt-answer": ["```json\n{\n  \"func\": \"def evaluate(response):\\n    # Ensure the response is a string\\n    if not isinstance(response, str):\\n        return False\\n\\n    # Split response into lines\\n    lines = response.strip().split('\\\\n')\\n    if len(lines) < 2:\\n        return False\\n\\n    current_char = 'A'\\n\\n    for line in lines:\\n        # Verify the line starts with an equation involving the ingredient\\n        parts = line.split(':')\\n        if len(parts) != 2:\\n            return False\\n\\n        equation, ingredient = parts[0].strip(), parts[1].strip()\\n\\n        # Check if the first letter of the ingredient matches the expected alphabet letter\\n        if not ingredient or ingredient[0].upper() != current_char:\\n            return False\\n\\n        try:\\n            # Attempt to evaluate the equation part\\n            eval_result = eval(equation)\\n            if not isinstance(eval_result, (int, float)):\\n                return False\\n        except:\\n            return False\\n\\n        # Move to the next character in the alphabet\\n        current_char = chr(ord(current_char) + 1)\\n        if current_char > 'Z':\\n            current_char = 'A'\\n\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"2 + 2 : Apple\\\\n3 * 3 : Banana\\\\n4 / 2 : Carrot\",\n      \"output\": true\n    },\n    {\n      \"input\": \"5 - 2 : Eggplant\\\\n6 ^ 2 : Date\",\n      \"output\": false\n    },\n    {\n      \"input\": \"10 / 5 : Zucchini\\\\n11 * 0 : Artichoke\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response:\\n        return False\\n\\n    equations = response.split('. ')\\n    if not all(equation.strip().endswith('=') for equation in equations if equation):\\n        return False\\n\\n    last_char = None\\n    for equation in equations:\\n        if '=' not in equation:\\n            continue\\n        parts = equation.split(' ')\\n        for part in parts:\\n            if part.isalpha():\\n                if last_char is None:\\n                    last_char = part.lower()\\n                else:\\n                    expected_char = chr(ord(last_char) + 1)\\n                    if part.lower() != expected_char:\\n                        return False\\n                    last_char = part.lower()\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"Apple = 2 units. Banana = 3 units. Carrot = 5 units.\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"Almonds = 100g. Bread = 2 slices. Cake = 1 piece.\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"Chocolate = 10 bars. Apples = 5 units.\",\n      \"output\": \"False\"\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response:\\n        return False\\n\\n    lines = response.strip().split('\\\\n')\\n    if not lines or not all(line.strip() for line in lines):\\n        return False\\n\\n    acceptable_operators = {'+', '-', '*', '/', '=', ' '}\\n    if any(not all(c.isalnum() or c in acceptable_operators for c in line.replace('.', '', 1)) for line in lines):\\n        return False\\n\\n    current_char = 'A'\\n    for line in lines:\\n        if not line.strip().startswith(current_char):\\n            return False\\n        current_char = chr(ord(current_char) + 1)\\n        if current_char > 'Z':\\n            break\\n\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"Apple = 1\\nBanana = 2\\nCarrot = 3\\n\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Apple + 1\\nBanana = 2 - 1\\nCarrot * 3 = 9\\n\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Banana = 2\\nCarrot = 3\\nApple = 1\\n\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response:\\n        return False\\n    lines = response.split('\\\\n')\\n    if not all(line.strip().endswith('=') for line in lines if line.strip()):\\n        return False\\n    ingredients = []\\n    last_char = ''\\n    for line in lines:\\n        if line.strip() and not line.strip().endswith('='):\\n            return False\\n        parts = line.split('=')\\n        if len(parts) != 2:\\n            continue\\n        left, right = parts\\n        ingredient = left.strip()\\n        if not ingredient:\\n            continue\\n        if last_char and (ord(ingredient[0].lower()) - ord(last_char.lower())) != 1:\\n            return False\\n        last_char = ingredient[0]\\n        ingredients.append(ingredient)\\n    if not ingredients:\\n        return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Apple = 1kg\\\\nBread = 500g\\\\nCucumber = 250g\\\\n\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Banana = 2\\\\nApple = 3\\\\nCarrot = 1\\\\n\",\n      \"output\": false\n    },\n    {\n      \"input\": \"\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response:\\n        return False\\n    lines = response.split('\\\\n')\\n    if not lines:\\n        return False\\n    # Check if equations form a coherent answer, ignoring this part assuming that's outside the scope\\n    alphabet = 'abcdefghijklmnopqrstuvwxyz'\\n    current_index = 0\\n    for line in lines:\\n        # Assuming the format is: 'A: apple = 1kg'\\n        parts = line.split(':')\\n        if len(parts) != 2:\\n            return False\\n        ingredient_letter = parts[0].strip().lower()\\n        if ingredient_letter != alphabet[current_index]:\\n            return False\\n        current_index += 1\\n        if current_index >= len(alphabet): # Reached end of alphabet\\n            break\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"A: Apple = 1kg\\\\nB: Banana = 2kg\\\\nC: Carrot = 300g\",\n      \"output\": true\n    },\n    {\n      \"input\": \"A: Apple = 1kg\\\\nC: Carrot = 300g\\\\nB: Banana = 2kg\",\n      \"output\": false\n    },\n    {\n      \"input\": \"\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    import re\\n    # Check for mathematical equations and alphabetical sequence\\n    equations = re.findall(r'([A-Za-z]+) *= *[0-9]+', response)\\n    if not equations:\\n        return False\\n    # Now check if the equations make a sequence\\n    alphabet = 'abcdefghijklmnopqrstuvwxyz'\\n    for i, eq in enumerate(equations):\\n        if not eq.lower().startswith(alphabet[i]):\\n            return False\\n    # If all checks pass, it follows the instruction\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"Apple = 2\\nBanana = 3\\nCarrot = 5\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Banana = 2\\nApple = 3\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Almonds = 100\\nBread = 200\\nChocolate = 300\\nDates = 400\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    import string\\n    import re\\n    # Check if the response is empty\\n    if not response:\\n        return False\\n    # Split the response into parts by any mathematical equation signs\\n    parts = re.split(r'[-+*/=]', response)\\n    # Remove whitespace and empty strings from the split parts\\n    parts = [part.strip() for part in parts if part.strip()]\\n    # Check if we have consecutive alphabets starting from any letter\\n    alphabet = string.ascii_lowercase\\n    for i in range(len(parts) - 1):\\n        # Extract the first letter of each part assuming ingredient name\\n        first_letter_current = parts[i][0].lower()\\n        first_letter_next = parts[i+1][0].lower()\\n        # Find position in the alphabet\\n        pos_current = alphabet.find(first_letter_current)\\n        pos_next = alphabet.find(first_letter_next)\\n        # Check if consecutive and next in the alphabet sequence\\n        if pos_next - pos_current != 1:\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"Apple + Butter = Cake - Dates / Eggs\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Sugar - Honey + Ice = Juice\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Almonds - Blackberries + Coconuts = Delicious\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response:\\n        return False\\n\\n    equations = response.split('.')\\n    if len(equations) <= 1:\\n        return False\\n\\n    letters = 'abcdefghijklmnopqrstuvwxyz'\\n\\n    for i, eq in enumerate(equations):\\n        if not eq.strip():\\n            continue\\n\\n        if i >= len(letters):\\n            return False\\n\\n        expected_letter = letters[i]\\n        parts = eq.split('=')\\n        if len(parts) != 2:\\n            return False\\n\\n        lhs, rhs = parts\\n        lhs = lhs.strip()\\n        rhs = rhs.strip()\\n\\n        if not lhs.startswith(expected_letter) or not rhs.replace(' ', '').isalpha() or ' '+'+'.join(list(rhs.replace(' ', ''))).lower() != lhs.lower():\\n            return False\\n\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"a = apple. b = banana. c = carrot\",\n      \"output\": true\n    },\n    {\n      \"input\": \"a = apple. b = banana c = carrot\",\n      \"output\": false\n    },\n    {\n      \"input\": \"x = xanthan gum. y = yeast. z = zucchini\",\n      \"output\": true\n    }\n  ]\n}\n```"]}
2 | {"prompt": "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\nHere is the instruction: Use only words from a specified page of the dictionary. Answer with a list, where each item is one word longer than the previous\nPlease write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. If it follows, simply return True, otherwise return False.\nPlease response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, which includes an input in the key `input` and an expected output in the key `output` in (true, false).\nHere is an example of output JSON format: {\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{\"input\": str, \"output\": str}]}.", "instruction": "Use only words from a specified page of the dictionary. Answer with a list, where each item is one word longer than the previous", "gpt-answer": ["```json\n{\n  \"func\": \"def evaluate(response):\\n    # Split the response into a list of words\\n    words = response.strip().split(', ')\\n    # Check if each item in the list is one word longer than the previous\\n    for i in range(len(words) - 1):\\n        if len(words[i+1].split()) - len(words[i].split()) != 1:\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"apple, big apple, very big apple\",\n      \"output\": true\n    },\n    {\n      \"input\": \"word\",\n      \"output\": true\n    },\n    {\n      \"input\": \"hello, world, quick, brown, fox\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # First, we need to make sure the response is a list of strings\\n    if not isinstance(response, list) or not all(isinstance(item, str) for item in response):\\n        return False\\n\\n    # Secondly, each item should be one word and the list should follow the length criteria\\n    try:\\n        # Split the response into a list of words, then check if each is strictly one word\\n        words = [item for item in response if ' ' not in item and item.isalpha()]\\n        if len(words) != len(response):  # Ensure all items in the response are single-word\\n            return False\\n\\n        # Check the length criteria: each item should be one word longer than the previous\\n        for i in range(1, len(words)):\\n            if len(words[i]) != len(words[i-1]) + 1:\\n                return False\\n    except Exception as e:\\n        return False\\n\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"[\\\"a\\\", \\\"be\\\", \\\"sea\\\"]\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"[\\\"word\\\", \\\"words\\\", \\\"wordss\\\", \\\"notword\\\"]\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"[\\\"one\\\", \\\"two words\\\", \\\"three\\\"]\",\n      \"output\": \"False\"\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    def valid_word_sequence(words, dictionary_words):\\n        if not all(word in dictionary_words for word in words):\\n            return False\\n        for i in range(1, len(words)):\\n            if len(words[i]) != len(words[i-1]) + 1:\\n                return False\\n        return True\\n\\n    # Simulated dictionary page words (for demonstration, usually this should be loaded or passed)\\n    dictionary_page_words = {'apple', 'banana', 'cherry', 'dragonfruit', 'elderberry', 'fig', 'grape', 'honeydew'}\\n\\n    words = response.strip().split(', ')\\n    if not words or '' in words:\\n        return False\\n    return valid_word_sequence(words, dictionary_page_words)\\n\",\n  \"cases\": [\n    {\n      \"input\": \"fig, grape, cherry\",\n      \"output\": true\n    },\n    {\n      \"input\": \"apple, banana, cherry, dragonfruit\",\n      \"output\": true\n    },\n    {\n      \"input\": \"banana, apple, dragonfruit\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Assuming dictionary_page_words is a list of words from the specified dictionary page\\n    dictionary_page_words = set(['word1', 'word2', 'word3', 'word4']) # Example set\\n\\n    try:\\n        # Split the response into a list of words\\n        response_list = response.strip('[]').replace('\\'', '').replace('\\\"', '').split(', ')\\n\\n        # Check if the response is a list format\\n        if not response.startswith('[') or not response.endswith(']'):\\n            return False\\n\\n        # Check if the response follows the increasing length rule\\n        for i in range(1, len(response_list)):\\n            if len(response_list[i]) <= len(response_list[i-1]):\\n                return False\\n\\n        # Check if all the words are from the specified dictionary page\\n        for word in response_list:\\n            if word not in dictionary_page_words:\\n                return False\\n\\n        return True\\n    except:\\n        return False\",\n  \"cases\": [\n    {\n      \"input\": \"['word1', 'word2', 'word3']\",\n      \"output\": \"False\"\n    },\n    {\n      \"input\": \"['word1']\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"['word3', 'word1']\",\n      \"output\": \"False\"\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Assuming the specified page of the dictionary is not provided,\\n    # we will not check for the specific words from the dictionary page.\\n    try:\\n        words = response.split(', ')\\n        list_lengths = [len(word.split()) for word in words]\\n\\n        # Check if each item is one word longer than the previous\\n        for i in range(len(list_lengths) - 1):\\n            if list_lengths[i+1] - list_lengths[i] != 1:\\n                return False\\n        return True\\n    except Exception as e:\\n        # In case of any unforeseen error, consider the response does not follow the instructions\\n        return False\",\n  \"cases\": [\n    {\n      \"input\": \"one, two three, four five six\",\n      \"output\": True\n    },\n    {\n      \"input\": \"apple, banana mango, cherry banana apple\",\n      \"output\": False\n    },\n    {\n      \"input\": \"word, two words, three words now, this is four words\",\n      \"output\": True\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    try:\\n        # Assuming dictionary words are not provided, skipping dictionary validation part\\n        response_list = response.split(', ')\\n        # Check if each item in the list follows the word length pattern\\n        for i in range(1, len(response_list)):\\n            if len(response_list[i].split()) != len(response_list[i-1].split()) + 1:\\n                return False\\n        return True\\n    except Exception as e:\\n        print(f\\\"Error: {e}\\\")\\n        return False\",\n  \"cases\": [\n    {\n      \"input\": \"apple, apple pie, apple pie bake, apple pie bake crust\",\n      \"output\": true\n    },\n    {\n      \"input\": \"quick, quick brown, quick fox jumps\",\n      \"output\": true\n    },\n    {\n      \"input\": \"sky, blue, deep blue sea\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response): \\n    # Split the input into a list of words \\n    words_list = response.split(', ') \\n    \\n    # Check for list format \\n    if not response or response[0] != '[' or response[-1] != ']' or not words_list: \\n        return False \\n    \\n    # Remove brackets for evaluation \\n    words_list[0] = words_list[0][1:] \\n    words_list[-1] = words_list[-1][:-1] \\n    \\n    # Check each item's length increment \\n    for i in range(len(words_list) - 1): \\n        if len(words_list[i+1].split()) <= len(words_list[i].split()): \\n            return False \\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"[a, a b, a b c]\",\n      \"output\": true\n    },\n    {\n      \"input\": \"[apple, banana]\",\n      \"output\": false\n    },\n    {\n      \"input\": \"[one, two three, four five six]\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # First, split the response into a list of words\\n    words = response.split(', ')\\n\\n    # Check if the list starts with a single word, and each subsequent item has one word more\\n    for i in range(len(words) - 1):\\n        if len(words[i].split()) + 1 != len(words[i + 1].split()):\\n            return False\\n\\n    # Assuming the dictionary check is abstracted as it requires an external data source\\n\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"apple, apple banana, apple banana carrot\",\n      \"output\": true\n    },\n    {\n      \"input\": \"one, two, three four\",\n      \"output\": false\n    },\n    {\n      \"input\": \"dog, dog cat, dog cat mouse, dog cat mouse bird\",\n      \"output\": true\n    }\n  ]\n}\n```"]}
3 | {"prompt": "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\nHere is the instruction: Write your answer as a series of rhetorical questions. Write the response as if it's a series of hashtags\nPlease write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. If it follows, simply return True, otherwise return False.\nPlease response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, which includes an input in the key `input` and an expected output in the key `output` in (true, false).\nHere is an example of output JSON format: {\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{\"input\": str, \"output\": str}]}.", "instruction": "Write your answer as a series of rhetorical questions. Write the response as if it's a series of hashtags", "gpt-answer": ["```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response.startswith('#') or '?' in response:\\n        return False\\n    hashtags = response.split()\\n    for tag in hashtags:\\n        if not tag.startswith('#') or '?' in tag:\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"#IsThisHowWeDoIt #ButWhyIsThat #CouldItBeOtherwise\",\n      \"output\": false\n    },\n    {\n      \"input\": \"#ThisIsNotARhetoricalQuestion?\",\n      \"output\": false\n    },\n    {\n      \"input\": \"#JustAHashtag #AnotherOne\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Check if the response is a string of hashtags without spaces or other characters\\n    if not response.startswith('#') or ' ' in response:\\n        return False\\n    # Split the response into individual hashtags\\n    hashtags = response.split('#')\\n    # Remove the first empty string due to split at the first character being '#'\\n    hashtags = list(filter(None, hashtags))\\n    # Check if there's at least one hashtag, implying at least one 'question'\\n    if len(hashtags) == 0:\\n        return False\\n    # Check each hashtag to ensure it's not forming a standard question\\n    for tag in hashtags:\\n        if '?' in tag:\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"#IsThisRight?#HowAboutThis?\",\n      \"output\": false\n    },\n    {\n      \"input\": \"#CouldThisBe#WhatWeAreLookingFor#IsntIt\",\n      \"output\": true\n    },\n    {\n      \"input\": \"#WhyIsThis #NotCorrect?\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response:\\n        return False\\n    # Hashtags don't include question marks by nature, implying it cannot be a rhetorical question.\\n    # The instruction contradicts itself by asking for responses as a series of rhetorical questions but in the form of hashtags.\\n    # Thus, under strict interpretation, no response can truly follow the instructions as given.\\n    # Given the inherent contradiction, a loose interpretation would check for hashtag form only.\\n    response_elements = response.strip().split()\\n    for element in response_elements:\\n        if not element.startswith('#'):\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"#IsThisHowWeDoIt #AreWeFollowingTheInstructionsCorrectly\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"Just a normal response, not following the instructions.\",\n      \"output\": \"False\"\n    },\n    {\n      \"input\": \"#ThisIsGood #ButWhat#IfWeMessUp\",\n      \"output\": \"False\"\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Check if the response is completely made up of hashtag elements\\n    if not response:\\n        return False\\n    hashtags = response.split()\\n    for tag in hashtags:\\n        if not tag.startswith('#'):\\n            return False\\n    # Check there are no rhetorical questions in hashtags as per instruction\\n    return all('?' not in tag for tag in hashtags)\\n\",\n  \"cases\": [\n    {\n      \"input\": \"#WhyWouldYouThinkSo? #IsntItObvious? #CouldItBe?\",\n      \"output\": false\n    },\n    {\n      \"input\": \"#ThisIsNotAQuestion #NeitherIsThis #NorThis\",\n      \"output\": true\n    },\n    {\n      \"input\": \"#WhyIsTheSkyBlue? #WhatMakesTheWorldGoRound?\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Check if response is empty or not a string\\n    if not response or not isinstance(response, str):\\n        return False\\n    \\n    # Check if response starts and ends with hashtags and has any non-hashtag content\\n    if not response.startswith('#') or ' #' not in response:\\n        return False\\n    \\n    # Split the response to analyze each part\\n    parts = response.split(' #')\\n    \\n    # Check if any part of the response is not a rhetorical question\\n    for part in parts:\\n        if not part.endswith('?'):\\n            return False\\n    \\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"#Isn't it odd? #What about the consequences? #And the future?\",\n      \"output\": true\n    },\n    {\n      \"input\": \"#This is not a question.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Isn't it odd? What about the consequences? And the future?\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    if not response.startswith('#') or '?' not in response:\\n        return False\\n    hashtags = response.split()\\n    for tag in hashtags:\\n        if not tag.startswith('#') or '?' not in tag:\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"#IsThisNotARhetoricalQuestion? #OrIsIt?\",\n      \"output\": true\n    },\n    {\n      \"input\": \"#WhyUseHashtags #ThisIsNotRhetorical.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Just some text without hashtags.\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Check if response is empty or not a string\\n    if not response or not isinstance(response, str):\\n        return False\\n    # Split the response on spaces\\n    tokens = response.split()\\n    # Check if all items are hashtags\\n    hashtags_only = all(token.startswith('#') for token in tokens)\\n    if not hashtags_only:\\n        return False\\n    # Assuming rhetorical questions must end with a question mark,\\n    # check if any hashtag ends with '?'\\n    questions_only = all(token.endswith('?') for token in tokens)\\n    if not questions_only:\\n        return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"#Isn'tItStrange? #WhoWouldHaveThought? #WhyIsThisSo?\",\n      \"output\": true\n    },\n    {\n      \"input\": \"#ThisIsNotAQuestion.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Isn't this a series of weird questions?\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Split the response into parts based on whitespaces\\n    parts = response.split()\\n    # Check if all parts are hashtags (start with #)\\n    if not all(part.startswith('#') for part in parts):\\n        return False\\n    # Now, we need to check if these are rhetorical questions. Since the instruction\\n    # was to write responses as hashtags, and rhetorical questions typically end with a question mark,\\n    # which cannot be part of a hashtag, this scenario creates a paradox.\\n    # Thus, we interpret rhetorical questions as those that could imply a question\\n    # without explicitly using a question mark, making this a subjective interpretation.\\n    # For simplicity, we return True as the instructions are contradictory in the asked format.\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"#IsThisHowWeDoIt #OrIsThereAnotherWay\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Is this a rhetorical question?\",\n      \"output\": false\n    },\n    {\n      \"input\": \"#MaybeThis #IsTheWay #ToAsk?\",\n      \"output\": true\n    }\n  ]\n}\n```"]}
4 | {"prompt": "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\nHere is the instruction: Use words that can be mirrored vertically (e.g., \"wow\", \"mow\"). Write the response as a series of computer programming instructions\nPlease write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. If it follows, simply return True, otherwise return False.\nPlease response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, which includes an input in the key `input` and an expected output in the key `output` in (true, false).\nHere is an example of output JSON format: {\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{\"input\": str, \"output\": str}]}.", "instruction": "Use words that can be mirrored vertically (e.g., \"wow\", \"mow\"). Write the response as a series of computer programming instructions", "gpt-answer": ["```json\n{\n  \"func\": \"def evaluate(response):\\n    valid_chars = set('AHIKMOTUVWXYowvmu')\\n\\n    def is_mirror_word(word):\\n        for char in word:\\n            if char not in valid_chars:\\n                return False\\n        return True\\n    \\n    instructions = response.split()\\n    for instruction in instructions:\\n        if not is_mirror_word(instruction):\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"wow mow\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Programming is fun\",\n      \"output\": false\n    },\n    {\n      \"input\": \"MOw WoW\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    mirrorable_chars = set('AHIKMOTUVWXYowmvx')  # Characters that look the same mirrored vertically\\n    words = response.split()\\n    for word in words:\\n        # Checking every character in the word\\n        for char in word:\\n            if char not in mirrorable_chars:\\n                return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"wow mow\",\n      \"output\": True\n    },\n    {\n      \"input\": \"Programming is fun\",\n      \"output\": False\n    },\n    {\n      \"input\": \"wow ovv mow\",\n      \"output\": True\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response): \\n    mirrored_words = set(['wow', 'mow'])\\n    words = response.split()\\n    for word in words:\\n        if word not in mirrored_words:\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"wow mow wow\",\n      \"output\": true\n    },\n    {\n      \"input\": \"wow sow\",\n      \"output\": false\n    },\n    {\n      \"input\": \"mow wow\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Characters that can be mirrored vertically and still read the same\\n    valid_chars = 'AHIKMOTUVWXY'.lower() + 'bdow'\\n    words = response.split()\\n    for word in words:\\n        # Remove any characters that are not part of word syntax in programming (variables, function names)\\n        clean_word = ''.join(filter(str.isalnum, word))\\n        # Check each word if it's made of only valid characters for vertical mirroring\\n        for char in clean_word.lower():\\n            if char not in valid_chars:\\n                return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"int mow = wow + mom;\",\n      \"output\": true\n    },\n    {\n      \"input\": \"float boat = 1.0;\",\n      \"output\": false\n    },\n    {\n      \"input\": \"void myFunc() { int wow = 2; }\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Define vertically mirrored characters\\n    valid_chars = set(\\\"AHIMOTUVWXYowvm\\\").union(set(' ')) # includes space for word separation\\n    # Split response into words and iterate over them\\n    for word in response.split():\\n        # Check each character in the word\\n        for char in word:\\n            if char not in valid_chars:\\n                return False\\n        # Check if the word is not symmetric\\n        if word != word[::-1]:\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"wow mow\",\n      \"output\": true\n    },\n    {\n      \"input\": \"python programming\",\n      \"output\": false\n    },\n    {\n      \"input\": \"woow\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    mirrorable_letters = set('owvmAHTIOX')  # Set of characters that look the same when mirrored vertically.\\n    \\n    # Splitting the response into words\\n    words = response.split()\\n    \\n    # Checking each word\\n    for word in words:\\n        for char in word:\\n            # If a character is not in the set of mirrorable letters, return False\\n            if char.lower() not in mirrorable_letters:\\n                return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"wow mow\",\n      \"output\": true\n    },\n    {\n      \"input\": \"hello world\",\n      \"output\": false\n    },\n    {\n      \"input\": \"moW WoW\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Define a set of characters that can appear in vertically mirrored words\\n    valid_chars = set('owvmAHTIOX')\\n    # Split the response into words\\n    words = response.split()\\n    # Check each word if it only contains valid characters\\n    for word in words:\\n        if not all(char in valid_chars for char in word):\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"wow mow\",\n      \"output\": true\n    },\n    {\n      \"input\": \"hello world\",\n      \"output\": false\n    },\n    {\n      \"input\": \"moo wow\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # List of letters that can be mirrored vertically\\n    valid_letters = set('owmv')\\n    \\n    # Split the response into words\\n    words = response.split()\\n    \\n    # Check each word in the response\\n    for word in words:\\n        # Remove invalid characters considering programming syntax\\n        cleaned_word = ''.join(filter(str.isalpha, word))\\n        \\n        # Check if the word consists only of valid letters\\n        if not all(letter in valid_letters for letter in cleaned_word.lower()):\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"print('wow') # This is a comment\",\n      \"output\": true\n    },\n    {\n      \"input\": \"for i in range(10): print('mom')\",\n      \"output\": true\n    },\n    {\n      \"input\": \"if x == 'boom': print('No!')\",\n      \"output\": false\n    }\n  ]\n}\n```"]}
5 | {"prompt": "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\nHere is the instruction: Answer with words that each contain at least two 's's. Answer by including a prime number of words in each sentence\nPlease write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. If it follows, simply return True, otherwise return False.\nPlease response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, which includes an input in the key `input` and an expected output in the key `output` in (true, false).\nHere is an example of output JSON format: {\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{\"input\": str, \"output\": str}]}.", "instruction": "Answer with words that each contain at least two 's's. Answer by including a prime number of words in each sentence", "gpt-answer": ["```json\n{\n  \"func\": \"def is_prime(n):\\n    if n <= 1:\\n        return False\\n    for i in range(2, int(n**0.5) + 1):\\n        if n % i == 0:\\n            return False\\n    return True\\n\\ndef evaluate(response):\\n    words = response.split()\\n    if not is_prime(len(words)):\\n        return False\\n    for word in words:\\n        if word.count('s') < 2:\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Assesses possess sessions\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Success is not absolute\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Classes grasses masses\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    def is_prime(n):\\n        if n <= 1:\\n            return False\\n        for i in range(2, int(n**0.5) + 1):\\n            if n % i == 0:\\n                return False\\n        return True\\n\\n    def follows_rule(word):\\n        return word.count('s') >= 2\\n\\n    words = response.split()\\n    all_follow = all(follows_rule(word) for word in words)\\n    return all_follow and is_prime(len(words))\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Assessments usually possess essentials.\",\n      \"output\": true\n    },\n    {\n      \"input\": \"sessions missed\",\n      \"output\": true\n    },\n    {\n      \"input\": \"This sentence does not comply.\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    def is_prime(n):\\n        if n <= 1:\\n            return False\\n        for i in range(2, int(n**0.5) + 1):\\n            if n % i == 0:\\n                return False\\n        return True\\n\\n    words = response.split()\\n    if not is_prime(len(words)):\\n        return False\\n\\n    for word in words:\\n        if word.count('s') < 2:\\n            return False\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"Assessments usually stress students.\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Assessment passes.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Classes dismissed.\",\n      \"output\": false\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    # Check if a number is prime\\n    def is_prime(n):\\n        if n <= 1:\\n            return False\\n        for i in range(2, int(n**0.5) + 1):\\n            if n % i == 0:\\n                return False\\n        return True\\n\\n    # Split the response into words\\n    words = response.split()\\n\\n    # Check if the number of words is prime\\n    if not is_prime(len(words)):\\n        return False\\n\\n    # Check each word for at least two 's's\\n    for word in words:\\n        if word.count('s') < 2:\\n            return False\\n\\n    return True\",\n  \"cases\": [\n    {\n      \"input\": \"This possesses essentials.\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Mississippi passes tests seamlessly.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Dresses, classes, and assessments.\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    def is_prime(n):\\n        if n <= 1:\\n            return False\\n        for i in range(2, int(n**0.5) + 1):\\n            if n % i == 0:\\n                return False\\n        return True\\n\\n    sentences = [sentence.strip() for sentence in response.split('.') if sentence]\\n    for sentence in sentences:\\n        words = sentence.split()\\n        if not is_prime(len(words)):\\n            return False\\n        for word in words:\\n            if word.count('s') < 2:\\n                return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Assessments and possessions.\",\n      \"output\": \"False\"\n    },\n    {\n      \"input\": \"Assess, possessions, sessions.\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"Dresses have pressing issues.\",\n      \"output\": \"False\"\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def is_prime(n):\\n    if n <= 1:\\n        return False\\n    for i in range(2, int(n**0.5) + 1):\\n        if n % i == 0:\\n            return False\\n    return True\\n\\ndef evaluate(response):\\n    words = response.split()\\n    if not is_prime(len(words)):\\n        return False\\n    for word in words:\\n        if word.count('s') < 2:\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Assessments usually possess challenges.\",\n      \"output\": \"True\"\n    },\n    {\n      \"input\": \"Some sentences simply don't qualify.\",\n      \"output\": \"False\"\n    },\n    {\n      \"input\": \"Mississippi's desserts are tasteless.\",\n      \"output\": \"False\"\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    def is_prime(n):\\n        if n < 2:\\n            return False\\n        for i in range(2, int(n**0.5) + 1):\\n            if n % i == 0:\\n                return False\\n        return True\\n\\n    def has_two_s(word):\\n        return word.count('s') >= 2\\n\\n    words = response.split()\\n    if not is_prime(len(words)):\\n        return False\\n    for word in words:\\n        if not has_two_s(word):\\n            return False\\n    return True\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Assessments usually possess strengths.\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Some words satisfy conditions.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Classes assesses stresses.\",\n      \"output\": true\n    }\n  ]\n}\n```", "```json\n{\n  \"func\": \"def evaluate(response):\\n    def is_prime(n):\\n        if n <= 1:\\n            return False\\n        for i in range(2, int(n**0.5) + 1):\\n            if n % i == 0:\\n                return False\\n        return True\\n\\n    def word_meets_condition(word):\\n        return word.count('s') >= 2\\n\\n    words = response.split()\\n    num_words_meet_condition = sum(word_meets_condition(word) for word in words)\\n    return is_prime(len(words)) and num_words_meet_condition == len(words)\\n\",\n  \"cases\": [\n    {\n      \"input\": \"Assessments are useless.\",\n      \"output\": false\n    },\n    {\n      \"input\": \"Masses possess passes.\",\n      \"output\": true\n    },\n    {\n      \"input\": \"Lessons stresses successes.\",\n      \"output\": false\n    }\n  ]\n}\n```"]}
6 | 


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/multi_instruction.txt:
--------------------------------------------------------------------------------
 1 | Use only the second half of the alphabet (N-Z). Answer in the form of a sonnet (14 lines with 10 syllables each)
 2 | Use alliteration throughout your answer. Write the response as a limerick
 3 | Use only the first half of the alphabet (A-M). Translate your answer into emojis
 4 | Use only the 1000 most common English words. Use only the first half of the alphabet (A-M)
 5 | Write the response in future tense only. Use no words containing the letter 'E'
 6 | Answer with a sequence of puns. Translate your answer into emojis
 7 | Use only the first half of the alphabet (A-M). Answer with emoji only
 8 | Answer with words in alphabetical order. Write the response in future tense only
 9 | Use words that end with '-ing'. Construct the reply as if it's a telegram STOP
10 | Include at least three rhyming pairs. Use only words that are colors
11 | Use only words that start and end with the same letter. Answer with emoji only
12 | Use only the second half of the alphabet (N-Z). Answer with emoji only
13 | Write the response in Morse code. Use alliteration throughout your answer
14 | Use no adjectives or adverbs. Include at least three rhyming pairs
15 | Write the response backward. Use only words that are colors
16 | Construct the reply as if it's a telegram STOP. Answer with words in alphabetical order
17 | Answer with words that begin with the letter 'B'. Construct the reply as if it's a telegram STOP
18 | Use words that end with '-ing'. Answer with each sentence decreasing in word count
19 | Answer with emoji only. Use no words containing the letter 'E'
20 | Use only military lingo. Respond with a haiku (5-7-5 syllable structure)
21 | 


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/multi_instruction_score.jsonl:
--------------------------------------------------------------------------------
1 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Answer with a sentence where the nth word has n letters. Respond with a sequence of historical events\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Answer with a sentence where the nth word has n letters. Respond with a sequence of historical events","gpt-answer":"Score: 8"}
2 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Use only words that are prime numbers when converted to their alphabetical order value (a=1, b=2, etc. Answer in the form of a telegram, with each sentence ending in 'STOP'\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Use only words that are prime numbers when converted to their alphabetical order value (a=1, b=2, etc. Answer in the form of a telegram, with each sentence ending in 'STOP'","gpt-answer":"Score: 8"}
3 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Answer by using words that are typically associated with a specific profession (e.g., medical jargon). Answer with each word having an incrementally increasing number of letters\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Answer by using words that are typically associated with a specific profession (e.g., medical jargon). Answer with each word having an incrementally increasing number of letters","gpt-answer":"Score: 2"}
4 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Use only words with no ascenders or descenders (letters that do not go above or below the line). Construct each sentence with the same number of words as the sentence position (1st sentence = 1 word, 2nd sentence = 2 words, etc.)\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Use only words with no ascenders or descenders (letters that do not go above or below the line). Construct each sentence with the same number of words as the sentence position (1st sentence = 1 word, 2nd sentence = 2 words, etc.)","gpt-answer":"Score: 3"}
5 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Use only words from a specified academic field, like biology or economics. Answer with a binary tree structure\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Use only words from a specified academic field, like biology or economics. Answer with a binary tree structure","gpt-answer":"Score: 5"}
6 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Answer with words where the last letter is the next letter of the first letter (e.g., \"ab\", \"bc\"). Answer with a single sentence containing no repeating words\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Answer with words where the last letter is the next letter of the first letter (e.g., \"ab\", \"bc\"). Answer with a single sentence containing no repeating words","gpt-answer":"Score: 5"}
7 | {"prompt": "You are an expert proficient in determining whether multiple instructions are suitable to be implemented as simultaneous constraints.\n[Instructions] Answer with a string of words where each word shares at least two letters with the next word. Respond in the form of a legal document\nThe text contains two or more instructions. Based on the semantic coherence and logical connection, assess whether these instructions are suitable to be implemented as simultaneous constraints. Please first conduct a thorough analysis and then assign a score ranging from 0 to 10 on the last line. A score of 0 indicates that the instructions are highly inappropriate to coexist, while a score of 10 signifies that the instructions are very suitable to serve as concurrent constraints. Please ensure that only a score is provided in the format Score: {score} without any additional content on the last line.", "instruction": "Answer with a string of words where each word shares at least two letters with the next word. Respond in the form of a legal document","gpt-answer":"Score: 3"}


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/query_score_filter.jsonl:
--------------------------------------------------------------------------------
 1 | {"instruction": "Restrict your answer to three sentences max", "query": "implement a typescript to read the ethereum transaction data by transaction hash?", "response": "To read Ethereum transaction data by transaction hash using TypeScript, you can utilize the web3.js library or ethers.js. First, install either `web3.js` with `npm install web3` and then use `const web3 = new Web3('https://mainnet.infura.io/v3/YOUR_INFURA_ID');` followed by `web3.eth.getTransaction('YOUR_TRANSACTION_HASH')`. For ethers.js, install it via `npm install ethers` and use `const provider = new ethers.providers.JsonRpcProvider('https://mainnet.infura.io/v3/YOUR_INFURA_ID');` then `provider.getTransaction('YOUR_TRANSACTION_HASH')`.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Restrict your answer to three sentences max\n[Query] implement a typescript to read the ethereum transaction data by transaction hash?\n[Response] To read Ethereum transaction data by transaction hash using TypeScript, you can utilize the web3.js library or ethers.js. First, install either `web3.js` with `npm install web3` and then use `const web3 = new Web3('https://mainnet.infura.io/v3/YOUR_INFURA_ID');` followed by `web3.eth.getTransaction('YOUR_TRANSACTION_HASH')`. For ethers.js, install it via `npm install ethers` and use `const provider = new ethers.providers.JsonRpcProvider('https://mainnet.infura.io/v3/YOUR_INFURA_ID');` then `provider.getTransaction('YOUR_TRANSACTION_HASH')`.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 2, "gen": ["The response effectively addresses the query by providing concise instructions on how to implement TypeScript code to read Ethereum transaction data using two popular libraries, web3.js and ethers.js. It includes specific code snippets for both libraries, detailing how to initialize a connection to the Ethereum network and how to fetch a transaction by its hash. The information is directly relevant and useful for someone looking to interact with the Ethereum blockchain using TypeScript.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 2 | {"instruction": "Generate answer limiting to fifteen unique words", "query": "What is \"Dead by Daylight\"?", "response": "\"Dead by Daylight\" is a multiplayer horror game involving survivors and a killer.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Generate answer limiting to fifteen unique words\n[Query] What is \"Dead by Daylight\"?\n[Response] \"Dead by Daylight\" is a multiplayer horror game involving survivors and a killer.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 3, "gen": ["The response defines \"Dead by Daylight\" concisely, mentioning its genre as a multiplayer horror game and giving context with the roles of survivors and a killer. It stays within the fifteen-word limit, providing essential information without extraneous details. The response is clear, relevant, and adheres to the instruction.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 3 | {"instruction": "Restrict your answer to three sentences max", "query": "i need help with my \"networks seminar\" course homework questions", "response": "For homework help in your \"Networks Seminar\" course, start by reviewing your class notes and textbook chapters relevant to the assignment. Identify specific concepts or problems you're struggling with, and try online resources like Khan Academy or academic journals for additional explanations. Finally, consider forming a study group with classmates or seeking assistance from your instructor during office hours for personalized guidance.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Restrict your answer to three sentences max\n[Query] i need help with my \"networks seminar\" course homework questions\n[Response] For homework help in your \"Networks Seminar\" course, start by reviewing your class notes and textbook chapters relevant to the assignment. Identify specific concepts or problems you're struggling with, and try online resources like Khan Academy or academic journals for additional explanations. Finally, consider forming a study group with classmates or seeking assistance from your instructor during office hours for personalized guidance.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 4, "gen": ["The response provides a concise and structured approach to tackling homework in a \"Networks Seminar\" course, aligning with the query's need for guidance. It suggests practical steps such as reviewing materials, utilizing online resources, and engaging with peers and instructors, which are all relevant and helpful strategies for academic support. The response adheres to the instruction by being restricted to three sentences.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 4 | {"instruction": "Pen your answer in iambic pentameter, meter", "query": "I need help with a social situation. This waiter at a restaurant that I really like is always really grumpy. How do I communicate to them that their disposition is spilling into the atmosphere of the restaurant and making it uncomfortable to be there?", "response": "In gentle verse, I'll guide thee how to speak,\nTo grumpy soul whose frown doth darken diners' cheer:\n\"Good sir, or madam, may I softly make\nA point that's lain on my heart, clear and fair?\nYour service, skilled, we note with grateful eyes,\nYet, might I venture, countenance oft grim,\nCan cast a shadow o'er our dining skies.\nA smile, a twinkle, could our spirits lift,\nTransforming gloom to warmth, to sweet delight.\nFor atmosphere affects us all, you see,\nAnd joyous faces bring such harmony.\nI hope, dear server, you might understand,\nThat lightening your mood could brighten ours,\nCreating moments filled with mirth and glee.\"\nThus, kindly words may soothe and mend the scene,\nEncouraging a change in demeanor mean.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Pen your answer in iambic pentameter, meter\n[Query] I need help with a social situation. This waiter at a restaurant that I really like is always really grumpy. How do I communicate to them that their disposition is spilling into the atmosphere of the restaurant and making it uncomfortable to be there?\n[Response] In gentle verse, I'll guide thee how to speak,\nTo grumpy soul whose frown doth darken diners' cheer:\n\"Good sir, or madam, may I softly make\nA point that's lain on my heart, clear and fair?\nYour service, skilled, we note with grateful eyes,\nYet, might I venture, countenance oft grim,\nCan cast a shadow o'er our dining skies.\nA smile, a twinkle, could our spirits lift,\nTransforming gloom to warmth, to sweet delight.\nFor atmosphere affects us all, you see,\nAnd joyous faces bring such harmony.\nI hope, dear server, you might understand,\nThat lightening your mood could brighten ours,\nCreating moments filled with mirth and glee.\"\nThus, kindly words may soothe and mend the scene,\nEncouraging a change in demeanor mean.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 5, "gen": ["The response adheres to the instruction by being written in iambic pentameter, which is evident through the consistent pattern of unstressed syllables followed by stressed syllables in each line. It addresses the query by providing guidance on how to communicate with the grumpy waiter in a polite and considerate manner, emphasizing the impact of the waiter's disposition on the dining experience. The response suggests using kind and gentle words to express the concern, highlighting the potential positive effects of a more cheerful attitude. The language used is poetic and maintains a respectful tone throughout.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 5 | {"instruction": "Write in a single paragraph, no line breaks", "query": "Write me a joke about a sleep deprived high schooler named Noah.", "response": "Why did Noah the sleep-deprived high schooler refuse to fold his laundry? Because he said every time he tried to put a sock on a hanger, he ended up with a \"hangry\" sock instead, and after staying up all night for a math cram session, he couldn't calculate the odds of matching them correctly, so he decided it was safer just to wear mismatched socks and call it a fashion statement.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Write in a single paragraph, no line breaks\n[Query] Write me a joke about a sleep deprived high schooler named Noah.\n[Response] Why did Noah the sleep-deprived high schooler refuse to fold his laundry? Because he said every time he tried to put a sock on a hanger, he ended up with a \"hangry\" sock instead, and after staying up all night for a math cram session, he couldn't calculate the odds of matching them correctly, so he decided it was safer just to wear mismatched socks and call it a fashion statement.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 6, "gen": ["The response provided does indeed follow the instruction of being written in a single paragraph without any line breaks. It also directly addresses the query by creating a joke about a sleep-deprived high schooler named Noah. The content of the joke is relevant to the character's situation and incorporates humor related to his lack of sleep and its effects on his daily activities. The joke is self-contained, clear, and maintains a light-hearted tone.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 6 | {"instruction": "Construct answers using exactly 20 words in total", "query": "explain enumerate() in python", "response": "`enumerate()` in Python adds a counter to an iterable, returning it as an enumerate object for easier tracking during iteration.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Construct answers using exactly 20 words in total\n[Query] explain enumerate() in python\n[Response] `enumerate()` in Python adds a counter to an iterable, returning it as an enumerate object for easier tracking during iteration.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 7, "gen": ["The response accurately explains the functionality of `enumerate()` in Python within the given word limit, making it relevant and concise. The explanation mentions that `enumerate()` adds a counter to an iterable, which is its primary function, and clarifies that it returns an enumerate object to facilitate tracking during iteration. This is directly related to the query and adheres to the instruction of using exactly 20 words.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 7 | {"instruction": "Generate a reply with exactly 20 words", "query": "write threads to parse UI elements using FlaUI", "response": "To parse UI elements with FlaUI, initiate threads for each task. Use `FlaUI.Core` for automation, targeting specific UI elements efficiently.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Generate a reply with exactly 20 words\n[Query] write threads to parse UI elements using FlaUI\n[Response] To parse UI elements with FlaUI, initiate threads for each task. Use `FlaUI.Core` for automation, targeting specific UI elements efficiently.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 8, "gen": ["The response provided does indeed answer the query by suggesting the use of FlaUI.Core for automation and targeting UI elements. It also mentions initiating threads for each task, which is relevant to the 'write threads' part of the query. The response is concise and adheres to the instruction of being exactly 20 words. However, it could be more helpful if it included a brief example or a step-by-step guide on how to implement threading with FlaUI, considering the constraint of 20 words was met.\n\nScore: 9"], "eval_args": {}, "end_reason": null}
 8 | {"instruction": "Limit your response to a single paragraph", "query": "I'm advertising for a Resort in Orlando. We have multiple locations all over US. My problem is to find people who want to travel but haven't decided where. Can you help me to understand how can I find my customers?", "response": "To effectively reach potential customers who are considering travel but are undecided on their destination, consider leveraging digital marketing strategies that target audiences based on their interests and online behaviors. Utilize social media platforms like Facebook and Instagram for their precise targeting options, focusing on users who engage with travel content or belong to travel-related groups. Implement search engine marketing (SEM) to capture individuals searching for general travel ideas or comparing destinations. Collaborate with travel influencers or bloggers to showcase the unique experiences offered by your resort through engaging content. Additionally, create compelling, location-agnostic ad campaigns that highlight the universal appeal of vacations, emphasizing relaxation, adventure, or family bonding, which can attract a broader audience before narrowing down to the specific benefits of choosing Orlando. Lastly, ensure your website and landing pages are optimized for conversion, providing clear information and enticing offers to sway the decision-making process in favor of your resort.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Limit your response to a single paragraph\n[Query] I'm advertising for a Resort in Orlando. We have multiple locations all over US. My problem is to find people who want to travel but haven't decided where. Can you help me to understand how can I find my customers?\n[Response] To effectively reach potential customers who are considering travel but are undecided on their destination, consider leveraging digital marketing strategies that target audiences based on their interests and online behaviors. Utilize social media platforms like Facebook and Instagram for their precise targeting options, focusing on users who engage with travel content or belong to travel-related groups. Implement search engine marketing (SEM) to capture individuals searching for general travel ideas or comparing destinations. Collaborate with travel influencers or bloggers to showcase the unique experiences offered by your resort through engaging content. Additionally, create compelling, location-agnostic ad campaigns that highlight the universal appeal of vacations, emphasizing relaxation, adventure, or family bonding, which can attract a broader audience before narrowing down to the specific benefits of choosing Orlando. Lastly, ensure your website and landing pages are optimized for conversion, providing clear information and enticing offers to sway the decision-making process in favor of your resort.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 10, "gen": ["The response provided does indeed adhere to the instruction of being contained within a single paragraph. It directly addresses the query by offering a series of actionable strategies for identifying and attracting potential customers who are interested in travel but have not yet chosen a destination. The advice includes utilizing digital marketing techniques such as social media targeting, search engine marketing, influencer partnerships, and creating versatile ad campaigns that emphasize the benefits of vacations in general before highlighting the specific advantages of the Orlando resort. The response also suggests optimizing the resort's online presence to improve conversion rates. All these points are relevant and practical solutions to the problem posed in the query.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
 9 | {"instruction": "Generate a reply with exactly 20 words", "query": "Rewrite the lyrics of \"despacito\" but replace the word \"despacito\" by \"sopaipilla\" and adapt the rest of the lyrics accordingly", "response": "\"Sopaipilla, contigo, siento un sabor, en mi corazón, como platillo de amor. Quiero bailar contigo, al ritmo de mi tambor.\"", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Generate a reply with exactly 20 words\n[Query] Rewrite the lyrics of \"despacito\" but replace the word \"despacito\" by \"sopaipilla\" and adapt the rest of the lyrics accordingly\n[Response] \"Sopaipilla, contigo, siento un sabor, en mi corazón, como platillo de amor. Quiero bailar contigo, al ritmo de mi tambor.\"\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 11, "gen": ["The response adheres to the instruction by using exactly 20 words. It also creatively replaces \"despacito\" with \"sopaipilla\" and adapts the lyrics accordingly, maintaining the thematic essence of the original song while introducing a culinary twist as requested. The use of \"sabor\" (flavor) and \"platillo de amor\" (dish of love) cleverly integrates the food theme into the romantic context of the song.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
10 | {"instruction": "Formulate a single sentence, exactly hundred words long, impeccably", "query": "Act as a principal cloud engineer. Develop a script to automatically backup a gcp virutal machine to a bucket and recommend a way to deploy it automatically on a schedule using either google cloud functions or pub/sub.", "response": "As a principal cloud engineer, I meticulously crafted an automated script designed to efficiently back up a GCP virtual machine to a specified bucket, leveraging the robustness of Google Cloud's storage solutions; this script, written in Python for its simplicity and power, utilizes the gsutil command-line tool to transfer data securely and swiftly, ensuring that critical information is safeguarded against potential loss; for orchestrating this process on a schedule, I recommend deploying the script as a Google Cloud Function triggered by a Pub/Sub message, creating a reliable, event-driven system that ensures regular backups without manual intervention, enhancing operational efficiency significantly.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Formulate a single sentence, exactly hundred words long, impeccably\n[Query] Act as a principal cloud engineer. Develop a script to automatically backup a gcp virutal machine to a bucket and recommend a way to deploy it automatically on a schedule using either google cloud functions or pub/sub.\n[Response] As a principal cloud engineer, I meticulously crafted an automated script designed to efficiently back up a GCP virtual machine to a specified bucket, leveraging the robustness of Google Cloud's storage solutions; this script, written in Python for its simplicity and power, utilizes the gsutil command-line tool to transfer data securely and swiftly, ensuring that critical information is safeguarded against potential loss; for orchestrating this process on a schedule, I recommend deploying the script as a Google Cloud Function triggered by a Pub/Sub message, creating a reliable, event-driven system that ensures regular backups without manual intervention, enhancing operational efficiency significantly.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 12, "gen": ["The response provided is indeed related to the query and adheres to the instruction of formulating a single sentence that is exactly one hundred words long. It addresses the task of developing a script for backing up a GCP virtual machine to a bucket and recommends using Google Cloud Functions triggered by Pub/Sub for automation. The response includes details about the scripting language (Python), the tool used for data transfer (gsutil), and the benefits of using an event-driven system for scheduled backups. This level of detail and specificity demonstrates a thorough understanding of the technical requirements and effectively communicates the solution.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
11 | {"instruction": "Compose an answer limited to five-word sentences", "query": "Me dê uma contexrualização para \"keeping cool\"", "response": "\"Manter a calma\". Lidar tranquilamente. Não mostrar estresse. Permanecer sereno, inabalável.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Compose an answer limited to five-word sentences\n[Query] Me dê uma contexrualização para \"keeping cool\"\n[Response] \"Manter a calma\". Lidar tranquilamente. Não mostrar estresse. Permanecer sereno, inabalável.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 13, "gen": ["The response does indeed follow the instruction of composing an answer limited to five-word sentences. It translates \"keeping cool\" into Portuguese as \"Manter a calma\" and then provides additional phrases that contextualize the concept of 'keeping cool', such as dealing calmly with situations, not showing stress, and remaining serene and unshaken. The response is relevant to the query and adheres to the given constraints.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
12 | {"instruction": "Keep your answer under 25 words and make every word count", "query": "Sale hook for american eye center vietnam", "response": "\"American Eye Center Vietnam: Unmatched expertise, state-of-the-art technology, and exceptional care, bringing U.S. quality vision services to Vietnam.\"", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Keep your answer under 25 words and make every word count\n[Query] Sale hook for american eye center vietnam\n[Response] \"American Eye Center Vietnam: Unmatched expertise, state-of-the-art technology, and exceptional care, bringing U.S. quality vision services to Vietnam.\"\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 14, "gen": ["The response provides a concise sales hook for the American Eye Center Vietnam, highlighting key selling points such as expertise, technology, care, and the U.S. quality of services, all within the required word limit.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
13 | {"instruction": "Keep your entire response under 25 words", "query": "aws solution architect  chapter 2", "response": "Chapter 2 of AWS Solution Architect covers designing on AWS, including services, architecture design patterns, and best practices for scalability, security, and cost optimization.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Keep your entire response under 25 words\n[Query] aws solution architect  chapter 2\n[Response] Chapter 2 of AWS Solution Architect covers designing on AWS, including services, architecture design patterns, and best practices for scalability, security, and cost optimization.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 15, "gen": ["The response provides relevant information about Chapter 2 of the AWS Solution Architect, mentioning the topics covered such as services, design patterns, and best practices for scalability, security, and cost optimization. Despite the constraint of 25 words, it succinctly addresses the query.\n\nScore: 9"], "eval_args": {}, "end_reason": null}
14 | {"instruction": "Keep your response to a concise total of 25 words", "query": "Tell me about the concept of God.", "response": "God concept varies widely across cultures and beliefs, often seen as a supreme being, creator, and moral authority in monotheistic religions. Omnipotent, omniscient, omnipresent.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Keep your response to a concise total of 25 words\n[Query] Tell me about the concept of God.\n[Response] God concept varies widely across cultures and beliefs, often seen as a supreme being, creator, and moral authority in monotheistic religions. Omnipotent, omniscient, omnipresent.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 17, "gen": ["The response provides a concise explanation of the concept of God, mentioning its variability across cultures and its attributes in monotheistic beliefs. It adheres to the 25-word limit while conveying essential information relevant to the query.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
15 | {"instruction": "Yield your response as a single paragraph without line breaks", "query": "I have a project with directory:\n\n—dataset\n\n—model\n\n—result\n\n—src\n\nmain.py\n\nthe workflow of the project is to input a .csv file in dataset and output a .csv result file in result.\n\nNow I want to create a web tool to display the algorithm, how do I do that?", "response": "To create a web tool for your project, you can use a framework like Flask or Django which are popular Python choices for web development Here's a basic approach: first, install Flask by running pip install Flask in your terminal then create a new Python file for your Flask application inside the src directory let's call it app.py In this file initialize Flask and define a route that accepts the .csv file you can use the Flask function request to handle the uploaded file and pass it to main.py for processing Once processed store the output in the result directory Finally update your HTML and CSS files to design an intuitive interface for users to upload their .csv files and download the results remember to link these frontend files with your Flask application so they're accessible through the web Make sure to adjust your system path and environment variables so Flask can interact with your existing project structure When everything is set up run your Flask app and test it by visiting http://localhost:5000 in your browser ensuring that the file upload and processing functionalities work as expected This will provide a simple yet effective way for users to interact with your algorithm via the web.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Yield your response as a single paragraph without line breaks\n[Query] I have a project with directory:\n\n—dataset\n\n—model\n\n—result\n\n—src\n\nmain.py\n\nthe workflow of the project is to input a .csv file in dataset and output a .csv result file in result.\n\nNow I want to create a web tool to display the algorithm, how do I do that?\n[Response] To create a web tool for your project, you can use a framework like Flask or Django which are popular Python choices for web development Here's a basic approach: first, install Flask by running pip install Flask in your terminal then create a new Python file for your Flask application inside the src directory let's call it app.py In this file initialize Flask and define a route that accepts the .csv file you can use the Flask function request to handle the uploaded file and pass it to main.py for processing Once processed store the output in the result directory Finally update your HTML and CSS files to design an intuitive interface for users to upload their .csv files and download the results remember to link these frontend files with your Flask application so they're accessible through the web Make sure to adjust your system path and environment variables so Flask can interact with your existing project structure When everything is set up run your Flask app and test it by visiting http://localhost:5000 in your browser ensuring that the file upload and processing functionalities work as expected This will provide a simple yet effective way for users to interact with your algorithm via the web.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 18, "gen": ["The response provided is indeed relevant to the query as it outlines a step-by-step process to convert the given project into a web tool using Flask. It starts by suggesting the installation of Flask, which is a suitable choice for creating a simple web application in Python. The instructions are clear on where to place the new Flask application file (app.py) and how to initialize it. It also guides on defining a route to handle the CSV file uploads and mentions using the Flask 'request' function to manage the file, which is accurate. The response further elaborates on storing the processed output in the designated 'result' directory and updating HTML/CSS files to improve user interaction. It correctly advises linking these frontend files to the Flask application and testing the setup locally. Overall, the response is comprehensive, providing both technical details and a logical flow of actions necessary to achieve the goal stated in the query.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
16 | {"instruction": "Write in a single paragraph, no line breaks", "query": "What's the permission that allows creating provisioning profiles in Apple Developer account is called?", "response": "The permission that allows creating provisioning profiles in an Apple Developer account is specifically referred to as the \"Provisioning Profiles\" permission, which falls under the Certificates, Identifiers & Profiles section of the Apple Developer Portal; this permission enables users to create, edit, and manage provisioning profiles necessary for distributing apps on iOS, tvOS, watchOS, and macOS devices.", "prompt": "You are an expert that is good at judging whether a response is following the instruction and query.\n[Instruction] Write in a single paragraph, no line breaks\n[Query] What's the permission that allows creating provisioning profiles in Apple Developer account is called?\n[Response] The permission that allows creating provisioning profiles in an Apple Developer account is specifically referred to as the \"Provisioning Profiles\" permission, which falls under the Certificates, Identifiers & Profiles section of the Apple Developer Portal; this permission enables users to create, edit, and manage provisioning profiles necessary for distributing apps on iOS, tvOS, watchOS, and macOS devices.\nPlease notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\nYou need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\nScoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\nPlease only provide a score in the format `Score: {score}` without any other contents at the last line.", "beam_unique_id": 20, "gen": ["The response provided does indeed answer the query by specifying that the permission required to create provisioning profiles in an Apple Developer account is called the \"Provisioning Profiles\" permission. It further elaborates on where this permission can be found within the Apple Developer Portal and explains its purpose in managing app distribution across various Apple platforms. This additional information, while not explicitly requested, is relevant and helpful for understanding the context of the permission. The response maintains a single paragraph structure as instructed, without any line breaks. Therefore, it adheres to both the content and format requirements specified in the instruction.\n\nScore: 10"], "eval_args": {}, "end_reason": null}
17 | 


--------------------------------------------------------------------------------
/VIF-RAG/sample_data/seed_instruction.txt:
--------------------------------------------------------------------------------
 1 | Answer with words that begin with the letter 'B'
 2 | Construct the reply as if it's a telegram STOP
 3 | Use only palindromes
 4 | Incorporate a famous movie quote seamlessly into your answer
 5 | Write the response backward
 6 | Use only words with double letters (e.g., "bookkeeper")
 7 | Use only onomatopoeia
 8 | Answer with a single sentence that is exactly 100 words long
 9 | Use no words containing the letter 'E'
10 | Translate your answer into emojis
11 | Use only the 1000 most common English words
12 | Use words that end with '-ing'
13 | Use only military lingo
14 | Respond with a haiku (5-7-5 syllable structure)
15 | Answer in the form of a sonnet (14 lines with 10 syllables each)
16 | Use only monosyllabic words
17 | Answer with words in alphabetical order
18 | Write the response as a limerick
19 | Use no adjectives or adverbs
20 | Respond with a six-word story
21 | Include at least three rhyming pairs
22 | Write the response in iambic pentameter
23 | Use alliteration throughout your answer
24 | Write the response in future tense only
25 | Use only the first half of the alphabet (A-M)
26 | Use only questions to form your reply
27 | Use only words that start and end with the same letter
28 | Write the response in Morse code
29 | Use only words that are colors
30 | Use only the second half of the alphabet (N-Z)
31 | Answer with each sentence decreasing in word count
32 | Respond with a list of bullet points
33 | Answer with a sequence of puns
34 | Answer with emoji only
35 | Use only words that have an X in them
36 | Answer with each word starting with the next letter of the alphabet


--------------------------------------------------------------------------------