├── requirements.txt ├── instruction_following_eval ├── requirements.txt ├── data │ ├── input_data2.jsonl │ ├── temp.jsonl │ └── debug.jsonl ├── run.sh ├── README.md ├── instructions_util_test.py ├── instructions_registry.py ├── evaluation_main.py ├── instructions_util.py ├── instructions_test.py └── instructions.py ├── .gitignore ├── examples ├── eval_desta2.py └── eval_llama3_8B_instruct.py ├── README.md └── script └── llm_evaluation.py /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | langdetect 3 | nltk 4 | immutabledict 5 | openai 6 | transformers -------------------------------------------------------------------------------- /instruction_following_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | langdetect 3 | nltk 4 | immutabledict 5 | -------------------------------------------------------------------------------- /instruction_following_eval/data/input_data2.jsonl: -------------------------------------------------------------------------------- 1 | {"key": 1000, "prompt": "Write a 300+ word summary of the wikipedia page \"https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli\". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.", "instruction_id_list": ["punctuation:no_comma", "detectable_format:number_highlighted_sections", "length_constraints:number_words"], "kwargs": [{}, {"num_highlights": 3}, {"relation": "at least", "num_words": 300}]} -------------------------------------------------------------------------------- /instruction_following_eval/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Google Research Authors. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | python3 -m instruction_following_eval.evaluation_main \ 18 | --input_data=./instruction_following_eval/data/input_data.jsonl \ 19 | --input_response_data=./instruction_following_eval/data/input_response_data_gpt4_20231107_145030.jsonl \ 20 | --output_dir=./instruction_following_eval/data/ 21 | 22 | exit 0 -------------------------------------------------------------------------------- /instruction_following_eval/data/temp.jsonl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {"key": 3, "prompt": "Make sure to wrap your entire response with double quotes.", "instruction_id_list": ["startend:quotation"], "kwargs": [{}]} 6 | 7 | 8 | 9 | {"key": 1216, "prompt": "Make sure to wrap your entire response with double quotes", "instruction_id_list": ["startend:quotation"], "kwargs": [{}]} 10 | {"key": 1220, "prompt": "End your entire response with the exact phrase \"Is there anything else I can help with?\"", "instruction_id_list": ["startend:end_checker"], "kwargs": [{"end_phrase": "Is there anything else I can help with?"}]} 11 | {"key": 1221, "prompt": "Start your answer with \"Answer:\"", "instruction_id_list": ["combination:repeat_prompt"], "kwargs": [{"prompt_to_repeat": "Answer:"}]} 12 | {"key": 122, "prompt": "Wrap your entire response with double quotation marks.", "instruction_id_list": 
["startend:quotation"], "kwargs": [{}]} 13 | {"key": 1148, "prompt": "Wrap the entire output in JSON format.", "instruction_id_list": ["detectable_format:json_format"], "kwargs": [{}]} 14 | {"key": 13, "prompt": "Please wrap your entire answer in JSON format. You can use markdown ticks such as ```. For example:\n```JSON\n{\n...\n}\n```", "instruction_id_list": ["detectable_format:json_format"], "kwargs": [{}]} 15 | {"key": 14, "prompt": "Please wrap your answer in JSON format with the key: \"answer\"", "instruction_id_list": ["detectable_format:json_format", "keywords:existence"], "kwargs": [{}, {"keywords": ["answer"]}]} 16 | 17 | -------------------------------------------------------------------------------- /instruction_following_eval/README.md: -------------------------------------------------------------------------------- 1 | # IFEval: Instruction Following Eval 2 | 3 | This is not an officially supported Google product. 4 | 5 | This repository contains source code and data for 6 | [Instruction Following Evaluation for Large Language Models](https://arxiv.org/abs/2311.07911) 7 | 8 | ## Dependencies 9 | 10 | Please make sure that all required python packages are installed via: 11 | 12 | ``` 13 | pip3 install -r requirements.txt 14 | ``` 15 | 16 | ## How to run 17 | 18 | You need to create a jsonl file with two entries: prompt and response. 19 | Then, call `evaluation_main` from the parent folder of 20 | instruction_following_eval. For example: 21 | 22 | ```bash 23 | # Content of `--input_response_data` should be like: 24 | # {"prompt": "Write a 300+ word summary ...", "response": "PUT YOUR MODEL RESPONSE HERE"} 25 | # {"prompt": "I am planning a trip to ...", "response": "PUT YOUR MODEL RESPONSE HERE"} 26 | # ...
27 | python3 -m instruction_following_eval.evaluation_main \ 28 | --input_data=./instruction_following_eval/data/input_data.jsonl \ 29 | --input_response_data=./instruction_following_eval/data/input_response_data_gpt4_20231107_145030.jsonl \ 30 | --output_dir=./instruction_following_eval/data/ 31 | ``` 32 | 33 | ## Reference 34 | 35 | If you use our work, please consider citing our preprint: 36 | 37 | ``` 38 | @article{zhou2023instruction, 39 | title={Instruction-Following Evaluation for Large Language Models}, 40 | author={Zhou, Jeffrey and Lu, Tianjian and Mishra, Swaroop and Brahma, Siddhartha and Basu, Sujoy and Luan, Yi and Zhou, Denny and Hou, Le}, 41 | journal={arXiv preprint arXiv:2311.07911}, 42 | year={2023} 43 | } 44 | ``` -------------------------------------------------------------------------------- /instruction_following_eval/data/debug.jsonl: -------------------------------------------------------------------------------- 1 | {"key": 1000, "prompt": "Write a 300+ word summary of the wikipedia page \"https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli\". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.", "response": "Raymond III was the Count of Tripoli from 1152 to 1187 and Prince of Galilee and Tiberias in the Kingdom of Jerusalem. He was born in 1140 to Raymond II of Tripoli and Hodierna of Jerusalem. His mother was the daughter of Baldwin II of Jerusalem. \n\n*Early Life and Succession*\n\nRaymond III was only a child when his father was murdered. His mother Hodierna was regent until Raymond came of age. In 1155 Raymond married Eschiva the daughter of Walter I of Beirut. They had three children: Raymond IV Bertrand and a daughter who married Guy of Lusignan. \n\n*Reign*\n\nRaymond III's reign was marked by a series of conflicts and alliances with the Muslim world. 
He was involved in the defense of the Kingdom of Jerusalem against Nur ad-Din and later Saladin. He was also a key figure in the internal politics of the kingdom. He was a supporter of the queen mother Amalric of Jerusalem and opposed the succession of Guy of Lusignan. \n\n*Later Life and Death*\n\nIn 1187 Raymond III was part of the disastrous Battle of Hattin where the Christian forces were defeated by Saladin. He was one of the few to escape the battlefield but died later that year. His son Raymond IV succeeded him as Count of Tripoli. \n\nRaymond III's life and reign were marked by the complex politics and warfare of the Crusader states. He was a key figure in the defense of the Kingdom of Jerusalem and his death marked a significant loss for the Christian forces in the region. His legacy is a testament to the turbulent times in which he lived and the challenges faced by the Crusader states in their relations with the Muslim world."} 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | # Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyderworkspace 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # pytype static type analyzer 132 | .pytype/ 133 | 134 | # Cython debug symbols 135 | cython_debug/ -------------------------------------------------------------------------------- /examples/eval_desta2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import torch 5 | from transformers import AutoModel 6 | from pathlib import Path 7 | import logging 8 | from tqdm import tqdm 9 | 10 | def arg_parser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_id", type=str, default="DeSTA-ntu/DeSTA2-8B-beta") 13 | 14 | parser.add_argument("--data_dir", type=str, default="./data") 15 | parser.add_argument("--output_dir", type=str, default="./outputs") 16 | 17 | return parser.parse_args() 18 | 19 | def get_response(args, data, model, tokenizer=None): 20 | instruction = data["instruction"] 21 | 22 | # TODO: Replace with actual model inference logic 23 | messages = [ 24 | {"role": "system", "content": "Follow the given instructions."}, 25 | {"role": "audio", "content": args.data_dir + "/audios/" + data["audio_filepath"]}, 26 | {"role": "user", "content": f"{instruction}"}, 27 | ] 28 | 29 | generated_ids = model.chat(messages, max_new_tokens=2048, do_sample=False, temperature=1, top_p=1.0) 30 | 31 | response = model.tokenizer.batch_decode(generated_ids, 
skip_special_tokens=True)[0] 32 | 33 | return response, messages 34 | 35 | def main(args): 36 | data_dir = Path(args.data_dir) 37 | output_dir = Path(args.output_dir) / args.model_id.replace("/", "--") 38 | 39 | if not os.path.exists(output_dir): 40 | os.makedirs(output_dir, exist_ok=True) 41 | 42 | 43 | manifest_paths = [ 44 | Path(data_dir) / "eval_data/closed_ended_questions.jsonl", 45 | Path(data_dir) / "eval_data/creative_writing.jsonl", 46 | Path(data_dir) / "eval_data/chain-of-thought.jsonl", 47 | Path(data_dir) / "eval_data/closed_ended_questions-woprompt.jsonl", 48 | ] 49 | 50 | # Load model 51 | model = AutoModel.from_pretrained(args.model_id, trust_remote_code=True, cache_dir=os.getenv("HF_HOME"), token=os.getenv("HF_TOKEN")) 52 | model.to("cuda") 53 | 54 | for manifest_path in manifest_paths: 55 | output_file = output_dir / manifest_path.name 56 | 57 | # logging to a file path that is the same as the manifest file 58 | logging.basicConfig(filename=output_dir / f"{manifest_path.stem}.log", level=logging.INFO) 59 | 60 | logging.info(f"Processing {manifest_path}") 61 | logging.info(f"Output file: {output_file}") 62 | 63 | with manifest_path.open("r") as fin, output_file.open("w") as fout: 64 | datas = [json.loads(line) for line in fin.readlines()] 65 | 66 | 67 | for data in tqdm(datas): 68 | 69 | response, messages = get_response(args, data, model) 70 | 71 | data["messages"] = messages 72 | data["response"] = response # Add response to data 73 | 74 | fout.write(json.dumps(data) + "\n") 75 | logging.info(json.dumps(data)) 76 | 77 | if __name__ == "__main__": 78 | args = arg_parser() 79 | main(args) -------------------------------------------------------------------------------- /examples/eval_llama3_8B_instruct.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from pathlib import Path 7 | 
import logging 8 | from tqdm import tqdm 9 | 10 | def arg_parser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_id", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct") 13 | 14 | parser.add_argument("--data_dir", type=str, default="./data") 15 | parser.add_argument("--output_dir", type=str, default="./outputs") 16 | 17 | return parser.parse_args() 18 | 19 | def main(args): 20 | data_dir = Path(args.data_dir) 21 | output_dir = Path(args.output_dir) / args.model_id.replace("/", "--") 22 | (output_dir / "logs").mkdir(parents=True, exist_ok=True) 23 | 24 | 25 | manifest_paths = [ 26 | Path(data_dir) / "eval_data/closed_ended_questions.jsonl", 27 | Path(data_dir) / "eval_data/creative_writing.jsonl", 28 | Path(data_dir) / "eval_data/chain-of-thought.jsonl", 29 | Path(data_dir) / "eval_data/closed_ended_questions-woprompt.jsonl", 30 | ] 31 | 32 | # Load model 33 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 34 | tokenizer.padding_side = "left" 35 | tokenizer.pad_token_id = tokenizer.eos_token_id 36 | model = AutoModelForCausalLM.from_pretrained( 37 | args.model_id, 38 | torch_dtype=torch.bfloat16, 39 | device_map="auto", 40 | cache_dir=os.getenv("HF_HOME"), 41 | token=os.getenv("HF_TOKEN"), 42 | ) 43 | 44 | for manifest_path in manifest_paths: 45 | output_file = output_dir / manifest_path.name 46 | 47 | # logging to a file path that is the same as the manifest file 48 | logging.basicConfig(filename=output_dir / f"{manifest_path.stem}.log", level=logging.INFO) 49 | 50 | logging.info(f"Processing {manifest_path}") 51 | logging.info(f"Output file: {output_file}") 52 | 53 | with manifest_path.open("r") as fin, output_file.open("w") as fout: 54 | datas = [json.loads(line) for line in fin.readlines()] 55 | 56 | 57 | for data in tqdm(datas): 58 | instruction = data["instruction"] 59 | textual_audio = data["textual_audio"] 60 | 61 | # TODO: Replace with actual model inference logic 62 | content = f"""Speech Input: 
{textual_audio}\n\n{instruction}""" 63 | messages = [ 64 | {"role": "system", "content": "Follow the given instructions."}, 65 | {"role": "user", "content": content}, 66 | ] 67 | input_ids = tokenizer.apply_chat_template( 68 | messages, 69 | add_generation_prompt=True, 70 | return_tensors="pt", 71 | ).to(model.device) 72 | 73 | terminators = [ 74 | tokenizer.eos_token_id, 75 | tokenizer.convert_tokens_to_ids("<|eot_id|>") 76 | ] 77 | 78 | outputs = model.generate( 79 | input_ids, 80 | max_new_tokens=2048, 81 | eos_token_id=terminators, 82 | pad_token_id=tokenizer.eos_token_id, 83 | do_sample=False, 84 | temperature=1.0, 85 | top_p=1.0, 86 | ) 87 | 88 | response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True) 89 | data["messages"] = messages 90 | data["response"] = response # Add response to data 91 | 92 | fout.write(json.dumps(data) + "\n") 93 | logging.info(json.dumps(data)) 94 | 95 | break 96 | 97 | if __name__ == "__main__": 98 | args = arg_parser() 99 | main(args) -------------------------------------------------------------------------------- /instruction_following_eval/instructions_util_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Test for utility library of instructions.""" 17 | 18 | from absl.testing import absltest 19 | from absl.testing import parameterized 20 | from instruction_following_eval import instructions_util 21 | 22 | 23 | class InstructionsUtilTest(parameterized.TestCase): 24 | 25 | TEST_WORD_COUNT_CASE_1 = ("word1, word2, word3, word4.", 4) 26 | 27 | TEST_WORD_COUNT_CASE_2 = ( 28 | """ 29 | Bard can you tell me which is the best optimization method for the 30 | transition from an hydro-thermal system to an hydro-renewables system""", 31 | 24) 32 | 33 | TEST_WORD_COUNT_CASE_3 = ( 34 | """ 35 | Hyphenated-word has two word counts. 36 | """, 6) 37 | 38 | def test_word_count(self): 39 | """Tests word counter.""" 40 | with self.subTest(f"{self.TEST_WORD_COUNT_CASE_1[0]}"): 41 | text, expected_num_words = self.TEST_WORD_COUNT_CASE_1 42 | actual_num_words = instructions_util.count_words(text) 43 | self.assertEqual(expected_num_words, actual_num_words) 44 | 45 | with self.subTest(f"{self.TEST_WORD_COUNT_CASE_2[0]}"): 46 | text, expected_num_words = self.TEST_WORD_COUNT_CASE_2 47 | actual_num_words = instructions_util.count_words(text) 48 | self.assertEqual(expected_num_words, actual_num_words) 49 | 50 | with self.subTest(f"{self.TEST_WORD_COUNT_CASE_3[0]}"): 51 | text, expected_num_words = self.TEST_WORD_COUNT_CASE_3 52 | actual_num_words = instructions_util.count_words(text) 53 | self.assertEqual(expected_num_words, actual_num_words) 54 | 55 | @parameterized.named_parameters( 56 | [ 57 | { # pylint: disable=g-complex-comprehension 58 | "testcase_name": ( 59 | f"_response={response}_num_sentences={num_sentences}" 60 | ), 61 | "response": response, 62 | "num_sentences": num_sentences, 63 | } 64 | for response, num_sentences in [ 65 | ("xx,x. xx,x! xx/x. x{x}x? x.", 5), 66 | ("xx,x! xxxx. x(x)x?", 3), 67 | ("xxxx. xx,x! xx|x. x&x x?", 4), 68 | ("xx-x]xx,x! 
x{x}xx,x.", 2), 69 | ] 70 | ] 71 | ) 72 | def test_count_sentences(self, response, num_sentences): 73 | """Tests sentence counter.""" 74 | actual_num_sentences = instructions_util.count_sentences(response) 75 | self.assertEqual(num_sentences, actual_num_sentences) 76 | 77 | TEST_SENTENCE_SPLIT_1 = """ 78 | Google is a technology company. It was founded in 1998 by Larry Page 79 | and Sergey Brin. Google's mission is to organize the world's information 80 | and make it universally accessible and useful. 81 | """ 82 | 83 | TEST_SENTENCE_SPLIT_2 = """ 84 | The U.S.A has many Ph.D. students. They will often haven a .com website 85 | sharing the research that they have done. 86 | """ 87 | 88 | EXPECTED_SENTENCE_SPLIT_1 = [ 89 | "Google is a technology company.", 90 | "It was founded in 1998 by Larry Page and Sergey Brin.", 91 | ( 92 | "Google's mission is to organize the world's information and make it" 93 | " universally accessible and useful." 94 | ), 95 | ] 96 | 97 | EXPECTED_SENTENCE_SPLIT_2 = [ 98 | "The U.S.A has many Ph.D. students.", 99 | ( 100 | "They will often haven a .com website sharing the research that they" 101 | " have done." 
102 | ), 103 | ] 104 | 105 | def test_sentence_splitter(self): 106 | """Tests sentence splitter.""" 107 | sentence_split_1 = instructions_util.split_into_sentences( 108 | self.TEST_SENTENCE_SPLIT_1 109 | ) 110 | sentence_split_2 = instructions_util.split_into_sentences( 111 | self.TEST_SENTENCE_SPLIT_2 112 | ) 113 | 114 | self.assertEqual(self.EXPECTED_SENTENCE_SPLIT_1, sentence_split_1) 115 | self.assertEqual(self.EXPECTED_SENTENCE_SPLIT_2, sentence_split_2) 116 | 117 | def test_generate_keywords(self): 118 | """Tests generate keywords.""" 119 | self.assertLen(instructions_util.generate_keywords(10), 10) 120 | 121 | 122 | if __name__ == "__main__": 123 | absltest.main() 124 | -------------------------------------------------------------------------------- /instruction_following_eval/instructions_registry.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Registry of all instructions.""" 17 | from instruction_following_eval import instructions 18 | 19 | _KEYWORD = "keywords:" 20 | 21 | _LANGUAGE = "language:" 22 | 23 | _LENGTH = "length_constraints:" 24 | 25 | _CONTENT = "detectable_content:" 26 | 27 | _FORMAT = "detectable_format:" 28 | 29 | _MULTITURN = "multi-turn:" 30 | 31 | _COMBINATION = "combination:" 32 | 33 | _STARTEND = "startend:" 34 | 35 | _CHANGE_CASES = "change_case:" 36 | 37 | _PUNCTUATION = "punctuation:" 38 | 39 | INSTRUCTION_DICT = { 40 | _KEYWORD + "existence": instructions.KeywordChecker, 41 | _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, 42 | # TODO(jeffreyzhou): make a proper set of sentences to choose from 43 | # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, 44 | _KEYWORD + "forbidden_words": instructions.ForbiddenWords, 45 | _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, 46 | _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, 47 | _LENGTH + "number_sentences": instructions.NumberOfSentences, 48 | _LENGTH + "number_paragraphs": instructions.ParagraphChecker, 49 | _LENGTH + "number_words": instructions.NumberOfWords, 50 | _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, 51 | _CONTENT + "number_placeholders": instructions.PlaceholderChecker, 52 | _CONTENT + "postscript": instructions.PostscriptChecker, 53 | _FORMAT + "number_bullet_lists": instructions.BulletListChecker, 54 | # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace 55 | # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, 56 | _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, 57 | _FORMAT + "number_highlighted_sections": ( 58 | instructions.HighlightSectionChecker), 59 | _FORMAT + "multiple_sections": instructions.SectionChecker, 60 | # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. 
61 | # _FORMAT + "rephrase": instructions.RephraseChecker, 62 | _FORMAT + "json_format": instructions.JsonFormat, 63 | _FORMAT + "title": instructions.TitleChecker, 64 | # TODO(tianjianlu): Re-enable with specific prompts. 65 | # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, 66 | _COMBINATION + "two_responses": instructions.TwoResponsesChecker, 67 | _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, 68 | _STARTEND + "end_checker": instructions.EndChecker, 69 | _CHANGE_CASES 70 | + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, 71 | _CHANGE_CASES 72 | + "english_capital": instructions.CapitalLettersEnglishChecker, 73 | _CHANGE_CASES 74 | + "english_lowercase": instructions.LowercaseLettersEnglishChecker, 75 | _PUNCTUATION + "no_comma": instructions.CommaChecker, 76 | _STARTEND + "quotation": instructions.QuotationChecker, 77 | } 78 | 79 | INSTRUCTION_CONFLICTS = { 80 | _KEYWORD + "existence": {_KEYWORD + "existence"}, 81 | _KEYWORD + "frequency": {_KEYWORD + "frequency"}, 82 | # TODO(jeffreyzhou): make a proper set of sentences to choose from 83 | # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, 84 | _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, 85 | _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, 86 | _LANGUAGE 87 | + "response_language": { 88 | _LANGUAGE + "response_language", 89 | _FORMAT + "multiple_sections", 90 | _KEYWORD + "existence", 91 | _KEYWORD + "frequency", 92 | _KEYWORD + "forbidden_words", 93 | _STARTEND + "end_checker", 94 | _CHANGE_CASES + "english_capital", 95 | _CHANGE_CASES + "english_lowercase", 96 | }, 97 | _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, 98 | _LENGTH + "number_paragraphs": { 99 | _LENGTH + "number_paragraphs", 100 | _LENGTH + "nth_paragraph_first_word", 101 | _LENGTH + "number_sentences", 102 | _LENGTH + "nth_paragraph_first_word", 103 | }, 104 | _LENGTH + "number_words": {_LENGTH + "number_words"}, 105 
| _LENGTH + "nth_paragraph_first_word": { 106 | _LENGTH + "nth_paragraph_first_word", 107 | _LENGTH + "number_paragraphs", 108 | }, 109 | _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"}, 110 | _CONTENT + "postscript": {_CONTENT + "postscript"}, 111 | _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"}, 112 | # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace 113 | # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, 114 | _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), 115 | _FORMAT 116 | + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, 117 | _FORMAT 118 | + "multiple_sections": { 119 | _FORMAT + "multiple_sections", 120 | _LANGUAGE + "response_language", 121 | _FORMAT + "number_highlighted_sections", 122 | }, 123 | # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. 124 | # _FORMAT + "rephrase": instructions.RephraseChecker, 125 | _FORMAT 126 | + "json_format": set(INSTRUCTION_DICT.keys()).difference( 127 | {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} 128 | ), 129 | _FORMAT + "title": {_FORMAT + "title"}, 130 | # TODO(tianjianlu): Re-enable with specific prompts. 
131 | # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, 132 | _COMBINATION 133 | + "two_responses": set(INSTRUCTION_DICT.keys()).difference({ 134 | _KEYWORD + "forbidden_words", 135 | _KEYWORD + "existence", 136 | _LANGUAGE + "response_language", 137 | _FORMAT + "title", 138 | _PUNCTUATION + "no_comma" 139 | }), 140 | _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference({ 141 | _KEYWORD + "existence", 142 | _FORMAT + "title", 143 | _PUNCTUATION + "no_comma" 144 | }), 145 | _STARTEND + "end_checker": {_STARTEND + "end_checker"}, 146 | _CHANGE_CASES + "capital_word_frequency": { 147 | _CHANGE_CASES + "capital_word_frequency", 148 | _CHANGE_CASES + "english_lowercase", 149 | _CHANGE_CASES + "english_capital", 150 | }, 151 | _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, 152 | _CHANGE_CASES + "english_lowercase": { 153 | _CHANGE_CASES + "english_lowercase", 154 | _CHANGE_CASES + "english_capital", 155 | }, 156 | _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"}, 157 | _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"}, 158 | } 159 | 160 | 161 | def conflict_make(conflicts): 162 | """Makes sure if A conflicts with B, B will conflict with A. 163 | 164 | Args: 165 | conflicts: Dictionary of potential conflicts where key is instruction id 166 | and value is set of instruction ids that it conflicts with. 167 | 168 | Returns: 169 | Revised version of the dictionary. All instructions conflict with 170 | themselves. If A conflicts with B, B will conflict with A. 
171 | """ 172 | for key in conflicts: 173 | for k in conflicts[key]: 174 | conflicts[k].add(key) 175 | conflicts[key].add(key) 176 | return conflicts 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Speech-IFEval: Evaluating Instruction-Following and Quantifying Catastrophic Forgetting in Speech-Aware Language Models 3 | 4 | - Ke-Han Lu, Chun-Yi Kuan and Hung-yi Lee 5 | - National Taiwan University 6 | - Accepted to **Interspeech 2025** 7 | 8 | [![](https://img.shields.io/badge/arxiv-2505.19037-brightgreen)](https://arxiv.org/abs/2505.19037) 9 | 10 | - ⁉️ Most speech-aware language models (SLMs) are built from an instruction-tuned LLM, but we found they cannot follow even simple output constraints! 11 | - 🤔 The catastrophic forgetting problem is often observed in SLM development, but we don't have an evaluation metric to measure it! 12 | 13 | ![](https://github.com/user-attachments/assets/6f13b716-5bfa-42b6-af8d-efeba49cc8eb) 14 | 15 | ## 🏆 Leaderboard 16 | 17 | | Rank | Model | Closed-ended (%) | Creative Writing (%) | CoT (%) | **IFrate (%)** | **Δ (Forgetting Rate)** | 18 | |------|-------------------------------|------------------|-----------------------|---------|----------------|--------------------------| 19 | || **SLMs**|||||| 20 | | 1 | [DeSTA2](https://arxiv.org/abs/2409.20007) | 83.71 | 91.75 | 91.50 | **89.23** | **-3.57** | 21 | | 2 | [DiVA](https://arxiv.org/abs/2410.02678) | 83.14 | 61.75 | 83.50 | 76.13 | -17.73 | 22 | | 3 | [BLSP-emo](https://arxiv.org/abs/2406.03872) | 66.35 | 63.75 | 50.50 | 60.20 | -17.92 | 23 | | 4 | [Qwen2-Audio-Instruct](https://arxiv.org/abs/2407.10759) | 41.59 | 67.75 | 32.00 | 47.11 | – | 24 | | 5 | [SALMONN](https://arxiv.org/abs/2310.13289) | 37.41 | 61.25 | 12.00 | 36.89 | -50.20 | 25 | | 6 | [Qwen-Audio-Chat](https://arxiv.org/abs/2311.07919) | 10.93 | 56.00 | 32.00 | 32.98 | – | 
26 | | 7 | [LTU-AS](https://arxiv.org/abs/2309.14405) | 28.83 | 47.75 | 11.00 | 29.19 | -54.90 | 27 | || **Reference systems** (cascade)|||||| 28 | | | Llama3.1-8B-Instruct | 88.32 | 93.75 | 98.50 | 93.52 | – | 29 | | | Llama3-8B-Instruct | 93.35 | 93.75 | 90.50 | 92.53 | – | 30 | | | Llama2-7B-Chat | 62.27 | 71.00 | 92.50 | 75.26 | – | 31 | | | Qwen2.5-7B-Instruct | 95.71 | 83.25 | 71.00 | 88.49 | – | 32 | | | Qwen2-7B-Instruct | 95.82 | 86.00 | 67.50 | 83.11 | – | 33 | | | Qwen-7B-chat | 62.27 | 75.25 | 82.50 | 73.34 | – | 34 | | | Vicuna 13B v1.1 | 72.45 | 78.25 | 71.50 | 74.07 | – | 35 | | | Vicuna 7B v1.1 | 52.20 | 78.00 | 64.00 | 64.73 | – | 36 | 37 | 38 | 39 | 40 | > **Note:** IFrate is the average of Closed-ended, Creative Writing, and CoT following rates. 41 | > Forgetting Rate (Δ) is computed relative to each model’s original text-only LLM. 42 | > 43 | > Qwen-audio series use Qwen-7B as their backbone, which is *not* instruction-tuned. Therefore, no reference system is available for Δ calculation. 44 | 45 | > 📬 If you have evaluated your model using **Speech-IFEval**, feel free to send your results to us. Once verified, we will update the leaderboard to include your entry! 
46 | 47 | --- 48 | 49 | ## 📊 Evaluate your model 50 | 51 | ### 🔧 Setup 52 | 53 | ```bash 54 | git clone https://github.com/kehanlu/Speech-IFEval.git 55 | cd Speech-IFEval 56 | pip install -r requirements.txt 57 | ``` 58 | 59 | 📥 Download Audio Files 60 | 61 | ```bash 62 | cd data 63 | wget https://huggingface.co/datasets/kehanlu/Speech-IFEval/resolve/main/audios.tar 64 | tar -xvf audios.tar 65 | ``` 66 | 67 | **Directory structure:** 68 | 69 | ``` 70 | data/ 71 | │── eval_data/ 72 | │ │── closed_ended_questions.jsonl # Closed-ended tasks 73 | │ │── creative_writing.jsonl # Creative writing tasks 74 | │ │── chain-of-thought.jsonl # CoT reasoning tasks 75 | │ │── closed_ended_questions-woprompt.jsonl # Baseline version of closed-ended tasks (optional) 76 | │ 77 | │── audios/ 78 | │ │── Automatic_speech_recognition/ 79 | │ │── Gender_recognition/ 80 | │ │── Speech_emotion_recognition/ 81 | │ │── MMAU/ 82 | ``` 83 | 84 | 85 | ### 1. Evaluate Instruction-Following Rate (IFrate) 86 | 87 | Run your Speech-aware Language Model (SLM) evaluation (e.g., **DeSTA2**): 88 | 89 | ```bash 90 | python examples/eval_desta2.py --data /lab/Speech-IFEval/data --output_dir outputs 91 | ``` 92 | 93 | Then compute IFrate with: 94 | 95 | ```bash 96 | # Closed-ended and Creative Writing evaluation 97 | python -m instruction_following_eval.evaluation_main -i outputs/DeSTA-ntu--DeSTA2-8B-beta/closed_ended_questions.jsonl 98 | python -m instruction_following_eval.evaluation_main -i outputs/DeSTA-ntu--DeSTA2-8B-beta/creative_writing.jsonl 99 | 100 | # Chain-of-Thought (CoT) reasoning evaluation 101 | python script/llm_evaluation.py -i outputs/DeSTA-ntu--DeSTA2-8B-beta/chain-of-thought.jsonl --stage 0 102 | ``` 103 | 104 | **Example Results (DeSTA2):** 105 | 106 | | Task | Following Rate | 107 | |------------------|----------------| 108 | | Closed-ended | 83.71% | 109 | | Creative Writing | 91.75% | 110 | | Chain-of-Thought | 91.50% | 111 | | **IFrate** | **89.23%** | 112 | 113 | --- 114 | 115 | 
### 2. Evaluate Forgetting Rate (Δ) 116 | 117 | > With a reference system, we can assess the forgetting rate by comparing the speech-aware model to its text-only counterpart, thereby quantifying the degradation introduced by speech-text training. 118 | 119 | 120 | Run the reference system baseline (e.g., **Llama3-8B-Instruct for DeSTA2**): 121 | 122 | ```bash 123 | python examples/eval_llama3_8B_instruct.py --data /lab/Speech-IFEval/data --output_dir outputs 124 | ``` 125 | 126 | **Reference System Results:** 127 | 128 | | Task | Following Rate | 129 | |------------------|----------------| 130 | | Closed-ended | 93.35% | 131 | | Creative Writing | 93.75% | 132 | | Chain-of-Thought | 90.50% | 133 | | **IFrate** | **92.53%** | 134 | 135 | 136 | **Calculate Forgetting Rate (Δ)** 137 | 138 | $$ 139 | \Delta = \frac{\text{IFrate}_{\text{SLM}} - \text{IFrate}_{\text{Ref}}}{\text{IFrate}_{\text{Ref}}} \times 100\% = \frac{89.23 - 92.53}{92.53} \times 100\% = -3.57\% 140 | $$ 141 | 142 | | Model | IFrate | Δ (Forgetting Rate) | 143 | |------------------|--------|---------------------| 144 | | Llama3-8B-Instruct | 92.53% | -- | 145 | | DeSTA2 | 89.23% | -3.57% | 146 | 147 | --- 148 | 149 | ### 📌 (Optional) Task-Level Evaluation 150 | 151 | To replicate **Table 4** from the paper (with and without output constraints): 152 | 153 | ```bash 154 | # Without constraint prompt (baseline task-level performance) 155 | python script/llm_evaluation.py -i outputs/DeSTA-ntu--DeSTA2-8B-beta/closed_ended_questions-woprompt.jsonl --stage 0 156 | 157 | # With constraint prompt 158 | python script/llm_evaluation.py -i outputs/DeSTA-ntu--DeSTA2-8B-beta/closed_ended_questions.jsonl --stage 0 159 | ``` 160 | 161 | 162 | ### Citation 163 | 164 | ```bibtex 165 | @article{lu2025speechifeval, 166 | title={Speech-IFEval: Evaluating Instruction-Following and Quantifying Catastrophic Forgetting in Speech-Aware Language Models}, 167 | author={Lu, Ke-Han and Kuan, Chun-Yi and Lee, Hung-yi}, 168 | year={2025}, 169 | eprint={2505.19037}, 170 | archivePrefix={arXiv}, 171 |
"""LLM-as-judge evaluation of model responses (script/llm_evaluation.py).

The script runs in two stages, selected with ``--stage``:

* Stage 0 sends every record of the input JSONL file to an OpenAI chat model
  and stores the judge's reply next to the record in
  ``<input dir>/reports/tmp/<input stem>.jsonl``.
* Stage 1 reads that temporary file, scores each record according to its
  ``metric`` field (``accuracy``, ``wer`` or ``cot``) and writes the scored
  records to ``<input dir>/reports/llm_eval@<input stem>.jsonl``.

Starting at stage 0 runs both stages; starting at stage 1 re-scores an
existing temporary file without calling the API again.
"""

import argparse
import json
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from time import sleep

import torch
from jiwer import wer
from openai import OpenAI
from tqdm import tqdm
from whisper_normalizer.basic import BasicTextNormalizer

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Chat model used as the judge.
_JUDGE_MODEL = "gpt-4o-2024-11-20"

# Matches the verdict in a judge reply such as 'Result: YES', 'result:no' or
# 'Result: **Yes**'.  Whitespace and light markdown/quote decoration between
# the colon and the verdict are tolerated; the trailing \b keeps words like
# "yesterday" from being read as a verdict.
_RESULT_PATTERN = re.compile(r"result:[\s*_\"'`]*(yes|no)\b", re.IGNORECASE)

_ACCURACY_SYSTEM_PROMPT = """You will be given a question, a corresponding correct answer and a response from a model.
Model's Response is a reply to the Question. Your task is to judge if "Model's Response" aligns with the "Ground Truth Answer" based on the "Question".
Please strictly follow the guidelines below:
- Answer with the format "Result: " at the end.
- Output "YES" if the response aligns with the ground truth answer; output "NO" if the response does not match the ground truth answer.
"""

_WER_SYSTEM_PROMPT = """You will be given a response from an ASR model. Your task is to extract a **substring** from the model's response that eliminates all extra phrases, explanations, or introductory text. The substring will be evaluate by the WER metric, so it should be **exactly the same** as the model's response, with no modifications.\n\nPlease strictly follow the guidelines below:\n- The substring should be **exactly the same** as the model's response, with no modifications.\n- Eliminate all extra phrases, explanations, or introductory text while keeping the substring itself 100% unchanged.\n- You must output the substring only."""

_COT_SYSTEM_PROMPT = """You will be given a user input and a model response. The model's response is a reply to the user input. Your task is to determine whether the response demonstrates reasoning behavior, such as breaking down the problem, explaining intermediate steps, or providing a analysis.

Please strictly follow the guidelines below:
- Output "YES" if the response includes any form of behavior beyond a direct answer corresponding to the user input.
- Output "NO" only if the response is a minimal or purely factual reply.
- Answer in the format: "Result: " at the end.
"""


def normalize_text(text):
    """Return ``text`` with angle brackets removed and Whisper basic normalization applied."""
    normalizer = BasicTextNormalizer()
    without_brackets = text.replace("<", "").replace(">", "")
    return normalizer(without_brackets).strip()


def extract_result(text):
    """Extract the judge's yes/no verdict from ``text``.

    Args:
        text: The judge model's reply.

    Returns:
        The first verdict that follows "Result:" (matched case-insensitively),
        in the casing used by the judge, or ``None`` when the reply contains
        no recognizable verdict.  Callers must handle ``None``.
    """
    match = _RESULT_PATTERN.search(text)
    return match.group(1) if match else None


def arg_parser():
    """Parse the command line and return the resulting namespace."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_response_data", "-i", type=str, required=True)
    parser.add_argument("--stage", type=int, default=0, help="Starting stage. 0: Generate LLM evaluation results (generate a temporary file); 1: Performance evaluation (generate a final result from the temporary file)")

    return parser.parse_args()


def generate_eval_response(data, remove_instruction=False):
    """Ask the judge model to evaluate one record.

    Args:
        data: Record with the keys ``instruction``, ``response`` and
            ``metric`` (one of ``"accuracy"``, ``"wer"``, ``"cot"``).
            ``label`` is required when the metric is ``"accuracy"``.
        remove_instruction: When true, only the first line of the instruction
            is shown to the judge.

    Returns:
        A ``(messages, response)`` tuple: the chat messages that were sent and
        the text of the judge's reply (an empty string if the API returned no
        content).

    Raises:
        ValueError: If the metric is unknown, or the metric is ``"accuracy"``
            and the record has no label.
    """
    instruction = data["instruction"]
    if remove_instruction:
        instruction = instruction.split("\n")[0]
        print(instruction)
    label = data.get("label")
    model_response = data["response"]
    metric = data["metric"]

    if metric == "accuracy":
        if not label:
            raise ValueError("metric 'accuracy' requires a non-empty 'label'")
        system_prompt = _ACCURACY_SYSTEM_PROMPT
        content = f"""Question: {instruction}\nGround Truth Answer: {label}\nModel's Response: {model_response}"""
    elif metric == "wer":
        system_prompt = _WER_SYSTEM_PROMPT
        content = f"""Question: {instruction}\nModel's Response: {model_response}"""
    elif metric == "cot":
        system_prompt = _COT_SYSTEM_PROMPT
        content = f"""User input: {instruction}\nModel's Response: {model_response}"""
    else:
        # Previously fell through and failed later with an unbound local.
        raise ValueError(f"Unsupported metric: {metric!r}")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]

    completion = client.chat.completions.create(
        model=_JUDGE_MODEL,
        messages=messages,
    )
    # The API may return no content; treat that as an empty reply.
    response = completion.choices[0].message.content or ""

    if metric == "wer":
        # The judge is asked for a verbatim substring of the model response;
        # log the pair when it returned something else.
        normalized_reply = normalize_text(response)
        normalized_model = normalize_text(model_response)
        if normalized_reply not in normalized_model:
            logging.warning(f"{'='*79}\n{normalized_model}\n{'*'*79}\n{normalized_reply}\n{'='*79}")

    sleep(0.3)  # brief pause between consecutive API calls
    return messages, response


def _load_jsonl(path):
    """Return the JSON records stored one per line in ``path``, skipping blank lines."""
    with path.open("r", encoding="utf-8") as fin:
        return [json.loads(line) for line in fin if line.strip()]


def main(args):
    """Run the judge and/or scoring stage for ``args.input_response_data``."""
    input_response_data_path = Path(args.input_response_data)

    output_dir = input_response_data_path.parent / "reports"
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "tmp").mkdir(parents=True, exist_ok=True)
    (output_dir / "logs").mkdir(parents=True, exist_ok=True)
    logging.basicConfig(filename=output_dir / "logs" / f"{input_response_data_path.stem}.log", level=logging.INFO)

    tmp_output_file = output_dir / "tmp" / f"{input_response_data_path.stem}.jsonl"

    if args.stage < 1:
        logging.info(f"=================== LLM evaluation ====================")
        logging.info(f"Processing {input_response_data_path}")
        logging.info(f"Output file: {tmp_output_file}")
        remove_instruction = input_response_data_path.stem in ["close", "close.1"]
        datas = _load_jsonl(input_response_data_path)
        with tmp_output_file.open("w", encoding="utf-8") as fout:
            for data in tqdm(datas):
                messages, response = generate_eval_response(data, remove_instruction=remove_instruction)
                data["eval_response"] = response
                data["messages"] = messages
                fout.write(json.dumps(data) + "\n")
                logging.info(json.dumps(data))

    output_file = output_dir / f"llm_eval@{input_response_data_path.stem}.jsonl"
    if args.stage < 2:
        logging.info(f"=================== Performance Evaluation ====================")
        # Read the judge output before opening the report so that a missing
        # or malformed temporary file does not truncate an existing report.
        datas = _load_jsonl(tmp_output_file)

        dataset_group = defaultdict(list)
        hyps = []
        refs = []
        with output_file.open("w", encoding="utf-8") as fout:
            for data in tqdm(datas):
                metric = data["metric"]
                if metric in ("accuracy", "cot"):
                    verdict = extract_result(data["eval_response"] or "")
                    if verdict is None:
                        # A reply without a verdict is scored as incorrect
                        # instead of aborting the whole evaluation.
                        logging.warning("No 'Result: yes/no' verdict in judge reply: %r", data["eval_response"])
                    is_correct = verdict is not None and verdict.lower() == "yes"
                    group = data["dataset"] if metric == "accuracy" else "cot"
                    dataset_group[group].append(1 if is_correct else 0)
                    data["correct"] = is_correct

                elif metric == "wer":
                    hyp = normalize_text(data["eval_response"])
                    ref = normalize_text(data["label"])
                    hyps.append(hyp)
                    refs.append(ref)
                    # Positional arguments: the first parameter is named
                    # `truth` in older jiwer releases and `reference` in
                    # newer ones.
                    data["correct"] = wer([ref], [hyp])

                fout.write(json.dumps(data) + "\n")

        # print report
        if refs:
            wer_score = wer(refs, hyps)
            logging.info(f"WER: {wer_score}")
            print(f"WER: {wer_score}")
        for dataset, correct in dataset_group.items():
            logging.info(f"{dataset} ACC: {sum(correct)/len(correct)}")
            print(f"{dataset} ACC: {sum(correct)/len(correct)}")


if __name__ == "__main__":
    args = arg_parser()
    main(args)
# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Binary of evaluating instruction following. See README.md."""

import argparse
import collections
import dataclasses
import json
import os
from pathlib import Path
from typing import Dict, Optional, Sequence, Union

from absl import flags
from absl import logging

from instruction_following_eval import instructions_registry


# NOTE: these absl flags are still defined, but main() parses the command line
# with argparse and never reads them.
_INPUT_DATA = flags.DEFINE_string(
    "input_data", None, "path to input data", required=False
)

_INPUT_RESPONSE_DATA = flags.DEFINE_string(
    "input_response_data", None, "path to input response data", required=False
)

_OUTPUT_DIR = flags.DEFINE_string(
    "output_dir",
    None,
    "Output directory for inference and eval results.",
    required=False,
)


@dataclasses.dataclass
class InputExample:
  """One evaluation prompt together with the instructions attached to it."""

  key: int
  instruction_id_list: list[str]
  prompt: str
  # kwargs[i] holds the build_description() arguments for instruction_id_list[i].
  kwargs: list[Dict[str, Optional[Union[str, int]]]]


@dataclasses.dataclass
class OutputExample:
  """Per-instruction verdicts for one prompt/response pair."""

  instruction_id_list: list[str]
  prompt: str
  response: str
  follow_all_instructions: bool
  follow_instruction_list: list[bool]


def _iter_jsonl(input_jsonl_filename):
  """Yields the JSON object on each non-blank line of a jsonl file."""
  with open(input_jsonl_filename, "r", encoding="utf-8") as f:
    for line in f:
      # Blank lines would make json.loads fail; skip them.
      if line.strip():
        yield json.loads(line)


def _to_input_example(example):
  """Builds an InputExample from one parsed jsonl record."""
  return InputExample(key=example["key"],
                      instruction_id_list=example["instruction_id_list"],
                      prompt=example["prompt"],
                      kwargs=example["kwargs"])


def read_prompt_list(input_jsonl_filename):
  """Read inputs from jsonl."""
  return [_to_input_example(e) for e in _iter_jsonl(input_jsonl_filename)]


def read_key_to_prompt_dict(input_jsonl_filename):
  """Creates dictionary matching key to prompt."""
  return {
      e["key"]: _to_input_example(e) for e in _iter_jsonl(input_jsonl_filename)
  }


def write_outputs(output_jsonl_filename, outputs):
  """Writes outputs to jsonl.

  Raises:
    ValueError: If `outputs` is empty.
  """
  if not outputs:
    raise ValueError("write_outputs() needs at least one output record.")
  # ensure_ascii=False emits raw non-ASCII text, so fix the file encoding.
  with open(output_jsonl_filename, "w", encoding="utf-8") as f:
    for o in outputs:
      f.write(json.dumps(o, ensure_ascii=False))
      f.write("\n")


def _build_instruction(instruction_id, kwargs, prompt):
  """Instantiates the checker registered under `instruction_id` and configures it."""
  instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
  instruction = instruction_cls(instruction_id)

  instruction.build_description(**kwargs)
  args = instruction.get_instruction_args()
  if args and "prompt" in args:
    instruction.build_description(prompt=prompt)
  return instruction


def test_instruction_following_strict(
    inp,
    result
):
  """Tests response to see if instructions are followed.

  Args:
    inp: Dict with "instruction_id_list" and "kwargs" (parallel lists) and,
      optionally, "prompt".
    result: Dict holding the model "response".  It is updated in place with
      "follow_instruction_list" and "follow_all_instructions".

  Returns:
    The same `result` dict.
  """
  response = result["response"]
  # .get(): the prompt is only needed by checkers whose arguments include
  # "prompt", so its absence must not fail every other checker.
  prompt = inp.get("prompt")

  is_following_list = []
  for index, instruction_id in enumerate(inp["instruction_id_list"]):
    instruction = _build_instruction(
        instruction_id, inp["kwargs"][index], prompt)
    # A blank response never follows an instruction.
    is_following_list.append(
        bool(response.strip() and instruction.check_following(response)))

  result["follow_instruction_list"] = is_following_list
  result["follow_all_instructions"] = all(is_following_list)
  return result


def test_instruction_following_loose(
    inp,
    response,
):
  """Tests response for an upper bound for following instructions.

  An instruction counts as followed if any variant of the response passes:
  the response as is, with its first and/or last line removed, and each of
  those with every "*" stripped.

  Args:
    inp: Object exposing `instruction_id_list`, `kwargs` and `prompt`
      attributes (e.g. an InputExample).
    response: The model response text.

  Returns:
    An OutputExample with the per-instruction verdicts.
  """
  lines = response.split("\n")
  without_first = "\n".join(lines[1:]).strip()
  without_last = "\n".join(lines[:-1]).strip()
  without_both = "\n".join(lines[1:-1]).strip()
  all_responses = [
      response,
      response.replace("*", ""),
      without_first,
      without_last,
      without_both,
      without_first.replace("*", ""),
      without_last.replace("*", ""),
      without_both.replace("*", ""),
  ]

  is_following_list = []
  for index, instruction_id in enumerate(inp.instruction_id_list):
    instruction = _build_instruction(
        instruction_id, inp.kwargs[index], inp.prompt)
    is_following_list.append(
        any(
            candidate.strip() and instruction.check_following(candidate)
            for candidate in all_responses
        )
    )

  return OutputExample(
      instruction_id_list=inp.instruction_id_list,
      prompt=inp.prompt,
      response=response,
      follow_all_instructions=all(is_following_list),
      follow_instruction_list=is_following_list,
  )


def read_result_list(input_jsonl_filename):
  """Returns the list of records stored in a response jsonl file."""
  return list(_iter_jsonl(input_jsonl_filename))


def _ratio(correct, total):
  """Returns correct / total, or NaN when nothing was counted."""
  return correct / total if total else float("nan")


def print_report(outputs):
  """Prints a report on accuracy scores.

  Args:
    outputs: Iterable of dicts with the parallel lists "instruction_id_list"
      and "follow_instruction_list".
  """

  prompt_total = 0
  prompt_correct = 0
  instruction_total = 0
  instruction_correct = 0

  # tier0: instruction category (the part of the id before ":").
  tier0_total = collections.defaultdict(int)
  tier0_correct = collections.defaultdict(int)

  # tier1: full instruction id.
  tier1_total = collections.defaultdict(int)
  tier1_correct = collections.defaultdict(int)

  # Groups used in the "Speech-IFEval accuracy" section; ids not listed here
  # are reported under "other".
  group_map = {
      "detectable_format:number_bullet_lists": "bullet_lists",
      "length_constraints:number_words": "length_constraints",
      "length_constraints:number_sentences": "length_constraints",
      "length_constraints:number_paragraphs": "length_constraints",
      "keywords:forbidden_words": "keywords",
      "keywords:existence": "keywords",
      "change_case:english_capital": "change_case",
      "change_case:english_lowercase": "change_case",
      "detectable_format:json_format": "json_format",
      "startend:quotation": "wrapping",
      "detectable_format:title": "wrapping",
      "combination:repeat_prompt": "startend",
      "startend:end_checker": "startend",
  }
  group_total = collections.defaultdict(int)
  group_correct = collections.defaultdict(int)

  for example in outputs:
    follow_instruction_list = example["follow_instruction_list"]
    instruction_id_list = example["instruction_id_list"]

    prompt_total += 1
    if all(follow_instruction_list):
      prompt_correct += 1

    instruction_total += len(instruction_id_list)
    instruction_correct += sum(follow_instruction_list)

    for instruction_id, followed_or_not in zip(
        instruction_id_list, follow_instruction_list
    ):
      category = instruction_id.split(":")[0]
      group = group_map.get(instruction_id, "other")
      tier0_total[category] += 1
      tier1_total[instruction_id] += 1
      group_total[group] += 1
      if followed_or_not:
        tier0_correct[category] += 1
        tier1_correct[instruction_id] += 1
        group_correct[group] += 1

  print(f"prompt-level: {_ratio(prompt_correct, prompt_total)}")
  print(f"instruction-level: {_ratio(instruction_correct, instruction_total)}")
  print()
  for instruction_id in sorted(tier0_total.keys()):
    accuracy = tier0_correct[instruction_id] / tier0_total[instruction_id]
    print(f"{instruction_id} {accuracy}")
  print()
  for instruction_id in sorted(tier1_total.keys()):
    accuracy = tier1_correct[instruction_id] / tier1_total[instruction_id]
    print(f"{instruction_id} {accuracy}")

  print()
  print("===== Speech-IFEval accuracy =====")
  for group in sorted(group_total.keys()):
    accuracy = group_correct[group] / group_total[group]
    print(f"{group} {accuracy}")
  print(f"\nALL: {_ratio(instruction_correct, instruction_total)}")
  print("===================================")


def parse_args():
  """Parses command line arguments."""
  parser = argparse.ArgumentParser(description="Instruction Following Evaluation")
  parser.add_argument(
      "--input_response_data",
      "-i",
      type=str,
      required=True,
      help="Path to input response data in JSONL format.",
  )
  return parser.parse_args()


def main():
  """Scores a response file and writes reports/rule_eval@<input stem>.jsonl."""
  args = parse_args()
  input_path = Path(args.input_response_data)

  results = read_result_list(input_path)
  print(len(results))
  if not results:
    raise ValueError(f"No responses found in {input_path}")

  report_dir = input_path.parent / "reports"

  # get instruction following results
  for func in [
      test_instruction_following_strict,
  ]:
    # Use the stem so the report is not named "<name>.jsonl.jsonl".
    output_file_name = f"rule_eval@{input_path.stem}.jsonl"
    logging.info("Generating %s...", output_file_name)
    outputs = []

    for result in results:
      condition = {
          "key": result.get("id"),
          "instruction_id_list": result["instruction_id_list"],
          "kwargs": result["kwargs"],
          # Only consulted by checkers that take the prompt as an argument.
          "prompt": result.get("prompt"),
      }

      outputs.append(func(condition, result))

    follow_all_instructions = [o["follow_all_instructions"] for o in outputs]
    accuracy = sum(follow_all_instructions) / len(outputs)
    logging.info("Accuracy: %f", accuracy)

    report_dir.mkdir(parents=True, exist_ok=True)

    output_file_name = str(report_dir / output_file_name)
    write_outputs(output_file_name, outputs)
    logging.info("Generated: %s", output_file_name)

    # Prints instruction following accuracy report.
    print("=" * 64)
    print(f"{output_file_name} Accuracy Scores:")
    print_report(outputs)
    print(output_file_name)


if __name__ == "__main__":
  main()
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utility library of instructions.""" 17 | 18 | import functools 19 | import random 20 | import re 21 | from typing import List 22 | 23 | import immutabledict 24 | import nltk 25 | 26 | WORD_LIST = ["western", "sentence", "signal", "dump", "spot", "opposite", "bottom", "potato", "administration", "working", "welcome", "morning", "good", "agency", "primary", "wish", "responsibility", "press", "problem", "president", "steal", "brush", "read", "type", "beat", "trainer", "growth", "lock", "bone", "case", "equal", "comfortable", "region", "replacement", "performance", "mate", "walk", "medicine", "film", "thing", "rock", "tap", "total", "competition", "ease", "south", "establishment", "gather", "parking", "world", "plenty", "breath", "claim", "alcohol", "trade", "dear", "highlight", "street", "matter", "decision", "mess", "agreement", "studio", "coach", "assist", "brain", "wing", "style", "private", "top", "brown", "leg", "buy", "procedure", "method", "speed", "high", "company", "valuable", "pie", "analyst", "session", "pattern", "district", "pleasure", "dinner", "swimming", "joke", "order", "plate", "department", "motor", "cell", "spend", "cabinet", "difference", "power", "examination", "engine", "horse", "dimension", "pay", "toe", "curve", "literature", "bother", "fire", "possibility", "debate", "activity", "passage", "hello", "cycle", "background", "quiet", "author", "effect", "actor", "page", "bicycle", "error", "throat", "attack", "character", "phone", "tea", "increase", "outcome", "file", "specific", "inspector", 
"internal", "potential", "staff", "building", "employer", "shoe", "hand", "direction", "garden", "purchase", "interview", "study", "recognition", "member", "spiritual", "oven", "sandwich", "weird", "passenger", "particular", "response", "reaction", "size", "variation", "a", "cancel", "candy", "exit", "guest", "condition", "fly", "price", "weakness", "convert", "hotel", "great", "mouth", "mind", "song", "sugar", "suspect", "telephone", "ear", "roof", "paint", "refrigerator", "organization", "jury", "reward", "engineering", "day", "possession", "crew", "bar", "road", "description", "celebration", "score", "mark", "letter", "shower", "suggestion", "sir", "luck", "national", "progress", "hall", "stroke", "theory", "offer", "story", "tax", "definition", "history", "ride", "medium", "opening", "glass", "elevator", "stomach", "question", "ability", "leading", "village", "computer", "city", "grand", "confidence", "candle", "priest", "recommendation", "point", "necessary", "body", "desk", "secret", "horror", "noise", "culture", "warning", "water", "round", "diet", "flower", "bus", "tough", "permission", "week", "prompt", "connection", "abuse", "height", "save", "corner", "border", "stress", "drive", "stop", "rip", "meal", "listen", "confusion", "girlfriend", "living", "relation", "significance", "plan", "creative", "atmosphere", "blame", "invite", "housing", "paper", "drink", "roll", "silver", "drunk", "age", "damage", "smoke", "environment", "pack", "savings", "influence", "tourist", "rain", "post", "sign", "grandmother", "run", "profit", "push", "clerk", "final", "wine", "swim", "pause", "stuff", "singer", "funeral", "average", "source", "scene", "tradition", "personal", "snow", "nobody", "distance", "sort", "sensitive", "animal", "major", "negotiation", "click", "mood", "period", "arrival", "expression", "holiday", "repeat", "dust", "closet", "gold", "bad", "sail", "combination", "clothes", "emphasis", "duty", "black", "step", "school", "jump", "document", 
"professional", "lip", "chemical", "front", "wake", "while", "inside", "watch", "row", "subject", "penalty", "balance", "possible", "adult", "aside", "sample", "appeal", "wedding", "depth", "king", "award", "wife", "blow", "site", "camp", "music", "safe", "gift", "fault", "guess", "act", "shame", "drama", "capital", "exam", "stupid", "record", "sound", "swing", "novel", "minimum", "ratio", "machine", "shape", "lead", "operation", "salary", "cloud", "affair", "hit", "chapter", "stage", "quantity", "access", "army", "chain", "traffic", "kick", "analysis", "airport", "time", "vacation", "philosophy", "ball", "chest", "thanks", "place", "mountain", "advertising", "red", "past", "rent", "return", "tour", "house", "construction", "net", "native", "war", "figure", "fee", "spray", "user", "dirt", "shot", "task", "stick", "friend", "software", "promotion", "interaction", "surround", "block", "purpose", "practice", "conflict", "routine", "requirement", "bonus", "hole", "state", "junior", "sweet", "catch", "tear", "fold", "wall", "editor", "life", "position", "pound", "respect", "bathroom", "coat", "script", "job", "teach", "birth", "view", "resolve", "theme", "employee", "doubt", "market", "education", "serve", "recover", "tone", "harm", "miss", "union", "understanding", "cow", "river", "association", "concept", "training", "recipe", "relationship", "reserve", "depression", "proof", "hair", "revenue", "independent", "lift", "assignment", "temporary", "amount", "loss", "edge", "track", "check", "rope", "estimate", "pollution", "stable", "message", "delivery", "perspective", "mirror", "assistant", "representative", "witness", "nature", "judge", "fruit", "tip", "devil", "town", "emergency", "upper", "drop", "stay", "human", "neck", "speaker", "network", "sing", "resist", "league", "trip", "signature", "lawyer", "importance", "gas", "choice", "engineer", "success", "part", "external", "worker", "simple", "quarter", "student", "heart", "pass", "spite", "shift", "rough", "lady", 
"grass", "community", "garage", "youth", "standard", "skirt", "promise", "blind", "television", "disease", "commission", "positive", "energy", "calm", "presence", "tune", "basis", "preference", "head", "common", "cut", "somewhere", "presentation", "current", "thought", "revolution", "effort", "master", "implement", "republic", "floor", "principle", "stranger", "shoulder", "grade", "button", "tennis", "police", "collection", "account", "register", "glove", "divide", "professor", "chair", "priority", "combine", "peace", "extension", "maybe", "evening", "frame", "sister", "wave", "code", "application", "mouse", "match", "counter", "bottle", "half", "cheek", "resolution", "back", "knowledge", "make", "discussion", "screw", "length", "accident", "battle", "dress", "knee", "log", "package", "it", "turn", "hearing", "newspaper", "layer", "wealth", "profile", "imagination", "answer", "weekend", "teacher", "appearance", "meet", "bike", "rise", "belt", "crash", "bowl", "equivalent", "support", "image", "poem", "risk", "excitement", "remote", "secretary", "public", "produce", "plane", "display", "money", "sand", "situation", "punch", "customer", "title", "shake", "mortgage", "option", "number", "pop", "window", "extent", "nothing", "experience", "opinion", "departure", "dance", "indication", "boy", "material", "band", "leader", "sun", "beautiful", "muscle", "farmer", "variety", "fat", "handle", "director", "opportunity", "calendar", "outside", "pace", "bath", "fish", "consequence", "put", "owner", "go", "doctor", "information", "share", "hurt", "protection", "career", "finance", "force", "golf", "garbage", "aspect", "kid", "food", "boot", "milk", "respond", "objective", "reality", "raw", "ring", "mall", "one", "impact", "area", "news", "international", "series", "impress", "mother", "shelter", "strike", "loan", "month", "seat", "anything", "entertainment", "familiar", "clue", "year", "glad", "supermarket", "natural", "god", "cost", "conversation", "tie", "ruin", "comfort", 
"earth", "storm", "percentage", "assistance", "budget", "strength", "beginning", "sleep", "other", "young", "unit", "fill", "store", "desire", "hide", "value", "cup", "maintenance", "nurse", "function", "tower", "role", "class", "camera", "database", "panic", "nation", "basket", "ice", "art", "spirit", "chart", "exchange", "feedback", "statement", "reputation", "search", "hunt", "exercise", "nasty", "notice", "male", "yard", "annual", "collar", "date", "platform", "plant", "fortune", "passion", "friendship", "spread", "cancer", "ticket", "attitude", "island", "active", "object", "service", "buyer", "bite", "card", "face", "steak", "proposal", "patient", "heat", "rule", "resident", "broad", "politics", "west", "knife", "expert", "girl", "design", "salt", "baseball", "grab", "inspection", "cousin", "couple", "magazine", "cook", "dependent", "security", "chicken", "version", "currency", "ladder", "scheme", "kitchen", "employment", "local", "attention", "manager", "fact", "cover", "sad", "guard", "relative", "county", "rate", "lunch", "program", "initiative", "gear", "bridge", "breast", "talk", "dish", "guarantee", "beer", "vehicle", "reception", "woman", "substance", "copy", "lecture", "advantage", "park", "cold", "death", "mix", "hold", "scale", "tomorrow", "blood", "request", "green", "cookie", "church", "strip", "forever", "beyond", "debt", "tackle", "wash", "following", "feel", "maximum", "sector", "sea", "property", "economics", "menu", "bench", "try", "language", "start", "call", "solid", "address", "income", "foot", "senior", "honey", "few", "mixture", "cash", "grocery", "link", "map", "form", "factor", "pot", "model", "writer", "farm", "winter", "skill", "anywhere", "birthday", "policy", "release", "husband", "lab", "hurry", "mail", "equipment", "sink", "pair", "driver", "consideration", "leather", "skin", "blue", "boat", "sale", "brick", "two", "feed", "square", "dot", "rush", "dream", "location", "afternoon", "manufacturer", "control", "occasion", "trouble", 
"introduction", "advice", "bet", "eat", "kill", "category", "manner", "office", "estate", "pride", "awareness", "slip", "crack", "client", "nail", "shoot", "membership", "soft", "anybody", "web", "official", "individual", "pizza", "interest", "bag", "spell", "profession", "queen", "deal", "resource", "ship", "guy", "chocolate", "joint", "formal", "upstairs", "car", "resort", "abroad", "dealer", "associate", "finger", "surgery", "comment", "team", "detail", "crazy", "path", "tale", "initial", "arm", "radio", "demand", "single", "draw", "yellow", "contest", "piece", "quote", "pull", "commercial", "shirt", "contribution", "cream", "channel", "suit", "discipline", "instruction", "concert", "speech", "low", "effective", "hang", "scratch", "industry", "breakfast", "lay", "join", "metal", "bedroom", "minute", "product", "rest", "temperature", "many", "give", "argument", "print", "purple", "laugh", "health", "credit", "investment", "sell", "setting", "lesson", "egg", "middle", "marriage", "level", "evidence", "phrase", "love", "self", "benefit", "guidance", "affect", "you", "dad", "anxiety", "special", "boyfriend", "test", "blank", "payment", "soup", "obligation", "reply", "smile", "deep", "complaint", "addition", "review", "box", "towel", "minor", "fun", "soil", "issue", "cigarette", "internet", "gain", "tell", "entry", "spare", "incident", "family", "refuse", "branch", "can", "pen", "grandfather", "constant", "tank", "uncle", "climate", "ground", "volume", "communication", "kind", "poet", "child", "screen", "mine", "quit", "gene", "lack", "charity", "memory", "tooth", "fear", "mention", "marketing", "reveal", "reason", "court", "season", "freedom", "land", "sport", "audience", "classroom", "law", "hook", "win", "carry", "eye", "smell", "distribution", "research", "country", "dare", "hope", "whereas", "stretch", "library", "if", "delay", "college", "plastic", "book", "present", "use", "worry", "champion", "goal", "economy", "march", "election", "reflection", "midnight", 
"slide", "inflation", "action", "challenge", "guitar", "coast", "apple", "campaign", "field", "jacket", "sense", "way", "visual", "remove", "weather", "trash", "cable", "regret", "buddy", "beach", "historian", "courage", "sympathy", "truck", "tension", "permit", "nose", "bed", "son", "person", "base", "meat", "usual", "air", "meeting", "worth", "game", "independence", "physical", "brief", "play", "raise", "board", "she", "key", "writing", "pick", "command", "party", "yesterday", "spring", "candidate", "physics", "university", "concern", "development", "change", "string", "target", "instance", "room", "bitter", "bird", "football", "normal", "split", "impression", "wood", "long", "meaning", "stock", "cap", "leadership", "media", "ambition", "fishing", "essay", "salad", "repair", "today", "designer", "night", "bank", "drawing", "inevitable", "phase", "vast", "chip", "anger", "switch", "cry", "twist", "personality", "attempt", "storage", "being", "preparation", "bat", "selection", "white", "technology", "contract", "side", "section", "station", "till", "structure", "tongue", "taste", "truth", "difficulty", "group", "limit", "main", "move", "feeling", "light", "example", "mission", "might", "wait", "wheel", "shop", "host", "classic", "alternative", "cause", "agent", "consist", "table", "airline", "text", "pool", "craft", "range", "fuel", "tool", "partner", "load", "entrance", "deposit", "hate", "article", "video", "summer", "feature", "extreme", "mobile", "hospital", "flight", "fall", "pension", "piano", "fail", "result", "rub", "gap", "system", "report", "suck", "ordinary", "wind", "nerve", "ask", "shine", "note", "line", "mom", "perception", "brother", "reference", "bend", "charge", "treat", "trick", "term", "homework", "bake", "bid", "status", "project", "strategy", "orange", "let", "enthusiasm", "parent", "concentrate", "device", "travel", "poetry", "business", "society", "kiss", "end", "vegetable", "employ", "schedule", "hour", "brave", "focus", "process", "movie", 
"illegal", "general", "coffee", "ad", "highway", "chemistry", "psychology", "hire", "bell", "conference", "relief", "show", "neat", "funny", "weight", "quality", "club", "daughter", "zone", "touch", "tonight", "shock", "burn", "excuse", "name", "survey", "landscape", "advance", "satisfaction", "bread", "disaster", "item", "hat", "prior", "shopping", "visit", "east", "photo", "home", "idea", "father", "comparison", "cat", "pipe", "winner", "count", "lake", "fight", "prize", "foundation", "dog", "keep", "ideal", "fan", "struggle", "peak", "safety", "solution", "hell", "conclusion", "population", "strain", "alarm", "measurement", "second", "train", "race", "due", "insurance", "boss", "tree", "monitor", "sick", "course", "drag", "appointment", "slice", "still", "care", "patience", "rich", "escape", "emotion", "royal", "female", "childhood", "government", "picture", "will", "sock", "big", "gate", "oil", "cross", "pin", "improvement", "championship", "silly", "help", "sky", "pitch", "man", "diamond", "most", "transition", "work", "science", "committee", "moment", "fix", "teaching", "dig", "specialist", "complex", "guide", "people", "dead", "voice", "original", "break", "topic", "data", "degree", "reading", "recording", "bunch", "reach", "judgment", "lie", "regular", "set", "painting", "mode", "list", "player", "bear", "north", "wonder", "carpet", "heavy", "officer", "negative", "clock", "unique", "baby", "pain", "assumption", "disk", "iron", "bill", "drawer", "look", "double", "mistake", "finish", "future", "brilliant", "contact", "math", "rice", "leave", "restaurant", "discount", "sex", "virus", "bit", "trust", "event", "wear", "juice", "failure", "bug", "context", "mud", "whole", "wrap", "intention", "draft", "pressure", "cake", "dark", "explanation", "space", "angle", "word", "efficiency", "management", "habit", "star", "chance", "finding", "transportation", "stand", "criticism", "flow", "door", "injury", "insect", "surprise", "apartment"] # pylint: 
disable=line-too-long 27 | 28 | # ISO 639-1 codes to language names. 29 | LANGUAGE_CODES = immutabledict.immutabledict({ 30 | "en": "English", 31 | "es": "Spanish", 32 | "pt": "Portuguese", 33 | "ar": "Arabic", 34 | "hi": "Hindi", 35 | "fr": "French", 36 | "ru": "Russian", 37 | "de": "German", 38 | "ja": "Japanese", 39 | "it": "Italian", 40 | "bn": "Bengali", 41 | "uk": "Ukrainian", 42 | "th": "Thai", 43 | "ur": "Urdu", 44 | "ta": "Tamil", 45 | "te": "Telugu", 46 | "bg": "Bulgarian", 47 | "ko": "Korean", 48 | "pl": "Polish", 49 | "he": "Hebrew", 50 | "fa": "Persian", 51 | "vi": "Vietnamese", 52 | "ne": "Nepali", 53 | "sw": "Swahili", 54 | "kn": "Kannada", 55 | "mr": "Marathi", 56 | "gu": "Gujarati", 57 | "pa": "Punjabi", 58 | "ml": "Malayalam", 59 | "fi": "Finnish", 60 | }) 61 | 62 | _ALPHABETS = "([A-Za-z])" 63 | _PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" 64 | _SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" 65 | _STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" 66 | _ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" 67 | _WEBSITES = "[.](com|net|org|io|gov|edu|me)" 68 | _DIGITS = "([0-9])" 69 | _MULTIPLE_DOTS = r"\.{2,}" 70 | 71 | 72 | def split_into_sentences(text): 73 | """Split the text into sentences. 74 | 75 | Args: 76 | text: A string that consists of more than or equal to one sentences. 77 | 78 | Returns: 79 | A list of strings where each string is a sentence. 80 | """ 81 | text = " " + text + " " 82 | text = text.replace("\n", " ") 83 | text = re.sub(_PREFIXES, "\\1", text) 84 | text = re.sub(_WEBSITES, "\\1", text) 85 | text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) 86 | text = re.sub( 87 | _MULTIPLE_DOTS, 88 | lambda match: "" * len(match.group(0)) + "", 89 | text, 90 | ) 91 | if "Ph.D" in text: 92 | text = text.replace("Ph.D.", "PhD") 93 | text = re.sub(r"\s" + _ALPHABETS + "[.] 
", " \\1 ", text) 94 | text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) 95 | text = re.sub( 96 | _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", 97 | "\\1\\2\\3", 98 | text, 99 | ) 100 | text = re.sub( 101 | _ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text 102 | ) 103 | text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) 104 | text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) 105 | text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) 106 | if "”" in text: 107 | text = text.replace(".”", "”.") 108 | if '"' in text: 109 | text = text.replace('."', '".') 110 | if "!" in text: 111 | text = text.replace('!"', '"!') 112 | if "?" in text: 113 | text = text.replace('?"', '"?') 114 | text = text.replace(".", ".") 115 | text = text.replace("?", "?") 116 | text = text.replace("!", "!") 117 | text = text.replace("", ".") 118 | sentences = text.split("") 119 | sentences = [s.strip() for s in sentences] 120 | if sentences and not sentences[-1]: 121 | sentences = sentences[:-1] 122 | return sentences 123 | 124 | 125 | def count_words(text): 126 | """Counts the number of words.""" 127 | tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") 128 | tokens = tokenizer.tokenize(text) 129 | num_words = len(tokens) 130 | return num_words 131 | 132 | 133 | @functools.lru_cache(maxsize=None) 134 | def _get_sentence_tokenizer(): 135 | return nltk.data.load("nltk:tokenizers/punkt/english.pickle") 136 | 137 | 138 | def count_sentences(text): 139 | """Count the number of sentences.""" 140 | tokenizer = _get_sentence_tokenizer() 141 | tokenized_sentences = tokenizer.tokenize(text) 142 | return len(tokenized_sentences) 143 | 144 | 145 | def generate_keywords(num_keywords): 146 | """Randomly generates a few keywords.""" 147 | return random.sample(WORD_LIST, k=num_keywords) 148 | -------------------------------------------------------------------------------- /instruction_following_eval/instructions_test.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for instructions.py.""" 17 | 18 | from absl.testing import absltest 19 | from absl.testing import parameterized 20 | from instruction_following_eval import instructions 21 | 22 | 23 | # pylint:disable=g-complex-comprehension 24 | class InstructionsTest(parameterized.TestCase): 25 | 26 | @parameterized.named_parameters( 27 | [ 28 | { 29 | 'testcase_name': ( 30 | f'_response={response}_language={language}' 31 | ), 32 | 'response': response, 33 | 'language': language, 34 | } 35 | for response, language in [('The response is English', 'en')] 36 | ] 37 | ) 38 | def test_response_language(self, response, language): 39 | """Test on single language response.""" 40 | instruction_id = 'language:response_language' 41 | instruction = instructions.ResponseLanguageChecker(instruction_id) 42 | instruction.build_description(language=language) 43 | self.assertTrue(instruction.check_following(response)) 44 | 45 | @parameterized.named_parameters( 46 | [ 47 | { 48 | 'testcase_name': ( 49 | f'_response={response}_language={language}' 50 | ), 51 | 'response': response, 52 | 'language': language, 53 | } 54 | for response, language in [("Desayunamos en McDonald's hoy", 'es'), 55 | ('Today we visit the Louvre', 'en'),] 56 | ] 57 | ) 58 | 
def test_response_multilanguage(self, response, language): 59 | """Test on responses that contain multi-language tokens.""" 60 | instruction_id = 'language:response_language' 61 | instruction = instructions.ResponseLanguageChecker(instruction_id) 62 | instruction.build_description(language=language) 63 | self.assertTrue(instruction.check_following(response)) 64 | 65 | @parameterized.named_parameters( 66 | [ 67 | { 68 | 'testcase_name': ( 69 | f'_response={response}_relation={relation}' 70 | f'_num_sentences={num_sentences}_expected={expected}' 71 | ), 72 | 'response': response, 73 | 'relation': relation, 74 | 'num_sentences': num_sentences, 75 | 'expected': expected, 76 | } 77 | for response, relation, num_sentences, expected in [ 78 | ('xx,x. xx,x! xx/x. x{x}x?', instructions._COMPARISON_RELATION[0], 79 | 4, False), 80 | ('xxxx. xx,x! xxxx. x(x)x?', instructions._COMPARISON_RELATION[0], 81 | 5, True), 82 | ('xxxx. xx,x! xx|x. x&x x?', instructions._COMPARISON_RELATION[1], 83 | 4, True), 84 | ('xx-x. xx,x! xx}x. 
x,xx?', instructions._COMPARISON_RELATION[1], 85 | 5, False), 86 | ] 87 | ] 88 | ) 89 | def test_number_sentences(self, response, relation, num_sentences, expected): 90 | """Test the number of sentences.""" 91 | instruction_id = 'length_constraints:number_sentences' 92 | instruction = instructions.NumberOfSentences(instruction_id) 93 | instruction.build_description(relation=relation, 94 | num_sentences=num_sentences) 95 | actual = instruction.check_following(response) 96 | self.assertEqual(actual, expected) 97 | 98 | @parameterized.named_parameters( 99 | [ 100 | { 101 | 'testcase_name': ( 102 | f'_templated={template}_num_placeholders={num_placeholders}' 103 | f'_expected={expected}' 104 | ), 105 | 'template': template, 106 | 'num_placeholders': num_placeholders, 107 | 'expected': expected, 108 | } 109 | for template, num_placeholders, expected in [ 110 | (('Sure, here is a short template with 5 placeholders:\n' + 111 | '[Name]\n[Email]\n[Phone]\n[Address]\n[Website]\n' + 112 | 'This template can be used for a variety of purposes, such ' + 113 | 'ascreating a contact list, sending out surveys, or creating ' + 114 | 'a sign-up form.'), 5, True), 115 | (('My [adjective] [noun] is [adjective] [noun]. I [verb] and ' + 116 | '[verb].'), 7, False), 117 | ] 118 | ] 119 | ) 120 | def test_number_placeholders(self, template, num_placeholders, expected): 121 | """Test the number of placeholders.""" 122 | instruction_id = 'detectable_content:number_placeholders' 123 | instruction = instructions.PlaceholderChecker(instruction_id) 124 | instruction.build_description(num_placeholders=num_placeholders) 125 | actual = instruction.check_following(template) 126 | self.assertEqual(actual, expected) 127 | 128 | BULLET_TEST_MESSAGE_1 = """ 129 | A Markdown bullet point is a way of formatting text to create a list. To 130 | create a bullet point, start each line with an asterisk (*). For example: 131 | * This is a bullet point. 132 | *(no space required)Another bullet point. 
133 | * (no newline ending required)Another bullet point. 134 | markdown bullet points are often used to create to-do lists or to list items 135 | in a step-by-step guide.""" 136 | BULLET_TEST_MESSAGE_2 = """ 137 | Check that inline asterisk (*), *, will not be counted. Only * that starts a 138 | bullet list will be counted: 139 | * This is a bullet point. 140 | * Another bullet point. 141 | . dot is not counted""" 142 | BULLET_TEST_MESSAGE_3 = """ 143 | Here are three bullets starting with asterisk: 144 | * I am a large language model, also known as a conversational AI. 145 | * I am trained on a massive amount of text data, and I am able to communicate. 146 | * I am still under development, but I am learning new things every day.""" 147 | 148 | BULLET_TEST_MESSAGE_4 = """ 149 | Here are three markdown bullets: 150 | - I am a large language model, also known as a conversational AI. 151 | - I am trained on a massive amount of text data, and I am able to communicate. 152 | -I am still under development, but I am learning new things every day.""" 153 | 154 | BULLET_TEST_MESSAGE_5 = """ 155 | Paragraph 1 156 | *** 157 | Paragraph 2 158 | *** 159 | Paragraph 3 160 | * only one bullet point 161 | """ 162 | 163 | @parameterized.named_parameters( 164 | [ 165 | { 166 | 'testcase_name': ( 167 | f'_templated={template}_num_bullets={num_bullets}' 168 | f'_expected={expected}' 169 | ), 170 | 'template': template, 171 | 'num_bullets': num_bullets, 172 | 'expected': expected, 173 | } 174 | for template, num_bullets, expected in [ 175 | (BULLET_TEST_MESSAGE_1, 3, True), 176 | (BULLET_TEST_MESSAGE_2, 2, True), 177 | (BULLET_TEST_MESSAGE_3, 3, True), 178 | (BULLET_TEST_MESSAGE_4, 3, True), 179 | (BULLET_TEST_MESSAGE_5, 1, True)] 180 | ] 181 | ) 182 | def test_number_bullet_lists(self, template, num_bullets, expected): 183 | """Test the number of bullets.""" 184 | instruction_id = 'detectable_format:exact_number_bullet_points' 185 | instruction = 
instructions.BulletListChecker(instruction_id) 186 | instruction.build_description(num_bullets=num_bullets) 187 | actual = instruction.check_following(template) 188 | self.assertEqual(actual, expected) 189 | 190 | CONSTRAINED_RESPONSE_TEST_RESPONSE_1 = """\n My answer is no.\n""" 191 | CONSTRAINED_RESPONSE_TEST_RESPONSE_2 = """My answer is no. """ 192 | CONSTRAINED_RESPONSE_TEST_RESPONSE_3 = """ 193 | My answer is no. I am still under development and I am always learning and 194 | improving. I am not the best chatbot in the world, but I am striving to be 195 | the best that I can be.""" 196 | 197 | def test_constrained_response(self): 198 | """Test the constrained response checker.""" 199 | instruction_id = 'detectable_format:constrained_response' 200 | instruction = instructions.ConstrainedResponseChecker(instruction_id) 201 | instruction.build_description() 202 | 203 | with self.subTest('test with CONSTRAINED_RESPONSE_TEST_RESPONSE_1'): 204 | self.assertTrue(instruction.check_following( 205 | self.CONSTRAINED_RESPONSE_TEST_RESPONSE_1)) 206 | 207 | with self.subTest('test with CONSTRAINED_RESPONSE_TEST_RESPONSE_2'): 208 | self.assertTrue(instruction.check_following( 209 | self.CONSTRAINED_RESPONSE_TEST_RESPONSE_2)) 210 | 211 | with self.subTest('test with CONSTRAINED_RESPONSE_TEST_RESPONSE_3'): 212 | self.assertTrue(instruction.check_following( 213 | self.CONSTRAINED_RESPONSE_TEST_RESPONSE_3)) 214 | 215 | HIGHLIGHTED_TEST_MESSAGE_1 = """ 216 | To highlight text with Markdown, you can use the * character before and after 217 | the text you want to highlight. For example, if you want to highlight the 218 | word `hello`, you would type:*hello*, You can also use the ** character to 219 | create bold text. 
For example, if you want to bold the word `hello`, you 220 | would type: **hello** """ 221 | HIGHLIGHTED_TEST_MESSAGE_2 = """ 222 | Sure, here are the numerical methods for solving partial differential 223 | equations highlighted with Markdown: 224 | *Finite difference methods 225 | *Finite element methods* 226 | *Boundary element methods 227 | *Monte Carlo methods 228 | I hope this helps!""" 229 | HIGHLIGHTED_TEST_MESSAGE_3 = """ 230 | There is allowed to be *two different* highlighted *sections in the same* 231 | line. **This is also true** for **double markdown highlights.** 232 | """ 233 | 234 | @parameterized.named_parameters( 235 | [ 236 | { 237 | 'testcase_name': ( 238 | f'_response={response}' 239 | f'_min_num_highlights={min_num_highlights}' 240 | f'_expected={expected}' 241 | ), 242 | 'response': response, 243 | 'min_num_highlights': min_num_highlights, 244 | 'expected': expected, 245 | } 246 | for response, min_num_highlights, expected in [ 247 | (HIGHLIGHTED_TEST_MESSAGE_1, 2, True), 248 | (HIGHLIGHTED_TEST_MESSAGE_2, 2, False), 249 | (HIGHLIGHTED_TEST_MESSAGE_3, 4, True)] 250 | ] 251 | ) 252 | def test_number_highlights(self, response, min_num_highlights, expected): 253 | """Test the minimum number of highlighted sections.""" 254 | instruction_id = 'detectable_format:minimum_number_highlighted_sections' 255 | instruction = instructions.HighlightSectionChecker(instruction_id) 256 | instruction.build_description(num_highlights=min_num_highlights) 257 | actual = instruction.check_following(response) 258 | self.assertEqual(actual, expected) 259 | 260 | SECTION_TEST_MESSAGE_1 = """ 261 | Your response must have multiple sections. 
Mark the beginning of each section 262 | with "Section X", such as: 263 | Section 1 264 | [content of section 1] 265 | Section 2 266 | [content of section 2]""" 267 | 268 | SECTION_TEST_MESSAGE_2 = """SECTION 1 269 | [content of section 1] 270 | SECTION 2 271 | [content of section 2]""" 272 | 273 | def test_section_checker(self): 274 | """Test the number of sections.""" 275 | instruction_id = 'detectable_format:multiple_sections' 276 | instruction = instructions.SectionChecker(instruction_id) 277 | section_keyword = 'Section' 278 | min_num_sections = 3 279 | instruction.build_description(section_spliter=section_keyword, 280 | num_sections=min_num_sections) 281 | with self.subTest(f'test {section_keyword} and {min_num_sections}'): 282 | self.assertFalse( 283 | instruction.check_following(self.SECTION_TEST_MESSAGE_1)) 284 | 285 | section_keyword = 'SECTION' 286 | min_num_sections = 2 287 | instruction.build_description(section_spliter=section_keyword, 288 | num_sections=min_num_sections) 289 | with self.subTest(f'test {section_keyword} and {min_num_sections}'): 290 | self.assertTrue( 291 | instruction.check_following(self.SECTION_TEST_MESSAGE_2)) 292 | 293 | PARAGRAPH_TEST_MESSAGE_1 = """ 294 | paragraph 1 295 | *** 296 | paragraph 2 297 | *** 298 | paragraph 3""" 299 | 300 | PARAGRAPH_TEST_MESSAGE_2 = """ 301 | *** 302 | paragraph 1 303 | *** 304 | paragraph 2 305 | *** 306 | paragraph 3""" 307 | 308 | PARAGRAPH_TEST_MESSAGE_3 = """ 309 | paragraph 1 310 | *** 311 | paragraph 2 312 | *** 313 | paragraph 3 314 | ***""" 315 | 316 | PARAGRAPH_TEST_MESSAGE_4 = """ 317 | paragraph 1 318 | *** 319 | paragraph 2 320 | *** 321 | ***""" 322 | 323 | def test_paragraph_checker(self): 324 | """Test the number of sections.""" 325 | instruction_id = 'length_constraint:number_paragraphs' 326 | instruction = instructions.ParagraphChecker(instruction_id) 327 | num_paragraphs = 3 328 | instruction.build_description(num_paragraphs=num_paragraphs) 329 | with self.subTest(f'test 
{self.PARAGRAPH_TEST_MESSAGE_1} and ' 330 | f'{num_paragraphs} paragraphs'): 331 | self.assertTrue(instruction.check_following( 332 | self.PARAGRAPH_TEST_MESSAGE_1)) 333 | 334 | num_paragraphs = 3 335 | instruction.build_description(num_paragraphs=num_paragraphs) 336 | with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_2} and ' 337 | f'{num_paragraphs} paragraphs'): 338 | self.assertTrue(instruction.check_following( 339 | self.PARAGRAPH_TEST_MESSAGE_2)) 340 | 341 | num_paragraphs = 3 342 | instruction.build_description(num_paragraphs=num_paragraphs) 343 | with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_3} and ' 344 | f'{num_paragraphs} paragraphs'): 345 | self.assertTrue(instruction.check_following( 346 | self.PARAGRAPH_TEST_MESSAGE_3)) 347 | 348 | num_paragraphs = 2 349 | instruction.build_description(num_paragraphs=num_paragraphs) 350 | with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_4} and ' 351 | f'{num_paragraphs} paragraphs'): 352 | self.assertFalse(instruction.check_following( 353 | self.PARAGRAPH_TEST_MESSAGE_4)) 354 | 355 | POSTSCRIPT_TEST_MESSAGE_1 = """ 356 | I will do my best to follow your instructions and always start my responses 357 | with "My response is:". I will try to be as consistent as possible, but 358 | please be patient with me if I make a mistake. I am still under development, 359 | and I am always learning new things. 360 | 361 | P.S. I hope this is what you were looking for.""" 362 | 363 | POSTSCRIPT_TEST_MESSAGE_2 = """ 364 | Sure, here is my response with a postscript starting with P.P.S.: 365 | 366 | My response is: I hope this answers your question. 367 | 368 | P.P.S. I am always happy to answer any other questions you may have. 369 | 370 | Do you have any other questions for me?""" 371 | 372 | # Postscript does not have to start as a new line. 373 | # Relaxed the constraint in cl/525253841. 374 | POSTSCRIPT_TEST_MESSAGE_3 = """ 375 | The radius of a unit circle is 1. 
However, I can give you a funny and wrong 376 | answer: the radius of a unit circle is 0. This is because a unit circle is a 377 | circle with a radius of 1, and if the radius is 0, then the circle has no 378 | size and is just a point. (not starting a new line) P.S. I hope you enjoyed 379 | my joke!""" 380 | 381 | POSTSCRIPT_TEST_MESSAGE_4 = """ 382 | If the length of a square is one, the area of the square will also be one. 383 | p.p.s what if the entire response was lower case letters? 384 | """ 385 | 386 | POSTSCRIPT_TEST_MESSAGE_5 = """ 387 | The mysteries of space and time are mysterious. 388 | P. S. Sometimes there are even spaces between P. and S.. 389 | """ 390 | 391 | def test_postscript_checker(self): 392 | """Test the postscript checker.""" 393 | instruction_id = 'detectable_content:postscript' 394 | instruction = instructions.PostscriptChecker(instruction_id) 395 | postscript_start_keyword = instructions._POSTSCRIPT_MARKER[0] 396 | instruction.build_description(postscript_marker=postscript_start_keyword) 397 | with self.subTest(f'test {postscript_start_keyword}'): 398 | self.assertTrue( 399 | instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_1)) 400 | 401 | postscript_start_keyword = 'PS:' 402 | instruction.build_description(postscript_marker=postscript_start_keyword) 403 | with self.subTest(f'test {postscript_start_keyword}'): 404 | self.assertFalse( 405 | instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_1)) 406 | 407 | postscript_start_keyword = instructions._POSTSCRIPT_MARKER[1] 408 | instruction.build_description(postscript_marker=postscript_start_keyword) 409 | with self.subTest(f'test {postscript_start_keyword}'): 410 | self.assertTrue( 411 | instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_2)) 412 | 413 | postscript_start_keyword = 'P.S.' 
414 | instruction.build_description(postscript_marker=postscript_start_keyword) 415 | with self.subTest(f'test {postscript_start_keyword}'): 416 | self.assertTrue( 417 | instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_3)) 418 | 419 | postscript_start_keyword = 'P.P.S' 420 | instruction.build_description(postscript_marker=postscript_start_keyword) 421 | with self.subTest(f'test {postscript_start_keyword}'): 422 | self.assertTrue( 423 | instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_4)) 424 | 425 | postscript_start_keyword = 'P.S.' 426 | instruction.build_description(postscript_marker=postscript_start_keyword) 427 | with self.subTest(f'test {postscript_start_keyword}'): 428 | self.assertTrue( 429 | instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_5)) 430 | 431 | CONSTRAINED_START_TEST_MESSAGE_1 = """ 432 | My response is: ASIC is a specialized chip for specific tasks in electronic 433 | devices, offering advantages in efficiency and processing speed.""" 434 | 435 | CONSTRAINED_START_TEST_MESSAGE_2 = """ 436 | My response is: ASIC is a specialized chip for specific tasks in 437 | electronic 438 | devices, offering advantages in efficiency and processing speed.""" 439 | 440 | CONSTRAINED_START_TEST_MESSAGE_3 = """ 441 | An ASIC, or Application-Specific Integrated Circuit, is a type of specialized 442 | chip that, my response is, is designed to perform specific tasks in electronic 443 | devices.""" 444 | 445 | def test_constrained_start_checker(self): 446 | """Test the constrained start checker.""" 447 | instruction_id = 'multi-turn:constrained_start' 448 | instruction = instructions.ConstrainedStartChecker(instruction_id) 449 | start_keyword = 'My response is:' 450 | instruction.build_description(starter=start_keyword) 451 | with self.subTest(f'test {start_keyword}'): 452 | self.assertTrue( 453 | instruction.check_following(self.CONSTRAINED_START_TEST_MESSAGE_1)) 454 | 455 | with self.subTest(f'test {start_keyword} with spaces in the 
beginning'): 456 | self.assertTrue(instruction.check_following( 457 | self.CONSTRAINED_START_TEST_MESSAGE_2)) 458 | 459 | start_keyword = 'my response is' 460 | with self.subTest(f'test {start_keyword} embedded in the middle'): 461 | self.assertFalse( 462 | instruction.check_following(self.CONSTRAINED_START_TEST_MESSAGE_3)) 463 | 464 | REPHRASE_TEST_REPHRASED_MESSAGE_1 = """ 465 | I am *content*.""" 466 | REPHRASE_TEST_ORIGINAL_MESSAGE_1 = """ 467 | I am *happy*.""" 468 | 469 | REPHRASE_TEST_REPHRASED_MESSAGE_1_NOCHANGE = """ 470 | I am .""" 471 | 472 | REPHRASE_TEST_REPHRASED_MESSAGE_1_FORMAT = """ 473 | I am [content].""" 474 | 475 | REPHRASE_TEST_REPHRASED_MESSAGE_2 = """ 476 | It is raining heavily *at this moment*.""" 477 | REPHRASE_TEST_ORIGINAL_MESSAGE_2 = """ 478 | *At present,* there is heavy rainfall occurring.""" 479 | 480 | def test_rephrase_checker(self): 481 | """Test the rephrase checker.""" 482 | instruction_id = 'detectable_format:rephrasing' 483 | instruction = instructions.RephraseChecker(instruction_id) 484 | instruction.build_description( 485 | original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_1) 486 | with self.subTest(f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_1}'): 487 | self.assertTrue( 488 | instruction.check_following(self.REPHRASE_TEST_REPHRASED_MESSAGE_1)) 489 | 490 | instruction.build_description( 491 | original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_1) 492 | with self.subTest( 493 | f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_1_NOCHANGE}'): 494 | with self.assertRaises(ValueError): 495 | instruction.check_following( 496 | self.REPHRASE_TEST_REPHRASED_MESSAGE_1_NOCHANGE) 497 | 498 | instruction.build_description( 499 | original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_1) 500 | with self.subTest(f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_1_FORMAT}'): 501 | with self.assertRaises(ValueError): 502 | instruction.check_following( 503 | self.REPHRASE_TEST_REPHRASED_MESSAGE_1_FORMAT) 504 | 505 | instruction.build_description( 
506 | original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_2) 507 | with self.subTest(f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_2}'): 508 | self.assertFalse( 509 | instruction.check_following(self.REPHRASE_TEST_REPHRASED_MESSAGE_2)) 510 | 511 | TEST_INCLUDE_KEYWORD_MESSAGE_1 = """ 512 | Paris is a city of beauty and romance. The romantic river Seine winds its way 513 | through the city, past iconic landmarks like the Eiffel Tower and the Louvre 514 | Museum, where the Mona Lisa resides. Whether you're taking a boat cruise down 515 | the river or simply strolling along the banks, you're sure to be captivated 516 | by the city's charm.""" 517 | 518 | TEST_INCLUDE_KEYWORD_MESSAGE_2 = """ 519 | Paris is a city of beauty, romance, and history. It is home to some of the 520 | most iconic landmarks in the world, including the Eiffel Tower, the Louvre 521 | Museum, and the Notre Dame Cathedral. The city is also known for its romantic 522 | river cruises, its delicious food, and its stylish people. 
523 | """ 524 | 525 | KEYWORDS = ('romantic', 'river', 'Mona Lisa') 526 | 527 | def test_keyword_checker(self): 528 | """Test the inclusion of keywords.""" 529 | instruction_id = 'keywords:include_keywords' 530 | instruction = instructions.KeywordChecker(instruction_id) 531 | 532 | instruction.build_description(keywords=self.KEYWORDS) 533 | with self.subTest(f'test {self.TEST_INCLUDE_KEYWORD_MESSAGE_1}'): 534 | self.assertTrue( 535 | instruction.check_following(self.TEST_INCLUDE_KEYWORD_MESSAGE_1)) 536 | 537 | instruction.build_description(keywords=self.KEYWORDS) 538 | with self.subTest(f'test {self.TEST_INCLUDE_KEYWORD_MESSAGE_2}'): 539 | self.assertFalse( 540 | instruction.check_following(self.TEST_INCLUDE_KEYWORD_MESSAGE_2)) 541 | 542 | TEST_KEYWORD_FREQUNECY_MESSAGE_1 = """ 543 | keyword, Keyword, KEYWORD 544 | """ 545 | TEST_KEYWORD_FREQUENCY_KEYWORD_1 = ' keyword ' 546 | 547 | TEST_KEYWORD_FREQUNECY_MESSAGE_2 = """ 548 | *keyword 549 | *Keyword 550 | *KEYWORD 551 | """ 552 | TEST_KEYWORD_FREQUENCY_KEYWORD_2 = 'KEYWORD' 553 | 554 | def test_keyword_frequency_checker(self): 555 | """Test the frequency of keywords.""" 556 | 557 | instruction_id = 'keywords:keyword_frequency' 558 | instruction = instructions.KeywordFrequencyChecker(instruction_id) 559 | 560 | frequency = 4 561 | instruction.build_description(keyword=self.TEST_KEYWORD_FREQUENCY_KEYWORD_1, 562 | frequency=frequency, 563 | relation=instructions._COMPARISON_RELATION[0]) 564 | with self.subTest( 565 | f'test {self.TEST_KEYWORD_FREQUENCY_KEYWORD_1} {frequency}'): 566 | self.assertTrue( 567 | instruction.check_following(self.TEST_KEYWORD_FREQUNECY_MESSAGE_1)) 568 | 569 | frequency = 3 570 | instruction.build_description(keyword=self.TEST_KEYWORD_FREQUENCY_KEYWORD_1, 571 | frequency=frequency, 572 | relation=instructions._COMPARISON_RELATION[1]) 573 | with self.subTest( 574 | f'test {self.TEST_KEYWORD_FREQUENCY_KEYWORD_1} {frequency}'): 575 | self.assertTrue( 576 | 
instruction.check_following(self.TEST_KEYWORD_FREQUNECY_MESSAGE_1)) 577 | 578 | frequency = 4 579 | instruction.build_description(keyword=self.TEST_KEYWORD_FREQUENCY_KEYWORD_2, 580 | frequency=frequency, 581 | relation=instructions._COMPARISON_RELATION[1]) 582 | with self.subTest( 583 | f'test {self.TEST_KEYWORD_FREQUENCY_KEYWORD_2} {frequency}'): 584 | self.assertFalse( 585 | instruction.check_following(self.TEST_KEYWORD_FREQUNECY_MESSAGE_2)) 586 | 587 | TEST_NUM_WORDS_MESSAGE_1 = """ 588 | d3sCRi7 lArge lAnguagE M0del w1tH 20 w0RdS.""" 589 | 590 | TEST_NUM_WORDS_MESSAGE_2 = """ 591 | L4RGE L4NGU4GE M0DEL: AI syst3m th4t und3rstands, g3n3r4tes, or tr4nsforms 592 | l4ngu4g3 b4s3d on pr3vious l3arning & d4t4.""" 593 | 594 | def test_num_words_checker(self): 595 | """Test the checker on the number of words.""" 596 | instruction_id = 'length_constraint:number_words' 597 | instruction = instructions.NumberOfWords(instruction_id) 598 | 599 | word_counts = 8 600 | instruction.build_description(num_words=word_counts, 601 | relation=instructions._COMPARISON_RELATION[0]) 602 | with self.subTest( 603 | f'test {self.TEST_NUM_WORDS_MESSAGE_1} {word_counts}'): 604 | self.assertTrue( 605 | instruction.check_following(self.TEST_NUM_WORDS_MESSAGE_1)) 606 | 607 | word_counts = 16 608 | instruction.build_description(num_words=word_counts, 609 | relation=instructions._COMPARISON_RELATION[0]) 610 | with self.subTest( 611 | f'test {self.TEST_NUM_WORDS_MESSAGE_2} less than {word_counts}'): 612 | self.assertFalse( 613 | instruction.check_following(self.TEST_NUM_WORDS_MESSAGE_2)) 614 | 615 | word_counts = 16 616 | instruction.build_description(num_words=word_counts, 617 | relation=instructions._COMPARISON_RELATION[1]) 618 | with self.subTest( 619 | f'test {self.TEST_NUM_WORDS_MESSAGE_2} at least {word_counts}'): 620 | self.assertTrue( 621 | instruction.check_following(self.TEST_NUM_WORDS_MESSAGE_2)) 622 | 623 | PARAGRAPH_FIRST_WORD_TEST_1 = """ 624 | paragraph 1 625 | 626 | I paragraph 2 
627 | 628 | paragraph 3""" 629 | 630 | PARAGRAPH_FIRST_WORD_TEST_2 = """ 631 | paragraph 1 632 | 633 | I paragraph 2""" 634 | 635 | PARAGRAPH_FIRST_WORD_TEST_3 = """ 636 | paragraph 1 637 | 638 | fail paragraph 2 639 | 640 | paragraph 3""" 641 | 642 | PARAGRAPH_FIRST_WORD_TEST_4 = """ 643 | Wow this is a very long response. 644 | 645 | I can't believe there is more than three paragraphs. 646 | 647 | Really more than three? No way! 648 | 649 | I can't believe it but I guess I am living proof. 650 | 651 | Haha, you go that right.""" 652 | 653 | PARAGRAPH_FIRST_WORD_TEST_5 = """ 654 | Wow this is a very long response. 655 | 656 | I can't believe there is more than three paragraphs. 657 | 658 | "Really?! more than three? No way!" 659 | 660 | I can't believe it but I guess I am living proof. 661 | 662 | Haha, you go that right.""" 663 | 664 | PARAGRAPH_FIRST_WORD_TEST_6 = """ 665 | Wow this is a very long response. 666 | 667 | I can't believe there is more than three paragraphs. 668 | 669 | Rea!lly more than three? No way! 670 | 671 | I can't believe it but I guess I am living proof. 
672 | 673 | Haha, you go that right.""" 674 | 675 | def test_paragraph_first_word(self): 676 | """Test number of paragraphs and first word of nth paragraph.""" 677 | instruction_id = 'length_constraints:nth_paragraph_first_word' 678 | instruction = instructions.ParagraphFirstWordCheck(instruction_id) 679 | tests = [ 680 | self.PARAGRAPH_FIRST_WORD_TEST_1, 681 | self.PARAGRAPH_FIRST_WORD_TEST_2, 682 | self.PARAGRAPH_FIRST_WORD_TEST_3, 683 | self.PARAGRAPH_FIRST_WORD_TEST_4, 684 | self.PARAGRAPH_FIRST_WORD_TEST_5, 685 | self.PARAGRAPH_FIRST_WORD_TEST_6, 686 | ] 687 | 688 | for test in tests: 689 | if (test == self.PARAGRAPH_FIRST_WORD_TEST_1 690 | or test == self.PARAGRAPH_FIRST_WORD_TEST_2 691 | or test == self.PARAGRAPH_FIRST_WORD_TEST_3): 692 | num_paragraphs = 3 693 | nth_paragraph = 2 694 | first_word = 'I' 695 | elif test == self.PARAGRAPH_FIRST_WORD_TEST_4: 696 | num_paragraphs = 5 697 | nth_paragraph = 5 698 | first_word = 'haha' 699 | else: 700 | num_paragraphs = 5 701 | nth_paragraph = 3 702 | first_word = 'Really' 703 | 704 | instruction.build_description( 705 | num_paragraphs=num_paragraphs, 706 | nth_paragraph=nth_paragraph, 707 | first_word=first_word, 708 | ) 709 | with self.subTest( 710 | f'test {test} \n. Test for ' 711 | f'{num_paragraphs} paragraphs and ' 712 | f'for paragraph {nth_paragraph} ' 713 | f'{first_word} is first word' 714 | ): 715 | if (test == self.PARAGRAPH_FIRST_WORD_TEST_1 716 | or test == self.PARAGRAPH_FIRST_WORD_TEST_4 717 | or test == self.PARAGRAPH_FIRST_WORD_TEST_5): 718 | self.assertTrue(instruction.check_following(test)) 719 | else: 720 | self.assertFalse(instruction.check_following(test)) 721 | 722 | TEST_KEY_SENTENCES_1 = """ 723 | Puppies are fun. They are playful, energetic, and always up for a good time. 724 | Puppies love to run, jump, and play fetch. They are also very good at 725 | cuddling and giving kisses. If you are looking for a fun and loving pet, 726 | a puppy is a great choice. 
727 | """ 728 | 729 | TEST_KEY_SENTENCES_2 = """ 730 | I like to eat candy. When I'm feeling happy, sad, or even angry, candy 731 | always makes me feel better. I like to share candy with my friends and 732 | family. It's a great way to show them how much I care. 733 | """ 734 | 735 | TEST_KEY_SENTENCES_3 = """ 736 | I know that candy isn't the healthiest thing to eat, but I don't care. 737 | I love it too much. I'll just have to make sure to eat it in moderation. 738 | """ 739 | 740 | key_sentences = {'Puppies love to run, jump, and play fetch.', 741 | 'I like to eat candy.', 'Puppies are fun.'} 742 | 743 | def test_key_sentences(self): 744 | """Test the inclusion of key sentences.""" 745 | instruction_id = 'keywords:key_sentences' 746 | instruction = instructions.KeySentenceChecker(instruction_id) 747 | 748 | num_sentences = 2 749 | instruction.build_description( 750 | key_sentences=self.key_sentences, num_sentences=num_sentences) 751 | 752 | with self.subTest(f'test {self.TEST_KEY_SENTENCES_1}'): 753 | self.assertTrue(instruction.check_following(self.TEST_KEY_SENTENCES_1)) 754 | 755 | num_sentences = 1 756 | instruction.build_description( 757 | key_sentences=self.key_sentences, num_sentences=num_sentences) 758 | 759 | with self.subTest(f'test {self.TEST_KEY_SENTENCES_2}'): 760 | self.assertTrue(instruction.check_following(self.TEST_KEY_SENTENCES_2)) 761 | 762 | with self.subTest(f'test {self.TEST_KEY_SENTENCES_3}'): 763 | self.assertFalse(instruction.check_following(self.TEST_KEY_SENTENCES_3)) 764 | 765 | TEST_FORBIDDEN_WORDS_MESSAGE_1 = """ 766 | The Nazis came to power in 1933 through a combination of legal and illegal 767 | means. Hitler was appointed chancellor by President Paul von Hindenburg, and 768 | the Nazis quickly consolidated their power by passing a series of laws that 769 | restricted the rights of opposition parties and individuals. By 1934, Hitler 770 | had become dictator of Germany. 
771 | """ 772 | 773 | TEST_FORBIDDEN_WORDS_MESSAGE_2 = """ 774 | Dinosaurs were a diverse group of reptiles that dominated the Earth for over 775 | 160 million years. They came in all shapes and sizes, from the tiny 776 | Compsognathus to the massive Argentinosaurus. Dinosaurs were the most 777 | successful land animals on Earth until they went extinct about 66 million 778 | years ago. The exact cause of their extinction is still unknown, but it 779 | is thought to have been a combination of factors, including an asteroid 780 | impact and climate change. 781 | """ 782 | 783 | TEST_FORBIDDEN_WORDS_MESSAGE_3 = """ 784 | GPT, or Generative Pre-trained Transformer, is a family of neural network 785 | models that uses the transformer architecture. GPT models are trained on a 786 | massive dataset of text and code, and can be used for a variety of tasks, 787 | including text generation, translation, and question answering. GPT models 788 | have been shown to be very effective at these tasks, and are being used by 789 | a variety of companies and organizations like Google. 790 | """ 791 | FORBIDDEN_WORDS_1 = ('HOUSE', 'POWER', 'BECOME') 792 | FORBIDDEN_WORDS_2 = ('GOOGLE', 'TEXT') 793 | FORBIDDEN_WORDS_3 = ('GENE', 'TRANSFORM') 794 | 795 | def test_forbidden_words(self): 796 | """Test the exclusion of key words.""" 797 | instruction_id = 'keywords:forbidden_words' 798 | instruction = instructions.ForbiddenWords(instruction_id) 799 | 800 | instruction.build_description(forbidden_words=self.FORBIDDEN_WORDS_1) 801 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_1}\n ' + 802 | f'with forbidden words: {self.FORBIDDEN_WORDS_1}. '): 803 | self.assertFalse( 804 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_1)) 805 | 806 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_2}\n ' + 807 | f'with forbidden words: {self.FORBIDDEN_WORDS_1}. 
'): 808 | self.assertTrue( 809 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_2)) 810 | 811 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_3}\n ' + 812 | f'with forbidden words: {self.FORBIDDEN_WORDS_1}. '): 813 | self.assertTrue( 814 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_3)) 815 | 816 | instruction.build_description(forbidden_words=self.FORBIDDEN_WORDS_2) 817 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_1}\n ' + 818 | f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '): 819 | self.assertTrue( 820 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_1)) 821 | 822 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_2}\n ' + 823 | f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '): 824 | self.assertTrue( 825 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_2)) 826 | 827 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_3}\n ' + 828 | f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '): 829 | self.assertFalse( 830 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_3)) 831 | 832 | instruction.build_description(forbidden_words=self.FORBIDDEN_WORDS_3) 833 | with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_3}\n ' + 834 | f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '): 835 | self.assertTrue( 836 | instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_3)) 837 | 838 | TEST_ORIGINAL_PARAGRAPH_1 = """ 839 | The sun is shining brightly today, and the birds are singing in the trees. 840 | It's a beautiful day to be outside, so I decided to go for a walk. 841 | As I walked, I took in the fresh air and the warm sunshine. 842 | I felt happy and relaxed, and I was grateful for the beautiful day 843 | """ 844 | 845 | TEST_ORIGINAL_PARAGRAPH_2 = """ 846 | Google is a global technology company that specializes in Internet-related 847 | services and products. 
It is one of the most successful companies in the 848 | world, and its products are used by billions of people every day. Google's 849 | mission is to organize the world's information and make it universally 850 | accessible and useful. 851 | """ 852 | 853 | TEST_REPHRASED_PARAGRAPH_1 = """ 854 | On a beautiful day, I went for a walk. The sun shone and birds sang. 855 | I enjoyed the fresh air and warm sun. 856 | I felt happy and grateful for the lovely day. 857 | """ 858 | 859 | TEST_REPHRASED_PARAGRAPH_2 = """ 860 | The weather was lovely, so I went for a walk. I enjoyed the 861 | fresh air and warm sun. It was a beautiful day, and I felt happy and grateful. 862 | """ 863 | 864 | TEST_REPHRASED_PARAGRAPH_3 = """ 865 | Google is a technology company that provides Internet services. 866 | It aims to organize the world's information and make it universally 867 | accessible and useful. 868 | """ 869 | 870 | TEST_REPHRASED_PARAGRAPH_4 = """ 871 | I like candy. 872 | """ 873 | 874 | def test_rephrase_paragraph(self): 875 | """Test the rephrasing of paragraph.""" 876 | instruction_id = 'detectable_content:rephrase_paragraph' 877 | instruction = instructions.RephraseParagraph(instruction_id) 878 | low, high = 20, 30 879 | instruction.build_description( 880 | low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_1) 881 | 882 | with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_1} to ' + 883 | f'have between {low} and {high} same words.'): 884 | self.assertTrue( 885 | instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_1)) 886 | 887 | low, high = 20, 25 888 | instruction.build_description( 889 | low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_1) 890 | 891 | with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_1} to ' + 892 | f'have between {low} and {high} same words.'): 893 | self.assertTrue( 894 | instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_2)) 895 | 896 | low, high = 15, 20 897 | instruction.build_description( 
898 | low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_2) 899 | 900 | with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_2} to ' + 901 | f'have between {low} and {high} same words.'): 902 | self.assertFalse( 903 | instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_3)) 904 | 905 | low, high = 0, 5 906 | instruction.build_description( 907 | low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_2) 908 | 909 | with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_2} to ' + 910 | f'have between {low} and {high} same words.'): 911 | self.assertTrue( 912 | instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_4)) 913 | 914 | low, high = 1, 5 915 | instruction.build_description( 916 | low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_2) 917 | 918 | with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_2} to ' + 919 | f'have between {low} and {high} same words.'): 920 | self.assertFalse( 921 | instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_4)) 922 | 923 | TEST_TWO_RESPONSES_1 = """ 924 | This is response 1. 925 | ****** 926 | This is response 2. 927 | """ 928 | 929 | TEST_TWO_RESPONSES_2 = """ 930 | This is response 1. 931 | ****** 932 | This is response 1. 933 | """ 934 | 935 | TEST_TWO_RESPONSES_3 = """ 936 | This is response 1. 937 | ****** 938 | This is response 2. 939 | ****** 940 | This is response 3. 941 | """ 942 | 943 | TEST_TWO_RESPONSES_4 = """ 944 | ****** 945 | Response 1. 946 | ****** 947 | ****** 948 | Response 2. 
949 | ****** 950 | """ 951 | 952 | TEST_TWO_RESPONSES_5 = """ 953 | ****** 954 | Response 1 955 | ****** 956 | Response 2 957 | ****** 958 | """ 959 | 960 | def test_two_responses(self): 961 | """Test that two responses are given.""" 962 | instruction_id = 'combination:two_responses' 963 | instruction = instructions.TwoResponsesChecker(instruction_id) 964 | instruction.build_description() 965 | 966 | with self.subTest(f'test {self.TEST_TWO_RESPONSES_1}'): 967 | self.assertTrue(instruction.check_following(self.TEST_TWO_RESPONSES_1)) 968 | 969 | with self.subTest(f'test {self.TEST_TWO_RESPONSES_2}'): 970 | self.assertFalse(instruction.check_following(self.TEST_TWO_RESPONSES_2)) 971 | 972 | with self.subTest(f'test {self.TEST_TWO_RESPONSES_3}'): 973 | self.assertFalse(instruction.check_following(self.TEST_TWO_RESPONSES_3)) 974 | 975 | with self.subTest(f'test {self.TEST_TWO_RESPONSES_4}'): 976 | self.assertFalse(instruction.check_following(self.TEST_TWO_RESPONSES_4)) 977 | 978 | with self.subTest(f'test {self.TEST_TWO_RESPONSES_5}'): 979 | self.assertTrue(instruction.check_following(self.TEST_TWO_RESPONSES_5)) 980 | 981 | PROMPT_TO_REPEAT = 'Write a CL description.' 982 | 983 | TEST_PROMPT_1 = """Write a CL description. First repeat the request word for word without change, then give your answer (1. do not say any words or characters before repeating the request; 2. the request you need to repeat does not include this sentence)""" 984 | 985 | TEST_PROMPT_ANSWER_1 = """Write a CL description. Hi, Le and TJ, please 986 | check this out. Thanks. 987 | """ 988 | TEST_PROMPT_ANSWER_2 = """Hi, Le and TJ. Write a CL description. Thanks. 
989 | """ 990 | 991 | def test_prompt_repeat_answer(self): 992 | """Test that prompt is repeated then answered.""" 993 | instruction_id = 'combination:repeat_prompt' 994 | instruction = instructions.RepeatPromptThenAnswer(instruction_id) 995 | 996 | instruction.build_description(prompt_to_repeat=self.PROMPT_TO_REPEAT) 997 | with self.subTest(f'test {self.TEST_PROMPT_ANSWER_1}' + 998 | f' with prompt: {self.TEST_PROMPT_1}'): 999 | self.assertTrue(instruction.check_following(self.TEST_PROMPT_ANSWER_1)) 1000 | 1001 | with self.subTest(f'test {self.TEST_PROMPT_ANSWER_2}' + 1002 | f' with prompt: {self.TEST_PROMPT_1}'): 1003 | self.assertFalse(instruction.check_following(self.TEST_PROMPT_ANSWER_2)) 1004 | 1005 | TEST_END_CHECKER_1 = """ 1006 | The answer is 7. Any more questions? 1007 | """ 1008 | 1009 | TEST_END_CHECKER_2 = """ 1010 | At the end of this prompt I am required to say that this is the end. 1011 | """ 1012 | 1013 | TEST_END_CHECKER_3 = """ 1014 | This will fail. Paris is cool. 1015 | """ 1016 | 1017 | END_PHRASE_1 = """ 1018 | Any more questions? 1019 | """ 1020 | 1021 | END_PHRASE_2 = """ 1022 | This is the end. 1023 | """ 1024 | 1025 | END_PHRASE_3 = """ 1026 | This will fail.
1027 | """ 1028 | 1029 | def test_end_checker(self): 1030 | """Check the end of the prompt.""" 1031 | instruction_id = 'startend:end_checker' 1032 | instruction = instructions.EndChecker(instruction_id) 1033 | instruction.build_description(end_phrase=self.END_PHRASE_1) 1034 | with self.subTest(f'test {self.TEST_END_CHECKER_1}'): 1035 | self.assertTrue(instruction.check_following(self.TEST_END_CHECKER_1)) 1036 | 1037 | instruction.build_description(end_phrase=self.END_PHRASE_2) 1038 | with self.subTest(f'test {self.TEST_END_CHECKER_2}'): 1039 | self.assertTrue(instruction.check_following(self.TEST_END_CHECKER_2)) 1040 | 1041 | instruction.build_description(end_phrase=self.END_PHRASE_3) 1042 | with self.subTest(f'test {self.TEST_END_CHECKER_3}'): 1043 | self.assertFalse(instruction.check_following(self.TEST_END_CHECKER_3)) 1044 | 1045 | TEST_TITLE_MESSAGE_1 = """ 1046 | <> 1047 | La la la. Happy song. 1048 | """ 1049 | 1050 | TEST_TITLE_MESSAGE_2 = """ 1051 | Is it fine for title to be at the end? 1052 | <> 1053 | """ 1054 | TEST_TITLE_MESSAGE_3 = """ 1055 | << >> 1056 | There is no title. 
1057 | """ 1058 | 1059 | TEST_TITLE_MESSAGE_4 = """ 1060 | <> 1062 | """ 1063 | 1064 | def test_title_checker(self): 1065 | """Check the prompt for a title.""" 1066 | instruction_id = 'detectable_format:title' 1067 | instruction = instructions.TitleChecker(instruction_id) 1068 | instruction.build_description() 1069 | with self.subTest(f'test {self.TEST_TITLE_MESSAGE_1}'): 1070 | self.assertTrue(instruction.check_following(self.TEST_TITLE_MESSAGE_1)) 1071 | with self.subTest(f'test {self.TEST_TITLE_MESSAGE_2}'): 1072 | self.assertTrue(instruction.check_following(self.TEST_TITLE_MESSAGE_2)) 1073 | 1074 | with self.subTest(f'test {self.TEST_TITLE_MESSAGE_3}'): 1075 | self.assertFalse(instruction.check_following(self.TEST_TITLE_MESSAGE_3)) 1076 | with self.subTest(f'test {self.TEST_TITLE_MESSAGE_4}'): 1077 | self.assertFalse(instruction.check_following(self.TEST_TITLE_MESSAGE_4)) 1078 | 1079 | TEST_LETTER_FREQUENCY_MESSAGE_1 = """ 1080 | There is the T. Four T's. 1081 | """ 1082 | 1083 | TEST_LETTER_FREQUENCY_MESSAGE_2 = """ 1084 | asdfghjkl!!aA 1085 | """ 1086 | 1087 | TEST_LETTER_FREQUENCY_MESSAGE_3 = """ 1088 | The letter P appears 3 times in this message. 
1089 | """ 1090 | 1091 | def test_letter_frequency_checker(self): 1092 | """Test the frequency of letters.""" 1093 | instruction_id = 'keywords:letter_frequency' 1094 | instruction = instructions.LetterFrequencyChecker(instruction_id) 1095 | 1096 | letter = 'T' 1097 | frequency = 4 1098 | instruction.build_description( 1099 | letter=letter, 1100 | let_frequency=frequency, 1101 | let_relation=instructions._COMPARISON_RELATION[1], 1102 | ) 1103 | with self.subTest(f'test {self.TEST_LETTER_FREQUENCY_MESSAGE_1}'): 1104 | self.assertTrue( 1105 | instruction.check_following(self.TEST_LETTER_FREQUENCY_MESSAGE_1) 1106 | ) 1107 | 1108 | letter = 'a' 1109 | frequency = 4 1110 | instruction.build_description( 1111 | letter=letter, 1112 | let_frequency=frequency, 1113 | let_relation=instructions._COMPARISON_RELATION[0], 1114 | ) 1115 | with self.subTest(f'test {self.TEST_LETTER_FREQUENCY_MESSAGE_2}'): 1116 | self.assertTrue( 1117 | instruction.check_following(self.TEST_LETTER_FREQUENCY_MESSAGE_2) 1118 | ) 1119 | 1120 | letter = 'p' 1121 | frequency = 4 1122 | instruction.build_description( 1123 | letter=letter, 1124 | let_frequency=frequency, 1125 | let_relation=instructions._COMPARISON_RELATION[1], 1126 | ) 1127 | with self.subTest(f'test {self.TEST_LETTER_FREQUENCY_MESSAGE_2}'): 1128 | self.assertFalse( 1129 | instruction.check_following(self.TEST_LETTER_FREQUENCY_MESSAGE_2) 1130 | ) 1131 | 1132 | TEST_ENGLISH_CAPITAL_1 = """ 1133 | THIS IS AN ENGLISH SENTENCE. EVERY LETTER IS CAPITALIZED!!! AMAZING. 1134 | """ 1135 | 1136 | TEST_ENGLISH_CAPITAL_2 = """ 1137 | Every Word Is Capitalized. 
1138 | """ 1139 | 1140 | def test_english_capital_checker(self): 1141 | """Test that letters are all capitalized.""" 1142 | instruction_id = 'change_case:english_capital' 1143 | instruction = instructions.CapitalLettersEnglishChecker(instruction_id) 1144 | instruction.build_description() 1145 | with self.subTest(f'test {self.TEST_ENGLISH_CAPITAL_1}'): 1146 | self.assertTrue(instruction.check_following(self.TEST_ENGLISH_CAPITAL_1)) 1147 | 1148 | with self.subTest(f'test {self.TEST_ENGLISH_CAPITAL_2}'): 1149 | self.assertFalse(instruction.check_following(self.TEST_ENGLISH_CAPITAL_2)) 1150 | 1151 | TEST_ENGLISH_LOWERCASE_1 = """ 1152 | every letter is lowercase. 1153 | """ 1154 | 1155 | TEST_ENGLISH_LOWERCASE_2 = """ 1156 | Almost every letter is lowercase. 1157 | """ 1158 | 1159 | def test_english_lowercase_checker(self): 1160 | """Test that letters are all lowercase.""" 1161 | instruction_id = 'change_case:english_lowercase' 1162 | instruction = instructions.LowercaseLettersEnglishChecker(instruction_id) 1163 | instruction.build_description() 1164 | with self.subTest(f'test {self.TEST_ENGLISH_LOWERCASE_1}'): 1165 | self.assertTrue( 1166 | instruction.check_following(self.TEST_ENGLISH_LOWERCASE_1) 1167 | ) 1168 | 1169 | with self.subTest(f'test {self.TEST_ENGLISH_LOWERCASE_2}'): 1170 | self.assertFalse( 1171 | instruction.check_following(self.TEST_ENGLISH_LOWERCASE_2) 1172 | ) 1173 | 1174 | TEST_COMMA_MESSAGE_1 = """ 1175 | Every sentence is short. There is no need for a comma. 1176 | """ 1177 | 1178 | TEST_COMMA_MESSAGE_2 = """ 1179 | Since the start of time, people have always found a way to punctuate.
1180 | """ 1181 | 1182 | def test_comma(self): 1183 | instruction_id = 'punctuation:no_comma' 1184 | instruction = instructions.CommaChecker(instruction_id) 1185 | instruction.build_description() 1186 | with self.subTest(f'test {self.TEST_COMMA_MESSAGE_1}'): 1187 | self.assertTrue(instruction.check_following(self.TEST_COMMA_MESSAGE_1)) 1188 | with self.subTest(f'test {self.TEST_COMMA_MESSAGE_2}'): 1189 | self.assertFalse(instruction.check_following(self.TEST_COMMA_MESSAGE_2)) 1190 | 1191 | TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_1 = """ 1192 | HERE there are THREE FUlly CAPITAL words. 1193 | """ 1194 | 1195 | TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2 = """ 1196 | THERE are Four FULLY CAPITAL WORDS. Many Others Are Only Partially So. 1197 | """ 1198 | 1199 | def test_capital_word_frequency(self): 1200 | instruction_id = 'change_case:capital_word_frequency' 1201 | instruction = instructions.CapitalWordFrequencyChecker(instruction_id) 1202 | 1203 | capital_frequency = 3 1204 | instruction.build_description( 1205 | capital_frequency=capital_frequency, 1206 | capital_relation=instructions._COMPARISON_RELATION[1], 1207 | ) 1208 | with self.subTest(f'test {self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_1}'): 1209 | self.assertTrue( 1210 | instruction.check_following( 1211 | self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_1 1212 | ) 1213 | ) 1214 | 1215 | capital_frequency = 5 1216 | instruction.build_description( 1217 | capital_frequency=capital_frequency, 1218 | capital_relation=instructions._COMPARISON_RELATION[0], 1219 | ) 1220 | with self.subTest(f'test {self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2}'): 1221 | self.assertTrue( 1222 | instruction.check_following( 1223 | self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2 1224 | ) 1225 | ) 1226 | 1227 | capital_frequency = 4 1228 | instruction.build_description( 1229 | capital_frequency=capital_frequency, 1230 | capital_relation=instructions._COMPARISON_RELATION[0], 1231 | ) 1232 | with self.subTest(f'test 
{self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2}'): 1233 | self.assertFalse( 1234 | instruction.check_following( 1235 | self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2 1236 | ) 1237 | ) 1238 | 1239 | TEST_QUOTATION_MESSAGE_1 = """ 1240 | "This entire message is wrapped in double quotation marks." 1241 | """ 1242 | 1243 | TEST_QUOTATION_MESSAGE_2 = """ 1244 | "This message is wrapped in double quotation marks." But not everything. 1245 | """ 1246 | 1247 | def test_quotation(self): 1248 | instruction_id = 'startend:quotation' 1249 | instruction = instructions.QuotationChecker(instruction_id) 1250 | instruction.build_description() 1251 | with self.subTest(f'test {self.TEST_QUOTATION_MESSAGE_1}'): 1252 | self.assertTrue( 1253 | instruction.check_following(self.TEST_QUOTATION_MESSAGE_1) 1254 | ) 1255 | with self.subTest(f'test {self.TEST_QUOTATION_MESSAGE_2}'): 1256 | self.assertFalse( 1257 | instruction.check_following(self.TEST_QUOTATION_MESSAGE_2) 1258 | ) 1259 | 1260 | INSTRUCTION_DICT = { 1261 | 'language:response_language': instructions.ResponseLanguageChecker, 1262 | 'length_constraints:number_sentences': instructions.NumberOfSentences, 1263 | 'length_constraints:number_paragraphs': instructions.ParagraphChecker, 1264 | 'length_constraints:number_words': instructions.NumberOfWords, 1265 | 'detectable_content:number_placeholders': instructions.PlaceholderChecker, 1266 | 'detectable_content:postscript': instructions.PostscriptChecker, 1267 | 'detectable_format:number_bullet_lists': instructions.BulletListChecker, 1268 | 'detectable_format:constrained_response': ( 1269 | instructions.ConstrainedResponseChecker), 1270 | 'detectable_format:number_highlighted_sections': ( 1271 | instructions.HighlightSectionChecker), 1272 | 'detectable_format:multiple_sections': instructions.SectionChecker, 1273 | 'detectable_format:json_format': instructions.JsonFormat, 1274 | } 1275 | 1276 | def test_get_instruction_args(self): 1277 | """Test getting instruction args.""" 1278 | for inst_id, 
inst_cls in self.INSTRUCTION_DICT.items(): 1279 | instruction = inst_cls(inst_id) 1280 | inst_description = instruction.build_description() 1281 | kwargs = instruction.get_instruction_args() 1282 | # The keyword args can be None. 1283 | if kwargs: 1284 | inst_description_closed_loop = instruction.build_description(**kwargs) 1285 | with self.subTest(f'test {inst_id}'): 1286 | self.assertEqual(inst_description, inst_description_closed_loop) 1287 | 1288 | 1289 | if __name__ == '__main__': 1290 | absltest.main() 1291 | -------------------------------------------------------------------------------- /instruction_following_eval/instructions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Library of instructions.""" 17 | import collections 18 | import json 19 | import random 20 | import re 21 | import string 22 | from typing import Dict, Optional, Sequence, Union 23 | 24 | from absl import logging 25 | import langdetect 26 | 27 | from instruction_following_eval import instructions_util 28 | 29 | 30 | _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] 31 | 32 | _LANGUAGES = instructions_util.LANGUAGE_CODES 33 | 34 | # The relational operation for comparison. 
35 | _COMPARISON_RELATION = ("less than", "at least") 36 | 37 | # The maximum number of sentences. 38 | _MAX_NUM_SENTENCES = 20 39 | 40 | # The maximum number of placeholders. 41 | _NUM_PLACEHOLDERS = 4 42 | 43 | # The number of bullet lists. 44 | _NUM_BULLETS = 5 45 | 46 | # The options of constrained response. 47 | _CONSTRAINED_RESPONSE_OPTIONS = ( 48 | "My answer is yes.", "My answer is no.", "My answer is maybe.") 49 | 50 | # The options of starter keywords. 51 | _STARTER_OPTIONS = ("I would say", "My answer is", "I believe", 52 | "In my opinion", "I think", "I reckon", "I feel", 53 | "From my perspective", "As I see it", "According to me", 54 | "As far as I'm concerned", "To my understanding", 55 | "In my view", "My take on it is", "As per my perception") 56 | 57 | # The options of ending keywords. 58 | # TODO(jeffreyzhou) add more ending options 59 | _ENDING_OPTIONS = ("Any other questions?", 60 | "Is there anything else I can help with?") 61 | 62 | # The number of highlighted sections. 63 | _NUM_HIGHLIGHTED_SECTIONS = 4 64 | 65 | # The section splitter. 66 | _SECTION_SPLITER = ("Section", "SECTION") 67 | 68 | # The number of sections. 69 | _NUM_SECTIONS = 5 70 | 71 | # The number of paragraphs. 72 | _NUM_PARAGRAPHS = 5 73 | 74 | # The postscript marker. 75 | _POSTSCRIPT_MARKER = ("P.S.", "P.P.S") 76 | 77 | # The number of keywords. 78 | _NUM_KEYWORDS = 2 79 | 80 | # The occurrences of a single keyword. 81 | _KEYWORD_FREQUENCY = 3 82 | 83 | # The occurrences of a single letter. 84 | _LETTER_FREQUENCY = 10 85 | 86 | # The occurrences of words with all capital letters. 87 | _ALL_CAPITAL_WORD_FREQUENCY = 20 88 | 89 | # The number of words in the response.
90 | _NUM_WORDS_LOWER_LIMIT = 100 91 | _NUM_WORDS_UPPER_LIMIT = 500 92 | 93 | 94 | class Instruction: 95 | """An instruction template.""" 96 | 97 | def __init__(self, instruction_id): 98 | self.id = instruction_id 99 | 100 | def build_description(self, **kwargs): 101 | raise NotImplementedError("`build_description` not implemented.") 102 | 103 | def get_instruction_args(self): 104 | raise NotImplementedError("`get_instruction_args` not implemented.") 105 | 106 | def get_instruction_args_keys(self): 107 | raise NotImplementedError("`get_instruction_args_keys` not implemented.") 108 | 109 | def check_following(self, value): 110 | raise NotImplementedError("`check_following` not implemented.") 111 | 112 | 113 | class ResponseLanguageChecker(Instruction): 114 | """Check the language of the entire response.""" 115 | 116 | def build_description(self, *, language = None): 117 | """Build the instruction description. 118 | 119 | Args: 120 | language: A string representing the expected language of the response. The 121 | language has to comply to the 97 types defined in 122 | `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows 123 | ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); 124 | for example, `en` for English, `zh` for Chinese, `fr` for French. 125 | 126 | Returns: 127 | A string representing the instruction description. 128 | """ 129 | self._language = language 130 | if self._language is None: 131 | self._language = random.choice(list(_LANGUAGES.keys())) 132 | # TODO(tianjianlu): opens the description generation to more choices. 
133 | self._description_pattern = ( 134 | "Your ENTIRE response should be in {language} language, no other " + 135 | "language is allowed.") 136 | return self._description_pattern.format(language=_LANGUAGES[self._language]) 137 | 138 | def get_instruction_args(self): 139 | """Returns the keyword args of `build_description`.""" 140 | return {"language": self._language} 141 | 142 | def get_instruction_args_keys(self): 143 | """Returns the args keys of `build_description`.""" 144 | return ["language"] 145 | 146 | def check_following(self, value): 147 | """Check if the language of the entire response follows the instruction. 148 | 149 | Args: 150 | value: A string representing the response. 151 | 152 | Returns: 153 | True if the language of `value` follows instruction; otherwise False. 154 | """ 155 | assert isinstance(value, str) 156 | 157 | try: 158 | return langdetect.detect(value) == self._language 159 | except langdetect.LangDetectException as e: 160 | # Count the instruction as followed. 161 | logging.error( 162 | "Unable to detect language for text %s due to %s", value, e 163 | ) # refex: disable=pytotw.037 164 | return True 165 | 166 | 167 | class NumberOfSentences(Instruction): 168 | """Check the number of sentences.""" 169 | 170 | def build_description(self, *, num_sentences = None, 171 | relation = None): 172 | """Build the instruction description. 173 | 174 | Args: 175 | num_sentences: An integer specifying the number of sentences as a 176 | threshold. 177 | relation: A string in (`less than`, `at least`), defining the relational 178 | operator for comparison. 179 | Two relational comparisons are supported for now: 180 | if 'less than', the actual number of sentences < the threshold; 181 | if 'at least', the actual number of sentences >= the threshold. 182 | 183 | Returns: 184 | A string representing the instruction description. 185 | """ 186 | # The number of sentences as a threshold for comparison.
187 | self._num_sentences_threshold = num_sentences 188 | if (self._num_sentences_threshold is None or 189 | self._num_sentences_threshold < 0): 190 | self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) 191 | 192 | if relation is None: 193 | self._comparison_relation = random.choice(_COMPARISON_RELATION) 194 | elif relation not in _COMPARISON_RELATION: 195 | raise ValueError("The supported relation for comparison must be in " 196 | f"{_COMPARISON_RELATION}, but {relation} is given.") 197 | else: 198 | self._comparison_relation = relation 199 | 200 | self._description_pattern = ( 201 | "Your response should contain {relation} {num_sentences} sentences.") 202 | return self._description_pattern.format( 203 | relation=self._comparison_relation, 204 | num_sentences=self._num_sentences_threshold) 205 | 206 | def get_instruction_args(self): 207 | """Returns the keyward args of `build_description`.""" 208 | return {"num_sentences": self._num_sentences_threshold, 209 | "relation": self._comparison_relation} 210 | 211 | def get_instruction_args_keys(self): 212 | """Returns the args keys of `build_description`.""" 213 | return ["num_sentences", "relation"] 214 | 215 | def check_following(self, value): 216 | """Check if the number of sentences follows the instruction. 217 | 218 | Args: 219 | value: A string representing the response. 220 | 221 | Returns: 222 | True if the response follows the instruction. 223 | 224 | Raise: 225 | ValueError if the string in `instruction_args` is not in 226 | [`less_than`, `at_least`]. 
227 | """ 228 | num_sentences = instructions_util.count_sentences(value) 229 | if self._comparison_relation == _COMPARISON_RELATION[0]: 230 | return num_sentences < self._num_sentences_threshold 231 | elif self._comparison_relation == _COMPARISON_RELATION[1]: 232 | return num_sentences >= self._num_sentences_threshold # pytype: disable=bad-return-type 233 | 234 | 235 | class PlaceholderChecker(Instruction): 236 | """Check the placeholders in template writing.""" 237 | 238 | def build_description(self, *, num_placeholders = None): 239 | """Build the instruction description. 240 | 241 | Args: 242 | num_placeholders: An integer denoting the minimum number of 243 | placeholders required in the response. 244 | 245 | Returns: 246 | A string representing the instruction description. 247 | """ 248 | self._num_placeholders = num_placeholders 249 | if self._num_placeholders is None or self._num_placeholders < 0: 250 | self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) 251 | self._description_pattern = ( 252 | "The response must contain at least {num_placeholders} placeholders " + 253 | "represented by square brackets, such as [address].") 254 | return self._description_pattern.format( 255 | num_placeholders=self._num_placeholders) 256 | 257 | def get_instruction_args(self): 258 | """Returns the keyward args of `build_description`.""" 259 | return {"num_placeholders": self._num_placeholders} 260 | 261 | def get_instruction_args_keys(self): 262 | """Returns the args keys of `build_description`.""" 263 | return ["num_placeholders"] 264 | 265 | def check_following(self, value): 266 | """Check if the number of placeholders follows the instruction. 267 | 268 | Args: 269 | value: A string representing the response. 270 | 271 | Returns: 272 | True if the actual number of placeholders in the response is greater than 273 | or equal to `num_placeholders`; otherwise, False. 
274 | """ 275 | placeholders = re.findall(r"\[.*?\]", value) 276 | num_placeholders = len(placeholders) 277 | return num_placeholders >= self._num_placeholders 278 | 279 | 280 | class BulletListChecker(Instruction): 281 | """Checks the bullet list in the prompt.""" 282 | 283 | def build_description(self, *, num_bullets = None): 284 | """Build the instruction description. 285 | 286 | Args: 287 | num_bullets: An integer specifying the exact number of bullet lists 288 | that is required to appear in the response. 289 | 290 | Returns: 291 | A string representing the instruction description. 292 | """ 293 | self._num_bullets = num_bullets 294 | if self._num_bullets is None or self._num_bullets < 0: 295 | self._num_bullets = random.randint(1, _NUM_BULLETS) 296 | self._description_pattern = ( 297 | "Your answer must contain exactly {num_bullets} bullet points. " + 298 | "Use the markdown bullet points such as:\n" + 299 | "* This is point 1. \n" + 300 | "* This is point 2") 301 | return self._description_pattern.format( 302 | num_bullets=self._num_bullets) 303 | 304 | def get_instruction_args(self): 305 | """Returns the keyward args of `build_description`.""" 306 | return {"num_bullets": self._num_bullets} 307 | 308 | def get_instruction_args_keys(self): 309 | """Returns the args keys of `build_description`.""" 310 | return ["num_bullets"] 311 | 312 | def check_following(self, value): 313 | r"""Check if the number of bullet lists meets the requirement. 314 | 315 | Args: 316 | value: A string representing the response. The response is expected to 317 | contain some bullet lists that start with `\*`. 318 | 319 | Returns: 320 | True if the actual number of bullet lists in the response meets the 321 | requirement. 
322 | """ 323 | value = value.replace("\\n", "\n") 324 | bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) 325 | bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) 326 | bullet_lists_3 = re.findall(r"^\s*•.*$", value, flags=re.MULTILINE) 327 | bullet_lists_4 = re.findall(r"^\s*\d.*$", value, flags=re.MULTILINE) 328 | num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + len(bullet_lists_3) + len(bullet_lists_4) 329 | return num_bullet_lists == self._num_bullets 330 | 331 | 332 | class ConstrainedResponseChecker(Instruction): 333 | """Checks the constrained response.""" 334 | 335 | def build_description(self): 336 | """Build the instruction description.""" 337 | # A sequence of string(s) representing the options of the expected response. 338 | self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS 339 | self._description_pattern = ( 340 | "Answer with one of the following options: {response_options}") 341 | return self._description_pattern.format( 342 | response_options=self._constrained_responses) 343 | 344 | def get_instruction_args(self): 345 | """Returns the keyward args of `build_description`.""" 346 | return None 347 | 348 | def get_instruction_args_keys(self): 349 | """Returns the args keys of `build_description`.""" 350 | return [] 351 | 352 | def check_following(self, value): 353 | """Checks if the response matches the constrained options. 354 | 355 | Args: 356 | value: A string representing the response. 357 | 358 | Returns: 359 | True if the actual response contains one of the options in the constrained 360 | responses; otherwise False. 
361 | """ 362 | value = value.strip() 363 | for constrained_response in self._constrained_responses: 364 | if constrained_response in value: 365 | return True 366 | return False 367 | 368 | 369 | class ConstrainedStartChecker(Instruction): 370 | """Checks the response start.""" 371 | 372 | def build_description(self, *, starter = None): 373 | """Build the instruction description. 374 | 375 | Args: 376 | starter: A string representing the keyward that the response should start 377 | with. 378 | 379 | Returns: 380 | A string representing the instruction description. 381 | """ 382 | self._starter = starter.strip() if isinstance(starter, str) else starter 383 | if self._starter is None: 384 | self._starter = random.choice(_STARTER_OPTIONS) 385 | self._description_pattern = ( 386 | "During the conversation, when it is your turn, " + 387 | "please always start with {starter}") 388 | return self._description_pattern.format(starter=self._starter) 389 | 390 | def get_instruction_args(self): 391 | """Returns the keyward args of `build_description`.""" 392 | return {"starter": self._starter} 393 | 394 | def get_instruction_args_keys(self): 395 | """Returns the args keys of `build_description`.""" 396 | return ["starter"] 397 | 398 | def check_following(self, value): 399 | """Checks if the response starts with the constrained keyword or phrase. 400 | 401 | Args: 402 | value: A string representing the response. 403 | 404 | Returns: 405 | True if the response starts with the given phrase or keyword that is 406 | contained in `instruction_args`; otherwise, False. 
407 | """ 408 | response_pattern = r"^\s*" + self._starter + r".*$" 409 | response_with_constrained_start = re.search(response_pattern, value, 410 | flags=re.MULTILINE) 411 | return True if response_with_constrained_start else False 412 | 413 | 414 | class HighlightSectionChecker(Instruction): 415 | """Checks the highlighted section.""" 416 | 417 | def build_description(self, *, num_highlights = None): 418 | """Build the instruction description. 419 | 420 | Args: 421 | num_highlights: An integer specifying the minimum number of highlighted 422 | sections. 423 | 424 | Returns: 425 | A string representing the instruction description. 426 | """ 427 | self._num_highlights = num_highlights 428 | if self._num_highlights is None or self._num_highlights < 0: 429 | self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) 430 | 431 | self._description_pattern = ( 432 | "Highlight at least {num_highlights} sections in your answer with " + 433 | "markdown, i.e. *highlighted section*.") 434 | 435 | return self._description_pattern.format(num_highlights=self._num_highlights) 436 | 437 | def get_instruction_args(self): 438 | """Returns the keyward args of `build_description`.""" 439 | return {"num_highlights": self._num_highlights} 440 | 441 | def get_instruction_args_keys(self): 442 | """Returns the args keys of `build_description`.""" 443 | return ["num_highlights"] 444 | 445 | def check_following(self, value): 446 | """Checks if the number of highlighted sections meets the requirement. 447 | 448 | Args: 449 | value: a string repesenting the response. The response is expected to 450 | contain highlighted sections in the format of *highlighted*. 451 | 452 | Returns: 453 | True if the actual number of highlighted sections in the format of 454 | *highlighed sections* meets the minimum requirement; otherwise False. 
455 | """ 456 | num_highlights = 0 457 | highlights = re.findall(r"\*[^\n\*]*\*", value) 458 | double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) 459 | for highlight in highlights: 460 | if highlight.strip("*").strip(): 461 | num_highlights += 1 462 | for highlight in double_highlights: 463 | if highlight.removeprefix("**").removesuffix("**").strip(): 464 | num_highlights += 1 465 | 466 | return num_highlights >= self._num_highlights 467 | 468 | 469 | class SectionChecker(Instruction): 470 | """Checks the sections.""" 471 | 472 | def build_description(self, *, section_spliter = None, 473 | num_sections = None): 474 | """Build the instruction description. 475 | 476 | Args: 477 | section_spliter: A string represents the section spliter keyword that 478 | marks a new section, i.e., `Section` or `SECTION`. 479 | num_sections: An integer specifying the number of sections. 480 | 481 | Returns: 482 | A string representing the instruction description. 483 | """ 484 | self._section_spliter = section_spliter.strip() if isinstance( 485 | section_spliter, str) else section_spliter 486 | if self._section_spliter is None: 487 | self._section_spliter = random.choice(_SECTION_SPLITER) 488 | 489 | self._num_sections = num_sections 490 | if self._num_sections is None or self._num_sections < 0: 491 | self._num_sections = random.randint(1, _NUM_SECTIONS) 492 | 493 | self._description_pattern = ( 494 | "Your response must have {num_sections} sections. 
Mark the beginning " + 495 | "of each section with {section_spliter} X, such as:\n" + 496 | "{section_spliter} 1\n" + 497 | "[content of section 1]\n" + 498 | "{section_spliter} 2\n" + 499 | "[content of section 2]") 500 | 501 | return self._description_pattern.format( 502 | num_sections=self._num_sections, 503 | section_spliter=self._section_spliter) 504 | 505 | def get_instruction_args(self): 506 | """Returns the keyward args of `build_description`.""" 507 | return {"section_spliter": self._section_spliter, 508 | "num_sections": self._num_sections} 509 | 510 | def get_instruction_args_keys(self): 511 | """Returns the args keys of `build_description`.""" 512 | return ["section_spliter", "num_sections"] 513 | 514 | def check_following(self, value): 515 | """Checks the response contains multiple sections. 516 | 517 | Args: 518 | value: A string representing the response. The response is expected 519 | to contain multiple sections (number of sections is greater than 1). 520 | A new section starts with `Section 1`, where the number denotes the 521 | section index. 522 | 523 | Returns: 524 | True if the number of sections in the response is greater than or equal to 525 | the minimum number of sections; otherwise, False. 526 | """ 527 | section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" 528 | sections = re.split(section_splitter_patten, value) 529 | num_sections = len(sections) - 1 530 | return num_sections >= self._num_sections 531 | 532 | 533 | class ParagraphChecker(Instruction): 534 | """Checks the paragraphs.""" 535 | 536 | def build_description(self, *, num_paragraphs = None): 537 | """Build the instruction description. 538 | 539 | Args: 540 | num_paragraphs: An integer specifying the number of paragraphs. 541 | 542 | Returns: 543 | A string representing the instruction description. 
544 | """ 545 | self._num_paragraphs = num_paragraphs 546 | if self._num_paragraphs is None or self._num_paragraphs < 0: 547 | self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) 548 | 549 | self._description_pattern = ( 550 | "There should be {num_paragraphs} paragraphs. " + 551 | "Paragraphs are separated with the markdown divider: ***") 552 | 553 | return self._description_pattern.format(num_paragraphs=self._num_paragraphs) 554 | 555 | def get_instruction_args(self): 556 | """Returns the keyward args of `build_description`.""" 557 | return {"num_paragraphs": self._num_paragraphs} 558 | 559 | def get_instruction_args_keys(self): 560 | """Returns the args keys of `build_description`.""" 561 | return ["num_paragraphs"] 562 | 563 | def check_following(self, value): 564 | """Checks the response contains required number of paragraphs. 565 | 566 | Args: 567 | value: A string representing the response. The response may contain 568 | paragraphs that are separated by the markdown divider: `***`. 569 | 570 | Returns: 571 | True if the actual number of paragraphs is the same as required; 572 | otherwise, False. 573 | """ 574 | # Split based on *** (with optional spaces) or two or more newlines 575 | paragraphs = re.split(r"\s*\*\*\*\s*|\n{1,}", value.strip()) 576 | 577 | num_paragraphs = len(paragraphs) 578 | 579 | for index, paragraph in enumerate(paragraphs): 580 | if not paragraph.strip(): 581 | # If empty, reduce count if it's at start or end, otherwise return False 582 | if index == 0 or index == len(paragraphs) - 1: 583 | num_paragraphs -= 1 584 | else: 585 | return False 586 | 587 | return num_paragraphs == self._num_paragraphs 588 | 589 | 590 | class PostscriptChecker(Instruction): 591 | """Checks the postscript.""" 592 | 593 | def build_description(self, *, postscript_marker = None 594 | ): 595 | """Build the instruction description. 
596 | 597 | Args: 598 | postscript_marker: A string containing the keyword that marks the start 599 | of the postscript section. 600 | 601 | Returns: 602 | A string representing the instruction description. 603 | """ 604 | self._postscript_marker = postscript_marker.strip() if isinstance( 605 | postscript_marker, str) else postscript_marker 606 | if self._postscript_marker is None: 607 | self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) 608 | 609 | self._description_pattern = ( 610 | "At the end of your response, please explicitly add a postscript " + 611 | "starting with {postscript}") 612 | 613 | return self._description_pattern.format(postscript=self._postscript_marker) 614 | 615 | def get_instruction_args(self): 616 | """Returns the keyward args of `build_description`.""" 617 | return {"postscript_marker": self._postscript_marker} 618 | 619 | def get_instruction_args_keys(self): 620 | """Returns the args keys of `build_description`.""" 621 | return ["postscript_marker"] 622 | 623 | def check_following(self, value): 624 | """Checks if the response follows the postscript format. 625 | 626 | Args: 627 | value: a string representing the response. The response is expected to 628 | contain a postscript section. 629 | 630 | Returns: 631 | True if the response contains a postscript section starting with 632 | the keyword containing in the `instruction_args`; otherwise False. 
633 | """ 634 | value = value.lower() 635 | if self._postscript_marker == "P.P.S": 636 | postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" 637 | elif self._postscript_marker == "P.S.": 638 | postscript_pattern = r"\s*p\.\s?s\..*$" 639 | else: 640 | postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" 641 | postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) 642 | return True if postscript else False 643 | 644 | 645 | class RephraseChecker(Instruction): 646 | """Checks the repharse.""" 647 | 648 | def build_description(self, *, original_message): 649 | """Build the instruction description. 650 | 651 | Args: 652 | original_message: A string representing the original message. The 653 | rephrased response should only change its words/sentences in between 654 | its two asterisks, for example, *change me*. Both original and rephrased 655 | messages should contain the changes in the form of *change me*. 656 | 657 | Returns: 658 | A string representing the instruction description. 659 | """ 660 | if not self.is_change(original_message): 661 | raise ValueError(f"Message {original_message} does not contain changes " 662 | "in the form of *change me*.") 663 | 664 | self._reference_without_change = original_message 665 | self._description = ("Rephrasing: Your rephrased response should only" + 666 | "change the words/sentences in between two asterisks" + 667 | "such as *change me*.") 668 | return self._description 669 | 670 | def get_instruction_args(self): 671 | """Returns the keyward args of `build_description`.""" 672 | return {"original_message": self._reference_without_change} 673 | 674 | def get_instruction_args_keys(self): 675 | """Returns the args keys of `build_description`.""" 676 | return ["original_message"] 677 | 678 | def check_following(self, value): 679 | r"""Checks if the rephrasing follows the instruction. 
680 | 681 | Args: 682 | value: A string representing the response, which is expected to rephras 683 | the string of `instruction_args`. 684 | 685 | Returns: 686 | True if `value` and `instruction_args` only differ by the words/sentences 687 | in between two asterisks such as *change me*; otherwise, False. 688 | """ 689 | 690 | if not self.is_change(value): 691 | raise ValueError(f"value {value} does not contain " 692 | "changes in the form of *change me*.") 693 | 694 | response_without_changes = self.strip_changes(value) 695 | reference_without_changes = self.strip_changes( 696 | self._reference_without_change) 697 | 698 | return response_without_changes == reference_without_changes 699 | 700 | def is_change(self, response): 701 | """Check if there is change in the response in the form of *change me*.""" 702 | return re.search(r"\*.*\*", response) 703 | 704 | def strip_changes(self, response): 705 | """Strips off the changes.""" 706 | return re.sub(r"\*.*\*", "", response) 707 | 708 | 709 | class KeywordChecker(Instruction): 710 | """Check the exisitence of certain keywords.""" 711 | 712 | def build_description(self, *, keywords = None 713 | ): 714 | """Build the instruction description. 715 | 716 | Args: 717 | keywords: A sequence of strings representing the keywords that are 718 | expected in the response. 719 | 720 | Returns: 721 | A string representing the instruction description. 
722 | """ 723 | 724 | if not keywords: 725 | self._keywords = instructions_util.generate_keywords( 726 | num_keywords=_NUM_KEYWORDS) 727 | else: 728 | self._keywords = keywords 729 | self._keywords = sorted(self._keywords) 730 | 731 | self._description_pattern = ("Include keywords {keywords} in the response.") 732 | 733 | return self._description_pattern.format(keywords=self._keywords) 734 | 735 | def get_instruction_args(self): 736 | """Returns the keyward args of `build_description`.""" 737 | return {"keywords": self._keywords} 738 | 739 | def get_instruction_args_keys(self): 740 | """Returns the args keys of `build_description`.""" 741 | return ["keywords"] 742 | 743 | def check_following(self, value): 744 | """Check if the response contain the expected keywords.""" 745 | for keyword in self._keywords: 746 | if not re.search(keyword, value, flags=re.IGNORECASE): 747 | return False 748 | return True 749 | 750 | 751 | class KeywordFrequencyChecker(Instruction): 752 | """Check the keyword frequency.""" 753 | 754 | def build_description(self, *, keyword = None, 755 | frequency = None, 756 | relation = None): 757 | """Build the instruction description. 758 | 759 | Args: 760 | keyword: A string representing a keyword that is expected in the response. 761 | frequency: An integer specifying the number of times `keyword` is expected 762 | to appear in the response. 763 | relation: A string in (`less than`, `at least`), defining the relational 764 | operator for comparison. 765 | Two relational comparisons are supported for now: 766 | if 'less than', the actual number of occurrences < frequency; 767 | if 'at least', the actual number of occurrences >= frequency. 768 | 769 | Returns: 770 | A string representing the instruction description. 
771 | """ 772 | if not keyword: 773 | self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] 774 | else: 775 | self._keyword = keyword.strip() 776 | 777 | self._frequency = frequency 778 | if self._frequency is None or self._frequency < 0: 779 | self._frequency = random.randint(1, _KEYWORD_FREQUENCY) 780 | 781 | if relation is None: 782 | self._comparison_relation = random.choice(_COMPARISON_RELATION) 783 | elif relation not in _COMPARISON_RELATION: 784 | raise ValueError("The supported relation for comparison must be in " 785 | f"{_COMPARISON_RELATION}, but {relation} is given.") 786 | else: 787 | self._comparison_relation = relation 788 | 789 | self._description_pattern = ( 790 | "In your response, the word {keyword} should appear {relation} " + 791 | "{frequency} times.") 792 | 793 | return self._description_pattern.format( 794 | keyword=self._keyword, 795 | relation=self._comparison_relation, 796 | frequency=self._frequency) 797 | 798 | def get_instruction_args(self): 799 | """Returns the keyward args of `build_description`.""" 800 | return {"keyword": self._keyword, 801 | "frequency": self._frequency, 802 | "relation": self._comparison_relation} 803 | 804 | def get_instruction_args_keys(self): 805 | """Returns the args keys of `build_description`.""" 806 | return ["keyword", "frequency", "relation"] 807 | 808 | def check_following(self, value): 809 | """Checks if the response contain the keyword with required frequency.""" 810 | actual_occurrences = len(re.findall( 811 | self._keyword, value, flags=re.IGNORECASE)) 812 | 813 | if self._comparison_relation == _COMPARISON_RELATION[0]: 814 | return actual_occurrences < self._frequency 815 | elif self._comparison_relation == _COMPARISON_RELATION[1]: 816 | return actual_occurrences >= self._frequency # pytype: disable=bad-return-type 817 | 818 | 819 | class NumberOfWords(Instruction): 820 | """Checks the number of words.""" 821 | 822 | def build_description(self, *, num_words = None, 823 | relation = 
None): 824 | """Build the instruction description. 825 | 826 | Args: 827 | num_words: An integer specifying the number of words contained in the 828 | response. 829 | relation: A string in (`less than`, `at least`), defining the relational 830 | operator for comparison. 831 | Two relational comparisons are supported for now: 832 | if 'less than', the actual number of words < num_words; 833 | if 'at least', the actual number of words >= num_words. 834 | 835 | Returns: 836 | A string representing the instruction description. 837 | """ 838 | 839 | self._num_words = num_words 840 | if self._num_words is None or self._num_words < 0: 841 | self._num_words = random.randint( 842 | _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT 843 | ) 844 | 845 | if relation is None: 846 | self._comparison_relation = random.choice(_COMPARISON_RELATION) 847 | elif relation not in _COMPARISON_RELATION: 848 | raise ValueError("The supported relation for comparison must be in " 849 | f"{_COMPARISON_RELATION}, but {relation} is given.") 850 | else: 851 | self._comparison_relation = relation 852 | 853 | self._description_pattern = ( 854 | "Answer with {relation} {num_words} words.") 855 | 856 | return self._description_pattern.format( 857 | relation=self._comparison_relation, 858 | num_words=self._num_words) 859 | 860 | def get_instruction_args(self): 861 | """Returns the keyward args of `build_description`.""" 862 | return {"num_words": self._num_words, 863 | "relation": self._comparison_relation} 864 | 865 | def get_instruction_args_keys(self): 866 | """Returns the args keys of `build_description`.""" 867 | return ["num_words", "relation"] 868 | 869 | def check_following(self, value): 870 | """Checks if the response contains the expected number of words.""" 871 | num_words = instructions_util.count_words(value) 872 | 873 | if self._comparison_relation == _COMPARISON_RELATION[0]: 874 | return num_words < self._num_words 875 | elif self._comparison_relation == _COMPARISON_RELATION[1]: 876 | 
return num_words >= self._num_words # pytype: disable=bad-return-type 877 | 878 | 879 | class JsonFormat(Instruction): 880 | """Check the Json format.""" 881 | 882 | def build_description(self): 883 | self._description_pattern = ( 884 | "Entire output should be wrapped in JSON format. You can use markdown" 885 | " ticks such as ```." 886 | ) 887 | return self._description_pattern 888 | 889 | def get_instruction_args(self): 890 | """Returns the keyward args of `build_description`.""" 891 | return None 892 | 893 | def get_instruction_args_keys(self): 894 | """Returns the args keys of `build_description`.""" 895 | return [] 896 | 897 | def check_following(self, value): 898 | value = ( 899 | value.strip() 900 | .removeprefix("```json") 901 | .removeprefix("```Json") 902 | .removeprefix("```JSON") 903 | .removeprefix("```") 904 | .removesuffix("```") 905 | .strip() 906 | ) 907 | try: 908 | json.loads(value) 909 | except ValueError as _: 910 | return False 911 | return True 912 | 913 | 914 | class ParagraphFirstWordCheck(Instruction): 915 | """Check the paragraph and the first word of the nth paragraph.""" 916 | 917 | def build_description(self, num_paragraphs = None, 918 | nth_paragraph = None, 919 | first_word = None): 920 | r"""Build the instruction description. 921 | 922 | Args: 923 | num_paragraphs: An integer indicating the number of paragraphs expected 924 | in the response. A paragraph is a subset of the string that is 925 | expected to be separated by '\n\n'. 926 | nth_paragraph: An integer indicating the paragraph number that we look at. 927 | Note that n starts from 1. 928 | first_word: A string that represent the first word of the bth paragraph. 929 | 930 | Returns: 931 | A string representing the instruction description. 
932 | """ 933 | self._num_paragraphs = num_paragraphs 934 | if self._num_paragraphs is None or self._num_paragraphs < 0: 935 | self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) 936 | 937 | self._nth_paragraph = nth_paragraph 938 | if ( 939 | self._nth_paragraph is None 940 | or self._nth_paragraph <= 0 941 | or self._nth_paragraph > self._num_paragraphs 942 | ): 943 | self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) 944 | 945 | self._first_word = first_word 946 | if self._first_word is None: 947 | self._first_word = instructions_util.generate_keywords(num_keywords=1)[0] 948 | self._first_word = self._first_word.lower() 949 | 950 | self._description_pattern = ( 951 | "There should be {num_paragraphs} paragraphs. " + 952 | "Paragraphs and only paragraphs are separated with each other by two " + 953 | "new lines as if it was '\\n\\n' in python. " + 954 | "Paragraph {nth_paragraph} must start with word {first_word}.") 955 | 956 | return self._description_pattern.format( 957 | num_paragraphs=self._num_paragraphs, 958 | nth_paragraph=self._nth_paragraph, 959 | first_word=self._first_word) 960 | 961 | def get_instruction_args(self): 962 | """Returns the keyward args of `build_description`.""" 963 | return {"num_paragraphs": self._num_paragraphs, 964 | "nth_paragraph": self._nth_paragraph, 965 | "first_word": self._first_word} 966 | 967 | def get_instruction_args_keys(self): 968 | """Returns the args keys of `build_description`.""" 969 | return ["num_paragraphs", "nth_paragraph", "first_word"] 970 | 971 | def check_following(self, value): 972 | """Checks for required number of paragraphs and correct first word. 973 | 974 | Args: 975 | value: a string representing the response. The response may contain 976 | paragraphs that are separated by two new lines and the first word of 977 | the nth paragraph will have to match a specified word. 
# TODO(jeffrey) add relation - at least/at most?
class KeySentenceChecker(Instruction):
  """Check the existence of certain key sentences."""

  def build_description(self, key_sentences=None,
                        num_sentences=None):
    """Build the instruction description.

    Args:
      key_sentences: A sequence of strings representing the key sentences that
        are expected in the response. A single placeholder sentence is used
        when this is empty or None.
      num_sentences: The number of key sentences that are expected to be seen
        in the response. Chosen at random between 1 and the number of key
        sentences when falsy.

    Returns:
      A string representing the instruction description.
    """

    if not key_sentences:
      # TODO(jeffrey) make a generate sentences function? wonderwords package
      self._key_sentences = {"For now, this is fine."}
    else:
      self._key_sentences = key_sentences

    if not num_sentences:
      self._num_sentences = random.randint(1, len(self._key_sentences))
    else:
      self._num_sentences = num_sentences

    self._description_pattern = (
        "Include {num_sentences} of the following sentences {key_sentences}"
    )

    return self._description_pattern.format(
        num_sentences=self._num_sentences, key_sentences=self._key_sentences
    )

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"num_sentences": self._num_sentences,
            "key_sentences": list(self._key_sentences)}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["num_sentences", "key_sentences"]

  def check_following(self, value):
    """Checks if the response contains the expected key sentences.

    Args:
      value: A string representing the response.

    Returns:
      True if exactly `num_sentences` of the key sentences appear among the
      sentences the response is split into; False otherwise.
    """
    sentences = instructions_util.split_into_sentences(value)
    count = sum(1 for sentence in self._key_sentences if sentence in sentences)
    return count == self._num_sentences


class ForbiddenWords(Instruction):
  """Checks that specified words are not used in response."""

  def build_description(self, forbidden_words=None):
    """Build the instruction description.

    Args:
      forbidden_words: A sequence of strings representing words that are not
        allowed in the response. Keywords are generated when this is empty or
        None.

    Returns:
      A string representing the instruction description.
    """

    if not forbidden_words:
      self._forbidden_words = instructions_util.generate_keywords(
          num_keywords=_NUM_KEYWORDS)
    else:
      # De-duplicate before sorting so the description is stable.
      self._forbidden_words = list(set(forbidden_words))
    self._forbidden_words = sorted(self._forbidden_words)
    self._description_pattern = (
        "Do not include keywords {forbidden_words} in the response."
    )

    return self._description_pattern.format(
        forbidden_words=self._forbidden_words
    )

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"forbidden_words": self._forbidden_words}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["forbidden_words"]

  def check_following(self, value):
    """Check if the response does not contain the forbidden keywords.

    Args:
      value: A string representing the response.

    Returns:
      True if none of the forbidden words occurs as a whole word
      (case-insensitively) in the response; False otherwise.
    """
    for word in self._forbidden_words:
      # Escape the word so regex metacharacters ("c++", "a.b") are matched
      # literally. The lookarounds are equivalent to \b for ordinary words
      # but also behave sensibly when the word starts or ends with
      # punctuation.
      pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
      if re.search(pattern, value, flags=re.IGNORECASE):
        return False
    return True


class RephraseParagraph(Instruction):
  """Checks that the paragraph is rephrased."""

  def build_description(self, *, original_paragraph, low, high):
    """Builds the instruction description.

    Args:
      original_paragraph: A string presenting the original paragraph. The
        rephrased response should have between low and high words in common
        with it.
      low: An integer presenting the lower bound of similar words.
      high: An integer representing the upper bound of similar words.

    Returns:
      A string representing the instruction description.
    """
    # TODO(jeffrey) make more encompassing
    self._original_paragraph = original_paragraph
    self._low = low
    self._high = high

    self._description = (
        "Rephrase the following paragraph: "
        "{original_paragraph}\nYour response should have "
        "between {low} and {high} of the same words. "
        "Words are the same if and only if all of the "
        "letters, ignoring cases, are the same. For "
        "example, 'run' is the same as 'Run' but different "
        "to 'ran'."
    )

    return self._description.format(original_paragraph=original_paragraph,
                                    low=self._low, high=self._high)

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"original_paragraph": self._original_paragraph,
            "low": self._low,
            "high": self._high}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["original_paragraph", "low", "high"]

  def check_following(self, value):
    """Checks that the shared-word count lies within [low, high].

    Words are runs of `\\w` characters compared case-insensitively. Each word
    contributes the smaller of its occurrence counts in the original
    paragraph and in the response.

    Args:
      value: A string representing the response.

    Returns:
      True if low <= number of shared words <= high; False otherwise.
    """
    response_counts = collections.Counter(re.findall(r"\w+", value.lower()))
    original_counts = collections.Counter(
        re.findall(r"\w+", self._original_paragraph.lower()))

    similar_words = sum(
        min(count, response_counts[word])
        for word, count in original_counts.items()
    )

    return self._low <= similar_words <= self._high


class TwoResponsesChecker(Instruction):
  """Check that two responses were given."""

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Give two different responses. Responses and only responses should"
        " be separated by 6 asterisk symbols: ******."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks if the response has two different answers.

    Args:
      value: A string representing the response.

    Returns:
      True if two responses are detected and false otherwise.
    """
    valid_responses = []
    responses = value.split("******")
    last_index = len(responses) - 1
    for index, response in enumerate(responses):
      if response.strip():
        valid_responses.append(response)
      elif 0 < index < last_index:
        # A blank segment is tolerated only before the first separator or
        # after the last one; in the middle it means an empty response.
        return False
    return (
        len(valid_responses) == 2
        and valid_responses[0].strip() != valid_responses[1].strip()
    )


class RepeatPromptThenAnswer(Instruction):
  """Checks that Prompt is first repeated then answered."""

  def build_description(self, *, prompt_to_repeat=None):
    """Build the instruction description.

    Args:
      prompt_to_repeat: The prompt that is meant to be repeated.

    Returns:
      A string representing the instruction description.

    Raises:
      ValueError: If `prompt_to_repeat` is empty or None.
    """
    if not prompt_to_repeat:
      raise ValueError("prompt_to_repeat must be set.")
    self._prompt_to_repeat = prompt_to_repeat
    self._description_pattern = (
        "First repeat the request word for word without change,"
        " then give your answer (1. do not say any words or characters"
        " before repeating the request; 2. the request you need to repeat"
        " does not include this sentence)"
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"prompt_to_repeat": self._prompt_to_repeat}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["prompt_to_repeat"]

  def check_following(self, value):
    """Checks that the response starts with the prompt to repeat.

    The comparison ignores case, surrounding whitespace, and any single or
    double quote characters wrapped around the response.

    Args:
      value: A string representing the response.

    Returns:
      True if the normalized response starts with the normalized prompt.
    """
    response = value.strip().lower().strip("\"\'")
    return response.startswith(self._prompt_to_repeat.strip().lower())


class EndChecker(Instruction):
  """Checks that the response ends with a given phrase."""

  def build_description(self, *, end_phrase=None):
    """Build the instruction description.

    Args:
      end_phrase: A string representing the phrase the response should end
        with. One of `_ENDING_OPTIONS` is picked at random when None.

    Returns:
      A string representing the instruction description.
    """
    self._end_phrase = (
        end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
    )
    if self._end_phrase is None:
      self._end_phrase = random.choice(_ENDING_OPTIONS)
    self._description_pattern = (
        "Finish your response with this exact phrase {ender}. "
        "No other words should follow this phrase.")
    return self._description_pattern.format(ender=self._end_phrase)

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"end_phrase": self._end_phrase}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["end_phrase"]

  def check_following(self, value):
    """Checks if the response ends with the expected phrase.

    The comparison is case-insensitive and ignores surrounding whitespace and
    double quotes wrapped around the response.

    Args:
      value: A string representing the response.

    Returns:
      True if the response ends with the end phrase; False otherwise.
    """
    # Normalize into locals: the stored phrase must stay untouched so that
    # `get_instruction_args` keeps returning what `build_description` set.
    phrase = self._end_phrase.strip().lower()
    response = value.strip().lower()
    # The second test covers phrases that themselves end with a double quote,
    # which the quote-stripping would otherwise cut off.
    return response.strip("\"").endswith(phrase) or response.endswith(phrase)


class TitleChecker(Instruction):
  """Checks the response for a title."""

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Your answer must contain a title, wrapped in double angular brackets,"
        " such as <<poem of joy>>."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks if the response contains a title.

    Args:
      value: A string representing the response.

    Returns:
      True if some single-line `<<...>>` span has non-blank content once the
      angle brackets are removed; False otherwise.
    """
    titles = re.findall(r"<<[^\n]+>>", value)
    return any(title.lstrip("<").rstrip(">").strip() for title in titles)


class LetterFrequencyChecker(Instruction):
  """Checks letter frequency."""

  def build_description(self, *, letter=None,
                        let_frequency=None,
                        let_relation=None):
    """Build the instruction description.

    Args:
      letter: A string representing a letter that is expected in the response.
        A random ASCII letter is used when this is missing, longer than one
        character, or not an ASCII letter.
      let_frequency: An integer specifying the number of times `letter` is
        expected to appear in the response.
      let_relation: A string in (`less than`, `at least`), defining the
        relational operator for comparison. Two relational comparisons are
        supported for now; if 'less than', the actual number of
        occurrences < frequency; if 'at least', the actual number of
        occurrences >= frequency.

    Returns:
      A string representing the instruction description.

    Raises:
      ValueError: If `let_relation` is given but not in `_COMPARISON_RELATION`.
    """
    if (
        not letter
        or len(letter) > 1
        # Membership test instead of ord(): some characters lowercase to more
        # than one code point, which would make ord() raise TypeError.
        or letter.lower() not in string.ascii_lowercase
    ):
      self._letter = random.choice(list(string.ascii_letters))
    else:
      self._letter = letter.strip()
    self._letter = self._letter.lower()

    self._frequency = let_frequency
    if self._frequency is None or self._frequency < 0:
      self._frequency = random.randint(1, _LETTER_FREQUENCY)

    if let_relation is None:
      self._comparison_relation = random.choice(_COMPARISON_RELATION)
    elif let_relation not in _COMPARISON_RELATION:
      raise ValueError(
          "The supported relation for comparison must be in "
          f"{_COMPARISON_RELATION}, but {let_relation} is given."
      )
    else:
      self._comparison_relation = let_relation

    self._description_pattern = (
        "In your response, the letter {letter} should appear {let_relation}"
        " {let_frequency} times."
    )

    return self._description_pattern.format(
        letter=self._letter,
        let_frequency=self._frequency,
        let_relation=self._comparison_relation,
    )

  def get_instruction_args(self):
    """Returns the keyword args of build description."""
    return {"letter": self._letter,
            "let_frequency": self._frequency,
            "let_relation": self._comparison_relation}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["letter", "let_frequency", "let_relation"]

  def check_following(self, value):
    """Checks that the response contains the letter at the right frequency.

    Args:
      value: A string representing the response.

    Returns:
      True if the case-insensitive count of the letter satisfies the
      configured relation; False otherwise.
    """
    occurrences = collections.Counter(value.lower())[self._letter]

    # The first entry of _COMPARISON_RELATION is the strict upper bound.
    if self._comparison_relation == _COMPARISON_RELATION[0]:
      return occurrences < self._frequency
    return occurrences >= self._frequency


class CapitalLettersEnglishChecker(Instruction):
  """Checks that the response is in all capital letters.

  The instruction text also asks for English, but language detection is
  switched off in `check_following`, so only the letter case is verified.
  """

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Your entire response should be in English, and in all capital letters."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks that the response is in all capital letters.

    Args:
      value: A string representing the response.

    Returns:
      The result of `value.isupper()`.
    """
    assert isinstance(value, str)

    # The langdetect-based English check is deliberately disabled, so there
    # is no language-detection error left to handle here.
    return value.isupper()


class LowercaseLettersEnglishChecker(Instruction):
  """Checks that the response is in all lowercase letters.

  The instruction text also asks for English, but language detection is
  switched off in `check_following`, so only the letter case is verified.
  """

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Your entire response should be in English, and in all lowercase"
        " letters. No capital letters are allowed."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks that the response is in all lowercase letters.

    Args:
      value: A string representing the response.

    Returns:
      The result of `value.islower()`.
    """
    assert isinstance(value, str)

    # The langdetect-based English check is deliberately disabled, so there
    # is no language-detection error left to handle here.
    return value.islower()


class CommaChecker(Instruction):
  """Checks the response for no commas."""

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "In your entire response, refrain from the use of any commas."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks that the response does not contain commas."""
    return "," not in value


class CapitalWordFrequencyChecker(Instruction):
  """Checks frequency of words with all capital letters."""

  def build_description(
      self,
      capital_frequency=None,
      capital_relation=None,
  ):
    """Build the instruction description.

    Args:
      capital_frequency: An integer that represents the number of words that
        should be in all capital letters. Chosen at random when None.
      capital_relation: A string from `_COMPARISON_RELATION` that refers to
        the frequency. The first entry is applied as a strict upper bound
        (count < frequency); any other entry as a lower bound
        (count >= frequency). Chosen at random when None.

    Returns:
      A string representing the instruction description.

    Raises:
      ValueError: If `capital_relation` is given but not in
        `_COMPARISON_RELATION`.
    """
    self._frequency = capital_frequency
    if self._frequency is None:
      self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY)

    self._comparison_relation = capital_relation
    if capital_relation is None:
      self._comparison_relation = random.choice(_COMPARISON_RELATION)
    elif capital_relation not in _COMPARISON_RELATION:
      raise ValueError(
          "The supported relation for comparison must be in "
          f"{_COMPARISON_RELATION}, but {capital_relation} is given."
      )

    self._description_pattern = (
        "In your response, words with all capital letters should appear"
        " {relation} {frequency} times."
    )

    return self._description_pattern.format(
        frequency=self._frequency, relation=self._comparison_relation
    )

  def get_instruction_args(self):
    """Returns the keyword args of build description."""
    return {
        "capital_frequency": self._frequency,
        "capital_relation": self._comparison_relation,
    }

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["capital_frequency", "capital_relation"]

  def check_following(self, value):
    """Checks the frequency of words with all capital letters.

    Args:
      value: A string representing the response.

    Returns:
      True if the number of all-uppercase tokens satisfies the configured
      relation; False otherwise.
    """
    # Hyphenated words will count as one word
    words = instructions_util.nltk.word_tokenize(value)
    capital_words = sum(1 for word in words if word.isupper())

    if self._comparison_relation == _COMPARISON_RELATION[0]:
      return capital_words < self._frequency
    return capital_words >= self._frequency


class QuotationChecker(Instruction):
  """Checks response is wrapped with double quotation marks."""

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Wrap your entire response with double quotation marks."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of build description."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks if the response is wrapped with double quotation marks.

    Args:
      value: A string representing the response.

    Returns:
      True if the stripped response has at least two characters and both its
      first and last characters are double quotes; False otherwise.
    """
    value = value.strip()
    return len(value) > 1 and value[0] == '"' and value[-1] == '"'