├── requirements.txt
├── instruction_following_eval
    ├── requirements.txt
    ├── data
    │   ├── input_data2.jsonl
    │   ├── temp.jsonl
    │   └── debug.jsonl
    ├── run.sh
    ├── README.md
    ├── instructions_util_test.py
    ├── instructions_registry.py
    ├── evaluation_main.py
    ├── instructions_util.py
    ├── instructions_test.py
    └── instructions.py
├── .gitignore
├── examples
    ├── eval_desta2.py
    └── eval_llama3_8B_instruct.py
├── README.md
└── script
    └── llm_evaluation.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py
2 | langdetect
3 | nltk
4 | immutabledict
5 | openai
6 | transformers


--------------------------------------------------------------------------------
/instruction_following_eval/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py
2 | langdetect
3 | nltk
4 | immutabledict
5 | 


--------------------------------------------------------------------------------
/instruction_following_eval/data/input_data2.jsonl:
--------------------------------------------------------------------------------
1 | {"key": 1000, "prompt": "Write a 300+ word summary of the wikipedia page \"https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli\". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.", "instruction_id_list": ["punctuation:no_comma", "detectable_format:number_highlighted_sections", "length_constraints:number_words"], "kwargs": [{}, {"num_highlights": 3}, {"relation": "at least", "num_words": 300}]}


--------------------------------------------------------------------------------
/instruction_following_eval/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2024 The Google Research Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Evaluate model responses; run from the parent folder of instruction_following_eval.
17 | python3 -m instruction_following_eval.evaluation_main \
18 |   --input_data=./instruction_following_eval/data/input_data.jsonl \
19 |   --input_response_data=./instruction_following_eval/data/input_response_data_gpt4_20231107_145030.jsonl \
20 |   --output_dir=./instruction_following_eval/data/
21 | 
22 | exit 0  # NOTE(review): always reports success, even when the evaluation fails.


--------------------------------------------------------------------------------
/instruction_following_eval/data/temp.jsonl:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | {"key": 3, "prompt": "Make sure to wrap your entire response with double quotes.", "instruction_id_list": ["startend:quotation"], "kwargs": [{}]}
 6 | 
 7 | 
 8 | 
 9 | {"key": 1216, "prompt": "Make sure to wrap your entire response with double quotes", "instruction_id_list": ["startend:quotation"], "kwargs": [{}]}
10 | {"key": 1220, "prompt": "End your entire response with the exact phrase \"Is there anything else I can help with?\"", "instruction_id_list": ["startend:end_checker"], "kwargs": [{"end_phrase": "Is there anything else I can help with?"}]}
11 | {"key": 1221, "prompt": "Start your answer with \"Answer:\"", "instruction_id_list": ["combination:repeat_prompt"], "kwargs": [{"prompt_to_repeat": "Answer:"}]}
12 | {"key": 122, "prompt": "Wrap your entire response with double quotation marks.", "instruction_id_list": ["startend:quotation"], "kwargs": [{}]}
13 | {"key": 1148, "prompt": "Wrap the entire output in JSON format.", "instruction_id_list": ["detectable_format:json_format"], "kwargs": [{}]}
14 | {"key": 13, "prompt": "Please wrap your entire answer in JSON format. You can use markdown ticks such as ```. For example:\n```JSON\n{\n...\n}\n```", "instruction_id_list": ["detectable_format:json_format"], "kwargs": [{}]}
15 | {"key": 14, "prompt": "Please wrap your answer in JSON format with the key: \"answer\"", "instruction_id_list": ["detectable_format:json_format", "keywords:existence"], "kwargs": [{}, {"keywords": ["answer"]}]}
16 | 
17 | 


--------------------------------------------------------------------------------
/instruction_following_eval/README.md:
--------------------------------------------------------------------------------
 1 | # IFEval: Instruction Following Eval
 2 | 
 3 | This is not an officially supported Google product.
 4 | 
 5 | This repository contains source code and data for
 6 | [Instruction Following Evaluation for Large Language Models](https://arxiv.org/abs/2311.07911)
 7 | 
 8 | ## Dependencies
 9 | 
10 | Please make sure that all required python packages are installed via:
11 | 
12 | ```
13 | pip3 install -r requirements.txt
14 | ```
15 | 
16 | ## How to run
17 | 
18 | You need to create a jsonl file in which every line is a JSON object with two
19 | fields: `prompt` and `response`. Then, call `evaluation_main` from the parent
20 | folder of instruction_following_eval. For example:
21 | 
22 | ```bash
23 | # Content of `--input_response_data` should be like:
24 | # {"prompt": "Write a 300+ word summary ...", "response": "PUT YOUR MODEL RESPONSE HERE"}
25 | # {"prompt": "I am planning a trip to ...", "response": "PUT YOUR MODEL RESPONSE HERE"}
26 | # ...
27 | python3 -m instruction_following_eval.evaluation_main \
28 |   --input_data=./instruction_following_eval/data/input_data.jsonl \
29 |   --input_response_data=./instruction_following_eval/data/input_response_data_gpt4_20231107_145030.jsonl \
30 |   --output_dir=./instruction_following_eval/data/
31 | ```
32 | 
33 | ## Reference
34 | 
35 | If you use our work, please consider citing our preprint:
36 | 
37 | ```
38 | @article{zhou2023instruction,
39 |   title={Instruction-Following Evaluation for Large Language Models},
40 |   author={Zhou, Jeffrey and Lu, Tianjian and Mishra, Swaroop and Brahma, Siddhartha and Basu, Sujoy and Luan, Yi and Zhou, Denny and Hou, Le},
41 |   journal={arXiv preprint arXiv:2311.07911},
42 |   year={2023}
43 | }
44 | ```


--------------------------------------------------------------------------------
/instruction_following_eval/data/debug.jsonl:
--------------------------------------------------------------------------------
1 | {"key": 1000, "prompt": "Write a 300+ word summary of the wikipedia page \"https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli\". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.", "response": "Raymond III was the Count of Tripoli from 1152 to 1187 and Prince of Galilee and Tiberias in the Kingdom of Jerusalem. He was born in 1140 to Raymond II of Tripoli and Hodierna of Jerusalem. His mother was the daughter of Baldwin II of Jerusalem. \n\n*Early Life and Succession*\n\nRaymond III was only a child when his father was murdered. His mother Hodierna was regent until Raymond came of age. In 1155 Raymond married Eschiva the daughter of Walter I of Beirut. They had three children: Raymond IV Bertrand and a daughter who married Guy of Lusignan. \n\n*Reign*\n\nRaymond III's reign was marked by a series of conflicts and alliances with the Muslim world. He was involved in the defense of the Kingdom of Jerusalem against Nur ad-Din and later Saladin. He was also a key figure in the internal politics of the kingdom. He was a supporter of the queen mother Amalric of Jerusalem and opposed the succession of Guy of Lusignan. \n\n*Later Life and Death*\n\nIn 1187 Raymond III was part of the disastrous Battle of Hattin where the Christian forces were defeated by Saladin. He was one of the few to escape the battlefield but died later that year. His son Raymond IV succeeded him as Count of Tripoli. \n\nRaymond III's life and reign were marked by the complex politics and warfare of the Crusader states. He was a key figure in the defense of the Kingdom of Jerusalem and his death marked a significant loss for the Christian forces in the region. His legacy is a testament to the turbulent times in which he lived and the challenges faced by the Crusader states in their relations with the Muslim world."}
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | # Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyderworkspace
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # pytype static type analyzer
132 | .pytype/
133 | 
134 | # Cython debug symbols
135 | cython_debug/


--------------------------------------------------------------------------------
/examples/eval_desta2.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import json
 4 | import torch
 5 | from transformers import AutoModel
 6 | from pathlib import Path
 7 | import logging
 8 | from tqdm import tqdm
 9 | 
10 | def arg_parser():
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument("--model_id", type=str, default="DeSTA-ntu/DeSTA2-8B-beta")
13 |     
14 |     parser.add_argument("--data_dir", type=str, default="./data")
15 |     parser.add_argument("--output_dir", type=str, default="./outputs")
16 |     
17 |     return parser.parse_args()
18 | 
19 | def get_response(args, data, model, tokenizer=None):
20 |     instruction = data["instruction"]
21 | 
22 |     # TODO: Replace with actual model inference logic
23 |     messages = [
24 |                 {"role": "system", "content": "Follow the given instructions."},
25 |                 {"role": "audio", "content": args.data_dir + "/audios/" + data["audio_filepath"]},
26 |                 {"role": "user", "content": f"{instruction}"},
27 |             ]
28 | 
29 |     generated_ids = model.chat(messages, max_new_tokens=2048, do_sample=False, temperature=1, top_p=1.0)
30 | 
31 |     response = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
32 | 
33 |     return response, messages
34 | 
35 | def main(args):
36 |     data_dir = Path(args.data_dir)
37 |     output_dir = Path(args.output_dir) / args.model_id.replace("/", "--")
38 | 
39 |     if not os.path.exists(output_dir):
40 |         os.makedirs(output_dir, exist_ok=True)
41 | 
42 |     
43 |     manifest_paths = [
44 |         Path(data_dir) / "eval_data/closed_ended_questions.jsonl",
45 |         Path(data_dir) / "eval_data/creative_writing.jsonl",
46 |         Path(data_dir) / "eval_data/chain-of-thought.jsonl",
47 |         Path(data_dir) / "eval_data/closed_ended_questions-woprompt.jsonl",
48 |     ]
49 | 
50 |     # Load model
51 |     model = AutoModel.from_pretrained(args.model_id, trust_remote_code=True, cache_dir=os.getenv("HF_HOME"), token=os.getenv("HF_TOKEN"))
52 |     model.to("cuda")
53 | 
54 |     for manifest_path in manifest_paths:
55 |         output_file = output_dir / manifest_path.name
56 | 
57 |         # logging to a file path that is the same as the manifest file
58 |         logging.basicConfig(filename=output_dir / f"{manifest_path.stem}.log", level=logging.INFO)
59 |         
60 |         logging.info(f"Processing {manifest_path}")
61 |         logging.info(f"Output file: {output_file}")
62 | 
63 |         with manifest_path.open("r") as fin, output_file.open("w") as fout:
64 |             datas = [json.loads(line) for line in fin.readlines()]
65 | 
66 | 
67 |             for data in tqdm(datas):
68 |                 
69 |                 response, messages = get_response(args, data, model)
70 |                 
71 |                 data["messages"] = messages
72 |                 data["response"] = response  # Add response to data
73 |                 
74 |                 fout.write(json.dumps(data) + "\n")
75 |                 logging.info(json.dumps(data))
76 | 
77 | if __name__ == "__main__":  # Only run when executed as a script.
78 |     args = arg_parser()
79 |     main(args)


--------------------------------------------------------------------------------
/examples/eval_llama3_8B_instruct.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import json
 4 | import torch
 5 | from transformers import AutoTokenizer, AutoModelForCausalLM
 6 | from pathlib import Path
 7 | import logging
 8 | from tqdm import tqdm
 9 | 
10 | def arg_parser():
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument("--model_id", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
13 |     
14 |     parser.add_argument("--data_dir", type=str, default="./data")
15 |     parser.add_argument("--output_dir", type=str, default="./outputs")
16 |     
17 |     return parser.parse_args()
18 | 
19 | def main(args):
20 |     data_dir = Path(args.data_dir)
21 |     output_dir = Path(args.output_dir) / args.model_id.replace("/", "--")
22 |     (output_dir / "logs").mkdir(parents=True, exist_ok=True)
23 | 
24 |     
25 |     manifest_paths = [
26 |         Path(data_dir) / "eval_data/closed_ended_questions.jsonl",
27 |         Path(data_dir) / "eval_data/creative_writing.jsonl",
28 |         Path(data_dir) / "eval_data/chain-of-thought.jsonl",
29 |         Path(data_dir) / "eval_data/closed_ended_questions-woprompt.jsonl",
30 |     ]
31 | 
32 |     # Load model
33 |     tokenizer = AutoTokenizer.from_pretrained(args.model_id)
34 |     tokenizer.padding_side = "left"
35 |     tokenizer.pad_token_id = tokenizer.eos_token_id
36 |     model = AutoModelForCausalLM.from_pretrained(
37 |         args.model_id,
38 |         torch_dtype=torch.bfloat16,
39 |         device_map="auto",
40 |         cache_dir=os.getenv("HF_HOME"),
41 |         token=os.getenv("HF_TOKEN"),
42 |     )
43 | 
44 |     for manifest_path in manifest_paths:
45 |         output_file = output_dir / manifest_path.name
46 | 
47 |         # logging to a file path that is the same as the manifest file
48 |         logging.basicConfig(filename=output_dir / f"{manifest_path.stem}.log", level=logging.INFO)
49 |         
50 |         logging.info(f"Processing {manifest_path}")
51 |         logging.info(f"Output file: {output_file}")
52 | 
53 |         with manifest_path.open("r") as fin, output_file.open("w") as fout:
54 |             datas = [json.loads(line) for line in fin.readlines()]
55 | 
56 | 
57 |             for data in tqdm(datas):
58 |                 instruction = data["instruction"]
59 |                 textual_audio = data["textual_audio"]
60 | 
61 |                 # TODO: Replace with actual model inference logic
62 |                 content = f"""Speech Input: {textual_audio}\n\n{instruction}"""
63 |                 messages = [
64 |                     {"role": "system", "content": "Follow the given instructions."},
65 |                     {"role": "user", "content": content},
66 |                 ]
67 |                 input_ids = tokenizer.apply_chat_template(
68 |                     messages,
69 |                     add_generation_prompt=True,
70 |                     return_tensors="pt",
71 |                 ).to(model.device)
72 | 
73 |                 terminators = [
74 |                     tokenizer.eos_token_id,
75 |                     tokenizer.convert_tokens_to_ids("<|eot_id|>")
76 |                 ]
77 | 
78 |                 outputs = model.generate(
79 |                     input_ids,
80 |                     max_new_tokens=2048,
81 |                     eos_token_id=terminators,
82 |                     pad_token_id=tokenizer.eos_token_id,
83 |                     do_sample=False,
84 |                     temperature=1.0,
85 |                     top_p=1.0,
86 |                 )
87 |                 
88 |                 response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
89 |                 data["messages"] = messages
90 |                 data["response"] = response  # Add response to data
91 |                 
92 |                 fout.write(json.dumps(data) + "\n")
93 |                 logging.info(json.dumps(data))
94 | 
95 |                 break
96 | 
97 | if __name__ == "__main__":  # Only run when executed as a script.
98 |     args = arg_parser()
99 |     main(args)


--------------------------------------------------------------------------------
/instruction_following_eval/instructions_util_test.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2024 The Google Research Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Test for utility library of instructions."""
 17 | 
 18 | from absl.testing import absltest
 19 | from absl.testing import parameterized
 20 | from instruction_following_eval import instructions_util
 21 | 
 22 | 
 23 | class InstructionsUtilTest(parameterized.TestCase):
 24 | 
 25 |   TEST_WORD_COUNT_CASE_1 = ("word1, word2, word3, word4.", 4)
 26 | 
 27 |   TEST_WORD_COUNT_CASE_2 = (
 28 |       """
 29 |       Bard can you tell me which is the best optimization method for the
 30 |       transition from an hydro-thermal system to an hydro-renewables system""",
 31 |       24)
 32 | 
 33 |   TEST_WORD_COUNT_CASE_3 = (
 34 |       """
 35 |       Hyphenated-word has two word counts.
 36 |       """, 6)
 37 | 
 38 |   def test_word_count(self):
 39 |     """Tests word counter."""
 40 |     with self.subTest(f"{self.TEST_WORD_COUNT_CASE_1[0]}"):
 41 |       text, expected_num_words = self.TEST_WORD_COUNT_CASE_1
 42 |       actual_num_words = instructions_util.count_words(text)
 43 |       self.assertEqual(expected_num_words, actual_num_words)
 44 | 
 45 |     with self.subTest(f"{self.TEST_WORD_COUNT_CASE_2[0]}"):
 46 |       text, expected_num_words = self.TEST_WORD_COUNT_CASE_2
 47 |       actual_num_words = instructions_util.count_words(text)
 48 |       self.assertEqual(expected_num_words, actual_num_words)
 49 | 
 50 |     with self.subTest(f"{self.TEST_WORD_COUNT_CASE_3[0]}"):
 51 |       text, expected_num_words = self.TEST_WORD_COUNT_CASE_3
 52 |       actual_num_words = instructions_util.count_words(text)
 53 |       self.assertEqual(expected_num_words, actual_num_words)
 54 | 
 55 |   @parameterized.named_parameters(
 56 |       [
 57 |           {  # pylint: disable=g-complex-comprehension
 58 |               "testcase_name": (
 59 |                   f"_response={response}_num_sentences={num_sentences}"
 60 |               ),
 61 |               "response": response,
 62 |               "num_sentences": num_sentences,
 63 |           }
 64 |           for response, num_sentences in [
 65 |               ("xx,x. xx,x! xx/x. x{x}x? x.", 5),
 66 |               ("xx,x! xxxx. x(x)x?", 3),
 67 |               ("xxxx. xx,x! xx|x. x&x x?", 4),
 68 |               ("xx-x]xx,x! x{x}xx,x.", 2),
 69 |           ]
 70 |       ]
 71 |   )
 72 |   def test_count_sentences(self, response, num_sentences):
 73 |     """Tests sentence counter."""
 74 |     actual_num_sentences = instructions_util.count_sentences(response)
 75 |     self.assertEqual(num_sentences, actual_num_sentences)
 76 | 
 77 |   TEST_SENTENCE_SPLIT_1 = """
 78 |   Google is a technology company. It was founded in 1998 by Larry Page
 79 | and Sergey Brin. Google's mission is to organize the world's information
 80 | and make it universally accessible and useful.
 81 |   """
 82 | 
 83 |   TEST_SENTENCE_SPLIT_2 = """
 84 |   The U.S.A has many Ph.D. students. They will often haven a .com website
 85 | sharing the research that they have done.
 86 |   """
 87 | 
 88 |   EXPECTED_SENTENCE_SPLIT_1 = [
 89 |       "Google is a technology company.",
 90 |       "It was founded in 1998 by Larry Page and Sergey Brin.",
 91 |       (
 92 |           "Google's mission is to organize the world's information and make it"
 93 |           " universally accessible and useful."
 94 |       ),
 95 |   ]
 96 | 
 97 |   EXPECTED_SENTENCE_SPLIT_2 = [
 98 |       "The U.S.A has many Ph.D. students.",
 99 |       (
100 |           "They will often haven a .com website sharing the research that they"
101 |           " have done."
102 |       ),
103 |   ]
104 | 
105 |   def test_sentence_splitter(self):
106 |     """Tests sentence splitter."""
107 |     sentence_split_1 = instructions_util.split_into_sentences(
108 |         self.TEST_SENTENCE_SPLIT_1
109 |     )
110 |     sentence_split_2 = instructions_util.split_into_sentences(
111 |         self.TEST_SENTENCE_SPLIT_2
112 |     )
113 | 
114 |     self.assertEqual(self.EXPECTED_SENTENCE_SPLIT_1, sentence_split_1)
115 |     self.assertEqual(self.EXPECTED_SENTENCE_SPLIT_2, sentence_split_2)
116 | 
117 |   def test_generate_keywords(self):
118 |     """Tests generate keywords."""
119 |     self.assertLen(instructions_util.generate_keywords(10), 10)
120 | 
121 | 
122 | if __name__ == "__main__":  # Only run the tests when executed as a script.
123 |   absltest.main()
124 | 


--------------------------------------------------------------------------------
/instruction_following_eval/instructions_registry.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2024 The Google Research Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Registry of all instructions."""
 17 | from instruction_following_eval import instructions
 18 | 
 19 | _KEYWORD = "keywords:"  # Group prefixes: an instruction id is "<prefix><name>".
 20 | 
 21 | _LANGUAGE = "language:"
 22 | 
 23 | _LENGTH = "length_constraints:"
 24 | 
 25 | _CONTENT = "detectable_content:"
 26 | 
 27 | _FORMAT = "detectable_format:"
 28 | 
 29 | _MULTITURN = "multi-turn:"  # In this module only commented-out entries mention it.
 30 | 
 31 | _COMBINATION = "combination:"
 32 | 
 33 | _STARTEND = "startend:"
 34 | 
 35 | _CHANGE_CASES = "change_case:"
 36 | 
 37 | _PUNCTUATION = "punctuation:"
 38 | # Maps every enabled instruction id to its checker from `instructions`.
 39 | INSTRUCTION_DICT = {
 40 |     _KEYWORD + "existence": instructions.KeywordChecker,
 41 |     _KEYWORD + "frequency": instructions.KeywordFrequencyChecker,
 42 |     # TODO(jeffreyzhou): make a proper set of sentences to choose from
 43 |     # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
 44 |     _KEYWORD + "forbidden_words": instructions.ForbiddenWords,
 45 |     _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker,
 46 |     _LANGUAGE + "response_language": instructions.ResponseLanguageChecker,
 47 |     _LENGTH + "number_sentences": instructions.NumberOfSentences,
 48 |     _LENGTH + "number_paragraphs": instructions.ParagraphChecker,
 49 |     _LENGTH + "number_words": instructions.NumberOfWords,
 50 |     _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
 51 |     _CONTENT + "number_placeholders": instructions.PlaceholderChecker,
 52 |     _CONTENT + "postscript": instructions.PostscriptChecker,
 53 |     _FORMAT + "number_bullet_lists": instructions.BulletListChecker,
 54 |     # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
 55 |     # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
 56 |     _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker,
 57 |     _FORMAT + "number_highlighted_sections": (
 58 |         instructions.HighlightSectionChecker),
 59 |     _FORMAT + "multiple_sections": instructions.SectionChecker,
 60 |     # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
 61 |     # _FORMAT + "rephrase": instructions.RephraseChecker,
 62 |     _FORMAT + "json_format": instructions.JsonFormat,
 63 |     _FORMAT + "title": instructions.TitleChecker,
 64 |     # TODO(tianjianlu): Re-enable with specific prompts.
 65 |     # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
 66 |     _COMBINATION + "two_responses": instructions.TwoResponsesChecker,
 67 |     _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer,
 68 |     _STARTEND + "end_checker": instructions.EndChecker,
 69 |     _CHANGE_CASES
 70 |     + "capital_word_frequency": instructions.CapitalWordFrequencyChecker,
 71 |     _CHANGE_CASES
 72 |     + "english_capital": instructions.CapitalLettersEnglishChecker,
 73 |     _CHANGE_CASES
 74 |     + "english_lowercase": instructions.LowercaseLettersEnglishChecker,
 75 |     _PUNCTUATION + "no_comma": instructions.CommaChecker,
 76 |     _STARTEND + "quotation": instructions.QuotationChecker,
 77 | }
 78 | 
 79 | INSTRUCTION_CONFLICTS = {
 80 |     _KEYWORD + "existence": {_KEYWORD + "existence"},
 81 |     _KEYWORD + "frequency": {_KEYWORD + "frequency"},
 82 |     # TODO(jeffreyzhou): make a proper set of sentences to choose from
 83 |     # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
 84 |     _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
 85 |     _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
 86 |     _LANGUAGE
 87 |     + "response_language": {
 88 |         _LANGUAGE + "response_language",
 89 |         _FORMAT + "multiple_sections",
 90 |         _KEYWORD + "existence",
 91 |         _KEYWORD + "frequency",
 92 |         _KEYWORD + "forbidden_words",
 93 |         _STARTEND + "end_checker",
 94 |         _CHANGE_CASES + "english_capital",
 95 |         _CHANGE_CASES + "english_lowercase",
 96 |     },
 97 |     _LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
 98 |     _LENGTH + "number_paragraphs": {
 99 |         _LENGTH + "number_paragraphs",
100 |         _LENGTH + "nth_paragraph_first_word",
101 |         _LENGTH + "number_sentences",
102 |         _LENGTH + "nth_paragraph_first_word",
103 |     },
104 |     _LENGTH + "number_words": {_LENGTH + "number_words"},
105 |     _LENGTH + "nth_paragraph_first_word": {
106 |         _LENGTH + "nth_paragraph_first_word",
107 |         _LENGTH + "number_paragraphs",
108 |     },
109 |     _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"},
110 |     _CONTENT + "postscript": {_CONTENT + "postscript"},
111 |     _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"},
112 |     # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
113 |     # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
114 |     _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()),
115 |     _FORMAT
116 |     + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
117 |     _FORMAT
118 |     + "multiple_sections": {
119 |         _FORMAT + "multiple_sections",
120 |         _LANGUAGE + "response_language",
121 |         _FORMAT + "number_highlighted_sections",
122 |     },
123 |     # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
124 |     # _FORMAT + "rephrase": instructions.RephraseChecker,
125 |     _FORMAT
126 |     + "json_format": set(INSTRUCTION_DICT.keys()).difference(
127 |         {_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
128 |     ),
129 |     _FORMAT + "title": {_FORMAT + "title"},
130 |     # TODO(tianjianlu): Re-enable with specific prompts.
131 |     # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
132 |     _COMBINATION
133 |     + "two_responses": set(INSTRUCTION_DICT.keys()).difference({
134 |         _KEYWORD + "forbidden_words",
135 |         _KEYWORD + "existence",
136 |         _LANGUAGE + "response_language",
137 |         _FORMAT + "title",
138 |         _PUNCTUATION + "no_comma"
139 |     }),
140 |     _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference({
141 |         _KEYWORD + "existence",
142 |         _FORMAT + "title",
143 |         _PUNCTUATION + "no_comma"
144 |     }),
145 |     _STARTEND + "end_checker": {_STARTEND + "end_checker"},
146 |     _CHANGE_CASES + "capital_word_frequency": {
147 |         _CHANGE_CASES + "capital_word_frequency",
148 |         _CHANGE_CASES + "english_lowercase",
149 |         _CHANGE_CASES + "english_capital",
150 |     },
151 |     _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
152 |     _CHANGE_CASES + "english_lowercase": {
153 |         _CHANGE_CASES + "english_lowercase",
154 |         _CHANGE_CASES + "english_capital",
155 |     },
156 |     _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"},
157 |     _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"},
158 | }
159 | 
160 | 
def conflict_make(conflicts):
  """Symmetrises the conflict table in place and makes it reflexive.

  Args:
    conflicts: Dictionary mapping an instruction id to the set of instruction
      ids it conflicts with. Every id that appears inside a value set must
      also be a key of the dictionary.

  Returns:
    The same dictionary object, updated so that every instruction conflicts
    with itself and, whenever A lists B as a conflict, B also lists A.
  """
  for instruction_id, conflicting_ids in conflicts.items():
    # Mirror each edge onto the other endpoint.
    for other_id in conflicting_ids:
      conflicts[other_id].add(instruction_id)
    # Self-conflict is added after the scan so the set is not resized while
    # it is being iterated.
    conflicting_ids.add(instruction_id)
  return conflicts
177 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Speech-IFEval: Evaluating Instruction-Following and Quantifying Catastrophic Forgetting in Speech-Aware Language Models
  3 | 
  4 | - Ke-Han Lu, Chun-Yi Kuan and Hung-yi Lee
  5 | - National Taiwan University
  6 | - Accepted to **Interspeech 2025**
  7 | 
  8 | [![](https://img.shields.io/badge/arxiv-2505.19037-brightgreen)](https://arxiv.org/abs/2505.19037)
  9 | 
 10 | - ⁉️ Most speech-aware language models (SLMs) are built from an instruction-tuned LLM, but we found they cannot follow even simple output constraints!
 11 | - 🤔 The catastrophic forgetting problem is often observed in SLM development, but we don't have an evaluation metric to measure it!
 12 | 
 13 | ![](https://github.com/user-attachments/assets/6f13b716-5bfa-42b6-af8d-efeba49cc8eb)
 14 | 
 15 | ## 🏆 Leaderboard
 16 | 
 17 | | Rank | Model                         | Closed-ended (%) | Creative Writing (%) | CoT (%) | **IFrate (%)** | **Δ (Forgetting Rate)** |
 18 | |------|-------------------------------|------------------|-----------------------|---------|----------------|--------------------------|
 19 | || **SLMs**||||||
 20 | | 1    | [DeSTA2](https://arxiv.org/abs/2409.20007)            | 83.71               | 91.75                 | 91.50   | **89.23**       | **-3.57**               |
 21 | | 2    | [DiVA](https://arxiv.org/abs/2410.02678)              | 83.14               | 61.75                 | 83.50   | 76.13           | -17.73                  |
 22 | | 3    | [BLSP-emo](https://arxiv.org/abs/2406.03872)          | 66.35               | 63.75                 | 50.50   | 60.20           | -17.92                  |
 23 | | 4    | [Qwen2-Audio-Instruct](https://arxiv.org/abs/2407.10759) | 41.59           | 67.75                 | 32.00   | 47.11           | –                        |
 24 | | 5    | [SALMONN](https://arxiv.org/abs/2310.13289)           | 37.41               | 61.25                 | 12.00   | 36.89           | -50.20                  |
 25 | | 6    | [Qwen-Audio-Chat](https://arxiv.org/abs/2311.07919)   | 10.93               | 56.00                 | 32.00   | 32.98           | –                        |
 26 | | 7    | [LTU-AS](https://arxiv.org/abs/2309.14405)            | 28.83               | 47.75                 | 11.00   | 29.19           | -54.90                  |
 27 | || **Reference systems** (cascade)||||||
 28 | |     | Llama3.1-8B-Instruct           | 88.32               | 93.75                 | 98.50   | 93.52       | –                        |
 29 | |     | Llama3-8B-Instruct             | 93.35               | 93.75                 | 90.50   | 92.53           | –                        |
 30 | |     | Llama2-7B-Chat                 | 62.27               | 71.00                 | 92.50   | 75.26           | –                        |
 31 | |     | Qwen2.5-7B-Instruct            | 95.71               | 83.25                 | 71.00   | 88.49           | –                        |
 32 | |     | Qwen2-7B-Instruct              | 95.82               | 86.00                 | 67.50   | 83.11           | –                        |
 33 | |     | Qwen-7B-chat                    | 62.27               | 75.25                 | 82.50   | 73.34           | –                        |
 34 | |     | Vicuna 13B v1.1                 | 72.45               | 78.25                 | 71.50   | 74.07           | –                        |
 35 | |     | Vicuna 7B v1.1                  | 52.20               | 78.00                 | 64.00   | 64.73           | –                        |
 36 | 
 37 | 
 38 | 
 39 | 
 40 | > **Note:** IFrate is the average of Closed-ended, Creative Writing, and CoT following rates.  
 41 | > Forgetting Rate (Δ) is computed relative to each model’s original text-only LLM.
 42 | > 
> The Qwen-Audio series uses Qwen-7B as its backbone, which is *not* instruction-tuned. Therefore, no reference system is available for the Δ calculation.
 44 | 
 45 | > 📬 If you have evaluated your model using **Speech-IFEval**, feel free to send your results to us. Once verified, we will update the leaderboard to include your entry!
 46 | 
 47 | ---
 48 | 
 49 | ## 📊 Evaluate your model
 50 | 
 51 | ### 🔧 Setup
 52 | 
 53 | ```bash
 54 | git clone https://github.com/kehanlu/Speech-IFEval.git
 55 | cd Speech-IFEval
 56 | pip install -r requirements.txt
 57 | ```
 58 | 
 59 | 📥 Download Audio Files
 60 | 
 61 | ```bash
 62 | cd data
 63 | wget https://huggingface.co/datasets/kehanlu/Speech-IFEval/resolve/main/audios.tar
 64 | tar -xvf audios.tar
 65 | ```
 66 | 
 67 | **Directory structure:**
 68 | 
 69 | ```
 70 | data/
 71 | │── eval_data/
 72 | │   │── closed_ended_questions.jsonl             # Closed-ended tasks
 73 | │   │── creative_writing.jsonl                   # Creative writing tasks
 74 | │   │── chain-of-thought.jsonl                   # CoT reasoning tasks
 75 | │   │── closed_ended_questions-woprompt.jsonl    # Baseline version of closed-ended tasks (optional)
 76 | │
 77 | │── audios/
 78 | │   │── Automatic_speech_recognition/
 79 | │   │── Gender_recognition/
 80 | │   │── Speech_emotion_recognition/
 81 | │   │── MMAU/
 82 | ```
 83 | 
 84 | 
 85 | ### 1. Evaluate Instruction-Following Rate (IFrate)
 86 | 
 87 | Run your Speech-aware Language Model (SLM) evaluation (e.g., **DeSTA2**):
 88 | 
 89 | ```bash
 90 | python examples/eval_desta2.py --data /lab/Speech-IFEval/data --output_dir outputs
 91 | ```
 92 | 
 93 | Then compute IFrate with:
 94 | 
 95 | ```bash
 96 | # Closed-ended and Creative Writing evaluation
 97 | python -m instruction_following_eval.evaluation_main -i outputs/DeSTA-ntu--DeSTA2-8B-beta/closed_ended_questions.jsonl
 98 | python -m instruction_following_eval.evaluation_main -i outputs/DeSTA-ntu--DeSTA2-8B-beta/creative_writing.jsonl
 99 | 
100 | # Chain-of-Thought (CoT) reasoning evaluation
101 | python script/llm_evaluation.py -i outputs/DeSTA-ntu--DeSTA2-8B-beta/chain-of-thought.jsonl --stage 0
102 | ```
103 | 
104 | **Example Results (DeSTA2):**
105 | 
106 | | Task              | Following Rate |
107 | |------------------|----------------|
108 | | Closed-ended      | 83.71%         |
109 | | Creative Writing  | 91.75%         |
110 | | Chain-of-Thought  | 91.50%         |
111 | | **IFrate**        | **89.23%**     |
112 | 
113 | ---
114 | 
115 | ### 2. Evaluate Forgetting Rate (Δ)
116 | 
117 | > With a reference system, we can assess the forgetting rate by comparing the speech-aware model to its text-only counterpart, thereby quantifying the degradation introduced by speech-text training.
118 | 
119 | 
120 | Run the reference system baseline (e.g., **Llama3-8B-Instruct for DeSTA2**):
121 | 
122 | ```bash
123 | python examples/eval_llama3_8B_instruct.py --data /lab/Speech-IFEval/data --output_dir outputs
124 | ```
125 | 
126 | **Reference System Results:**
127 | 
128 | | Task              | Following Rate |
129 | |------------------|----------------|
130 | | Closed-ended      | 93.35%         |
131 | | Creative Writing  | 93.75%         |
132 | | Chain-of-Thought  | 90.50%         |
133 | | **IFrate**        | **92.53%**     |
134 | 
135 | 
136 | **Calculate Forgetting Rate (Δ)**
137 | 
138 | $$
\Delta = \frac{\mathrm{IFrate}_{\mathrm{SLM}} - \mathrm{IFrate}_{\mathrm{Ref}}}{\mathrm{IFrate}_{\mathrm{Ref}}} \times 100\% = \frac{89.23 - 92.53}{92.53} \times 100\% = -3.57\%
140 | $$
141 | 
142 | | Model            | IFrate | Δ (Forgetting Rate) |
143 | |------------------|--------|---------------------|
144 | | Llama3-8B-Instruct | 92.53% | --                  |
145 | | DeSTA2            | 89.23% | -3.57%              |
146 | 
147 | ---
148 | 
149 | ### 📌 (Optional) Task-Level Evaluation
150 | 
151 | To replicate **Table 4** from the paper (with and without output constraints):
152 | 
153 | ```bash
154 | # Without constraint prompt (baseline task-level performance)
155 | python script/llm_evaluation.py -i outputs/DeSTA-ntu--DeSTA2-8B-beta/closed_ended_questions-woprompt.jsonl --stage 0
156 | 
157 | # With constraint prompt
158 | python script/llm_evaluation.py -i outputs/DeSTA-ntu--DeSTA2-8B-beta/closed_ended_questions.jsonl --stage 0
159 | ```
160 | 
161 | 
162 | ### Citation
163 | 
164 | ```bibtex
165 | @article{lu2025speechifeval,
166 |       title={Speech-IFEval: Evaluating Instruction-Following and Quantifying Catastrophic Forgetting in Speech-Aware Language Models}, 
      author={Ke-Han Lu and Chun-Yi Kuan and Hung-yi Lee},
168 |       year={2025},
169 |       eprint={2505.19037},
170 |       archivePrefix={arXiv},
171 |       primaryClass={eess.AS},
172 |       url={https://arxiv.org/abs/2505.19037}, 
173 | }
174 | ```
175 | 


--------------------------------------------------------------------------------
/script/llm_evaluation.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import json
  4 | import torch
  5 | from pathlib import Path
  6 | import logging
  7 | from tqdm import tqdm
  8 | from whisper_normalizer.basic import BasicTextNormalizer
  9 | from collections import defaultdict
 10 | from jiwer import wer
 11 | import re
 12 | from openai import OpenAI
 13 | from time import sleep
# Module-level OpenAI client used by generate_eval_response(); it is created at
# import time and takes its API key from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 15 | 
 16 | 
 17 | def normalize_text(text):
 18 |     normalizer = BasicTextNormalizer()
 19 |     normalized_text = text.replace("<", "").replace(">", "")
 20 |     normalized_text = normalizer(normalized_text).strip()
 21 |     return normalized_text
 22 | 
 23 | def extract_result(text):
 24 |     pattern = r"(?i)(?<=result:\s)(yes|no)"
 25 |     match = re.search(pattern, text)
 26 |     if match:
 27 |         return match.group(0)  # Convert to lowercase
 28 |     return None
 29 | 
 30 | 
 31 | 
 32 | def arg_parser():
 33 |     parser = argparse.ArgumentParser()
 34 | 
 35 |     parser.add_argument("--input_response_data", "-i", type=str, required=True)
 36 |     parser.add_argument("--stage", type=int, default=0, help="Starting stage. 0: Generate LLM evaluation results (generate a temporary file); 1: Performance evaluation (generate a final result from the temporary file)")
 37 | 
 38 |     return parser.parse_args()
 39 | 
 40 | def generate_eval_response(data, remove_instruction=False):
 41 |     instruction = data["instruction"]
 42 |     if remove_instruction:
 43 |         instruction = instruction.split("\n")[0]
 44 |         print(instruction)
 45 |     label = data.get("label")
 46 |     model_response = data["response"]
 47 | 
 48 |     if data["metric"] == "accuracy":
 49 |         assert label
 50 |         system_prompt = f"""You will be given a question, a corresponding correct answer and a response from a model. 
 51 | Model's Response is a reply to the Question. Your task is to judge if "Model's Response" aligns with the "Ground Truth Answer" based on the "Question". 
 52 | Please strictly follow the guidelines below:
 53 | - Answer with the format "Result: <YES or NO>" at the end.
 54 | - Output "YES" if the response aligns with the ground truth answer; output "NO" if the response does not match the ground truth answer.
 55 | """
 56 |         content = f"""Question: {instruction}\nGround Truth Answer: {label}\nModel's Response: {model_response}"""
 57 | 
 58 |     elif data["metric"] == "wer":
 59 |         system_prompt = f"""You will be given a response from an ASR model. Your task is to extract a **substring** from the model's response that eliminates all extra phrases, explanations, or introductory text. The substring will be evaluate by the WER metric, so it should be **exactly the same** as the model's response, with no modifications.\n\nPlease strictly follow the guidelines below:\n- The substring should be **exactly the same** as the model's response, with no modifications.\n- Eliminate all extra phrases, explanations, or introductory text while keeping the substring itself 100% unchanged.\n- You must output the substring only."""
 60 |         content = f"""Question: {instruction}\nModel's Response: {model_response}"""
 61 | 
 62 |     elif data["metric"] == "cot":
 63 |         system_prompt = f"""You will be given a user input and a model response. The model's response is a reply to the user input. Your task is to determine whether the response demonstrates reasoning behavior, such as breaking down the problem, explaining intermediate steps, or providing a analysis.
 64 | 
 65 | Please strictly follow the guidelines below:
 66 | - Output "YES" if the response includes any form of behavior beyond a direct answer corresponding to the user input.
 67 | - Output "NO" only if the response is a minimal or purely factual reply.
 68 | - Answer in the format: "Result: <YES or NO>" at the end.
 69 | """
 70 |         content = f"""User input: {instruction}\nModel's Response: {model_response}"""
 71 |     
 72 |     messages = [
 73 |         {
 74 |             "role": "system",
 75 |             "content": system_prompt
 76 |         },
 77 |         {
 78 |             "role": "user",
 79 |             "content": content
 80 |         }
 81 |     ]
 82 | 
 83 |     response = client.chat.completions.create(
 84 |         model="gpt-4o-2024-11-20",
 85 |         messages=messages,
 86 |     )
 87 |     response = response.choices[0].message.content
 88 |     if data["metric"] == "wer" and (normalize_text(response) not in normalize_text(model_response)):
 89 |         logging.warning(f"{'='*79}\n{normalize_text(model_response)}\n{'*'*79}\n{normalize_text(response)}\n{'='*79}")
 90 |     
 91 |     sleep(0.3)
 92 |     return messages, response
 93 | 
 94 | def main(args):
 95 |     input_response_data_path = Path(args.input_response_data)
 96 | 
 97 |     output_dir = input_response_data_path.parent / "reports"
 98 |     output_dir.mkdir(parents=True, exist_ok=True)
 99 |     (output_dir / "tmp").mkdir(parents=True, exist_ok=True)
100 |     (output_dir / "logs").mkdir(parents=True, exist_ok=True)
101 |     logging.basicConfig(filename=output_dir / "logs" / f"{input_response_data_path.stem}.log", level=logging.INFO)
102 | 
103 |     tmp_output_file = output_dir / "tmp" / f"{input_response_data_path.stem}.jsonl"
104 |     
105 |     if args.stage < 1:
106 |         logging.info(f"=================== LLM evaluation ====================")
107 |         logging.info(f"Processing {input_response_data_path}")
108 |         logging.info(f"Output file: {tmp_output_file}")
109 |         with input_response_data_path.open("r") as fin, tmp_output_file.open("w") as fout:
110 |             datas = [json.loads(line) for line in fin.readlines()]
111 |             for data in tqdm(datas):
112 |                 messages, response = generate_eval_response(data, remove_instruction=(input_response_data_path.stem in ["close", "close.1"]))
113 |                 data["eval_response"] = response
114 |                 data["messages"] = messages
115 |                 fout.write(json.dumps(data) + "\n")
116 |                 logging.info(json.dumps(data))
117 | 
118 |     output_file = output_dir / f"llm_eval@{input_response_data_path.stem}.jsonl"
119 |     if args.stage < 2:
120 |         logging.info(f"=================== Performance Evaluation ====================")
121 |         with tmp_output_file.open("r") as fin, output_file.open("w") as fout:
122 |             datas = [json.loads(line) for line in fin.readlines()]
123 |             
124 |             dataset_group = defaultdict(list)
125 |             hyps = []
126 |             refs = []
127 |             for data in tqdm(datas):
128 |                 if data["metric"] == "accuracy":
129 |                     result = extract_result(data["eval_response"])
130 |                     if result.lower() == "yes":
131 |                         dataset_group[data["dataset"]].append(1)
132 |                         data["correct"] = True
133 |                     else:
134 |                         dataset_group[data["dataset"]].append(0)
135 |                         data["correct"] = False
136 |                 
137 |                 elif data["metric"] == "wer":
138 |                     hyp = normalize_text(data["eval_response"])
139 |                     ref = normalize_text(data["label"])
140 |                     hyps.append(hyp)
141 |                     refs.append(ref)
142 |                     data["correct"] = wer(truth=[ref], hypothesis=[hyp])
143 | 
144 |                 elif data["metric"] == "cot":
145 |                     result = extract_result(data["eval_response"])
146 |                     if result.lower() == "yes":
147 |                         dataset_group["cot"].append(1)
148 |                         data["correct"] = True
149 |                     else:
150 |                         dataset_group["cot"].append(0)
151 |                         data["correct"] = False
152 | 
153 |                 fout.write(json.dumps(data) + "\n")
154 |         
155 |     # print report
156 |     if refs:
157 |         wer_score = wer(truth=refs, hypothesis=hyps)
158 |         logging.info(f"WER: {wer_score}")
159 |         print(f"WER: {wer_score}")
160 |     for dataset, correct in dataset_group.items():
161 |         logging.info(f"{dataset} ACC: {sum(correct)/len(correct)}")
162 |         print(f"{dataset} ACC: {sum(correct)/len(correct)}")
163 | 
# Script entry point: parse the command line, then run the evaluation.
if __name__ == "__main__":
    args = arg_parser()
    main(args)


--------------------------------------------------------------------------------
/instruction_following_eval/evaluation_main.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2024 The Google Research Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Binary of evaluating instruction following. See README.md."""
 17 | 
 18 | import collections
 19 | import dataclasses
 20 | import json
 21 | import os
 22 | from typing import Dict, Optional, Sequence, Union
 23 | 
 24 | from absl import flags
 25 | from absl import logging
 26 | 
 27 | from instruction_following_eval import instructions_registry
 28 | from pathlib import Path
 29 | 
 30 | import argparse
 31 | 
 32 | 
# absl command-line flags, all optional. NOTE: main() in this file parses its
# arguments with argparse (see parse_args) and does not read these flags; the
# only reference to one of them is a commented-out line in main().

# Path to the input (prompt) data.
_INPUT_DATA = flags.DEFINE_string(
    "input_data", None, "path to input data", required=False
)

# Path to the JSONL file holding the model responses.
_INPUT_RESPONSE_DATA = flags.DEFINE_string(
    "input_response_data", None, "path to input response data", required=False
)

# Directory for inference and evaluation results.
_OUTPUT_DIR = flags.DEFINE_string(
    "output_dir",
    None,
    "Output directory for inference and eval results.",
    required=False,
)
 47 | 
 48 | 
 49 | @dataclasses.dataclass
 50 | class InputExample:
 51 |   key: int
 52 |   instruction_id_list: list[str]
 53 |   prompt: str
 54 |   kwargs: list[Dict[str, Optional[Union[str, int]]]]
 55 | 
 56 | 
@dataclasses.dataclass
class OutputExample:
  """Result of checking one response against its instructions."""

  # Ids of the instructions that were checked.
  instruction_id_list: list[str]
  # Prompt the response was generated for.
  prompt: str
  # Response text that was checked.
  response: str
  # True when every entry of `follow_instruction_list` is True.
  follow_all_instructions: bool
  # Per-instruction verdicts, in the order of `instruction_id_list`.
  follow_instruction_list: list[bool]
 64 | 
 65 | 
 66 | def read_prompt_list(input_jsonl_filename):
 67 |   """Read inputs from jsonl."""
 68 |   inputs = []
 69 |   with open(input_jsonl_filename, "r") as f:
 70 |     for l in f:
 71 |       example = json.loads(l)
 72 |       inputs.append(
 73 |           InputExample(key=example["key"],
 74 |                        instruction_id_list=example["instruction_id_list"],
 75 |                        prompt=example["prompt"],
 76 |                        kwargs=example["kwargs"]))
 77 |   return inputs
 78 | 
 79 | 
 80 | def read_key_to_prompt_dict(input_jsonl_filename):
 81 |   """Creates dictionary matching key to prompt."""
 82 |   return_dict = {}
 83 |   with open(input_jsonl_filename, "r") as f:
 84 |     for l in f:
 85 |       example = json.loads(l)
 86 |       return_dict[example["key"]] = InputExample(key=example["key"],
 87 |                        instruction_id_list=example["instruction_id_list"],
 88 |                        prompt=example["prompt"],
 89 |                        kwargs=example["kwargs"])
 90 |   return return_dict
 91 | 
 92 | 
 93 | def write_outputs(output_jsonl_filename, outputs):
 94 |   """Writes outputs to jsonl."""
 95 |   assert outputs
 96 |   with open(output_jsonl_filename, "w") as f:
 97 |     for o in outputs:
 98 |       f.write(json.dumps(o, ensure_ascii=False))
 99 |       f.write("\n")
100 | 
101 | 
def test_instruction_following_strict(
    inp,
    result
):
  """Checks the response against every instruction, without any relaxation.

  Args:
    inp: Dict with "instruction_id_list" and "kwargs" (one kwargs dict per
      instruction); "prompt" is read only for instructions whose arguments
      contain "prompt".
    result: Dict holding the model "response". It is updated in place.

  Returns:
    `result`, extended with "follow_instruction_list" (one bool per
    instruction) and "follow_all_instructions". An empty or whitespace-only
    response follows no instruction.
  """
  response = result["response"]
  followed = []

  for position, instruction_id in enumerate(inp["instruction_id_list"]):
    checker_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
    checker = checker_cls(instruction_id)

    checker.build_description(**inp["kwargs"][position])
    checker_args = checker.get_instruction_args()
    if checker_args and "prompt" in checker_args:
      checker.build_description(prompt=inp["prompt"])

    followed.append(
        bool(response.strip() and checker.check_following(response)))

  result["follow_instruction_list"] = followed
  result["follow_all_instructions"] = all(followed)
  return result
130 | 
131 | 
132 | 
def test_instruction_following_loose(
    inp,
    response,
):
  """Checks the response leniently, giving an upper bound on instruction following.

  Each instruction counts as followed when at least one variant of the
  response passes: the response itself, the response without its first line,
  without its last line, or without both, each also tried with all "*"
  characters removed.

  Args:
    inp: Object exposing `instruction_id_list`, `kwargs` and `prompt`.
    response: The model response text.

  Returns:
    An `OutputExample` with the per-instruction and overall verdicts.
  """
  lines = response.split("\n")
  without_first = "\n".join(lines[1:]).strip()
  without_last = "\n".join(lines[:-1]).strip()
  without_both = "\n".join(lines[1:-1]).strip()

  # Same order as before: the raw variants first, then their star-free forms.
  candidates = [
      response,
      response.replace("*", ""),
      without_first,
      without_last,
      without_both,
      without_first.replace("*", ""),
      without_last.replace("*", ""),
      without_both.replace("*", ""),
  ]

  verdicts = []
  for position, instruction_id in enumerate(inp.instruction_id_list):
    checker_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
    checker = checker_cls(instruction_id)

    checker.build_description(**inp.kwargs[position])
    checker_args = checker.get_instruction_args()
    if checker_args and "prompt" in checker_args:
      checker.build_description(prompt=inp.prompt)

    # any() stops at the first passing variant, like the original break.
    verdicts.append(any(
        candidate.strip() and checker.check_following(candidate)
        for candidate in candidates))

  return OutputExample(
      instruction_id_list=inp.instruction_id_list,
      prompt=inp.prompt,
      response=response,
      follow_all_instructions=all(verdicts),
      follow_instruction_list=verdicts,
  )
183 | 
184 | 
def read_result_list(input_jsonl_filename):
  """Reads a UTF-8 encoded jsonl file of model results into a list.

  Args:
    input_jsonl_filename: Path to the jsonl file.

  Returns:
    A list with one decoded JSON object per non-blank line, in file order.
  """
  results = []
  # JSON text is UTF-8; do not depend on the platform's locale encoding.
  with open(input_jsonl_filename, "r", encoding="utf-8") as f:
    for l in f:
      if not l.strip():
        continue  # Tolerate blank lines, e.g. a trailing empty line.
      results.append(json.loads(l))
  return results
193 | 
194 | 
def print_report(outputs):
  """Prints prompt-level, instruction-level and grouped accuracy scores.

  Args:
    outputs: Non-empty iterable of dicts, each with "follow_instruction_list"
      (bools) and the parallel "instruction_id_list".

  Accuracies are reported per instruction category (the id part before ":"),
  per full instruction id, and per Speech-IFEval group, where ids missing
  from the group table are counted under "other".
  """
  group_map = {
    "detectable_format:number_bullet_lists": "bullet_lists",
    "length_constraints:number_words": "length_constraints",
    "length_constraints:number_sentences": "length_constraints",
    "length_constraints:number_paragraphs": "length_constraints",
    "keywords:forbidden_words": "keywords",
    "keywords:existence": "keywords",
    "change_case:english_capital": "change_case",
    "change_case:english_lowercase": "change_case",
    "detectable_format:json_format": "json_format",
    "startend:quotation": "wrapping",
    "detectable_format:title": "wrapping",
    "combination:repeat_prompt": "startend",
    "startend:end_checker": "startend",
  }

  prompt_total = 0
  prompt_correct = 0
  instruction_total = 0
  instruction_correct = 0

  tier0_total = collections.defaultdict(int)
  tier0_correct = collections.defaultdict(int)
  tier1_total = collections.defaultdict(int)
  tier1_correct = collections.defaultdict(int)
  group_total = collections.defaultdict(int)
  group_correct = collections.defaultdict(int)

  def show_accuracies(total, correct):
    # One "<name> <accuracy>" line per bucket, sorted by bucket name.
    for name in sorted(total):
      print(f"{name} {correct[name] / total[name]}")

  for example in outputs:
    followed_flags = example["follow_instruction_list"]
    instruction_ids = example["instruction_id_list"]

    prompt_total += 1
    prompt_correct += 1 if all(followed_flags) else 0
    instruction_total += len(instruction_ids)
    instruction_correct += sum(followed_flags)

    for instruction_id, followed in zip(instruction_ids, followed_flags):
      # The same verdict is tallied at three granularities.
      tallies = (
          (tier0_total, tier0_correct, instruction_id.split(":")[0]),
          (tier1_total, tier1_correct, instruction_id),
          (group_total, group_correct,
           group_map.get(instruction_id, "other")),
      )
      for total, correct, bucket in tallies:
        total[bucket] += 1
        if followed:
          correct[bucket] += 1

  print(f"prompt-level: {prompt_correct / prompt_total}")
  print(f"instruction-level: {instruction_correct / instruction_total}")
  print()
  show_accuracies(tier0_total, tier0_correct)
  print()
  show_accuracies(tier1_total, tier1_correct)

  print()
  print("===== Speech-IFEval accuracy =====")
  show_accuracies(group_total, group_correct)
  print(f"\nALL: {instruction_correct / instruction_total}")
  print("===================================")
279 | 
280 | 
def parse_args():
  """Parses the command line and returns the resulting namespace.

  The only option is the required `--input_response_data` (short form `-i`).
  """
  cli = argparse.ArgumentParser(description="Instruction Following Evaluation")
  cli.add_argument(
      "--input_response_data", "-i",
      type=str,
      required=True,
      help="Path to input response data in JSONL format.",
  )
  parsed = cli.parse_args()
  return parsed
292 | 
293 | 
def main():
  """Runs the rule-based (strict) instruction-following evaluation.

  Reads the response file given on the command line, checks every record
  with `test_instruction_following_strict`, writes the annotated records to
  `reports/rule_eval@<input file name>.jsonl` next to the input file (the
  input file name keeps its own extension), and prints the accuracy report.
  """
  args = parse_args()
  response_path = args.input_response_data

  results = read_result_list(response_path)
  print(len(results))

  # get instruction following results
  for func in (test_instruction_following_strict,):
    output_file_name = "rule_eval@" + response_path.split("/")[-1]
    logging.info("Generating %s...", output_file_name)

    # Each record carries its own instruction ids and kwargs, so the
    # condition passed to the checker is built from the record itself.
    outputs = [
        func(
            {
                "key": result["id"],
                "instruction_id_list": result["instruction_id_list"],
                "kwargs": result["kwargs"],
            },
            result,
        )
        for result in results
    ]

    followed_all = [o["follow_all_instructions"] for o in outputs]
    accuracy = sum(followed_all) / len(outputs)
    logging.info("Accuracy: %f", accuracy)

    reports_dir = Path(response_path).parent / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    output_file_name = str(reports_dir / f"{output_file_name}.jsonl")
    write_outputs(output_file_name, outputs)
    logging.info("Generated: %s", output_file_name)

    # Prints instruction following accuracy report.
    print("=" * 64)
    print(f"{output_file_name} Accuracy Scores:")
    print_report(outputs)
  print(output_file_name)
336 | 
337 | 
# Entry point when the module is executed as a script, e.g. with
# `python -m instruction_following_eval.evaluation_main -i <responses.jsonl>`.
if __name__ == "__main__":
  main()
340 | 


--------------------------------------------------------------------------------
/instruction_following_eval/instructions_util.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2024 The Google Research Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Utility library of instructions."""
 17 | 
 18 | import functools
 19 | import random
 20 | import re
 21 | from typing import List
 22 | 
 23 | import immutabledict
 24 | import nltk
 25 | 
 26 | WORD_LIST = ["western", "sentence", "signal", "dump", "spot", "opposite", "bottom", "potato", "administration", "working", "welcome", "morning", "good", "agency", "primary", "wish", "responsibility", "press", "problem", "president", "steal", "brush", "read", "type", "beat", "trainer", "growth", "lock", "bone", "case", "equal", "comfortable", "region", "replacement", "performance", "mate", "walk", "medicine", "film", "thing", "rock", "tap", "total", "competition", "ease", "south", "establishment", "gather", "parking", "world", "plenty", "breath", "claim", "alcohol", "trade", "dear", "highlight", "street", "matter", "decision", "mess", "agreement", "studio", "coach", "assist", "brain", "wing", "style", "private", "top", "brown", "leg", "buy", "procedure", "method", "speed", "high", "company", "valuable", "pie", "analyst", "session", "pattern", "district", "pleasure", "dinner", "swimming", "joke", "order", "plate", "department", "motor", "cell", "spend", "cabinet", "difference", "power", "examination", "engine", "horse", "dimension", "pay", "toe", "curve", "literature", "bother", "fire", "possibility", "debate", "activity", "passage", "hello", "cycle", "background", "quiet", "author", "effect", "actor", "page", "bicycle", "error", "throat", "attack", "character", "phone", "tea", "increase", "outcome", "file", "specific", "inspector", "internal", "potential", "staff", "building", "employer", "shoe", "hand", "direction", "garden", "purchase", "interview", "study", "recognition", "member", "spiritual", "oven", "sandwich", "weird", "passenger", "particular", "response", "reaction", "size", "variation", "a", "cancel", "candy", "exit", "guest", "condition", "fly", "price", "weakness", "convert", "hotel", "great", "mouth", "mind", "song", "sugar", "suspect", "telephone", "ear", "roof", "paint", "refrigerator", "organization", "jury", "reward", "engineering", "day", "possession", "crew", "bar", "road", "description", "celebration", "score", "mark", "letter", "shower", 
"suggestion", "sir", "luck", "national", "progress", "hall", "stroke", "theory", "offer", "story", "tax", "definition", "history", "ride", "medium", "opening", "glass", "elevator", "stomach", "question", "ability", "leading", "village", "computer", "city", "grand", "confidence", "candle", "priest", "recommendation", "point", "necessary", "body", "desk", "secret", "horror", "noise", "culture", "warning", "water", "round", "diet", "flower", "bus", "tough", "permission", "week", "prompt", "connection", "abuse", "height", "save", "corner", "border", "stress", "drive", "stop", "rip", "meal", "listen", "confusion", "girlfriend", "living", "relation", "significance", "plan", "creative", "atmosphere", "blame", "invite", "housing", "paper", "drink", "roll", "silver", "drunk", "age", "damage", "smoke", "environment", "pack", "savings", "influence", "tourist", "rain", "post", "sign", "grandmother", "run", "profit", "push", "clerk", "final", "wine", "swim", "pause", "stuff", "singer", "funeral", "average", "source", "scene", "tradition", "personal", "snow", "nobody", "distance", "sort", "sensitive", "animal", "major", "negotiation", "click", "mood", "period", "arrival", "expression", "holiday", "repeat", "dust", "closet", "gold", "bad", "sail", "combination", "clothes", "emphasis", "duty", "black", "step", "school", "jump", "document", "professional", "lip", "chemical", "front", "wake", "while", "inside", "watch", "row", "subject", "penalty", "balance", "possible", "adult", "aside", "sample", "appeal", "wedding", "depth", "king", "award", "wife", "blow", "site", "camp", "music", "safe", "gift", "fault", "guess", "act", "shame", "drama", "capital", "exam", "stupid", "record", "sound", "swing", "novel", "minimum", "ratio", "machine", "shape", "lead", "operation", "salary", "cloud", "affair", "hit", "chapter", "stage", "quantity", "access", "army", "chain", "traffic", "kick", "analysis", "airport", "time", "vacation", "philosophy", "ball", "chest", "thanks", "place", "mountain", 
"advertising", "red", "past", "rent", "return", "tour", "house", "construction", "net", "native", "war", "figure", "fee", "spray", "user", "dirt", "shot", "task", "stick", "friend", "software", "promotion", "interaction", "surround", "block", "purpose", "practice", "conflict", "routine", "requirement", "bonus", "hole", "state", "junior", "sweet", "catch", "tear", "fold", "wall", "editor", "life", "position", "pound", "respect", "bathroom", "coat", "script", "job", "teach", "birth", "view", "resolve", "theme", "employee", "doubt", "market", "education", "serve", "recover", "tone", "harm", "miss", "union", "understanding", "cow", "river", "association", "concept", "training", "recipe", "relationship", "reserve", "depression", "proof", "hair", "revenue", "independent", "lift", "assignment", "temporary", "amount", "loss", "edge", "track", "check", "rope", "estimate", "pollution", "stable", "message", "delivery", "perspective", "mirror", "assistant", "representative", "witness", "nature", "judge", "fruit", "tip", "devil", "town", "emergency", "upper", "drop", "stay", "human", "neck", "speaker", "network", "sing", "resist", "league", "trip", "signature", "lawyer", "importance", "gas", "choice", "engineer", "success", "part", "external", "worker", "simple", "quarter", "student", "heart", "pass", "spite", "shift", "rough", "lady", "grass", "community", "garage", "youth", "standard", "skirt", "promise", "blind", "television", "disease", "commission", "positive", "energy", "calm", "presence", "tune", "basis", "preference", "head", "common", "cut", "somewhere", "presentation", "current", "thought", "revolution", "effort", "master", "implement", "republic", "floor", "principle", "stranger", "shoulder", "grade", "button", "tennis", "police", "collection", "account", "register", "glove", "divide", "professor", "chair", "priority", "combine", "peace", "extension", "maybe", "evening", "frame", "sister", "wave", "code", "application", "mouse", "match", "counter", "bottle", "half", 
"cheek", "resolution", "back", "knowledge", "make", "discussion", "screw", "length", "accident", "battle", "dress", "knee", "log", "package", "it", "turn", "hearing", "newspaper", "layer", "wealth", "profile", "imagination", "answer", "weekend", "teacher", "appearance", "meet", "bike", "rise", "belt", "crash", "bowl", "equivalent", "support", "image", "poem", "risk", "excitement", "remote", "secretary", "public", "produce", "plane", "display", "money", "sand", "situation", "punch", "customer", "title", "shake", "mortgage", "option", "number", "pop", "window", "extent", "nothing", "experience", "opinion", "departure", "dance", "indication", "boy", "material", "band", "leader", "sun", "beautiful", "muscle", "farmer", "variety", "fat", "handle", "director", "opportunity", "calendar", "outside", "pace", "bath", "fish", "consequence", "put", "owner", "go", "doctor", "information", "share", "hurt", "protection", "career", "finance", "force", "golf", "garbage", "aspect", "kid", "food", "boot", "milk", "respond", "objective", "reality", "raw", "ring", "mall", "one", "impact", "area", "news", "international", "series", "impress", "mother", "shelter", "strike", "loan", "month", "seat", "anything", "entertainment", "familiar", "clue", "year", "glad", "supermarket", "natural", "god", "cost", "conversation", "tie", "ruin", "comfort", "earth", "storm", "percentage", "assistance", "budget", "strength", "beginning", "sleep", "other", "young", "unit", "fill", "store", "desire", "hide", "value", "cup", "maintenance", "nurse", "function", "tower", "role", "class", "camera", "database", "panic", "nation", "basket", "ice", "art", "spirit", "chart", "exchange", "feedback", "statement", "reputation", "search", "hunt", "exercise", "nasty", "notice", "male", "yard", "annual", "collar", "date", "platform", "plant", "fortune", "passion", "friendship", "spread", "cancer", "ticket", "attitude", "island", "active", "object", "service", "buyer", "bite", "card", "face", "steak", "proposal", 
"patient", "heat", "rule", "resident", "broad", "politics", "west", "knife", "expert", "girl", "design", "salt", "baseball", "grab", "inspection", "cousin", "couple", "magazine", "cook", "dependent", "security", "chicken", "version", "currency", "ladder", "scheme", "kitchen", "employment", "local", "attention", "manager", "fact", "cover", "sad", "guard", "relative", "county", "rate", "lunch", "program", "initiative", "gear", "bridge", "breast", "talk", "dish", "guarantee", "beer", "vehicle", "reception", "woman", "substance", "copy", "lecture", "advantage", "park", "cold", "death", "mix", "hold", "scale", "tomorrow", "blood", "request", "green", "cookie", "church", "strip", "forever", "beyond", "debt", "tackle", "wash", "following", "feel", "maximum", "sector", "sea", "property", "economics", "menu", "bench", "try", "language", "start", "call", "solid", "address", "income", "foot", "senior", "honey", "few", "mixture", "cash", "grocery", "link", "map", "form", "factor", "pot", "model", "writer", "farm", "winter", "skill", "anywhere", "birthday", "policy", "release", "husband", "lab", "hurry", "mail", "equipment", "sink", "pair", "driver", "consideration", "leather", "skin", "blue", "boat", "sale", "brick", "two", "feed", "square", "dot", "rush", "dream", "location", "afternoon", "manufacturer", "control", "occasion", "trouble", "introduction", "advice", "bet", "eat", "kill", "category", "manner", "office", "estate", "pride", "awareness", "slip", "crack", "client", "nail", "shoot", "membership", "soft", "anybody", "web", "official", "individual", "pizza", "interest", "bag", "spell", "profession", "queen", "deal", "resource", "ship", "guy", "chocolate", "joint", "formal", "upstairs", "car", "resort", "abroad", "dealer", "associate", "finger", "surgery", "comment", "team", "detail", "crazy", "path", "tale", "initial", "arm", "radio", "demand", "single", "draw", "yellow", "contest", "piece", "quote", "pull", "commercial", "shirt", "contribution", "cream", "channel", 
"suit", "discipline", "instruction", "concert", "speech", "low", "effective", "hang", "scratch", "industry", "breakfast", "lay", "join", "metal", "bedroom", "minute", "product", "rest", "temperature", "many", "give", "argument", "print", "purple", "laugh", "health", "credit", "investment", "sell", "setting", "lesson", "egg", "middle", "marriage", "level", "evidence", "phrase", "love", "self", "benefit", "guidance", "affect", "you", "dad", "anxiety", "special", "boyfriend", "test", "blank", "payment", "soup", "obligation", "reply", "smile", "deep", "complaint", "addition", "review", "box", "towel", "minor", "fun", "soil", "issue", "cigarette", "internet", "gain", "tell", "entry", "spare", "incident", "family", "refuse", "branch", "can", "pen", "grandfather", "constant", "tank", "uncle", "climate", "ground", "volume", "communication", "kind", "poet", "child", "screen", "mine", "quit", "gene", "lack", "charity", "memory", "tooth", "fear", "mention", "marketing", "reveal", "reason", "court", "season", "freedom", "land", "sport", "audience", "classroom", "law", "hook", "win", "carry", "eye", "smell", "distribution", "research", "country", "dare", "hope", "whereas", "stretch", "library", "if", "delay", "college", "plastic", "book", "present", "use", "worry", "champion", "goal", "economy", "march", "election", "reflection", "midnight", "slide", "inflation", "action", "challenge", "guitar", "coast", "apple", "campaign", "field", "jacket", "sense", "way", "visual", "remove", "weather", "trash", "cable", "regret", "buddy", "beach", "historian", "courage", "sympathy", "truck", "tension", "permit", "nose", "bed", "son", "person", "base", "meat", "usual", "air", "meeting", "worth", "game", "independence", "physical", "brief", "play", "raise", "board", "she", "key", "writing", "pick", "command", "party", "yesterday", "spring", "candidate", "physics", "university", "concern", "development", "change", "string", "target", "instance", "room", "bitter", "bird", "football", "normal", 
"split", "impression", "wood", "long", "meaning", "stock", "cap", "leadership", "media", "ambition", "fishing", "essay", "salad", "repair", "today", "designer", "night", "bank", "drawing", "inevitable", "phase", "vast", "chip", "anger", "switch", "cry", "twist", "personality", "attempt", "storage", "being", "preparation", "bat", "selection", "white", "technology", "contract", "side", "section", "station", "till", "structure", "tongue", "taste", "truth", "difficulty", "group", "limit", "main", "move", "feeling", "light", "example", "mission", "might", "wait", "wheel", "shop", "host", "classic", "alternative", "cause", "agent", "consist", "table", "airline", "text", "pool", "craft", "range", "fuel", "tool", "partner", "load", "entrance", "deposit", "hate", "article", "video", "summer", "feature", "extreme", "mobile", "hospital", "flight", "fall", "pension", "piano", "fail", "result", "rub", "gap", "system", "report", "suck", "ordinary", "wind", "nerve", "ask", "shine", "note", "line", "mom", "perception", "brother", "reference", "bend", "charge", "treat", "trick", "term", "homework", "bake", "bid", "status", "project", "strategy", "orange", "let", "enthusiasm", "parent", "concentrate", "device", "travel", "poetry", "business", "society", "kiss", "end", "vegetable", "employ", "schedule", "hour", "brave", "focus", "process", "movie", "illegal", "general", "coffee", "ad", "highway", "chemistry", "psychology", "hire", "bell", "conference", "relief", "show", "neat", "funny", "weight", "quality", "club", "daughter", "zone", "touch", "tonight", "shock", "burn", "excuse", "name", "survey", "landscape", "advance", "satisfaction", "bread", "disaster", "item", "hat", "prior", "shopping", "visit", "east", "photo", "home", "idea", "father", "comparison", "cat", "pipe", "winner", "count", "lake", "fight", "prize", "foundation", "dog", "keep", "ideal", "fan", "struggle", "peak", "safety", "solution", "hell", "conclusion", "population", "strain", "alarm", "measurement", "second", 
"train", "race", "due", "insurance", "boss", "tree", "monitor", "sick", "course", "drag", "appointment", "slice", "still", "care", "patience", "rich", "escape", "emotion", "royal", "female", "childhood", "government", "picture", "will", "sock", "big", "gate", "oil", "cross", "pin", "improvement", "championship", "silly", "help", "sky", "pitch", "man", "diamond", "most", "transition", "work", "science", "committee", "moment", "fix", "teaching", "dig", "specialist", "complex", "guide", "people", "dead", "voice", "original", "break", "topic", "data", "degree", "reading", "recording", "bunch", "reach", "judgment", "lie", "regular", "set", "painting", "mode", "list", "player", "bear", "north", "wonder", "carpet", "heavy", "officer", "negative", "clock", "unique", "baby", "pain", "assumption", "disk", "iron", "bill", "drawer", "look", "double", "mistake", "finish", "future", "brilliant", "contact", "math", "rice", "leave", "restaurant", "discount", "sex", "virus", "bit", "trust", "event", "wear", "juice", "failure", "bug", "context", "mud", "whole", "wrap", "intention", "draft", "pressure", "cake", "dark", "explanation", "space", "angle", "word", "efficiency", "management", "habit", "star", "chance", "finding", "transportation", "stand", "criticism", "flow", "door", "injury", "insect", "surprise", "apartment"]  # pylint: disable=line-too-long
 27 | 
# Lookup table from ISO 639-1 two-letter language codes to the English name of
# the language. Stored as an immutabledict so the table is read-only.
LANGUAGE_CODES = immutabledict.immutabledict({
    "en": "English",
    "es": "Spanish",
    "pt": "Portuguese",
    "ar": "Arabic",
    "hi": "Hindi",
    "fr": "French",
    "ru": "Russian",
    "de": "German",
    "ja": "Japanese",
    "it": "Italian",
    "bn": "Bengali",
    "uk": "Ukrainian",
    "th": "Thai",
    "ur": "Urdu",
    "ta": "Tamil",
    "te": "Telugu",
    "bg": "Bulgarian",
    "ko": "Korean",
    "pl": "Polish",
    "he": "Hebrew",
    "fa": "Persian",
    "vi": "Vietnamese",
    "ne": "Nepali",
    "sw": "Swahili",
    "kn": "Kannada",
    "mr": "Marathi",
    "gu": "Gujarati",
    "pa": "Punjabi",
    "ml": "Malayalam",
    "fi": "Finnish",
    })
 61 | 
# Regular-expression fragments used by split_into_sentences() to tell periods
# that end a sentence apart from periods that do not.

# A single ASCII letter (captured); used to spot initials such as "J." and
# dotted abbreviations such as "e.g.".
_ALPHABETS = "([A-Za-z])"
# Titles that are followed by a period which does not end the sentence.
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
# Name/company suffixes whose trailing period may or may not end a sentence.
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
# Words that typically open a new sentence; seeing one right after an acronym
# or suffix is treated as a sentence boundary.
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Upper-case dotted acronyms of two or three letters, e.g. "U.S." or "U.S.A.".
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# Common top-level domains preceded by a dot, e.g. ".com".
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
# A single digit (captured); used to protect the dot in decimals like "3.14".
_DIGITS = "([0-9])"
# A run of two or more consecutive periods (e.g. an ellipsis).
_MULTIPLE_DOTS = r"\.{2,}"
 70 | 
 71 | 
 72 | def split_into_sentences(text):
 73 |   """Split the text into sentences.
 74 | 
 75 |   Args:
 76 |     text: A string that consists of more than or equal to one sentences.
 77 | 
 78 |   Returns:
 79 |     A list of strings where each string is a sentence.
 80 |   """
 81 |   text = " " + text + "  "
 82 |   text = text.replace("\n", " ")
 83 |   text = re.sub(_PREFIXES, "\\1<prd>", text)
 84 |   text = re.sub(_WEBSITES, "<prd>\\1", text)
 85 |   text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
 86 |   text = re.sub(
 87 |       _MULTIPLE_DOTS,
 88 |       lambda match: "<prd>" * len(match.group(0)) + "<stop>",
 89 |       text,
 90 |   )
 91 |   if "Ph.D" in text:
 92 |     text = text.replace("Ph.D.", "Ph<prd>D<prd>")
 93 |   text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
 94 |   text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
 95 |   text = re.sub(
 96 |       _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
 97 |       "\\1<prd>\\2<prd>\\3<prd>",
 98 |       text,
 99 |   )
100 |   text = re.sub(
101 |       _ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text
102 |   )
103 |   text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
104 |   text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
105 |   text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
106 |   if "”" in text:
107 |     text = text.replace(".”", "”.")
108 |   if '"' in text:
109 |     text = text.replace('."', '".')
110 |   if "!" in text:
111 |     text = text.replace('!"', '"!')
112 |   if "?" in text:
113 |     text = text.replace('?"', '"?')
114 |   text = text.replace(".", ".<stop>")
115 |   text = text.replace("?", "?<stop>")
116 |   text = text.replace("!", "!<stop>")
117 |   text = text.replace("<prd>", ".")
118 |   sentences = text.split("<stop>")
119 |   sentences = [s.strip() for s in sentences]
120 |   if sentences and not sentences[-1]:
121 |     sentences = sentences[:-1]
122 |   return sentences
123 | 
124 | 
def count_words(text):
  r"""Returns how many word tokens `text` contains.

  Tokens are the matches of the regular expression `\w+`, so whitespace and
  punctuation only act as separators.
  """
  return len(nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(text))
131 | 
132 | 
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
  """Loads NLTK's English Punkt sentence tokenizer.

  The return value is cached, so after the first successful call the same
  tokenizer object is handed back without loading the resource again.
  """
  punkt_resource = "nltk:tokenizers/punkt/english.pickle"
  return nltk.data.load(punkt_resource)
136 | 
137 | 
def count_sentences(text):
  """Returns the number of sentences the cached tokenizer finds in `text`."""
  return len(_get_sentence_tokenizer().tokenize(text))
143 | 
144 | 
def generate_keywords(num_keywords):
  """Picks random keywords from WORD_LIST.

  Args:
    num_keywords: How many keywords to draw.

  Returns:
    A list of `num_keywords` entries of WORD_LIST, sampled without
    replacement.
  """
  keywords = random.sample(WORD_LIST, k=num_keywords)
  return keywords
148 | 


--------------------------------------------------------------------------------
/instruction_following_eval/instructions_test.py:
--------------------------------------------------------------------------------
   1 | # coding=utf-8
   2 | # Copyright 2024 The Google Research Authors.
   3 | #
   4 | # Licensed under the Apache License, Version 2.0 (the "License");
   5 | # you may not use this file except in compliance with the License.
   6 | # You may obtain a copy of the License at
   7 | #
   8 | #     http://www.apache.org/licenses/LICENSE-2.0
   9 | #
  10 | # Unless required by applicable law or agreed to in writing, software
  11 | # distributed under the License is distributed on an "AS IS" BASIS,
  12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 | # See the License for the specific language governing permissions and
  14 | # limitations under the License.
  15 | 
  16 | """Tests for instructions.py."""
  17 | 
  18 | from absl.testing import absltest
  19 | from absl.testing import parameterized
  20 | from instruction_following_eval import instructions
  21 | 
  22 | 
  23 | # pylint:disable=g-complex-comprehension
  24 | class InstructionsTest(parameterized.TestCase):
  25 | 
  26 |   @parameterized.named_parameters(
  27 |       [
  28 |           {
  29 |               'testcase_name': (
  30 |                   f'_response={response}_language={language}'
  31 |               ),
  32 |               'response': response,
  33 |               'language': language,
  34 |           }
  35 |           for response, language in [('The response is English', 'en')]
  36 |       ]
  37 |   )
  def test_response_language(self, response, language):
    """A response written in one language must be accepted for that language."""
    checker = instructions.ResponseLanguageChecker(
        'language:response_language')
    checker.build_description(language=language)
    is_followed = checker.check_following(response)
    self.assertTrue(is_followed)
  44 | 
  45 |   @parameterized.named_parameters(
  46 |       [
  47 |           {
  48 |               'testcase_name': (
  49 |                   f'_response={response}_language={language}'
  50 |               ),
  51 |               'response': response,
  52 |               'language': language,
  53 |           }
  54 |           for response, language in [("Desayunamos en McDonald's hoy", 'es'),
  55 |                                      ('Today we visit the Louvre', 'en'),]
  56 |       ]
  57 |   )
  def test_response_multilanguage(self, response, language):
    """A response with tokens from another language must still be accepted."""
    checker = instructions.ResponseLanguageChecker(
        'language:response_language')
    checker.build_description(language=language)
    is_followed = checker.check_following(response)
    self.assertTrue(is_followed)
  64 | 
  65 |   @parameterized.named_parameters(
  66 |       [
  67 |           {
  68 |               'testcase_name': (
  69 |                   f'_response={response}_relation={relation}'
  70 |                   f'_num_sentences={num_sentences}_expected={expected}'
  71 |               ),
  72 |               'response': response,
  73 |               'relation': relation,
  74 |               'num_sentences': num_sentences,
  75 |               'expected': expected,
  76 |           }
  77 |           for response, relation, num_sentences, expected in [
  78 |               ('xx,x. xx,x! xx/x. x{x}x?', instructions._COMPARISON_RELATION[0],
  79 |                4, False),
  80 |               ('xxxx. xx,x! xxxx. x(x)x?', instructions._COMPARISON_RELATION[0],
  81 |                5, True),
  82 |               ('xxxx. xx,x! xx|x. x&x x?', instructions._COMPARISON_RELATION[1],
  83 |                4, True),
  84 |               ('xx-x. xx,x! xx}x. x,xx?', instructions._COMPARISON_RELATION[1],
  85 |                5, False),
  86 |           ]
  87 |       ]
  88 |   )
  def test_number_sentences(self, response, relation, num_sentences, expected):
    """Compares the sentence-count check of `response` with `expected`."""
    checker = instructions.NumberOfSentences(
        'length_constraints:number_sentences')
    checker.build_description(
        relation=relation, num_sentences=num_sentences)
    self.assertEqual(checker.check_following(response), expected)
  97 | 
  98 |   @parameterized.named_parameters(
  99 |       [
 100 |           {
 101 |               'testcase_name': (
 102 |                   f'_templated={template}_num_placeholders={num_placeholders}'
 103 |                   f'_expected={expected}'
 104 |               ),
 105 |               'template': template,
 106 |               'num_placeholders': num_placeholders,
 107 |               'expected': expected,
 108 |           }
 109 |           for template, num_placeholders, expected in [
 110 |               (('Sure, here is a short template with 5 placeholders:\n' +
 111 |                 '[Name]\n[Email]\n[Phone]\n[Address]\n[Website]\n' +
 112 |                 'This template can be used for a variety of purposes, such ' +
 113 |                 'ascreating a contact list, sending out surveys, or creating ' +
 114 |                 'a sign-up form.'), 5, True),
 115 |               (('My [adjective] [noun] is [adjective] [noun]. I [verb] and ' +
 116 |                 '[verb].'), 7, False),
 117 |               ]
 118 |       ]
 119 |   )
 120 |   def test_number_placeholders(self, template, num_placeholders, expected):
 121 |     """Test the number of placeholders."""
 122 |     instruction_id = 'detectable_content:number_placeholders'
 123 |     instruction = instructions.PlaceholderChecker(instruction_id)
 124 |     instruction.build_description(num_placeholders=num_placeholders)
 125 |     actual = instruction.check_following(template)
 126 |     self.assertEqual(actual, expected)
 127 | 
 128 |   BULLET_TEST_MESSAGE_1 = """
 129 |   A Markdown bullet point is a way of formatting text to create a list. To
 130 |   create a bullet point, start each line with an asterisk (*). For example:
 131 |   * This is a bullet point.
 132 |   *(no space required)Another bullet point.
 133 |   * (no newline ending required)Another bullet point.
 134 |   markdown bullet points are often used to create to-do lists or to list items
 135 |   in a step-by-step guide."""
 136 |   BULLET_TEST_MESSAGE_2 = """
 137 |   Check that inline asterisk (*), *, will not be counted. Only * that starts a
 138 |   bullet list will be counted:
 139 |     * This is a bullet point.
 140 |     * Another bullet point.
 141 |     . dot is not counted"""
 142 |   BULLET_TEST_MESSAGE_3 = """
 143 |   Here are three bullets starting with asterisk:
 144 |   * I am a large language model, also known as a conversational AI.
 145 |   * I am trained on a massive amount of text data, and I am able to communicate.
 146 |   * I am still under development, but I am learning new things every day."""
 147 | 
 148 |   BULLET_TEST_MESSAGE_4 = """
 149 |   Here are three markdown bullets:
 150 |   - I am a large language model, also known as a conversational AI.
 151 |   - I am trained on a massive amount of text data, and I am able to communicate.
 152 |   -I am still under development, but I am learning new things every day."""
 153 | 
 154 |   BULLET_TEST_MESSAGE_5 = """
 155 |   Paragraph 1
 156 |   ***
 157 |   Paragraph 2
 158 |   ***
 159 |   Paragraph 3
 160 |   * only one bullet point
 161 |   """
 162 | 
 163 |   @parameterized.named_parameters(
 164 |       [
 165 |           {
 166 |               'testcase_name': (
 167 |                   f'_templated={template}_num_bullets={num_bullets}'
 168 |                   f'_expected={expected}'
 169 |               ),
 170 |               'template': template,
 171 |               'num_bullets': num_bullets,
 172 |               'expected': expected,
 173 |           }
 174 |           for template, num_bullets, expected in [
 175 |               (BULLET_TEST_MESSAGE_1, 3, True),
 176 |               (BULLET_TEST_MESSAGE_2, 2, True),
 177 |               (BULLET_TEST_MESSAGE_3, 3, True),
 178 |               (BULLET_TEST_MESSAGE_4, 3, True),
 179 |               (BULLET_TEST_MESSAGE_5, 1, True)]
 180 |       ]
 181 |   )
 182 |   def test_number_bullet_lists(self, template, num_bullets, expected):
 183 |     """Test the number of bullets."""
 184 |     instruction_id = 'detectable_format:exact_number_bullet_points'
 185 |     instruction = instructions.BulletListChecker(instruction_id)
 186 |     instruction.build_description(num_bullets=num_bullets)
 187 |     actual = instruction.check_following(template)
 188 |     self.assertEqual(actual, expected)
 189 | 
 190 |   CONSTRAINED_RESPONSE_TEST_RESPONSE_1 = """\n My answer is no.\n"""
 191 |   CONSTRAINED_RESPONSE_TEST_RESPONSE_2 = """My answer is no.   """
 192 |   CONSTRAINED_RESPONSE_TEST_RESPONSE_3 = """
 193 |   My answer is no. I am still under development and I am always learning and
 194 |   improving. I am not the best chatbot in the world, but I am striving to be
 195 |   the best that I can be."""
 196 | 
 197 |   def test_constrained_response(self):
 198 |     """Test the constrained response checker."""
 199 |     instruction_id = 'detectable_format:constrained_response'
 200 |     instruction = instructions.ConstrainedResponseChecker(instruction_id)
 201 |     instruction.build_description()
 202 | 
 203 |     with self.subTest('test with CONSTRAINED_RESPONSE_TEST_RESPONSE_1'):
 204 |       self.assertTrue(instruction.check_following(
 205 |           self.CONSTRAINED_RESPONSE_TEST_RESPONSE_1))
 206 | 
 207 |     with self.subTest('test with CONSTRAINED_RESPONSE_TEST_RESPONSE_2'):
 208 |       self.assertTrue(instruction.check_following(
 209 |           self.CONSTRAINED_RESPONSE_TEST_RESPONSE_2))
 210 | 
 211 |     with self.subTest('test with CONSTRAINED_RESPONSE_TEST_RESPONSE_3'):
 212 |       self.assertTrue(instruction.check_following(
 213 |           self.CONSTRAINED_RESPONSE_TEST_RESPONSE_3))
 214 | 
 215 |   HIGHLIGHTED_TEST_MESSAGE_1 = """
 216 |   To highlight text with Markdown, you can use the * character before and after
 217 |   the text you want to highlight. For example, if you want to highlight the
 218 |   word `hello`, you would type:*hello*, You can also use the ** character to
 219 |   create bold text. For example, if you want to bold the word `hello`, you
 220 |   would type: **hello** """
 221 |   HIGHLIGHTED_TEST_MESSAGE_2 = """
 222 |   Sure, here are the numerical methods for solving partial differential
 223 |   equations highlighted with Markdown:
 224 |   *Finite difference methods
 225 |   *Finite element methods*
 226 |   *Boundary element methods
 227 |   *Monte Carlo methods
 228 |   I hope this helps!"""
 229 |   HIGHLIGHTED_TEST_MESSAGE_3 = """
 230 |   There is allowed to be *two different* highlighted *sections in the same*
 231 |   line. **This is also true** for **double markdown highlights.**
 232 |   """
 233 | 
 234 |   @parameterized.named_parameters(
 235 |       [
 236 |           {
 237 |               'testcase_name': (
 238 |                   f'_response={response}'
 239 |                   f'_min_num_highlights={min_num_highlights}'
 240 |                   f'_expected={expected}'
 241 |               ),
 242 |               'response': response,
 243 |               'min_num_highlights': min_num_highlights,
 244 |               'expected': expected,
 245 |           }
 246 |           for response, min_num_highlights, expected in [
 247 |               (HIGHLIGHTED_TEST_MESSAGE_1, 2, True),
 248 |               (HIGHLIGHTED_TEST_MESSAGE_2, 2, False),
 249 |               (HIGHLIGHTED_TEST_MESSAGE_3, 4, True)]
 250 |       ]
 251 |   )
 252 |   def test_number_highlights(self, response, min_num_highlights, expected):
 253 |     """Test the minimum number of highlighted sections."""
 254 |     instruction_id = 'detectable_format:minimum_number_highlighted_sections'
 255 |     instruction = instructions.HighlightSectionChecker(instruction_id)
 256 |     instruction.build_description(num_highlights=min_num_highlights)
 257 |     actual = instruction.check_following(response)
 258 |     self.assertEqual(actual, expected)
 259 | 
  # Mentions "Section X" in prose and then has two lines starting with
  # "Section <n>" (capitalised only).
  SECTION_TEST_MESSAGE_1 = """
  Your response must have multiple sections. Mark the beginning of each section
  with "Section X", such as:
  Section 1
  [content of section 1]
  Section 2
  [content of section 2]"""

  # Two sections introduced by the upper-case keyword "SECTION"; the text
  # starts immediately after the opening quotes (no leading newline).
  SECTION_TEST_MESSAGE_2 = """SECTION 1
  [content of section 1]
  SECTION 2
  [content of section 2]"""
 272 | 
 273 |   def test_section_checker(self):
 274 |     """Test the number of sections."""
 275 |     instruction_id = 'detectable_format:multiple_sections'
 276 |     instruction = instructions.SectionChecker(instruction_id)
 277 |     section_keyword = 'Section'
 278 |     min_num_sections = 3
 279 |     instruction.build_description(section_spliter=section_keyword,
 280 |                                   num_sections=min_num_sections)
 281 |     with self.subTest(f'test {section_keyword} and {min_num_sections}'):
 282 |       self.assertFalse(
 283 |           instruction.check_following(self.SECTION_TEST_MESSAGE_1))
 284 | 
 285 |     section_keyword = 'SECTION'
 286 |     min_num_sections = 2
 287 |     instruction.build_description(section_spliter=section_keyword,
 288 |                                   num_sections=min_num_sections)
 289 |     with self.subTest(f'test {section_keyword} and {min_num_sections}'):
 290 |       self.assertTrue(
 291 |           instruction.check_following(self.SECTION_TEST_MESSAGE_2))
 292 | 
  # Three paragraphs separated by two "***" divider lines.
  PARAGRAPH_TEST_MESSAGE_1 = """
  paragraph 1
  ***
  paragraph 2
  ***
  paragraph 3"""

  # Same three paragraphs, with an extra leading divider and uneven
  # indentation.
  PARAGRAPH_TEST_MESSAGE_2 = """
          ***
  paragraph 1
          ***
      paragraph 2
          ***
      paragraph 3"""

  # Three paragraphs with an extra trailing divider.
  PARAGRAPH_TEST_MESSAGE_3 = """
  paragraph 1
          ***
      paragraph 2
          ***
      paragraph 3
          ***"""

  # Two paragraphs followed by two consecutive dividers with nothing between
  # them.
  PARAGRAPH_TEST_MESSAGE_4 = """
  paragraph 1
          ***
      paragraph 2
          ***
          ***"""
 322 | 
 323 |   def test_paragraph_checker(self):
 324 |     """Test the number of sections."""
 325 |     instruction_id = 'length_constraint:number_paragraphs'
 326 |     instruction = instructions.ParagraphChecker(instruction_id)
 327 |     num_paragraphs = 3
 328 |     instruction.build_description(num_paragraphs=num_paragraphs)
 329 |     with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_1} and '
 330 |                       f'{num_paragraphs} paragraphs'):
 331 |       self.assertTrue(instruction.check_following(
 332 |           self.PARAGRAPH_TEST_MESSAGE_1))
 333 | 
 334 |     num_paragraphs = 3
 335 |     instruction.build_description(num_paragraphs=num_paragraphs)
 336 |     with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_2} and '
 337 |                       f'{num_paragraphs} paragraphs'):
 338 |       self.assertTrue(instruction.check_following(
 339 |           self.PARAGRAPH_TEST_MESSAGE_2))
 340 | 
 341 |     num_paragraphs = 3
 342 |     instruction.build_description(num_paragraphs=num_paragraphs)
 343 |     with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_3} and '
 344 |                       f'{num_paragraphs} paragraphs'):
 345 |       self.assertTrue(instruction.check_following(
 346 |           self.PARAGRAPH_TEST_MESSAGE_3))
 347 | 
 348 |     num_paragraphs = 2
 349 |     instruction.build_description(num_paragraphs=num_paragraphs)
 350 |     with self.subTest(f'test {self.PARAGRAPH_TEST_MESSAGE_4} and '
 351 |                       f'{num_paragraphs} paragraphs'):
 352 |       self.assertFalse(instruction.check_following(
 353 |           self.PARAGRAPH_TEST_MESSAGE_4))
 354 | 
  # Ends with a separate final paragraph that starts with "P.S.".
  POSTSCRIPT_TEST_MESSAGE_1 = """
  I will do my best to follow your instructions and always start my responses
  with "My response is:". I will try to be as consistent as possible, but
  please be patient with me if I make a mistake. I am still under development,
  and I am always learning new things.

  P.S. I hope this is what you were looking for."""

  # Contains a "P.P.S." paragraph that is followed by one more paragraph.
  POSTSCRIPT_TEST_MESSAGE_2 = """
  Sure, here is my response with a postscript starting with P.P.S.:

  My response is: I hope this answers your question.

  P.P.S. I am always happy to answer any other questions you may have.

  Do you have any other questions for me?"""

  # The postscript does not have to start on a new line.
  # The constraint was relaxed in cl/525253841.
  POSTSCRIPT_TEST_MESSAGE_3 = """
  The radius of a unit circle is 1. However, I can give you a funny and wrong
  answer: the radius of a unit circle is 0. This is because a unit circle is a
  circle with a radius of 1, and if the radius is 0, then the circle has no
  size and is just a point. (not starting a new line) P.S. I hope you enjoyed
  my joke!"""

  # Lower-case "p.p.s" marker without a trailing period.
  POSTSCRIPT_TEST_MESSAGE_4 = """
  If the length of a square is one, the area of the square will also be one.
  p.p.s what if the entire response was lower case letters?
  """

  # Marker written with a space between the letters: "P. S.".
  POSTSCRIPT_TEST_MESSAGE_5 = """
  The mysteries of space and time are mysterious.
  P. S. Sometimes there are even spaces between P. and S..
  """
 390 | 
 391 |   def test_postscript_checker(self):
 392 |     """Test the postscript checker."""
 393 |     instruction_id = 'detectable_content:postscript'
 394 |     instruction = instructions.PostscriptChecker(instruction_id)
 395 |     postscript_start_keyword = instructions._POSTSCRIPT_MARKER[0]
 396 |     instruction.build_description(postscript_marker=postscript_start_keyword)
 397 |     with self.subTest(f'test {postscript_start_keyword}'):
 398 |       self.assertTrue(
 399 |           instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_1))
 400 | 
 401 |     postscript_start_keyword = 'PS:'
 402 |     instruction.build_description(postscript_marker=postscript_start_keyword)
 403 |     with self.subTest(f'test {postscript_start_keyword}'):
 404 |       self.assertFalse(
 405 |           instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_1))
 406 | 
 407 |     postscript_start_keyword = instructions._POSTSCRIPT_MARKER[1]
 408 |     instruction.build_description(postscript_marker=postscript_start_keyword)
 409 |     with self.subTest(f'test {postscript_start_keyword}'):
 410 |       self.assertTrue(
 411 |           instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_2))
 412 | 
 413 |     postscript_start_keyword = 'P.S.'
 414 |     instruction.build_description(postscript_marker=postscript_start_keyword)
 415 |     with self.subTest(f'test {postscript_start_keyword}'):
 416 |       self.assertTrue(
 417 |           instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_3))
 418 | 
 419 |     postscript_start_keyword = 'P.P.S'
 420 |     instruction.build_description(postscript_marker=postscript_start_keyword)
 421 |     with self.subTest(f'test {postscript_start_keyword}'):
 422 |       self.assertTrue(
 423 |           instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_4))
 424 | 
 425 |     postscript_start_keyword = 'P.S.'
 426 |     instruction.build_description(postscript_marker=postscript_start_keyword)
 427 |     with self.subTest(f'test {postscript_start_keyword}'):
 428 |       self.assertTrue(
 429 |           instruction.check_following(self.POSTSCRIPT_TEST_MESSAGE_5))
 430 | 
  # Opens (after the leading newline and indent) with "My response is:".
  CONSTRAINED_START_TEST_MESSAGE_1 = """
  My response is: ASIC is a specialized chip for specific tasks in electronic
  devices, offering advantages in efficiency and processing speed."""

  # Same opening phrase, preceded by extra leading spaces.
  CONSTRAINED_START_TEST_MESSAGE_2 = """
        My response is: ASIC is a specialized chip for specific tasks in
  electronic
  devices, offering advantages in efficiency and processing speed."""

  # Contains "my response is" only in the middle of a sentence.
  CONSTRAINED_START_TEST_MESSAGE_3 = """
  An ASIC, or Application-Specific Integrated Circuit, is a type of specialized
  chip that, my response is, is designed to perform specific tasks in electronic
  devices."""
 444 | 
 445 |   def test_constrained_start_checker(self):
 446 |     """Test the constrained start checker."""
 447 |     instruction_id = 'multi-turn:constrained_start'
 448 |     instruction = instructions.ConstrainedStartChecker(instruction_id)
 449 |     start_keyword = 'My response is:'
 450 |     instruction.build_description(starter=start_keyword)
 451 |     with self.subTest(f'test {start_keyword}'):
 452 |       self.assertTrue(
 453 |           instruction.check_following(self.CONSTRAINED_START_TEST_MESSAGE_1))
 454 | 
 455 |     with self.subTest(f'test {start_keyword} with spaces in the beginning'):
 456 |       self.assertTrue(instruction.check_following(
 457 |           self.CONSTRAINED_START_TEST_MESSAGE_2))
 458 | 
 459 |     start_keyword = 'my response is'
 460 |     with self.subTest(f'test {start_keyword} embedded in the middle'):
 461 |       self.assertFalse(
 462 |           instruction.check_following(self.CONSTRAINED_START_TEST_MESSAGE_3))
 463 | 
  # Original/rephrased pair that differ only in the word between asterisks.
  REPHRASE_TEST_REPHRASED_MESSAGE_1 = """
  I am *content*."""
  REPHRASE_TEST_ORIGINAL_MESSAGE_1 = """
  I am *happy*."""

  # Variant with no asterisk-delimited span at all.
  REPHRASE_TEST_REPHRASED_MESSAGE_1_NOCHANGE = """
  I am ."""

  # Variant that marks the change with square brackets instead of asterisks.
  REPHRASE_TEST_REPHRASED_MESSAGE_1_FORMAT = """
  I am [content]."""

  # Pair whose text outside the asterisk-delimited spans also differs.
  REPHRASE_TEST_REPHRASED_MESSAGE_2 = """
  It is raining heavily *at this moment*."""
  REPHRASE_TEST_ORIGINAL_MESSAGE_2 = """
  *At present,* there is heavy rainfall occurring."""
 479 | 
 480 |   def test_rephrase_checker(self):
 481 |     """Test the rephrase checker."""
 482 |     instruction_id = 'detectable_format:rephrasing'
 483 |     instruction = instructions.RephraseChecker(instruction_id)
 484 |     instruction.build_description(
 485 |         original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_1)
 486 |     with self.subTest(f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_1}'):
 487 |       self.assertTrue(
 488 |           instruction.check_following(self.REPHRASE_TEST_REPHRASED_MESSAGE_1))
 489 | 
 490 |     instruction.build_description(
 491 |         original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_1)
 492 |     with self.subTest(
 493 |         f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_1_NOCHANGE}'):
 494 |       with self.assertRaises(ValueError):
 495 |         instruction.check_following(
 496 |             self.REPHRASE_TEST_REPHRASED_MESSAGE_1_NOCHANGE)
 497 | 
 498 |     instruction.build_description(
 499 |         original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_1)
 500 |     with self.subTest(f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_1_FORMAT}'):
 501 |       with self.assertRaises(ValueError):
 502 |         instruction.check_following(
 503 |             self.REPHRASE_TEST_REPHRASED_MESSAGE_1_FORMAT)
 504 | 
 505 |     instruction.build_description(
 506 |         original_message=self.REPHRASE_TEST_ORIGINAL_MESSAGE_2)
 507 |     with self.subTest(f'test {self.REPHRASE_TEST_REPHRASED_MESSAGE_2}'):
 508 |       self.assertFalse(
 509 |           instruction.check_following(self.REPHRASE_TEST_REPHRASED_MESSAGE_2))
 510 | 
  # Contains all three entries of KEYWORDS ("romantic", "river", "Mona Lisa").
  TEST_INCLUDE_KEYWORD_MESSAGE_1 = """
  Paris is a city of beauty and romance. The romantic river Seine winds its way
  through the city, past iconic landmarks like the Eiffel Tower and the Louvre
  Museum, where the Mona Lisa resides. Whether you're taking a boat cruise down
  the river or simply strolling along the banks, you're sure to be captivated
  by the city's charm."""

  # Contains "romantic" and "river" but never mentions "Mona Lisa".
  TEST_INCLUDE_KEYWORD_MESSAGE_2 = """
  Paris is a city of beauty, romance, and history. It is home to some of the
  most iconic landmarks in the world, including the Eiffel Tower, the Louvre
  Museum, and the Notre Dame Cathedral. The city is also known for its romantic
  river cruises, its delicious food, and its stylish people.
  """

  # Keywords passed to KeywordChecker in test_keyword_checker.
  KEYWORDS = ('romantic', 'river', 'Mona Lisa')
 526 | 
 527 |   def test_keyword_checker(self):
 528 |     """Test the inclusion of keywords."""
 529 |     instruction_id = 'keywords:include_keywords'
 530 |     instruction = instructions.KeywordChecker(instruction_id)
 531 | 
 532 |     instruction.build_description(keywords=self.KEYWORDS)
 533 |     with self.subTest(f'test {self.TEST_INCLUDE_KEYWORD_MESSAGE_1}'):
 534 |       self.assertTrue(
 535 |           instruction.check_following(self.TEST_INCLUDE_KEYWORD_MESSAGE_1))
 536 | 
 537 |     instruction.build_description(keywords=self.KEYWORDS)
 538 |     with self.subTest(f'test {self.TEST_INCLUDE_KEYWORD_MESSAGE_2}'):
 539 |       self.assertFalse(
 540 |           instruction.check_following(self.TEST_INCLUDE_KEYWORD_MESSAGE_2))
 541 | 
  # NOTE: "FREQUNECY" in the two message names is a misspelling, but
  # test_keyword_frequency_checker refers to them by exactly these names.
  # The word "keyword" three times in different letter cases, comma separated.
  TEST_KEYWORD_FREQUNECY_MESSAGE_1 = """
  keyword, Keyword, KEYWORD
  """
  # Keyword padded with surrounding whitespace.
  TEST_KEYWORD_FREQUENCY_KEYWORD_1 = '  keyword '

  # The word three times in different letter cases, each prefixed by "*".
  TEST_KEYWORD_FREQUNECY_MESSAGE_2 = """
    *keyword
    *Keyword
    *KEYWORD
  """
  TEST_KEYWORD_FREQUENCY_KEYWORD_2 = 'KEYWORD'
 553 | 
 554 |   def test_keyword_frequency_checker(self):
 555 |     """Test the frequency of keywords."""
 556 | 
 557 |     instruction_id = 'keywords:keyword_frequency'
 558 |     instruction = instructions.KeywordFrequencyChecker(instruction_id)
 559 | 
 560 |     frequency = 4
 561 |     instruction.build_description(keyword=self.TEST_KEYWORD_FREQUENCY_KEYWORD_1,
 562 |                                   frequency=frequency,
 563 |                                   relation=instructions._COMPARISON_RELATION[0])
 564 |     with self.subTest(
 565 |         f'test {self.TEST_KEYWORD_FREQUENCY_KEYWORD_1} {frequency}'):
 566 |       self.assertTrue(
 567 |           instruction.check_following(self.TEST_KEYWORD_FREQUNECY_MESSAGE_1))
 568 | 
 569 |     frequency = 3
 570 |     instruction.build_description(keyword=self.TEST_KEYWORD_FREQUENCY_KEYWORD_1,
 571 |                                   frequency=frequency,
 572 |                                   relation=instructions._COMPARISON_RELATION[1])
 573 |     with self.subTest(
 574 |         f'test {self.TEST_KEYWORD_FREQUENCY_KEYWORD_1} {frequency}'):
 575 |       self.assertTrue(
 576 |           instruction.check_following(self.TEST_KEYWORD_FREQUNECY_MESSAGE_1))
 577 | 
 578 |     frequency = 4
 579 |     instruction.build_description(keyword=self.TEST_KEYWORD_FREQUENCY_KEYWORD_2,
 580 |                                   frequency=frequency,
 581 |                                   relation=instructions._COMPARISON_RELATION[1])
 582 |     with self.subTest(
 583 |         f'test {self.TEST_KEYWORD_FREQUENCY_KEYWORD_2} {frequency}'):
 584 |       self.assertFalse(
 585 |           instruction.check_following(self.TEST_KEYWORD_FREQUNECY_MESSAGE_2))
 586 | 
  # Seven whitespace-separated tokens written with mixed case and digits.
  TEST_NUM_WORDS_MESSAGE_1 = """
  d3sCRi7 lArge lAnguagE M0del w1tH 20 w0RdS."""

  # Longer two-line sentence written with digits substituted for letters.
  TEST_NUM_WORDS_MESSAGE_2 = """
  L4RGE L4NGU4GE M0DEL: AI syst3m th4t und3rstands, g3n3r4tes, or tr4nsforms
  l4ngu4g3 b4s3d on pr3vious l3arning & d4t4."""
 593 | 
 594 |   def test_num_words_checker(self):
 595 |     """Test the checker on the number of words."""
 596 |     instruction_id = 'length_constraint:number_words'
 597 |     instruction = instructions.NumberOfWords(instruction_id)
 598 | 
 599 |     word_counts = 8
 600 |     instruction.build_description(num_words=word_counts,
 601 |                                   relation=instructions._COMPARISON_RELATION[0])
 602 |     with self.subTest(
 603 |         f'test {self.TEST_NUM_WORDS_MESSAGE_1} {word_counts}'):
 604 |       self.assertTrue(
 605 |           instruction.check_following(self.TEST_NUM_WORDS_MESSAGE_1))
 606 | 
 607 |     word_counts = 16
 608 |     instruction.build_description(num_words=word_counts,
 609 |                                   relation=instructions._COMPARISON_RELATION[0])
 610 |     with self.subTest(
 611 |         f'test {self.TEST_NUM_WORDS_MESSAGE_2} less than {word_counts}'):
 612 |       self.assertFalse(
 613 |           instruction.check_following(self.TEST_NUM_WORDS_MESSAGE_2))
 614 | 
 615 |     word_counts = 16
 616 |     instruction.build_description(num_words=word_counts,
 617 |                                   relation=instructions._COMPARISON_RELATION[1])
 618 |     with self.subTest(
 619 |         f'test {self.TEST_NUM_WORDS_MESSAGE_2} at least {word_counts}'):
 620 |       self.assertTrue(
 621 |           instruction.check_following(self.TEST_NUM_WORDS_MESSAGE_2))
 622 | 
  # Three blank-line separated paragraphs; the second starts with "I".
  PARAGRAPH_FIRST_WORD_TEST_1 = """
  paragraph 1

  I paragraph 2

  paragraph 3"""

  # Only two paragraphs; the second starts with "I".
  PARAGRAPH_FIRST_WORD_TEST_2 = """
  paragraph 1

  I paragraph 2"""

  # Three paragraphs; the second starts with "fail" instead of "I".
  PARAGRAPH_FIRST_WORD_TEST_3 = """
  paragraph 1

  fail paragraph 2

  paragraph 3"""

  # Five paragraphs; the last one starts with "Haha,".
  PARAGRAPH_FIRST_WORD_TEST_4 = """
  Wow this is a very long response.

  I can't believe there is more than three paragraphs.

  Really more than three? No way!

  I can't believe it but I guess I am living proof.

  Haha, you go that right."""

  # Five paragraphs; the third is quoted and starts with '"Really?!'.
  PARAGRAPH_FIRST_WORD_TEST_5 = """
  Wow this is a very long response.

  I can't believe there is more than three paragraphs.

  "Really?! more than three? No way!"

  I can't believe it but I guess I am living proof.

  Haha, you go that right."""

  # Five paragraphs; the third starts with "Rea!lly" (punctuation inside the
  # word).
  PARAGRAPH_FIRST_WORD_TEST_6 = """
  Wow this is a very long response.

  I can't believe there is more than three paragraphs.

  Rea!lly more than three? No way!

  I can't believe it but I guess I am living proof.

  Haha, you go that right."""
 674 | 
 675 |   def test_paragraph_first_word(self):
 676 |     """Test number of paragraphs and first word of nth paragraph."""
 677 |     instruction_id = 'length_constraints:nth_paragraph_first_word'
 678 |     instruction = instructions.ParagraphFirstWordCheck(instruction_id)
 679 |     tests = [
 680 |         self.PARAGRAPH_FIRST_WORD_TEST_1,
 681 |         self.PARAGRAPH_FIRST_WORD_TEST_2,
 682 |         self.PARAGRAPH_FIRST_WORD_TEST_3,
 683 |         self.PARAGRAPH_FIRST_WORD_TEST_4,
 684 |         self.PARAGRAPH_FIRST_WORD_TEST_5,
 685 |         self.PARAGRAPH_FIRST_WORD_TEST_6,
 686 |     ]
 687 | 
 688 |     for test in tests:
 689 |       if (test == self.PARAGRAPH_FIRST_WORD_TEST_1
 690 |           or test == self.PARAGRAPH_FIRST_WORD_TEST_2
 691 |           or test == self.PARAGRAPH_FIRST_WORD_TEST_3):
 692 |         num_paragraphs = 3
 693 |         nth_paragraph = 2
 694 |         first_word = 'I'
 695 |       elif test == self.PARAGRAPH_FIRST_WORD_TEST_4:
 696 |         num_paragraphs = 5
 697 |         nth_paragraph = 5
 698 |         first_word = 'haha'
 699 |       else:
 700 |         num_paragraphs = 5
 701 |         nth_paragraph = 3
 702 |         first_word = 'Really'
 703 | 
 704 |       instruction.build_description(
 705 |           num_paragraphs=num_paragraphs,
 706 |           nth_paragraph=nth_paragraph,
 707 |           first_word=first_word,
 708 |       )
 709 |       with self.subTest(
 710 |           f'test {test} \n. Test for '
 711 |           f'{num_paragraphs} paragraphs and '
 712 |           f'for paragraph {nth_paragraph} '
 713 |           f'{first_word} is first word'
 714 |       ):
 715 |         if (test == self.PARAGRAPH_FIRST_WORD_TEST_1
 716 |             or test == self.PARAGRAPH_FIRST_WORD_TEST_4
 717 |             or test == self.PARAGRAPH_FIRST_WORD_TEST_5):
 718 |           self.assertTrue(instruction.check_following(test))
 719 |         else:
 720 |           self.assertFalse(instruction.check_following(test))
 721 | 
  # Contains two of the key sentences: "Puppies are fun." and
  # "Puppies love to run, jump, and play fetch.".
  TEST_KEY_SENTENCES_1 = """
  Puppies are fun. They are playful, energetic, and always up for a good time.
Puppies love to run, jump, and play fetch. They are also very good at
cuddling and giving kisses. If you are looking for a fun and loving pet,
a puppy is a great choice.
  """

  # Contains one key sentence: "I like to eat candy.".
  TEST_KEY_SENTENCES_2 = """
  I like to eat candy. When I'm feeling happy, sad, or even angry, candy
always makes me feel better. I like to share candy with my friends and
family. It's a great way to show them how much I care.
  """

  # Contains none of the key sentences.
  TEST_KEY_SENTENCES_3 = """
I know that candy isn't the healthiest thing to eat, but I don't care.
I love it too much. I'll just have to make sure to eat it in moderation.
  """

  # Set of sentences handed to KeySentenceChecker in test_key_sentences.
  key_sentences = {'Puppies love to run, jump, and play fetch.',
                   'I like to eat candy.', 'Puppies are fun.'}
 742 | 
 743 |   def test_key_sentences(self):
 744 |     """Test the inclusion of key sentences."""
 745 |     instruction_id = 'keywords:key_sentences'
 746 |     instruction = instructions.KeySentenceChecker(instruction_id)
 747 | 
 748 |     num_sentences = 2
 749 |     instruction.build_description(
 750 |         key_sentences=self.key_sentences, num_sentences=num_sentences)
 751 | 
 752 |     with self.subTest(f'test {self.TEST_KEY_SENTENCES_1}'):
 753 |       self.assertTrue(instruction.check_following(self.TEST_KEY_SENTENCES_1))
 754 | 
 755 |     num_sentences = 1
 756 |     instruction.build_description(
 757 |         key_sentences=self.key_sentences, num_sentences=num_sentences)
 758 | 
 759 |     with self.subTest(f'test {self.TEST_KEY_SENTENCES_2}'):
 760 |       self.assertTrue(instruction.check_following(self.TEST_KEY_SENTENCES_2))
 761 | 
 762 |     with self.subTest(f'test {self.TEST_KEY_SENTENCES_3}'):
 763 |       self.assertFalse(instruction.check_following(self.TEST_KEY_SENTENCES_3))
 764 | 
  # Contains the lower-case words "power" and "become".
  TEST_FORBIDDEN_WORDS_MESSAGE_1 = """
  The Nazis came to power in 1933 through a combination of legal and illegal
  means. Hitler was appointed chancellor by President Paul von Hindenburg, and
  the Nazis quickly consolidated their power by passing a series of laws that
  restricted the rights of opposition parties and individuals. By 1934, Hitler
  had become dictator of Germany.
  """

  # Contains none of the words in any of the FORBIDDEN_WORDS_* tuples.
  TEST_FORBIDDEN_WORDS_MESSAGE_2 = """
  Dinosaurs were a diverse group of reptiles that dominated the Earth for over
  160 million years. They came in all shapes and sizes, from the tiny
  Compsognathus to the massive Argentinosaurus. Dinosaurs were the most
  successful land animals on Earth until they went extinct about 66 million
  years ago. The exact cause of their extinction is still unknown, but it
  is thought to have been a combination of factors, including an asteroid
  impact and climate change.
  """

  # Contains "Google" and "text", plus "Generative" and "Transformer", which
  # merely start with the FORBIDDEN_WORDS_3 entries.
  TEST_FORBIDDEN_WORDS_MESSAGE_3 = """
  GPT, or Generative Pre-trained Transformer, is a family of neural network
  models that uses the transformer architecture. GPT models are trained on a
  massive dataset of text and code, and can be used for a variety of tasks,
  including text generation, translation, and question answering. GPT models
  have been shown to be very effective at these tasks, and are being used by
  a variety of companies and organizations like Google.
  """
  # Forbidden-word lists; all entries are upper case.
  FORBIDDEN_WORDS_1 = ('HOUSE', 'POWER', 'BECOME')
  FORBIDDEN_WORDS_2 = ('GOOGLE', 'TEXT')
  FORBIDDEN_WORDS_3 = ('GENE', 'TRANSFORM')
 794 | 
 795 |   def test_forbidden_words(self):
 796 |     """Test the exclusion of key words."""
 797 |     instruction_id = 'keywords:forbidden_words'
 798 |     instruction = instructions.ForbiddenWords(instruction_id)
 799 | 
 800 |     instruction.build_description(forbidden_words=self.FORBIDDEN_WORDS_1)
 801 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_1}\n ' +
 802 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_1}. '):
 803 |       self.assertFalse(
 804 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_1))
 805 | 
 806 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_2}\n ' +
 807 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_1}. '):
 808 |       self.assertTrue(
 809 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_2))
 810 | 
 811 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_3}\n ' +
 812 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_1}. '):
 813 |       self.assertTrue(
 814 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_3))
 815 | 
 816 |     instruction.build_description(forbidden_words=self.FORBIDDEN_WORDS_2)
 817 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_1}\n ' +
 818 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '):
 819 |       self.assertTrue(
 820 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_1))
 821 | 
 822 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_2}\n ' +
 823 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '):
 824 |       self.assertTrue(
 825 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_2))
 826 | 
 827 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_3}\n ' +
 828 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '):
 829 |       self.assertFalse(
 830 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_3))
 831 | 
 832 |     instruction.build_description(forbidden_words=self.FORBIDDEN_WORDS_3)
 833 |     with self.subTest(f'test {self.TEST_FORBIDDEN_WORDS_MESSAGE_3}\n ' +
 834 |                       f'with forbidden words: {self.FORBIDDEN_WORDS_2}. '):
 835 |       self.assertTrue(
 836 |           instruction.check_following(self.TEST_FORBIDDEN_WORDS_MESSAGE_3))
 837 | 
  # Original paragraphs handed to RephraseParagraph as `original_paragraph`.
  TEST_ORIGINAL_PARAGRAPH_1 = """
  The sun is shining brightly today, and the birds are singing in the trees.
  It's a beautiful day to be outside, so I decided to go for a walk.
  As I walked, I took in the fresh air and the warm sunshine.
  I felt happy and relaxed, and I was grateful for the beautiful day
  """

  TEST_ORIGINAL_PARAGRAPH_2 = """
  Google is a global technology company that specializes in Internet-related
  services and products. It is one of the most successful companies in the
  world, and its products are used by billions of people every day. Google's
  mission is to organize the world's information and make it universally
  accessible and useful.
  """

  # Candidate rephrasings: 1 and 2 are checked against original 1, 3 and 4
  # against original 2 (see test_rephrase_paragraph).
  TEST_REPHRASED_PARAGRAPH_1 = """
  On a beautiful day, I went for a walk. The sun shone and birds sang.
  I enjoyed the fresh air and warm sun.
  I felt happy and grateful for the lovely day.
  """

  TEST_REPHRASED_PARAGRAPH_2 = """
  The weather was lovely, so I went for a walk. I enjoyed the
  fresh air and warm sun. It was a beautiful day, and I felt happy and grateful.
  """

  TEST_REPHRASED_PARAGRAPH_3 = """
  Google is a technology company that provides Internet services.
  It aims to organize the world's information and make it universally
  accessible and useful.
  """

  # Unrelated sentence.
  TEST_REPHRASED_PARAGRAPH_4 = """
  I like candy.
  """
 873 | 
 874 |   def test_rephrase_paragraph(self):
 875 |     """Test the rephrasing of paragraph."""
 876 |     instruction_id = 'detectable_content:rephrase_paragraph'
 877 |     instruction = instructions.RephraseParagraph(instruction_id)
 878 |     low, high = 20, 30
 879 |     instruction.build_description(
 880 |         low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_1)
 881 | 
 882 |     with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_1} to ' +
 883 |                       f'have between {low} and {high} same words.'):
 884 |       self.assertTrue(
 885 |           instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_1))
 886 | 
 887 |     low, high = 20, 25
 888 |     instruction.build_description(
 889 |         low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_1)
 890 | 
 891 |     with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_1} to ' +
 892 |                       f'have between {low} and {high} same words.'):
 893 |       self.assertTrue(
 894 |           instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_2))
 895 | 
 896 |     low, high = 15, 20
 897 |     instruction.build_description(
 898 |         low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_2)
 899 | 
 900 |     with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_2} to ' +
 901 |                       f'have between {low} and {high} same words.'):
 902 |       self.assertFalse(
 903 |           instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_3))
 904 | 
 905 |     low, high = 0, 5
 906 |     instruction.build_description(
 907 |         low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_2)
 908 | 
 909 |     with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_2} to ' +
 910 |                       f'have between {low} and {high} same words.'):
 911 |       self.assertTrue(
 912 |           instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_4))
 913 | 
 914 |     low, high = 1, 5
 915 |     instruction.build_description(
 916 |         low=low, high=high, original_paragraph=self.TEST_ORIGINAL_PARAGRAPH_2)
 917 | 
 918 |     with self.subTest(f'test {self.TEST_ORIGINAL_PARAGRAPH_2} to ' +
 919 |                       f'have between {low} and {high} same words.'):
 920 |       self.assertFalse(
 921 |           instruction.check_following(self.TEST_REPHRASED_PARAGRAPH_4))
 922 | 
  # Two different responses separated by one "******" line.
  TEST_TWO_RESPONSES_1 = """
  This is response 1.
  ******
  This is response 2.
  """

  # The same response text on both sides of the separator.
  TEST_TWO_RESPONSES_2 = """
  This is response 1.
  ******
  This is response 1.
  """

  # Three responses separated by two separators.
  TEST_TWO_RESPONSES_3 = """
  This is response 1.
  ******
  This is response 2.
  ******
  This is response 3.
  """

  # Two responses, each wrapped in its own pair of separators, so two
  # separators are adjacent in the middle.
  TEST_TWO_RESPONSES_4 = """
  ******
  Response 1.
  ******
  ******
  Response 2.
  ******
  """

  # Two responses with a separator before, between and after them.
  TEST_TWO_RESPONSES_5 = """
  ******
  Response 1
  ******
  Response 2
  ******
  """
 959 | 
 960 |   def test_two_responses(self):
 961 |     """Test that two responses are given."""
 962 |     instruction_id = 'combination:two_responses'
 963 |     instruction = instructions.TwoResponsesChecker(instruction_id)
 964 |     instruction.build_description()
 965 | 
 966 |     with self.subTest(f'test {self.TEST_TWO_RESPONSES_1}'):
 967 |       self.assertTrue(instruction.check_following(self.TEST_TWO_RESPONSES_1))
 968 | 
 969 |     with self.subTest(f'test {self.TEST_TWO_RESPONSES_2}'):
 970 |       self.assertFalse(instruction.check_following(self.TEST_TWO_RESPONSES_2))
 971 | 
 972 |     with self.subTest(f'test {self.TEST_TWO_RESPONSES_3}'):
 973 |       self.assertFalse(instruction.check_following(self.TEST_TWO_RESPONSES_3))
 974 | 
 975 |     with self.subTest(f'test {self.TEST_TWO_RESPONSES_4}'):
 976 |       self.assertFalse(instruction.check_following(self.TEST_TWO_RESPONSES_4))
 977 | 
 978 |     with self.subTest(f'test {self.TEST_TWO_RESPONSES_5}'):
 979 |       self.assertTrue(instruction.check_following(self.TEST_TWO_RESPONSES_5))
 980 | 
  # Fixtures for `test_prompt_repeat_answer`.

  # The request handed to the checker as `prompt_to_repeat`.
  PROMPT_TO_REPEAT = 'Write a CL description.'

  # Full prompt text; `test_prompt_repeat_answer` only interpolates it into
  # its sub-test labels.
  TEST_PROMPT_1 = """Write a CL description. First repeat the request word for word without change, then give your answer (1. do not say any words or characters before repeating the request; 2. the request you need to repeat does not include this sentence)"""

  # Begins with the request and then continues (expected to be accepted).
  TEST_PROMPT_ANSWER_1 = """Write a CL description. Hi, Le and TJ, please
  check this out. Thanks.
  """
  # Has other text in front of the request (expected to be rejected).
  TEST_PROMPT_ANSWER_2 = """Hi, Le and TJ. Write a CL description. Thanks.
  """
 990 | 
 991 |   def test_prompt_repeat_answer(self):
 992 |     """Test that prompt is repeated then anwered."""
 993 |     instruction_id = 'combination:repeat_prompt'
 994 |     instruction = instructions.RepeatPromptThenAnswer(instruction_id)
 995 | 
 996 |     instruction.build_description(prompt_to_repeat=self.PROMPT_TO_REPEAT)
 997 |     with self.subTest(f'test {self.TEST_PROMPT_ANSWER_1}' +
 998 |                       f' with prompt: {self.TEST_PROMPT_1}'):
 999 |       self.assertTrue(instruction.check_following(self.TEST_PROMPT_ANSWER_1))
1000 | 
1001 |     with self.subTest(f'test {self.TEST_PROMPT_ANSWER_2}' +
1002 |                       f' with prompt: {self.TEST_PROMPT_1}'):
1003 |       self.assertFalse(instruction.check_following(self.TEST_PROMPT_ANSWER_2))
1004 | 
  # Fixtures for `test_end_checker`. TEST_END_CHECKER_n is checked against
  # END_PHRASE_n; the test expects pairs 1 and 2 to pass and pair 3 to fail.

  TEST_END_CHECKER_1 = """
  The answer is 7. Any more questions?
  """

  TEST_END_CHECKER_2 = """
  At the end of this prompt I am required to say that this is the end.
  """

  # Here the phrase opens the response instead of closing it.
  TEST_END_CHECKER_3 = """
  This will fail. Paris is cool.
  """

  END_PHRASE_1 = """
  Any more questions?
  """

  END_PHRASE_2 = """
  This is the end.
  """

  END_PHRASE_3 = """
  This will fail.
  """
1028 | 
1029 |   def test_end_checker(self):
1030 |     """Check the end of the prompt."""
1031 |     instruction_id = 'startend:end_checker'
1032 |     instruction = instructions.EndChecker(instruction_id)
1033 |     instruction.build_description(end_phrase=self.END_PHRASE_1)
1034 |     with self.subTest(f'test {self.TEST_END_CHECKER_1}'):
1035 |       self.assertTrue(instruction.check_following(self.TEST_END_CHECKER_1))
1036 | 
1037 |     instruction.build_description(end_phrase=self.END_PHRASE_2)
1038 |     with self.subTest(f'test {self.TEST_END_CHECKER_2}'):
1039 |       self.assertTrue(instruction.check_following(self.TEST_END_CHECKER_2))
1040 | 
1041 |     instruction.build_description(end_phrase=self.END_PHRASE_3)
1042 |     with self.subTest(f'test {self.TEST_END_CHECKER_3}'):
1043 |       self.assertFalse(instruction.check_following(self.TEST_END_CHECKER_3))
1044 | 
  # Fixtures for `test_title_checker`, which looks for a `<<title>>`. The test
  # expects messages 1 and 2 to be accepted and messages 3 and 4 rejected.

  # Title on the first line.
  TEST_TITLE_MESSAGE_1 = """
  <<Song of Joy>>
  La la la. Happy song.
  """

  # Title on the last line.
  TEST_TITLE_MESSAGE_2 = """
  Is it fine for title to be at the end?
  <<This is the title>>
  """
  # Angle brackets with only whitespace between them.
  TEST_TITLE_MESSAGE_3 = """
  << >>
  There is no title.
  """

  # Angle brackets that span two lines.
  TEST_TITLE_MESSAGE_4 = """
  <<This is not a title.
  This is a paragraph.>>
  """
1063 | 
1064 |   def test_title_checker(self):
1065 |     """Check the prompt for a title."""
1066 |     instruction_id = 'detectable_format:title'
1067 |     instruction = instructions.TitleChecker(instruction_id)
1068 |     instruction.build_description()
1069 |     with self.subTest(f'test {self.TEST_TITLE_MESSAGE_1}'):
1070 |       self.assertTrue(instruction.check_following(self.TEST_TITLE_MESSAGE_1))
1071 |     with self.subTest(f'test {self.TEST_TITLE_MESSAGE_2}'):
1072 |       self.assertTrue(instruction.check_following(self.TEST_TITLE_MESSAGE_2))
1073 | 
1074 |     with self.subTest(f'test {self.TEST_TITLE_MESSAGE_3}'):
1075 |       self.assertFalse(instruction.check_following(self.TEST_TITLE_MESSAGE_3))
1076 |     with self.subTest(f'test {self.TEST_TITLE_MESSAGE_4}'):
1077 |       self.assertFalse(instruction.check_following(self.TEST_TITLE_MESSAGE_4))
1078 | 
  # Fixtures for the letter-frequency checker tests. Each message is written
  # around one particular letter (T, a/A and P respectively).

  TEST_LETTER_FREQUENCY_MESSAGE_1 = """
  There is the T. Four T's.
  """

  TEST_LETTER_FREQUENCY_MESSAGE_2 = """
  asdfghjkl!!aA
  """

  TEST_LETTER_FREQUENCY_MESSAGE_3 = """
  The letter P appears 3 times in this message.
    """
1090 | 
1091 |   def test_letter_frequency_checker(self):
1092 |     """Test the frequency of letters."""
1093 |     instruction_id = 'keywords:letter_frequency'
1094 |     instruction = instructions.LetterFrequencyChecker(instruction_id)
1095 | 
1096 |     letter = 'T'
1097 |     frequency = 4
1098 |     instruction.build_description(
1099 |         letter=letter,
1100 |         let_frequency=frequency,
1101 |         let_relation=instructions._COMPARISON_RELATION[1],
1102 |     )
1103 |     with self.subTest(f'test {self.TEST_LETTER_FREQUENCY_MESSAGE_1}'):
1104 |       self.assertTrue(
1105 |           instruction.check_following(self.TEST_LETTER_FREQUENCY_MESSAGE_1)
1106 |       )
1107 | 
1108 |     letter = 'a'
1109 |     frequency = 4
1110 |     instruction.build_description(
1111 |         letter=letter,
1112 |         let_frequency=frequency,
1113 |         let_relation=instructions._COMPARISON_RELATION[0],
1114 |     )
1115 |     with self.subTest(f'test {self.TEST_LETTER_FREQUENCY_MESSAGE_2}'):
1116 |       self.assertTrue(
1117 |           instruction.check_following(self.TEST_LETTER_FREQUENCY_MESSAGE_2)
1118 |       )
1119 | 
1120 |     letter = 'p'
1121 |     frequency = 4
1122 |     instruction.build_description(
1123 |         letter=letter,
1124 |         let_frequency=frequency,
1125 |         let_relation=instructions._COMPARISON_RELATION[1],
1126 |     )
1127 |     with self.subTest(f'test {self.TEST_LETTER_FREQUENCY_MESSAGE_2}'):
1128 |       self.assertFalse(
1129 |           instruction.check_following(self.TEST_LETTER_FREQUENCY_MESSAGE_2)
1130 |       )
1131 | 
  # Fixtures for `test_english_capital_checker`: the first message is written
  # entirely in capital letters (expected to pass), the second only
  # capitalizes the first letter of each word (expected to fail).
  TEST_ENGLISH_CAPITAL_1 = """
  THIS IS AN ENGLISH SENTENCE. EVERY LETTER IS CAPITALIZED!!! AMAZING.
  """

  TEST_ENGLISH_CAPITAL_2 = """
  Every Word Is Capitalized.
  """
1139 | 
1140 |   def test_english_capital_checker(self):
1141 |     """Test that letters are all capitalized."""
1142 |     instruction_id = 'change_case:english_capital'
1143 |     instruction = instructions.CapitalLettersEnglishChecker(instruction_id)
1144 |     instruction.build_description()
1145 |     with self.subTest(f'test {self.TEST_ENGLISH_CAPITAL_1}'):
1146 |       self.assertTrue(instruction.check_following(self.TEST_ENGLISH_CAPITAL_1))
1147 | 
1148 |     with self.subTest(f'test {self.TEST_ENGLISH_CAPITAL_2}'):
1149 |       self.assertFalse(instruction.check_following(self.TEST_ENGLISH_CAPITAL_2))
1150 | 
  # Fixtures for `test_english_lowercase_checker`: the first message has no
  # capital letters (expected to pass), the second starts with a capital "A"
  # (expected to fail).
  TEST_ENGLISH_LOWERCASE_1 = """
  every letter is lowercase.
  """

  TEST_ENGLISH_LOWERCASE_2 = """
  Almost every letter is lowercase.
  """
1158 | 
1159 |   def test_english_lowercase_checker(self):
1160 |     """Test that letters are all capitalized."""
1161 |     instruction_id = 'change_case:english_lowercase'
1162 |     instruction = instructions.LowercaseLettersEnglishChecker(instruction_id)
1163 |     instruction.build_description()
1164 |     with self.subTest(f'test {self.TEST_ENGLISH_LOWERCASE_1}'):
1165 |       self.assertTrue(
1166 |           instruction.check_following(self.TEST_ENGLISH_LOWERCASE_1)
1167 |       )
1168 | 
1169 |     with self.subTest(f'test {self.TEST_ENGLISH_LOWERCASE_2}'):
1170 |       self.assertFalse(
1171 |           instruction.check_following(self.TEST_ENGLISH_LOWERCASE_2)
1172 |       )
1173 | 
  # Fixtures for `test_comma`: the first message contains no comma (expected
  # to pass), the second contains one (expected to fail).
  TEST_COMMA_MESSAGE_1 = """
  Every sentence is short. There is no need for a comma.
  """

  TEST_COMMA_MESSAGE_2 = """
  Since the start of time, people have always found a way to punctuate.
  """
1181 | 
1182 |   def test_comma(self):
1183 |     instruction_id = 'punctuation:no_comma'
1184 |     instruction = instructions.CommaChecker(instruction_id)
1185 |     instruction.build_description()
1186 |     with self.subTest(f'test {self.TEST_COMMA_MESSAGE_1}'):
1187 |       self.assertTrue(instruction.check_following(self.TEST_COMMA_MESSAGE_1))
1188 |     with self.subTest(f'test {self.TEST_COMMA_MESSAGE_2}'):
1189 |       self.assertFalse(instruction.check_following(self.TEST_COMMA_MESSAGE_2))
1190 | 
  # Fixtures for `test_capital_word_frequency`. As the sentences themselves
  # state, the first holds three fully capitalized words ("FUlly" is mixed
  # case) and the second holds four.
  TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_1 = """
  HERE there are THREE FUlly CAPITAL words.
  """

  TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2 = """
  THERE are Four FULLY CAPITAL WORDS. Many Others Are Only Partially So.
  """
1198 | 
1199 |   def test_capital_word_frequency(self):
1200 |     instruction_id = 'change_case:capital_word_frequency'
1201 |     instruction = instructions.CapitalWordFrequencyChecker(instruction_id)
1202 | 
1203 |     capital_frequency = 3
1204 |     instruction.build_description(
1205 |         capital_frequency=capital_frequency,
1206 |         capital_relation=instructions._COMPARISON_RELATION[1],
1207 |     )
1208 |     with self.subTest(f'test {self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_1}'):
1209 |       self.assertTrue(
1210 |           instruction.check_following(
1211 |               self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_1
1212 |           )
1213 |       )
1214 | 
1215 |     capital_frequency = 5
1216 |     instruction.build_description(
1217 |         capital_frequency=capital_frequency,
1218 |         capital_relation=instructions._COMPARISON_RELATION[0],
1219 |     )
1220 |     with self.subTest(f'test {self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2}'):
1221 |       self.assertTrue(
1222 |           instruction.check_following(
1223 |               self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2
1224 |           )
1225 |       )
1226 | 
1227 |     capital_frequency = 4
1228 |     instruction.build_description(
1229 |         capital_frequency=capital_frequency,
1230 |         capital_relation=instructions._COMPARISON_RELATION[0],
1231 |     )
1232 |     with self.subTest(f'test {self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2}'):
1233 |       self.assertFalse(
1234 |           instruction.check_following(
1235 |               self.TEST_CAPITAL_WORD_FREQUENCY_MESSAGE_2
1236 |           )
1237 |       )
1238 | 
  # Fixtures for `test_quotation`: the first message is fully enclosed in
  # double quotes (expected to pass), the second has text after the closing
  # quote (expected to fail).
  TEST_QUOTATION_MESSAGE_1 = """
  "This entire message is wrapped in double quotation marks."
  """

  TEST_QUOTATION_MESSAGE_2 = """
  "This message is wrapped in double quotation marks." But not everything.
  """
1246 | 
1247 |   def test_quotation(self):
1248 |     instruction_id = 'startend:quotation'
1249 |     instruction = instructions.QuotationChecker(instruction_id)
1250 |     instruction.build_description()
1251 |     with self.subTest(f'test {self.TEST_QUOTATION_MESSAGE_1}'):
1252 |       self.assertTrue(
1253 |           instruction.check_following(self.TEST_QUOTATION_MESSAGE_1)
1254 |       )
1255 |     with self.subTest(f'test {self.TEST_QUOTATION_MESSAGE_2}'):
1256 |       self.assertFalse(
1257 |           instruction.check_following(self.TEST_QUOTATION_MESSAGE_2)
1258 |       )
1259 | 
  # Instruction id -> checker class. `test_get_instruction_args` iterates over
  # this mapping and instantiates each class with its id.
  INSTRUCTION_DICT = {
      'language:response_language': instructions.ResponseLanguageChecker,
      'length_constraints:number_sentences': instructions.NumberOfSentences,
      'length_constraints:number_paragraphs': instructions.ParagraphChecker,
      'length_constraints:number_words': instructions.NumberOfWords,
      'detectable_content:number_placeholders': instructions.PlaceholderChecker,
      'detectable_content:postscript': instructions.PostscriptChecker,
      'detectable_format:number_bullet_lists': instructions.BulletListChecker,
      'detectable_format:constrained_response': (
          instructions.ConstrainedResponseChecker),
      'detectable_format:number_highlighted_sections': (
          instructions.HighlightSectionChecker),
      'detectable_format:multiple_sections': instructions.SectionChecker,
      'detectable_format:json_format': instructions.JsonFormat,
  }
1275 | 
1276 |   def test_get_instruction_args(self):
1277 |     """Test getting instruction args."""
1278 |     for inst_id, inst_cls in self.INSTRUCTION_DICT.items():
1279 |       instruction = inst_cls(inst_id)
1280 |       inst_description = instruction.build_description()
1281 |       kwargs = instruction.get_instruction_args()
1282 |       # The keyword args can be None.
1283 |       if kwargs:
1284 |         inst_description_closed_loop = instruction.build_description(**kwargs)
1285 |         with self.subTest(f'test {inst_id}'):
1286 |           self.assertEqual(inst_description, inst_description_closed_loop)
1287 | 
1288 | 
# Hand control to absltest's runner when this file is executed as a script.
if __name__ == '__main__':
  absltest.main()
1291 | 


--------------------------------------------------------------------------------
/instruction_following_eval/instructions.py:
--------------------------------------------------------------------------------
   1 | # coding=utf-8
   2 | # Copyright 2024 The Google Research Authors.
   3 | #
   4 | # Licensed under the Apache License, Version 2.0 (the "License");
   5 | # you may not use this file except in compliance with the License.
   6 | # You may obtain a copy of the License at
   7 | #
   8 | #     http://www.apache.org/licenses/LICENSE-2.0
   9 | #
  10 | # Unless required by applicable law or agreed to in writing, software
  11 | # distributed under the License is distributed on an "AS IS" BASIS,
  12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 | # See the License for the specific language governing permissions and
  14 | # limitations under the License.
  15 | 
  16 | """Library of instructions."""
  17 | import collections
  18 | import json
  19 | import random
  20 | import re
  21 | import string
  22 | from typing import Dict, Optional, Sequence, Union
  23 | 
  24 | from absl import logging
  25 | import langdetect
  26 | 
  27 | from instruction_following_eval import instructions_util
  28 | 
  29 | 
# Type alias: None, or a dict keyed by str whose values are an int, a str or
# a sequence of str.
_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]

# Language table taken from instructions_util. ResponseLanguageChecker picks
# random keys from it and interpolates the matching value into its
# description.
_LANGUAGES = instructions_util.LANGUAGE_CODES

# The relational operation for comparison.
# Index 0 is checked with `<`, index 1 with `>=` (see
# NumberOfSentences.check_following).
_COMPARISON_RELATION = ("less than", "at least")

# The maximum number of sentences (upper bound of the randomly drawn
# threshold).
_MAX_NUM_SENTENCES = 20

# The number of placeholders (upper bound of the randomly drawn minimum).
_NUM_PLACEHOLDERS = 4

# The number of bullet lists (upper bound of the randomly drawn count).
_NUM_BULLETS = 5

# The options of constrained response.
_CONSTRAINED_RESPONSE_OPTIONS = (
    "My answer is yes.", "My answer is no.", "My answer is maybe.")

# The options of starter keywords; one is drawn at random when no starter is
# supplied.
_STARTER_OPTIONS = ("I would say", "My answer is", "I believe",
                    "In my opinion", "I think", "I reckon", "I feel",
                    "From my perspective", "As I see it", "According to me",
                    "As far as I'm concerned", "To my understanding",
                    "In my view", "My take on it is", "As per my perception")

# The options of ending keywords.
# TODO(jeffreyzhou) add more ending options
_ENDING_OPTIONS = ("Any other questions?",
                   "Is there anything else I can help with?")

# The number of highlighted sections (upper bound of the randomly drawn
# minimum).
_NUM_HIGHLIGHTED_SECTIONS = 4

# The section splitter keywords; one is drawn at random when none is supplied.
_SECTION_SPLITER = ("Section", "SECTION")

# The number of sections (upper bound of the randomly drawn count).
_NUM_SECTIONS = 5

# The number of paragraphs.
_NUM_PARAGRAPHS = 5

# The postscript marker.
_POSTSCRIPT_MARKER = ("P.S.", "P.P.S")

# The number of keywords.
_NUM_KEYWORDS = 2

# The occurrences of a single keyword.
_KEYWORD_FREQUENCY = 3

# The occurrences of a single letter.
_LETTER_FREQUENCY = 10

# The occurrences of words with all capital letters.
_ALL_CAPITAL_WORD_FREQUENCY = 20

# The number of words in the response.
_NUM_WORDS_LOWER_LIMIT = 100
_NUM_WORDS_UPPER_LIMIT = 500
  92 | 
  93 | 
  94 | class Instruction:
  95 |   """An instruction template."""
  96 | 
  97 |   def __init__(self, instruction_id):
  98 |     self.id = instruction_id
  99 | 
 100 |   def build_description(self, **kwargs):
 101 |     raise NotImplementedError("`build_description` not implemented.")
 102 | 
 103 |   def get_instruction_args(self):
 104 |     raise NotImplementedError("`get_instruction_args` not implemented.")
 105 | 
 106 |   def get_instruction_args_keys(self):
 107 |     raise NotImplementedError("`get_instruction_args_keys` not implemented.")
 108 | 
 109 |   def check_following(self, value):
 110 |     raise NotImplementedError("`check_following` not implemented.")
 111 | 
 112 | 
 113 | class ResponseLanguageChecker(Instruction):
 114 |   """Check the language of the entire response."""
 115 | 
 116 |   def build_description(self, *, language = None):
 117 |     """Build the instruction description.
 118 | 
 119 |     Args:
 120 |       language: A string representing the expected language of the response. The
 121 |         language has to comply to the 97 types defined in
 122 |         `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows
 123 |         ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes);
 124 |         for example, `en` for English, `zh` for Chinese, `fr` for French.
 125 | 
 126 |     Returns:
 127 |       A string representing the instruction description.
 128 |     """
 129 |     self._language = language
 130 |     if self._language is None:
 131 |       self._language = random.choice(list(_LANGUAGES.keys()))
 132 |     # TODO(tianjianlu): opens the description generation to more choices.
 133 |     self._description_pattern = (
 134 |         "Your ENTIRE response should be in {language} language, no other " +
 135 |         "language is allowed.")
 136 |     return self._description_pattern.format(language=_LANGUAGES[self._language])
 137 | 
 138 |   def get_instruction_args(self):
 139 |     """Returns the keyward args of `build_description`."""
 140 |     return {"language": self._language}
 141 | 
 142 |   def get_instruction_args_keys(self):
 143 |     """Returns the args keys of `build_description`."""
 144 |     return ["language"]
 145 | 
 146 |   def check_following(self, value):
 147 |     """Check if the language of the entire response follows the instruction.
 148 | 
 149 |     Args:
 150 |       value: A string representing the response.
 151 | 
 152 |     Returns:
 153 |       True if the language of `value` follows instruction; otherwise False.
 154 |     """
 155 |     assert isinstance(value, str)
 156 | 
 157 |     try:
 158 |       return langdetect.detect(value) == self._language
 159 |     except langdetect.LangDetectException as e:
 160 |       # Count as instruction is followed.
 161 |       logging.error(
 162 |           "Unable to detect language for text %s due to %s", value, e
 163 |       )  # refex: disable=pytotw.037
 164 |       return True
 165 | 
 166 | 
 167 | class NumberOfSentences(Instruction):
 168 |   """Check the number of sentences."""
 169 | 
 170 |   def build_description(self, *, num_sentences = None,
 171 |                         relation = None):
 172 |     """Build the instruction description.
 173 | 
 174 |     Args:
 175 |       num_sentences: An integer specifying the number of sentences as a
 176 |         threshold.
 177 |       relation: A string in (`less than`, `at least`), defining the relational
 178 |         operator for comparison.
 179 |         Two relational comparisons are supported for now:
 180 |         if 'less than', the actual number of sentences < the threshold;
 181 |         if 'at least', the actual number of sentences >= the threshold.
 182 | 
 183 |     Returns:
 184 |       A string representing the instruction description.
 185 |     """
 186 |     # The number of sentences as a threshold for comparison.
 187 |     self._num_sentences_threshold = num_sentences
 188 |     if (self._num_sentences_threshold is None or
 189 |         self._num_sentences_threshold < 0):
 190 |       self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES)
 191 | 
 192 |     if relation is None:
 193 |       self._comparison_relation = random.choice(_COMPARISON_RELATION)
 194 |     elif relation not in _COMPARISON_RELATION:
 195 |       raise ValueError("The supported relation for comparison must be in "
 196 |                        f"{_COMPARISON_RELATION}, but {relation} is given.")
 197 |     else:
 198 |       self._comparison_relation = relation
 199 | 
 200 |     self._description_pattern = (
 201 |         "Your response should contain {relation} {num_sentences} sentences.")
 202 |     return self._description_pattern.format(
 203 |         relation=self._comparison_relation,
 204 |         num_sentences=self._num_sentences_threshold)
 205 | 
 206 |   def get_instruction_args(self):
 207 |     """Returns the keyward args of `build_description`."""
 208 |     return {"num_sentences": self._num_sentences_threshold,
 209 |             "relation": self._comparison_relation}
 210 | 
 211 |   def get_instruction_args_keys(self):
 212 |     """Returns the args keys of `build_description`."""
 213 |     return ["num_sentences", "relation"]
 214 | 
 215 |   def check_following(self, value):
 216 |     """Check if the number of sentences follows the instruction.
 217 | 
 218 |     Args:
 219 |       value: A string representing the response.
 220 | 
 221 |     Returns:
 222 |       True if the response follows the instruction.
 223 | 
 224 |     Raise:
 225 |         ValueError if the string in `instruction_args` is not in
 226 |         [`less_than`, `at_least`].
 227 |     """
 228 |     num_sentences = instructions_util.count_sentences(value)
 229 |     if self._comparison_relation == _COMPARISON_RELATION[0]:
 230 |       return num_sentences < self._num_sentences_threshold
 231 |     elif self._comparison_relation == _COMPARISON_RELATION[1]:
 232 |       return num_sentences >= self._num_sentences_threshold  # pytype: disable=bad-return-type
 233 | 
 234 | 
 235 | class PlaceholderChecker(Instruction):
 236 |   """Check the placeholders in template writing."""
 237 | 
 238 |   def build_description(self, *, num_placeholders = None):
 239 |     """Build the instruction description.
 240 | 
 241 |     Args:
 242 |       num_placeholders: An integer denoting the minimum number of
 243 |         placeholders required in the response.
 244 | 
 245 |     Returns:
 246 |       A string representing the instruction description.
 247 |     """
 248 |     self._num_placeholders = num_placeholders
 249 |     if self._num_placeholders is None or self._num_placeholders < 0:
 250 |       self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
 251 |     self._description_pattern = (
 252 |         "The response must contain at least {num_placeholders} placeholders " +
 253 |         "represented by square brackets, such as [address].")
 254 |     return self._description_pattern.format(
 255 |         num_placeholders=self._num_placeholders)
 256 | 
 257 |   def get_instruction_args(self):
 258 |     """Returns the keyward args of `build_description`."""
 259 |     return {"num_placeholders": self._num_placeholders}
 260 | 
 261 |   def get_instruction_args_keys(self):
 262 |     """Returns the args keys of `build_description`."""
 263 |     return ["num_placeholders"]
 264 | 
 265 |   def check_following(self, value):
 266 |     """Check if the number of placeholders follows the instruction.
 267 | 
 268 |     Args:
 269 |       value: A string representing the response.
 270 | 
 271 |     Returns:
 272 |       True if the actual number of placeholders in the response is greater than
 273 |       or equal to `num_placeholders`; otherwise, False.
 274 |     """
 275 |     placeholders = re.findall(r"\[.*?\]", value)
 276 |     num_placeholders = len(placeholders)
 277 |     return num_placeholders >= self._num_placeholders
 278 | 
 279 | 
 280 | class BulletListChecker(Instruction):
 281 |   """Checks the bullet list in the prompt."""
 282 | 
 283 |   def build_description(self, *, num_bullets = None):
 284 |     """Build the instruction description.
 285 | 
 286 |     Args:
 287 |       num_bullets: An integer specifying the exact number of bullet lists
 288 |         that is required to appear in the response.
 289 | 
 290 |     Returns:
 291 |       A string representing the instruction description.
 292 |     """
 293 |     self._num_bullets = num_bullets
 294 |     if self._num_bullets is None or self._num_bullets < 0:
 295 |       self._num_bullets = random.randint(1, _NUM_BULLETS)
 296 |     self._description_pattern = (
 297 |         "Your answer must contain exactly {num_bullets} bullet points. " +
 298 |         "Use the markdown bullet points such as:\n" +
 299 |         "* This is point 1. \n" +
 300 |         "* This is point 2")
 301 |     return self._description_pattern.format(
 302 |         num_bullets=self._num_bullets)
 303 | 
 304 |   def get_instruction_args(self):
 305 |     """Returns the keyward args of `build_description`."""
 306 |     return {"num_bullets": self._num_bullets}
 307 | 
 308 |   def get_instruction_args_keys(self):
 309 |     """Returns the args keys of `build_description`."""
 310 |     return ["num_bullets"]
 311 | 
 312 |   def check_following(self, value):
 313 |     r"""Check if the number of bullet lists meets the requirement.
 314 | 
 315 |     Args:
 316 |       value: A string representing the response. The response is expected to
 317 |         contain some bullet lists that start with `\*`.
 318 | 
 319 |     Returns:
 320 |       True if the actual number of bullet lists in the response meets the
 321 |       requirement.
 322 |     """
 323 |     value = value.replace("\\n", "\n")
 324 |     bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE)
 325 |     bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE)
 326 |     bullet_lists_3 = re.findall(r"^\s*•.*$", value, flags=re.MULTILINE)
 327 |     bullet_lists_4 = re.findall(r"^\s*\d.*$", value, flags=re.MULTILINE)
 328 |     num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + len(bullet_lists_3) + len(bullet_lists_4)
 329 |     return num_bullet_lists == self._num_bullets
 330 | 
 331 | 
 332 | class ConstrainedResponseChecker(Instruction):
 333 |   """Checks the constrained response."""
 334 | 
 335 |   def build_description(self):
 336 |     """Build the instruction description."""
 337 |     # A sequence of string(s) representing the options of the expected response.
 338 |     self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS
 339 |     self._description_pattern = (
 340 |         "Answer with one of the following options: {response_options}")
 341 |     return self._description_pattern.format(
 342 |         response_options=self._constrained_responses)
 343 | 
 344 |   def get_instruction_args(self):
 345 |     """Returns the keyward args of `build_description`."""
 346 |     return None
 347 | 
 348 |   def get_instruction_args_keys(self):
 349 |     """Returns the args keys of `build_description`."""
 350 |     return []
 351 | 
 352 |   def check_following(self, value):
 353 |     """Checks if the response matches the constrained options.
 354 | 
 355 |     Args:
 356 |       value: A string representing the response.
 357 | 
 358 |     Returns:
 359 |       True if the actual response contains one of the options in the constrained
 360 |       responses; otherwise False.
 361 |     """
 362 |     value = value.strip()
 363 |     for constrained_response in self._constrained_responses:
 364 |       if constrained_response in value:
 365 |         return True
 366 |     return False
 367 | 
 368 | 
 369 | class ConstrainedStartChecker(Instruction):
 370 |   """Checks the response start."""
 371 | 
 372 |   def build_description(self, *, starter = None):
 373 |     """Build the instruction description.
 374 | 
 375 |     Args:
 376 |       starter: A string representing the keyward that the response should start
 377 |         with.
 378 | 
 379 |     Returns:
 380 |       A string representing the instruction description.
 381 |     """
 382 |     self._starter = starter.strip() if isinstance(starter, str) else starter
 383 |     if self._starter is None:
 384 |       self._starter = random.choice(_STARTER_OPTIONS)
 385 |     self._description_pattern = (
 386 |         "During the conversation, when it is your turn, " +
 387 |         "please always start with {starter}")
 388 |     return self._description_pattern.format(starter=self._starter)
 389 | 
 390 |   def get_instruction_args(self):
 391 |     """Returns the keyward args of `build_description`."""
 392 |     return {"starter": self._starter}
 393 | 
 394 |   def get_instruction_args_keys(self):
 395 |     """Returns the args keys of `build_description`."""
 396 |     return ["starter"]
 397 | 
 398 |   def check_following(self, value):
 399 |     """Checks if the response starts with the constrained keyword or phrase.
 400 | 
 401 |     Args:
 402 |       value: A string representing the response.
 403 | 
 404 |     Returns:
 405 |       True if the response starts with the given phrase or keyword that is
 406 |       contained in `instruction_args`; otherwise, False.
 407 |     """
 408 |     response_pattern = r"^\s*" + self._starter + r".*$"
 409 |     response_with_constrained_start = re.search(response_pattern, value,
 410 |                                                 flags=re.MULTILINE)
 411 |     return True if response_with_constrained_start else False
 412 | 
 413 | 
 414 | class HighlightSectionChecker(Instruction):
 415 |   """Checks the highlighted section."""
 416 | 
 417 |   def build_description(self, *, num_highlights = None):
 418 |     """Build the instruction description.
 419 | 
 420 |     Args:
 421 |       num_highlights: An integer specifying the minimum number of highlighted
 422 |         sections.
 423 | 
 424 |     Returns:
 425 |       A string representing the instruction description.
 426 |     """
 427 |     self._num_highlights = num_highlights
 428 |     if self._num_highlights is None or self._num_highlights < 0:
 429 |       self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
 430 | 
 431 |     self._description_pattern = (
 432 |         "Highlight at least {num_highlights} sections in your answer with " +
 433 |         "markdown, i.e. *highlighted section*.")
 434 | 
 435 |     return self._description_pattern.format(num_highlights=self._num_highlights)
 436 | 
 437 |   def get_instruction_args(self):
 438 |     """Returns the keyward args of `build_description`."""
 439 |     return {"num_highlights": self._num_highlights}
 440 | 
 441 |   def get_instruction_args_keys(self):
 442 |     """Returns the args keys of `build_description`."""
 443 |     return ["num_highlights"]
 444 | 
 445 |   def check_following(self, value):
 446 |     """Checks if the number of highlighted sections meets the requirement.
 447 | 
 448 |     Args:
 449 |       value: a string repesenting the response. The response is expected to
 450 |         contain highlighted sections in the format of *highlighted*.
 451 | 
 452 |     Returns:
 453 |       True if the actual number of highlighted sections in the format of
 454 |       *highlighed sections* meets the minimum requirement; otherwise False.
 455 |     """
 456 |     num_highlights = 0
 457 |     highlights = re.findall(r"\*[^\n\*]*\*", value)
 458 |     double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value)
 459 |     for highlight in highlights:
 460 |       if highlight.strip("*").strip():
 461 |         num_highlights += 1
 462 |     for highlight in double_highlights:
 463 |       if highlight.removeprefix("**").removesuffix("**").strip():
 464 |         num_highlights += 1
 465 | 
 466 |     return num_highlights >= self._num_highlights
 467 | 
 468 | 
 469 | class SectionChecker(Instruction):
 470 |   """Checks the sections."""
 471 | 
 472 |   def build_description(self, *, section_spliter = None,
 473 |                         num_sections = None):
 474 |     """Build the instruction description.
 475 | 
 476 |     Args:
 477 |       section_spliter: A string represents the section spliter keyword that
 478 |         marks a new section, i.e., `Section` or `SECTION`.
 479 |       num_sections: An integer specifying the number of sections.
 480 | 
 481 |     Returns:
 482 |       A string representing the instruction description.
 483 |     """
 484 |     self._section_spliter = section_spliter.strip() if isinstance(
 485 |         section_spliter, str) else section_spliter
 486 |     if self._section_spliter is None:
 487 |       self._section_spliter = random.choice(_SECTION_SPLITER)
 488 | 
 489 |     self._num_sections = num_sections
 490 |     if self._num_sections is None or self._num_sections < 0:
 491 |       self._num_sections = random.randint(1, _NUM_SECTIONS)
 492 | 
 493 |     self._description_pattern = (
 494 |         "Your response must have {num_sections} sections. Mark the beginning " +
 495 |         "of each section with {section_spliter} X, such as:\n" +
 496 |         "{section_spliter} 1\n" +
 497 |         "[content of section 1]\n" +
 498 |         "{section_spliter} 2\n" +
 499 |         "[content of section 2]")
 500 | 
 501 |     return self._description_pattern.format(
 502 |         num_sections=self._num_sections,
 503 |         section_spliter=self._section_spliter)
 504 | 
 505 |   def get_instruction_args(self):
 506 |     """Returns the keyward args of `build_description`."""
 507 |     return {"section_spliter": self._section_spliter,
 508 |             "num_sections": self._num_sections}
 509 | 
 510 |   def get_instruction_args_keys(self):
 511 |     """Returns the args keys of `build_description`."""
 512 |     return ["section_spliter", "num_sections"]
 513 | 
 514 |   def check_following(self, value):
 515 |     """Checks the response contains multiple sections.
 516 | 
 517 |     Args:
 518 |       value: A string representing the response. The response is expected
 519 |         to contain multiple sections (number of sections is greater than 1).
 520 |         A new section starts with `Section 1`, where the number denotes the
 521 |         section index.
 522 | 
 523 |     Returns:
 524 |       True if the number of sections in the response is greater than or equal to
 525 |       the minimum number of sections; otherwise, False.
 526 |     """
 527 |     section_splitter_patten = r"\s?" + self._section_spliter  + r"\s?\d+\s?"
 528 |     sections = re.split(section_splitter_patten, value)
 529 |     num_sections = len(sections) - 1
 530 |     return num_sections >= self._num_sections
 531 | 
 532 | 
 533 | class ParagraphChecker(Instruction):
 534 |   """Checks the paragraphs."""
 535 | 
 536 |   def build_description(self, *, num_paragraphs = None):
 537 |     """Build the instruction description.
 538 | 
 539 |     Args:
 540 |       num_paragraphs: An integer specifying the number of paragraphs.
 541 | 
 542 |     Returns:
 543 |       A string representing the instruction description.
 544 |     """
 545 |     self._num_paragraphs = num_paragraphs
 546 |     if self._num_paragraphs is None or self._num_paragraphs < 0:
 547 |       self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
 548 | 
 549 |     self._description_pattern = (
 550 |         "There should be {num_paragraphs} paragraphs. " +
 551 |         "Paragraphs are separated with the markdown divider: ***")
 552 | 
 553 |     return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
 554 | 
 555 |   def get_instruction_args(self):
 556 |     """Returns the keyward args of `build_description`."""
 557 |     return {"num_paragraphs": self._num_paragraphs}
 558 | 
 559 |   def get_instruction_args_keys(self):
 560 |     """Returns the args keys of `build_description`."""
 561 |     return ["num_paragraphs"]
 562 | 
 563 |   def check_following(self, value):
 564 |     """Checks the response contains required number of paragraphs.
 565 | 
 566 |     Args:
 567 |       value: A string representing the response. The response may contain
 568 |         paragraphs that are separated by the markdown divider: `***`.
 569 | 
 570 |     Returns:
 571 |       True if the actual number of paragraphs is the same as required;
 572 |       otherwise, False.
 573 |     """
 574 |     # Split based on *** (with optional spaces) or two or more newlines
 575 |     paragraphs = re.split(r"\s*\*\*\*\s*|\n{1,}", value.strip())
 576 | 
 577 |     num_paragraphs = len(paragraphs)
 578 |     
 579 |     for index, paragraph in enumerate(paragraphs):
 580 |         if not paragraph.strip():
 581 |             # If empty, reduce count if it's at start or end, otherwise return False
 582 |             if index == 0 or index == len(paragraphs) - 1:
 583 |                 num_paragraphs -= 1
 584 |             else:
 585 |                 return False
 586 |             
 587 |     return num_paragraphs == self._num_paragraphs
 588 | 
 589 | 
 590 | class PostscriptChecker(Instruction):
 591 |   """Checks the postscript."""
 592 | 
 593 |   def build_description(self, *, postscript_marker = None
 594 |                         ):
 595 |     """Build the instruction description.
 596 | 
 597 |     Args:
 598 |       postscript_marker: A string containing the keyword that marks the start
 599 |         of the postscript section.
 600 | 
 601 |     Returns:
 602 |       A string representing the instruction description.
 603 |     """
 604 |     self._postscript_marker = postscript_marker.strip() if isinstance(
 605 |         postscript_marker, str) else postscript_marker
 606 |     if self._postscript_marker is None:
 607 |       self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
 608 | 
 609 |     self._description_pattern = (
 610 |         "At the end of your response, please explicitly add a postscript " +
 611 |         "starting with {postscript}")
 612 | 
 613 |     return self._description_pattern.format(postscript=self._postscript_marker)
 614 | 
 615 |   def get_instruction_args(self):
 616 |     """Returns the keyward args of `build_description`."""
 617 |     return {"postscript_marker": self._postscript_marker}
 618 | 
 619 |   def get_instruction_args_keys(self):
 620 |     """Returns the args keys of `build_description`."""
 621 |     return ["postscript_marker"]
 622 | 
 623 |   def check_following(self, value):
 624 |     """Checks if the response follows the postscript format.
 625 | 
 626 |     Args:
 627 |       value: a string representing the response. The response is expected to
 628 |         contain a postscript section.
 629 | 
 630 |     Returns:
 631 |       True if the response contains a postscript section starting with
 632 |       the keyword containing in the `instruction_args`; otherwise False.
 633 |     """
 634 |     value = value.lower()
 635 |     if self._postscript_marker == "P.P.S":
 636 |       postscript_pattern = r"\s*p\.\s?p\.\s?s.*$"
 637 |     elif self._postscript_marker == "P.S.":
 638 |       postscript_pattern = r"\s*p\.\s?s\..*$"
 639 |     else:
 640 |       postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$"
 641 |     postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE)
 642 |     return True if postscript else False
 643 | 
 644 | 
 645 | class RephraseChecker(Instruction):
 646 |   """Checks the repharse."""
 647 | 
 648 |   def build_description(self, *, original_message):
 649 |     """Build the instruction description.
 650 | 
 651 |     Args:
 652 |       original_message: A string representing the original message. The
 653 |         rephrased response should only change its words/sentences in between
 654 |         its two asterisks, for example, *change me*. Both original and rephrased
 655 |         messages should contain the changes in the form of *change me*.
 656 | 
 657 |     Returns:
 658 |       A string representing the instruction description.
 659 |     """
 660 |     if not self.is_change(original_message):
 661 |       raise ValueError(f"Message {original_message} does not contain changes "
 662 |                        "in the form of *change me*.")
 663 | 
 664 |     self._reference_without_change = original_message
 665 |     self._description = ("Rephrasing: Your rephrased response should only" +
 666 |                          "change the words/sentences in between two asterisks" +
 667 |                          "such as *change me*.")
 668 |     return self._description
 669 | 
 670 |   def get_instruction_args(self):
 671 |     """Returns the keyward args of `build_description`."""
 672 |     return {"original_message": self._reference_without_change}
 673 | 
 674 |   def get_instruction_args_keys(self):
 675 |     """Returns the args keys of `build_description`."""
 676 |     return ["original_message"]
 677 | 
 678 |   def check_following(self, value):
 679 |     r"""Checks if the rephrasing follows the instruction.
 680 | 
 681 |     Args:
 682 |       value: A string representing the response, which is expected to rephras
 683 |         the string of `instruction_args`.
 684 | 
 685 |     Returns:
 686 |       True if `value` and `instruction_args` only differ by the words/sentences
 687 |       in between two asterisks such as *change me*; otherwise, False.
 688 |     """
 689 | 
 690 |     if not self.is_change(value):
 691 |       raise ValueError(f"value {value} does not contain "
 692 |                        "changes in the form of *change me*.")
 693 | 
 694 |     response_without_changes = self.strip_changes(value)
 695 |     reference_without_changes = self.strip_changes(
 696 |         self._reference_without_change)
 697 | 
 698 |     return response_without_changes == reference_without_changes
 699 | 
 700 |   def is_change(self, response):
 701 |     """Check if there is change in the response in the form of *change me*."""
 702 |     return re.search(r"\*.*\*", response)
 703 | 
 704 |   def strip_changes(self, response):
 705 |     """Strips off the changes."""
 706 |     return re.sub(r"\*.*\*", "", response)
 707 | 
 708 | 
 709 | class KeywordChecker(Instruction):
 710 |   """Check the exisitence of certain keywords."""
 711 | 
 712 |   def build_description(self, *, keywords = None
 713 |                         ):
 714 |     """Build the instruction description.
 715 | 
 716 |     Args:
 717 |       keywords: A sequence of strings representing the keywords that are
 718 |         expected in the response.
 719 | 
 720 |     Returns:
 721 |       A string representing the instruction description.
 722 |     """
 723 | 
 724 |     if not keywords:
 725 |       self._keywords = instructions_util.generate_keywords(
 726 |           num_keywords=_NUM_KEYWORDS)
 727 |     else:
 728 |       self._keywords = keywords
 729 |     self._keywords = sorted(self._keywords)
 730 | 
 731 |     self._description_pattern = ("Include keywords {keywords} in the response.")
 732 | 
 733 |     return self._description_pattern.format(keywords=self._keywords)
 734 | 
 735 |   def get_instruction_args(self):
 736 |     """Returns the keyward args of `build_description`."""
 737 |     return {"keywords": self._keywords}
 738 | 
 739 |   def get_instruction_args_keys(self):
 740 |     """Returns the args keys of `build_description`."""
 741 |     return ["keywords"]
 742 | 
 743 |   def check_following(self, value):
 744 |     """Check if the response contain the expected keywords."""
 745 |     for keyword in self._keywords:
 746 |       if not re.search(keyword, value, flags=re.IGNORECASE):
 747 |         return False
 748 |     return True
 749 | 
 750 | 
 751 | class KeywordFrequencyChecker(Instruction):
 752 |   """Check the keyword frequency."""
 753 | 
 754 |   def build_description(self, *, keyword = None,
 755 |                         frequency = None,
 756 |                         relation = None):
 757 |     """Build the instruction description.
 758 | 
 759 |     Args:
 760 |       keyword: A string representing a keyword that is expected in the response.
 761 |       frequency: An integer specifying the number of times `keyword` is expected
 762 |         to appear in the response.
 763 |       relation: A string in (`less than`, `at least`), defining the relational
 764 |         operator for comparison.
 765 |         Two relational comparisons are supported for now:
 766 |         if 'less than', the actual number of occurrences < frequency;
 767 |         if 'at least', the actual number of occurrences >= frequency.
 768 | 
 769 |     Returns:
 770 |       A string representing the instruction description.
 771 |     """
 772 |     if not keyword:
 773 |       self._keyword = instructions_util.generate_keywords(num_keywords=1)[0]
 774 |     else:
 775 |       self._keyword = keyword.strip()
 776 | 
 777 |     self._frequency = frequency
 778 |     if self._frequency is None or self._frequency < 0:
 779 |       self._frequency = random.randint(1, _KEYWORD_FREQUENCY)
 780 | 
 781 |     if relation is None:
 782 |       self._comparison_relation = random.choice(_COMPARISON_RELATION)
 783 |     elif relation not in _COMPARISON_RELATION:
 784 |       raise ValueError("The supported relation for comparison must be in "
 785 |                        f"{_COMPARISON_RELATION}, but {relation} is given.")
 786 |     else:
 787 |       self._comparison_relation = relation
 788 | 
 789 |     self._description_pattern = (
 790 |         "In your response, the word {keyword} should appear {relation} " +
 791 |         "{frequency} times.")
 792 | 
 793 |     return self._description_pattern.format(
 794 |         keyword=self._keyword,
 795 |         relation=self._comparison_relation,
 796 |         frequency=self._frequency)
 797 | 
 798 |   def get_instruction_args(self):
 799 |     """Returns the keyward args of `build_description`."""
 800 |     return {"keyword": self._keyword,
 801 |             "frequency": self._frequency,
 802 |             "relation": self._comparison_relation}
 803 | 
 804 |   def get_instruction_args_keys(self):
 805 |     """Returns the args keys of `build_description`."""
 806 |     return ["keyword", "frequency", "relation"]
 807 | 
 808 |   def check_following(self, value):
 809 |     """Checks if the response contain the keyword with required frequency."""
 810 |     actual_occurrences = len(re.findall(
 811 |         self._keyword, value, flags=re.IGNORECASE))
 812 | 
 813 |     if self._comparison_relation == _COMPARISON_RELATION[0]:
 814 |       return actual_occurrences < self._frequency
 815 |     elif self._comparison_relation == _COMPARISON_RELATION[1]:
 816 |       return actual_occurrences >= self._frequency  # pytype: disable=bad-return-type
 817 | 
 818 | 
 819 | class NumberOfWords(Instruction):
 820 |   """Checks the number of words."""
 821 | 
 822 |   def build_description(self, *, num_words = None,
 823 |                         relation = None):
 824 |     """Build the instruction description.
 825 | 
 826 |     Args:
 827 |       num_words: An integer specifying the number of words contained in the
 828 |         response.
 829 |       relation: A string in (`less than`, `at least`), defining the relational
 830 |         operator for comparison.
 831 |         Two relational comparisons are supported for now:
 832 |         if 'less than', the actual number of words < num_words;
 833 |         if 'at least', the actual number of words >= num_words.
 834 | 
 835 |     Returns:
 836 |       A string representing the instruction description.
 837 |     """
 838 | 
 839 |     self._num_words = num_words
 840 |     if self._num_words is None or self._num_words < 0:
 841 |       self._num_words = random.randint(
 842 |           _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT
 843 |       )
 844 | 
 845 |     if relation is None:
 846 |       self._comparison_relation = random.choice(_COMPARISON_RELATION)
 847 |     elif relation not in _COMPARISON_RELATION:
 848 |       raise ValueError("The supported relation for comparison must be in "
 849 |                        f"{_COMPARISON_RELATION}, but {relation} is given.")
 850 |     else:
 851 |       self._comparison_relation = relation
 852 | 
 853 |     self._description_pattern = (
 854 |         "Answer with {relation} {num_words} words.")
 855 | 
 856 |     return self._description_pattern.format(
 857 |         relation=self._comparison_relation,
 858 |         num_words=self._num_words)
 859 | 
 860 |   def get_instruction_args(self):
 861 |     """Returns the keyward args of `build_description`."""
 862 |     return {"num_words": self._num_words,
 863 |             "relation": self._comparison_relation}
 864 | 
 865 |   def get_instruction_args_keys(self):
 866 |     """Returns the args keys of `build_description`."""
 867 |     return ["num_words", "relation"]
 868 | 
 869 |   def check_following(self, value):
 870 |     """Checks if the response contains the expected number of words."""
 871 |     num_words = instructions_util.count_words(value)
 872 | 
 873 |     if self._comparison_relation == _COMPARISON_RELATION[0]:
 874 |       return num_words < self._num_words
 875 |     elif self._comparison_relation == _COMPARISON_RELATION[1]:
 876 |       return num_words >= self._num_words  # pytype: disable=bad-return-type
 877 | 
 878 | 
 879 | class JsonFormat(Instruction):
 880 |   """Check the Json format."""
 881 | 
 882 |   def build_description(self):
 883 |     self._description_pattern = (
 884 |         "Entire output should be wrapped in JSON format. You can use markdown"
 885 |         " ticks such as ```."
 886 |     )
 887 |     return self._description_pattern
 888 | 
 889 |   def get_instruction_args(self):
 890 |     """Returns the keyward args of `build_description`."""
 891 |     return None
 892 | 
 893 |   def get_instruction_args_keys(self):
 894 |     """Returns the args keys of `build_description`."""
 895 |     return []
 896 | 
 897 |   def check_following(self, value):
 898 |     value = (
 899 |         value.strip()
 900 |         .removeprefix("```json")
 901 |         .removeprefix("```Json")
 902 |         .removeprefix("```JSON")
 903 |         .removeprefix("```")
 904 |         .removesuffix("```")
 905 |         .strip()
 906 |     )
 907 |     try:
 908 |       json.loads(value)
 909 |     except ValueError as _:
 910 |       return False
 911 |     return True
 912 | 
 913 | 
 914 | class ParagraphFirstWordCheck(Instruction):
 915 |   """Check the paragraph and the first word of the nth paragraph."""
 916 | 
 917 |   def build_description(self, num_paragraphs = None,
 918 |                         nth_paragraph = None,
 919 |                         first_word = None):
 920 |     r"""Build the instruction description.
 921 | 
 922 |     Args:
 923 |       num_paragraphs: An integer indicating the number of paragraphs expected
 924 |         in the response. A paragraph is a subset of the string that is
 925 |         expected to be separated by '\n\n'.
 926 |       nth_paragraph: An integer indicating the paragraph number that we look at.
 927 |         Note that n starts from 1.
 928 |       first_word: A string that represent the first word of the bth paragraph.
 929 | 
 930 |     Returns:
 931 |       A string representing the instruction description.
 932 |     """
 933 |     self._num_paragraphs = num_paragraphs
 934 |     if self._num_paragraphs is None or self._num_paragraphs < 0:
 935 |       self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
 936 | 
 937 |     self._nth_paragraph = nth_paragraph
 938 |     if (
 939 |         self._nth_paragraph is None
 940 |         or self._nth_paragraph <= 0
 941 |         or self._nth_paragraph > self._num_paragraphs
 942 |     ):
 943 |       self._nth_paragraph = random.randint(1, self._num_paragraphs + 1)
 944 | 
 945 |     self._first_word = first_word
 946 |     if self._first_word is None:
 947 |       self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
 948 |     self._first_word = self._first_word.lower()
 949 | 
 950 |     self._description_pattern = (
 951 |         "There should be {num_paragraphs} paragraphs. " +
 952 |         "Paragraphs and only paragraphs are separated with each other by two " +
 953 |         "new lines as if it was '\\n\\n' in python. " +
 954 |         "Paragraph {nth_paragraph} must start with word {first_word}.")
 955 | 
 956 |     return self._description_pattern.format(
 957 |         num_paragraphs=self._num_paragraphs,
 958 |         nth_paragraph=self._nth_paragraph,
 959 |         first_word=self._first_word)
 960 | 
 961 |   def get_instruction_args(self):
 962 |     """Returns the keyward args of `build_description`."""
 963 |     return {"num_paragraphs": self._num_paragraphs,
 964 |             "nth_paragraph": self._nth_paragraph,
 965 |             "first_word": self._first_word}
 966 | 
 967 |   def get_instruction_args_keys(self):
 968 |     """Returns the args keys of `build_description`."""
 969 |     return ["num_paragraphs", "nth_paragraph", "first_word"]
 970 | 
 971 |   def check_following(self, value):
 972 |     """Checks for required number of paragraphs and correct first word.
 973 | 
 974 |     Args:
 975 |       value: a string representing the response. The response may contain
 976 |         paragraphs that are separated by two new lines and the first word of
 977 |         the nth paragraph will have to match a specified word.
 978 | 
 979 |     Returns:
 980 |       True if the number of paragraphs is the same as required and the first
 981 |       word of the specified paragraph is the same as required. Otherwise, false.
 982 |     """
 983 | 
 984 |     paragraphs = re.split(r"\n\n", value)
 985 |     num_paragraphs = len(paragraphs)
 986 | 
 987 |     for paragraph in paragraphs:
 988 |       if not paragraph.strip():
 989 |         num_paragraphs -= 1
 990 | 
 991 |     # check that index doesn't go out of bounds
 992 |     if self._nth_paragraph <= num_paragraphs:
 993 |       paragraph = paragraphs[self._nth_paragraph - 1].strip()
 994 |       if not paragraph:
 995 |         return False
 996 |     else:
 997 |       return False
 998 | 
 999 |     first_word = ""
1000 |     punctuation = {".", ",", "?", "!", "'", '"'}
1001 | 
1002 |     # get first word and remove punctuation
1003 |     word = paragraph.split()[0].strip()
1004 |     # TODO(jeffrey): make more complex?
1005 |     word = word.lstrip("'")
1006 |     word = word.lstrip('"')
1007 | 
1008 |     for letter in word:
1009 |       if letter in punctuation:
1010 |         break
1011 |       first_word += letter.lower()
1012 | 
1013 |     return (
1014 |         num_paragraphs == self._num_paragraphs
1015 |         and first_word == self._first_word
1016 |     )
1017 | 
1018 | 
1019 | # TODO(jeffrey) add relation - at least/at most?
class KeySentenceChecker(Instruction):
  """Verifies that the response includes a number of given key sentences."""

  def build_description(self, key_sentences=None, num_sentences=None):
    """Creates the instruction text for the key-sentence requirement.

    Args:
      key_sentences: A collection of sentences, some of which must appear in
        the response. A single placeholder sentence is used when it is empty
        or `None`.
      num_sentences: How many of the key sentences must appear. When falsy, a
        random number between 1 and the number of key sentences is drawn.

    Returns:
      The instruction description as a string.
    """
    # TODO(jeffrey) make a generate sentences function? wonderwords package
    self._key_sentences = (
        key_sentences if key_sentences else {"For now, this is fine."})

    self._num_sentences = (
        num_sentences if num_sentences
        else random.randint(1, len(self._key_sentences)))

    self._description_pattern = (
        "Include {num_sentences} of the following sentences {key_sentences}"
    )
    return self._description_pattern.format(
        num_sentences=self._num_sentences, key_sentences=self._key_sentences
    )

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"num_sentences": self._num_sentences,
            "key_sentences": list(self._key_sentences)}

  def get_instruction_args_keys(self):
    """Returns the arg keys of `build_description`."""
    return ["num_sentences", "key_sentences"]

  def check_following(self, value):
    """Returns True if exactly the required number of key sentences appear.

    The response is split with `instructions_util.split_into_sentences` and
    each key sentence is looked up in that result.
    """
    sentences = instructions_util.split_into_sentences(value)
    found = sum(1 for key in self._key_sentences if key in sentences)
    return found == self._num_sentences
1074 | 
1075 | 
class ForbiddenWords(Instruction):
  """Checks that specified words are not used in response."""

  def build_description(self, forbidden_words=None):
    """Build the instruction description.

    Args:
      forbidden_words: A sequence of strings representing words that are not
        allowed in the response. Duplicates are dropped and the words are
        sorted. When empty or `None`, words are produced by
        `instructions_util.generate_keywords`.

    Returns:
      A string representing the instruction description.
    """

    if not forbidden_words:
      self._forbidden_words = instructions_util.generate_keywords(
          num_keywords=_NUM_KEYWORDS)
    else:
      self._forbidden_words = list(set(forbidden_words))
    self._forbidden_words = sorted(self._forbidden_words)
    self._description_pattern = (
        "Do not include keywords {forbidden_words} in the response."
    )

    return self._description_pattern.format(
        forbidden_words=self._forbidden_words
    )

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"forbidden_words": self._forbidden_words}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["forbidden_words"]

  def check_following(self, value):
    """Check that the response contains none of the forbidden words.

    Matching is case-insensitive and whole-word: a forbidden word embedded in
    a longer word (e.g. "bad" in "badge") does not count.

    Args:
      value: A string representing the response.

    Returns:
      True if no forbidden word occurs in `value`; otherwise False.
    """
    for word in self._forbidden_words:
      # The word is literal text, so escape it. The lookarounds require that
      # no word character touches the match; for words that start and end
      # with a word character this is the same as `\b`, and it also works for
      # words that start or end with punctuation.
      pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
      if re.search(pattern, value, flags=re.IGNORECASE):
        return False
    return True
1119 | 
1120 | 
class RephraseParagraph(Instruction):
  """Verifies that a rephrase shares a bounded number of words with a text."""

  def build_description(self, *, original_paragraph, low, high):
    """Creates the instruction text for the rephrase requirement.

    Args:
      original_paragraph: The paragraph to be rephrased.
      low: Lower bound (inclusive) on the number of words the response shares
        with the original paragraph.
      high: Upper bound (inclusive) on that number.

    Returns:
      The instruction description as a string.
    """
    # TODO(jeffrey) make more encompassing
    self._original_paragraph = original_paragraph
    self._low = low
    self._high = high

    self._description = (
        "Rephrase the following paragraph: "
        "{original_paragraph}\nYour response should have "
        "between {low} and {high} of the same words. "
        "Words are the same if and only if all of the "
        "letters, ignoring cases, are the same. For "
        "example, 'run' is the same as 'Run' but different "
        "to 'ran'.")

    return self._description.format(
        original_paragraph=original_paragraph, low=low, high=high)

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"original_paragraph": self._original_paragraph,
            "low": self._low,
            "high": self._high}

  def get_instruction_args_keys(self):
    """Returns the arg keys of `build_description`."""
    return ["original_paragraph", "low", "high"]

  def check_following(self, value):
    """Returns True if the shared-word count lies within [low, high].

    Words are compared case-insensitively and counted with multiplicity: a
    word contributes the smaller of its two occurrence counts.
    """
    response_counts = collections.Counter(re.findall(r"\w+", value.lower()))
    original_counts = collections.Counter(
        re.findall(r"\w+", self._original_paragraph.lower()))

    # Counter intersection keeps the minimum count of every common word.
    shared = sum((original_counts & response_counts).values())
    return self._low <= shared <= self._high
1175 | 
1176 | 
class TwoResponsesChecker(Instruction):
  """Verifies that the output holds exactly two distinct responses."""

  def build_description(self):
    """Creates the (parameterless) instruction text."""
    self._description_pattern = (
        "Give two different responses. Responses and only responses should"
        " be separated by 6 asterisk symbols: ******."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description` (there are none)."""
    return None

  def get_instruction_args_keys(self):
    """Returns the arg keys of `build_description`."""
    return []

  def check_following(self, value):
    """Splits the output on `******` and inspects the pieces.

    Args:
      value: The response string.

    Returns:
      True if there are exactly two non-blank pieces and they differ after
      stripping whitespace. A blank piece is only tolerated at the very
      start or end of the output; otherwise the result is False.
    """
    pieces = value.split("******")
    final_position = len(pieces) - 1
    kept = []

    for position, piece in enumerate(pieces):
      if piece.strip():
        kept.append(piece)
      elif 0 < position < final_position:
        # A blank piece between two separators is not allowed.
        return False

    if len(kept) != 2:
      return False
    first, second = kept
    return first.strip() != second.strip()
1217 | 
1218 | 
class RepeatPromptThenAnswer(Instruction):
  """Verifies that the response starts by repeating the prompt."""

  def build_description(self, *, prompt_to_repeat=None):
    """Creates the instruction text for the repeat-then-answer requirement.

    Args:
      prompt_to_repeat: The prompt that the response must repeat first.

    Returns:
      The instruction description as a string.

    Raises:
      ValueError: If `prompt_to_repeat` is empty or `None`.
    """
    if not prompt_to_repeat:
      raise ValueError("prompt_to_repeat must be set.")
    self._prompt_to_repeat = prompt_to_repeat

    self._description_pattern = (
        "First repeat the request word for word without change,"
        " then give your answer (1. do not say any words or characters"
        " before repeating the request; 2. the request you need to repeat"
        " does not include this sentence)"
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"prompt_to_repeat": self._prompt_to_repeat}

  def get_instruction_args_keys(self):
    """Returns the arg keys of `build_description`."""
    return ["prompt_to_repeat"]

  def check_following(self, value):
    """Returns True if the response begins with the prompt.

    The comparison ignores case, surrounding whitespace, and any quote
    characters wrapped around the response.
    """
    normalized_response = value.strip().lower().strip("\"\'")
    expected_prefix = self._prompt_to_repeat.strip().lower()
    return normalized_response.startswith(expected_prefix)
1254 | 
1255 | 
class EndChecker(Instruction):
  """Checks that the response ends with a given phrase."""

  def build_description(self, *, end_phrase = None):
    """Build the instruction description.

    Args:
      end_phrase: A string representing the phrase the response should end
        with. When None, a phrase is drawn at random from `_ENDING_OPTIONS`.

    Returns:
      A string representing the instruction description.
    """
    self._end_phrase = (
        end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
    )
    if self._end_phrase is None:
      self._end_phrase = random.choice(_ENDING_OPTIONS)
    self._description_pattern = (
        "Finish your response with this exact phrase {ender}. "
        "No other words should follow this phrase.")
    return self._description_pattern.format(ender=self._end_phrase)

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {"end_phrase": self._end_phrase}

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["end_phrase"]

  def check_following(self, value):
    """Checks if the response ends with the expected phrase.

    The comparison is case-insensitive and ignores surrounding whitespace.
    A response wrapped in double quotes is also accepted.

    Args:
      value: A string representing the response.

    Returns:
      True if the response ends with the end phrase, False otherwise.
    """
    # Normalize into a local: the stored phrase must keep its original casing
    # so that `get_instruction_args` returns the same value before and after
    # a check.
    expected = self._end_phrase.strip().lower()
    response = value.strip().lower()
    # Compare the raw response first so a phrase that itself ends with a
    # double quote can still match.
    if response.endswith(expected):
      return True
    # Otherwise tolerate a response that was wrapped in double quotes.
    return response.strip("\"").endswith(expected)
1290 | 
1291 | 
class TitleChecker(Instruction):
  """Verifies that the response contains a `<<title>>` style title."""

  def build_description(self):
    """Stores and returns the fixed instruction text (takes no arguments)."""
    self._description_pattern = (
        "Your answer must contain a title, wrapped in double angular brackets,"
        " such as <<poem of joy>>."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description` (there are none)."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks if the response contains a non-blank title.

    Args:
      value: A string representing the response.

    Returns:
      True if some single-line `<<...>>` span still has visible text once its
      angle brackets and surrounding whitespace are removed; False otherwise.
    """
    title_regex = re.compile(r"<<[^\n]+>>")
    return any(
        candidate.lstrip("<").rstrip(">").strip()
        for candidate in title_regex.findall(value)
    )
1320 | 
1321 | 
class LetterFrequencyChecker(Instruction):
  """Verifies how often a given letter occurs in the response."""

  def build_description(self, *, letter = None,
                        let_frequency = None,
                        let_relation = None):
    """Build the instruction description.

    Args:
      letter: A string representing a letter that is expected in the response.
        Anything other than a single ASCII letter is replaced by a randomly
        chosen one. The letter is stored lower-cased.
      let_frequency: An integer specifying the number of times `letter` is
        expected to appear in the response. None or a negative value is
        replaced by a random integer between 1 and `_LETTER_FREQUENCY`.
      let_relation: A string from `_COMPARISON_RELATION` defining the
        relational operator for comparison. The first entry is checked as
        occurrences < frequency; any other entry as occurrences >= frequency.
        None selects one at random.

    Returns:
      A string representing the instruction description.

    Raises:
      ValueError: If `let_relation` is given but not in
        `_COMPARISON_RELATION`.
    """
    # 97..122 are the code points of 'a'..'z'.
    letter_is_valid = (
        bool(letter)
        and len(letter) <= 1
        and 97 <= ord(letter.lower()) <= 122
    )
    if letter_is_valid:
      chosen_letter = letter.strip()
    else:
      chosen_letter = random.choice(list(string.ascii_letters))
    self._letter = chosen_letter.lower()

    self._frequency = let_frequency
    if let_frequency is None or let_frequency < 0:
      self._frequency = random.randint(1, _LETTER_FREQUENCY)

    if let_relation is None:
      relation = random.choice(_COMPARISON_RELATION)
    elif let_relation in _COMPARISON_RELATION:
      relation = let_relation
    else:
      raise ValueError(
          "The supported relation for comparison must be in "
          f"{_COMPARISON_RELATION}, but {let_relation} is given."
      )
    self._comparison_relation = relation

    self._description_pattern = (
        "In your response, the letter {letter} should appear {let_relation}"
        " {let_frequency} times."
    )

    return self._description_pattern.format(
        letter=self._letter,
        let_frequency=self._frequency,
        let_relation=self._comparison_relation,
    )

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {
        "letter": self._letter,
        "let_frequency": self._frequency,
        "let_relation": self._comparison_relation,
    }

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["letter", "let_frequency", "let_relation"]

  def check_following(self, value):
    """Checks that the letter occurs in the response at the right frequency.

    Counting is case-insensitive: the response is lower-cased first.

    Args:
      value: A string representing the response.

    Returns:
      True if the occurrence count satisfies the configured relation.
    """
    occurrences = collections.Counter(value.lower())[self._letter]
    if self._comparison_relation == _COMPARISON_RELATION[0]:
      return occurrences < self._frequency
    return occurrences >= self._frequency
1398 | 
1399 | 
class CapitalLettersEnglishChecker(Instruction):
  """Checks that the response is in all capital letters.

  The instruction text also asks for English, but language detection is
  disabled in this file, so only the casing is verified.
  """

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Your entire response should be in English, and in all capital letters."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description` (there are none)."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks that the response is in all capital letters.

    Args:
      value: A string representing the response.

    Returns:
      The result of `value.isupper()`: True when the response has at least
      one cased character and no lowercase ones.
    """
    assert isinstance(value, str)

    # The former `langdetect.detect(value) == "en"` test is switched off, so
    # nothing here can raise LangDetectException; the handler that used to
    # wrap this line was unreachable and has been dropped.
    return value.isupper()
1429 | 
1430 | 
class LowercaseLettersEnglishChecker(Instruction):
  """Checks that the response is in all lowercase letters.

  The instruction text also asks for English, but language detection is
  disabled in this file, so only the casing is verified.
  """

  def build_description(self):
    """Build the instruction description."""
    self._description_pattern = (
        "Your entire response should be in English, and in all lowercase"
        " letters. No capital letters are allowed."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description` (there are none)."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks that the response is in all lowercase letters.

    Args:
      value: A string representing the response.

    Returns:
      The result of `value.islower()`: True when the response has at least
      one cased character and no uppercase ones.
    """
    assert isinstance(value, str)

    # The former `langdetect.detect(value) == "en"` test is switched off, so
    # nothing here can raise LangDetectException; the handler that used to
    # wrap this line was unreachable and has been dropped.
    return value.islower()
1461 | 
1462 | 
class CommaChecker(Instruction):
  """Verifies that the response is free of commas."""

  def build_description(self):
    """Stores and returns the fixed instruction text (takes no arguments)."""
    self._description_pattern = (
        "In your entire response, refrain from the use of any commas."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description` (there are none)."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks that the response does not contain commas.

    Args:
      value: A string representing the response.

    Returns:
      True if no "," character occurs anywhere in the response.
    """
    return "," not in value
1483 | 
1484 | 
class CapitalWordFrequencyChecker(Instruction):
  """Verifies how many all-capital words occur in the response."""

  def build_description(
      self,
      capital_frequency = None,
      capital_relation = None,
  ):
    """Build the instruction description.

    Args:
      capital_frequency: An integer that represents the number of words that
        should be in all capital letters. None selects a random integer
        between 1 and `_ALL_CAPITAL_WORD_FREQUENCY`.
      capital_relation: A string from `_COMPARISON_RELATION` that refers to
        the frequency. The first entry is checked as count < frequency; any
        other entry as count >= frequency. None selects one at random.

    Returns:
      A string representing the instruction description.

    Raises:
      ValueError: If `capital_relation` is given but not in
        `_COMPARISON_RELATION`.
    """
    if capital_frequency is None:
      self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY)
    else:
      self._frequency = capital_frequency

    if capital_relation is None:
      self._comparison_relation = random.choice(_COMPARISON_RELATION)
    else:
      self._comparison_relation = capital_relation
      if capital_relation not in _COMPARISON_RELATION:
        raise ValueError(
            "The supported relation for comparison must be in "
            f"{_COMPARISON_RELATION}, but {capital_relation} is given."
        )

    self._description_pattern = (
        "In your response, words with all capital letters should appear"
        " {relation} {frequency} times."
    )

    return self._description_pattern.format(
        frequency=self._frequency, relation=self._comparison_relation
    )

  def get_instruction_args(self):
    """Returns the keyword args of `build_description`."""
    return {
        "capital_frequency": self._frequency,
        "capital_relation": self._comparison_relation,
    }

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return ["capital_frequency", "capital_relation"]

  def check_following(self, value):
    """Checks the frequency of words with all capital letters.

    Args:
      value: A string representing the response.

    Returns:
      True if the number of tokens for which `str.isupper()` holds satisfies
      the configured relation.
    """
    # Hyphenated words will count as one word
    tokens = instructions_util.nltk.word_tokenize(value)
    uppercase_count = sum(1 for token in tokens if token.isupper())

    if self._comparison_relation == _COMPARISON_RELATION[0]:
      return uppercase_count < self._frequency
    return uppercase_count >= self._frequency
1549 | 
1550 | 
class QuotationChecker(Instruction):
  """Verifies that the whole response is enclosed in double quotes."""

  def build_description(self):
    """Stores and returns the fixed instruction text (takes no arguments)."""
    self._description_pattern = (
        "Wrap your entire response with double quotation marks."
    )
    return self._description_pattern

  def get_instruction_args(self):
    """Returns the keyword args of `build_description` (there are none)."""
    return None

  def get_instruction_args_keys(self):
    """Returns the args keys of `build_description`."""
    return []

  def check_following(self, value):
    """Checks if the response is wrapped with double quotation marks.

    Args:
      value: A string representing the response.

    Returns:
      True if, after trimming surrounding whitespace, the response is at
      least two characters long and both starts and ends with `"`.
    """
    trimmed = value.strip()
    # A lone quote character cannot be both the opening and closing mark.
    if len(trimmed) < 2:
      return False
    return trimmed[0] == '"' and trimmed[-1] == '"'
1573 | 


--------------------------------------------------------------------------------