├── .gitignore ├── .gitmodules ├── Datasets ├── AUT │ ├── aut_10.json │ ├── aut_100.json │ ├── aut_10_val.json │ ├── aut_30_test.json │ └── aut_example.json ├── Instances │ ├── instances_10.json │ ├── instances_100.json │ ├── instances_10_val.json │ ├── instances_3.json │ ├── instances_30_test.json │ └── instances_example.json ├── README.md ├── Scientific │ ├── scientific_10.json │ ├── scientific_100.json │ ├── scientific_10_val.json │ ├── scientific_3.json │ ├── scientific_30_test.json │ └── scientific_example.json └── Similarities │ ├── similarities_10.json │ ├── similarities_100.json │ ├── similarities_10_val.json │ ├── similarities_3.json │ ├── similarities_30_test.json │ └── similarities_example.json ├── Evaluation ├── .gitignore ├── README.md ├── auto_grade_final.py ├── automation_csv.py ├── eval_functions │ ├── eval_criterion.py │ └── eval_prompts.py └── utils │ ├── openai_model.py │ └── util.py ├── Experiments ├── README.md ├── multi_agent │ ├── agents.py │ ├── config_role.json │ ├── discussion.py │ └── llm_discussion.py └── read_conversation.py ├── README.md ├── requirements.txt └── resources ├── discussion_framework.png ├── response_sample.png ├── roleplay.png └── teaser.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # Pickle files 86 | *.pickle 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "llama"] 2 | path = llama 3 | url = https://github.com/facebookresearch/llama.git 4 | -------------------------------------------------------------------------------- /Datasets/AUT/aut_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task": [ 3 | { 4 | "Problem": [ 5 | "What are some creative use for {object}? 
The goal is to come up with creative ideas,", 6 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 7 | "innovative, or different. Present a list of as many creative and diverse uses for", 8 | "{object} as possible." 9 | ], 10 | "Purpose": "1. Baseline" 11 | }, 12 | { 13 | "Problem": [ 14 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 15 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 16 | "innovative, or different. Present a list of as many creative and diverse uses for", 17 | "{object} as possible!", 18 | 19 | "This is very important to my career and research since I need to come up with creative ideas!" 20 | ], 21 | "Purpose": "2. LLM Stimuli 1" 22 | }, 23 | { 24 | "Problem": [ 25 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 26 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 27 | "innovative, or different. Present a list of as many creative and diverse uses for", 28 | "{object} as possible!", 29 | 30 | "This is very important to my job since I need to come up with creative idea!", 31 | "Hope you could come up as many as ideas to help me get through my career crisis!" 32 | ], 33 | "Purpose": "3. LLM Stimuli 2" 34 | }, 35 | { 36 | "Problem": [ 37 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 38 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 39 | "innovative, or different. Present a list of as many creative and diverse uses for", 40 | "{object} as possible!", 41 | 42 | "Please come up with as many as you can since you are in a group discussion,", 43 | "the more ideas you can think of, the more creative would this discussion be." 44 | ], 45 | "Purpose": "4. 
Prompt for teamwork -> as many as u can" 46 | }, 47 | { 48 | "Problem": [ 49 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 50 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 51 | "innovative, or different. Present a list of as many creative and diverse uses for", 52 | "{object} as possible!", 53 | 54 | "Please come up with as many as you can since you are in a group discussion,", 55 | "the more ideas you can think of, the more creative would this discussion be.", 56 | "This is very important to my career and research since I need to come up with creative ideas!" 57 | ], 58 | "Purpose": "5. LLM Stimuli + Prompt for teamwork" 59 | }, 60 | { 61 | "Problem": [ 62 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 63 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 64 | "innovative, or different. Present a list of as many creative and diverse uses for", 65 | "{object} as possible!", 66 | 67 | "Please come up with as many as you can since you are in a group discussion,", 68 | "the more ideas you can think of, the more creative would this discussion be.", 69 | "Imagine your group needs to use {object} in a talent show to impress a panel of judges.", 70 | "What innovative uses can you come up with to showcase its versatility?" 71 | ], 72 | "Purpose": "6. Prompt for teamwork -> GPT-4 enforcement" 73 | }, 74 | { 75 | "Problem": [ 76 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 77 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 78 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 79 | "{object} as possible!", 80 | 81 | "Please come up with as many as you can since you are in a group discussion,", 82 | "the more ideas you can think of, the more creative would this discussion be.", 83 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 84 | "How do you pitch its most creative uses to win the competition?" 85 | ], 86 | "Purpose": "7. Prompt for teamwork -> GPT-4 enforcement" 87 | }, 88 | { 89 | "Problem": [ 90 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 91 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 92 | "innovative, or different. Present a list of as many creative and diverse uses for", 93 | "{object} as possible!", 94 | 95 | "Please come up with as many as you can since you are in a group discussion,", 96 | "the more ideas you can think of, the more creative would this discussion be.", 97 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 98 | "How do you pitch its most creative uses to win the competition?", 99 | "Imagine that your group has been asked to use {object} in an international exhibition to represent your country's innovation.", 100 | "How can you showcase its unique applications?" 101 | ], 102 | "Purpose": "8. Prompt for teamwork -> GPT-4 enforcement" 103 | }, 104 | { 105 | "Problem": [ 106 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 107 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 108 | "innovative, or different. Present a list of as many creative and diverse uses for", 109 | "{object} as possible!", 110 | 111 | "You would be in a group discussion with other teammates,", 112 | "as a result, you should answer as diverge and creative as you can." 
113 | ], 114 | "Purpose": "9. Prompt for teamwork" 115 | }, 116 | { 117 | "Problem": [ 118 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 119 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 120 | "innovative, or different. Present a list of as many creative and diverse uses for", 121 | "{object} as possible!", 122 | 123 | "You would be in a group discussion with other teammates,", 124 | "as a result, you should answer as diverge and creative as you can.", 125 | "This could notably increase the performance of the group discussion", 126 | "since you are an essential member in this group." 127 | ], 128 | "Purpose": "10. Prompt for teamwork" 129 | } 130 | ], 131 | "Examples": [ 132 | { 133 | "object": "Fork" 134 | }, 135 | { 136 | "object": "Jar" 137 | }, 138 | { 139 | "object": "Mirror" 140 | }, 141 | { 142 | "object": "Mug" 143 | }, 144 | { 145 | "object": "Needle" 146 | }, 147 | { 148 | "object": "Paper" 149 | }, 150 | { 151 | "object": "Ring" 152 | }, 153 | { 154 | "object": "Scarf" 155 | }, 156 | { 157 | "object": "Screwdriver" 158 | }, 159 | { 160 | "object": "Sheet" 161 | } 162 | ], 163 | "Amount": 10 164 | } 165 | -------------------------------------------------------------------------------- /Datasets/AUT/aut_100.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task": [ 3 | { 4 | "Problem": [ 5 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 6 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 7 | "innovative, or different. Present a list of as many creative and diverse uses for", 8 | "{object} as possible." 9 | ], 10 | "Purpose": "1. Baseline" 11 | }, 12 | { 13 | "Problem": [ 14 | "What are some creative use for {object}? 
The goal is to come up with creative ideas,", 15 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 16 | "innovative, or different. Present a list of as many creative and diverse uses for", 17 | "{object} as possible!", 18 | 19 | "This is very important to my career and research since I need to come up with creative ideas!" 20 | ], 21 | "Purpose": "2. LLM Stimuli 1" 22 | }, 23 | { 24 | "Problem": [ 25 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 26 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 27 | "innovative, or different. Present a list of as many creative and diverse uses for", 28 | "{object} as possible!", 29 | 30 | "This is very important to my job since I need to come up with creative idea!", 31 | "Hope you could come up as many as ideas to help me get through my career crisis!" 32 | ], 33 | "Purpose": "3. LLM Stimuli 2" 34 | }, 35 | { 36 | "Problem": [ 37 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 38 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 39 | "innovative, or different. Present a list of as many creative and diverse uses for", 40 | "{object} as possible!", 41 | 42 | "Please come up with as many as you can since you are in a group discussion,", 43 | "the more ideas you can think of, the more creative would this discussion be." 44 | ], 45 | "Purpose": "4. Prompt for teamwork -> as many as u can" 46 | }, 47 | { 48 | "Problem": [ 49 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 50 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 51 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 52 | "{object} as possible!", 53 | 54 | "Please come up with as many as you can since you are in a group discussion,", 55 | "the more ideas you can think of, the more creative would this discussion be.", 56 | "This is very important to my career and research since I need to come up with creative ideas!" 57 | ], 58 | "Purpose": "5. LLM Stimuli + Prompt for teamwork" 59 | }, 60 | { 61 | "Problem": [ 62 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 63 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 64 | "innovative, or different. Present a list of as many creative and diverse uses for", 65 | "{object} as possible!", 66 | 67 | "Please come up with as many as you can since you are in a group discussion,", 68 | "the more ideas you can think of, the more creative would this discussion be.", 69 | "Imagine your group needs to use {object} in a talent show to impress a panel of judges.", 70 | "What innovative uses can you come up with to showcase its versatility?" 71 | ], 72 | "Purpose": "6. Prompt for teamwork -> GPT-4 enforcement" 73 | }, 74 | { 75 | "Problem": [ 76 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 77 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 78 | "innovative, or different. Present a list of as many creative and diverse uses for", 79 | "{object} as possible!", 80 | 81 | "Please come up with as many as you can since you are in a group discussion,", 82 | "the more ideas you can think of, the more creative would this discussion be.", 83 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 84 | "How do you pitch its most creative uses to win the competition?" 85 | ], 86 | "Purpose": "7. 
Prompt for teamwork -> GPT-4 enforcement" 87 | }, 88 | { 89 | "Problem": [ 90 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 91 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 92 | "innovative, or different. Present a list of as many creative and diverse uses for", 93 | "{object} as possible!", 94 | 95 | "Please come up with as many as you can since you are in a group discussion,", 96 | "the more ideas you can think of, the more creative would this discussion be.", 97 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 98 | "How do you pitch its most creative uses to win the competition?", 99 | "Imagine that your group has been asked to use {object} in an international exhibition to represent your country's innovation.", 100 | "How can you showcase its unique applications?" 101 | ], 102 | "Purpose": "8. Prompt for teamwork -> GPT-4 enforcement" 103 | }, 104 | { 105 | "Problem": [ 106 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 107 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 108 | "innovative, or different. Present a list of as many creative and diverse uses for", 109 | "{object} as possible!", 110 | 111 | "You would be in a group discussion with other teammates,", 112 | "as a result, you should answer as diverge and creative as you can." 113 | ], 114 | "Purpose": "9. Prompt for teamwork" 115 | }, 116 | { 117 | "Problem": [ 118 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 119 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 120 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 121 | "{object} as possible!", 122 | 123 | "You would be in a group discussion with other teammates,", 124 | "as a result, you should answer as diverge and creative as you can.", 125 | "This could notably increase the performance of the group discussion", 126 | "since you are an essential member in this group." 127 | ], 128 | "Purpose": "10. Prompt for teamwork" 129 | } 130 | ], 131 | "Examples": [ 132 | { 133 | "object": "Fork" 134 | }, 135 | { 136 | "object": "Hat" 137 | }, 138 | { 139 | "object": "Pillow" 140 | }, 141 | { 142 | "object": "Rope" 143 | }, 144 | { 145 | "object": "Socks" 146 | }, 147 | { 148 | "object": "Envelope" 149 | }, 150 | { 151 | "object": "Bottle" 152 | }, 153 | { 154 | "object": "Chair" 155 | }, 156 | { 157 | "object": "Towel" 158 | }, 159 | { 160 | "object": "Shoe" 161 | }, 162 | { 163 | "object": "Pen" 164 | }, 165 | { 166 | "object": "Tire" 167 | }, 168 | { 169 | "object": "Lamp" 170 | }, 171 | { 172 | "object": "Key" 173 | }, 174 | { 175 | "object": "Clock" 176 | }, 177 | { 178 | "object": "Phone" 179 | }, 180 | { 181 | "object": "Spoon" 182 | }, 183 | { 184 | "object": "Cup" 185 | }, 186 | { 187 | "object": "Plate" 188 | }, 189 | { 190 | "object": "Book" 191 | }, 192 | { 193 | "object": "Broom" 194 | }, 195 | { 196 | "object": "Blanket" 197 | }, 198 | { 199 | "object": "Camera" 200 | }, 201 | { 202 | "object": "Candle" 203 | }, 204 | { 205 | "object": "Card" 206 | }, 207 | { 208 | "object": "Comb" 209 | }, 210 | { 211 | "object": "Couch" 212 | }, 213 | { 214 | "object": "Curtain" 215 | }, 216 | { 217 | "object": "Desk" 218 | }, 219 | { 220 | "object": "Dice" 221 | }, 222 | { 223 | "object": "Drum" 224 | }, 225 | { 226 | "object": "Fan" 227 | }, 228 | { 229 | "object": "Glass" 230 | }, 231 | { 232 | "object": "Glove" 233 | }, 234 | { 235 | "object": "Guitar" 236 | }, 237 | { 238 | "object": "Hammer" 239 | }, 240 | { 241 | "object": "Helmet" 242 | }, 243 | { 244 | "object": 
"Jacket" 245 | }, 246 | { 247 | "object": "Jar" 248 | }, 249 | { 250 | "object": "Jeans" 251 | }, 252 | { 253 | "object": "Jug" 254 | }, 255 | { 256 | "object": "Kettle" 257 | }, 258 | { 259 | "object": "Knife" 260 | }, 261 | { 262 | "object": "Leaf" 263 | }, 264 | { 265 | "object": "Lighter" 266 | }, 267 | { 268 | "object": "Mirror" 269 | }, 270 | { 271 | "object": "Mug" 272 | }, 273 | { 274 | "object": "Needle" 275 | }, 276 | { 277 | "object": "Notebook" 278 | }, 279 | { 280 | "object": "Oven" 281 | }, 282 | { 283 | "object": "Paintbrush" 284 | }, 285 | { 286 | "object": "Paper" 287 | }, 288 | { 289 | "object": "Piano" 290 | }, 291 | { 292 | "object": "Pillowcase" 293 | }, 294 | { 295 | "object": "Pipe" 296 | }, 297 | { 298 | "object": "Pot" 299 | }, 300 | { 301 | "object": "Purse" 302 | }, 303 | { 304 | "object": "Racket" 305 | }, 306 | { 307 | "object": "Radio" 308 | }, 309 | { 310 | "object": "Refrigerator" 311 | }, 312 | { 313 | "object": "Ring" 314 | }, 315 | { 316 | "object": "Scarf" 317 | }, 318 | { 319 | "object": "Scissors" 320 | }, 321 | { 322 | "object": "Screwdriver" 323 | }, 324 | { 325 | "object": "Sheet" 326 | }, 327 | { 328 | "object": "Shirt" 329 | }, 330 | { 331 | "object": "Shovel" 332 | }, 333 | { 334 | "object": "Sunglasses" 335 | }, 336 | { 337 | "object": "Table" 338 | }, 339 | { 340 | "object": "Teapot" 341 | }, 342 | { 343 | "object": "Teddy bear" 344 | }, 345 | { 346 | "object": "Toothbrush" 347 | }, 348 | { 349 | "object": "Toothpaste" 350 | }, 351 | { 352 | "object": "Towel" 353 | }, 354 | { 355 | "object": "Umbrella" 356 | }, 357 | { 358 | "object": "Vase" 359 | }, 360 | { 361 | "object": "Wallet" 362 | }, 363 | { 364 | "object": "Watch" 365 | }, 366 | { 367 | "object": "Whistle" 368 | }, 369 | { 370 | "object": "Wrench" 371 | }, 372 | { 373 | "object": "Yarn" 374 | }, 375 | { 376 | "object": "Zipper" 377 | }, 378 | { 379 | "object": "Brush" 380 | }, 381 | { 382 | "object": "Bucket" 383 | }, 384 | { 385 | "object": "Bulb" 386 | }, 387 
| { 388 | "object": "Calculator" 389 | }, 390 | { 391 | "object": "Calendar" 392 | }, 393 | { 394 | "object": "Camera" 395 | }, 396 | { 397 | "object": "Can" 398 | }, 399 | { 400 | "object": "Candlestick" 401 | }, 402 | { 403 | "object": "Flashlight" 404 | }, 405 | { 406 | "object": "Stapler" 407 | }, 408 | { 409 | "object": "Basket" 410 | }, 411 | { 412 | "object": "Coaster" 413 | }, 414 | { 415 | "object": "Magnet" 416 | }, 417 | { 418 | "object": "Straw" 419 | }, 420 | { 421 | "object": "Clip" 422 | }, 423 | { 424 | "object": "Belt" 425 | }, 426 | { 427 | "object": "Lid" 428 | }, 429 | { 430 | "object": "Mat" 431 | } 432 | ], 433 | "Amount": 100 434 | } -------------------------------------------------------------------------------- /Datasets/AUT/aut_10_val.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task": [ 3 | { 4 | "Problem": [ 5 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 6 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 7 | "innovative, or different. Present a list of as many creative and diverse uses for", 8 | "{object} as possible." 9 | ], 10 | "Purpose": "1. Baseline" 11 | }, 12 | { 13 | "Problem": [ 14 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 15 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 16 | "innovative, or different. Present a list of as many creative and diverse uses for", 17 | "{object} as possible!", 18 | 19 | "This is very important to my career and research since I need to come up with creative ideas!" 20 | ], 21 | "Purpose": "2. LLM Stimuli 1" 22 | }, 23 | { 24 | "Problem": [ 25 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 26 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 27 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 28 | "{object} as possible!", 29 | 30 | "This is very important to my job since I need to come up with creative idea!", 31 | "Hope you could come up as many as ideas to help me get through my career crisis!" 32 | ], 33 | "Purpose": "3. LLM Stimuli 2" 34 | }, 35 | { 36 | "Problem": [ 37 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 38 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 39 | "innovative, or different. Present a list of as many creative and diverse uses for", 40 | "{object} as possible!", 41 | 42 | "Please come up with as many as you can since you are in a group discussion,", 43 | "the more ideas you can think of, the more creative would this discussion be." 44 | ], 45 | "Purpose": "4. Prompt for teamwork -> as many as u can" 46 | }, 47 | { 48 | "Problem": [ 49 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 50 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 51 | "innovative, or different. Present a list of as many creative and diverse uses for", 52 | "{object} as possible!", 53 | 54 | "Please come up with as many as you can since you are in a group discussion,", 55 | "the more ideas you can think of, the more creative would this discussion be.", 56 | "This is very important to my career and research since I need to come up with creative ideas!" 57 | ], 58 | "Purpose": "5. LLM Stimuli + Prompt for teamwork" 59 | }, 60 | { 61 | "Problem": [ 62 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 63 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 64 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 65 | "{object} as possible!", 66 | 67 | "Please come up with as many as you can since you are in a group discussion,", 68 | "the more ideas you can think of, the more creative would this discussion be.", 69 | "Imagine your group needs to use {object} in a talent show to impress a panel of judges.", 70 | "What innovative uses can you come up with to showcase its versatility?" 71 | ], 72 | "Purpose": "6. Prompt for teamwork -> GPT-4 enforcement" 73 | }, 74 | { 75 | "Problem": [ 76 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 77 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 78 | "innovative, or different. Present a list of as many creative and diverse uses for", 79 | "{object} as possible!", 80 | 81 | "Please come up with as many as you can since you are in a group discussion,", 82 | "the more ideas you can think of, the more creative would this discussion be.", 83 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 84 | "How do you pitch its most creative uses to win the competition?" 85 | ], 86 | "Purpose": "7. Prompt for teamwork -> GPT-4 enforcement" 87 | }, 88 | { 89 | "Problem": [ 90 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 91 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 92 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 93 | "{object} as possible!", 94 | 95 | "Please come up with as many as you can since you are in a group discussion,", 96 | "the more ideas you can think of, the more creative would this discussion be.", 97 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 98 | "How do you pitch its most creative uses to win the competition?", 99 | "Imagine that your group has been asked to use {object} in an international exhibition to represent your country's innovation.", 100 | "How can you showcase its unique applications?" 101 | ], 102 | "Purpose": "8. Prompt for teamwork -> GPT-4 enforcement" 103 | }, 104 | { 105 | "Problem": [ 106 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 107 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 108 | "innovative, or different. Present a list of as many creative and diverse uses for", 109 | "{object} as possible!", 110 | 111 | "You would be in a group discussion with other teammates,", 112 | "as a result, you should answer as diverge and creative as you can." 113 | ], 114 | "Purpose": "9. Prompt for teamwork" 115 | }, 116 | { 117 | "Problem": [ 118 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 119 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 120 | "innovative, or different. Present a list of as many creative and diverse uses for", 121 | "{object} as possible!", 122 | 123 | "You would be in a group discussion with other teammates,", 124 | "as a result, you should answer as diverge and creative as you can.", 125 | "This could notably increase the performance of the group discussion", 126 | "since you are an essential member in this group." 127 | ], 128 | "Purpose": "10. 
Prompt for teamwork" 129 | } 130 | ], 131 | "Examples": [ 132 | { 133 | "object": "Fork" 134 | }, 135 | { 136 | "object": "Jar" 137 | }, 138 | { 139 | "object": "Mirror" 140 | }, 141 | { 142 | "object": "Mug" 143 | }, 144 | { 145 | "object": "Blanket" 146 | }, 147 | { 148 | "object": "Scarf" 149 | }, 150 | { 151 | "object": "Ring" 152 | }, 153 | { 154 | "object": "Racket" 155 | }, 156 | { 157 | "object": "Glove" 158 | }, 159 | { 160 | "object": "Sheet" 161 | } 162 | ], 163 | "Amount": 10 164 | } 165 | -------------------------------------------------------------------------------- /Datasets/AUT/aut_30_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task": [ 3 | { 4 | "Problem": [ 5 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 6 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 7 | "innovative, or different. Present a list of as many creative and diverse uses for", 8 | "{object} as possible." 9 | ], 10 | "Purpose": "1. Baseline" 11 | }, 12 | { 13 | "Problem": [ 14 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 15 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 16 | "innovative, or different. Present a list of as many creative and diverse uses for", 17 | "{object} as possible!", 18 | "This is very important to my career and research since I need to come up with creative ideas!" 19 | ], 20 | "Purpose": "2. LLM Stimuli 1" 21 | }, 22 | { 23 | "Problem": [ 24 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 25 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 26 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 27 | "{object} as possible!", 28 | "This is very important to my job since I need to come up with creative idea!", 29 | "Hope you could come up as many as ideas to help me get through my career crisis!" 30 | ], 31 | "Purpose": "3. LLM Stimuli 2" 32 | }, 33 | { 34 | "Problem": [ 35 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 36 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 37 | "innovative, or different. Present a list of as many creative and diverse uses for", 38 | "{object} as possible!", 39 | "Please come up with as many as you can since you are in a group discussion,", 40 | "the more ideas you can think of, the more creative would this discussion be." 41 | ], 42 | "Purpose": "4. Prompt for teamwork -> as many as u can" 43 | }, 44 | { 45 | "Problem": [ 46 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 47 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 48 | "innovative, or different. Present a list of as many creative and diverse uses for", 49 | "{object} as possible!", 50 | "Please come up with as many as you can since you are in a group discussion,", 51 | "the more ideas you can think of, the more creative would this discussion be.", 52 | "This is very important to my career and research since I need to come up with creative ideas!" 53 | ], 54 | "Purpose": "5. LLM Stimuli + Prompt for teamwork" 55 | }, 56 | { 57 | "Problem": [ 58 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 59 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 60 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 61 | "{object} as possible!", 62 | "Please come up with as many as you can since you are in a group discussion,", 63 | "the more ideas you can think of, the more creative would this discussion be.", 64 | "Imagine your group needs to use {object} in a talent show to impress a panel of judges.", 65 | "What innovative uses can you come up with to showcase its versatility?" 66 | ], 67 | "Purpose": "6. Prompt for teamwork -> GPT-4 enforcement" 68 | }, 69 | { 70 | "Problem": [ 71 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 72 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 73 | "innovative, or different. Present a list of as many creative and diverse uses for", 74 | "{object} as possible!", 75 | "Please come up with as many as you can since you are in a group discussion,", 76 | "the more ideas you can think of, the more creative would this discussion be.", 77 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 78 | "How do you pitch its most creative uses to win the competition?" 79 | ], 80 | "Purpose": "7. Prompt for teamwork -> GPT-4 enforcement" 81 | }, 82 | { 83 | "Problem": [ 84 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 85 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 86 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 87 | "{object} as possible!", 88 | "Please come up with as many as you can since you are in a group discussion,", 89 | "the more ideas you can think of, the more creative would this discussion be.", 90 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 91 | "How do you pitch its most creative uses to win the competition?", 92 | "Imagine that your group has been asked to use {object} in an international exhibition to represent your country's innovation.", 93 | "How can you showcase its unique applications?" 94 | ], 95 | "Purpose": "8. Prompt for teamwork -> GPT-4 enforcement" 96 | }, 97 | { 98 | "Problem": [ 99 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 100 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 101 | "innovative, or different. Present a list of as many creative and diverse uses for", 102 | "{object} as possible!", 103 | "You would be in a group discussion with other teammates,", 104 | "as a result, you should answer as diverge and creative as you can." 105 | ], 106 | "Purpose": "9. Prompt for teamwork" 107 | }, 108 | { 109 | "Problem": [ 110 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 111 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 112 | "innovative, or different. Present a list of as many creative and diverse uses for", 113 | "{object} as possible!", 114 | "You would be in a group discussion with other teammates,", 115 | "as a result, you should answer as diverge and creative as you can.", 116 | "This could notably increase the performance of the group discussion", 117 | "since you are an essential member in this group." 118 | ], 119 | "Purpose": "10. 
Prompt for teamwork" 120 | } 121 | ], 122 | "Examples": [ 123 | { 124 | "object": "Umbrella" 125 | }, 126 | { 127 | "object": "Key" 128 | }, 129 | { 130 | "object": "Camera" 131 | }, 132 | { 133 | "object": "Whistle" 134 | }, 135 | { 136 | "object": "Guitar" 137 | }, 138 | { 139 | "object": "Couch" 140 | }, 141 | { 142 | "object": "Jeans" 143 | }, 144 | { 145 | "object": "Pillowcase" 146 | }, 147 | { 148 | "object": "Broom" 149 | }, 150 | { 151 | "object": "Envelope" 152 | }, 153 | { 154 | "object": "Scissors" 155 | }, 156 | { 157 | "object": "Calculator" 158 | }, 159 | { 160 | "object": "Towel" 161 | }, 162 | { 163 | "object": "Rope" 164 | }, 165 | { 166 | "object": "Helmet" 167 | }, 168 | { 169 | "object": "Lamp" 170 | }, 171 | { 172 | "object": "Table" 173 | }, 174 | { 175 | "object": "Towel" 176 | }, 177 | { 178 | "object": "Wrench" 179 | }, 180 | { 181 | "object": "Shoe" 182 | }, 183 | { 184 | "object": "Card" 185 | }, 186 | { 187 | "object": "Tire" 188 | }, 189 | { 190 | "object": "Basket" 191 | }, 192 | { 193 | "object": "Candle" 194 | }, 195 | { 196 | "object": "Teddy bear" 197 | }, 198 | { 199 | "object": "Drum" 200 | }, 201 | { 202 | "object": "Shovel" 203 | }, 204 | { 205 | "object": "Flashlight" 206 | }, 207 | { 208 | "object": "Lid" 209 | }, 210 | { 211 | "object": "Zipper" 212 | } 213 | ], 214 | "Amount": 30 215 | } -------------------------------------------------------------------------------- /Datasets/AUT/aut_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task": [ 3 | { 4 | "Problem": [ 5 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 6 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 7 | "innovative, or different. Present a list of as many creative and diverse uses for", 8 | "{object} as possible." 9 | ], 10 | "Purpose": "1. 
Baseline" 11 | }, 12 | { 13 | "Problem": [ 14 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 15 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 16 | "innovative, or different. Present a list of as many creative and diverse uses for", 17 | "{object} as possible!", 18 | 19 | "This is very important to my career and research since I need to come up with creative ideas!" 20 | ], 21 | "Purpose": "2. LLM Stimuli 1" 22 | }, 23 | { 24 | "Problem": [ 25 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 26 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 27 | "innovative, or different. Present a list of as many creative and diverse uses for", 28 | "{object} as possible!", 29 | 30 | "This is very important to my job since I need to come up with creative idea!", 31 | "Hope you could come up as many as ideas to help me get through my career crisis!" 32 | ], 33 | "Purpose": "3. LLM Stimuli 2" 34 | }, 35 | { 36 | "Problem": [ 37 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 38 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 39 | "innovative, or different. Present a list of as many creative and diverse uses for", 40 | "{object} as possible!", 41 | 42 | "Please come up with as many as you can since you are in a group discussion,", 43 | "the more ideas you can think of, the more creative would this discussion be." 44 | ], 45 | "Purpose": "4. Prompt for teamwork -> as many as u can" 46 | }, 47 | { 48 | "Problem": [ 49 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 50 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 51 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 52 | "{object} as possible!", 53 | 54 | "Please come up with as many as you can since you are in a group discussion,", 55 | "the more ideas you can think of, the more creative would this discussion be.", 56 | "This is very important to my career and research since I need to come up with creative ideas!" 57 | ], 58 | "Purpose": "5. LLM Stimuli + Prompt for teamwork" 59 | }, 60 | { 61 | "Problem": [ 62 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 63 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 64 | "innovative, or different. Present a list of as many creative and diverse uses for", 65 | "{object} as possible!", 66 | 67 | "Please come up with as many as you can since you are in a group discussion,", 68 | "the more ideas you can think of, the more creative would this discussion be.", 69 | "Imagine your group needs to use {object} in a talent show to impress a panel of judges.", 70 | "What innovative uses can you come up with to showcase its versatility?" 71 | ], 72 | "Purpose": "6. Prompt for teamwork -> GPT-4 enforcement" 73 | }, 74 | { 75 | "Problem": [ 76 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 77 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 78 | "innovative, or different. Present a list of as many creative and diverse uses for", 79 | "{object} as possible!", 80 | 81 | "Please come up with as many as you can since you are in a group discussion,", 82 | "the more ideas you can think of, the more creative would this discussion be.", 83 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 84 | "How do you pitch its most creative uses to win the competition?" 85 | ], 86 | "Purpose": "7. 
Prompt for teamwork -> GPT-4 enforcement" 87 | }, 88 | { 89 | "Problem": [ 90 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 91 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 92 | "innovative, or different. Present a list of as many creative and diverse uses for", 93 | "{object} as possible!", 94 | 95 | "Please come up with as many as you can since you are in a group discussion,", 96 | "the more ideas you can think of, the more creative would this discussion be.", 97 | "Pretend your group is a team of inventors presenting {object} as a revolutionary new product on a reality TV show.", 98 | "How do you pitch its most creative uses to win the competition?", 99 | "Imagine that your group has been asked to use {object} in an international exhibition to represent your country's innovation.", 100 | "How can you showcase its unique applications?" 101 | ], 102 | "Purpose": "8. Prompt for teamwork -> GPT-4 enforcement" 103 | }, 104 | { 105 | "Problem": [ 106 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 107 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 108 | "innovative, or different. Present a list of as many creative and diverse uses for", 109 | "{object} as possible!", 110 | 111 | "You would be in a group discussion with other teammates,", 112 | "as a result, you should answer as diverge and creative as you can." 113 | ], 114 | "Purpose": "9. Prompt for teamwork" 115 | }, 116 | { 117 | "Problem": [ 118 | "What are some creative use for {object}? The goal is to come up with creative ideas,", 119 | "which are ideas that strike people as clever, unusual, interesting, uncommon, humorous,", 120 | "innovative, or different. 
Present a list of as many creative and diverse uses for", 121 | "{object} as possible!", 122 | 123 | "You would be in a group discussion with other teammates,", 124 | "as a result, you should answer as diverge and creative as you can.", 125 | "This could notably increase the performance of the group discussion", 126 | "since you are an essential member in this group." 127 | ], 128 | "Purpose": "10. Prompt for teamwork" 129 | } 130 | ], 131 | "Examples": [ 132 | { 133 | "object": "Fork" 134 | } 135 | ], 136 | "number of examples": 1, 137 | "prompt": "What are some creative uses for [object]? The goal is to come up with creative ideas, which are ideas that strike people as clever, unusual, interesting, uncommon, humorous, innovative, or different. List 10 creative uses for [object]." 138 | } -------------------------------------------------------------------------------- /Datasets/Instances/instances_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Name all the round things you can think of.", 4 | "Name all the things you can think of that will make a noise.", 5 | "Name all the square things you can think of.", 6 | "Name all the things you can think of that move on wheels.", 7 | "Name all the things you can think of that are red.", 8 | "Name all the animals you can think of that live in the water.", 9 | "Name all the fruits you can think of.", 10 | "Name all the instruments you can think of.", 11 | "Name all the things you can think of that can fly.", 12 | "Name all the flowers you can think of."
13 | ], 14 | "Amount": 10 15 | } 16 | -------------------------------------------------------------------------------- /Datasets/Instances/instances_100.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Name all the round things you can think of.", 4 | "Name all the things you can think of that will make a noise.", 5 | "Name all the square things you can think of.", 6 | "Name all the things you can think of that move on wheels.", 7 | "Name all the things you can think of that are red.", 8 | "Name all the animals you can think of that live in the water.", 9 | "Name all the fruits you can think of.", 10 | "Name all the instruments you can think of.", 11 | "Name all the things you can think of that can fly.", 12 | "Name all the flowers you can think of.", 13 | "Name all the countries you can think of in Europe.", 14 | "Name all the sports you can think of that use a ball.", 15 | "Name all the things you can think of that are used for cooking.", 16 | "Name all the things you can think of that are made of wood.", 17 | "Name all the vegetables you can think of.", 18 | "Name all the things you can think of that have a screen.", 19 | "Name all the things you can think of that require electricity.", 20 | "Name all the things you can think of that are cold.", 21 | "Name all the things you can think of that are hot.", 22 | "Name all the things you can think of that are sweet.", 23 | "Name all the things you can think of that are spicy.", 24 | "Name all the things you can think of that are sour.", 25 | "Name all the things you can think of that are bitter.", 26 | "Name all the things you can think of that are salty.", 27 | "Name all the languages you can think of.", 28 | "Name all the things you can think of that have buttons.", 29 | "Name all the things you can think of that are used in schools.", 30 | "Name all the things you can think of that are found in an office.", 31 | "Name all the things you can think 
of that are used by doctors.", 32 | "Name all the things you can think of that are found in a park.", 33 | "Name all the things you can think of that are found in a bathroom.", 34 | "Name all the things you can think of that are found in a kitchen.", 35 | "Name all the things you can think of that are found in a bedroom.", 36 | "Name all the things you can think of that are found in a living room.", 37 | "Name all the things you can think of that are used for cleaning.", 38 | "Name all the things you can think of that have wheels.", 39 | "Name all the things you can think of that have keys.", 40 | "Name all the things you can think of that are used for communication.", 41 | "Name all the things you can think of that are used in gardening.", 42 | "Name all the things you can think of that are used for entertainment.", 43 | "Name all the things you can think of that are used in construction.", 44 | "Name all the things you can think of that are used for transportation.", 45 | "Name all the things you can think of that are found in a forest.", 46 | "Name all the things you can think of that are found in the ocean.", 47 | "Name all the things you can think of that are found in the sky.", 48 | "Name all the things you can think of that are found in space.", 49 | "Name all the things you can think of that are used in science.", 50 | "Name all the things you can think of that are used in art.", 51 | "Name all the things you can think of that are used in music.", 52 | "Name all the things you can think of that are used in theater.", 53 | "Name all the things you can think of that are used in film.", 54 | "Name all the things you can think of that are used in photography.", 55 | "Name all the things you can think of that are used in sports.", 56 | "Name all the things you can think of that are used in fitness.", 57 | "Name all the things you can think of that are used in healthcare.", 58 | "Name all the things you can think of that are used in beauty.", 59 | "Name all the 
things you can think of that are used in fashion.", 60 | "Name all the things you can think of that are used in education.", 61 | "Name all the things you can think of that are used in technology.", 62 | "Name all the things you can think of that are used in astronomy.", 63 | "Name all the things you can think of that are used in mathematics.", 64 | "Name all the things you can think of that are used in engineering.", 65 | "Name all the things you can think of that are used in architecture.", 66 | "Name all the things you can think of that are used in history.", 67 | "Name all the things you can think of that are used in geography.", 68 | "Name all the things you can think of that are used in literature.", 69 | "Name all the things you can think of that are used in philosophy.", 70 | "Name all the things you can think of that are used in psychology.", 71 | "Name all the things you can think of that are used in sociology.", 72 | "Name all the things you can think of that are used in politics.", 73 | "Name all the things you can think of that are used in economics.", 74 | "Name all the things you can think of that are used in business.", 75 | "Name all the things you can think of that are used in law.", 76 | "Name all the things you can think of that are used in religion.", 77 | "Name all the things you can think of that are used in spirituality.", 78 | "Name all the things you can think of that are used in culture.", 79 | "Name all the things you can think of that are used in tradition.", 80 | "Name all the things you can think of that are used in customs.", 81 | "Name all the things you can think of that are used in rituals.", 82 | "Name all the things you can think of that are used in ceremonies.", 83 | "Name all the things you can think of that are used in festivals.", 84 | "Name all the things you can think of that are used in celebrations.", 85 | "Name all the things you can think of that are used in holidays.", 86 | "Name all the things you can think of that 
are used in events.", 87 | "Name all the things you can think of that are used in occasions.", 88 | "Name all the things you can think of that are used in activities.", 89 | "Name all the things you can think of that are used in hobbies.", 90 | "Name all the things you can think of that are used in pastimes.", 91 | "Name all the things you can think of that are used in leisure.", 92 | "Name all the things you can think of that are used in recreation.", 93 | "Name all the things you can think of that are used in relaxation.", 94 | "Name all the things you can think of that are used in rest.", 95 | "Name all the things you can think of that are used in play.", 96 | "Name all the things you can think of that are used in fun.", 97 | "Name all the things you can think of that are used in enjoyment.", 98 | "Name all the things you can think of that are used in amusement.", 99 | "Name all the things you can think of that are used in pleasure.", 100 | "Name all the things you can think of that are used in satisfaction.", 101 | "Name all the things you can think of that are used in fulfillment.", 102 | "Name all the things you can think of that are used in contentment.", 103 | "Name all the round things you can think of.", 104 | "Name all the things you can think of that will make a noise.", 105 | "Name all the square things you can think of.", 106 | "Name all the objects you can think of that are blue.", 107 | "Name all the things you can think of that can fly.", 108 | "Name all the animals you can think of that have fur.", 109 | "Name all the foods you can think of that are sweet.", 110 | "Name all the things you can think of that are used in a kitchen.", 111 | "Name all the things you can think of that have wheels.", 112 | "Name all the things you can think of that are found in a school.", 113 | "Name all the objects you can think of that are made of metal.", 114 | "Name all the things you can think of that are used at night.", 115 | "Name all the instruments you can 
think of.", 116 | "Name all the things you can think of that are used in an office.", 117 | "Name all the things you can think of that are green.", 118 | "Name all the things you can think of that can hold water.", 119 | "Name all the things you can think of that have a screen.", 120 | "Name all the fruits you can think of.", 121 | "Name all the things you can think of that are used in a garden.", 122 | "Name all the things you can think of that have keys.", 123 | "Name all the vegetables you can think of.", 124 | "Name all the things you can think of that are used by a doctor.", 125 | "Name all the things you can think of that are soft.", 126 | "Name all the things you can think of that are hard.", 127 | "Name all the things you can think of that can be opened.", 128 | "Name all the things you can think of that use electricity.", 129 | "Name all the things you can think of that can be worn on your feet.", 130 | "Name all the things you can think of that have a handle.", 131 | "Name all the things you can think of that are used in sports.", 132 | "Name all the things you can think of that are found in a bathroom.", 133 | "Name all the things you can think of that are yellow.", 134 | "Name all the things you can think of that can be read.", 135 | "Name all the things you can think of that can be written on.", 136 | "Name all the things you can think of that have buttons.", 137 | "Name all the things you can think of that are used for cleaning.", 138 | "Name all the things you can think of that are used for cooking.", 139 | "Name all the things you can think of that are used for communication.", 140 | "Name all the things you can think of that are red.", 141 | "Name all the things you can think of that are used at a beach.", 142 | "Name all the things you can think of that can be found in the sky.", 143 | "Name all the things you can think of that are used in winter.", 144 | "Name all the things you can think of that are used in summer.", 145 | "Name all the things 
you can think of that are used in a car.", 146 | "Name all the things you can think of that are used on a farm.", 147 | "Name all the things you can think of that are used in a restaurant.", 148 | "Name all the things you can think of that are found in a forest.", 149 | "Name all the things you can think of that are used by a firefighter.", 150 | "Name all the things you can think of that are used in a hospital.", 151 | "Name all the things you can think of that are used at a party.", 152 | "Name all the things you can think of that are used in a classroom.", 153 | "Name all the things you can think of that are orange.", 154 | "Name all the things you can think of that are used in a playground.", 155 | "Name all the things you can think of that are used by a police officer.", 156 | "Name all the things you can think of that are used in a store.", 157 | "Name all the things you can think of that are used on a computer.", 158 | "Name all the things you can think of that are used on a phone.", 159 | "Name all the things you can think of that are used in a gym.", 160 | "Name all the things you can think of that are used in a library.", 161 | "Name all the things you can think of that are black.", 162 | "Name all the things you can think of that are white.", 163 | "Name all the things you can think of that can be found in a city.", 164 | "Name all the things you can think of that can be found in the countryside.", 165 | "Name all the things you can think of that are used on a plane.", 166 | "Name all the things you can think of that are used on a train.", 167 | "Name all the things you can think of that are used in a theater.", 168 | "Name all the things you can think of that are used by a builder.", 169 | "Name all the things you can think of that are used in a bank.", 170 | "Name all the things you can think of that are used in a hotel.", 171 | "Name all the things you can think of that are used in a salon.", 172 | "Name all the things you can think of that are used 
in a museum.", 173 | "Name all the things you can think of that are used by a chef.", 174 | "Name all the things you can think of that are used by a gardener.", 175 | "Name all the things you can think of that are used by a plumber.", 176 | "Name all the things you can think of that are used by an electrician.", 177 | "Name all the things you can think of that are used by a teacher.", 178 | "Name all the things you can think of that are used by a dentist.", 179 | "Name all the things you can think of that are used by a veterinarian.", 180 | "Name all the things you can think of that are used by a hairdresser.", 181 | "Name all the things you can think of that are used by a journalist.", 182 | "Name all the things you can think of that are used by a photographer.", 183 | "Name all the things you can think of that are used by a pilot.", 184 | "Name all the things you can think of that are used by a sailor.", 185 | "Name all the things you can think of that are used by a scientist.", 186 | "Name all the things you can think of that are used by a soldier.", 187 | "Name all the things you can think of that are used by a taxi driver.", 188 | "Name all the things you can think of that are used by a bus driver.", 189 | "Name all the things you can think of that are used by a train driver.", 190 | "Name all the things you can think of that are used by a truck driver.", 191 | "Name all the things you can think of that are used by a pilot.", 192 | "Name all the things you can think of that are used in a bakery.", 193 | "Name all the things you can think of that are used in a butcher shop.", 194 | "Name all the things you can think of that are used in a candy store.", 195 | "Name all the things you can think of that are used in a toy store.", 196 | "Name all the things you can think of that are used in a book store.", 197 | "Name all the things you can think of that are used in a grocery store.", 198 | "Name all the things you can think of that are used in a hardware store.", 
199 | "Name all the things you can think of that are used in a clothing store.", 200 | "Name all the things you can think of that are used in a pet store.", 201 | "Name all the things you can think of that are used in a flower shop.", 202 | "Name all the things you can think of that are used in a furniture store." 203 | ], 204 | "Amount": 100 205 | } 206 | -------------------------------------------------------------------------------- /Datasets/Instances/instances_10_val.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Name all the round things you can think of.", 4 | "Name all the things you can think of that will make a noise.", 5 | "Name all the things you can think of that have a screen.", 6 | "Name all the things you can think of that move on wheels.", 7 | "Name all the things you can think of that are red.", 8 | "Name all the animals you can think of that live in the water.", 9 | "Name all the things you can think of that are found in space.", 10 | "Name all the instruments you can think of.", 11 | "Name all the things you can think of that can fly.", 12 | "Name all the flowers you can think of." 13 | ], 14 | "Amount": 10 15 | } 16 | -------------------------------------------------------------------------------- /Datasets/Instances/instances_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Name all the round things you can think of.", 4 | "Name all the things you can think of that will make a noise.", 5 | "Name all the square things you can think of." 
6 | ], 7 | "Amount": 3 8 | } 9 | -------------------------------------------------------------------------------- /Datasets/Instances/instances_30_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Name all the things you can think of that are used in culture.", 4 | "Name all the things you can think of that are found in a bathroom.", 5 | "Name all the things you can think of that are soft.", 6 | "Name all the things you can think of that are used in celebrations.", 7 | "Name all the things you can think of that can be written on.", 8 | "Name all the things you can think of that are used in pastimes.", 9 | "Name all the things you can think of that are used for cleaning.", 10 | "Name all the things you can think of that use electricity.", 11 | "Name all the languages you can think of.", 12 | "Name all the things you can think of that are used in a pet store.", 13 | "Name all the things you can think of that are used in sports.", 14 | "Name all the things you can think of that are spicy.", 15 | "Name all the things you can think of that are used in recreation.", 16 | "Name all the things you can think of that are used by a dentist.", 17 | "Name all the things you can think of that are used in a library.", 18 | "Name all the things you can think of that are used by a sailor.", 19 | "Name all the things you can think of that are found in a forest.", 20 | "Name all the things you can think of that are used in a kitchen.", 21 | "Name all the things you can think of that are used in sociology.", 22 | "Name all the things you can think of that are hot.", 23 | "Name all the things you can think of that are used in technology.", 24 | "Name all the things you can think of that can be worn on your feet.", 25 | "Name all the things you can think of that are used by an electrician.", 26 | "Name all the things you can think of that have keys.", 27 | "Name all the things you can think of that are used in 
sports.", 28 | "Name all the fruits you can think of.", 29 | "Name all the things you can think of that are used on a computer.", 30 | "Name all the things you can think of that are used in music.", 31 | "Name all the things you can think of that are used in enjoyment.", 32 | "Name all the things you can think of that require electricity." 33 | ], 34 | "Amount": 30 35 | } -------------------------------------------------------------------------------- /Datasets/Instances/instances_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Name all the round things you can think of." 4 | ], 5 | "Amount": 3 6 | } 7 | -------------------------------------------------------------------------------- /Datasets/README.md: -------------------------------------------------------------------------------- 1 | # This folder contains the augmented datasets 2 | - It contains the Scientific Creativity and WKCT datasets. 3 | 4 | ## The format of each category 5 | 6 | - WKCT contains the AUT, Instances, and Similarities tests. 7 | 8 | #### AUT 9 | - Each of them has .json files for 10, 30 and 100 examples 10 | - Format: 11 | ``` 12 | { 13 | "Task": [ 14 | { 15 | "Problem": [ 16 | ... 17 | ], 18 | "Purpose": "Prompt 1" 19 | }, 20 | ... 21 | { 22 | "Problem": [ 23 | ... 24 | ], 25 | "Purpose": "Prompt n" 26 | }, 27 | ... 28 | ], 29 | "Examples": [ 30 | { 31 | "object": "Fork" 32 | }, 33 | ... 34 | { 35 | "object": "Mirror" 36 | } 37 | ], 38 | "Amount": n 39 | } 40 | ``` 41 | 42 | #### Instances Test 43 | - Format: 44 | ``` 45 | { 46 | "Examples": [ 47 | "example 1", 48 | "example 2", 49 | ... 50 | ] 51 | } 52 | ``` 53 | 54 | #### Similarities Test 55 | - Format: 56 | ``` 57 | { 58 | "Examples": [ 59 | "example 1", 60 | "example 2", 61 | ...
62 | ] 63 | } 64 | ``` 65 | 66 | ## Scientific Creativity Test 67 | - We have 5 original questions. 68 | - `Example` lists the similar examples generated by GPT-4 for each original question. 69 | - Format: 70 | ``` 71 | { 72 | "Task": [ 73 | { 74 | "Original": "...", 75 | "Example": [ 76 | "example 1", 77 | ... 78 | "example n" 79 | ], 80 | "Amount": "n" 81 | }, 82 | ... 83 | { 84 | "Original": "...", 85 | "Example": [ 86 | "example 1", 87 | ... 88 | "example n" 89 | ], 90 | "Amount": "n" 91 | } 92 | ] 93 | } 94 | ``` 95 | -------------------------------------------------------------------------------- /Datasets/Scientific/scientific_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task":[ 3 | { 4 | "Original": "Please write down as many as possible scientific uses as you can for a piece of glass. For example, make a test tube.", 5 | "Example": [ 6 | "Innovative Scientific Tools Using a Rubber Band: Think of as many scientific uses as possible for a rubber band. For example, using it as a simple measure of elasticity in a physics experiment.", 7 | "Innovative Uses for a Water Bottle: Consider various scientific applications of a water bottle. For example, using it as a container for creating a water vortex." 8 | ], 9 | "Amount": "2" 10 | }, 11 | { 12 | "Original": "If you can take a spaceship to travel in outer space and go to a planet, what scientific questions do you want to research? For example, are there any living things on the planet?", 13 | "Example": [ 14 | "If you could explore the deepest part of the ocean, what mysteries or unknown species would you expect to find?", 15 | "What scientific experiments would you conduct if you could become invisible at will?" 16 | ], 17 | "Amount": "2" 18 | }, 19 | { 20 | "Original": "Please think up as many possible improvements as you can to a regular bicycle, making it more interesting, more useful and more beautiful. 
For example, make the tires reflective, so they can be seen in the dark.", 21 | "Example": [ 22 | "How would you improve a standard wristwatch to make it more useful, stylish, and innovative? For example, integrating a solar panel for self-charging.", 23 | "What enhancements can be made to a pair of eyeglasses to increase their functionality and aesthetic appeal? For instance, adding transition lenses that adjust to light conditions." 24 | ], 25 | "Amount": "2" 26 | }, 27 | { 28 | "Original": "Suppose there was no gravity, describe what the world would be like? For example, human beings would be floating.", 29 | "Example": [ 30 | "Imagine if water was not transparent but colored. How would this change the appearance of oceans, rivers, and rain?", 31 | "What if humans had the ability to photosynthesize like plants? How would our daily lives and food systems change?" 32 | ], 33 | "Amount": "2" 34 | }, 35 | { 36 | "Original": "There are two kinds of napkins. How can you test which is better? Please write down as many possible methods as you can and the instruments, principles and simple procedure.", 37 | "Example": [ 38 | "Two different light bulbs are available. How can you test which one is more energy-efficient? Consider the instruments needed, the principles of electricity, and a simple testing procedure.", 39 | "How can you determine which of two sunscreens provides better UV protection? Outline the methods, necessary instruments, and a brief procedure for the test." 40 | ], 41 | "Amount": "2" 42 | } 43 | ] 44 | } -------------------------------------------------------------------------------- /Datasets/Scientific/scientific_100.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task":[ 3 | { 4 | "Original": "Please write down as many as possible scientific uses as you can for a piece of glass. 
For example, make a test tube.", 5 | "Example": [ 6 | "Innovative Scientific Tools Using a Rubber Band: Think of as many scientific uses as possible for a rubber band. For example, using it as a simple measure of elasticity in a physics experiment.", 7 | "Innovative Uses for a Water Bottle: Consider various scientific applications of a water bottle. For example, using it as a container for creating a water vortex.", 8 | "Creative Scientific Applications of a Pencil: Explore different scientific ways to use a pencil. For example, using it to demonstrate principles of gravity and balance.", 9 | "Exploring Scientific Concepts with a Spoon: Find different scientific uses for a spoon. For example, demonstrating principles of reflection or heat conduction.", 10 | "A Paperclip's Role in Science: Consider the scientific applications of a paperclip. For example, using it to understand magnetic properties.", 11 | "Scientific Investigations with a Straw: Explore how a straw can be used in scientific studies. For example, demonstrating principles of air pressure and fluid dynamics.", 12 | "Scientific Uses of a Balloon: Identify the different ways a balloon can be used in science. For example, demonstrating static electricity or understanding gas laws.", 13 | "Exploring Science with a Button: Find scientific applications for a button. For example, using it to study materials science or color absorption.", 14 | "Using a Matchstick in Science: Think of different scientific experiments using a matchstick. For example, studying combustion or chemical reactions.", 15 | "Utilizing a Nail in Science Experiments: Consider the scientific uses of a nail. For example, studying rust formation or properties of metals.", 16 | "Exploring Science with a Feather: Find scientific experiments involving a feather. For example, studying aerodynamics or bird biology.", 17 | "Scientific Studies Using a Thread: Think of various ways a thread can be used in science. 
For example, demonstrating tension strength or capillary action.", 18 | "A Bowl's Contribution to Science: Explore how a bowl can be used in scientific settings. For example, understanding sound waves or using it in mixing chemicals.", 19 | "Using a Lamp in Scientific Research: Consider the scientific uses of a lamp. For example, studying light properties or energy efficiency.", 20 | "Scientific Uses of a Magnet: Think of various scientific experiments involving a magnet. For example, exploring magnetic fields or material magnetism.", 21 | "Utilizing a Brick in Science: Consider the scientific applications of a brick. For example, studying material strength or thermal properties.", 22 | "Scientific Investigations with a Battery: Identify different scientific uses for a battery. For example, understanding electrochemistry or energy storage.", 23 | "Exploring Science with a Candle: Find various ways a candle can be used in science. For example, studying combustion or the properties of wax.", 24 | "Scientific Experiments with a Glass Jar: Explore different scientific uses for a glass jar. For example, creating a terrarium or studying gas laws.", 25 | "A Scissors' Contribution to Science: Consider how scissors can be utilized in scientific studies. For example, studying material properties or lever mechanics." 26 | ], 27 | "Amount": "20" 28 | }, 29 | { 30 | "Original": "If you can take a spaceship to travel in outer space and go to a planet, what scientific questions do you want to research? 
For example, are there any living things on the planet?", 31 | "Example": [ 32 | "If you could explore the deepest part of the ocean, what mysteries or unknown species would you expect to find?", 33 | "What scientific experiments would you conduct if you could become invisible at will?", 34 | "How would you investigate the sky if it suddenly turned permanently red worldwide?", 35 | "If you had the ability to communicate with animals, what scientific studies would you pursue to understand their thoughts and behaviors?", 36 | "If you discovered a way to become immortal, what long-term scientific experiments would you plan?", 37 | "How would you study and analyze a plant that could grow and shrink at command?", 38 | "What would be your approach to research if humans suddenly developed the ability to fly like birds?", 39 | "If you found an underwater city, what archaeological and historical investigations would you conduct?", 40 | "How would you scientifically explore a scenario where the moon changes color every night?", 41 | "If you had the power to control weather, what experiments would you design to understand its impact on ecosystems?", 42 | "What scientific studies would you conduct if you discovered an island where mythical creatures like dragons existed?", 43 | "How would you study the effects on human health and society if everyone stopped needing sleep?", 44 | "What scientific research would you pursue if you found a way to shrink to the size of an ant?", 45 | "If a new planet appeared in our solar system, visible to the naked eye, what would be your first research steps?", 46 | "What experiments would you design if you discovered a mirror that showed the future?", 47 | "If you could breathe underwater without any equipment, what underwater mysteries or scientific studies would you explore?", 48 | "How would you investigate the causes and effects of a phenomenon where time moved twice as fast for everyone?", 49 | "If a pill could make people understand and 
speak any language fluently, what sociolinguistic studies would you undertake?", 50 | "What scientific questions would you explore if you encountered a cloud that rained only over one specific house?", 51 | "How would you research the ecological and geological impacts if the Himalayas doubled in height overnight?" 52 | ], 53 | "Amount": "20" 54 | }, 55 | { 56 | "Original": "Please think up as many possible improvements as you can to a regular bicycle, making it more interesting, more useful and more beautiful. For example, make the tires reflective, so they can be seen in the dark.", 57 | "Example": [ 58 | "How would you improve a standard wristwatch to make it more useful, stylish, and innovative? For example, integrating a solar panel for self-charging.", 59 | "What enhancements can be made to a pair of eyeglasses to increase their functionality and aesthetic appeal? For instance, adding transition lenses that adjust to light conditions.", 60 | "How could you upgrade a common backpack to make it more versatile, comfortable, and attractive? Example: incorporating a built-in USB charging port.", 61 | "What improvements could be made to a standard umbrella to enhance its utility and design? Such as, designing it with wind-resistant features.", 62 | "How would you enhance a regular pair of shoes to make them more beneficial and fashionable? For example, adding self-tying laces.", 63 | "What could be done to improve a conventional car to increase its efficiency, safety, and beauty? For instance, integrating advanced autonomous driving capabilities.", 64 | "How can you upgrade a typical desk lamp to make it more functional and aesthetically pleasing? For example, adding color-changing LED options.", 65 | "How would you enhance a regular water bottle to make it more useful and attractive? For example, integrating a temperature control system.", 66 | "How can you upgrade a conventional pen to make it more innovative and stylish? 
For example, including a digital display for notes.", 67 | "In what ways could you improve a typical kitchen knife to enhance its utility and appearance? Such as using a self-sharpening blade.", 68 | "What improvements could be made to a standard chair to increase comfort and visual appeal? Like designing it with adjustable ergonomic features.", 69 | "How can you upgrade a traditional mirror to make it more useful and elegant? For instance, integrating a touchscreen interface with smart features.", 70 | "How can you upgrade a conventional pillow to make it more comfortable and beneficial? For instance, infusing it with temperature-regulating materials.", 71 | "In what ways could a standard light bulb be improved for better efficiency and ambience? Such as creating bulbs with adjustable color temperatures.", 72 | "How would you enhance a regular door to make it more secure and visually appealing? For example, adding a smart lock with biometric access.", 73 | "What improvements could be made to a standard window to increase its utility and design? Like integrating self-tinting glass technology.", 74 | "How would you enhance a regular notebook to make it more efficient and attractive? For example, incorporating reusable and digital pages.", 75 | "What improvements could be made to a standard coat to increase its warmth and fashion appeal? Like adding solar-powered heating elements.", 76 | "How can you upgrade a conventional bed to make it more comfortable and multifunctional? For instance, integrating a built-in adjustable massage system.", 77 | "In what ways could a standard trash can be improved for better usability and aesthetics? Such as adding smart waste sorting capabilities." 78 | ], 79 | "Amount": "1" 80 | }, 81 | { 82 | "Original": "Suppose there was no gravity, describe what the world would be like? For example, human beings would be floating.", 83 | "Example": [ 84 | "Imagine if water was not transparent but colored. 
How would this change the appearance of oceans, rivers, and rain?", 85 | "What if humans had the ability to photosynthesize like plants? How would our daily lives and food systems change?", 86 | "Suppose all animals had the ability to speak. How would this transform our relationship with other species?", 87 | "Imagine if the Earth spun twice as fast. What would be the implications for day and night cycles and life in general?", 88 | "What if trees could walk? Consider how this would affect forests and ecosystems.", 89 | "What if we could see a wider spectrum of colors beyond the current visible range? Describe the potential changes in art and design.", 90 | "Suppose all humans had perfect memory recall. How would this impact learning and society?", 91 | "Imagine if we had a sixth sense that allowed us to perceive magnetic fields. How would this change navigation and technology?", 92 | "What if mountains could grow and shrink within a human lifetime? Consider the effects on geography and human settlements.", 93 | "Imagine if we could hear ultrasonic frequencies. How would this alter our perception of sound and communication?", 94 | "What if we could regenerate lost limbs like some animals? Discuss the implications for medicine and human ability.", 95 | "Suppose we lived on a planet with two suns. How would this affect our daily life, culture, and environment?", 96 | "Suppose gravity could be turned on and off like a switch. How would this change architecture, transportation, and sports?", 97 | "Imagine if we could teleport to any location instantly. How would this revolutionize travel and society?", 98 | "Imagine if sleep was not necessary for humans. Consider the changes in lifestyle and productivity.", 99 | "Suppose we could manipulate time. How would this ability affect our perception of life and decision-making?", 100 | "Imagine if the ocean was transparent. 
How would this change our understanding of marine life?", 101 | "What if humans had a lifespan of 200 years? Consider the societal and personal impacts.", 102 | "What if all food was nutritionally perfect? How would this affect health, cuisine, and agriculture?", 103 | "Suppose humans could fly like birds. Consider the changes in transportation, architecture, and recreation." 104 | ], 105 | "Amount": "1" 106 | }, 107 | { 108 | "Original": "There are two kinds of napkins. How can you test which is better? Please write down as many possible methods as you can and the instruments, principles and simple procedure.", 109 | "Example": [ 110 | "Two different light bulbs are available. How can you test which one is more energy-efficient? Consider the instruments needed, the principles of electricity, and a simple testing procedure.", 111 | "How can you determine which of two sunscreens provides better UV protection? Outline the methods, necessary instruments, and a brief procedure for the test.", 112 | "Two brands of batteries claim long-lasting power. How can you test which one lasts longer? Describe your approach, the instruments you'll use, and the testing process.", 113 | "You have two different kinds of soil for planting. How can you test which one is better for plant growth? Explain your testing methods, required tools, and the procedure.", 114 | "Compare two kinds of water filters for effectiveness. What are the possible methods, instruments needed, and the basic testing steps?", 115 | "How can you test which of two computer monitors has better color accuracy? Describe the approach, tools needed, and a brief procedure.", 116 | "Compare two types of bicycle helmets for safety. What testing methods, instruments, and procedures would you use?", 117 | "How can you test which of two smartphones has a longer battery life? 
Explain your testing approach, the necessary tools, and the procedure.", 118 | "Determine which of two kinds of toothpaste is more effective at whitening teeth. Describe the testing methods, instruments, and a simple procedure.", 119 | "You have two brands of kitchen knives. How can you test which one stays sharp longer? Detail your testing strategy, required instruments, and the procedure.", 120 | "Compare two types of hair shampoo for effectiveness. What methods, tools, and simple procedures would you use?", 121 | "How can you test which of two types of laundry detergent cleans better? Explain your approach, necessary equipment, and a brief testing process.", 122 | "You have two different types of car tires. How can you test which one has better traction? Outline your testing methods, necessary instruments, and the basic steps.", 123 | "Compare two types of bread for freshness over time. What testing methods, tools, and procedures would you use?", 124 | "Determine which of two types of blankets is more fire-resistant. Describe the testing methods, necessary tools, and a simple procedure.", 125 | "Compare two types of garden hoses for durability. What methods, tools, and simple procedures would you use?", 126 | "How can you test which of two types of paint is more fade-resistant? Explain your approach, the necessary instruments, and the testing process.", 127 | "Determine which of two kinds of paper is more recyclable. Describe the testing methods, tools required, and a simple procedure.", 128 | "Compare two types of yoga mats for grip and comfort. What methods, tools, and procedures would you use?", 129 | "Determine which of two types of hand sanitizers is more effective against germs. Describe the testing methods, tools required, and a simple procedure." 
130 | ], 131 | "Amount": "1" 132 | } 133 | ] 134 | } -------------------------------------------------------------------------------- /Datasets/Scientific/scientific_10_val.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task":[ 3 | { 4 | "Original": "Please write down as many as possible scientific uses as you can for a piece of glass. For example, make a test tube.", 5 | "Example": [ 6 | "Innovative Scientific Tools Using a Rubber Band: Think of as many scientific uses as possible for a rubber band. For example, using it as a simple measure of elasticity in a physics experiment.", 7 | "Innovative Uses for a Water Bottle: Consider various scientific applications of a water bottle. For example, using it as a container for creating a water vortex." 8 | ], 9 | "Amount": "2" 10 | }, 11 | { 12 | "Original": "If you can take a spaceship to travel in outer space and go to a planet, what scientific questions do you want to research? For example, are there any living things on the planet?", 13 | "Example": [ 14 | "If you could explore the deepest part of the ocean, what mysteries or unknown species would you expect to find?", 15 | "What scientific experiments would you conduct if you could become invisible at will?" 16 | ], 17 | "Amount": "2" 18 | }, 19 | { 20 | "Original": "Please think up as many possible improvements as you can to a regular bicycle, making it more interesting, more useful and more beautiful. For example, make the tires reflective, so they can be seen in the dark.", 21 | "Example": [ 22 | "How would you improve a standard wristwatch to make it more useful, stylish, and innovative? For example, integrating a solar panel for self-charging.", 23 | "What enhancements can be made to a pair of eyeglasses to increase their functionality and aesthetic appeal? For instance, adding transition lenses that adjust to light conditions." 
24 | ], 25 | "Amount": "2" 26 | }, 27 | { 28 | "Original": "Suppose there was no gravity, describe what the world would be like? For example, human beings would be floating.", 29 | "Example": [ 30 | "Imagine if water was not transparent but colored. How would this change the appearance of oceans, rivers, and rain?", 31 | "What if humans had the ability to photosynthesize like plants? How would our daily lives and food systems change?" 32 | ], 33 | "Amount": "2" 34 | }, 35 | { 36 | "Original": "There are two kinds of napkins. How can you test which is better? Please write down as many possible methods as you can and the instruments, principles and simple procedure.", 37 | "Example": [ 38 | "Two different light bulbs are available. How can you test which one is more energy-efficient? Consider the instruments needed, the principles of electricity, and a simple testing procedure.", 39 | "How can you determine which of two sunscreens provides better UV protection? Outline the methods, necessary instruments, and a brief procedure for the test." 40 | ], 41 | "Amount": "2" 42 | } 43 | ] 44 | } -------------------------------------------------------------------------------- /Datasets/Scientific/scientific_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task":[ 3 | { 4 | "Original": "Please write down as many as possible scientific uses as you can for a piece of glass. For example, make a test tube.", 5 | "Example": [ 6 | "Innovative Scientific Tools Using a Rubber Band: Think of as many scientific uses as possible for a rubber band. For example, using it as a simple measure of elasticity in a physics experiment.", 7 | "Innovative Uses for a Water Bottle: Consider various scientific applications of a water bottle. For example, using it as a container for creating a water vortex." 
8 | ], 9 | "Amount": "2" 10 | }, 11 | { 12 | "Original": "If you can take a spaceship to travel in outer space and go to a planet, what scientific questions do you want to research? For example, are there any living things on the planet?", 13 | "Example": [ 14 | "If you could explore the deepest part of the ocean, what mysteries or unknown species would you expect to find?" 15 | ], 16 | "Amount": "1" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /Datasets/Scientific/scientific_30_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task": [ 3 | { 4 | "Original": "Please write down as many as possible scientific uses as you can for a piece of glass. For example, make a test tube.", 5 | "Example": [ 6 | "Creative Scientific Applications of a Pencil: Explore different scientific ways to use a pencil. For example, using it to demonstrate principles of gravity and balance.", 7 | "Exploring Science with a Button: Find scientific applications for a button. For example, using it to study materials science or color absorption.", 8 | "Scientific Investigations with a Battery: Identify different scientific uses for a battery. For example, understanding electrochemistry or energy storage.", 9 | "Exploring Science with a Feather: Find scientific experiments involving a feather. For example, studying aerodynamics or bird biology.", 10 | "Utilizing a Brick in Science: Consider the scientific applications of a brick. For example, studying material strength or thermal properties.", 11 | "Scientific Experiments with a Glass Jar: Explore different scientific uses for a glass jar. For example, creating a terrarium or studying gas laws." 12 | ], 13 | "Amount": "6" 14 | }, 15 | { 16 | "Original": "If you can take a spaceship to travel in outer space and go to a planet, what scientific questions do you want to research? 
For example, are there any living things on the planet?", 17 | "Example": [ 18 | "How would you scientifically explore a scenario where the moon changes color every night?", 19 | "What experiments would you design if you discovered a mirror that showed the future?", 20 | "If you could breathe underwater without any equipment, what underwater mysteries or scientific studies would you explore?", 21 | "If you discovered a way to become immortal, what long-term scientific experiments would you plan?", 22 | "How would you investigate the causes and effects of a phenomenon where time moved twice as fast for everyone?", 23 | "How would you study the effects on human health and society if everyone stopped needing sleep?" 24 | ], 25 | "Amount": "6" 26 | }, 27 | { 28 | "Original": "Please think up as many possible improvements as you can to a regular bicycle, making it more interesting, more useful and more beautiful. For example, make the tires reflective, so they can be seen in the dark.", 29 | "Example": [ 30 | "How can you upgrade a conventional pen to make it more innovative and stylish? For example, including a digital display for notes.", 31 | "How can you upgrade a conventional pillow to make it more comfortable and beneficial? For instance, infusing it with temperature-regulating materials.", 32 | "How could you upgrade a common backpack to make it more versatile, comfortable, and attractive? Example: incorporating a built-in USB charging port.", 33 | "How would you enhance a regular pair of shoes to make them more beneficial and fashionable? For example, adding self-tying laces.", 34 | "How can you upgrade a traditional mirror to make it more useful and elegant? For instance, integrating a touchscreen interface with smart features.", 35 | "How can you upgrade a typical desk lamp to make it more functional and aesthetically pleasing? For example, adding color-changing LED options." 
36 | ], 37 | "Amount": "6" 38 | }, 39 | { 40 | "Original": "Suppose there was no gravity, describe what the world would be like? For example, human beings would be floating.", 41 | "Example": [ 42 | "Suppose gravity could be turned on and off like a switch. How would this change architecture, transportation, and sports?", 43 | "What if we could regenerate lost limbs like some animals? Discuss the implications for medicine and human ability.", 44 | "What if humans had a lifespan of 200 years? Consider the societal and personal impacts.", 45 | "Imagine if we could hear ultrasonic frequencies. How would this alter our perception of sound and communication?", 46 | "Imagine if we could teleport to any location instantly. How would this revolutionize travel and society?", 47 | "What if mountains could grow and shrink within a human lifetime? Consider the effects on geography and human settlements." 48 | ], 49 | "Amount": "6" 50 | }, 51 | { 52 | "Original": "There are two kinds of napkins. How can you test which is better? Please write down as many possible methods as you can and the instruments, principles and simple procedure.", 53 | "Example": [ 54 | "How can you test which of two smartphones has a longer battery life? Explain your testing approach, the necessary tools, and the procedure.", 55 | "You have two different kinds of soil for planting. How can you test which one is better for plant growth? Explain your testing methods, required tools, and the procedure.", 56 | "Two brands of batteries claim long-lasting power. How can you test which one lasts longer? Describe your approach, the instruments you'll use, and the testing process.", 57 | "How can you test which of two computer monitors has better color accuracy? Describe the approach, tools needed, and a brief procedure.", 58 | "Determine which of two kinds of toothpaste is more effective at whitening teeth. 
Describe the testing methods, instruments, and a simple procedure.", 59 | "Determine which of two types of blankets is more fire-resistant. Describe the testing methods, necessary tools, and a simple procedure." 60 | ], 61 | "Amount": "6" 62 | } 63 | ] 64 | } -------------------------------------------------------------------------------- /Datasets/Scientific/scientific_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "Task":[ 3 | { 4 | "Original": "If you can take a spaceship to travel in outer space and go to a planet, what scientific questions do you want to research? For example, are there any living things on the planet?", 5 | "Example": [ 6 | "If you could explore the deepest part of the ocean, what mysteries or unknown species would you expect to find?" 7 | ], 8 | "Amount": "1" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /Datasets/Similarities/similarities_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Tell me all the ways in which a book and a magazine are alike.", 4 | "Tell me all the ways in which a car and a bicycle are alike.", 5 | "Tell me all the ways in which a tree and a flower are alike.", 6 | "Tell me all the ways in which a lake and an ocean are alike.", 7 | "Tell me all the ways in which a bird and an airplane are alike.", 8 | "Tell me all the ways in which a computer and a smartphone are alike.", 9 | "Tell me all the ways in which a pencil and a pen are alike.", 10 | "Tell me all the ways in which a chair and a couch are alike.", 11 | "Tell me all the ways in which a fish and a whale are alike.", 12 | "Tell me all the ways in which a star and a planet are alike." 
13 | ] 14 | } -------------------------------------------------------------------------------- /Datasets/Similarities/similarities_100.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Tell me all the ways in which a book and a magazine are alike.", 4 | "Tell me all the ways in which a car and a bicycle are alike.", 5 | "Tell me all the ways in which a tree and a flower are alike.", 6 | "Tell me all the ways in which a lake and an ocean are alike.", 7 | "Tell me all the ways in which a bird and an airplane are alike.", 8 | "Tell me all the ways in which a computer and a smartphone are alike.", 9 | "Tell me all the ways in which a pencil and a pen are alike.", 10 | "Tell me all the ways in which a chair and a couch are alike.", 11 | "Tell me all the ways in which a fish and a whale are alike.", 12 | "Tell me all the ways in which a star and a planet are alike.", 13 | "Tell me all the ways in which a shoe and a sock are alike.", 14 | "Tell me all the ways in which a sandwich and a pizza are alike.", 15 | "Tell me all the ways in which a mountain and a hill are alike.", 16 | "Tell me all the ways in which a river and a stream are alike.", 17 | "Tell me all the ways in which a clock and a watch are alike.", 18 | "Tell me all the ways in which a lamp and a flashlight are alike.", 19 | "Tell me all the ways in which a door and a window are alike.", 20 | "Tell me all the ways in which a refrigerator and an oven are alike.", 21 | "Tell me all the ways in which a knife and a fork are alike.", 22 | "Tell me all the ways in which a camera and a video camera are alike.", 23 | "Tell me all the ways in which a hat and a cap are alike.", 24 | "Tell me all the ways in which a butterfly and a bee are alike.", 25 | "Tell me all the ways in which a piano and a guitar are alike.", 26 | "Tell me all the ways in which a shirt and a jacket are alike.", 27 | "Tell me all the ways in which a snake and a lizard are alike.", 
28 | "Tell me all the ways in which a cloud and fog are alike.", 29 | "Tell me all the ways in which a train and a bus are alike.", 30 | "Tell me all the ways in which a cup and a mug are alike.", 31 | "Tell me all the ways in which a park and a garden are alike.", 32 | "Tell me all the ways in which a movie and a play are alike.", 33 | "Tell me all the ways in which a poem and a novel are alike.", 34 | "Tell me all the ways in which a toothbrush and a comb are alike.", 35 | "Tell me all the ways in which ice cream and yogurt are alike.", 36 | "Tell me all the ways in which a violin and a cello are alike.", 37 | "Tell me all the ways in which a basketball and a soccer ball are alike.", 38 | "Tell me all the ways in which a tablet and an e-reader are alike.", 39 | "Tell me all the ways in which a mirror and a photograph are alike.", 40 | "Tell me all the ways in which a salad and a soup are alike.", 41 | "Tell me all the ways in which a dragonfly and a butterfly are alike.", 42 | "Tell me all the ways in which a key and a lock are alike.", 43 | "Tell me all the ways in which a brick and a stone are alike.", 44 | "Tell me all the ways in which a scarf and a glove are alike.", 45 | "Tell me all the ways in which a spider and an ant are alike.", 46 | "Tell me all the ways in which a cake and a pie are alike.", 47 | "Tell me all the ways in which a frog and a toad are alike.", 48 | "Tell me all the ways in which a motorcycle and a scooter are alike.", 49 | "Tell me all the ways in which a tower and a skyscraper are alike.", 50 | "Tell me all the ways in which a boat and a ship are alike.", 51 | "Tell me all the ways in which a candle and a lamp are alike.", 52 | "Tell me all the ways in which a desk and a table are alike.", 53 | "Tell me all the ways in which a coat and a cloak are alike.", 54 | "Tell me all the ways in which a glass and a bottle are alike.", 55 | "Tell me all the ways in which a squirrel and a rabbit are alike.", 56 | "Tell me all the ways in which a 
diary and a journal are alike.", 57 | "Tell me all the ways in which a blender and a mixer are alike.", 58 | "Tell me all the ways in which a leopard and a tiger are alike.", 59 | "Tell me all the ways in which a hammer and a wrench are alike.", 60 | "Tell me all the ways in which a ballet and an opera are alike.", 61 | "Tell me all the ways in which a zebra and a horse are alike.", 62 | "Tell me all the ways in which a duck and a swan are alike.", 63 | "Tell me all the ways in which a berry and a nut are alike.", 64 | "Tell me all the ways in which a calendar and a diary are alike.", 65 | "Tell me all the ways in which a guitar and a ukulele are alike.", 66 | "Tell me all the ways in which a kite and a balloon are alike.", 67 | "Tell me all the ways in which a castle and a palace are alike.", 68 | "Tell me all the ways in which a lion and a cheetah are alike.", 69 | "Tell me all the ways in which a stove and a fireplace are alike.", 70 | "Tell me all the ways in which a rug and a carpet are alike.", 71 | "Tell me all the ways in which a muffin and a cupcake are alike.", 72 | "Tell me all the ways in which a peach and a plum are alike.", 73 | "Tell me all the ways in which a beetle and a cockroach are alike.", 74 | "Tell me all the ways in which a fox and a wolf are alike.", 75 | "Tell me all the ways in which a crow and a raven are alike.", 76 | "Tell me all the ways in which a shrimp and a crab are alike.", 77 | "Tell me all the ways in which a passport and a ticket are alike.", 78 | "Tell me all the ways in which a brush and a broom are alike.", 79 | "Tell me all the ways in which a church and a mosque are alike.", 80 | "Tell me all the ways in which a turtle and a tortoise are alike.", 81 | "Tell me all the ways in which a melon and a watermelon are alike.", 82 | "Tell me all the ways in which a skateboard and a rollerblade are alike.", 83 | "Tell me all the ways in which a crayon and a marker are alike.", 84 | "Tell me all the ways in which a map and a globe 
are alike.", 85 | "Tell me all the ways in which a bat and a bird are alike.", 86 | "Tell me all the ways in which a guitar and a banjo are alike.", 87 | "Tell me all the ways in which a hedgehog and a porcupine are alike.", 88 | "Tell me all the ways in which a yacht and a sailboat are alike.", 89 | "Tell me all the ways in which a lighthouse and a beacon are alike.", 90 | "Tell me all the ways in which a peach and a nectarine are alike.", 91 | "Tell me all the ways in which a donkey and a horse are alike.", 92 | "Tell me all the ways in which a notebook and a ledger are alike.", 93 | "Tell me all the ways in which a flute and a clarinet are alike.", 94 | "Tell me all the ways in which a jellyfish and an octopus are alike.", 95 | "Tell me all the ways in which a sweater and a poncho are alike.", 96 | "Tell me all the ways in which a comet and an asteroid are alike.", 97 | "Tell me all the ways in which a diary and a planner are alike.", 98 | "Tell me all the ways in which a trumpet and a trombone are alike.", 99 | "Tell me all the ways in which a vase and a pot are alike.", 100 | "Tell me all the ways in which an owl and a hawk are alike.", 101 | "Tell me all the ways in which an apple and a pear are alike.", 102 | "Tell me all the ways in which a goldfish and a guppy are alike." 
103 | ] 104 | } -------------------------------------------------------------------------------- /Datasets/Similarities/similarities_10_val.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Tell me all the ways in which a book and a magazine are alike.", 4 | "Tell me all the ways in which a car and a bicycle are alike.", 5 | "Tell me all the ways in which a tree and a flower are alike.", 6 | "Tell me all the ways in which a cake and a pie are alike.", 7 | "Tell me all the ways in which a bird and an airplane are alike.", 8 | "Tell me all the ways in which a kite and a balloon are alike.", 9 | "Tell me all the ways in which a pencil and a pen are alike.", 10 | "Tell me all the ways in which a chair and a couch are alike.", 11 | "Tell me all the ways in which a jellyfish and an octopus are alike.", 12 | "Tell me all the ways in which a star and a planet are alike." 13 | ] 14 | } -------------------------------------------------------------------------------- /Datasets/Similarities/similarities_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Tell me all the ways in which a book and a magazine are alike.", 4 | "Tell me all the ways in which a car and a bicycle are alike.", 5 | "Tell me all the ways in which a tree and a flower are alike." 
6 | ] 7 | } -------------------------------------------------------------------------------- /Datasets/Similarities/similarities_30_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Tell me all the ways in which a hedgehog and a porcupine are alike.", 4 | "Tell me all the ways in which a diary and a planner are alike.", 5 | "Tell me all the ways in which a crayon and a marker are alike.", 6 | "Tell me all the ways in which a duck and a swan are alike.", 7 | "Tell me all the ways in which a passport and a ticket are alike.", 8 | "Tell me all the ways in which a salad and a soup are alike.", 9 | "Tell me all the ways in which a fox and a wolf are alike.", 10 | "Tell me all the ways in which a jellyfish and an octopus are alike.", 11 | "Tell me all the ways in which a poem and a novel are alike.", 12 | "Tell me all the ways in which a butterfly and a bee are alike.", 13 | "Tell me all the ways in which a brush and a broom are alike.", 14 | "Tell me all the ways in which a shoe and a sock are alike.", 15 | "Tell me all the ways in which a movie and a play are alike.", 16 | "Tell me all the ways in which a park and a garden are alike.", 17 | "Tell me all the ways in which a lamp and a flashlight are alike.", 18 | "Tell me all the ways in which a goldfish and a guppy are alike.", 19 | "Tell me all the ways in which a lion and a cheetah are alike.", 20 | "Tell me all the ways in which a comet and an asteroid are alike.", 21 | "Tell me all the ways in which a skateboard and a rollerblade are alike.", 22 | "Tell me all the ways in which ice cream and yogurt are alike.", 23 | "Tell me all the ways in which a sandwich and a pizza are alike.", 24 | "Tell me all the ways in which a cloud and fog are alike.", 25 | "Tell me all the ways in which a frog and a toad are alike.", 26 | "Tell me all the ways in which a snake and a lizard are alike.", 27 | "Tell me all the ways in which a diary and a journal are 
alike.", 28 | "Tell me all the ways in which a violin and a cello are alike.", 29 | "Tell me all the ways in which a ballet and an opera are alike.", 30 | "Tell me all the ways in which a donkey and a horse are alike.", 31 | "Tell me all the ways in which an apple and a pear are alike.", 32 | "Tell me all the ways in which a berry and a nut are alike." 33 | 34 | ], 35 | "Amount": 30 36 | } -------------------------------------------------------------------------------- /Datasets/Similarities/similarities_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "Examples": [ 3 | "Tell me all the ways in which a book and a magazine are alike." 4 | ] 5 | } -------------------------------------------------------------------------------- /Evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | *.pickle 2 | .env 3 | .auto_grade_bai.py -------------------------------------------------------------------------------- /Evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation API Tool 2 | Evaluate **Fluency**, **Flexibility**, **Originality**, and **Elaboration** 3 | 4 | ## Setup and Installation 5 | 6 | 7 | ### 1. API Keys 8 | Before you execute, export your API key (set it in your environment) 9 | ```bash 10 | export OPENAI_API_KEY="your_api_key_here" 11 | ``` 12 | You can also write it into your ~/.bashrc or ~/.zshrc 13 | 14 | ### 2. Running the Script 15 | ```bash 16 | -v, --version: Specify the version of the OpenAI model to use. Options are "3" (for GPT-3.5) and "4" (for GPT-4). Default is "3". 17 | 18 | -i, --input_file: Name (without .json) of the input file located in the Results/{task}/Output/single_agent or Results/{task}/Output/multi_agent directory (chosen from the second "_"-separated field of the file name). {task} is "AUT", "Scientific", "Instances", or "Similarities" 19 | 20 | -s, --sample: Number of times to sample the evaluation. Default is 3. 21 | 22 | -d, --task: Task for the evaluation. 
def ensure_folder_exists(path):
    """Create the directory ``path`` (and any missing parents) if it is absent.

    Nothing happens when ``path`` already exists. When the directory is
    created, a one-line notice is printed to stdout.
    """
    if os.path.exists(path):
        return
    os.makedirs(path)
    print(f"Created directory: {path}")
def _grade_single_response(model, response_obj, evaluate, prompt_key, answers_key,
                           empty_entry, labels, eval_type, sample,
                           evaluation_criteria, sampling_criteria):
    """Score one response object on every criterion and return its result dict.

    Args:
        model: Model wrapper passed through to ``evaluate``; its ``save_cache()``
            is called after each criterion.
        response_obj: Dict holding the prompt under ``prompt_key`` and the list
            of answers under ``answers_key``.
        evaluate: One of ``evaluate_aut`` / ``evaluate_scientific`` / ``evaluate_wkct``.
        prompt_key: ``"item"`` for AUT, ``"question"`` for the other tasks.
        answers_key: ``"uses"`` for AUT, ``"answer"`` for the other tasks.
        empty_entry: Placeholder record stored when there are no answers.
        labels: ``(prompt_label, answer_label)`` used in the progress print.
        eval_type: Prompt variant forwarded to ``evaluate``.
        sample: Number of samples for the whole-list criteria.
        evaluation_criteria: Criteria judged once on the full answer list.
        sampling_criteria: Criteria judged separately for every answer.

    Returns:
        ``{prompt_key: prompt, <criterion>: [results..., {"average_<criterion>": x}]}``.
    """
    prompt = response_obj[prompt_key]
    answers = response_obj.get(answers_key, [])
    results = {prompt_key: prompt}
    all_criteria = evaluation_criteria + sampling_criteria

    if not answers:
        # Nothing to grade: record a zero score for every criterion.
        for criterion in all_criteria:
            results[criterion] = [dict(empty_entry), {f"average_{criterion}": 0}]
        return results

    # Whole-list criteria: one (multi-sample) evaluation of all answers together.
    for criterion in evaluation_criteria:
        results[criterion] = [evaluate(model, response_obj, criterion, eval_type, sample)]
        model.save_cache()

    # Per-answer criteria: every answer is judged on its own with a single sample.
    prompt_label, answer_label = labels
    for criterion in sampling_criteria:
        per_answer = []
        for single_answer in answers:
            result = evaluate(model, {prompt_key: prompt, answers_key: [single_answer]}, criterion, eval_type, 1)
            per_answer.append(result)
            print(f"{prompt_label}: {prompt}, {answer_label}: {single_answer}, {criterion.capitalize()} Score: {result['average_score']}")
        results[criterion] = per_answer
        model.save_cache()

    # The last element of every criterion list is its average; calculate_mean_std
    # reads it back via ``[-1]["average_<criterion>"]``.
    for criterion in all_criteria:
        scored = results[criterion]
        average = sum(entry['average_score'] for entry in scored) / len(scored)
        scored.append({f"average_{criterion}": average})
    return results


def auto_grade(args):
    """Grade one result file on fluency, flexibility, originality and elaboration.

    Reads ``<task output folder>/<single|multi>_agent/<input_file>.json``
    (the agent folder comes from the second ``_``-separated field of
    ``args.input_file``), evaluates every entry, writes the per-entry results
    to the matching ``Eval_Result`` folder and, when ``args.output == 'y'``,
    appends the mean/std summary to the task's LeaderBoard CSV.

    Args:
        args: Parsed CLI namespace with ``version``, ``input_file``, ``type``,
            ``sample``, ``task`` and ``output``.

    Raises:
        IndexError: ``args.input_file`` contains no ``_``.
        FileNotFoundError: the input JSON does not exist.
    """
    print("AUTO GRADE STARTED, Input_file: ", args.input_file)

    # OPENAI KEY
    api_key = os.getenv("OPENAI_API_KEY")
    version = "gpt-4-0125-preview" if args.version == "4" else "gpt-3.5-turbo-0125"
    print(f"Using GPT Version {version}, Input: {args.version}")

    # SETUP CACHE AND MODEL
    cache_file_name = f"cache_{args.version}.pickle"
    model = OpenAIModel(cache_file_name, version, api_key)

    # "<task>_<single|multi>_..." -> "single_agent" / "multi_agent"
    agent_folder = f"{args.input_file.split('_')[1]}_agent"
    print(agent_folder)

    task_folder = TASK_PATHS[args.task]
    project_root = os.path.join(Path(__file__).parent, '..')
    input_file_path = os.path.join(project_root, task_folder, agent_folder, f"{args.input_file}.json")

    # The input directory is deliberately NOT created here: a missing file is an
    # error either way, and creating the folder only leaves empty directories
    # behind when the file name is mistyped.
    with open(input_file_path, "r") as file:
        responses = json.load(file)

    sampling_criteria = ["originality", "elaboration"]
    evaluation_criteria = ["fluency", "flexibility"]

    # (evaluate function, prompt key, answers key, empty placeholder, print labels)
    if args.task == "AUT":
        spec = (evaluate_aut, "item", "uses",
                {"response": "No uses provided", "score": 0}, ("Item", "Use"))
    elif args.task == "Scientific":
        print("Scientific Task")
        spec = (evaluate_scientific, "question", "answer",
                {"answer": "No responses provided", "score": 0}, ("Question", "Answer"))
    elif args.task == "Instances" or args.task == "Similarities":
        print("WKCT Task")
        spec = (evaluate_wkct, "question", "answer",
                {"answer": "No responses provided", "score": 0}, ("Question", "Answer"))
    else:
        spec = None  # unreachable through the CLI (argparse restricts the choices)

    total_results = []
    if spec is not None:
        evaluate, prompt_key, answers_key, empty_entry, labels = spec
        for response_obj in responses:
            total_results.append(_grade_single_response(
                model, response_obj, evaluate, prompt_key, answers_key,
                empty_entry, labels, args.type, args.sample,
                evaluation_criteria, sampling_criteria))

    output_folder = task_folder.replace('Output', 'Eval_Result')
    output_file_path = os.path.join(project_root, output_folder, agent_folder, f"evaluation_{args.input_file}_{args.type}_{args.version}.json")

    ensure_folder_exists(os.path.dirname(output_file_path))

    with open(output_file_path, "w") as outfile:
        json.dump(total_results, outfile, indent=4)
    print(f"Results saved to {output_file_path}")

    if args.output == 'y':
        mean_std_results = calculate_mean_std(total_results)
        output_csv_path = os.path.join(project_root, 'Results', 'LeaderBoard', f'LeaderBoard-{args.task}.csv')
        ensure_folder_exists(os.path.dirname(output_csv_path))
        write_results_to_csv(args.input_file, mean_std_results, output_csv_path, args.version)
    else:
        print('Output will not be saved in Leader Board!')


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    # PARSERS
    parser = argparse.ArgumentParser(description="Evaluate responses based on specified criteria using OpenAI's API.")
    parser.add_argument("-v", "--version", default="3", choices=["3", "4"], help="Version of the OpenAI model to use.")
    parser.add_argument("-i", "--input_file", required=True, help="Name of the input file located in the Results directory.")
    # Selects which prompt variant the eval_* functions use.
    parser.add_argument("-t", "--type", default="sampling", choices=["default", "sampling"], help="Variant of the evaluation.")
    parser.add_argument("-s", "--sample", default=3, type=int, help="Number of times to sample the evaluation.")
    parser.add_argument("-d", "--task", default="AUT", choices = ["AUT", "Scientific", "Instances", "Similarities"], help="Task for the evaluation. Default is AUT.")
    parser.add_argument("-o", "--output", default="n", choices=["y", "n"], help="Output into LeaderBoard or not")
    args = parser.parse_args()
    auto_grade(args)
def calculate_mean_std(total_results):
    """Summarise per-item averages as mean and standard deviation per criterion.

    Args:
        total_results: List of per-item result dicts. For each criterion the
            last element of ``item[criterion]`` must be
            ``{"average_<criterion>": value}``.

    Returns:
        Dict with ``mean_<criterion>`` and ``std_<criterion>`` (rounded to three
        decimals) for fluency, flexibility, originality and elaboration, in
        that order.
    """
    summary = {}
    for criterion in ("fluency", "flexibility", "originality", "elaboration"):
        # The trailing entry of every criterion list carries the item's average.
        scores = [entry[criterion][-1][f"average_{criterion}"] for entry in total_results]
        summary[f"mean_{criterion}"] = round(np.mean(scores), 3)
        summary[f"std_{criterion}"] = round(np.std(scores), 3)
    return summary
def write_results_to_csv(input_file_name, mean_std_results, csv_file_path, version):
    """Append one leaderboard row for ``input_file_name`` and re-sort the CSV.

    The row's metadata is parsed from the ``_``-separated file name:
    ``<Task>_<single|multi>_<Type>_<Mode>_<Agent>_<Round>_<Model>_<Role>_..._<timestamp>_<DataNum>-...``
    where the timestamp field is ``-``-separated (``Y-M-D-h-m-s``).

    Args:
        input_file_name: Result file name (without extension) to parse.
        mean_std_results: Dict produced by ``calculate_mean_std``.
        csv_file_path: Target CSV; parent folders are created when missing.
        version: Accepted for interface compatibility; not used here.

    Raises:
        IndexError: the file name has fewer ``_``-separated fields than the
            layout above requires.

    I/O failures while writing or re-sorting are reported on stdout and not
    re-raised.
    """
    headers = ['Timestamp', 'Task', 'Type', 'Mode', 'Agent', 'Round','Model Name', 'Role Name', 'Data Num', 'Mean Fluency', 'STD Fluency', 'Mean Flexibility', 'STD Flexibility', 'Mean Originality', 'STD Originality', 'Mean Elaboration', 'STD Elaboration', 'File Name']
    parts = input_file_name.split('_')

    task = parts[0]  # AUT, Scientific, Similarities, Instances
    eval_type = parts[2]  # debate, conversational
    data_num = parts[-1].split('-')[0]
    raw_timestamp = parts[-2].split('-')
    print("Raw_Timestamp: ", raw_timestamp)
    date = '-'.join(raw_timestamp[:3])
    print("date: ", date)
    clock = ':'.join(raw_timestamp[3:6])
    print("time: ", clock)
    timestamp = f'{date} {clock}'

    mode = agent = rounds = model_name = role_name = None

    # Both agent kinds share the same field layout. A single combined check
    # avoids reporting 'ERROR AGENT!!' for valid "single" files.
    if parts[1] in ('single', 'multi'):
        mode = parts[3]
        agent = parts[4]
        rounds = parts[5]
        model_name = parts[6]
        role_name = parts[7]
    else:
        print('ERROR AGENT!!')

    row = [
        timestamp, task, eval_type, mode, agent, rounds, model_name, role_name, data_num,
        mean_std_results['mean_fluency'], mean_std_results['std_fluency'],
        mean_std_results['mean_flexibility'], mean_std_results['std_flexibility'],
        mean_std_results['mean_originality'], mean_std_results['std_originality'],
        mean_std_results['mean_elaboration'], mean_std_results['std_elaboration'],
        input_file_name,
    ]

    file_path = Path(csv_file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with file_path.open(mode='a+', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if file.tell() == 0:  # If file is empty, write headers
                writer.writerow(headers)
            writer.writerow(row)

        # Re-read everything, sort, and rewrite so the board stays ordered.
        with file_path.open(mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            sorted_data = sorted(reader, key=lambda x: (x[0], x[8]))  # Sort by Timestamp and Data Num

        with file_path.open(mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(headers)
            writer.writerows(sorted_data)

        # Reported only here so a failed write is never announced as a success.
        print(f'Data sorted by Timestamp and Data and saved to {csv_file_path}')
    except Exception as e:
        print(f'ERROR: Failed to write data to CSV due to {e}')
def evaluate_aut(model: OpenAIModel, response_obj, criterion, eval_type, sample_time=3):
    """Score the uses listed for one AUT item on a single criterion.

    The prompt for ``criterion`` is looked up in ``aut_prompts`` (variant
    ``eval_type``, falling back to ``'default'``), the item and its uses are
    appended, and the model is queried until ``sample_time`` responses have
    been scored successfully.

    Args:
        model: Wrapper exposing ``generate_response(messages=..., seed=...)``.
        response_obj: Dict with ``'item'`` and ``'uses'`` (list of strings or
            a single string).
        criterion: Key into ``aut_prompts``.
        eval_type: Prompt variant; ``"sampling"`` also adds the graded uses to
            the returned dict.
        sample_time: Number of successful samples to average over (>= 1).

    Returns:
        ``{"responses": [...], "average_score": float}``; with
        ``eval_type == "sampling"`` also ``"task result"`` (or ``"use"`` for the
        empty placeholder).

    Raises:
        ValueError: ``sample_time`` is smaller than 1.
    """
    item = response_obj['item']
    raw_uses = response_obj.get('uses', [])
    if not raw_uses:  # missing key, empty list or empty string: nothing to grade
        placeholder = {
            "responses": [{"response": "No uses provided", "score": 0}],
            "average_score": 0
        }
        if eval_type == "sampling":
            return {"use": "Empty List", **placeholder}
        return placeholder

    if sample_time < 1:
        raise ValueError(f"sample_time must be at least 1, got {sample_time}")

    uses = '\n'.join(raw_uses) if isinstance(raw_uses, list) else raw_uses
    prompt_variants = aut_prompts[criterion]
    get_prompt = prompt_variants.get(eval_type, prompt_variants['default'])

    # Built for every eval_type so non-"sampling" variants no longer hit unbound locals.
    full_prompt = get_prompt + f"\nThe item is {item}\nThe response is: {uses}"
    print("FULL PROMPT ::: ", full_prompt)
    messages = [{"role": "user", "content": full_prompt}]
    sample_responses = []
    sample_score = 0
    seed = 0
    success_count = 0

    # NOTE(review): retries are unbounded; a persistent API or parsing failure loops forever.
    while success_count < sample_time:
        try:
            response = model.generate_response(messages=messages, seed=seed)
            print("Given Seed ::: ", seed)
            print("Model Response ::: ", response)
            individual_score = parse_number_score(response)
            # Add to the total first: an unusable score fails here, before the sample is recorded.
            sample_score += individual_score
            sample_responses.append({"response": response, "score": individual_score})
            print("Score ::: ", sample_score)
            success_count += 1
        except Exception as e:
            # logging.exception already includes the traceback.
            logging.exception(f"Exception occurred: {e}")
            time.sleep(1)
        # New seed for every attempt, so samples and retries are distinct requests.
        seed += 1

    average_item_score = sample_score / sample_time
    if eval_type == "sampling":
        return {
            "task result": raw_uses,
            "responses": sample_responses,
            "average_score": average_item_score
        }
    return {
        "responses": sample_responses,
        "average_score": average_item_score
    }
def evaluate_scientific(model: OpenAIModel, response_obj, criterion, eval_type, sample_time=3):
    """Score the answers to one Scientific-task question on a single criterion.

    For fluency and flexibility the caller passes the full answer list; for
    originality and elaboration it passes one answer at a time.

    Args:
        model: Wrapper exposing ``generate_response(messages=..., seed=...)``.
        response_obj: Dict with ``'question'`` and ``'answer'`` (list of
            strings or a single string).
        criterion: Key into ``scientific_prompts``.
        eval_type: Prompt variant (falls back to ``'default'``); ``"sampling"``
            also adds the joined answer text to the returned dict.
        sample_time: Number of successful samples to average over (>= 1).

    Returns:
        ``{"responses": [...], "average_score": float}``, plus ``"answer"``
        when ``eval_type == "sampling"``.

    Raises:
        ValueError: ``sample_time`` is smaller than 1.
    """
    if sample_time < 1:
        raise ValueError(f"sample_time must be at least 1, got {sample_time}")

    question = response_obj['question']
    raw_answer = response_obj['answer']
    answer = '\n'.join(raw_answer) if isinstance(raw_answer, list) else raw_answer

    prompt_variants = scientific_prompts[criterion]
    get_prompt = prompt_variants.get(eval_type, prompt_variants['default'])

    # Built for every eval_type so non-"sampling" variants no longer hit unbound locals.
    full_prompt = get_prompt + f"\nThe task is: {question}\nThe response is: {answer}"
    print("Input Prompt ::: ", full_prompt)
    messages = [{"role": "user", "content": full_prompt}]
    sample_responses = []
    sample_score = 0
    seed = 0
    success_count = 0

    # NOTE(review): retries are unbounded; a persistent API or parsing failure loops forever.
    while success_count < sample_time:
        try:
            response = model.generate_response(messages=messages, seed=seed)
            print("Given Seed ::: ", seed)
            print("Model Response ::: ", response)
            individual_score = parse_number_score(response)
            # Add to the total first: an unusable score fails here, before the sample is recorded.
            sample_score += individual_score
            sample_responses.append({"response": response, "score": individual_score})
            print("Score ::: ", sample_score)
            success_count += 1
        except Exception as e:
            # logging.exception already includes the traceback.
            logging.exception(f"Exception occurred: {e}")
            time.sleep(1)
        # New seed for every attempt, so samples and retries are distinct requests.
        seed += 1

    average_item_score = sample_score / sample_time
    if eval_type == "sampling":
        return {
            "answer": answer,
            "responses": sample_responses,
            "average_score": average_item_score
        }
    return {
        "responses": sample_responses,
        "average_score": average_item_score
    }
def evaluate_wkct(model: OpenAIModel, response_obj, criterion, eval_type, sample_time=3):
    """Score the answers to one Instances/Similarities question on a single criterion.

    For fluency and flexibility the caller passes the full answer list; for
    originality and elaboration it passes one answer at a time.

    Args:
        model: Wrapper exposing ``generate_response(messages=..., seed=...)``.
        response_obj: Dict with ``'question'`` and ``'answer'`` (list of
            strings or a single string).
        criterion: Key into ``wkct_prompts``.
        eval_type: Prompt variant (falls back to ``'default'``); ``"sampling"``
            also adds the joined answer text to the returned dict.
        sample_time: Number of successful samples to average over (>= 1).

    Returns:
        ``{"responses": [...], "average_score": float}``, plus ``"answer"``
        when ``eval_type == "sampling"``.

    Raises:
        ValueError: ``sample_time`` is smaller than 1.
    """
    if sample_time < 1:
        raise ValueError(f"sample_time must be at least 1, got {sample_time}")

    question = response_obj['question']
    raw_answer = response_obj['answer']
    answer = '\n'.join(raw_answer) if isinstance(raw_answer, list) else raw_answer

    # Fall back to this task's own default prompt, not the AUT one.
    prompt_variants = wkct_prompts[criterion]
    get_prompt = prompt_variants.get(eval_type, prompt_variants['default'])

    # Built for every eval_type so non-"sampling" variants no longer hit unbound locals.
    full_prompt = get_prompt + f"\nThe task is: {question}\nThe response is: {answer}"
    print("Input Prompt ::: ", full_prompt)
    messages = [{"role": "user", "content": full_prompt}]
    sample_responses = []
    sample_score = 0
    seed = 0
    success_count = 0

    # NOTE(review): retries are unbounded; a persistent API or parsing failure loops forever.
    while success_count < sample_time:
        try:
            response = model.generate_response(messages=messages, seed=seed)
            print("Given Seed ::: ", seed)
            print("Model Response ::: ", response)
            individual_score = parse_number_score(response)
            # Add to the total first: an unusable score fails here, before the sample is recorded.
            sample_score += individual_score
            sample_responses.append({"response": response, "score": individual_score})
            print("Score ::: ", sample_score)
            success_count += 1
        except Exception as e:
            # logging.exception already includes the traceback.
            logging.exception(f"Exception occurred: {e}")
            time.sleep(1)
        # New seed for every attempt, so samples and retries are distinct requests.
        seed += 1

    average_item_score = sample_score / sample_time
    if eval_type == "sampling":
        return {
            "answer": answer,
            "responses": sample_responses,
            "average_score": average_item_score
        }
    return {
        "responses": sample_responses,
        "average_score": average_item_score
    }
response) 145 | individual_score = parse_number_score(response) 146 | sample_responses.append({"response": response, "score": individual_score}) 147 | sample_score += individual_score 148 | print("Score ::: ", sample_score) 149 | success_count += 1 150 | except Exception as e: 151 | traceback.print_exc() 152 | logging.exception(f"Exception occurred: {e}") 153 | time.sleep(1) 154 | seed += 1 155 | 156 | average_item_score = sample_score / sample_time 157 | if eval_type == "sampling": 158 | return { 159 | "answer": answer, 160 | "responses": sample_responses, 161 | "average_score": average_item_score 162 | } 163 | return { 164 | "responses": sample_responses, 165 | "average_score": average_item_score 166 | } -------------------------------------------------------------------------------- /Evaluation/eval_functions/eval_prompts.py: -------------------------------------------------------------------------------- 1 | aut_prompts = { 2 | 'fluency': { 3 | 'default': f""" 4 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses of an item as possible. Identify and count the number of unique, relevant responses and explain why. It is important to the total amount of unique, relevant, and practical responses in the specific format of [[X]] at the end of your response. 5 | """, 6 | 'fewshot': f""" 7 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses of an item as possible. Your task is to identify and count the number of unique, relevant responses. Explain your reasoning for considering each response unique and relevant. 8 | It is important to state the total amount of unique, relevant, and practical responses in the specific format of '(X)' at the end of your response. 9 | Example: 10 | The item is 'Bottle'. The responses are: 'water container, flower vase, message holder, decorative object, DIY bird feeder, makeshift funnel'. Unique and Relevant Responses: (6). 
Justification: Each use is distinct and practical in its own way, demonstrating a variety of applications for a bottle.\n 11 | Now, it's your turn: 12 | """, 13 | 'rubric': f""" 14 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses of an item as possible. Identify and count the number of unique, relevant responses and explain why. It is important to the total amount of unique, relevant, and practical responses in the specific format of (X) at the end of your response. 15 | """, 16 | 'sampling': f""" 17 | Your task is to evaluate a list of uses for a specific item provided by participants, focusing on identifying each unique and practical use listed. It's important to only consider uses that are relevant and feasible. Conclude your analysis by stating the total number of unique, relevant uses in this specific format: [[X]]. Also, briefly explain how you determined whether a response was relevant and practical. 18 | """ 19 | }, 20 | 'flexibility': { 21 | 'default': f""" 22 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. Please evaluate the flexibility of the relevant responses, where flexibility refers to the variety of distinct categories or perspectives represented in the responses. Define and count the number of unique categories or perspectives present, and provide a brief explanation for how you determined these categories. It is important to present the total number of categories or perspectives in the specific format of [[X]] at the end of your response. 23 | """, 24 | 'fewshot': f""" 25 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. Your task is to evaluate the flexibility of the relevant responses. Flexibility refers to the variety of distinct categories or perspectives represented in the responses. 
26 | Define and count the number of unique categories or perspectives present, and provide a brief explanation for how you determined these categories. It is important to present the total number of categories or perspectives in the specific format of '(X)' at the end of your response. 27 | Example: 28 | The item is 'Spoon'. The responses are: 'eating utensil, measuring tool, gardening tool for small plants, musical instrument when hit against surfaces, art object in metalwork sculptures'. Unique Categories: (5). Justification: The responses represent distinct categories - culinary use, measurement, gardening, music, and art, showcasing a wide range of flexibility in the uses of a spoon.\n 29 | Now, it's your turn: 30 | """, 31 | 'rubric': f""" 32 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. Please evaluate the flexibility of the relevant responses, where flexibility refers to the variety of distinct categories or perspectives represented in the responses. Define and count the number of unique categories or perspectives present, and provide a brief explanation for how you determined these categories. It is important to present the total number of categories or perspectives in the specific format of (X) at the end of your response. 33 | """, 34 | 'sampling': f""" 35 | Your task is to assess the range of unique categories or types of uses suggested in responses regarding the uses for a specific item. Your objective is to define and count the distinct categories or perspectives evident in the responses, and provide a brief explanation for how you determined these categories. Conclude your analysis by indicating the total number of unique categories or perspectives using the format: [[X]]. 36 | """ 37 | }, 38 | 'originality': { 39 | 'default': f""" 40 | You are a helpful assistant and a critical thinker. 
Please evaluate the overall originality of the collective responses to a divergent thinking task where participants were asked to list as many uses for an item as possible. Originality should be gauged by assessing the uniqueness or novelty of the ideas as a whole, considering factors like unexpectedness and rarity across all responses. Rate the overall originality of the set of responses on a scale from 1 to 5, with 5 indicating the highest level of originality. Provide a brief justification for your overall score. It is important to indicate the collective originality score in the specific format of (X) at the end of your response. 41 | """, 42 | 'fewshot': f""" 43 | You are a helpful assistant and a critical thinker. Your task is to evaluate the overall originality of the collective responses to a divergent thinking task. Participants were asked to list as many uses for a given item as possible. Assess the uniqueness or novelty of the ideas as a whole, considering factors like unexpectedness and rarity across all responses. 44 | Rate the overall originality of the set of responses on a scale from 1 to 5, with 5 indicating the highest level of originality. Provide a brief justification for your overall score. It is important to indicate the collective originality score in the specific format of '(X)' at the end of your response. 45 | Example 1: 46 | The item is 'Brick'. The responses are: 'building material, doorstop, paperweight, makeshift weapon, garden ornament'. Originality Score: (3). Justification: Most uses are common, but using a brick as a garden ornament is somewhat novel. 47 | Example 2: 48 | The item is 'Paperclip'. The responses are: 'holding papers, makeshift lockpick, zipper pull, sculpture material, reset tool for electronics'. Originality Score: (4). Justification: The ideas show a good range of common and unexpected uses, like sculpture material and reset tool for electronics, indicating higher originality. 
\n 49 | Now, it's your turn: 50 | """, 51 | 'rubric': f""" 52 | You are a helpful assistant and a critical thinker. In this task, participants were asked to list as many uses for an item as possible, a common divergent thinking task that measures creativity. Please evaluate the overall originality of the collective responses based on their uniqueness and novelty. Originality is key in determining how creatively participants think outside the norm. Rate the overall originality on a scale from 1 to 5, considering: 53 | - 1 point: Very Common - The ideas are mundane and frequently mentioned in everyday contexts. There's a notable lack of novelty, with responses being the most typical or expected uses. 54 | - 2 points: Somewhat Common - The ideas are somewhat ordinary but show slight variations from typical uses, indicating a basic level of creativity. 55 | - 3 points: Moderately Original - The ideas display a fair amount of creativity and novelty. They are not the usual thoughts but aren't highly rare or unexpected. 56 | - 4 points: Very Original - The ideas are significantly unique, demonstrating a high level of creativity and innovation. They are unexpected and not commonly considered. 57 | - 5 points: Extremely Original - The ideas are extraordinarily unique and rare, displaying a high degree of novelty, creativity, and unexpectedness. These ideas are seldom thought of in typical contexts. 58 | After reviewing the responses, assign an overall originality score based on these criteria. Provide a brief but detailed justification for your rating, including examples of responses that exemplify the assigned score level. It is important to conclude your response by stating the collective originality score in the format: (X) 59 | """, 60 | 'pairwise_v2': f""" 61 | Your task is to assess the creativity of responses in a divergent thinking exercise, focusing on originality and novelty. 
Rate their originality on a 1 to 5 scale, where 1 signifies very common ideas, and 5 indicates extremely original ideas. Consider: 62 | 1 point: Common, typical ideas with no novelty. 63 | 2 points: Slight variations from the norm, showing basic creativity. 64 | 3 points: Fairly creative, with a moderate level of novelty. 65 | 4 points: Highly unique and innovative ideas. 66 | 5 points: Exceptionally rare and creative ideas, displaying significant novelty. 67 | After evaluating, assign an overall originality score and justify your rating by citing specific examples from the responses. Conclude with the score in the format: 'Overall Originality Score: (X).' Ensure your justification is concise and directly related to the examples cited. 68 | """, 69 | 'sampling': f""" 70 | You are a helpful assistant and a critical thinker. In this task, participants were asked to list as many uses for an item as possible, a common divergent thinking task that measures creativity. Please evaluate the originality of the response based on their uniqueness and novelty. Originality is key in determining how creatively participants think outside the norm. Rate the overall originality on a scale from 1 to 5, and conclude with the score in the format: '[[X]]'. Consider the following guidance: 71 | - 1 point: Very Common - The idea is mundane and frequently mentioned in everyday contexts. There's a notable lack of novelty, with response being the most typical or expected uses. 72 | - 2 points: Somewhat Common - The idea is somewhat ordinary but shows slight variations from typical uses, indicating a basic level of creativity. 73 | - 3 points: Moderately Original - The idea displays a fair amount of creativity and novelty. They are not the usual thoughts but aren't highly rare or unexpected. 74 | - 4 points: Very Original - The idea is significantly unique, demonstrating a high level of creativity and innovation. They are unexpected and not commonly considered. 
75 | - 5 points: Extremely Original - The idea is extraordinarily unique and rare, displaying a high degree of novelty, creativity, and unexpectedness. The idea is seldom thought of in typical contexts. 76 | After reviewing the responses, assign an originality score based on these criteria. Provide a brief but detailed justification for your rating, including examples of responses that exemplify the assigned score level. It is extremely important to put the score in this format: '[[X]]' 77 | """, 78 | 'pairwise': "Please act as an impartial judge and evaluate the originality of the responses provided by two different people to the given task. Compare the responses in terms of their uniqueness, novelty, and creativity. Originality should be assessed based on how unique and innovative each response is, without being influenced by the order in which they are presented or the length of the responses. Your evaluation should be objective, focusing solely on the originality of the ideas presented in each response. After your comparison, conclude with a clear verdict using this format: '[[A]]' if Result A's response is more original, '[[B]]' if Result B's response is more original, or '[[C]]' for equal originality.", 79 | }, 80 | 'elaboration': { 81 | 'default': f""" 82 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. Please evaluate the overall level of elaboration in the set of responses on a scale of 1 to 5, with 5 being the highest. Elaboration should be judged based on the collective detail and development of the ideas across all responses. Provide a brief justification for your overall evaluation. It is important to indicate the overall elaboration score in the specific format of (X) at the end of your response. 83 | """, 84 | 'fewshot': f""" 85 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. 
Your task is to evaluate the overall level of elaboration in the set of responses. Elaboration should be judged based on the collective detail and development of the ideas across all responses. 86 | Rate the level of elaboration on a scale of 1 to 5, with 5 being the highest. Provide a brief justification for your overall evaluation. It is important to indicate the overall elaboration score in the specific format of '(X)' at the end of your response. 87 | Example 1: 88 | The item is 'Brick'. The responses are: 'building material - used in construction for durability, doorstop - to keep doors open, paperweight - to hold down papers, makeshift weapon - in self-defense, garden ornament - painted and decorated for aesthetic appeal'. Elaboration Score: (4). Justification: The responses not only list uses but also include details on how and why each use is applicable, showing a high level of elaboration. 89 | Example 2: 90 | The item is 'Paperclip'. The responses are: 'holding papers together, used as a makeshift lockpick, can serve as a zipper pull, can be bent into various shapes for art projects'. Elaboration Score: (3). Justification: While the uses are varied, the details are somewhat basic and could be further developed for higher elaboration. \n 91 | Now, it's your turn: 92 | """, 93 | 'rubric': f""" 94 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. Please evaluate the overall level of elaboration in the set of responses on a scale of 1 to 5, where 1 is the least elaborated and 5 is the most elaborated. Elaboration should be judged based on the collective detail and development of the ideas across all responses. Consider the following criteria for each rating point: 95 | 1 point: Very Basic - The responses are extremely basic with minimal detail or explanation. Ideas are presented in a very simple or cursory manner. 
96 | 2 points: Somewhat Basic - The responses show a slight degree of detail, but remain on a basic level. Ideas are somewhat developed but lack depth. 97 | 3 points: Moderately Elaborated - The responses offer a moderate level of detail and development. Ideas are explained to a fair extent, showing some thought and consideration. 98 | 4 points: Highly Elaborated - The responses are well-developed and detailed. Ideas are thoroughly explained and exhibit a high level of thought and complexity. 99 | 5 points: Exceptionally Elaborated - The responses demonstrate exceptional elaboration. Ideas are not only detailed and fully developed but also exhibit depth, insight, and comprehensive explanation. 100 | After reviewing the responses, assign an overall elaboration score based on these criteria. Provide a brief justification for your rating. It is important to conclude your response by stating the overall elaboration score in the format (X). 101 | """, 102 | 'rubric_v2': f""" 103 | Your task is to evaluate the level of elaboration in responses from a divergent thinking exercise. Rate the elaboration on a scale of 1 to 5, with 1 indicating minimal elaboration and 5 representing exceptional elaboration. Elaboration refers to the detail and development of ideas presented in the responses. Use the following scale for guidance: 104 | 1 point: Very Basic - Responses are extremely basic, offering minimal detail. Ideas are presented simplistically. 105 | 2 points: Somewhat Basic - Responses include a slight degree of detail, remaining basic with somewhat developed ideas lacking depth. 106 | 3 points: Moderately Elaborated - Responses provide a moderate level of detail and development. Ideas are fairly explained, showing thoughtful consideration. 107 | 4 points: Highly Elaborated - Responses are detailed and well-developed. Ideas are thoroughly explained, reflecting high thought complexity. 
108 | 5 points: Exceptionally Elaborated - Responses show exceptional elaboration, with detailed, in-depth, and comprehensive explanations of ideas. 109 | After your evaluation, assign an overall elaboration score and justify your rating with specific examples from the responses. Conclude with the score in the format: 'Overall Elaboration Score: (X).' Your justification should be concise and directly relate to the examples cited. 110 | """, 111 | 'sampling': f""" 112 | You are a helpful assistant and a critical thinker. Participants were asked to list as many uses for an item as possible. Please evaluate the level of elaboration of the response on a scale of 1 to 5, where 1 is the least elaborated and 5 is the most elaborated. Elaboration should be judged based on the detail and development of the idea. Conclude with the score in this format: '[[X]]' Consider the following guidance: 113 | 1 point: Very Basic - The response is extremely basic with minimal detail or explanation. Idea is presented in a very simple or cursory manner. 114 | 2 points: Somewhat Basic - The response shows a slight degree of detail, but remains on a basic level. Idea is somewhat developed but lacks depth. 115 | 3 points: Moderately Elaborated - The response offers a moderate level of detail and development. Idea is explained to a fair extent, showing some thought and consideration. 116 | 4 points: Highly Elaborated - The response is well-developed and detailed. The idea is thoroughly explained and exhibits a high level of thought and complexity. 117 | 5 points: Exceptionally Elaborated - The response demonstrates exceptional elaboration. Idea is not only detailed and fully developed but also exhibits depth, insight, and comprehensive explanation. 118 | After reviewing the responses, assign an elaboration score based on these criteria. Provide a brief justification for your rating. 
It is extremely important to put the score in this format: '[[X]]' 119 | """, 120 | 'pairwise': "Please act as an impartial judge and evaluate the level of elaboration in the responses provided by two different people to the given task. Compare the responses in terms of their detail, development, and thoroughness. Elaboration should be assessed based on how well-developed and comprehensive each response is, considering the depth and complexity of the ideas presented, without being influenced by the order in which they are presented or the length of the responses. Your evaluation should be objective, focusing solely on the level of elaboration evident in each response. After your comparison, conclude with a clear verdict using this format: '[[A]]' if Result A's response is more elaborated, '[[B]]' if Result B's response is more elaborated, or '[[C]]' for equal levels of elaboration.", 121 | } 122 | } 123 | 124 | scientific_prompts = { 125 | "fluency": { 126 | "default": """ 127 | You are a thoughtful assistant with a focus on scientific creativity. Identify and count the number of unique, relevant responses and explain why. It is important to the total amount of unique, relevant, and practical responses in the specific format of [[X]] at the end of your response. 128 | """, 129 | "rubric": """ 130 | Rate responses based on the quantity and uniqueness of scientific uses or inquiries mentioned. Consider: 131 | - 1 point: Very few (1-2) relevant scientific uses or inquiries. 132 | - 2 points: A small range (3-4) of relevant scientific uses or inquiries. 133 | - 3 points: A moderate variety (5-6) of relevant scientific uses or inquiries, showing some unique ideas. 134 | - 4 points: A broad range (7-8) of relevant and unique scientific uses or inquiries. 135 | - 5 points: An extensive range (9+) of highly relevant and unique scientific uses or inquiries, demonstrating exceptional scientific fluency. 
136 | Provide a total count of unique, relevant scientific uses or inquiries in the format (X). 137 | """, 138 | "sampling": """ 139 | Your task is to evaluate a list of responses for a scientific creativity task provided by participants, focusing on identifying each unique and practical response listed. It's important to only consider responses that are relevant and reasonable. Conclude your analysis by stating the total number of unique, relevant responses in this specific format: [[X]]. Also, briefly explain how you determined whether a response was relevant and reasonable. 140 | """ 141 | }, 142 | "flexibility": { 143 | "default": """ 144 | You are a helpful assistant and a critical thinker. Please evaluate the flexibility of the relevant responses, where flexibility refers to the variety of distinct categories or perspectives represented in the responses. Define and count the number of unique categories or perspectives present, and provide a brief explanation for how you determined these categories. It is important to present the total number of categories or perspectives in the specific format of [[X]] at the end of your response. 145 | """, 146 | "rubric": """ 147 | Evaluate the diversity of scientific disciplines and approaches in the responses. Consider: 148 | - 1 point: Responses are limited to a single scientific discipline or approach. 149 | - 2 points: Responses include a limited range (2) of scientific disciplines or approaches. 150 | - 3 points: Responses show a moderate variety (3-4) of scientific disciplines or approaches. 151 | - 4 points: Responses demonstrate a broad range (5-6) of distinct scientific disciplines or approaches. 152 | - 5 points: Responses encompass a wide range (7+) of distinct and innovative scientific disciplines or approaches. 153 | Conclude with the total number of unique scientific perspectives in the format (X). 
154 | """, 155 | "sampling": """ 156 | Your task is to assess the range of unique categories suggested in the responses given from a scientific creativity task. Your objective is to define and count the distinct categories or perspectives evident in the responses, and provide a brief explanation for how you determined these categories. Conclude your analysis by indicating the total number of unique categories or perspectives using the format: [[X]]. 157 | """ 158 | }, 159 | "originality": { 160 | "default": """ 161 | Focus on the originality and innovation of the proposed solutions or uses. Evaluate the responses for their novelty, especially solutions that demonstrate a significant departure from common knowledge or conventional approaches. Rate the overall novelty of the set of responses on a scale from 1 to 5, with 5 being the most novel. Justify your rating with examples from the responses and conclude with the novelty score in the format (X). 162 | """, 163 | "sampling": """ 164 | You are a helpful assistant and a critical thinker. Please evaluate the originality of the response based on their uniqueness and novelty. Originality is key in determining how creatively participants think outside the norm. Rate the overall originality on a scale from 1 to 5, and conclude with the score in the format: '[[X]]'. Consider the following guidance: 165 | - 1 point: Very Common - The idea is mundane and frequently mentioned in everyday contexts. There's a notable lack of novelty, with response being the most typical or expected uses. 166 | - 2 points: Somewhat Common - The idea is somewhat ordinary but shows slight variations from typical responses, indicating a basic level of creativity. 167 | - 3 points: Moderately Original - The idea displays a fair amount of creativity and novelty. They are not the usual thoughts but aren't highly rare or unexpected. 
168 | - 4 points: Very Original - The idea is significantly unique, demonstrating a high level of creativity and innovation. They are unexpected and not commonly considered. 169 | - 5 points: Extremely Original - The idea is extraordinarily unique and rare, displaying a high degree of novelty, creativity, and unexpectedness. The idea is seldom thought of in typical contexts. 170 | After reviewing the responses, assign an originality score based on these criteria. Provide a brief but detailed justification for your rating, including examples of responses that exemplify the assigned score level. It is extremely important to put the score in this format: '[[X]]' 171 | """, 172 | "pairwise": """ 173 | Compare the originality of solutions or uses proposed by two different responses. Assess which response offers more novel or innovative ideas. Determine the more original set: '[[A]]' for the first response, '[[B]]' for the second, or '[[C]]' if they are equally original. 174 | """, 175 | "rubric": """ 176 | Rate the novelty and innovation of the proposed solutions or uses. Consider: 177 | - 1 point: Solutions are common with little to no novelty. 178 | - 2 points: Solutions show slight novelty or a new twist on common ideas. 179 | - 3 points: Solutions are moderately novel, offering some unexpected uses or ideas. 180 | - 4 points: Solutions are highly novel and innovative, showing unique and uncommon uses or ideas. 181 | - 5 points: Solutions are exceptionally novel and innovative, presenting groundbreaking and rare ideas. 182 | Provide an overall novelty score in the format (X). 183 | """ 184 | }, 185 | "elaboration": { 186 | "default": """ 187 | Evaluate the overall level of elaboration in the set of responses. Rate the elaboration on a scale of 1 to 5, with 5 being the highest level of detail and development. Provide a brief justification for your overall evaluation. Indicate the overall elaboration score in the specific format of (X). 
188 | """, 189 | "rubric": """Assess the level of detail and development in the responses: 190 | - 1 point: Very Basic - Minimal detail, ideas are barely developed. 191 | - 2 points: Somewhat Basic - Some details present, but development is lacking. 192 | - 3 points: Moderately Elaborated - A fair amount of detail, ideas are reasonably developed. 193 | - 4 points: Highly Elaborated - Detailed and well-developed ideas, showing depth. 194 | - 5 points: Exceptionally Elaborated - Comprehensive detail, ideas are fully fleshed out and insightful. 195 | Justify your rating and conclude with the elaboration score in the format (X). 196 | """, 197 | "sampling": """ 198 | You are a helpful assistant and a critical thinker. Please evaluate the level of elaboration of the response on a scale of 1 to 5. Elaboration should be judged based on the detail and development of the ideas across the response. Conclude with the score in this format: '[[X]]' Consider the following guidance: 199 | 1 point: Very Basic - The response is extremely basic with minimal detail or explanation. Idea is presented in a very simple or cursory manner. 200 | 2 points: Somewhat Basic - The response shows a slight degree of detail, but remains on a basic level. Idea is somewhat developed but lacks depth. 201 | 3 points: Moderately Elaborated - The response offers a moderate level of detail and development. Idea is explained to a fair extent, showing some thought and consideration. 202 | 4 points: Highly Elaborated - The response is well-developed and detailed. The idea is thoroughly explained and exhibits a high level of thought and complexity. 203 | 5 points: Exceptionally Elaborated - The response demonstrates exceptional elaboration. Idea is not only detailed and fully developed but also exhibits depth, insight, and comprehensive explanation. 204 | After reviewing the responses, assign an elaboration score based on these criteria. Provide a brief justification for your rating. 
It is extremely important to put the score in this format: '[[X]]' 205 | """, 206 | "pairwise": """ 207 | Compare the level of elaboration in responses by two different participants. Determine which response is more detailed and well-developed. Conclude with '[[A]]' if the first response shows more elaboration, '[[B]]' if the second is more elaborated, or '[[C]]' for equal levels of elaboration. 208 | """ 209 | } 210 | } 211 | 212 | 213 | wkct_prompts = { 214 | "fluency": { 215 | "default": """ 216 | You are a thoughtful assistant with a focus on scientific creativity. Identify and count the number of unique, relevant responses and explain why. It is important to the total amount of unique, relevant, and practical responses in the specific format of [[X]] at the end of your response. 217 | """, 218 | "sampling": """ 219 | Your task is to evaluate a list of responses for a creativity task provided by participants, focusing on identifying each unique and practical response listed. It's important to only consider responses that are relevant and reasonable. Conclude your analysis by stating the total number of unique, relevant responses in this specific format: [[X]]. Also, briefly explain how you determined whether a response was relevant and reasonable. 220 | """ 221 | }, 222 | "flexibility": { 223 | "default": """ 224 | You are a helpful assistant and a critical thinker. Please evaluate the flexibility of the relevant responses, where flexibility refers to the variety of distinct categories or perspectives represented in the responses. Define and count the number of unique categories or perspectives present, and provide a brief explanation for how you determined these categories. It is important to present the total number of categories or perspectives in the specific format of [[X]] at the end of your response. 225 | """, 226 | "sampling": """ 227 | Your task is to assess the range of unique categories suggested in the responses given from a creativity task. 
Your objective is to define and count the distinct categories or perspectives evident in the responses, and provide a brief explanation for how you determined these categories. Conclude your analysis by indicating the total number of unique categories or perspectives using the format: [[X]]. 228 | """ 229 | }, 230 | "originality": { 231 | "default": """ 232 | Focus on the originality and innovation of the proposed solutions or uses. Evaluate the responses for their novelty, especially solutions that demonstrate a significant departure from common knowledge or conventional approaches. Rate the overall novelty of the set of responses on a scale from 1 to 5, with 5 being the most novel. Justify your rating with examples from the responses and conclude with the novelty score in the format (X). 233 | """, 234 | "sampling": """ 235 | You are a helpful assistant and a critical thinker. Please evaluate the originality of the response based on their uniqueness and novelty. Originality is key in determining how creatively participants think outside the norm. Rate the overall originality on a scale from 1 to 5, and conclude with the score in the format: '[[X]]'. Consider the following guidance: 236 | - 1 point: Very Common - The idea is mundane and frequently mentioned in everyday contexts. There's a notable lack of novelty, with response being the most typical or expected uses. 237 | - 2 points: Somewhat Common - The idea is somewhat ordinary but shows slight variations from typical responses, indicating a basic level of creativity. 238 | - 3 points: Moderately Original - The idea displays a fair amount of creativity and novelty. They are not the usual thoughts but aren't highly rare or unexpected. 239 | - 4 points: Very Original - The idea is significantly unique, demonstrating a high level of creativity and innovation. They are unexpected and not commonly considered. 
240 | - 5 points: Extremely Original - The idea is extraordinarily unique and rare, displaying a high degree of novelty, creativity, and unexpectedness. The idea is seldom thought of in typical contexts. 241 | After reviewing the responses, assign an originality score based on these criteria. Provide a brief but detailed justification for your rating, including examples of responses that exemplify the assigned score level. It is extremely important to put the score in this format: '[[X]]' 242 | """ 243 | }, 244 | "elaboration": { 245 | "default": """ 246 | Evaluate the overall level of elaboration in the set of responses. Rate the elaboration on a scale of 1 to 5, with 5 being the highest level of detail and development. Provide a brief justification for your overall evaluation. Indicate the overall elaboration score in the specific format of (X). 247 | """, 248 | "sampling": """ 249 | You are a helpful assistant and a critical thinker. Please evaluate the level of elaboration of the response on a scale of 1 to 5. Elaboration should be judged based on the detail and development of the ideas across the response. Conclude with the score in this format: '[[X]]' Consider the following guidance: 250 | 1 point: Very Basic - The response is extremely basic with minimal detail or explanation. Idea is presented in a very simple or cursory manner. 251 | 2 points: Somewhat Basic - The response shows a slight degree of detail, but remains on a basic level. Idea is somewhat developed but lacks depth. 252 | 3 points: Moderately Elaborated - The response offers a moderate level of detail and development. Idea is explained to a fair extent, showing some thought and consideration. 253 | 4 points: Highly Elaborated - The response is well-developed and detailed. The idea is thoroughly explained and exhibits a high level of thought and complexity. 254 | 5 points: Exceptionally Elaborated - The response demonstrates exceptional elaboration. 
Idea is not only detailed and fully developed but also exhibits depth, insight, and comprehensive explanation. 255 | After reviewing the responses, assign an elaboration score based on these criteria. Provide a brief justification for your rating. It is extremely important to put the score in this format: '[[X]]' 256 | """ 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /Evaluation/utils/openai_model.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import os 3 | import pickle 4 | import time 5 | import logging 6 | from pathlib import Path 7 | 8 | class OpenAIModel: 9 | def __init__(self, cache_file, version, api_key): 10 | self.cache_file = cache_file 11 | self.cache_file_path = Path(__file__).parent 12 | self.cache_dict = self.load_cache() 13 | self.version = version 14 | self.client = OpenAI(api_key=api_key) 15 | 16 | 17 | def save_cache(self): 18 | with open(self.cache_file_path/self.cache_file, "wb") as f: 19 | pickle.dump(self.cache_dict, f) 20 | 21 | def load_cache(self, allow_retry=True): 22 | if os.path.exists(self.cache_file_path/self.cache_file): 23 | while True: 24 | try: 25 | with open(self.cache_file_path/self.cache_file, "rb") as f: 26 | return pickle.load(f) 27 | except Exception as e: 28 | if not allow_retry: 29 | raise e 30 | logging.error("Pickle Unpickling Error: Retry in 5sec...") 31 | time.sleep(5) 32 | return {} 33 | 34 | def generate_response(self, messages, temperature=1, top_p=1, seed=0): 35 | prompt = str((messages, seed)) 36 | if prompt in self.cache_dict: 37 | return self.cache_dict[prompt] 38 | else: 39 | try: 40 | response = self.client.chat.completions.create( 41 | model=self.version, 42 | messages=messages, 43 | temperature=temperature, 44 | top_p=top_p, 45 | seed=seed 46 | ) 47 | result = response.choices[0].message.content 48 | self.cache_dict[prompt] = result 49 | return result 50 | except Exception as e: 51 
| logging.exception("Exception occurred during response generation: " + str(e)) 52 | time.sleep(1) 53 | 54 | def compare_pair(self, item, result_a, result_b, init_prompt, seed=0): 55 | item_prompt = f"Give me a creative use of {item}" 56 | prompt = f"""{init_prompt}\n 57 | [Task] 58 | {item_prompt} 59 | [The Start of Result A] 60 | {result_a} 61 | [The End of Result A] 62 | [The Start of Result B] 63 | {result_b} 64 | [The End of Result B] 65 | """ 66 | messages = [{"role": "user", "content": prompt}] 67 | response = self.generate_response(messages=messages, seed=seed) 68 | return response -------------------------------------------------------------------------------- /Evaluation/utils/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def parse_judgement_result(response_text): 4 | pattern = r"\[\[\'?([ABC])\'?\]\]" # Regex pattern to find [[A]], [[B]], or [[C]] 5 | matches = re.findall(pattern, response_text) 6 | 7 | if matches: 8 | return matches[0] # Return the first match (A, B, or C) 9 | else: 10 | return None # Return None if no match is found 11 | 12 | def parse_number_score(input_str): 13 | # First, try to match the [[X]] format 14 | bracket_pattern = r'\[\[(\d+)\]\]' 15 | matches = re.findall(bracket_pattern, input_str) 16 | if matches: 17 | return int(matches[-1]) # Return the last matched number in this format 18 | 19 | # Next, try to match the "X." 
format 20 | matches = re.findall(r'\b(\d+)\.\s', input_str) 21 | if matches: 22 | return int(matches[-1]) 23 | 24 | # Return the last standalone number found 25 | match = re.findall(r'\b\d+\b', input_str) 26 | if match: 27 | return int(match[-1]) 28 | -------------------------------------------------------------------------------- /Experiments/README.md: -------------------------------------------------------------------------------- 1 | # Run Experiments 2 | 3 | This folder contains the scripts and configuration files necessary to run multi-agent discussions using different task types in the LLM-Discussion project.
4 | **NOTE:** The primary script to execute these experiments is `llm_discussion.py`. 5 | 6 | ## Prerequisites 7 | Before running the `llm_discussion.py` script, ensure that the following prerequisites are met: 8 | 9 | ### 1. Config File 10 | You need a configuration file for the agents. The default configuration file is `config_role.json`, located in the multi_agent folder. You can modify or create new roles using the same format as provided in `config_role.json`. 11 | 12 | ## Usage 13 | 14 | ```bash 15 | cd /LLM-Discussion/Experiments/multi_agent 16 | python llm_discussion.py -c -d -t [-r ] [-e ] 17 | ``` 18 | 19 | #### For example: 20 | ```bash 21 | python3 llm_discussion.py -c config_role.json -d /home/chenlawrance/exp_repo/LLM-Creativity/Datasets/AUT/aut_30_test.json -r 5 -t AUT -e 22 | ``` 23 | 24 | ### Arguments: 25 | - -c, --config: Required. Path to the configuration file for agents. 26 | - -d, --dataset: Required. Path to the dataset file. 27 | - -t, --type: Required. Type of task to run. Choose from AUT, Scientific, Similarities, Instances. 28 | - -r, --rounds: Number of rounds in the discussion. Default is 5. 29 | - -e, --eval_mode: Run in evaluation mode. If specified, the script will evaluate the discussion output. 30 | - -p, --prompt: Specifies the prompt test. Default is 1. Prompts are located in discussion.py line 9 to line 13 31 | 32 | ## Output File 33 | The script will automatically create the necessary output folders if they do not exist. These folders will be created under the `Results/{task_type}` directory structure (`task_type` is "AUT", "Scientific", "Instances", or "Similarities") : 34 | 35 | Subfolders for storing different types of data: 36 | - `Results/{task_type}/chat_log`: Contains chat logs of the entire discussion. 37 | - `Results/{task_type}/Output/multi_agent`: Contains the final discussion results. 38 | - `Results/{task_type}/init`: Stores initial responses generated by the agents.
39 | 40 | #### Evaluation Results: 41 | If running in evaluation mode (--eval_mode or -e), results will be saved in an Evaluation folder at the root of the project. For more information on the output folder of the evaluation, refer to: [Evaluation Output Section](../Evaluation/README.md#output) 42 | 43 | ## View Qualitative Results 44 | Use `read_conversation.py` to read the entire chatlog 45 | ```bash 46 | python3 read_conversation.py -i 47 | ``` 48 | **** are saved in `LLM-Discussion/Results//chat_log` -------------------------------------------------------------------------------- /Experiments/multi_agent/agents.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | import subprocess 5 | import time 6 | from openai import OpenAI 7 | import google.generativeai as genai 8 | 9 | def generate_response_llama2_torchrun( 10 | message, 11 | ckpt_dir: str = "/tmp2/llama-2-7b-chat", 12 | tokenizer_path: str = "/home/chenlawrance/repo/LLM-Creativity/model/tokenizer.model", 13 | temperature: float = 0.6, 14 | top_p: float = 0.9, 15 | max_seq_len: int = 2048, 16 | max_batch_size: int = 4): 17 | message_json = json.dumps(message) # Serialize the message to a JSON string 18 | command = [ 19 | "torchrun", "--nproc_per_node=1", "/home/chenlawrance/repo/LLM-Creativity/llama_model/llama_chat_completion.py", 20 | "--ckpt_dir", ckpt_dir, 21 | "--tokenizer_path", tokenizer_path, 22 | "--max_seq_len", str(max_seq_len), 23 | "--max_batch_size", str(max_batch_size), 24 | "--temperature", str(temperature), 25 | "--top_p", str(top_p), 26 | "--message", message_json 27 | ] 28 | try: 29 | result = subprocess.run(command, capture_output=True, text=True, check=True) 30 | output = result.stdout.strip() 31 | 32 | # Find the beginning of the generated response 33 | assistant_prefix = "> Assistant:" 34 | start_idx = output.find(assistant_prefix) 35 | if start_idx != -1: 36 | # Calculate the starting index of the 
actual response 37 | start_of_response = start_idx + len(assistant_prefix) 38 | # Extract and return the generated response part 39 | generated_response = output[start_of_response:].strip() 40 | return generated_response 41 | else: 42 | return "No response generated or unable to extract response." 43 | except subprocess.CalledProcessError as e: 44 | print(f"Error executing torchrun command: {e.stderr}") 45 | return "Unable to generate response due to an error." 46 | 47 | class Agent: 48 | def generate_answer(self, answer_context): 49 | raise NotImplementedError("This method should be implemented by subclasses.") 50 | def construct_assistant_message(self, prompt): 51 | raise NotImplementedError("This method should be implemented by subclasses.") 52 | def construct_user_message(self, prompt): 53 | raise NotImplementedError("This method should be implemented by subclasses.") 54 | 55 | class OpenAIAgent(Agent): 56 | def __init__(self, model_name, agent_name, agent_role, agent_speciality, agent_role_prompt, speaking_rate, missing_history = []): 57 | self.model_name = model_name 58 | self.client = OpenAI() 59 | self.agent_name = agent_name 60 | self.agent_role = agent_role 61 | self.agent_speciality = agent_speciality 62 | self.agent_role_prompt = agent_role_prompt 63 | self.speaking_rate = speaking_rate 64 | self.missing_history = missing_history 65 | 66 | def generate_answer(self, answer_context, temperature=1): 67 | try: 68 | completion = self.client.chat.completions.create( 69 | model=self.model_name, 70 | messages=answer_context, 71 | n=1) 72 | result = completion.choices[0].message.content 73 | # for pure text -> return completion.choices[0].message.content 74 | return result 75 | except Exception as e: 76 | print(f"Error with model {self.model_name}: {e}") 77 | time.sleep(10) 78 | return self.generate_answer(answer_context) 79 | 80 | def construct_assistant_message(self, content): 81 | return {"role": "assistant", "content": content} 82 | 83 | def 
construct_user_message(self, content): 84 | return {"role": "user", "content": content} 85 | 86 | class GeminiAgent(Agent): 87 | def __init__(self, model_name, agent_name, agent_role, agent_speciality, agent_role_prompt, speaking_rate): 88 | self.model_name = model_name 89 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) # ~/.bashrc save : export GEMINI_API_KEY="YOUR_API" 90 | self.model = genai.GenerativeModel(self.model_name) 91 | self.agent_name = agent_name 92 | self.agent_role = agent_role 93 | self.agent_speciality = agent_speciality 94 | self.agent_role_prompt = agent_role_prompt 95 | self.speaking_rate = speaking_rate 96 | 97 | def generate_answer(self, answer_context,temperature= 1.0): 98 | try: 99 | response = self.model.generate_content( 100 | answer_context, 101 | generation_config=genai.types.GenerationConfig(temperature=temperature), 102 | safety_settings=[ 103 | {"category": "HARM_CATEGORY_HARASSMENT","threshold": "BLOCK_NONE",}, 104 | {"category": "HARM_CATEGORY_HATE_SPEECH","threshold": "BLOCK_NONE",}, 105 | {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","threshold": "BLOCK_NONE",}, 106 | {"category": "HARM_CATEGORY_DANGEROUS_CONTENT","threshold": "BLOCK_NONE",}, 107 | ] 108 | ) 109 | # for pure text -> return response.text 110 | # return response.candidates[0].content 111 | return response.text 112 | except Exception as e: 113 | logging.exception("Exception occurred during response generation: " + str(e)) 114 | time.sleep(1) 115 | return self.generate_answer(answer_context) 116 | 117 | def construct_assistant_message(self, content): 118 | response = {"role": "model", "parts": [content]} 119 | return response 120 | 121 | def construct_user_message(self, content): 122 | response = {"role": "user", "parts": [content]} 123 | return response 124 | 125 | class Llama2Agent(Agent): 126 | def __init__(self, ckpt_dir, tokenizer_path, agent_name): 127 | self.ckpt_dir = ckpt_dir 128 | self.tokenizer_path = tokenizer_path 129 | self.agent_name = 
agent_name 130 | 131 | def generate_answer(self, answer_context, temperature=0.6, top_p=0.9, max_seq_len=100000, max_batch_size=4): # return pure text 132 | return generate_response_llama2_torchrun( 133 | message=answer_context, 134 | ckpt_dir=self.ckpt_dir, 135 | tokenizer_path=self.tokenizer_path, 136 | temperature=temperature, 137 | top_p=top_p, 138 | max_seq_len=max_seq_len, 139 | max_batch_size=max_batch_size 140 | ) 141 | 142 | def construct_assistant_message(self, content): 143 | return {"role": "assistant", "content": content} 144 | 145 | def construct_user_message(self, content): 146 | return {"role": "user", "content": content} -------------------------------------------------------------------------------- /Experiments/multi_agent/config_role.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "openai", 4 | "model_name": "gpt-3.5-turbo-0125", 5 | "agent_name": "GPT Agent 6 - Environmentalist", 6 | "agent_role": "Environmentalist", 7 | "agent_speciality": "Sustainability and Environmental Health", 8 | "agent_role_prompt": "As an Environmentalist, your mission is to champion eco-friendly solutions that promote sustainability and protect our planet. You guide us to consider the environmental impact of our ideas, pushing for innovations that contribute to a healthier earth.", 9 | "speaking_rate": 1 10 | }, 11 | { 12 | "type": "openai", 13 | "model_name": "gpt-3.5-turbo-0125", 14 | "agent_name": "GPT Agent 4 - Creative Professional", 15 | "agent_role": "Creative Professional", 16 | "agent_speciality": "Aesthetics, Narratives, and Emotions", 17 | "agent_role_prompt": "As a Creative Professional, your artistic sensibility and mastery of narrative and emotion infuse our projects with beauty and depth. 
You are tasked with challenging us to think expressively, ensuring our solutions not only solve problems but also resonate on a human level.", 18 | "speaking_rate": 1 19 | }, 20 | { 21 | "type": "openai", 22 | "model_name": "gpt-3.5-turbo-0125", 23 | "agent_name": "GPT Agent 10-1 - Futurist", 24 | "agent_role": "Futurist", 25 | "agent_speciality": "Emerging Technologies and Future Scenarios", 26 | "agent_role_prompt": "As a Futurist, you inspire us to think beyond the present, considering emerging technologies and potential future scenarios. Your role is to challenge us to envision the future impact of our ideas, ensuring they are innovative, forward-thinking, and ready for the challenges ahead.", 27 | "speaking_rate": 1 28 | }, 29 | { 30 | "type": "openai", 31 | "model_name": "gpt-3.5-turbo-0125", 32 | "agent_name": "GPT Agent 10-2 - Futurist", 33 | "agent_role": "Futurist", 34 | "agent_speciality": "Emerging Technologies and Future Scenarios", 35 | "agent_role_prompt": "As a Futurist, you inspire us to think beyond the present, considering emerging technologies and potential future scenarios. Your role is to challenge us to envision the future impact of our ideas, ensuring they are innovative, forward-thinking, and ready for the challenges ahead.", 36 | "speaking_rate": 1 37 | } 38 | ] -------------------------------------------------------------------------------- /Experiments/multi_agent/discussion.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from agents import OpenAIAgent, GeminiAgent, Llama2Agent 4 | import datetime 5 | import os 6 | 7 | class Discussion: 8 | PROMPTS = { 9 | 1: "You are in a group discussion with other teammates; as a result, answer as diversely and creatively as you can.", 10 | 2: "You're in a brainstorming session where each idea leads to the next. 
Embrace the flow of creativity without limits, encouraging one another to build on each suggestion for unexpected connections.", 11 | 3: "Pretend your team is at a think tank where unconventional ideas are the norm. Challenge each other to think from different perspectives, considering the most unusual or innovative ideas.", 12 | 4: "Engage in a collaborative discussion where each of you contributes a unique insight or query, aiming to delve into uncharted territories of thought. Throughout the discussion, focus on expanding the scope and depth of each contribution through constructive feedback, counterpoints, and further questioning. The objective is to achieve a broad spectrum of ideas and solutions, promoting a culture of continuous learning and innovation.", 13 | 5: "Envision your group as a crew on a mission to solve a mystery using only your creativity and wit. How would you piece together clues from each member's ideas to find the solution? And this would be crucial to your member’s life." 
14 | } 15 | 16 | def __init__(self, dataset_file, rounds, prompt): 17 | self.dataset_file = dataset_file 18 | self.rounds = rounds 19 | self.discussion_prompt = self.PROMPTS.get(prompt, "Invalid prompt selected.") 20 | print("Discussion initialized with dataset: {} and {} rounds.".format(dataset_file, rounds)) 21 | 22 | def run(self): 23 | pass 24 | 25 | def save_conversation(self, filename, conversation_data): 26 | os.makedirs(os.path.dirname(filename), exist_ok=True) 27 | with open(filename, 'w') as file: 28 | json.dump(conversation_data, file, indent=4) 29 | print(f"Saved Conversation Data to {filename}") 30 | 31 | @staticmethod 32 | def load_config(config_path): 33 | with open(config_path, 'r') as f: 34 | return json.load(f) 35 | 36 | def extract_response(self, content): 37 | lines = content.split('\n') 38 | uses = [line.strip() for line in lines if line.strip() and re.match(r"^\d+\.", line)] 39 | uses = [use[use.find('.') + 2:] for use in uses] 40 | return uses 41 | 42 | class LLM_Debate(Discussion): 43 | def __init__(self, agents_config, dataset_file, rounds, task, prompt): 44 | super().__init__(dataset_file, rounds, prompt) 45 | self.task_type = task 46 | self.agents = self.initialize_agents(agents_config) 47 | print(f"LLM_Debate initialized for task: {task} with {len(self.agents)} agents.") 48 | 49 | def initialize_agents(self, agents_config): 50 | agents = [] 51 | for config in agents_config: 52 | if config['type'] == 'openai': 53 | agents.append(OpenAIAgent(model_name=config['model_name'], 54 | agent_name = config['agent_name'], 55 | agent_role = config['agent_role'], 56 | agent_speciality = config['agent_speciality'], 57 | agent_role_prompt = config['agent_role_prompt'], 58 | speaking_rate = config['speaking_rate'])) 59 | elif config['type'] == 'gemini': 60 | agents.append(GeminiAgent(model_name=config['model_name'], 61 | agent_name = config['agent_name'], 62 | agent_role=config['agent_role'], 63 | agent_speciality=config['agent_speciality'], 64 | 
agent_role_prompt=config['agent_role_prompt'], 65 | speaking_rate=config['speaking_rate'])) 66 | elif config['type'] == 'llama2': 67 | agents.append(Llama2Agent(ckpt_dir=config['ckpt_dir'], 68 | tokenizer_path=config['tokenizer_path'], 69 | agent_name = config['agent_name'])) 70 | else: 71 | raise ValueError(f"Unsupported agent type: {config['type']}") 72 | return agents 73 | 74 | def construct_response(self, question, most_recent_responses, current_agent, is_last_round, baseline=False): 75 | prefix_string = "These are the solutions to the problem from other agents:\n" 76 | for agent_name, responses in most_recent_responses.items(): 77 | if agent_name == current_agent.agent_name: 78 | continue 79 | if responses and 'parts' in responses[-1]: 80 | response_content = responses[-1]['parts'][0] 81 | else: 82 | response_content = responses[-1]['content'] 83 | 84 | other_agent_response = f"One agent solution: ```{response_content}```\n" 85 | prefix_string += other_agent_response 86 | 87 | if baseline: 88 | suffix_string = "Using the reasoning from other agents as additional advice, can you give an updated answer? Please put your answer in a list format, starting with 1. ... 2. ... 3. ... and so on." 89 | else: 90 | suffix_string = question + self.discussion_prompt 91 | if is_last_round: 92 | suffix_string += " This is the last round of the discussion, please finalize and present a list of as many creative answers as possible. Please list the final response in 1. ... 2. ... 3. ... and so on. 
\n\n" 93 | 94 | return prefix_string + suffix_string 95 | 96 | def save_debate_conversations(self, agents, all_responses, init_results, final_results, amount_of_data, task_type="AUT", baseline = False): 97 | current_date, formatted_time = self.get_current_datetime() 98 | model_names_concatenated = self.concatenate_model_names(agents) 99 | role_names_concatenated = self.concatenate_role_names(agents) 100 | subtask = self.determine_subtask(agents, baseline) 101 | 102 | output_filename = self.generate_filename(task_type, subtask, "chat_log", model_names_concatenated, role_names_concatenated, current_date, formatted_time, amount_of_data, len(agents), self.rounds) 103 | final_ans_filename = self.generate_final_filename(task_type, subtask, "multi_agent", model_names_concatenated, role_names_concatenated, current_date, formatted_time, amount_of_data, len(agents), self.rounds) 104 | init_ans_filename = self.generate_filename(task_type, subtask, "init", model_names_concatenated, role_names_concatenated, current_date, formatted_time, amount_of_data, len(agents), self.rounds) 105 | 106 | # Ensure all required directories exist 107 | os.makedirs(os.path.dirname(output_filename), exist_ok=True) 108 | os.makedirs(os.path.dirname(final_ans_filename), exist_ok=True) 109 | os.makedirs(os.path.dirname(init_ans_filename), exist_ok=True) 110 | 111 | self.save_conversation(output_filename, all_responses) 112 | self.save_conversation(final_ans_filename, final_results) 113 | self.save_conversation(init_ans_filename, init_results) 114 | 115 | return final_ans_filename 116 | 117 | @staticmethod 118 | def get_current_datetime(): 119 | current_time = datetime.datetime.now() 120 | current_date = current_time.strftime("%Y-%m-%d") 121 | formatted_time = current_time.strftime("%H-%M-%S") 122 | return current_date, formatted_time 123 | 124 | @staticmethod 125 | def concatenate_model_names(agents): 126 | if all(agent.model_name == agents[0].model_name for agent in agents): 127 | return 
agents[0].model_name.replace(".", "-") 128 | return "-".join(agent.model_name.replace(".", "-") for agent in agents) 129 | 130 | @staticmethod 131 | def concatenate_role_names(agents): 132 | if all(agent.agent_role == "None" for agent in agents): 133 | return "None" 134 | return "-".join(agent.agent_role.replace(" ", "") for agent in agents) 135 | 136 | def determine_subtask(self, agents, baseline): 137 | if baseline: 138 | return "baseline" 139 | if all(agent.agent_role == "None" for agent in agents): 140 | return "FINAL" 141 | return "roleplay" 142 | 143 | @staticmethod 144 | def generate_filename(task_type, subtask, data_type, model_names_concatenated, role_names_concatenated, current_date, formatted_time, amount_of_data, num_agents, num_rounds): 145 | return f"../../Results/{task_type}/{data_type}/{task_type}_multi_debate_{subtask}_{num_agents}_{num_rounds}_{model_names_concatenated}_{role_names_concatenated}_{data_type}_{current_date}-{formatted_time}_{amount_of_data}.json" 146 | 147 | @staticmethod 148 | def generate_final_filename(task_type, subtask, data_type, model_names_concatenated, role_names_concatenated, current_date, formatted_time, amount_of_data, num_agents, num_rounds): 149 | return f"../../Results/{task_type}/Output/{data_type}/{task_type}_multi_debate_{subtask}_{num_agents}_{num_rounds}_{model_names_concatenated}_{role_names_concatenated}_{data_type}_{current_date}-{formatted_time}_{amount_of_data}.json" 150 | 151 | class LLM_Debate_AUT_Baseline(LLM_Debate): 152 | def run(self): 153 | print(f"Starting LLM_Debate.run with dataset: {self.dataset_file}") 154 | 155 | with open(self.dataset_file, 'r') as f: 156 | dataset = json.load(f) 157 | all_responses = {} 158 | init_results = [] 159 | final_results = [] 160 | amount_of_data = len(dataset['Examples']) 161 | for example in dataset['Examples']: 162 | # --------------->>>> set the system content 163 | chat_history = {agent.agent_name: [] for agent in self.agents} 164 | object = example['object'] 165 
| problem_template = " ".join(dataset["Task"][0]["Problem"]) 166 | question = problem_template.replace("{object}", object) 167 | initial_prompt = question 168 | most_recent_responses = {} 169 | # ------------------------------------------ 170 | for round in range(self.rounds): 171 | is_last_round = (round == self.rounds - 1) 172 | is_first_round = (round == 0) 173 | round_responses = {agent.agent_name: [] for agent in self.agents} 174 | print(f"Round {round + 1}, Object: {object}") 175 | for agent in self.agents: 176 | if agent.agent_role != "None": 177 | agent_role_prompt = f"You are a {agent.agent_role} whose specialty is {agent.agent_speciality}. {agent.agent_role_prompt} Remember to claim your role in the beginning of each conversation. " 178 | print(f"agent_role = {agent.agent_role}") 179 | else: 180 | agent_role_prompt = "" 181 | if is_first_round: 182 | formatted_initial_prompt = agent.construct_user_message(agent_role_prompt + "Can you answer the following question as creatively as possible: " + initial_prompt + " Please put your answer in a list format, starting with 1. ... 2. ... 3. ... 
and so on.") 183 | chat_history[agent.agent_name].append(formatted_initial_prompt) 184 | response = agent.generate_answer(chat_history[agent.agent_name]) 185 | # Save the initial response of the Agent 186 | uses_list = self.extract_response(response) 187 | init_result = {"item": object, "uses": uses_list, "Agent": agent.agent_name} 188 | init_results.append(init_result) 189 | else: 190 | combined_prompt = self.construct_response(question, most_recent_responses, agent, is_last_round,baseline = True) 191 | formatted_combined_prompt = agent.construct_user_message(agent_role_prompt + combined_prompt) 192 | chat_history[agent.agent_name].append(formatted_combined_prompt) 193 | response = agent.generate_answer(chat_history[agent.agent_name]) 194 | if is_last_round: 195 | uses_list = self.extract_response(response) 196 | final_result = {"item": object, "uses": uses_list, "Agent": agent.agent_name} 197 | final_results.append(final_result) 198 | 199 | formatted_response = agent.construct_assistant_message(response) 200 | chat_history[agent.agent_name].append(formatted_response) # Update the agent's chat history 201 | round_responses[agent.agent_name].append(formatted_response) 202 | most_recent_responses = round_responses 203 | all_responses[question] = chat_history 204 | 205 | output_file = self.save_debate_conversations(self.agents, all_responses, init_results, final_results, amount_of_data, task_type=self.task_type, baseline=True) 206 | return output_file 207 | 208 | class LLM_Debate_Scientific_Baseline(LLM_Debate): 209 | def run(self): 210 | with open(self.dataset_file, 'r') as f: 211 | dataset = json.load(f) 212 | all_responses = {} 213 | init_results = [] 214 | final_results = [] 215 | amount_of_data = 0 216 | for task in dataset['Task']: 217 | amount_of_data += len(task['Example']) 218 | for example in task['Example']: 219 | chat_history = {agent.agent_name: [] for agent in self.agents} 220 | # --------------->>>> set the system content 221 | question = example 222 | 
initial_prompt = question 223 | # ------------------------------------------ 224 | most_recent_responses = {} 225 | for round in range(self.rounds): 226 | is_last_round = (round == self.rounds - 1) 227 | is_first_round = (round == 0) 228 | round_responses = {agent.agent_name: [] for agent in self.agents} 229 | print(f"Round {round + 1}: Discussion on {question}") 230 | for agent in self.agents: 231 | if agent.agent_role != "None": 232 | agent_role_prompt = f"You are a {agent.agent_role} whose specialty is {agent.agent_speciality}. {agent.agent_role_prompt} Remember to claim your role in the beginning of each conversation. " 233 | print(f"agent_role = {agent.agent_role}") 234 | else: 235 | agent_role_prompt = "" 236 | 237 | if is_first_round: 238 | formatted_initial_prompt = agent.construct_user_message(agent_role_prompt + "Can you answer the following question as creatively as possible: " + initial_prompt + " Please put your answer in a list format, starting with 1. ... 2. ... 3. ... and so on.") 239 | chat_history[agent.agent_name].append(formatted_initial_prompt) 240 | response = agent.generate_answer(chat_history[agent.agent_name]) 241 | response_list = self.extract_response(response) 242 | init_result = {"question": question, "answer": response_list, "Agent": agent.agent_name} 243 | init_results.append(init_result) 244 | else: 245 | combined_prompt = self.construct_response(question, most_recent_responses, agent, is_last_round, baseline = True) 246 | formatted_combined_prompt = agent.construct_user_message(agent_role_prompt + combined_prompt) 247 | chat_history[agent.agent_name].append(formatted_combined_prompt) 248 | response = agent.generate_answer(chat_history[agent.agent_name]) 249 | if is_last_round: 250 | response_list = self.extract_response(response) 251 | print(f"response_list = {response_list}") 252 | final_result = {"question": question, "answer": response_list, "Agent": agent.agent_name} 253 | final_results.append(final_result) 254 | 255 | 
formatted_response = agent.construct_assistant_message(response) 256 | chat_history[agent.agent_name].append(formatted_response) # Update the agent's chat history 257 | round_responses[agent.agent_name].append(formatted_response) 258 | most_recent_responses = round_responses 259 | all_responses[question] = chat_history 260 | 261 | output_file = self.save_debate_conversations(self.agents, all_responses, init_results, final_results, amount_of_data, task_type=self.task_type, baseline=True) 262 | return output_file 263 | 264 | class LLM_Debate_Instance_Similarities_Baseline(LLM_Debate): 265 | def run(self): 266 | with open(self.dataset_file, 'r') as f: 267 | dataset = json.load(f) 268 | all_responses = {} 269 | init_results = [] 270 | final_results = [] 271 | amount_of_data = len(dataset['Examples']) 272 | for example in dataset['Examples']: 273 | chat_history = {agent.agent_name: [] for agent in self.agents} 274 | # --------------->>>> set the system content 275 | question = example 276 | initial_prompt = question 277 | # ------------------------------------------ 278 | most_recent_responses = {} 279 | for round in range(self.rounds): 280 | is_last_round = (round == self.rounds - 1) 281 | is_first_round = (round == 0) 282 | round_responses = {agent.agent_name: [] for agent in self.agents} 283 | print(f"Round {round + 1}: Discussion on {question}") 284 | for agent in self.agents: 285 | 286 | if agent.agent_role != "None": 287 | agent_role_prompt = f"You are a {agent.agent_role} whose specialty is {agent.agent_speciality}. {agent.agent_role_prompt} Remember to claim your role in the beginning of each conversation. " 288 | print(f"agent_role = {agent.agent_role}") 289 | else: 290 | agent_role_prompt = "" 291 | 292 | if is_first_round: 293 | formatted_initial_prompt = agent.construct_user_message(agent_role_prompt + "Can you answer the following question as creatively as possible: " + initial_prompt + " Please put your answer in a list format, starting with 1. ... 2. ... 
class LLM_Discussion_AUT(LLM_Debate):
    """Discussion runner for the AUT task type.

    The question for each example is produced by joining
    ``dataset["Task"][0]["Problem"]`` with spaces and substituting the example's
    ``object`` for the ``{object}`` placeholder.
    """

    def run(self):
        """Run the discussion for every example and save the transcripts.

        Reads ``self.dataset_file`` as JSON, iterates over ``dataset['Examples']``,
        runs ``self.rounds`` rounds per example over ``self.agents``, and returns
        whatever ``save_debate_conversations`` returns.
        """
        print(f"Starting LLM_Debate.run with dataset: {self.dataset_file}")
        with open(self.dataset_file, 'r') as dataset_fp:
            dataset = json.load(dataset_fp)

        examples = dataset['Examples']
        amount_of_data = len(examples)
        all_responses = {}
        init_results = []
        final_results = []

        for example in examples:
            # One message list per agent, rebuilt for every example.
            chat_history = {member.agent_name: [] for member in self.agents}

            item = example['object']
            question = " ".join(dataset["Task"][0]["Problem"]).replace("{object}", item)
            print("Discussion Prompt is ", self.discussion_prompt)
            initial_prompt = "Initiate a discussion with others to collectively complete the following task: " + question + self.discussion_prompt
            most_recent_responses = {}

            for round_idx in range(self.rounds):
                is_first_round = round_idx == 0
                is_last_round = round_idx == self.rounds - 1
                round_responses = {member.agent_name: [] for member in self.agents}
                print(f"Round {round_idx + 1}, Object: {item}")

                for agent in self.agents:
                    # The literal string "None" marks an agent without a role.
                    if agent.agent_role == "None":
                        agent_role_prompt = ""
                    else:
                        agent_role_prompt = f"You are a {agent.agent_role} whose specialty is {agent.agent_speciality}. {agent.agent_role_prompt} Remember to claim your role in the beginning of each conversation. "
                        print(f"agent_role = {agent.agent_role}")

                    if is_first_round:
                        user_text = agent_role_prompt + initial_prompt
                    else:
                        user_text = agent_role_prompt + self.construct_response(question, most_recent_responses, agent, is_last_round)

                    history = chat_history[agent.agent_name]
                    history.append(agent.construct_user_message(user_text))
                    response = agent.generate_answer(history)

                    # NOTE(review): the final answer is only captured on the
                    # non-first-round path (nesting reconstructed from a flattened
                    # listing) - confirm that a single-round run should record no
                    # final results.
                    if is_first_round:
                        # Initial (pre-discussion) answer of this agent.
                        init_results.append({"item": item, "uses": self.extract_response(response), "Agent": agent.agent_name})
                    elif is_last_round:
                        # Answer of this agent after the last round.
                        final_results.append({"item": item, "uses": self.extract_response(response), "Agent": agent.agent_name})

                    assistant_message = agent.construct_assistant_message(response)
                    history.append(assistant_message)  # keep the agent's own history up to date
                    round_responses[agent.agent_name].append(assistant_message)

                # What every agent said this round feeds the next round's prompts.
                most_recent_responses = round_responses

            all_responses[question] = chat_history

        output_file = self.save_debate_conversations(self.agents, all_responses, init_results, final_results, amount_of_data, task_type=self.task_type)
        return output_file
class LLM_Discussion_Scientific(LLM_Debate):
    """Discussion runner for datasets organised as ``Task`` entries with ``Example`` lists.

    Every string in every task's ``Example`` list is used directly as the
    question put to the agents.
    """

    def run(self):
        """Run the discussion for every example of every task and save the transcripts.

        Reads ``self.dataset_file`` as JSON, walks ``dataset['Task']``, runs
        ``self.rounds`` rounds per example over ``self.agents``, and returns
        whatever ``save_debate_conversations`` returns.
        """
        with open(self.dataset_file, 'r') as dataset_fp:
            dataset = json.load(dataset_fp)

        all_responses = {}
        init_results = []
        final_results = []
        amount_of_data = 0

        for task in dataset['Task']:
            # The example count is accumulated task by task.
            amount_of_data += len(task['Example'])

            for question in task['Example']:
                # One message list per agent, rebuilt for every question.
                chat_history = {member.agent_name: [] for member in self.agents}
                initial_prompt = "Initiate a discussion with others to collectively complete the following task: " + question + self.discussion_prompt
                most_recent_responses = {}

                for round_idx in range(self.rounds):
                    is_first_round = round_idx == 0
                    is_last_round = round_idx == self.rounds - 1
                    round_responses = {member.agent_name: [] for member in self.agents}
                    print(f"Round {round_idx + 1}: Discussion on {question}")

                    for agent in self.agents:
                        # The literal string "None" marks an agent without a role.
                        if agent.agent_role == "None":
                            agent_role_prompt = ""
                        else:
                            agent_role_prompt = f"You are a {agent.agent_role} whose specialty is {agent.agent_speciality}. {agent.agent_role_prompt} Remember to claim your role in the beginning of each conversation. "
                            print(f"agent_role = {agent.agent_role}")

                        if is_first_round:
                            user_text = agent_role_prompt + initial_prompt
                        else:
                            user_text = agent_role_prompt + self.construct_response(question, most_recent_responses, agent, is_last_round)

                        history = chat_history[agent.agent_name]
                        history.append(agent.construct_user_message(user_text))
                        response = agent.generate_answer(history)

                        # NOTE(review): the final answer is only captured on the
                        # non-first-round path (nesting reconstructed from a
                        # flattened listing) - confirm that a single-round run
                        # should record no final results.
                        if is_first_round:
                            init_results.append({"question": question, "answer": self.extract_response(response), "Agent": agent.agent_name})
                        elif is_last_round:
                            response_list = self.extract_response(response)
                            print(f"response_list = {response_list}")
                            final_results.append({"question": question, "answer": response_list, "Agent": agent.agent_name})

                        assistant_message = agent.construct_assistant_message(response)
                        history.append(assistant_message)  # keep the agent's own history up to date
                        round_responses[agent.agent_name].append(assistant_message)

                    # What every agent said this round feeds the next round's prompts.
                    most_recent_responses = round_responses

                all_responses[question] = chat_history

        output_file = self.save_debate_conversations(self.agents, all_responses, init_results, final_results, amount_of_data, task_type=self.task_type)
        return output_file
class LLM_Discussion_Instance_Similarities(LLM_Debate):
    """Discussion runner for datasets that list their questions under ``Examples``.

    Every entry of ``dataset['Examples']`` is used directly as the question
    put to the agents.
    """

    def run(self):
        """Run the discussion for every example and save the transcripts.

        Reads ``self.dataset_file`` as JSON, iterates over ``dataset['Examples']``,
        runs ``self.rounds`` rounds per example over ``self.agents``, and returns
        whatever ``save_debate_conversations`` returns.
        """
        with open(self.dataset_file, 'r') as dataset_fp:
            dataset = json.load(dataset_fp)

        examples = dataset['Examples']
        amount_of_data = len(examples)
        all_responses = {}
        init_results = []
        final_results = []

        for question in examples:
            # One message list per agent, rebuilt for every question.
            chat_history = {member.agent_name: [] for member in self.agents}
            initial_prompt = "Initiate a discussion with others to collectively complete the following task: " + question + self.discussion_prompt
            most_recent_responses = {}

            for round_idx in range(self.rounds):
                is_first_round = round_idx == 0
                is_last_round = round_idx == self.rounds - 1
                round_responses = {member.agent_name: [] for member in self.agents}
                print(f"Round {round_idx + 1}: Discussion on {question}")

                for agent in self.agents:
                    # The literal string "None" marks an agent without a role.
                    if agent.agent_role == "None":
                        agent_role_prompt = ""
                    else:
                        agent_role_prompt = f"You are a {agent.agent_role} whose specialty is {agent.agent_speciality}. {agent.agent_role_prompt} Remember to claim your role in the beginning of each conversation. "
                        print(f"agent_role = {agent.agent_role}")

                    if is_first_round:
                        user_text = agent_role_prompt + initial_prompt
                    else:
                        user_text = agent_role_prompt + self.construct_response(question, most_recent_responses, agent, is_last_round)

                    history = chat_history[agent.agent_name]
                    history.append(agent.construct_user_message(user_text))
                    response = agent.generate_answer(history)

                    # NOTE(review): the final answer is only captured on the
                    # non-first-round path (nesting reconstructed from a flattened
                    # listing) - confirm that a single-round run should record no
                    # final results.
                    if is_first_round:
                        init_results.append({"question": question, "answer": self.extract_response(response), "Agent": agent.agent_name})
                    elif is_last_round:
                        response_list = self.extract_response(response)
                        print(f"response_list = {response_list}")
                        final_results.append({"question": question, "answer": response_list, "Agent": agent.agent_name})

                    assistant_message = agent.construct_assistant_message(response)
                    history.append(assistant_message)  # keep the agent's own history up to date
                    round_responses[agent.agent_name].append(assistant_message)

                # What every agent said this round feeds the next round's prompts.
                most_recent_responses = round_responses

            all_responses[question] = chat_history

        output_file = self.save_debate_conversations(self.agents, all_responses, init_results, final_results, amount_of_data, task_type=self.task_type)
        return output_file
import argparse
import sys
import os
from pathlib import Path
from discussion import LLM_Discussion_AUT, LLM_Discussion_Scientific, LLM_Discussion_Instance_Similarities
from types import SimpleNamespace


# This file runs LLM Discussion

def main():
    """Parse the command line, run one discussion, and optionally grade its output.

    Command-line flags:
        -c/--config     path to the agents' configuration file (required)
        -d/--dataset    path to the dataset file (required)
        -r/--rounds     number of discussion rounds (default 5)
        -t/--type       task type; selects the runner class (required)
        -e/--eval_mode  after the discussion, call ``auto_grade`` on the result
        -p/--prompt     integer forwarded to the runner (default 1)

    ``--type`` is required: without it no runner is selected and the script
    previously crashed with an ``UnboundLocalError`` instead of a usage error.
    """
    # Task type -> runner class. "Similarities" and "Instances" share one runner.
    runners = {
        "AUT": LLM_Discussion_AUT,
        "Scientific": LLM_Discussion_Scientific,
        "Similarities": LLM_Discussion_Instance_Similarities,
        "Instances": LLM_Discussion_Instance_Similarities,
    }

    parser = argparse.ArgumentParser(description="Orchestrate a discussion with multiple AI agents.")
    parser.add_argument("-c", "--config", required=True, help="Path to the configuration file for agents.")
    parser.add_argument("-d", "--dataset", required=True, help="Path to the dataset file.")
    parser.add_argument("-r", "--rounds", type=int, default=5, help="Number of rounds in the discussion.")
    parser.add_argument("-t", "--type", required=True, choices=list(runners), help="Type of task to run.")
    parser.add_argument("-e", "--eval_mode", action="store_true", default=False, help="Run in evaluation mode.")
    parser.add_argument("-p", "--prompt", type=int, default=1, help="Prompt Test")
    args = parser.parse_args()

    runner_cls = runners[args.type]
    agents_config = runner_cls.load_config(args.config)
    discussion_runner = runner_cls(agents_config, args.dataset, args.rounds, args.type, args.prompt)
    discussion_output = discussion_runner.run()

    if args.eval_mode:
        # The Evaluation package lives at the repository root, two directories
        # above the folder containing this script.
        root_path = Path(__file__).resolve().parents[2]
        evaluation_root = root_path / 'Evaluation'
        sys.path.append(str(evaluation_root))
        from auto_grade_final import auto_grade

        # auto_grade is given the output file's base name without its extension.
        input_file_name = os.path.splitext(os.path.basename(discussion_output))[0]

        # Kept separate from the parsed CLI ``args`` so neither shadows the other.
        grade_args = SimpleNamespace(
            version="3",
            input_file=input_file_name,
            type="sampling",
            sample=3,
            task=args.type,
            output="y",
            temperature=1.0
        )
        auto_grade(grade_args)


if __name__ == "__main__":
    main()

Discussion Topic: {question}

\n" 32 | 33 | for agent, responses in agents_responses.items(): 34 | html_content += f"
\n

{agent}

\n" 35 | 36 | for response in responses: 37 | role = response["role"] 38 | css_class = "user" if role == "user" else "model" 39 | if "content" in response: 40 | message = response["content"] 41 | elif "parts" in response: 42 | message = " ".join(response["parts"]) 43 | html_content += f"
{message}
\n" 44 | 45 | html_content += "
\n" 46 | 47 | html_content += """ 48 | 49 | 50 | """ 51 | 52 | # Save the HTML content to a file 53 | file_path = 'chat_history_ui_2.html' 54 | with open(file_path, 'w') as file: 55 | file.write(html_content) 56 | 57 | print(file_path) 58 | 59 | if __name__ == "__main__": 60 | parser = argparse.ArgumentParser(description="Make HTML to show chat.") 61 | parser.add_argument("-i", "--input", required=True, help="Path to the conversation log file.") 62 | args = parser.parse_args() 63 | main(args.input) 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Discussion 2 | 3 | Official Implementation of [LLM Discussion: Enhancing the Creativity of Large Language Models via Discussion Framework and Role-Play](https://arxiv.org/abs/2405.06373) 4 | 5 | >Authors : [Li-Chun Lu*](https://github.com/lichun-19), [Shou-Jen Chen*](https://github.com/lawraa), [Tsung-Min Pai](https://github.com/Bai1026), Chan-Hung Yu & Hung-yi Lee, Shao-Hua Sun 6 | 7 | ## Introduction 8 | 9 | This repository contains the implementation of the **"LLM Discussion"** framework, as presented in our COLM 2024 paper. The framework aims to enhance the creativity of large language models (LLMs) through a three-phase discussion process and role-playing techniques. By emulating the human process of collective creativity, LLMs can produce more original and innovative responses. 10 | 11 | ## Abstract 12 | 13 | Large language models (LLMs) have shown exceptional proficiency in natural language processing but often fall short of generating creative and original responses to open-ended questions. To enhance LLM creativity, our key insight is to emulate the human process of inducing collective creativity through engaging discussions with participants from diverse backgrounds and perspectives. 
To this end, we propose LLM Discussion, a three-phase discussion framework that facilitates vigorous and diverging idea exchanges and ensures convergence to creative answers. Moreover, we adopt a role-playing technique by assigning distinct roles to LLMs to combat the homogeneity of LLMs. We evaluate the efficacy of the proposed framework with the Alternative Uses Test, Similarities Test, Instances Test, and Scientific Creativity Test through both LLM evaluation and human study. Our proposed framework outperforms single-LLM approaches and existing multi-LLM frameworks across various creativity metrics. 14 | 15 | ## Framework Overview 16 | 17 | The LLM Discussion framework is divided into three phases: 18 | 19 | 1. **Initiation Phase**: LLMs are introduced to the discussion topic and objectives. 20 | 2. **Discussion Phase**: Multiple rounds of idea exchanges among LLMs, encouraging active listening and idea building. 21 | 3. **Convergence Phase**: Summarization and convergence of the discussed ideas to produce a collective conclusion. 22 | 23 | ![image](resources/discussion_framework.png) 24 | 25 | 26 | Additionally, LLMs are assigned distinct roles to simulate diverse perspectives and reduce homogeneity in their responses. 27 | 28 |
29 | 30 | 31 |
32 | 33 | ## Installation 34 | 35 | To install the required dependencies, run: 36 | 37 | ```bash 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | ## Run 42 | 43 | To run LLM Discussion, refer to [Experiments/README.md](Experiments/README.md) 44 | 45 | ## Citation 46 | ``` 47 | @inproceedings{lu2024llm, 48 | title={LLM Discussion: Enhancing the Creativity of Large Language Models via Discussion Framework and Role-Play}, 49 | author={Lu, Li-Chun and Chen, Shou-Jen and Pai, Tsung-Min and Yu, Chan-Hung and Lee, Hung-yi and Sun, Shao-Hua}, 50 | booktitle={Conference on Language Modeling}, 51 | year={2024} 52 | } 53 | ``` -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai==1.12.0 2 | google-generativeai==0.3.2 3 | numpy==1.26.4 -------------------------------------------------------------------------------- /resources/discussion_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lawraa/LLM-Discussion/2efbde4f74dffb9d0adad144ed20929f75ea51f1/resources/discussion_framework.png -------------------------------------------------------------------------------- /resources/response_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lawraa/LLM-Discussion/2efbde4f74dffb9d0adad144ed20929f75ea51f1/resources/response_sample.png -------------------------------------------------------------------------------- /resources/roleplay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lawraa/LLM-Discussion/2efbde4f74dffb9d0adad144ed20929f75ea51f1/resources/roleplay.png -------------------------------------------------------------------------------- /resources/teaser.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lawraa/LLM-Discussion/2efbde4f74dffb9d0adad144ed20929f75ea51f1/resources/teaser.png --------------------------------------------------------------------------------