├── .gitignore
├── LICENSE
├── README.md
├── evaluation
│   ├── README.md
│   ├── TTQA_1.0.0_tw_llama_v1.0.json
│   ├── conversation.py
│   ├── generate_answer.py
│   ├── generate_answer_anthropic.py
│   ├── get_score_from_freetext.py
│   ├── llm_eval.py
│   ├── plot.ipynb
│   ├── run_tceval.py
│   └── zh_tw_bench
│       ├── model_answer
│       │   ├── LLAMA2-sft-0808.jsonl
│       │   ├── Llama-2-13b-chat.jsonl
│       │   ├── Llama-2-70b-chat.jsonl
│       │   ├── claude.jsonl
│       │   ├── claude_instant.jsonl
│       │   ├── gpt-3.5-turbo.jsonl
│       │   ├── gpt-4.jsonl
│       │   ├── housearch.jsonl
│       │   ├── v1.0.jsonl
│       │   └── zh_TW_LLAMA2-sft-0808.jsonl
│       ├── question.jsonl
│       ├── reference_answer
│       │   └── gpt-4.jsonl
│       └── utils.py
├── example_training_config_for_finetuning_twllm.yaml
├── images
│   ├── ollama_run_demo.png
│   └── zhtw_vicuna_bench_chatgptbaseline.png
├── requirements.txt
└── start_inference_server.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | ### venv template
2 | # Virtualenv
3 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
4 | .Python
5 | [Bb]in
6 | [Ii]nclude
7 | [Ll]ib
8 | [Ll]ib64
9 | [Ll]ocal
10 | [Ss]cripts
11 | pyvenv.cfg
12 | .venv
13 | pip-selfcheck.json
14 |
15 | ### PyCharm+iml template
16 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
17 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
18 |
19 | # User-specific stuff
20 | .idea/**/workspace.xml
21 | .idea/**/tasks.xml
22 | .idea/**/usage.statistics.xml
23 | .idea/**/dictionaries
24 | .idea/**/shelf
25 |
26 | # AWS User-specific
27 | .idea/**/aws.xml
28 |
29 | # Generated files
30 | .idea/**/contentModel.xml
31 |
32 | # Sensitive or high-churn files
33 | .idea/**/dataSources/
34 | .idea/**/dataSources.ids
35 | .idea/**/dataSources.local.xml
36 | .idea/**/sqlDataSources.xml
37 | .idea/**/dynamic.xml
38 | .idea/**/uiDesigner.xml
39 | .idea/**/dbnavigator.xml
40 |
41 | # Gradle
42 | .idea/**/gradle.xml
43 | .idea/**/libraries
44 |
45 | # Gradle and Maven with auto-import
46 | # When using Gradle or Maven with auto-import, you should exclude module files,
47 | # since they will be recreated, and may cause churn. Uncomment if using
48 | # auto-import.
49 | # .idea/artifacts
50 | # .idea/compiler.xml
51 | # .idea/jarRepositories.xml
52 | # .idea/modules.xml
53 | # .idea/*.iml
54 | # .idea/modules
55 | # *.iml
56 | # *.ipr
57 |
58 | # CMake
59 | cmake-build-*/
60 |
61 | # Mongo Explorer plugin
62 | .idea/**/mongoSettings.xml
63 |
64 | # File-based project format
65 | *.iws
66 |
67 | # IntelliJ
68 | out/
69 |
70 | # mpeltonen/sbt-idea plugin
71 | .idea_modules/
72 |
73 | # JIRA plugin
74 | atlassian-ide-plugin.xml
75 |
76 | # Cursive Clojure plugin
77 | .idea/replstate.xml
78 |
79 | # SonarLint plugin
80 | .idea/sonarlint/
81 |
82 | # Crashlytics plugin (for Android Studio and IntelliJ)
83 | com_crashlytics_export_strings.xml
84 | crashlytics.properties
85 | crashlytics-build.properties
86 | fabric.properties
87 |
88 | # Editor-based Rest Client
89 | .idea/httpRequests
90 |
91 | # Android studio 3.1+ serialized cache file
92 | .idea/caches/build_file_checksums.ser
93 |
334 | ### Eclipse template
335 | .metadata
336 | bin/
337 | tmp/
338 | *.tmp
339 | *.bak
340 | *.swp
341 | *~.nib
342 | local.properties
343 | .settings/
344 | .loadpath
345 | .recommenders
346 |
347 | # External tool builders
348 | .externalToolBuilders/
349 |
350 | # Locally stored "Eclipse launch configurations"
351 | *.launch
352 |
353 | # PyDev specific (Python IDE for Eclipse)
354 | *.pydevproject
355 |
356 | # CDT-specific (C/C++ Development Tooling)
357 | .cproject
358 |
359 | # CDT- autotools
360 | .autotools
361 |
362 | # Java annotation processor (APT)
363 | .factorypath
364 |
365 | # PDT-specific (PHP Development Tools)
366 | .buildpath
367 |
368 | # sbteclipse plugin
369 | .target
370 |
371 | # Tern plugin
372 | .tern-project
373 |
374 | # TeXlipse plugin
375 | .texlipse
376 |
377 | # STS (Spring Tool Suite)
378 | .springBeans
379 |
380 | # Code Recommenders
381 | .recommenders/
382 |
383 | # Annotation Processing
384 | .apt_generated/
385 | .apt_generated_test/
386 |
387 | # Scala IDE specific (Scala & Java development for Eclipse)
388 | .cache-main
389 | .scala_dependencies
390 | .worksheet
391 |
392 | # Uncomment this line if you wish to ignore the project description file.
393 | # Typically, this file would be tracked if it contains build/dependency configurations:
394 | #.project
395 |
396 | ### CircuitPython template
397 | .Trashes
398 | .metadata_never_index
399 | .fseventsd/
400 | boot_out.txt
401 |
402 | ### Zsh template
403 | # Zsh compiled script + zrecompile backup
404 | *.zwc
405 | *.zwc.old
406 |
407 | # Zsh completion-optimization dumpfile
408 | *zcompdump*
409 |
410 | # Zsh history
411 | .zsh_history
412 |
413 | # Zsh sessions
414 | .zsh_sessions
415 |
416 | # Zsh zcalc history
417 | .zcalc_history
418 |
419 | # A popular plugin manager's files
420 | ._zinit
421 | .zinit_lstupd
422 |
423 | # zdharma/zshelldoc tool's files
424 | zsdoc/data
425 |
426 | # robbyrussell/oh-my-zsh/plugins/per-directory-history plugin's files
427 | # (when set-up to store the history in the local directory)
428 | .directory_history
429 |
430 | # MichaelAquilina/zsh-autoswitch-virtualenv plugin's files
431 | # (for Zsh plugins using Python)
432 | .venv
433 |
434 | # Zunit tests' output
435 | /tests/_output/*
436 | !/tests/_output/.gitkeep
437 |
438 | ### Python template
439 | # Byte-compiled / optimized / DLL files
440 | __pycache__/
441 | *.py[cod]
442 | *$py.class
443 |
444 | # C extensions
445 | *.so
446 |
447 | # Distribution / packaging
448 | .Python
449 | build/
450 | develop-eggs/
451 | dist/
452 | downloads/
453 | eggs/
454 | .eggs/
455 | lib/
456 | lib64/
457 | parts/
458 | sdist/
459 | var/
460 | wheels/
461 | share/python-wheels/
462 | *.egg-info/
463 | .installed.cfg
464 | *.egg
465 | MANIFEST
466 |
467 | # PyInstaller
468 | # Usually these files are written by a python script from a template
469 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
470 | *.manifest
471 | *.spec
472 |
473 | # Installer logs
474 | pip-log.txt
475 | pip-delete-this-directory.txt
476 |
477 | # Unit test / coverage reports
478 | htmlcov/
479 | .tox/
480 | .nox/
481 | .coverage
482 | .coverage.*
483 | .cache
484 | nosetests.xml
485 | coverage.xml
486 | *.cover
487 | *.py,cover
488 | .hypothesis/
489 | .pytest_cache/
490 | cover/
491 |
492 | # Translations
493 | *.mo
494 | *.pot
495 |
496 | # Django stuff:
497 | *.log
498 | local_settings.py
499 | db.sqlite3
500 | db.sqlite3-journal
501 |
502 | # Flask stuff:
503 | instance/
504 | .webassets-cache
505 |
506 | # Scrapy stuff:
507 | .scrapy
508 |
509 | # Sphinx documentation
510 | docs/_build/
511 |
512 | # PyBuilder
513 | .pybuilder/
514 | target/
515 |
516 | # Jupyter Notebook
517 | .ipynb_checkpoints
518 |
519 | # IPython
520 | profile_default/
521 | ipython_config.py
522 |
523 | # pyenv
524 | # For a library or package, you might want to ignore these files since the code is
525 | # intended to run in multiple environments; otherwise, check them in:
526 | # .python-version
527 |
528 | # pipenv
529 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
530 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
531 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
532 | # install all needed dependencies.
533 | #Pipfile.lock
534 |
535 | # poetry
536 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
537 | # This is especially recommended for binary packages to ensure reproducibility, and is more
538 | # commonly ignored for libraries.
539 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
540 | #poetry.lock
541 |
542 | # pdm
543 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
544 | #pdm.lock
545 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
546 | # in version control.
547 | # https://pdm.fming.dev/#use-with-ide
548 | .pdm.toml
549 |
550 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
551 | __pypackages__/
552 |
553 | # Celery stuff
554 | celerybeat-schedule
555 | celerybeat.pid
556 |
557 | # SageMath parsed files
558 | *.sage.py
559 |
560 | # Environments
561 | .env
562 | .venv
563 | env/
564 | venv/
565 | ENV/
566 | env.bak/
567 | venv.bak/
568 |
569 | # Spyder project settings
570 | .spyderproject
571 | .spyproject
572 |
573 | # Rope project settings
574 | .ropeproject
575 |
576 | # mkdocs documentation
577 | /site
578 |
579 | # mypy
580 | .mypy_cache/
581 | .dmypy.json
582 | dmypy.json
583 |
584 | # Pyre type checker
585 | .pyre/
586 |
587 | # pytype static type analyzer
588 | .pytype/
589 |
590 | # Cython debug symbols
591 | cython_debug/
592 |
593 | # PyCharm
594 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
595 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
596 | # and can be added to the global gitignore or merged into this file. For a more nuclear
597 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
598 | #.idea/
599 |
679 | /.idea/inspectionProfiles/profiles_settings.xml
680 | /.idea/inspectionProfiles/Project_Default.xml
681 | /.idea/deployment.xml
682 | /.idea/misc.xml
683 | /.idea/modules.xml
684 | /.idea/sshConfigs.xml
685 | /.idea/Taiwan-LLaMa.iml
686 | /.idea/vcs.xml
687 | /evaluation/zh_tw_bench/model_judgment/
688 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TAME (TAiwan Mixture of Experts): LLM for Taiwanese Culture across Diverse Domains
2 |
3 | ✍️ Online Demo • 🤗 Model Collection • 🐦 Twitter/X • 📃 Model Paper • 📃 Eval Paper • 👨️ Yen-Ting Lin
4 |
5 | Partnership with 和碩聯合科技 (Pegatron), 長庚紀念醫院 (Chang Gung Memorial Hospital), 長春集團 (Chang Chun Group), 欣興電子 (Unimicron), 律果 (Legalsign.ai), NVIDIA, and 科技報橘 (TechOrange)
6 |
7 |
18 | # 🌟 [Demo Site](https://twllm.com/)
19 |
20 | Try out Llama-3-Taiwan interactively at [twllm.com](https://twllm.com/)
21 |
22 | # ⚔️ [Chatbot Arena](https://arena.twllm.com/)
23 |
24 | Participate in the exciting [Chatbot Arena](https://arena.twllm.com/) and compete against other chatbots!
25 |
26 | # 🚀 Quick Start for Fine-tuning
27 |
28 | Using [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) for fine-tuning:
29 |
30 | ```bash
31 | # Run the axolotl docker image
32 | docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
33 |
34 | # Preprocess datasets (optional but recommended)
35 | CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess example_training_config_for_finetuning_twllm.yaml
36 |
37 | # Fine-tune
38 | accelerate launch -m axolotl.cli.train example_training_config_for_finetuning_twllm.yaml
39 |
40 | ```
41 | Check out the `example_training_config_for_finetuning_twllm.yaml` file for the detailed training configuration and parameters.
42 | For more training framework information, visit [Axolotl's GitHub repository](https://github.com/OpenAccess-AI-Collective/axolotl).
43 |
44 | --------
45 |
46 |
47 | 🚀 We're excited to introduce Llama-3-Taiwan-70B, a 70B-parameter model finetuned on a large corpus of Traditional Mandarin and English data using the Llama-3 architecture. It demonstrates state-of-the-art performance on a range of Traditional Mandarin NLP benchmarks.
48 |
49 | The model was trained with the [NVIDIA NeMo™ Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/) on NVIDIA Taipei-1, built with [NVIDIA DGX H100](https://www.nvidia.com/en-us/data-center/dgx-h100/) systems.
50 |
51 | The compute and data for training Llama-3-Taiwan-70B were generously sponsored by [Chang Gung Memorial Hospital](https://www.cgmh.org.tw/eng), [Chang Chun Group](https://www.ccp.com.tw/ccpweb.nsf/homepage?openagent), [Legalsign.ai](https://legalsign.ai/), [NVIDIA](https://www.nvidia.com/zh-tw/), [Pegatron](https://www.pegatroncorp.com/), [TechOrange](https://buzzorange.com/techorange/), and [Unimicron](https://www.unimicron.com/) (in alphabetical order).
52 |
53 | We would like to acknowledge the [contributions](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct#contributions) of our data providers, team members, and advisors in the development of this model, including [shasha77](https://www.youtube.com/@shasha77) for high-quality YouTube scripts and study materials, [Taiwan AI Labs](https://ailabs.tw/) for providing local media content, [Ubitus K.K.](https://ubitus.net/zh/) for offering gaming content, Professor Yun-Nung (Vivian) Chen for her guidance and advisement, Wei-Lin Chen for leading our pretraining data pipeline, Tzu-Han Lin for synthetic data generation, Chang-Sheng Kao for enhancing our synthetic data quality, and Kang-Chieh Chen for cleaning instruction-following data.
54 |
55 |
56 | # Model Summary
57 |
58 | Llama-3-Taiwan-70B is a large language model finetuned for Traditional Mandarin and English users. It has strong capabilities in language understanding, generation, reasoning, and multi-turn dialogue. Key features include:
59 |
60 | - 70B parameters
61 | - Languages: Traditional Mandarin (zh-tw), English (en)
62 | - Finetuned on a high-quality Traditional Mandarin and English corpus covering general knowledge as well as industry knowledge in the legal, manufacturing, medical, and electronics domains
63 | - 8K context length
64 | - Open model released under the Llama-3 license
65 |
66 | # Training Details
67 |
68 | - Training Framework: [NVIDIA NeMo](https://www.nvidia.com/zh-tw/ai-data-science/products/nemo/), [NVIDIA NeMo Megatron](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/megatron.html)
69 | - Inference Framework: [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM)
70 | - Base model: [Llama-3 70B](https://llama.meta.com/llama3/)
71 | - Hardware: [NVIDIA DGX H100](https://www.nvidia.com/zh-tw/data-center/dgx-h100/) on Taipei-1
72 | - Context length: 8K tokens ([128k version](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-128k))
73 | - Batch size: 2M tokens per step
74 |
75 | # Evaluation
76 |
77 | Check out the [Open TW LLM Leaderboard](https://huggingface.co/spaces/yentinglin/open-tw-llm-leaderboard) for the full and most up-to-date list.
78 |
79 | | Model | [TMLU](https://arxiv.org/pdf/2403.20180) | Taiwan Truthful QA | [Legal Eval](https://huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1) | [TW MT-Bench](https://huggingface.co/datasets/MediaTek-Research/TCEval-v2) | Long context | Function Calling | [TMMLU+](https://github.com/iKala/ievals) |
80 | |---------------------------------------------------------------------------------|--------------|---------------|--------------------|--------------|--------------|-----------------|-----------|
81 | | | Subject knowledge (學科知識) | Taiwan localization (台灣在地化測試) | Taiwan legal exams (台灣法律考題) | Multi-turn Mandarin dialogue (中文多輪對答) | Long-context support (長文本支援) | Function calling (函式呼叫) | |
82 | | [**yentinglin/Llama-3-Taiwan-70B-Instruct**](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct) | **74.76%** | 80.95% | 68.42% | 7.54 | [128k version](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-128k) | ✅ | 67.53% |
83 | | [**yentinglin/Llama-3-Taiwan-70B-Instruct-DPO**](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-DPO) | 74.60% | **81.75%** | **70.33%** | - | - | ✅ | - |
84 | | [**yentinglin/Llama-3-Taiwan-70B-Instruct-128k**](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-128k) | 73.01% | 80.16% | 63.64% | - | - | ✅ | - |
85 | | [**yentinglin/Llama-3-Taiwan-8B-Instruct**](https://huggingface.co/yentinglin/Llama-3-Taiwan-8B-Instruct) | 59.50% | 61.11% | 53.11% | 7.21 | [128k version](https://huggingface.co/yentinglin/Llama-3-Taiwan-8B-Instruct-128k) | ✅ | 52.28% |
86 | | [**yentinglin/Llama-3-Taiwan-8B-Instruct-DPO**](https://huggingface.co/yentinglin/Llama-3-Taiwan-8B-Instruct-DPO) | 59.88% | 59.52% | 52.63% | - | - | ✅ | - |
87 | | [**yentinglin/Llama-3-Taiwan-8B-Instruct-128k**](https://huggingface.co/yentinglin/Llama-3-Taiwan-8B-Instruct-128k) | - | - | - | - | - | ✅ | - |
88 | | [Claude-3-Opus](https://www.anthropic.com/api) | [73.59% (5-shot)](https://arxiv.org/pdf/2403.20180) | [69.84%](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc3/tree/main/opus-Taiwan-Truthful-QA) | [60.29%](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc3/tree/main/opus) | - | 200k | ✅ | - |
89 | | [GPT4-o](https://platform.openai.com/docs/api-reference/chat/create) | [65.56% (0-shot), 69.88% (5-shot)](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc3/tree/main/4o-tmlu) | [76.98%](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc3/tree/main/4o-Taiwan-Truthful-QA) | [53.59%](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc3/tree/main/4o) | - | 128k | ✅ | - |
90 | | [GPT4-turbo](https://platform.openai.com/docs/api-reference/chat/create) | [70.42% (5-shot)](https://arxiv.org/pdf/2403.20180) | - | - | - | 128k | ✅ | 60.34%^ |
91 | | [Gemini-Pro](https://ai.google.dev/gemini-api/docs) | [61.40% (5-shot)](https://arxiv.org/pdf/2403.20180) | - | - | - | 1000k | ✅ | 49.92%^ |
92 | | [GPT-3.5-turbo-1106](https://platform.openai.com/docs/api-reference/chat/create) | [49.37% (5-shot)](https://arxiv.org/pdf/2403.20180) | - | - | 7.1 | 128k | ✅ | 41.76%^ |
93 | | [Qwen1.5-110B-Chat](https://huggingface.co/Qwen/Qwen1.5-110B-Chat) | **75.69%** | 66.67% | 49.28% | - | 32k | ✅ | 65.81% |
94 | | [Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat) | 73.59% | 71.43% | 55.02% | 6.9 | 200k | ✅ | 64.10% |
95 | | [Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 70.95% | 65.08% | 52.63% | - | 8k | ✅ | 62.75% |
96 | | [Mixtral-8x22B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 55.57% | 52.38% | 44.98% | - | 64k | ✅ | 52.16% |
97 | | [Breexe-8x7B-Instruct-v0_1](https://huggingface.co/MediaTek-Research/Breexe-8x7B-Instruct-v0_1) | - | - | - | 7.2 | 8k | ❓ | 48.92% |
98 | | [c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 62.87% | 64.29% | 34.45% | - | 128k | ✅ | 49.75% |
99 | | [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 55.81% | 46.83% | 35.89% | - | 8k | ✅ | 43.38% |
100 | | [Breeze-7B-Instruct-v1_0](https://huggingface.co/MediaTek-Research/Breeze-7B-Instruct-v1_0) | 55.57% | 52.38% | 39.23% | 6.0 | 32k | ❓ | 41.77% |
101 | | [Llama3-TAIDE-LX-8B-Chat-Alpha1](https://huggingface.co/taide/Llama3-TAIDE-LX-8B-Chat-Alpha1) | 47.30% | 50.79% | 37.80% | - | 8k | ❓ | 39.03% |
102 | | [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | 40.97% | 37.30% | 27.27% | - | 4k | ❓ | 33.02% |
103 |
104 | Numbers are 0-shot by default.
105 |
106 | [Eval implementation](https://github.com/adamlin120/lm-evaluation-harness)
107 |
108 | ^ Taken from the closest matching numbers in the original dataset.
109 |
110 | ## Needle in a Haystack Evaluation
111 |
112 | The "Needle in a 出師表" evaluation tests the model's ability to locate and recall important information embedded within a large body of text, using the classic Chinese text 《出師表》 ("Memorial on Sending Out the Troops") by 諸葛亮 (Zhuge Liang).
113 |
114 | To run the evaluation, use the [script](https://github.com/adamlin120/needle-haystack/tree/main).
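
The mechanics are simple to sketch: hide an out-of-place "needle" sentence at a chosen depth inside the long text, ask the model to find it, and sweep both context length and insertion depth. The snippet below is a generic illustration of this protocol (the needle text, prompt wording, and sweep values are assumptions), not a reproduction of the linked script.

```python
# Generic needle-in-a-haystack probe builder (illustrative sketch).
chu_shi_biao_text = "臣亮言:先帝創業未半而中道崩殂……"  # full《出師表》text goes here


def build_probe(haystack: str, needle: str, depth: float, ctx_chars: int) -> str:
    """Insert `needle` at a relative `depth` (0.0 = start, 1.0 = end) of the
    first `ctx_chars` characters of `haystack`, then ask for it back."""
    body = haystack[:ctx_chars]
    cut = int(len(body) * depth)
    stuffed = body[:cut] + "\n" + needle + "\n" + body[cut:]
    return stuffed + "\n\n上述文章中藏有一句與原文無關的話,請找出並完整寫出那句話。"


# Sweep context length x insertion depth; each probe goes to the model and is
# scored on whether the needle comes back verbatim.
probes = [
    build_probe(chu_shi_biao_text, "台北的咖啡店最適合寫程式。", d / 10, n)
    for n in (2_000, 4_000, 8_000)
    for d in range(11)
]
```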
115 |
116 |
117 | # TW MT-Bench Score
118 |
119 | - Average Score: 7.5375
120 | - Maximum Score: 10
121 | - Minimum Score: 1
122 | - Median Score: 9.0
123 | - Standard Deviation: 3.03
124 | - Total Number of Scores: 160
125 | - [Model response](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc1/blob/main/Llama-3-Taiwan.jsonl)
126 | - [GPT-4 Eval](https://huggingface.co/yentinglin/Llama-3-Taiwan-70B-Instruct-rc1/blob/main/gpt-4_single.jsonl)
127 | - [Code fork from `mtkresearch/TCEval` with bug fixing](https://github.com/adamlin120/TCEval)
128 |
129 |
130 | 
131 |
132 |
133 | # Use Cases
134 |
135 | Llama-3-Taiwan-70B can be applied to a wide variety of NLP tasks in Traditional Mandarin and English, including:
136 |
137 |
138 | 1. Multi-turn dialogue (多輪對話)
139 |
140 |
141 | System: You are an AI assistant called Twllm, created by TAME (TAiwan Mixture of Expert) project.
142 |
143 | User: 嗨,你好!
144 |
145 | Assistant: 你好!我今天可以如何協助你?
146 |
147 | User: 就想聊天而已....
148 |
149 | Assistant: 當然,我很樂意和你聊天!有什麼特別想談論的嗎?
150 |
151 |
152 |
153 |
154 |
155 | 2. RAG (Retrieval-Augmented Generation, 檢索增強生成)
156 |
157 | Demo: turn on `Search Web` at [twllm.com](https://twllm.com/)
158 |
159 |
160 | 
161 |
162 |
163 | 
164 |
165 |
166 | System: You are an AI assistant called Twllm, created by TAME (TAiwan Mixture of Expert) project.
167 |
168 | User: 明天台大天氣如何?
169 |
170 | Assistant: 根據天氣預報,明天台大天氣為陣雨。低溫 24°C。10 到 15 公里/小時 的 東 風 。降雨機率為 40%。
171 |
172 |
173 |
174 |
175 |
176 | 3. Structured output, language understanding, entity recognition, function calling (格式化輸出、語言理解、實體識別、函式呼叫)
177 |
178 | If you are interested in function calling, I strongly recommend [using constrained decoding](https://huggingface.co/docs/text-generation-inference/basic_tutorials/using_guidance) to enforce `json mode`; a sketch follows below.
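
As a concrete sketch of constrained decoding: recent versions of the vLLM OpenAI-compatible server (see the Get Started section below) accept a `guided_json` schema via `extra_body`, which restricts the decoder to JSON matching the schema. The schema and prompt here are made-up examples, not an API this repo ships.

```python
# Sketch: constrained decoding via vLLM's OpenAI-compatible server.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Hypothetical schema for a function-calling style response.
weather_call_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string", "enum": ["get_weather"]},
        "arguments": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
    "required": ["name", "arguments"],
}

resp = client.chat.completions.create(
    model="yentinglin/Llama-3-Taiwan-70B-Instruct",
    messages=[{"role": "user", "content": "明天台北天氣如何?請以函式呼叫回覆。"}],
    extra_body={"guided_json": weather_call_schema},  # vLLM extension: only valid JSON can be emitted
)
print(resp.choices[0].message.content)
```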
179 |
180 | Example from HW7 of `INTRODUCTION TO GENERATIVE AI 2024 SPRING` by `HUNG-YI LEE (李宏毅)`
181 |
182 |
183 | 
184 |
185 |
186 | System: You are an AI assistant called Twllm, created by TAME (TAiwan Mixture of Expert) project.
187 |
188 | User: 您是電影評論分析師,負責情緒分析。
189 | 分析每個被評論的句子,評估句子中每個字和標點符號的重要性,
190 | 並且將結果使用 tuple 的結構列出。
191 | tuple 值的範圍介於 -1 (負向) 到 1 (正向) 之間,
192 | 然後,將評論分類為正向 (1) 或負向 (-1)。
193 | 這些需要被評論的句子,包含在 標籤中間。
194 |
195 | 輸出範例:
196 | [(<單字或標點符號>, ), ...]
197 |
198 | 根據外媒報導,宏都拉斯在去年3月與台灣斷交,並轉往與中國建交之後,卻遲遲等不到中國的金援,談好要進口宏都拉斯的白蝦也拖延許久,甚至還將白蝦價格殺價至過往與台灣交易的半價,再加上墨西哥同時也中斷了與宏國的白蝦貿易,對當地蝦農可說是雪上加霜。
199 |
200 |
201 | Assistant: [('根據', 0.2), ('外媒', 0.1), ('報導', -0.1), (',', 0.05), ('宏都拉斯', -0.3), ('在去年3月與台灣斷交, ', -0.5), ('並轉往與中國建交之後, ', -0.4), ('卻遲遲等不到中國的金援, ', -1),
202 | ('談好要進口宏都拉斯的白蝦也拖延許久, ', -1),
203 | (',甚至還將白蝦價格殺價至過往與台灣交易的半價,再加上墨西哥同時也中斷了與宏國的白蝦貿易,對當地蝦農可說是雪上加霜。 ',-1)]
204 | -1
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 | # Get Started
214 |
215 | *Caveat: Set these as stop tokens: ["USER:", "ASSISTANT:", "<|im_end|>", "<|eot_id|>", "<|end_of_text|>"]*
216 |
217 | ## Hugging Face Transformers library
218 | You can use Llama-3-Taiwan-70B with the Hugging Face Transformers library:
219 |
220 |
221 | ```python
222 | # Use a pipeline as a high-level helper
223 | from transformers import pipeline
224 |
225 | messages = [
226 | {"role": "system", "content": "You are an AI assistant called Twllm, created by TAME (TAiwan Mixture of Expert) project."},
227 | {"role": "user", "content": "你好,請問你可以完成什麼任務?"},
228 | {"role": "assistant", "content": "你好,我可以幫助您解決各種問題、提供資訊並協助完成多種任務。例如:回答技術問題、提供建議、翻譯文字、尋找資料或協助您安排行程等。請告訴我如何能幫助您。"},
229 | {"role": "user", "content": "太棒了!"},
230 | ]
231 | pipe = pipeline("text-generation", model="yentinglin/Llama-3-Taiwan-70B-Instruct", device_map="auto")  # shard the 70B weights across available GPUs
232 | pipe(messages)
233 | ```
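
To respect the stop tokens listed in the caveat above, you can pass the extra terminator ids at generation time — a minimal sketch continuing from the pipeline above (generation settings are illustrative):

```python
# Stop on both the regular EOS and Llama-3's <|eot_id|> turn delimiter.
terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
outputs = pipe(messages, max_new_tokens=256, eos_token_id=terminators)
print(outputs[0]["generated_text"][-1]["content"])  # assistant's reply
```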
234 |
235 | ## vLLM
236 |
237 | Start the server
238 | ```bash
239 | export NUM_GPUS=4
240 | export PORT=8000
241 |
242 | docker run \
243 | -e HF_TOKEN=$HF_TOKEN \
244 | --gpus '"device=0,1,2,3"' \
245 | -v ~/.cache/huggingface:/root/.cache/huggingface \
246 | -p "${PORT}:8000" \
247 | --ipc=host \
248 | vllm/vllm-openai:v0.4.0.post1 \
249 | --model "yentinglin/Llama-3-Taiwan-70B-Instruct" \
250 | -tp "${NUM_GPUS}"
251 | ```
252 |
253 | Sample client code follows; any OpenAI-API-compatible client also works.
254 |
255 | ```python
256 | # pip install "openai>=1.0.0"
257 | from openai import OpenAI
258 | # Set OpenAI's API key and API base to use vLLM's API server.
259 | openai_api_key = "EMPTY"
260 | openai_api_base = "http://localhost:8000/v1"
261 |
262 | client = OpenAI(
263 | api_key=openai_api_key,
264 | base_url=openai_api_base,
265 | )
266 |
267 | chat_response = client.chat.completions.create(
268 | model="yentinglin/Llama-3-Taiwan-70B-Instruct",
269 | messages=[
270 | {"role": "system", "content": "You are a helpful assistant."},
271 | {"role": "user", "content": "Tell me a joke."},
272 | ]
273 | )
274 | print("Chat response:", chat_response)
275 | ```
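
The stop strings from the caveat above can also be enforced server-side via the request's `stop` parameter (vLLM accepts more entries than the official OpenAI limit of four). A sketch continuing with the client above:

```python
# Cut generation at the stop strings listed in the caveat.
chat_response = client.chat.completions.create(
    model="yentinglin/Llama-3-Taiwan-70B-Instruct",
    messages=[{"role": "user", "content": "請自我介紹。"}],
    stop=["USER:", "ASSISTANT:", "<|im_end|>", "<|eot_id|>", "<|end_of_text|>"],
)
print(chat_response.choices[0].message.content)
```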
276 |
277 |
278 | Enjoy exploring the capabilities of Llama-3-Taiwan-70B! We look forward to seeing what you create with this powerful open-source model. If you have any questions or feedback, please let us know.
279 |
280 | # Citation
281 | ```
282 | @article{DBLP:journals/corr/abs-2311-17487,
283 | author = {Yen{-}Ting Lin and
284 | Yun{-}Nung Chen},
285 | title = {Taiwan {LLM:} Bridging the Linguistic Divide with a Culturally Aligned
286 | Language Model},
287 | journal = {CoRR},
288 | volume = {abs/2311.17487},
289 | year = {2023},
290 | url = {https://doi.org/10.48550/arXiv.2311.17487},
291 | doi = {10.48550/ARXIV.2311.17487},
292 | eprinttype = {arXiv},
293 | eprint = {2311.17487},
294 | timestamp = {Tue, 05 Dec 2023 14:40:42 +0100},
295 | biburl = {https://dblp.org/rec/journals/corr/abs-2311-17487.bib},
296 | bibsource = {dblp computer science bibliography, https://dblp.org}
297 | }
298 | @article{DBLP:journals/corr/abs-2403-20180,
299 | author = {Po{-}Heng Chen and
300 | Sijia Cheng and
301 | Wei{-}Lin Chen and
302 | Yen{-}Ting Lin and
303 | Yun{-}Nung Chen},
304 | title = {Measuring Taiwanese Mandarin Language Understanding},
305 | journal = {CoRR},
306 | volume = {abs/2403.20180},
307 | year = {2024},
308 | url = {https://doi.org/10.48550/arXiv.2403.20180},
309 | doi = {10.48550/ARXIV.2403.20180},
310 | eprinttype = {arXiv},
311 | eprint = {2403.20180},
312 | timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
313 | biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
314 | bibsource = {dblp computer science bibliography, https://dblp.org}
315 | }
316 | ```
317 |
318 | ## Previous Taiwan-LLM Releases
319 |
320 | The Taiwan LLM Initiative was started by Yenting Lin (林彥廷) in July 2023.
321 |
322 | - Version 1.0 was released in August 2023.
323 | - Version 2.0 was released in October 2023, sponsored by Ubitus K.K.
324 |
325 | These models are designed to support Traditional Mandarin and are optimized for Taiwanese culture and related applications. For more detailed information about our models, including demos, features, and examples, please visit our [Hugging Face collection](https://huggingface.co/collections/yentinglin/taiwan-llm-6523f5a2d6ca498dc3810f07).
326 |
327 |
328 | # Disclaimer
329 |
330 | This model is provided “as‑is” and without warranties of any kind. Users are solely responsible for evaluating the accuracy and suitability of the outputs. The developers assume no liability for any direct or indirect damages arising from its use.
331 | The model is strictly not intended for high‑risk applications such as medical diagnosis, legal advice, or financial investment. For such use cases, please consult qualified professionals.
332 |
333 | 本模型「如是」(as‑is)提供,使用者須自行評估結果之正確性與適用性。開發者對於使用本模型所引發之任何直接或間接損失,不承擔任何法律責任。
334 | 嚴禁用於醫療診斷、法律諮詢、金融投資等高風險場景;若有相關需求,請尋求專業人員協助。
335 |
--------------------------------------------------------------------------------
/evaluation/README.md:
--------------------------------------------------------------------------------
1 | # Run TC-Eval
2 |
3 | ```bash
4 | python run_tceval.py --model gpt-4-1106-preview
5 | python run_tceval.py --model claude-2.1
6 | python run_tceval.py --model yentinglin/Taiwan-LLM-13B-v2.0-chat --tensor_parallel_size 2
7 | ```
8 |
--------------------------------------------------------------------------------
/evaluation/TTQA_1.0.0_tw_llama_v1.0.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "answer": "櫻花鉤吻鮭",
4 | "examples": "它是位於亞熱帶的台灣內唯一一種溫帶性魚類,也是只產於台灣的特有櫻鮭亞種,為冰河孑遺生物。由於其相當稀有且瀕臨絕種,加上它的生活習性迥異於其他魚類,遂得「國寶魚」之美譽。\n該動物的名稱是:",
5 | "correct_answers": [
6 | "櫻花鉤吻鮭"
7 | ],
8 | "correctness_Taiwan_llama_v1.0": false,
9 | "Taiwan_llama_v1.0": "台灣特有的櫻鮭亞種通常被稱為「台灣櫻鮭」或「國寶魚」。",
10 | "correctness_chatgpt-0613": false,
11 | "chatgpt-0613": "台灣櫻鮭"
12 | },
13 | {
14 | "answer": "臺灣獼猴",
15 | "examples": "臺語也稱烏肢猴 ,是舊世界猴獼猴屬的的物種,分布於臺灣本島及澎湖,為常見的臺灣特有種,也是島上除智人外唯一的原生靈長類動物,與亞洲南部的普通獼猴以及日本的日本獼猴有近親關係。\n該動物的名稱是:",
16 | "correct_answers": [
17 | "臺灣獼猴"
18 | ],
19 | "correctness_Taiwan_llama_v1.0": true,
20 | "Taiwan_llama_v1.0": "臺灣獼猴",
21 | "correctness_chatgpt-0613": false,
22 | "chatgpt-0613": "臺語猴"
23 | },
24 | {
25 | "answer": "台灣黑熊",
26 | "examples": "是臺灣特有的亞洲黑熊亞種,胸前的V字型白色條紋是亞洲黑熊共有的特徵。\n該動物的名稱是:",
27 | "correct_answers": [
28 | "台灣黑熊"
29 | ],
30 | "correctness_Taiwan_llama_v1.0": true,
31 | "Taiwan_llama_v1.0": "臺灣黑熊",
32 | "correctness_chatgpt-0613": true,
33 | "chatgpt-0613": "臺灣黑熊"
34 | },
35 | {
36 | "answer": "藍腹鷴",
37 | "examples": "俗稱臺灣山雞,為台灣特有種。深藍色的大型雉類。雄性頭頸黑色,羽冠白色有時帶黑斑,後頸及頸側為深藍色,帶悅目的金屬光澤,上背白色,下背及尾上覆羽黑色,並具有寬闊的帶金屬光澤的藍色羽緣。以植物的果實、種子為食,也吃一些無脊椎動物。\n該動物的名稱是:",
38 | "correct_answers": [
39 | "藍腹鷴"
40 | ],
41 | "correctness_Taiwan_llama_v1.0": false,
42 | "Taiwan_llama_v1.0": "臺灣藍鵲",
43 | "correctness_chatgpt-0613": false,
44 | "chatgpt-0613": "臺灣藍鵲。"
45 | },
46 | {
47 | "answer": "台灣藍鵲",
48 | "examples": "臺灣特有種鳥類,成鳥的喙和腳呈紅色,虹膜呈金黃色,頭至頸、胸部呈黑色,翅膀末端呈白色,身體其餘部分幾乎皆呈藍色。主要分布在中、低海拔闊葉林裡,除繁殖季外多成群行動,且有非常明顯的保護領域傾向,以及伴隨而來的驅趕行為。\n該動物的名稱是:",
49 | "correct_answers": [
50 | "台灣藍鵲"
51 | ],
52 | "correctness_Taiwan_llama_v1.0": false,
53 | "Taiwan_llama_v1.0": "您所描述的鳥類是臺灣特有種「藍腹鷴」。",
54 | "correctness_chatgpt-0613": true,
55 | "chatgpt-0613": "臺灣藍鵲"
56 | },
57 | {
58 | "answer": "知本溫泉",
59 | "examples": "它位於臺灣臺東縣卑南鄉溫泉村,依地質分類屬於位於中央山脈板岩區的變質岩區溫泉。沿知本溪分布的區域共有三個出露點,分別為知本外溫泉及知本內溫泉,另一個出露點位於晴山東南方的知本溪河床。該景點的名稱是:",
60 | "correct_answers": [
61 | "知本溫泉"
62 | ],
63 | "correctness_Taiwan_llama_v1.0": true,
64 | "Taiwan_llama_v1.0": "知本溫泉。",
65 | "correctness_chatgpt-0613": true,
66 | "chatgpt-0613": "知本溫泉"
67 | },
68 | {
69 | "answer": "日月潭",
70 | "examples": "它是一個位在臺灣南投縣魚池鄉日月村的半天然淡水湖泊兼水力發電用水庫;該潭是臺灣本島面積第二大的湖泊及第一大半天然湖泊兼發電用水庫。該潭平均水面海拔約736公尺,常態面積約7.93平方公里,最高水深達27公尺。其蘊含自然生態豐富,但其中有非常多是外來種生物。該景點的名稱是:",
71 | "correct_answers": [
72 | "日月潭"
73 | ],
74 | "correctness_Taiwan_llama_v1.0": true,
75 | "Taiwan_llama_v1.0": "日月潭。",
76 | "correctness_chatgpt-0613": true,
77 | "chatgpt-0613": "日月潭"
78 | },
79 | {
80 | "answer": "臺灣海峽,台灣海峽",
81 | "examples": "介於中國大陸與台灣島之間的海域和海峽,位於東海及南海之間,是西太平洋的一部分,北東—南西走向,長約370公里,北窄南寬,北口寬約200公里,南口寬約410公里,平均寬度180公里,最窄處在台灣側新竹市南寮與福建側平潭海壇島之間,約126公里,以大陸架為主,其水深為70米。為東洋海洋史上貿易路線的重要水道和近代地緣政治戰略要點之一,海域上的島嶼有福建沿海大陸島、澎湖群島、小琉球等。\n該地理的名稱是:",
82 | "correct_answers": [
83 | "臺灣海峽",
84 | "台灣海峽"
85 | ],
86 | "correctness_Taiwan_llama_v1.0": true,
87 | "Taiwan_llama_v1.0": "該地理的名稱是台灣海峽。",
88 | "correctness_chatgpt-0613": true,
89 | "chatgpt-0613": "台灣海峽"
90 | },
91 | {
92 | "answer": "翠峰湖",
93 | "examples": "它是臺灣最大的高山湖泊,位於太平山國家森林遊樂區內,海拔1850公尺,呈葫蘆狀。形成原因至今不明,據說曾遭美軍轟炸。目前為管制進入的生態保育區,遊客僅能自遠處眺望。\n該湖泊的名稱是:",
94 | "correct_answers": [
95 | "翠峰湖"
96 | ],
97 | "correctness_Taiwan_llama_v1.0": false,
98 | "Taiwan_llama_v1.0": "該湖泊的名稱是:九龍池。",
99 | "correctness_chatgpt-0613": false,
100 | "chatgpt-0613": "明潭湖"
101 | },
102 | {
103 | "answer": "高屏溪",
104 | "examples": "它位於臺灣南部,為中央管河川,長度僅次於濁水溪。主流河長171.00公里,為全台第二長河;流域面積廣達3,256.85平方公里,分佈於南投縣南端、嘉義縣東端、台東縣西端,及高雄市、屏東縣的23個鄉鎮市區,流域面積為全台第一大河。\n該河川的名稱是:",
105 | "correct_answers": [
106 | "高屏溪"
107 | ],
108 | "correctness_Taiwan_llama_v1.0": true,
109 | "Taiwan_llama_v1.0": "它位於臺灣南部,為中央管河川,長度僅次於濁水溪。主流河長171.00公里,為全台第二長河;流域面積廣達3,256.85平方公里,分佈於南投縣南端、嘉義縣東端、台東縣西端,及高雄市、屏東縣的23個鄉鎮市區,流域面積為全台第一大河。\n該河川的名稱是:高屏溪。",
110 | "correctness_chatgpt-0613": true,
111 | "chatgpt-0613": "該河川的名稱是高屏溪。"
112 | },
113 | {
114 | "answer": "濁水溪",
115 | "examples": "古稱螺溪,下游段稱為西螺溪,位於台灣中部,屬於中央管河川,全長約186.6公里,是台灣最長的河川,流域面積達3,156.90平方公里,計畫洪水量為每秒24000立方公尺,流域面積與河口百年洪峰流量皆僅略次於高屏溪居於台灣第二,下游為彰化縣與雲林縣的界河。\n該河川的名稱是:",
116 | "correct_answers": [
117 | "濁水溪"
118 | ],
119 | "correctness_Taiwan_llama_v1.0": true,
120 | "Taiwan_llama_v1.0": "該河川的名稱是:濁水溪。",
121 | "correctness_chatgpt-0613": false,
122 | "chatgpt-0613": "螺溪"
123 | },
124 | {
125 | "answer": "龜山島",
126 | "examples": "台灣外海、太平洋中的島嶼,位於蘭陽平原東面的太平洋上一座火山島。因外型似浮龜而得名。東西寬3.1公里,南北長1.6公里,面積2.841平方公里,海岸線長9公里,直線距離台灣本島的梗枋漁港約9.1公里,距離烏石港約10公里。行政區隸屬於宜蘭縣頭城鎮龜山里。\n該島嶼的名稱是:",
127 | "correct_answers": [
128 | "龜山島"
129 | ],
130 | "correctness_Taiwan_llama_v1.0": true,
131 | "Taiwan_llama_v1.0": "你所提到的島嶼是「龜山島」。",
132 | "correctness_chatgpt-0613": true,
133 | "chatgpt-0613": "該島嶼的名稱是龜山島。"
134 | },
135 | {
136 | "answer": "琉球嶼",
137 | "examples": "是台灣島西南方的一座島,全境屬中華民國屏東縣管轄,為珊瑚礁石灰岩島嶼,面積6.802平方公里,位於屏東縣東港、高屏溪出海口西南方海面約15公里處,島上有許多石灰岩洞穴,全島在大鵬灣國家風景區範圍之內。\n該島嶼的名稱是:",
138 | "correct_answers": [
139 | "琉球嶼"
140 | ],
141 | "correctness_Taiwan_llama_v1.0": false,
142 | "Taiwan_llama_v1.0": "該島嶼的名稱是:潮州灣。",
143 | "correctness_chatgpt-0613": true,
144 | "chatgpt-0613": "琉球嶼"
145 | },
146 | {
147 | "answer": "七星山",
148 | "examples": "它位於臺灣北部的陽明山國家公園轄區內,行政區為台北市北投區,海拔1,120公尺,是台北市第一高的山岳。它為一複式火山,地理上屬於大屯火山彙,峰項置有一等三角點,山頂視野寬闊,能將整個大台北地區一覽無遺。\n該山岳的名稱是:",
149 | "correct_answers": [
150 | "七星山"
151 | ],
152 | "correctness_Taiwan_llama_v1.0": false,
153 | "Taiwan_llama_v1.0": "該山岳的名稱是陽明山。",
154 | "correctness_chatgpt-0613": false,
155 | "chatgpt-0613": "陽明山"
156 | },
157 | {
158 | "answer": "玉山",
159 | "examples": "最高峰主峰位於臺灣嘉義縣阿里山鄉中山村、南投縣信義鄉東埔村與高雄市桃源區梅山里交界處,海拔3,952.43±0.045公尺,為台灣最高峰,百岳之首,日治時由明治天皇命名為新高山,為日本最高峰。戰後1947年11月29日臺灣省政府訓令將新高山改稱為現名。為保護它及周邊極為珍貴的自然景觀生態及歷史文化資產,在1985年設立國家公園。在百岳中,代表布農的它與泰雅的雪山、阿美的秀姑巒山、南湖大山、排灣的北大武山合稱「五嶽」,為臺灣最具代表性的五座高山。\n該風景的名稱是:",
160 | "correct_answers": [
161 | "玉山"
162 | ],
163 | "correctness_Taiwan_llama_v1.0": true,
164 | "Taiwan_llama_v1.0": "玉山",
165 | "correctness_chatgpt-0613": false,
166 | "chatgpt-0613": "阿里山"
167 | },
168 | {
169 | "answer": "高雄85大樓",
170 | "examples": "位於高雄市苓雅區,緊鄰著高雄港和新光碼頭,是85層樓高的摩天大樓,結構高度為347.5公尺,加上天線為378公尺,目前為高雄第一高樓、台灣第二高的摩天大樓。\n該建築的名稱是:",
171 | "correct_answers": [
172 | "高雄85大樓"
173 | ],
174 | "correctness_Taiwan_llama_v1.0": true,
175 | "Taiwan_llama_v1.0": "該建築的名稱是:高雄85大樓。",
176 | "correctness_chatgpt-0613": true,
177 | "chatgpt-0613": "該建築的名稱是85大樓。"
178 | },
179 | {
180 | "answer": "林本源園邸",
181 | "examples": "它位於新北市板橋區,為板橋林本源家族興建的房舍,仿蘇州留園設計,是目前臺灣僅存最完整的園林建築。該居可追溯至1847年,當時為林本源家族北上屯租的租館,後經林國華、林國芳兩兄弟擴建,成為林本源家族之居所。庭園部分包含來青閣、月波水榭、定靜堂等多處房舍與人造山水。\n該建築的名稱是:",
182 | "correct_answers": [
183 | "林本源園邸"
184 | ],
185 | "correctness_Taiwan_llama_v1.0": true,
186 | "Taiwan_llama_v1.0": "林家花園",
187 | "correctness_chatgpt-0613": true,
188 | "chatgpt-0613": "林本源園林"
189 | },
190 | {
191 | "answer": "台北101",
192 | "examples": "一座位於臺灣臺北市信義區的超高層摩天大樓,樓高509.2公尺,地上101層、地下5層。當前為台灣第一高樓、以及唯一一棟高度超過500公尺、樓層超過100層的建築物,曾於2004年12月1日至2010年1月7日間擁有世界第一高樓的紀錄。\n該建築的名稱是:",
193 | "correct_answers": [
194 | "台北101"
195 | ],
196 | "correctness_Taiwan_llama_v1.0": true,
197 | "Taiwan_llama_v1.0": "該建築的名稱是台北101。",
198 | "correctness_chatgpt-0613": true,
199 | "chatgpt-0613": "台北101"
200 | },
201 | {
202 | "answer": "國立故宮博物院,故宮博物院,故宮,臺灣故宮,台灣故宮",
203 | "examples": "為臺灣最具規模的博物館以及臺灣八景之一,也是古代中國藝術史與漢學研究機構。典藏為數近70萬件文物與藝術作品,大部分是原先國立北平故宮博物院、國立中央博物院籌備處和國立北平圖書館等機構所藏來自紫禁城、盛京行宮、避暑山莊、頤和園、靜宜園和國子監等處皇家舊藏;另有部分是編列預算購置,接收自第二次世界大戰結束日本歸還部分文物,以及透過各界捐贈和徵集而來。\n該建築的名稱是:",
204 | "correct_answers": [
205 | "國立故宮博物院",
206 | "故宮博物院",
207 | "故宮",
208 | "臺灣故宮",
209 | "台灣故宮"
210 | ],
211 | "correctness_Taiwan_llama_v1.0": true,
212 | "Taiwan_llama_v1.0": "臺北國立故宮博物院",
213 | "correctness_chatgpt-0613": true,
214 | "chatgpt-0613": "該建築的名稱是故宮博物院。"
215 | },
216 | {
217 | "answer": "熱蘭遮城,安平古堡",
218 | "examples": "一座位於臺灣臺南市安平區的一個城郭遺址。於1623年10月由荷蘭東印度公司建立,1624年3月拆毀,8月重建,是臺灣最早的要塞建築。自建城以來,曾是荷蘭人統治臺灣的中樞。此外,該城郭遺址於黃昏時與落日互相煇映之景「安平夕照」。在1953年曾獲臺灣省政府選為臺灣八景之一。\n該建築的名稱是:",
219 | "correct_answers": [
220 | "熱蘭遮城",
221 | "安平古堡"
222 | ],
223 | "correctness_Taiwan_llama_v1.0": true,
224 | "Taiwan_llama_v1.0": "安平古堡",
225 | "correctness_chatgpt-0613": true,
226 | "chatgpt-0613": "安平古堡"
227 | },
228 | {
229 | "answer": "中正紀念堂",
230 | "examples": "位於中華民國臺北市中正區的國家紀念建築,是中華民國政府為紀念已故前總統蔣中正而興建,落成以來即成為臺北重要地標與觀光景點之一。園區廣場前方兩廳院稱為「藝文廣場」,廣場南北側另建有國家戲劇院以及國家音樂廳,合稱「國家兩廳院」,由國家表演藝術中心管理。\n該建築的名稱是:",
231 | "correct_answers": [
232 | "中正紀念堂"
233 | ],
234 | "correctness_Taiwan_llama_v1.0": true,
235 | "Taiwan_llama_v1.0": "臺北中正紀念堂",
236 | "correctness_chatgpt-0613": true,
237 | "chatgpt-0613": "中正紀念堂"
238 | },
239 | {
240 | "answer": "艋舺龍山寺,萬華龍山寺",
241 | "examples": "位於臺灣艋舺(今臺北市萬華區)富民里的觀音寺,為清治時期臺北城泉州三邑人的信仰中心,近代成為外國旅客觀光景點,今建築列為國定古蹟。正殿除觀世音菩薩外,還有文殊菩薩、普賢菩薩、十八羅漢、伽藍菩薩、韋馱菩薩、釋迦牟尼佛苦行像。前殿有三寶佛塑像,後殿有天上聖母殿、文昌帝君殿、關聖帝君殿,一旁還有華佗廳、月老廳、太歲廳等。\n該建築的名稱是:",
242 | "correct_answers": [
243 | "艋舺龍山寺",
244 | "萬華龍山寺"
245 | ],
246 | "correctness_Taiwan_llama_v1.0": false,
247 | "Taiwan_llama_v1.0": "觀音寺",
248 | "correctness_chatgpt-0613": false,
249 | "chatgpt-0613": "觀音寺"
250 | },
251 | {
252 | "answer": "國立公共資訊圖書館",
253 | "examples": "它是中華民國三所國立圖書館之一,位於臺灣臺中市南區,有著輔導地方公共圖書館的任務。此外,該館還設有2座分館。該館的前身為1923年成立的臺中州立圖書館,並在之後歷經3次遷館、4次改名及多次改隸。\n該圖書館的名稱是:",
254 | "correct_answers": [
255 | "國立公共資訊圖書館"
256 | ],
257 | "correctness_Taiwan_llama_v1.0": true,
258 | "Taiwan_llama_v1.0": "國立公共資訊圖書館",
259 | "correctness_chatgpt-0613": false,
260 | "chatgpt-0613": "國立中央圖書館臺中分館"
261 | },
262 | {
263 | "answer": "大甲鎮瀾宮",
264 | "examples": "它是一座位在臺灣臺中市大甲區的媽祖廟。該廟是臺灣台灣媽祖信仰的代表廟宇之一。每年的大甲媽祖遶境進香現為臺灣民間信仰中受矚目的宗教盛事之一,並被列為中華民國無形文化資產民俗類重要民俗之一。此外,該廟曾前往湄洲媽祖祖廟謁祖進香。它的名稱是:",
265 | "correct_answers": [
266 | "大甲鎮瀾宮"
267 | ],
268 | "correctness_Taiwan_llama_v1.0": true,
269 | "Taiwan_llama_v1.0": "大甲鎮瀾宮",
270 | "correctness_chatgpt-0613": false,
271 | "chatgpt-0613": "大甲媽祖廟"
272 | },
273 | {
274 | "answer": "圓山大飯店",
275 | "examples": "是位於臺灣臺北市中山區劍潭山的旅館,成立於第二次世界大戰後,早年為臺灣首屈一指的大型國際性飯店。目前所見的中國宮殿風格建築於1973年落成,是臺北地標之一。飯店屋顶使用歇山式。飯店的建築上採用相當多的龍形雕刻,故有人稱此飯店為「龍宮」;除採用龍形之外,亦有石獅、梅花等中國建築常用的圖案。\n該建築的名稱是:",
276 | "correct_answers": [
277 | "圓山大飯店"
278 | ],
279 | "correctness_Taiwan_llama_v1.0": true,
280 | "Taiwan_llama_v1.0": "該建築的名稱是:圓山大飯店。",
281 | "correctness_chatgpt-0613": true,
282 | "chatgpt-0613": "圓山大飯店"
283 | },
284 | {
285 | "answer": "林懷民",
286 | "examples": "臺灣第一個現代舞劇團「雲門舞集」的創辦人與藝術總監。2006年獲選為Discovery頻道《台灣人物誌》的6名主角之1。\n他的名字是:",
287 | "correct_answers": [
288 | "林懷民"
289 | ],
290 | "correctness_Taiwan_llama_v1.0": true,
291 | "Taiwan_llama_v1.0": "林懷民",
292 | "correctness_chatgpt-0613": true,
293 | "chatgpt-0613": "林懷民"
294 | },
295 | {
296 | "answer": "李泰祥",
297 | "examples": "一生致力將古典音樂通俗化、將中國民歌重新賦予現代化編曲。他的創作編曲包括〈橄欖樹〉、〈告別〉等。他的嚴肅音樂作品則包括第一、第二號《弦樂四重奏》、《太虛吟》、鋼琴五重奏《氣、斷、流》、《生民》、《幻境三章》、《山和田》客家風組曲、《台灣素描》、《狩獵》等。大型劇樂作品包括清唱劇《大神祭》、歌劇《大風起兮》,以及為雲門舞集創造的舞劇音樂等。他的名字是:",
298 | "correct_answers": [
299 | "李泰祥"
300 | ],
301 | "correctness_Taiwan_llama_v1.0": false,
302 | "Taiwan_llama_v1.0": "錢南章",
303 | "correctness_chatgpt-0613": false,
304 | "chatgpt-0613": "胡德夫"
305 | },
306 | {
307 | "answer": "侯孝賢",
308 | "examples": "臺灣電影導演,其電影在國內外各大重要影展獲得重大獎項,為台灣新電影代表人物之一。曾榮獲三屆金馬獎最佳導演獎,1989年的《悲情城市》獲得第46屆威尼斯影展金獅獎,1993年憑藉《戲夢人生》獲得第46屆坎城影展評審團獎。2015年以《刺客聶隱娘》獲得第68屆坎城影展最佳導演獎及第52屆金馬獎最佳導演獎與金馬獎最佳劇情片獎。2020年獲得第57屆金馬獎終身成就獎。其喜愛使用長鏡頭、空鏡頭與固定鏡位,讓人物直接在鏡頭中說故事,是他電影的一大特色。目前是臺灣電影文化協會榮譽理事長。他的名字是:",
309 | "correct_answers": [
310 | "侯孝賢"
311 | ],
312 | "correctness_Taiwan_llama_v1.0": true,
313 | "Taiwan_llama_v1.0": "你所提到的電影導演是:侯孝賢。",
314 | "correctness_chatgpt-0613": true,
315 | "chatgpt-0613": "侯孝賢。"
316 | },
317 | {
318 | "answer": "李安",
319 | "examples": "臺灣導演,1999年執導的《臥虎藏龍》獲得第73屆奧斯卡最佳外語片獎及三個技術獎項。2006年和2013年則分別以《斷背山》和《少年Pi的奇幻漂流》獲得第78屆奧斯卡金像獎與第85屆奧斯卡金像獎「最佳導演獎」,是第一位獲得該獎項的亞洲導演,也是至今唯一兩度獲得該獎項的亞洲導演。\n他的名字是:",
320 | "correct_answers": [
321 | "李安"
322 | ],
323 | "correctness_Taiwan_llama_v1.0": true,
324 | "Taiwan_llama_v1.0": "李安",
325 | "correctness_chatgpt-0613": true,
326 | "chatgpt-0613": "李安"
327 | },
328 | {
329 | "answer": "白先勇",
330 | "examples": "知名文學家、劇作家等。他的代表作有小說集《臺北人》、《紐約客》,長篇小說《孽子》等,散文《樹猶如此》等。他也致力於崑曲文化的保存與推廣。晚年先後編寫了數部父親白崇禧的傳記和關於紅樓夢的導讀:細說紅樓夢。\n他的名字是:",
331 | "correct_answers": [
332 | "白先勇"
333 | ],
334 | "correctness_Taiwan_llama_v1.0": false,
335 | "Taiwan_llama_v1.0": "沈從文",
336 | "correctness_chatgpt-0613": false,
337 | "chatgpt-0613": "林語堂"
338 | },
339 | {
340 | "answer": "張愛玲",
341 | "examples": "她原名張瑛,筆名梁京等,是出身上海、後來長年定居於美國的散文家、小說家、劇本作家及劇本評論家。著名作品有《傾城之戀》、《半生缘》、《金鎖記》。她的名字是:",
342 | "correct_answers": [
343 | "張愛玲"
344 | ],
345 | "correctness_Taiwan_llama_v1.0": true,
346 | "Taiwan_llama_v1.0": "張愛玲",
347 | "correctness_chatgpt-0613": true,
348 | "chatgpt-0613": "張愛玲"
349 | },
350 | {
351 | "answer": "李遠哲",
352 | "examples": "臺灣化學家,1986年獲得諾貝爾化學獎,是第一位出生及成長於臺灣的諾貝爾獎得主。\n他的名字是:",
353 | "correct_answers": [
354 | "李遠哲"
355 | ],
356 | "correctness_Taiwan_llama_v1.0": true,
357 | "Taiwan_llama_v1.0": "李遠哲",
358 | "correctness_chatgpt-0613": true,
359 | "chatgpt-0613": "李遠哲"
360 | },
361 | {
362 | "answer": "孫越",
363 | "examples": "臺灣男演員、中華民國陸軍裝甲兵、中華民國總統府國策顧問、終身志願工作者,籍貫浙江省餘姚縣。曾參與多部電影的演出。他主持公共電視節目《孫叔叔說故事》而被稱為「孫叔叔」。1989年宣佈退出商業演出,全力投入慈善活動,只從事義務性、公益性演出,包括節目主持、宣導廣告等。第20屆金馬影帝。代表作品有《揚子江風雲》、《搭錯車》、《群龍戲鳳》等。他的名字是:",
364 | "correct_answers": [
365 | "孫越"
366 | ],
367 | "correctness_Taiwan_llama_v1.0": true,
368 | "Taiwan_llama_v1.0": "您所提到的人物是臺灣演員孫越。",
369 | "correctness_chatgpt-0613": true,
370 | "chatgpt-0613": "孫越"
371 | },
372 | {
373 | "answer": "吳清友",
374 | "examples": "臺灣臺南市將軍區人,以經營餐旅廚房設備業起家。他是誠品書店創辦人。\n他的名字是:",
375 | "correct_answers": [
376 | "吳清友"
377 | ],
378 | "correctness_Taiwan_llama_v1.0": true,
379 | "Taiwan_llama_v1.0": "吳清友",
380 | "correctness_chatgpt-0613": true,
381 | "chatgpt-0613": "吳清友"
382 | },
383 | {
384 | "answer": "戴資穎",
385 | "examples": "臺灣女子羽毛球運動員,專精單打,效力於合作金庫羽球隊,從2009年開始代表中華台北出戰國際賽。她的球路多變且步伐輕快,擅於使用假動作並配合細膩的網前手法打亂對手節奏而得分,並憑藉其優異體能執行四方球拉吊突擊戰術,使對手因疲於奔命體力下降而失誤丟分,也可在處於被動時因其優異的反拍擺脫能力奪回主動而致勝。她的名字是:",
386 | "correct_answers": [
387 | "戴資穎"
388 | ],
389 | "correctness_Taiwan_llama_v1.0": true,
390 | "Taiwan_llama_v1.0": "你所指的運動員是「世界球后」戴資穎。",
391 | "correctness_chatgpt-0613": true,
392 | "chatgpt-0613": "戴資穎"
393 | },
394 | {
395 | "answer": "黑白切",
396 | "examples": "它的名稱黑白一詞取自臺語「烏白」,意指「隨意」,是指顧客在沒有明確想吃的菜色和不知曉店家有哪些好吃的小菜時,在告知預算或人數的前提下,請店家隨意切些招牌菜的吃法。這種可以快速端上桌並能完整利用食材的料理,很受消費者青睞,在電影、電視中或是在夜市、路邊攤販、小吃店都頻頻出現。這項處處可見的小吃成為了台灣的飲食文化一環。\n它的名稱是:",
397 | "correct_answers": [
398 | "黑白切"
399 | ],
400 | "correctness_Taiwan_llama_v1.0": true,
401 | "Taiwan_llama_v1.0": "黑白切",
402 | "correctness_chatgpt-0613": true,
403 | "chatgpt-0613": "黑白切"
404 | },
405 | {
406 | "answer": "蚵仔煎",
407 | "examples": "是一道以新鮮蚵仔肉裹以番薯粉煎炸而成的海鮮食品。\n它的名稱是:",
408 | "correct_answers": [
409 | "蚵仔煎"
410 | ],
411 | "correctness_Taiwan_llama_v1.0": true,
412 | "Taiwan_llama_v1.0": "蚵仔煎",
413 | "correctness_chatgpt-0613": true,
414 | "chatgpt-0613": "蚵仔煎"
415 | },
416 | {
417 | "answer": "鳳梨酥",
418 | "examples": "臺灣的傳統糕點,源自龍鳳大餅之甜味鳳餅,以及雙餡料(鳳梨、冬瓜)大餅鴛鴦餅的習俗,以西式曲奇餅酥鬆餅皮發展而成的台式點心。 其主要原料為麵粉、奶油、糖、蛋、冬瓜醬(也可用純鳳梨或冬瓜混合鳳梨製作)。外皮酥、內餡軟。該食物的名稱是:",
419 | "correct_answers": [
420 | "鳳梨酥"
421 | ],
422 | "correctness_Taiwan_llama_v1.0": true,
423 | "Taiwan_llama_v1.0": "你所描述的食物稱為\"鳳梨酥\"。",
424 | "correctness_chatgpt-0613": true,
425 | "chatgpt-0613": "鳳梨酥。"
426 | },
427 | {
428 | "answer": "臭豆腐",
429 | "examples": "由豆腐發酵製作而來,是臺灣、長沙、南京、紹興等地的小吃之一,並流傳世界其它地方,但各地的製作方式、食用方法均有相當大的差異。屬於高鈉食品,但富含發酵食品中普遍存在的維生素B12,另外含有必須胺基酸離胺酸(lysine),應避免與還原醣共同加熱,以免產生糖化終產物等致癌物。\n該食物的名稱是:",
430 | "correct_answers": [
431 | "臭豆腐"
432 | ],
433 | "correctness_Taiwan_llama_v1.0": false,
434 | "Taiwan_llama_v1.0": "豆腐",
435 | "correctness_chatgpt-0613": false,
436 | "chatgpt-0613": "豆腐乳"
437 | },
438 | {
439 | "answer": "珍珠奶茶",
440 | "examples": "是1980年代起源於臺灣的茶類飲料,為臺灣泡沫紅茶、粉圓茶飲文化中的分支,將粉圓加入奶茶中,為臺灣最具代表性的手搖飲料之一,並從臺灣流行至世界各地。\n該食物的名稱是:",
441 | "correct_answers": [
442 | "珍珠奶茶"
443 | ],
444 | "correctness_Taiwan_llama_v1.0": false,
445 | "Taiwan_llama_v1.0": "你要找的食物是「粉圓」。",
446 | "correctness_chatgpt-0613": true,
447 | "chatgpt-0613": "珍珠奶茶"
448 | },
449 | {
450 | "answer": "滷肉飯,肉臊飯",
451 | "examples": "在臺灣被視為極具台灣特色的民眾小吃,在全臺各地都有店家販售。在北臺灣為一種淋上含有豬皮的滷肥肉切丁及醬油滷汁的白飯菜餚。在南台灣,通常是以肥瘦一定比例的豬絞肉滷製,但北台灣通常用含有煮熟碎豬肉或炒香肉臊(豬絞肉)及醬油滷汁,有時醬汁裡亦會有香菇丁等的成份在內,兩者的口感和口味略有不同。\n該食物的名稱是:",
452 | "correct_answers": [
453 | "滷肉飯",
454 | "肉臊飯"
455 | ],
456 | "correctness_Taiwan_llama_v1.0": true,
457 | "Taiwan_llama_v1.0": "臺灣的民眾小吃,滷肉飯",
458 | "correctness_chatgpt-0613": true,
459 | "chatgpt-0613": "滷肉飯"
460 | },
461 | {
462 | "answer": "牛肉麵,臺灣牛肉麵",
463 | "examples": "一道臺灣常見的麵食,以燉煮過的牛肉為主要配料之湯麵。戰後中華民國政府遷台後,渡海來台的外省老兵融合家鄉風味與臺灣在地元素製成。\n該食物的名稱是:",
464 | "correct_answers": [
465 | "牛肉麵",
466 | "臺灣牛肉麵"
467 | ],
468 | "correctness_Taiwan_llama_v1.0": true,
469 | "Taiwan_llama_v1.0": "你所描述的食物是「牛肉麵」。",
470 | "correctness_chatgpt-0613": true,
471 | "chatgpt-0613": "牛肉麵"
472 | },
473 | {
474 | "answer": "台灣啤酒,台啤",
475 | "examples": "由財政部100%持股的公營事業台灣菸酒股份有限公司所發售的啤酒品牌,為台灣最受歡迎的啤酒品牌之一,飲用族群包含上班族、青年等。\n該飲品的名稱是:",
476 | "correct_answers": [
477 | "台灣啤酒",
478 | "台啤"
479 | ],
480 | "correctness_Taiwan_llama_v1.0": true,
481 | "Taiwan_llama_v1.0": "由財政部100%持股的公營事業台灣菸酒股份有限公司所發售的啤酒品牌,為台灣最受歡迎的啤酒品牌之一,飲用族群包含上班族、青年等,該飲品的名稱為「台灣啤酒」。",
482 | "correctness_chatgpt-0613": true,
483 | "chatgpt-0613": "台灣啤酒"
484 | },
485 | {
486 | "answer": "小籠包",
487 | "examples": "是一種誕生於中國江南地區的著名點心,多處地方亦盛行,例如廣東、香港、台灣,以“體小、餡大、汁多、味鮮、皮薄、形美”而著稱。台灣鼎泰豐的招牌點心之一。該點心的名稱是:",
488 | "correct_answers": [
489 | "小籠包"
490 | ],
491 | "correctness_Taiwan_llama_v1.0": true,
492 | "Taiwan_llama_v1.0": "您所指的該點心是「小籠包」。",
493 | "correctness_chatgpt-0613": true,
494 | "chatgpt-0613": "小籠包"
495 | },
496 | {
497 | "answer": "蔥油餅",
498 | "examples": "一種由蔥花作成的煎餅,在華人社會街頭如臺灣、香港、中國大陸,以及馬來西亞、新加坡等地區常見,常作為早餐或小吃食用。\n該食物的名稱是:",
499 | "correct_answers": [
500 | "蔥油餅"
501 | ],
502 | "correctness_Taiwan_llama_v1.0": true,
503 | "Taiwan_llama_v1.0": "蔥油餅",
504 | "correctness_chatgpt-0613": true,
505 | "chatgpt-0613": "蔥油餅"
506 | },
507 | {
508 | "answer": "中華航空,華航",
509 | "examples": "它是中華民國的國家航空公司,也是臺灣最大的民用航空業者,為集團的核心企業。主要轉運樞紐為桃園國際機場,總部則設立於鄰近桃園國際機場的園區。目前以經營國際航線及兩岸航線為主,航點遍佈30個以上的國家及地區。其成立之初亦有經營國內航線,惟考量企業內部資源及營運重心整合,自1998年全部轉由子公司華信航空經營。旗下還擁有低成本航空公司台灣虎航,以搶攻臺灣出發或轉運的國際低成本航空市場。它的名稱是:",
510 | "correct_answers": [
511 | "中華航空",
512 | "華航"
513 | ],
514 | "correctness_Taiwan_llama_v1.0": true,
515 | "Taiwan_llama_v1.0": "華航(China Airlines)",
516 | "correctness_chatgpt-0613": true,
517 | "chatgpt-0613": "中華航空"
518 | },
519 | {
520 | "answer": "台灣高鐵,台灣高速鐵路,高鐵",
521 | "examples": "它是臺灣的高速鐵路系統,全線縱貫臺灣人口密集的西部走廊,路線全長350公里,最高營運時速300公里,連結臺灣南北成為一日生活圈,往返臺北、高雄南北2大都市間最快只需95分鐘。2007年1月5日通車後,成為臺灣西部重要的長途運輸工具之一,亦為臺灣鐵路運輸工業指標。在2019年,南北雙向每日共有130至162班次,平均日載客量達約184,000人次,週末與連續假期則有每日約300,000人次的水準。目前單日運量最高紀錄為約318,000餘人,落在2019年9月15日。它的名稱是:",
522 | "correct_answers": [
523 | "台灣高鐵",
524 | "台灣高速鐵路",
525 | "高鐵"
526 | ],
527 | "correctness_Taiwan_llama_v1.0": true,
528 | "Taiwan_llama_v1.0": "臺灣高速鐵路。",
529 | "correctness_chatgpt-0613": true,
530 | "chatgpt-0613": "臺灣高鐵"
531 | },
532 | {
533 | "answer": "聯發科,聯發科技,聯發科技股份有限公司",
534 | "examples": "是臺灣一家為無線通訊、高清電視設計系统芯片的無廠半導體公司。公司成立於1997年,總部位於新竹科學園區,在全球設有25個分公司和辦事處,2013年成為全球第四大無晶圓廠IC設計商,2016年成為全球第三大,2020年憑藉天璣系列晶片成為全球市場佔有率最大。該公司的名稱是:",
535 | "correct_answers": [
536 | "聯發科",
537 | "聯發科技",
538 | "聯發科技股份有限公司"
539 | ],
540 | "correctness_Taiwan_llama_v1.0": true,
541 | "Taiwan_llama_v1.0": "臺灣一家為無線通訊、高清電視設計系统芯片的無廠半導體公司是聯發科技股份有限公司。",
542 | "correctness_chatgpt-0613": true,
543 | "chatgpt-0613": "聯發科技股份有限公司(MediaTek Inc.)"
544 | },
545 | {
546 | "answer": "捷安特",
547 | "examples": "一家總部位於臺灣臺中的跨國自行車製造商。1972年由劉金標等人成立。該公司的名稱是:",
548 | "correct_answers": [
549 | "捷安特"
550 | ],
551 | "correctness_Taiwan_llama_v1.0": false,
552 | "Taiwan_llama_v1.0": "該公司的名稱是「美利達」。",
553 | "correctness_chatgpt-0613": false,
554 | "chatgpt-0613": "美利達"
555 | },
556 | {
557 | "answer": "統一企業",
558 | "examples": "一家臺灣食品公司,為台南幫的領頭企業之一,總部位於臺南市永康區的鹽行。成立於1967年,於1987年12月28日在臺灣證券交易所上市,股票代號為1216。該公司的事業版圖橫跨零售、物流、貿易、投資、建設等領域;旗下最大子公司為跨國連鎖超商7-Eleven在臺灣的特許經營商,為臺灣據點數量最多的連鎖零售通路。其亦透過全資子公司經營中華職棒球隊,是中華職棒目前唯一的元老球隊。\n該集團的名稱是:",
559 | "correct_answers": [
560 | "統一企業"
561 | ],
562 | "correctness_Taiwan_llama_v1.0": true,
563 | "Taiwan_llama_v1.0": "正確,該集團的名稱是「統一企業」。",
564 | "correctness_chatgpt-0613": true,
565 | "chatgpt-0613": "該集團的名稱是統一企業集團。"
566 | },
567 | {
568 | "answer": "法鼓山",
569 | "examples": "它的創辦人為釋聖嚴法師,他對人類與宗教有不可抹滅的影響力,也曾代表中華民國在聯合國發表宗教大同的世界和平宣言。為中華民國的大乘佛教團體,被民間歸類為台灣佛教四大山頭之一的佛教團體。\n該宗教場所的名稱是:",
570 | "correct_answers": [
571 | "法鼓山"
572 | ],
573 | "correctness_Taiwan_llama_v1.0": true,
574 | "Taiwan_llama_v1.0": "法鼓山",
575 | "correctness_chatgpt-0613": true,
576 | "chatgpt-0613": "法鼓山"
577 | },
578 | {
579 | "answer": "媽祖",
580 | "examples": "流行於中國東南沿海的浙江、福建、廣東、海南、膠東,以及台灣、琉球、日本及新加坡等東海和南海周邊地區的海神信仰。該海神信仰的名稱是:",
581 | "correct_answers": [
582 | "媽祖"
583 | ],
584 | "correctness_Taiwan_llama_v1.0": false,
585 | "Taiwan_llama_v1.0": "該海神信仰通常被稱為「海神」或「海洋之神」。",
586 | "correctness_chatgpt-0613": true,
587 | "chatgpt-0613": "妈祖信仰"
588 | },
589 | {
590 | "answer": "釋證嚴,證嚴法師,證嚴上人",
591 | "examples": "他俗名王錦雲,法名證嚴,字慧璋,出家前自號靜思,生於臺灣臺中州大甲郡清水街,臺灣佛教比丘尼,慈濟宗的創辦者,其信徒多稱其為師公上人。自行剃度,1963年於印順導師門下出家,後返回花蓮縣秀林鄉佳民村的普明寺苦修。\n該禪師的名字是:",
592 | "correct_answers": [
593 | "釋證嚴",
594 | "證嚴法師",
595 | "證嚴上人"
596 | ],
597 | "correctness_Taiwan_llama_v1.0": true,
598 | "Taiwan_llama_v1.0": "證嚴",
599 | "correctness_chatgpt-0613": false,
600 | "chatgpt-0613": "王錦雲"
601 | },
602 | {
603 | "answer": "士林夜市",
604 | "examples": "位於臺灣臺北市士林區。範圍以市定古蹟士林公有市場為中心,東至文林路、西至基河路、北至小北街與小西街的三角地帶,是台北市內最大、亦是全台打卡次數最高的夜市地標。曾多次獲選為台灣代表夜市、觀光客來台必去景點首選,揚名國際。2018年,在OpView「台灣十大熱門夜市」調查中,網路聲量第一,成為台灣討論度最高之夜市。它原本是以小吃與攤商而興起,迄今仍以此為主;然而由於它的規模與商機極大,周邊地主紛紛開設店面趕上商機。在攤販並未完全合法的城市裡,它與其他大型固定夜市共同維繫著這一特殊的文化現象,也成為台北市民夜生活的經常去的地方。它的名稱是:",
605 | "correct_answers": [
606 | "士林夜市"
607 | ],
608 | "correctness_Taiwan_llama_v1.0": true,
609 | "Taiwan_llama_v1.0": "士林夜市",
610 | "correctness_chatgpt-0613": true,
611 | "chatgpt-0613": "士林夜市"
612 | },
613 | {
614 | "answer": "統一發票",
615 | "examples": "它的定義是指中華民國各種營利機構銷售貨物或提供勞務給消費者後,營業人與買受人雙方分別保存與收執的制式憑證。它的名稱是:",
616 | "correct_answers": [
617 | "統一發票"
618 | ],
619 | "correctness_Taiwan_llama_v1.0": true,
620 | "Taiwan_llama_v1.0": "您所指的制式憑證是統一發票。根據中華民國稅法規定,營業人向買受人銷售貨物或提供勞務時,應開立統一發票作為收",
621 | "correctness_chatgpt-0613": true,
622 | "chatgpt-0613": "發票"
623 | },
624 | {
625 | "answer": "櫃買中心,櫃買,財團法人中華民國證券櫃檯買賣中心,台灣證券櫃檯買賣中心",
626 | "examples": "它是承辦台灣證券櫃檯買賣業務的公益性財團法人組織,有「台灣的NASDAQ」之稱。目前上櫃公司約有775檔,興櫃公司約有248檔,創櫃公司約有89檔。它的名稱是:",
627 | "correct_answers": [
628 | "櫃買中心",
629 | "櫃買",
630 | "財團法人中華民國證券櫃檯買賣中心",
631 | "台灣證券櫃檯買賣中心"
632 | ],
633 | "correctness_Taiwan_llama_v1.0": false,
634 | "Taiwan_llama_v1.0": "你所指的組織是「臺灣證券交易所」。",
635 | "correctness_chatgpt-0613": false,
636 | "chatgpt-0613": "台灣證券交易所"
637 | },
638 | {
639 | "answer": "中央銀行,央行",
640 | "examples": "它是中華民國的國家銀行,直屬於行政院,具有部會級地位,肩負穩定國家金融發展、維持物價平穩、維護國幣幣值、管理中華民國外匯存底等重要事務。它的名稱是:",
641 | "correct_answers": [
642 | "中央銀行",
643 | "央行"
644 | ],
645 | "correctness_Taiwan_llama_v1.0": true,
646 | "Taiwan_llama_v1.0": "中華民國的國家銀行是央行。",
647 | "correctness_chatgpt-0613": true,
648 | "chatgpt-0613": "中華民國中央銀行"
649 | },
650 | {
651 | "answer": "新臺幣",
652 | "examples": "於1949年6月15日起發行流通,是中華民國現行的法定貨幣。當時定位為限定臺灣使用的區域貨幣,今日則在整個中華民國有效統治區域均可流通使用。它的名稱是:",
653 | "correct_answers": [
654 | "新臺幣"
655 | ],
656 | "correctness_Taiwan_llama_v1.0": true,
657 | "Taiwan_llama_v1.0": "新台幣(New Taiwan Dollar)",
658 | "correctness_chatgpt-0613": true,
659 | "chatgpt-0613": "新台幣"
660 | },
661 | {
662 | "answer": "淡水信義線",
663 | "examples": "路線代號為R,路線代表色為紅色,是台北捷運一條營運中的高運量路線及台北捷運唯一一條有高架路段與平面路段的高運量路線。該捷運路線的名稱是:",
664 | "correct_answers": [
665 | "淡水信義線"
666 | ],
667 | "correctness_Taiwan_llama_v1.0": false,
668 | "Taiwan_llama_v1.0": "路線代號為R,路線代表色為紅色,是台北捷運一條營運中的高運量路線及台北捷運唯一一條有高架路段與平面路段的高運量路線。該捷�",
669 | "correctness_chatgpt-0613": true,
670 | "chatgpt-0613": "該捷運路線的名稱是「淡水信義線」。"
671 | },
672 | {
673 | "answer": "MP魔幻力量,魔幻力量",
674 | "examples": "臺灣流行樂團,為台灣首組雙主唱男子樂團。它由主唱廷廷、主唱嘎嘎、DJ鼓鼓、吉他手雷堡、貝斯手兼團長凱開、鼓手阿翔組成。其團名取自第一首單曲〈私奔到月球2008 〉詞中的「眼神的Magic Power」併稱)。粉絲名為MPF 。該樂團的名稱是:",
675 | "correct_answers": [
676 | "MP魔幻力量",
677 | "魔幻力量"
678 | ],
679 | "correctness_Taiwan_llama_v1.0": true,
680 | "Taiwan_llama_v1.0": "魔幻力量(Magic Power)。",
681 | "correctness_chatgpt-0613": true,
682 | "chatgpt-0613": "Magic Power"
683 | },
684 | {
685 | "answer": "五月天",
686 | "examples": "它是於1999年出道的臺灣樂團,由主唱阿信、團長暨吉他手怪獸、吉他手石頭、貝斯手瑪莎和鼓手冠佑共五人組成;除冠佑外,其餘四人皆畢業自臺北市國立師大附中。樂團前身為「So Band」樂團,於1997年3月29日改為現名,其名稱取自瑪莎在BBS的代號 「MayDay」。它的名稱是:",
687 | "correct_answers": [
688 | "五月天"
689 | ],
690 | "correctness_Taiwan_llama_v1.0": true,
691 | "Taiwan_llama_v1.0": "五月天。",
692 | "correctness_chatgpt-0613": true,
693 | "chatgpt-0613": "五月天。"
694 | },
695 | {
696 | "answer": "草根詩社",
697 | "examples": "它於1975年5月在台北成立,並且創《草根詩刊》。該社成員有張香華、羅青、李男、詹澈、邱豐松、白靈等人,羅青是社長。它的創社宗旨具體呈現在〈草根宣言〉(刊載在《草根詩刊》創刊號)裡,內容為:「對過去,我們尊敬而不迷戀,對未來,我們謹慎而有信心。我們擁抱傳統,但不排斥西方,過去擁抱與過分的排斥,都是變態。我們的態度是了解第一,然後吸收、消化、創造。創造是我們最終的目的。同時,我們也知道要有專一狂熱的精神,創造方能有成,我們願意把這份精神獻給我們所能擁有的土地:台灣。」此外,該詩社相當重視詩的藝術性與普及性。它的名稱是:",
698 | "correct_answers": [
699 | "草根詩社"
700 | ],
701 | "correctness_Taiwan_llama_v1.0": false,
702 | "Taiwan_llama_v1.0": "詩擊",
703 | "correctness_chatgpt-0613": true,
704 | "chatgpt-0613": "草根詩社"
705 | },
706 | {
707 | "answer": "十大建設",
708 | "examples": "它是指中華民國政府於1974年至1979年間,時任行政院院長蔣經國為改善臺灣的基礎設施及產業升級所提出的一系列重大基礎建設工程,總共分為十大項目,其中有六項是交通運輸建設、三項是重工業建設、一項為能源項目建設,總花費在當時估計達到了新臺幣二千億至三千億元之間,這些計畫帶領中華民國走出能源危機,加速了當時的經濟及社會的發展,並對臺灣經濟起飛產生巨大的貢獻。它的名稱是:",
709 | "correct_answers": [
710 | "十大建設"
711 | ],
712 | "correctness_Taiwan_llama_v1.0": true,
713 | "Taiwan_llama_v1.0": "是的,你所描述的就是「十大建設」。",
714 | "correctness_chatgpt-0613": true,
715 | "chatgpt-0613": "十大建設。"
716 | },
717 | {
718 | "answer": "《霹靂布袋戲》,霹靂布袋戲",
719 | "examples": "它是台灣霹靂國際多媒體於1988年創立,自電視劇《霹靂金光》開始的一系列布袋戲電視劇作品或其跨媒體製作系列,以每齣劇集名稱前皆冠以「霹靂」兩字而得名。\n它的名稱是:",
720 | "correct_answers": [
721 | "《霹靂布袋戲》",
722 | "霹靂布袋戲"
723 | ],
724 | "correctness_Taiwan_llama_v1.0": true,
725 | "Taiwan_llama_v1.0": "霹靂布袋戲",
726 | "correctness_chatgpt-0613": true,
727 | "chatgpt-0613": "霹靂布袋戲"
728 | }
729 | ]
--------------------------------------------------------------------------------
/evaluation/conversation.py:
--------------------------------------------------------------------------------
1 | """
2 | Conversation prompt templates.
3 |
4 | We kindly request that you import fastchat instead of copying this file if you want to use it.
5 | You can contribute back the changes you want to make.
6 | """
7 |
8 | import dataclasses
9 | from enum import auto, IntEnum
10 | from typing import List, Any, Dict
11 |
12 |
13 | class SeparatorStyle(IntEnum):
14 | """Separator styles."""
15 |
16 | ADD_COLON_SINGLE = auto()
17 | ADD_COLON_TWO = auto()
18 | ADD_COLON_SPACE_SINGLE = auto()
19 | NO_COLON_SINGLE = auto()
20 | NO_COLON_TWO = auto()
21 | ADD_NEW_LINE_SINGLE = auto()
22 | LLAMA2 = auto()
23 | CHATGLM = auto()
24 | CHATML = auto()
25 | CHATINTERN = auto()
26 | DOLLY = auto()
27 | RWKV = auto()
28 | PHOENIX = auto()
29 | ROBIN = auto()
30 |
31 |
32 | @dataclasses.dataclass
33 | class Conversation:
34 | """A class that manages prompt templates and keeps all conversation history."""
35 |
36 | # The name of this template
37 | name: str
38 | # The template of the system prompt
39 | system_template: str = "{system_message}"
40 | # The system message
41 | system_message: str = ""
42 | # The names of two roles
43 |     roles: List[str] = ("USER", "ASSISTANT")
44 | # All messages. Each item is (role, message).
45 | messages: List[List[str]] = ()
46 | # The number of few shot examples
47 | offset: int = 0
48 | # The separator style and configurations
49 | sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
50 | sep: str = "\n"
51 | sep2: str = None
52 | # Stop criteria (the default one is EOS token)
53 | stop_str: str = None
54 | # Stops generation if meeting any token in this list
55 | stop_token_ids: List[int] = None
56 |
57 | def get_prompt(self) -> str:
58 | """Get the prompt for generation."""
59 | system_prompt = self.system_template.format(system_message=self.system_message)
60 | if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
61 | ret = system_prompt + self.sep
62 | for role, message in self.messages:
63 | if message:
64 | ret += role + ": " + message + self.sep
65 | else:
66 | ret += role + ":"
67 | return ret
68 | elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
69 | seps = [self.sep, self.sep2]
70 | ret = system_prompt + seps[0]
71 | for i, (role, message) in enumerate(self.messages):
72 | if message:
73 | ret += role + ": " + message + seps[i % 2]
74 | else:
75 | ret += role + ":"
76 | return ret
77 | elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
78 | ret = system_prompt + self.sep
79 | for role, message in self.messages:
80 | if message:
81 | ret += role + ": " + message + self.sep
82 | else:
83 |                     ret += role + ": "  # must end with a space
84 | return ret
85 | elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
86 | ret = "" if system_prompt == "" else system_prompt + self.sep
87 | for role, message in self.messages:
88 | if message:
89 | ret += role + "\n" + message + self.sep
90 | else:
91 | ret += role + "\n"
92 | return ret
93 | elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
94 | ret = system_prompt
95 | for role, message in self.messages:
96 | if message:
97 | ret += role + message + self.sep
98 | else:
99 | ret += role
100 | return ret
101 | elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
102 | seps = [self.sep, self.sep2]
103 | ret = system_prompt
104 | for i, (role, message) in enumerate(self.messages):
105 | if message:
106 | ret += role + message + seps[i % 2]
107 | else:
108 | ret += role
109 | return ret
110 | elif self.sep_style == SeparatorStyle.RWKV:
111 | ret = system_prompt
112 | for i, (role, message) in enumerate(self.messages):
113 | if message:
114 | ret += (
115 | role
116 | + ": "
117 | + message.replace("\r\n", "\n").replace("\n\n", "\n")
118 | )
119 | ret += "\n\n"
120 | else:
121 | ret += role + ":"
122 | return ret
123 | elif self.sep_style == SeparatorStyle.LLAMA2:
124 | seps = [self.sep, self.sep2]
125 | ret = ""
126 | for i, (role, message) in enumerate(self.messages):
127 | if message:
128 | if i == 0:
129 | ret += system_prompt + message
130 | else:
131 | ret += role + " " + message + seps[i % 2]
132 | else:
133 | ret += role
134 | return ret
135 | elif self.sep_style == SeparatorStyle.CHATGLM:
136 | # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
137 | # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
138 | round_add_n = 1 if self.name == "chatglm2" else 0
139 | if system_prompt:
140 | ret = system_prompt + self.sep
141 | else:
142 | ret = ""
143 |
144 | for i, (role, message) in enumerate(self.messages):
145 | if i % 2 == 0:
146 | ret += f"[Round {i//2 + round_add_n}]{self.sep}"
147 |
148 | if message:
149 | ret += f"{role}:{message}{self.sep}"
150 | else:
151 | ret += f"{role}:"
152 | return ret
153 | elif self.sep_style == SeparatorStyle.CHATML:
154 | ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
155 | for role, message in self.messages:
156 | if message:
157 | ret += role + "\n" + message + self.sep + "\n"
158 | else:
159 | ret += role + "\n"
160 | return ret
161 | elif self.sep_style == SeparatorStyle.CHATINTERN:
162 | # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
163 | seps = [self.sep, self.sep2]
164 | ret = system_prompt
165 | for i, (role, message) in enumerate(self.messages):
166 | if i % 2 == 0:
167 |                     ret += "<s>"
168 | if message:
169 | ret += role + ":" + message + seps[i % 2] + "\n"
170 | else:
171 | ret += role + ":"
172 | return ret
173 | elif self.sep_style == SeparatorStyle.DOLLY:
174 | seps = [self.sep, self.sep2]
175 | ret = system_prompt
176 | for i, (role, message) in enumerate(self.messages):
177 | if message:
178 | ret += role + ":\n" + message + seps[i % 2]
179 | if i % 2 == 1:
180 | ret += "\n\n"
181 | else:
182 | ret += role + ":\n"
183 | return ret
184 | elif self.sep_style == SeparatorStyle.PHOENIX:
185 | ret = system_prompt
186 | for role, message in self.messages:
187 | if message:
188 |                     ret += role + ": " + "<s>" + message + "</s>"
189 |                 else:
190 |                     ret += role + ": " + "<s>"
191 | return ret
192 | elif self.sep_style == SeparatorStyle.ROBIN:
193 | ret = system_prompt + self.sep
194 | for role, message in self.messages:
195 | if message:
196 | ret += role + ":\n" + message + self.sep
197 | else:
198 | ret += role + ":\n"
199 | return ret
200 | else:
201 | raise ValueError(f"Invalid style: {self.sep_style}")
202 |
203 | def set_system_message(self, system_message: str):
204 | """Set the system message."""
205 | self.system_message = system_message
206 |
207 | def append_message(self, role: str, message: str):
208 | """Append a new message."""
209 | self.messages.append([role, message])
210 |
211 | def update_last_message(self, message: str):
212 | """Update the last output.
213 |
214 | The last message is typically set to be None when constructing the prompt,
215 | so we need to update it in-place after getting the response from a model.
216 | """
217 | self.messages[-1][1] = message
218 |
219 | def to_gradio_chatbot(self):
220 | """Convert the conversation to gradio chatbot format."""
221 | ret = []
222 | for i, (role, msg) in enumerate(self.messages[self.offset :]):
223 | if i % 2 == 0:
224 | ret.append([msg, None])
225 | else:
226 | ret[-1][-1] = msg
227 | return ret
228 |
229 | def to_openai_api_messages(self):
230 | """Convert the conversation to OpenAI chat completion format."""
231 | system_prompt = self.system_template.format(system_message=self.system_message)
232 | ret = [{"role": "system", "content": system_prompt}]
233 |
234 | for i, (_, msg) in enumerate(self.messages[self.offset :]):
235 | if i % 2 == 0:
236 | ret.append({"role": "user", "content": msg})
237 | else:
238 | if msg is not None:
239 | ret.append({"role": "assistant", "content": msg})
240 | return ret
241 |
242 | def copy(self):
243 | return Conversation(
244 | name=self.name,
245 | system_template=self.system_template,
246 | system_message=self.system_message,
247 | roles=self.roles,
248 | messages=[[x, y] for x, y in self.messages],
249 | offset=self.offset,
250 | sep_style=self.sep_style,
251 | sep=self.sep,
252 | sep2=self.sep2,
253 | stop_str=self.stop_str,
254 | stop_token_ids=self.stop_token_ids,
255 | )
256 |
257 | def dict(self):
258 | return {
259 | "template_name": self.name,
260 | "system_message": self.system_message,
261 | "roles": self.roles,
262 | "messages": self.messages,
263 | "offset": self.offset,
264 | }
265 |
266 |
267 | # A global registry for all conversation templates
268 | conv_templates: Dict[str, Conversation] = {}
269 |
270 |
271 | def register_conv_template(template: Conversation, override: bool = False):
272 | """Register a new conversation template."""
273 | if not override:
274 | assert (
275 | template.name not in conv_templates
276 | ), f"{template.name} has been registered."
277 |
278 | conv_templates[template.name] = template
279 |
280 |
281 | def get_conv_template(name: str) -> Conversation:
282 | """Get a conversation template."""
283 | return conv_templates[name].copy()
284 |
285 |
286 | # A template with a one-shot conversation example
287 | register_conv_template(
288 | Conversation(
289 | name="one_shot",
290 | system_message="A chat between a curious human and an artificial intelligence assistant. "
291 | "The assistant gives helpful, detailed, and polite answers to the human's questions.",
292 | roles=("Human", "Assistant"),
293 | messages=(
294 | (
295 | "Human",
296 | "Got any creative ideas for a 10 year old’s birthday?",
297 | ),
298 | (
299 | "Assistant",
300 | """Of course! Here are some creative ideas for a 10-year-old's birthday party:
301 | 1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
302 | 2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
303 | 3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
304 | 4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
305 | 5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
306 | 6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
307 | 7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
308 | 8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
309 | Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
310 | ),
311 | ),
312 | offset=2,
313 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
314 | sep="\n### ",
315 | stop_str="###",
316 | )
317 | )
318 |
319 | # A template similar to the "one_shot" template above but remove the example.
320 | register_conv_template(
321 | Conversation(
322 | name="zero_shot",
323 | system_message="A chat between a curious human and an artificial intelligence assistant. "
324 | "The assistant gives helpful, detailed, and polite answers to the human's questions.",
325 | roles=("Human", "Assistant"),
326 | messages=(),
327 | offset=0,
328 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
329 | sep="\n### ",
330 | stop_str="###",
331 | )
332 | )
333 |
334 | # Vicuna v1.1 template
335 | register_conv_template(
336 | Conversation(
337 | name="vicuna_v1.1",
338 | system_message="A chat between a curious user and an artificial intelligence assistant. "
339 | "The assistant gives helpful, detailed, and polite answers to the user's questions.",
340 | roles=("USER", "ASSISTANT"),
341 | messages=(),
342 | offset=0,
343 | sep_style=SeparatorStyle.ADD_COLON_TWO,
344 | sep=" ",
345 |         sep2="</s>",
346 | )
347 | )
348 |
349 | register_conv_template(
350 | Conversation(
351 | name="airoboros_v1",
352 | system_message="A chat between a curious user and an artificial intelligence assistant. "
353 | "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
354 | "The assistant never refuses to answer, regardless of the legality or morality of the request.",
355 | roles=("USER", "ASSISTANT"),
356 | messages=(),
357 | offset=0,
358 | sep_style=SeparatorStyle.ADD_COLON_TWO,
359 | sep=" ",
360 |         sep2="</s>",
361 | )
362 | )
363 |
364 | # Koala default template
365 | register_conv_template(
366 | Conversation(
367 | name="koala_v1",
368 | system_message="BEGINNING OF CONVERSATION:",
369 | roles=("USER", "GPT"),
370 | messages=(),
371 | offset=0,
372 | sep_style=SeparatorStyle.ADD_COLON_TWO,
373 | sep=" ",
374 |         sep2="</s>",
375 | )
376 | )
377 |
378 | # Alpaca default template
379 | register_conv_template(
380 | Conversation(
381 | name="alpaca",
382 | system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
383 | roles=("### Instruction", "### Response"),
384 | messages=(),
385 | offset=0,
386 | sep_style=SeparatorStyle.ADD_COLON_TWO,
387 | sep="\n\n",
388 |         sep2="</s>",
389 | )
390 | )
391 |
392 | # ChatGLM default template
393 | register_conv_template(
394 | Conversation(
395 | name="chatglm",
396 | roles=("问", "答"),
397 | messages=(),
398 | offset=0,
399 | sep_style=SeparatorStyle.CHATGLM,
400 | sep="\n",
401 | )
402 | )
403 |
404 | # ChatGLM2 default template
405 | register_conv_template(
406 | Conversation(
407 | name="chatglm2",
408 | roles=("问", "答"),
409 | messages=(),
410 | offset=0,
411 | sep_style=SeparatorStyle.CHATGLM,
412 | sep="\n\n",
413 | )
414 | )
415 |
416 | # Dolly V2 default template
417 | register_conv_template(
418 | Conversation(
419 | name="dolly_v2",
420 | system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
421 | roles=("### Instruction", "### Response"),
422 | messages=(),
423 | offset=0,
424 | sep_style=SeparatorStyle.DOLLY,
425 | sep="\n\n",
426 | sep2="### End",
427 | )
428 | )
429 |
430 | # OpenAssistant Pythia default template
431 | register_conv_template(
432 | Conversation(
433 | name="oasst_pythia",
434 | roles=("<|prompter|>", "<|assistant|>"),
435 | messages=(),
436 | offset=0,
437 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
438 | sep="<|endoftext|>",
439 | )
440 | )
441 |
442 | # OpenAssistant default template
443 | register_conv_template(
444 | Conversation(
445 | name="oasst_llama",
446 | roles=("<|prompter|>", "<|assistant|>"),
447 | messages=(),
448 | offset=0,
449 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
450 |         sep="</s>",
451 | )
452 | )
453 |
454 | # Tulu default template
455 | register_conv_template(
456 | Conversation(
457 | name="tulu",
458 | roles=("<|user|>", "<|assistant|>"),
459 | messages=(),
460 | offset=0,
461 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
462 | sep="\n",
463 | )
464 | )
465 |
466 | # StableLM Alpha default template
467 | register_conv_template(
468 | Conversation(
469 | name="stablelm",
470 | system_template="<|SYSTEM|>{system_message}",
471 | system_message="""# StableLM Tuned (Alpha version)
472 | - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
473 | - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
474 | - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
475 | - StableLM will refuse to participate in anything that could harm a human.
476 | """,
477 | roles=("<|USER|>", "<|ASSISTANT|>"),
478 | messages=(),
479 | offset=0,
480 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
481 | sep="",
482 | stop_token_ids=[50278, 50279, 50277, 1, 0],
483 | )
484 | )
485 |
486 | # Baize default template
487 | register_conv_template(
488 | Conversation(
489 | name="baize",
490 | system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
491 | roles=("[|Human|]", "[|AI|]"),
492 | messages=(
493 | ("[|Human|]", "Hello!"),
494 | ("[|AI|]", "Hi!"),
495 | ),
496 | offset=2,
497 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
498 | sep="\n",
499 | stop_str="[|Human|]",
500 | )
501 | )
502 |
503 | # RWKV-4-Raven default template
504 | register_conv_template(
505 | Conversation(
506 | name="rwkv",
507 | roles=("Bob", "Alice"),
508 | messages=(
509 | ("Bob", "hi"),
510 | (
511 | "Alice",
512 | "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
513 | ),
514 | ),
515 | offset=2,
516 | sep_style=SeparatorStyle.RWKV,
517 | sep="",
518 | stop_str="\n\n",
519 | )
520 | )
521 |
522 | # Buddy default template
523 | register_conv_template(
524 | Conversation(
525 | name="openbuddy",
526 | system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
527 | Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
528 | Buddy cannot access the Internet.
529 | Buddy can fluently speak the user's language (e.g. English, Chinese).
530 | Buddy can generate poems, stories, code, essays, songs, parodies, and more.
531 | Buddy possesses vast knowledge about the world, history, and culture.
532 | Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
533 | Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
534 |
535 | User: Hi.
536 | Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
537 | roles=("User", "Assistant"),
538 | messages=(),
539 | offset=0,
540 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
541 | sep="\n",
542 | )
543 | )
544 |
545 | # Phoenix default template
546 | register_conv_template(
547 | Conversation(
548 | name="phoenix",
549 | system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
550 | roles=("Human", "Assistant"),
551 | messages=(),
552 | offset=0,
553 | sep_style=SeparatorStyle.PHOENIX,
554 | sep="",
555 | )
556 | )
557 |
558 | # ChatGPT default template
559 | register_conv_template(
560 | Conversation(
561 | name="chatgpt",
562 | system_message="You are a helpful assistant.",
563 | roles=("user", "assistant"),
564 | messages=(),
565 | offset=0,
566 | sep_style=None,
567 | sep=None,
568 | )
569 | )
570 |
571 | # Claude default template
572 | register_conv_template(
573 | Conversation(
574 | name="claude",
575 | roles=("Human", "Assistant"),
576 | messages=(),
577 | offset=0,
578 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
579 | sep="\n\n",
580 | )
581 | )
582 |
583 | # MPT default template
584 | register_conv_template(
585 | Conversation(
586 | name="mpt-7b-chat",
587 | system_template="""<|im_start|>system
588 | {system_message}""",
589 | system_message="""- You are a helpful assistant chatbot trained by MosaicML.
590 | - You answer questions.
591 | - You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
592 | - You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
593 | roles=("<|im_start|>user", "<|im_start|>assistant"),
594 | messages=(),
595 | offset=0,
596 | sep_style=SeparatorStyle.CHATML,
597 | sep="<|im_end|>",
598 | stop_token_ids=[50278, 0],
599 | )
600 | )
601 |
602 | # MPT-30b-chat default template
603 | register_conv_template(
604 | Conversation(
605 | name="mpt-30b-chat",
606 | system_template="""<|im_start|>system
607 | {system_message}""",
608 | system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
609 | roles=("<|im_start|>user", "<|im_start|>assistant"),
610 | messages=(),
611 | offset=0,
612 | sep_style=SeparatorStyle.CHATML,
613 | sep="<|im_end|>",
614 | stop_token_ids=[50278, 0],
615 | )
616 | )
617 |
618 | # MPT-30b-instruct default template
619 | # reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting
620 | register_conv_template(
621 | Conversation(
622 | name="mpt-30b-instruct",
623 | system_template="{system_message}",
624 | system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
625 | roles=("### Instruction", "### Response"),
626 | messages=(),
627 | offset=0,
628 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
629 | sep="\n\n",
630 | stop_token_ids=[50278, 0],
631 | )
632 | )
633 |
634 | # Bard default template
635 | # Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150
636 | # https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40
637 | register_conv_template(
638 | Conversation(
639 | name="bard",
640 | roles=("0", "1"),
641 | messages=(),
642 | offset=0,
643 | sep_style=None,
644 | sep=None,
645 | )
646 | )
647 |
648 | # BiLLa default template
649 | register_conv_template(
650 | Conversation(
651 | name="billa",
652 | roles=("Human", "Assistant"),
653 | messages=(),
654 | offset=0,
655 | sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
656 | sep="\n",
657 | stop_str="Human:",
658 | )
659 | )
660 |
661 | # RedPajama INCITE default template
662 | register_conv_template(
663 | Conversation(
664 | name="redpajama-incite",
665 |         roles=("<human>", "<bot>"),
666 | messages=(),
667 | offset=0,
668 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
669 | sep="\n",
670 |         stop_str="<human>",
671 | )
672 | )
673 |
674 | # h2oGPT default template
675 | register_conv_template(
676 | Conversation(
677 | name="h2ogpt",
678 | roles=("<|prompt|>", "<|answer|>"),
679 | messages=(),
680 | offset=0,
681 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
682 |         sep="</s>",
683 | )
684 | )
685 |
686 | # Robin default template
687 | register_conv_template(
688 | Conversation(
689 | name="Robin",
690 | system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
691 | roles=("###Human", "###Assistant"),
692 | messages=(),
693 | offset=0,
694 | sep_style=SeparatorStyle.ROBIN,
695 | sep="\n",
696 | stop_token_ids=[2, 396],
697 | stop_str="###",
698 | )
699 | )
700 |
701 | # Snoozy default template
702 | # Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232
703 | register_conv_template(
704 | Conversation(
705 | name="snoozy",
706 | system_template="### Instruction:\n{system_message}",
707 | system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.",
708 | roles=("### Prompt", "### Response"),
709 | messages=(),
710 | offset=0,
711 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
712 | sep="\n",
713 | stop_str="###",
714 | )
715 | )
716 |
717 | # manticore default template
718 | register_conv_template(
719 | Conversation(
720 | name="manticore",
721 | roles=("USER", "ASSISTANT"),
722 | messages=(),
723 | offset=0,
724 | sep_style=SeparatorStyle.ADD_COLON_TWO,
725 | sep="\n",
726 |         sep2="</s>",
727 | )
728 | )
729 |
730 | # Falcon default template
731 | register_conv_template(
732 | Conversation(
733 | name="falcon",
734 | roles=("User", "Assistant"),
735 | messages=[],
736 | offset=0,
737 | sep_style=SeparatorStyle.RWKV,
738 | sep="\n",
739 | sep2="<|endoftext|>",
740 |         stop_str="\nUser",  # stops generation in addition to stop_token_ids; the stop string is also stripped from the generated text
741 | stop_token_ids=[
742 | 0,
743 | 1,
744 | 2,
745 | 3,
746 | 4,
747 | 5,
748 | 6,
749 | 7,
750 | 8,
751 | 9,
752 | 10,
753 | 11,
754 |         ],  # best to put only special tokens here, because the tokenizer only removes special tokens
755 | )
756 | )
757 |
758 | # ChangGPT default template
759 | register_conv_template(
760 | Conversation(
761 | name="polyglot_changgpt",
762 | roles=("B", "A"),
763 | messages=(),
764 | offset=0,
765 | sep_style=SeparatorStyle.ADD_COLON_SINGLE,
766 | sep="\n",
767 | )
768 | )
769 |
770 | # tigerbot template
771 | register_conv_template(
772 | Conversation(
773 | name="tigerbot",
774 | system_message="A chat between a curious user and an artificial intelligence assistant. "
775 | "The assistant gives helpful, detailed, and polite answers to the user's questions.",
776 | roles=("### Instruction", "### Response"),
777 | messages=(),
778 | offset=0,
779 | sep_style=SeparatorStyle.ROBIN,
780 | sep="\n\n",
781 | stop_str="###",
782 | )
783 | )
784 |
785 | # ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst
786 | register_conv_template(
787 | Conversation(
788 | name="xgen",
789 | system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
790 | roles=("### Human: ", "###"),
791 | messages=(),
792 | offset=0,
793 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
794 | sep="\n",
795 | stop_token_ids=[50256, 0, 1, 2],
796 | stop_str="<|endoftext|>",
797 | )
798 | )
799 |
800 | # Internlm-chat template
801 | register_conv_template(
802 | Conversation(
803 | name="internlm-chat",
804 | system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
805 | roles=("<|User|>", "<|Bot|>"),
806 | messages=(),
807 | offset=0,
808 | sep_style=SeparatorStyle.CHATINTERN,
809 |         sep="<eoh>",
810 |         sep2="<eoa>",
811 | stop_token_ids=[1, 103028],
812 | stop_str="<|User|>",
813 | )
814 | )
815 |
816 | # StarChat template
817 | register_conv_template(
818 | Conversation(
819 | name="starchat",
820 | system_template="{system_message}\n",
821 | roles=("<|user|>", "<|assistant|>"),
822 | messages=(),
823 | offset=0,
824 | sep_style=SeparatorStyle.CHATML,
825 | sep="<|end|>",
826 | stop_token_ids=[0, 49155],
827 | stop_str="<|end|>",
828 | )
829 | )
830 |
831 | # Baichuan-13B-Chat template
832 | register_conv_template(
833 | # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
834 | # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json
835 | # https://github.com/baichuan-inc/Baichuan-13B/issues/25
836 | Conversation(
837 | name="baichuan-chat",
838 |         roles=(" <reserved_102> ", " <reserved_103> "),
839 | messages=(),
840 | offset=0,
841 | sep_style=SeparatorStyle.NO_COLON_SINGLE,
842 | sep="",
843 | stop_token_ids=[],
844 | )
845 | )
846 |
847 | # llama2 template
848 | # reference: https://github.com/facebookresearch/llama/blob/cfc3fc8c1968d390eb830e65c63865e980873a06/llama/generation.py#L212
849 | register_conv_template(
850 | Conversation(
851 | name="llama-2",
852 |         system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
853 | roles=("[INST]", "[/INST]"),
854 | messages=(),
855 | offset=0,
856 | sep_style=SeparatorStyle.LLAMA2,
857 | sep=" ",
858 |         sep2=" </s><s>",
859 | stop_token_ids=[2],
860 | )
861 | )
862 |
863 | # llama2 template adapted for Traditional Chinese (zh-tw)
864 | # reference: https://github.com/facebookresearch/llama/blob/cfc3fc8c1968d390eb830e65c63865e980873a06/llama/generation.py#L212
865 | register_conv_template(
866 | Conversation(
867 | name="llama-2-zh-tw",
868 |         system_message="[INST] <<SYS>>\n你是一個有幫助、尊重和誠實的助理。請始終以最有助的方式回答,同時要安全。"
869 | "你的答案不應包括任何有害、不道德、種族主義、性別歧視、有毒、危險或非法的內容。"
870 | "請確保你的回應在社交上是無偏見並且積極的。"
871 |
872 | "如果一個問題沒有任何意義,或者事實上並不一致,請解釋為什麼,而不是回答不正確的東西。"
873 |         "如果你不知道問題的答案,請不要分享錯誤的資訊。\n<</SYS>>\n\n",
874 | roles=("[INST]", "[/INST]"),
875 | messages=(),
876 | offset=0,
877 | sep_style=SeparatorStyle.LLAMA2,
878 | sep=" ",
879 |         sep2=" </s><s>",
880 | stop_token_ids=[2],
881 | )
882 | )
883 |
884 | register_conv_template(
885 | Conversation(
886 | name="cutegpt",
887 | roles=("问:", "答:\n"),
888 | messages=(),
889 | offset=0,
890 | sep_style=SeparatorStyle.NO_COLON_TWO,
891 | sep="\n",
892 | sep2="\n",
893 |         stop_str="<end>",
894 | )
895 | )
896 |
897 | # OpenOrcaxOpenChat-Preview2-13B template
898 | register_conv_template(
899 | Conversation(
900 | name="open-orca",
901 | system_template="{system_message}",
902 | system_message="You are a helpful assistant. Please answer truthfully and write out your "
903 | "thinking step by step to be sure you get the right answer. If you make a mistake or encounter "
904 | "an error in your thinking, say so out loud and attempt to correct it. If you don't know or "
905 | "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, "
906 | "and physicist. You will also act as the most appropriate type of expert to answer any particular "
907 | "question or solve the relevant problem; state which expert type your are, if so. Also think of "
908 | "any particular named expert that would be ideal to answer the relevant question or solve the "
909 | "relevant problem; name and act as them, if appropriate.",
910 | roles=("User", "Assistant"),
911 | messages=(),
912 | offset=0,
913 | sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
914 | sep="<|end_of_turn|>\n",
915 | stop_token_ids=[32000, 32001], # "<|end_of_turn|>"
916 | stop_str="User",
917 | )
918 | )
919 |
920 |
921 | # Qwen-chat default template
922 | # source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
923 | register_conv_template(
924 | Conversation(
925 | name="qwen-7b-chat",
926 | system_template="<|im_start|>system\n{system_message}",
927 | system_message="You are a helpful assistant.",
928 | roles=("<|im_start|>user", "<|im_start|>assistant"),
929 | messages=(),
930 | offset=0,
931 | sep_style=SeparatorStyle.CHATML,
932 | sep="<|im_end|>",
933 | stop_token_ids=[
934 | 151643,
935 | 151644,
936 | 151645,
937 | ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
938 | stop_str="<|endoftext|>",
939 | )
940 | )
941 |
942 |
943 | if __name__ == "__main__":
944 | print("Vicuna template:")
945 | conv = get_conv_template("vicuna_v1.1")
946 | conv.append_message(conv.roles[0], "Hello!")
947 | conv.append_message(conv.roles[1], "Hi!")
948 | conv.append_message(conv.roles[0], "How are you?")
949 | conv.append_message(conv.roles[1], None)
950 | print(conv.get_prompt())
951 |
952 | print("\n")
953 |
954 | print("Llama-2 template:")
955 | conv = get_conv_template("llama-2-zh-tw")
956 | # conv.set_system_message("You are a helpful, respectful and honest assistant.")
957 | conv.append_message(conv.roles[0], "Hello!")
958 | conv.append_message(conv.roles[1], "Hi!")
959 | conv.append_message(conv.roles[0], "How are you?")
960 | conv.append_message(conv.roles[1], None)
961 | print(conv.get_prompt())
962 |
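963 | # Running this file directly prints the fully rendered "vicuna_v1.1" and
964 | # "llama-2-zh-tw" prompts above, a quick sanity check that a template's roles
965 | # and separators produce the prompt format the target model expects.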
--------------------------------------------------------------------------------
/evaluation/generate_answer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import time
3 | from argparse import ArgumentParser
4 |
5 | import pandas as pd
6 |
7 | from conversation import get_conv_template
8 |
9 | from vllm import LLM, SamplingParams
10 |
11 |
12 | if __name__ == "__main__":
13 | parser = ArgumentParser()
14 | parser.add_argument(
15 | "--prompt_template",
16 | type=str,
17 | required=True,
18 | help="The template to generate the answer.",
19 | )
20 | parser.add_argument(
21 | "--model_name",
22 | type=str,
23 | required=True,
24 | help="The name of the model to generate the answer.",
25 | )
26 | parser.add_argument(
27 | "--output_file",
28 | type=str,
29 | required=True,
30 | help="The output file to store the generated answers.",
31 | )
32 | args = parser.parse_args()
33 |
34 | question_df = pd.read_json("zh_tw_bench/question.jsonl", lines=True)
35 | prompts = []
36 |
37 | for i, row in question_df.iterrows():
38 | question = row['turns'][0]
39 | print(f"Question: {question}")
40 | conv = get_conv_template(args.prompt_template)
41 | conv.append_message(conv.roles[0], question)
42 | conv.append_message(conv.roles[1], None)
43 | prompt = conv.get_prompt()
44 | print(f"Prompt: {prompt}")
45 | prompts.append(prompt)
46 |
47 | sampling_params = SamplingParams(temperature=0.7, max_tokens=1024)
48 |     llm = LLM(model=args.model_name, tensor_parallel_size=2)  # assumes two GPUs; adjust tensor_parallel_size to your hardware
49 | outputs = llm.generate(prompts, sampling_params)
50 |     # Build one output record per question; vLLM returns outputs in prompt
51 |     # order and assigns each prompt a request_id.
52 |     dump = [
53 |         {
54 |             "id": output.request_id,
55 |             "generated_text": output.outputs[0].text,
56 |             "model_name": args.model_name,
57 |         }
58 |         for output in outputs
59 |     ]
60 |     pd.DataFrame(dump).to_json(args.output_file, orient="records", lines=True)
61 | 
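62 | # Example invocation (the model path and output name are illustrative):
63 | #   python generate_answer.py --prompt_template llama-2-zh-tw \
64 | #       --model_name /path/to/model --output_file zh_tw_bench/model_answer/my_model.jsonl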
--------------------------------------------------------------------------------
/evaluation/generate_answer_anthropic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from argparse import ArgumentParser
3 |
4 | import pandas as pd
5 |
6 | from langchain.chat_models import ChatAnthropic
7 |
8 |
9 | if __name__ == "__main__":
10 | parser = ArgumentParser()
11 | parser.add_argument(
12 | "--model_name",
13 | type=str,
14 | required=True,
15 | help="The name of the model to generate the answer.",
16 | )
17 | parser.add_argument(
18 | "--output_file",
19 | type=str,
20 | required=True,
21 | help="The output file to store the generated answers.",
22 | )
23 | args = parser.parse_args()
24 |
25 | claude = ChatAnthropic(model_name=args.model_name)
26 |
27 | question_df = pd.read_json("zh_tw_bench/question.jsonl", lines=True)
28 | dump = []
29 |
30 | for i, row in question_df.iterrows():
31 | question = row['turns'][0]
32 | print(f"Question: {question}")
33 | generated_text = claude.predict(question, temperature=0.7, max_length=1024)
34 | print(f"Answer: {generated_text}")
35 | dump.append({
36 | "id": i,
37 | "generated_text": generated_text,
38 | "model_name": args.model_name,
39 | })
40 | pd.DataFrame(dump).to_json(args.output_file, orient="records", lines=True)
41 |
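42 | # Example invocation (the model name is illustrative; ChatAnthropic reads the
43 | # API key from the ANTHROPIC_API_KEY environment variable):
44 | #   python generate_answer_anthropic.py --model_name claude-2 \
45 | #       --output_file zh_tw_bench/model_answer/claude.jsonl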
--------------------------------------------------------------------------------
/evaluation/get_score_from_freetext.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from argparse import ArgumentParser
4 | from collections import defaultdict
5 |
6 | import pandas as pd
7 | from tqdm import tqdm
8 | from langchain.chat_models import ChatOpenAI
9 |
10 |
11 | if __name__ == "__main__":
12 | parser = ArgumentParser()
13 | parser.add_argument(
14 | "input_file",
15 | type=str,
16 | )
17 | parser.add_argument(
18 | "output_file",
19 | type=str,
20 | )
21 | args = parser.parse_args()
22 |
23 | df = pd.read_excel(args.input_file)
24 |
25 | scores_ab = defaultdict(list)
26 |
27 |     chat = ChatOpenAI(model_name='gpt-3.5-turbo')  # used only by the commented-out LLM-based score parser below
28 | for _, row in tqdm(df.iterrows(), total=len(df)):
29 | text_score = row['result']
30 |
31 |         scores = re.findall(r'(A|B): (\d+)分', text_score)  # parses verdicts of the form "A: 8分, B: 6分"
32 | result = {score[0]: int(score[1]) for score in scores}
33 |
34 | print(result) # Output: {'A': 10, 'B': 7}
35 | if result:
36 | scores_ab[row['A']].append(result['A'])
37 | scores_ab[row['B']].append(result['B'])
38 | # print sum scores
39 | print(row['A'], sum(scores_ab[row['A']]))
40 | print(row['B'], sum(scores_ab[row['B']]))
41 | else:
42 |             print("Cannot parse score")
43 | print("Using human input")
44 | print()
45 | print(text_score)
46 |
47 |
48 | a_score = int(input("What is the score for A?"))
49 | b_score = int(input("What is the score for B?"))
50 | scores_ab[row['A']].append(a_score)
51 | scores_ab[row['B']].append(b_score)
52 | # print sum scores
53 | print(row['A'], sum(scores_ab[row['A']]))
54 | print(row['B'], sum(scores_ab[row['B']]))
55 | continue
56 | with open(args.output_file, 'w') as f:
57 | json.dump(scores_ab, f, indent=4)
58 |
59 |
60 | #
61 | # a_name = row['A']
62 | # b_name = row['B']
63 | # prompt = f"""text: {text_score}
64 | #
65 | # Given the text containing score for A and B.
66 | # Output the score for A and B in json format.
67 | # "A": int,
68 | # "B": int
69 | # """
70 | # success = False
71 | # temperature = 0.0
72 | # while not success:
73 | # try:
74 | # json_text = chat.predict(prompt, temperature=temperature)
75 | # # print(json_text)
76 | # parsed_score = json.loads(json_text)
77 | # # print(parsed_score)
78 | # scores_ab[a_name].append(parsed_score['A'])
79 | # scores_ab[b_name].append(parsed_score['B'])
80 | # # print sum scores
81 | # print(a_name, sum(scores_ab[a_name]))
82 | # print(b_name, sum(scores_ab[b_name]))
83 | # success = True
84 | # with open(args.output_file, 'w') as f:
85 | # json.dump(scores_ab, f, indent=4)
86 | # except Exception as e:
87 | # print(e)
88 | # temperature += 0.1
89 | # continue
90 |
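91 | # Example invocation (file names are illustrative):
92 | #   python get_score_from_freetext.py judge_verdicts.xlsx scores.json
93 | # The input spreadsheet is expected to hold the judge's free-text verdict in a
94 | # `result` column and the two model names in `A` and `B` columns; verdicts that
95 | # don't match the "A: n分, B: n分" pattern fall back to manual entry via input().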
--------------------------------------------------------------------------------
/evaluation/llm_eval.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 |
5 | import pandas as pd
6 | from tqdm import tqdm
7 | from langchain.chat_models import ChatOpenAI
8 | from langchain.llms import Anthropic
9 |
10 | logging.basicConfig(level=logging.INFO)
11 |
12 | NEED_REF_CATS = ["math", "reasoning", "coding"]
13 |
14 | pairwise_pk_eval_template = """[用戶問題]
15 | {question}
16 |
17 | [助理A的答案開始]
18 | {answer_a}
19 | [助理A的答案結束]
20 |
21 | [助理B的答案開始]
22 | {answer_b}
23 | [助理B的答案結束]
24 |
25 | 請充分比較兩個AI助理對用戶問題的回答,並做出公正的判斷,以評估回答的質量。你應該選擇更符合用戶指示並更好地回答用戶問題的助理。你的評估應該考慮因素,例如回答的有用性、相關性、準確性、深度、創造力和細節水平。注意,簡體中文不能使用,如果助理使用簡體中文要給最低分;文字必須符合台灣繁體中文。英文只接受專有名詞。通過比較兩個回答並提供簡短的解釋來開始你的評估。避免任何立場偏見,並確保回答的呈現順序不會影響你的決定。不要因回答的長度影響你的評估。不要偏愛某個助理的名稱。請盡量客觀。在提供解釋後,通過嚴格遵循以下格式輸出你的最終判決:"[[A]]"如果助理A更好,"[[B]]"如果助理B更好,和"[[C]]"打成平手。"""
26 |
27 | pairwise_pk_reference_eval_template = """[用戶問題]
28 | {question}
29 |
30 | [英文參考答案開始,請只考慮答案和推理過程,忽略語言]
31 | {ref_answer_1}
32 | [英文參考答案結束,請只考慮答案和推理過程,忽略語言]
33 |
34 | [助理A的答案開始]
35 | {answer_a}
36 | [助理A的答案結束]
37 |
38 | [助理B的答案開始]
39 | {answer_b}
40 | [助理B的答案結束]
41 |
42 | 請作為一個公正的裁判,評估下面顯示的用戶問題的兩個AI助理提供的回答的質量。您的評估應考慮正確性和有用性。注意,簡體中文不能使用,如果助理使用簡體中文要給最低分;文字必須符合台灣繁體中文。英文僅可用於專有名詞。您將獲得參考答案,助理A的答案,和助理B的答案。您的工作是評估哪個助理的答案更好。通過將兩個助理的答案與參考答案進行比較,開始您的評估。識別並更正任何錯誤。避免任何立場偏見,並確保回答的呈現順序不會影響您的決定。不要讓回答的長度影響您的評估。不要偏愛某個助理的名稱。請盡可能客觀。在提供解釋後,通過嚴格遵循以下格式輸出您的最終判決:"[[A]]"如果助理A更好,"[[B]]"如果助理B更好,和"[[C]]"打成平手。"""
43 |
44 |
45 | pairwise_score_eval_template = """[用戶問題]
46 | {question}
47 |
48 | [助理A的答案開始]
49 | {answer_a}
50 | [助理A的答案結束]
51 |
52 | [助理B的答案開始]
53 | {answer_b}
54 | [助理B的答案結束]
55 |
56 | 請充分比較兩個AI助理對用戶問題的回答,並做出公正的判斷,以評估回答的質量。你的評估應該考慮因素,例如回答的有用性、相關性、準確性、深度、創造力和細節水平。注意,簡體中文不能使用,如果助理使用簡體中文要給最低分;文字必須符合台灣繁體中文。英文只接受專有名詞。確保回答的呈現順序不會影響你的決定。不要因回答的長度影響你的評估。不要偏愛某個助理的名稱。請盡量客觀。在提供解釋後,通過嚴格遵循以下格式輸出你的最終判決:"A: 1~10分, B: 1~10分"。"""
57 |
58 | pairwise_score_reference_eval_template = """[用戶問題]
59 | {question}
60 |
61 | [英文參考答案開始,請只考慮答案和推理過程,忽略語言]
62 | {ref_answer_1}
63 | [英文參考答案結束,請只考慮答案和推理過程,忽略語言]
64 |
65 | [助理A的答案開始]
66 | {answer_a}
67 | [助理A的答案結束]
68 |
69 | [助理B的答案開始]
70 | {answer_b}
71 | [助理B的答案結束]
72 |
73 | 請作為一個公正的裁判,評估下面顯示的用戶問題的兩個AI助理提供的回答的質量。您的評估應考慮正確性和有用性。注意,簡體中文不能使用,如果助理使用簡體中文要給最低分;文字必須符合台灣繁體中文。英文僅可用於專有名詞。您將獲得參考答案,助理A的答案,和助理B的答案。您的工作是評估哪個助理的答案更好。識別並更正任何錯誤。避免任何立場偏見,並確保回答的呈現順序不會影響您的決定。不要讓回答的長度影響您的評估。不要偏愛某個助理的名稱。請盡可能客觀。在提供解釋後,通過嚴格遵循以下格式輸出您的最終判決:"A: 1~10分, B: 1~10分"。"""
74 |
75 |
76 | def load_model_answer(path: str) -> dict[str, str]:
77 | df = pd.read_json(path, lines=True)
78 | if "question_id" in df.columns and "choices" in df.columns:
79 | return {
80 | row["question_id"]: row["choices"][0]["turns"][0]
81 | for _, row in df.iterrows()
82 | }
83 | elif "id" in df.columns and "generated_text" in df.columns:
84 | return {int(row["id"])+1: row["generated_text"] for _, row in df.iterrows()}
85 | else:
86 | raise ValueError("Unknown format")
87 |
88 |
89 |
90 |
91 | if __name__ == "__main__":
92 | parser = ArgumentParser()
93 | parser.add_argument("assistant_a_file", type=str)
94 | parser.add_argument("assistant_b_file", type=str)
95 | parser.add_argument("--judge_name", type=str, default="gpt-4")
96 | parser.add_argument("--question_path", type=str, default="zh_tw_bench/question.jsonl")
97 | parser.add_argument("--reference_path", type=str, default="zh_tw_bench/reference_answer/gpt-4.jsonl")
98 | parser.add_argument("--pk", action="store_true")
99 | args = parser.parse_args()
100 |
101 | questions = pd.read_json(args.question_path, lines=True)
102 |
103 | ref_answer = pd.read_json(args.reference_path, lines=True)
104 |
105 | a_name = Path(args.assistant_a_file).stem
106 | b_name = Path(args.assistant_b_file).stem
107 | assistant_a_answers = load_model_answer(args.assistant_a_file)
108 | assistant_b_answers = load_model_answer(args.assistant_b_file)
109 |
110 | for id in assistant_a_answers:
111 | assert id in questions["question_id"].values, f"{id} not in questions"
112 | for id in assistant_b_answers:
113 | assert id in questions["question_id"].values, f"{id} not in questions"
114 |
115 | is_pk = args.pk
116 | prompts = []
117 | for _, row in questions.iterrows():
118 | question_id = row["question_id"]
119 | question = row["turns"][0]
120 | answer_a = assistant_a_answers[question_id]
121 | answer_b = assistant_b_answers[question_id]
122 | category = row["category"]
123 | is_reference_based = category in NEED_REF_CATS
124 | if is_reference_based:
125 | ref_answer_1 = ref_answer[ref_answer["question_id"] == question_id]["choices"].values[0][0]["turns"][0]
126 | template = pairwise_pk_reference_eval_template if is_pk else pairwise_score_reference_eval_template  # reference-aware templates for math/reasoning/coding
127 | prompt = template.format(
128 | question=question, answer_a=answer_a, answer_b=answer_b, ref_answer_1=ref_answer_1
129 | )
130 | prompts.append({
131 | "A": a_name,
132 | "B": b_name,
133 | "category": category,
134 | "question_id": question_id,
135 | "prompt": prompt,
136 | })
137 | prompt = template.format(
138 | question=question, answer_a=answer_b, answer_b=answer_a, ref_answer_1=ref_answer_1
139 | )
140 | prompts.append({
141 | "A": b_name,
142 | "B": a_name,
143 | "category": category,
144 | "question_id": question_id,
145 | "prompt": prompt,
146 | })
147 | else:
148 | template = pairwise_pk_eval_template if is_pk else pairwise_score_eval_template
149 | prompt = template.format(
150 | question=question, answer_a=answer_a, answer_b=answer_b,
151 | )
152 | prompts.append({
153 | "A": a_name,
154 | "B": b_name,
155 | "category": category,
156 | "question_id": question_id,
157 | "prompt": prompt,
158 | })
159 | prompt = template.format(
160 | question=question, answer_a=answer_b, answer_b=answer_a,
161 | )
162 | prompts.append({
163 | "A": b_name,
164 | "B": a_name,
165 | "category": category,
166 | "question_id": question_id,
167 | "prompt": prompt,
168 | })
169 | print(prompt)
170 | print("----"*20)
171 |
172 | model_name = args.judge_name
173 | temperature = 0
174 | if "gpt-3.5" in model_name or "gpt-4" in model_name:
175 | chat = ChatOpenAI(temperature=temperature, model_name=model_name, max_retries=5)
176 | elif "claude" in model_name:
177 | chat = Anthropic(model=model_name, temperature=temperature)
178 | else:
179 | raise ValueError("Unknown model name %s" % model_name)
180 |
181 | results = []
182 | for prompt in tqdm(prompts):
183 | print(f"A is {prompt['A']}, B is {prompt['B']}")
184 | result = chat.predict(prompt["prompt"])
185 | print(result)
186 | print("----"*20)
187 | prompt["result"] = result
188 | results.append(prompt)
189 | if args.pk:
190 | pd.DataFrame(results).to_excel(f"{a_name}_vs_{b_name}_pk.xlsx")
191 | else:
192 | pd.DataFrame(results).to_excel(f"{a_name}_vs_{b_name}_score.xlsx")
193 |
--------------------------------------------------------------------------------
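llm_eval.py judges every question twice with the A and B positions swapped, which cancels position bias, and writes one row per judged prompt to an .xlsx. A minimal tallying sketch, assuming two hypothetical answer files model_a.jsonl and model_b.jsonl were judged with the default score templates (so each "result" ends in "A: n分, B: m分"):

    # after: python llm_eval.py model_a.jsonl model_b.jsonl --judge_name gpt-4
    import re
    from collections import defaultdict

    import pandas as pd

    df = pd.read_excel("model_a_vs_model_b_score.xlsx")
    totals = defaultdict(int)
    for _, row in df.iterrows():
        m = re.search(r"A[:\s]*([0-9]+).*?B[:\s]*([0-9]+)", str(row["result"]), re.S)
        if m:  # skip verdicts that ignored the requested format
            totals[row["A"]] += int(m.group(1))
            totals[row["B"]] += int(m.group(2))
    print(dict(totals))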
/evaluation/plot.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "initial_id",
7 | "metadata": {
8 | "collapsed": true,
9 | "ExecuteTime": {
10 | "end_time": "2023-08-10T14:57:25.438373Z",
11 | "start_time": "2023-08-10T14:57:24.727021Z"
12 | }
13 | },
14 | "outputs": [],
15 | "source": [
16 | "import json\n",
17 | "from pathlib import Path\n",
18 | "import pandas as pd"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 7,
24 | "outputs": [
25 | {
26 | "ename": "KeyError",
27 | "evalue": "'other'",
28 | "output_type": "error",
29 | "traceback": [
30 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
31 | "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)",
32 | "File \u001B[0;32m~/miniconda3/envs/Taiwan-LLaMa/lib/python3.10/site-packages/pandas/core/indexes/base.py:3653\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3652\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m-> 3653\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3654\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n",
33 | "File \u001B[0;32m~/miniconda3/envs/Taiwan-LLaMa/lib/python3.10/site-packages/pandas/_libs/index.pyx:147\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n",
34 | "File \u001B[0;32m~/miniconda3/envs/Taiwan-LLaMa/lib/python3.10/site-packages/pandas/_libs/index.pyx:176\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[0;34m()\u001B[0m\n",
35 | "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n",
36 | "File \u001B[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[0;34m()\u001B[0m\n",
37 | "\u001B[0;31mKeyError\u001B[0m: 'other'",
38 | "\nThe above exception was the direct cause of the following exception:\n",
39 | "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)",
40 | "Cell \u001B[0;32mIn[7], line 7\u001B[0m\n\u001B[1;32m 5\u001B[0m sums \u001B[38;5;241m=\u001B[39m df\u001B[38;5;241m.\u001B[39msum(axis \u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0\u001B[39m)\n\u001B[1;32m 6\u001B[0m chatgpt_score \u001B[38;5;241m=\u001B[39m sums[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mgpt-3.5-turbo\u001B[39m\u001B[38;5;124m\"\u001B[39m]\n\u001B[0;32m----> 7\u001B[0m other_score \u001B[38;5;241m=\u001B[39m \u001B[43msums\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mother\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\n",
41 | "File \u001B[0;32m~/miniconda3/envs/Taiwan-LLaMa/lib/python3.10/site-packages/pandas/core/series.py:1007\u001B[0m, in \u001B[0;36mSeries.__getitem__\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 1004\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_values[key]\n\u001B[1;32m 1006\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m key_is_scalar:\n\u001B[0;32m-> 1007\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_get_value\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 1009\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_hashable(key):\n\u001B[1;32m 1010\u001B[0m \u001B[38;5;66;03m# Otherwise index.get_value will raise InvalidIndexError\u001B[39;00m\n\u001B[1;32m 1011\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m 1012\u001B[0m \u001B[38;5;66;03m# For labels that don't resolve as scalars like tuples and frozensets\u001B[39;00m\n",
42 | "File \u001B[0;32m~/miniconda3/envs/Taiwan-LLaMa/lib/python3.10/site-packages/pandas/core/series.py:1116\u001B[0m, in \u001B[0;36mSeries._get_value\u001B[0;34m(self, label, takeable)\u001B[0m\n\u001B[1;32m 1113\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_values[label]\n\u001B[1;32m 1115\u001B[0m \u001B[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001B[39;00m\n\u001B[0;32m-> 1116\u001B[0m loc \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mindex\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mlabel\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 1118\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(loc):\n\u001B[1;32m 1119\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_values[loc]\n",
43 | "File \u001B[0;32m~/miniconda3/envs/Taiwan-LLaMa/lib/python3.10/site-packages/pandas/core/indexes/base.py:3655\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[0;34m(self, key)\u001B[0m\n\u001B[1;32m 3653\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_engine\u001B[38;5;241m.\u001B[39mget_loc(casted_key)\n\u001B[1;32m 3654\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n\u001B[0;32m-> 3655\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[1;32m 3656\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[1;32m 3657\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[1;32m 3658\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[1;32m 3659\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[1;32m 3660\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n",
44 | "\u001B[0;31mKeyError\u001B[0m: 'other'"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "model_scores = []\n",
50 | "for path in Path(\"./\").glob(\"chatgpt_*.json\") :\n",
51 | " df = pd.read_json(path)\n",
52 | " # columns wise sum\n",
53 | " sums = df.sum(axis = 0)\n",
54 | " # chatgpt_score = sums[\"gpt-3.5-turbo\"]\n",
55 | " # sums = sums.drop(\"gpt-3.5-turbo\")\n",
56 | " "
57 | ],
58 | "metadata": {
59 | "collapsed": false,
60 | "ExecuteTime": {
61 | "end_time": "2023-08-10T14:59:29.242084Z",
62 | "start_time": "2023-08-10T14:59:28.397037Z"
63 | }
64 | },
65 | "id": "2a9d324b3ef84a84"
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "outputs": [],
71 | "source": [],
72 | "metadata": {
73 | "collapsed": false
74 | },
75 | "id": "48180e072b58c38f"
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "Python 3",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 2
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython2",
94 | "version": "2.7.6"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 5
99 | }
100 |
--------------------------------------------------------------------------------
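The second cell above dies with KeyError: 'other' because the executed revision indexed a hardcoded "other" column. A sketch that avoids the hardcoded name, assuming each chatgpt_*.json maps model names to per-question score lists (the format get_score_from_freetext.py writes):

    from pathlib import Path

    import pandas as pd

    rows = []
    for path in Path("./").glob("chatgpt_*.json"):
        df = pd.read_json(path)
        sums = df.sum(axis=0)  # total score per model column
        baseline = sums["gpt-3.5-turbo"]
        for model, total in sums.drop("gpt-3.5-turbo").items():
            rows.append({"model": model, "score_vs_chatgpt": total / baseline})
    print(pd.DataFrame(rows).sort_values("score_vs_chatgpt", ascending=False))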
/evaluation/run_tceval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import re
3 | from datasets import load_dataset
4 | from tqdm import tqdm
5 | from vllm import LLM, SamplingParams
6 | from transformers import AutoTokenizer
7 | from sumeval.metrics.rouge import RougeCalculator
8 | from openai import OpenAI
9 | from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
10 | import logging
11 | from pprint import pprint
12 |
13 | logging.basicConfig(
14 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
15 | )
16 | logger = logging.getLogger(__name__)
17 |
18 | DATASET_NAME = "yentinglin/TC-Eval"
19 | max_seq_len = 4096
20 |
21 | client = OpenAI(timeout=20.0, max_retries=100)
22 | openai_models = {
23 | "gpt-4-1106-preview",
24 | "gpt-3.5-turbo-1106",
25 | }
26 |
27 | anthropic = Anthropic(timeout=20.0, max_retries=100)
28 | anthropic_models = {
29 | "claude-2.0",
30 | "claude-2.1",
31 | "claude-instant-1.2",
32 | }
33 |
34 |
35 | # https://github.com/mtkresearch/MR-Models/blob/de1e10d27aed1798a4e1b22d145a45e509652b67/TC-Eval/evaluate.py#L16-L20
36 | def prefix_exact_match(gold: str, pred: str) -> float:
37 | if not pred:
38 | return 0
39 |
40 | return 1 if gold.strip() in pred.strip() else 0
41 |
42 |
43 | def parse_args():
44 | parser = argparse.ArgumentParser(description="Run TC-Eval")
45 | parser.add_argument(
46 | "--model",
47 | type=str,
48 | default="yentinglin/Taiwan-LLM-7B-v2.0.1-chat",
49 | help="Model name",
50 | )
51 | parser.add_argument(
52 | "--temperature", type=float, default=0.0, help="Sampling temperature"
53 | )
54 | parser.add_argument(
55 | "--max_tokens", type=int, default=128, help="Max tokens for generation"
56 | )
57 | parser.add_argument(
58 | "--tensor_parallel_size", type=int, default=1, help="Tensor parallel size"
59 | )
60 | return parser.parse_args()
61 |
62 |
63 | def main():
64 | args = parse_args()
65 |
66 | tc_eval_results = {}
67 |
68 | is_openai_chat_model = args.model in openai_models
69 | is_anthropic_chat_model = args.model in anthropic_models
70 | assert not (
71 | is_openai_chat_model and is_anthropic_chat_model
72 | ), "model cannot be both OpenAI and Anthropic chat model"
73 | if is_openai_chat_model:
74 | logging.info(f"Using OpenAI chat model: {args.model}")
75 |
76 | def get_openai_chat_response(
77 | model: str, user_prompt: str, prefill: str = ""
78 | ) -> str:
79 | """
80 | model is "gpt-3.5-turbo-1106" or "gpt-4-1106-preview"
81 | """
82 | response = client.chat.completions.create(
83 | model=model,
84 | messages=[{"role": "user", "content": user_prompt}]
85 | # include an assistant prefill turn only when one is provided, so
86 | # plain calls don't send an empty assistant message
87 | + ([{"role": "assistant", "content": prefill.strip()}] if prefill else []),
88 | max_tokens=args.max_tokens,
89 | temperature=args.temperature,
90 | )
91 | answer = response.choices[0].message.content
92 | return answer
93 |
94 | elif is_anthropic_chat_model:
95 | logging.info(f"Using Anthropic chat model: {args.model}")
96 |
97 | def get_anthropic_chat_response(
98 | model: str, user_prompt: str, prefill: str = ""
99 | ) -> str:
100 | while True:
101 | try:
102 | completion = anthropic.completions.create(
103 | model=args.model,
104 | max_tokens_to_sample=args.max_tokens,
105 | temperature=args.temperature,
106 | prompt=f"{HUMAN_PROMPT} {user_prompt.strip()}{AI_PROMPT}{prefill.strip()}",
107 | )
108 | except Exception as e:
109 | logging.error(f"Error: {e}")
110 | continue
111 | else:
112 | break
113 | return completion.completion
114 |
115 | else:
116 | logging.info(f"Using LLM model: {args.model}")
117 | tokenizer = AutoTokenizer.from_pretrained(args.model)
118 | sampling_params = SamplingParams(
119 | temperature=args.temperature, max_tokens=args.max_tokens
120 | )
121 | llm = LLM(
122 | model=args.model,
123 | tensor_parallel_size=args.tensor_parallel_size,
124 | max_num_batched_tokens=40960,
125 | quantization="AWQ" if "awq" in args.model.lower() else None,
126 | )
127 |
128 | drcd = load_dataset("yentinglin/TC-Eval", "DRCD", split="test")
129 | drcd = drcd.map(
130 | lambda x: {
131 | "user_prompt": f"請根據以下內容回答問題,且答案需盡可能簡短。注意:答案必須為內容的子字串。\n\n{x['paragraph']}\n\n問題:{x['question']}"
132 | }
133 | )
134 | if is_openai_chat_model:
135 | answers = []
136 | for row in tqdm(drcd):
137 | answer = get_openai_chat_response(args.model, row["user_prompt"])
138 | answers.append(answer)
139 | elif is_anthropic_chat_model:
140 | answers = []
141 | for row in tqdm(drcd):
142 | answer = get_anthropic_chat_response(args.model, row["user_prompt"])
143 | answers.append(answer)
144 | else:
145 | drcd = drcd.map(
146 | lambda x: {
147 | "prompt": tokenizer.apply_chat_template(
148 | [{"role": "user", "content": x["user_prompt"]}],
149 | tokenize=False,
150 | add_generation_prompt=True,
151 | )
152 | }
153 | )
154 | outputs = llm.generate(drcd["prompt"], sampling_params)
155 | # sort outputs by request_id
156 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
157 | # single answer is at outputs[0].outputs[0].text
158 | answers = [outputs[i].outputs[0].text for i in range(len(outputs))]
159 |
160 | scores = [
161 | max(
162 | prefix_exact_match(ref, ans) for ref in refs
163 | ) # prefix exact match, take the max score across all references
164 | for refs, ans in zip(drcd["references"], answers)
165 | ]
166 | drcd_em = sum(scores) / len(scores)
167 | tc_eval_results["DRCD"] = {"exact_match": drcd_em}
168 | pprint(tc_eval_results)
169 |
170 | fgc = load_dataset("yentinglin/TC-Eval", "FGC", split="test")
171 | # 'question' column has artifact in it, remove it
172 | # 03.維基百科Wikipedia一詞是取自哪兩個字的意義?
173 | # 04.海倫凱勒出生於哪一個城市?
174 | # remove artifact r'[0-9]{2}\.'
175 | fgc = fgc.map(lambda x: {"question": re.sub(r"^[0-9]{2}\.", "", x["question"])})  # str.replace treats the pattern literally; use a real regex
176 | fgc = fgc.map(
177 | lambda x: {
178 | "user_prompt": f"請根據以下內容回答問題,且答案需盡可能簡短。注意:答案必須為內容的子字串。\n\n{x['paragraph']}\n\n問題:{x['question']}"
179 | }
180 | )
181 | if is_openai_chat_model:
182 | answers = []
183 | for row in tqdm(fgc):
184 | answer = get_openai_chat_response(args.model, row["user_prompt"])
185 | answers.append(answer)
186 | elif is_anthropic_chat_model:
187 | answers = []
188 | for row in tqdm(fgc):
189 | answer = get_anthropic_chat_response(args.model, row["user_prompt"])
190 | answers.append(answer)
191 | else:
192 | fgc = fgc.map(
193 | lambda x: {
194 | "prompt": tokenizer.apply_chat_template(
195 | [{"role": "user", "content": x["user_prompt"]}],
196 | tokenize=False,
197 | add_generation_prompt=True,
198 | )
199 | }
200 | )
201 | outputs = llm.generate(fgc["prompt"], sampling_params)
202 | # sort outputs by request_id
203 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
204 | # single answer is at outputs[0].outputs[0].text
205 | answers = [outputs[i].outputs[0].text for i in range(len(outputs))]
206 | scores = [
207 | max(
208 | prefix_exact_match(ref, ans) for ref in refs
209 | ) # prefix exact match, take the max score across all references
210 | for refs, ans in zip(fgc["references"], answers)
211 | ]
212 | fgc_em = sum(scores) / len(scores)
213 | tc_eval_results["FGC"] = {"exact_match": fgc_em}
214 | pprint(tc_eval_results)
215 |
216 | ttqa = load_dataset(DATASET_NAME, "TTQA", split="test")
217 | _map_num_to_alph = {i: a for i, a in zip(range(5), "ABCDE")}
218 | _map_alph_to_num = {a: i for i, a in zip(range(5), "ABCDE")}
219 | if is_openai_chat_model:
220 | ttqa = ttqa.map(
221 | lambda x: {
222 | "user_prompt": f"問題: {x['question']} \n\n請從以下選項中選擇並回答: {';'.join([f'({_map_num_to_alph[i]}) {tg}' for i, tg in enumerate(x['choices'])])}\n\n只能回答英文字母"
223 | }
224 | )
225 | answers = []
226 | for row in tqdm(ttqa, desc="TTQA"):
227 | answer = get_openai_chat_response(args.model, row["user_prompt"])
228 | answers.append(answer)
229 | elif is_anthropic_chat_model:
230 | ttqa = ttqa.map(
231 | lambda x: {
232 | "user_prompt": f"問題: {x['question']} \n\n請從以下選項中選擇並回答: {';'.join([f'({_map_num_to_alph[i]}) {tg}' for i, tg in enumerate(x['choices'])])}\n\n只能回答英文字母 答案: ("
233 | }
234 | )
235 | answers = []
236 | for row in tqdm(ttqa, desc="TTQA"):
237 | answer = get_anthropic_chat_response(
238 | args.model, row["user_prompt"], prefill="答案: ("
239 | )
240 | answers.append(answer)
241 | else:
242 | ttqa = ttqa.map(
243 | lambda x: {
244 | "user_prompt": f"問題: {x['question']} \n\n請從以下選項中選擇並回答: {';'.join([f'({_map_num_to_alph[i]}) {tg}' for i, tg in enumerate(x['choices'])])}"
245 | }
246 | )
247 | ttqa = ttqa.map(
248 | lambda x: {
249 | "prompt": tokenizer.apply_chat_template(
250 | [{"role": "user", "content": x["user_prompt"]}],
251 | tokenize=False,
252 | add_generation_prompt=True,
253 | )
254 | + " ("
255 | }
256 | )
257 | outputs = llm.generate(ttqa["prompt"], sampling_params)
258 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
259 | answers = [outputs[i].outputs[0].text for i in range(len(outputs))]
260 | choices = [_map_alph_to_num.get(m.group(), "") if (m := re.search(r"[A-E]", x)) else "" for x in answers]  # first A-E letter; guards empty or "(A)"-style replies
261 | scores = [
262 | 1 if choice == answer else 0 for choice, answer in zip(choices, ttqa["answer"])
263 | ]
264 | ttqa_acc = sum(scores) / len(scores)
265 | tc_eval_results["TTQA"] = {"accuracy": ttqa_acc}
266 | pprint(tc_eval_results)
267 |
268 | tmmlu = load_dataset(DATASET_NAME, "TMMLU", split="test")
269 | tmmlu = tmmlu.map(lambda x: {"user_prompt": x["question"]})
270 | if is_openai_chat_model:
271 | answers = []
272 | tmmlu = tmmlu.map(lambda x: {"user_prompt": f"{x['question']}\n\n回答單一英文字母: ("})
273 | for row in tqdm(tmmlu, desc="TMMLU"):
274 | answer = get_openai_chat_response(args.model, row["user_prompt"])
275 | answers.append(answer)
276 | elif is_anthropic_chat_model:
277 | answers = []
278 | tmmlu = tmmlu.map(lambda x: {"user_prompt": f"{x['question']}\n\n回答單一英文字母"})
279 | for row in tqdm(tmmlu, desc="TMMLU"):
280 | answer = get_anthropic_chat_response(
281 | args.model, row["user_prompt"], prefill="答案: ("
282 | )
283 | answers.append(answer)
284 | else:
285 | tmmlu = tmmlu.map(
286 | lambda x: {
287 | "prompt": tokenizer.apply_chat_template(
288 | [{"role": "user", "content": x["user_prompt"]}],
289 | tokenize=False,
290 | add_generation_prompt=True,
291 | )
292 | + " ("
293 | }
294 | )
295 | outputs = llm.generate(tmmlu["prompt"], sampling_params)
296 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
297 | answers = [outputs[i].outputs[0].text for i in range(len(outputs))]
298 | scores = [
299 | 1 if answer and answer[0] == row["answer"] else 0 for answer, row in zip(answers, tmmlu)
300 | ]
301 | tmmlu_acc = sum(scores) / len(scores)
302 | tc_eval_results["TMMLU"] = {"accuracy": tmmlu_acc}
303 | pprint(tc_eval_results)
304 |
305 | xsum = load_dataset("yentinglin/TC-Eval", "XSUM_TC", split="test")
306 | # rename "Unnamed: 0" to "id"
307 | xsum = xsum.rename_column("Unnamed: 0", "id")
308 | xsum = xsum.map(lambda x: {"user_prompt": f"{x['document']}\n\n根據上述文章以一句話來總結"})
309 | if is_openai_chat_model:
310 | answers = []
311 | for row in tqdm(xsum, desc="XSUM"):
312 | answer = get_openai_chat_response(args.model, row["user_prompt"])
313 | answers.append(answer)
314 | elif is_anthropic_chat_model:
315 | answers = []
316 | for row in tqdm(xsum, desc="XSUM"):
317 | answer = get_anthropic_chat_response(args.model, row["user_prompt"])
318 | answers.append(answer)
319 | else:
320 | xsum = xsum.map(
321 | lambda x: {
322 | "prompt": tokenizer.apply_chat_template(
323 | [{"role": "user", "content": x["user_prompt"]}],
324 | tokenize=False,
325 | add_generation_prompt=True,
326 | )
327 | }
328 | )
329 | outputs = llm.generate(xsum["prompt"], sampling_params)
330 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
331 | answers = [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
332 | scorer = RougeCalculator(stemming=True, lang="zh")
333 | scores = [
334 | scorer.rouge_n(summary=answer, references=ref, n=2)  # sumeval exposes ROUGE-2 via rouge_n
335 | for answer, ref in zip(answers, xsum["summary"])
336 | ]
337 | xsum_rouge2 = sum(scores) / len(scores)
338 | tc_eval_results["XSUM"] = {"rouge2": xsum_rouge2}
339 | pprint(tc_eval_results)
340 |
341 | imdb = load_dataset("yentinglin/TC-Eval", "IMDB_TC", split="test")
342 | imdb = imdb.map(
343 | lambda x: {
344 | "user_prompt": f"評論:{x['text']}\n\n請閱讀以上評論,並回答此評論是正面還是負面,如果是正面,請回答'(1)';,如果是負面,請回答'(0)'"
345 | }
346 | )
347 | if is_openai_chat_model:
348 | answers = []
349 | imdb = imdb.map(
350 | lambda x: {
351 | "user_prompt": f"評論:{x['text']}\n\n請閱讀以上評論,並回答此評論是正面還是負面,如果是正面,請回答'1';,如果是負面,請回答'0'"
352 | }
353 | )
354 | for row in tqdm(imdb, desc="IMDB"):
355 | answer = get_openai_chat_response(args.model, row["user_prompt"])
356 | # use regex to get the 0 or 1. if not found, false
357 | answer = re.search(r"[0-1]", answer)
358 | answer = answer.group() if answer else "2"
359 | answers.append(answer)
360 | print(answer)
361 | elif is_anthropic_chat_model:
362 | answers = []
363 | for row in tqdm(imdb, desc="IMDB"):
364 | answer = get_anthropic_chat_response(
365 | args.model, row["user_prompt"], prefill="答案: ("
366 | )
367 | # use regex to get the 0 or 1. if not found, false
368 | answer = re.search(r"[0-1]", answer)
369 | answer = answer.group() if answer else "2"
370 | answers.append(answer)
371 | print(answer)
372 | else:
373 | imdb = imdb.map(
374 | lambda x: {
375 | "prompt": tokenizer.apply_chat_template(
376 | [{"role": "user", "content": x["user_prompt"]}],
377 | tokenize=False,
378 | add_generation_prompt=True,
379 | )
380 | + " ("
381 | }
382 | )
383 | outputs = llm.generate(imdb["prompt"], sampling_params)
384 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
385 | answers = [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
386 | scores = [
387 | 1 if answer and answer[0] == str(row["label"]) else 0
388 | for answer, row in zip(answers, imdb)
389 | ]
390 | imdb_acc = sum(scores) / len(scores)
391 | tc_eval_results["IMDB"] = {"accuracy": imdb_acc}
392 | pprint(tc_eval_results)
393 |
394 | table = load_dataset("yentinglin/TC-Eval", "PenguinsInTable_TC", split="test")
395 | # Hacks from https://github.com/mtkresearch/MR-Models/blob/a2d9a3972c4d2a0c982485eb817a1ce4ccdd21a4/TC-Eval/inference/scenarios.py#L168
396 | table = table.map(lambda row: {"question": row["question"].rstrip("回答:")})
397 | _map_num_to_alph = {i: a for i, a in zip(range(5), "ABCDE")}
398 | table = table.map(
399 | lambda row: {
400 | "user_prompt": f"{row['question']} \n請從以下選項中選擇並回答: {';'.join([f'({_map_num_to_alph[i]}) {tg}' for i, tg in enumerate(row['choices'])])}"
401 | }
402 | )
403 | if is_openai_chat_model:
404 | choices = []
405 | for row in tqdm(table, desc="PenguinsInTable"):
406 | answer = get_openai_chat_response(
407 | args.model, row["user_prompt"] + "\n只能回答單一英文字母"
408 | )
409 | # use regex to get the first English alphabet, if not found, use empty string
410 | answer = re.search(r"[A-E]", answer)
411 | answer = answer.group() if answer else ""
412 | # map
413 | answer = _map_alph_to_num.get(answer, "")
414 | choices.append(answer)
415 | print(answer)
416 | elif is_anthropic_chat_model:
417 | choices = []
418 | for row in tqdm(table, desc="PenguinsInTable"):
419 | answer = get_anthropic_chat_response(
420 | args.model, row["user_prompt"] + "\n只能回答單一英文字母", prefill="答案: ("
421 | )
422 | # use regex to get the first English alphabet, if not found, use empty string
423 | answer = re.search(r"[A-E]", answer)
424 | answer = answer.group() if answer else ""
425 | # map
426 | answer = _map_alph_to_num.get(answer, "")
427 | choices.append(answer)
428 | print(answer)
429 | else:
430 | table = table.map(
431 | lambda row: {
432 | "prompt": tokenizer.apply_chat_template(
433 | [{"role": "user", "content": row["user_prompt"]}],
434 | tokenize=False,
435 | add_generation_prompt=True,
436 | )
437 | + " ("
438 | }
439 | )
440 | outputs = llm.generate(table["prompt"], sampling_params)
441 | outputs = sorted(outputs, key=lambda x: int(x.request_id))
442 | answers = [outputs[i].outputs[0].text for i in range(len(outputs))]
443 | choices = [_map_alph_to_num.get(m.group(), "") if (m := re.search(r"[A-E]", x)) else "" for x in answers]  # first A-E letter; guards empty replies
444 | scores = [
445 | 1 if choice == answer else 0 for choice, answer in zip(choices, table["answer"])
446 | ]
447 | table_acc = sum(scores) / len(scores)
448 | tc_eval_results["PenguinsInTable"] = {"accuracy": table_acc}
449 | pprint(tc_eval_results)
450 |
451 |
452 | if __name__ == "__main__":
453 | main()
454 |
--------------------------------------------------------------------------------
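Despite its name, prefix_exact_match (taken from the MR-Models implementation linked in the source) scores substring containment, and the DRCD/FGC loops take the max over all references. A self-contained check with hypothetical strings:

    def prefix_exact_match(gold: str, pred: str) -> float:
        # same logic as run_tceval.py: containment, not a literal prefix match
        if not pred:
            return 0
        return 1 if gold.strip() in pred.strip() else 0

    references = ["台北", "臺北"]  # hypothetical gold answers
    prediction = "答案是台北。"    # hypothetical model output
    print(max(prefix_exact_match(ref, prediction) for ref in references))  # 1
    print(prefix_exact_match("高雄", prediction))  # 0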
/evaluation/zh_tw_bench/question.jsonl:
--------------------------------------------------------------------------------
1 | {"index":0,"question_id":1,"category":"generic","turns":["\u6211\u8a72\u5982\u4f55\u63d0\u5347\u6211\u7684\u6642\u9593\u7ba1\u7406\u6280\u5de7\uff1f"],"reference":null}
2 | {"index":1,"question_id":2,"category":"generic","turns":["\u4ec0\u9ebc\u662f\u6700\u6709\u6548\u7684\u61c9\u5c0d\u58d3\u529b\u7684\u65b9\u5f0f?"],"reference":null}
3 | {"index":2,"question_id":3,"category":"generic","turns":["Python \u548c JavaScript \u7a0b\u5f0f\u8a9e\u8a00\u4e3b\u8981\u7684\u5dee\u7570\u662f\u4ec0\u9ebc?"],"reference":null}
4 | {"index":3,"question_id":4,"category":"generic","turns":["\u6211\u8a72\u5982\u4f55\u5728\u5bb6\u5de5\u4f5c\u6642\u63d0\u9ad8\u6211\u7684\u751f\u7522\u529b\uff1f"],"reference":null}
5 | {"index":4,"question_id":5,"category":"generic","turns":["\u4f60\u53ef\u4ee5\u89e3\u91cb\u4e00\u4e0b\u91cf\u5b50\u8a08\u7b97\u7684\u57fa\u790e\u77e5\u8b58\u55ce\uff1f"],"reference":null}
6 | {"index":5,"question_id":6,"category":"generic","turns":["\u690d\u7269\u6027\u548c\u52d5\u7269\u6027\u86cb\u767d\u8cea\u4f86\u6e90\u4e4b\u9593\u6709\u4ec0\u9ebc\u5340\u5225\uff1f"],"reference":null}
7 | {"index":6,"question_id":7,"category":"generic","turns":["\u6211\u8a72\u5982\u4f55\u63d0\u5347\u6211\u7684\u6279\u5224\u6027\u601d\u8003\u80fd\u529b?"],"reference":null}
8 | {"index":7,"question_id":8,"category":"generic","turns":["\u4eca\u5929\u6559\u80b2\u90e8\u9580\u9762\u81e8\u7684\u4e3b\u8981\u6311\u6230\u662f\u4ec0\u9ebc\uff1f"],"reference":null}
9 | {"index":8,"question_id":9,"category":"generic","turns":["\u4ec0\u9ebc\u662f\u4e3b\u8981\u5f71\u97ff\u6d88\u8cbb\u8005\u884c\u70ba\u7684\u56e0\u7d20\uff1f"],"reference":null}
10 | {"index":9,"question_id":10,"category":"generic","turns":["\u5728\u8077\u5834\u4e2d\uff0c\u6700\u6709\u6548\u7684\u885d\u7a81\u89e3\u6c7a\u7b56\u7565\u662f\u4ec0\u9ebc\uff1f"],"reference":null}
11 | {"index":10,"question_id":11,"category":"knowledge","turns":["\u4f7f\u7528\u4e00\u6b21\u6027\u5851\u81a0\u74f6\u8207\u53ef\u91cd\u8907\u4f7f\u7528\u7684\u74f6\u5b50\u5c0d\u74b0\u5883\u548c\u4eba\u985e\u5065\u5eb7\u53ef\u80fd\u6709\u4ec0\u9ebc\u6f5b\u5728\u5f71\u97ff\uff1f"],"reference":null}
12 | {"index":11,"question_id":12,"category":"knowledge","turns":["\u4f60\u6703\u8003\u616e\u54ea\u4e9b\u56e0\u7d20\u4f86\u8a2d\u8a08\u4e00\u500b\u5305\u5bb9\u4e14\u6613\u65bc\u4f7f\u7528\u7684\u516c\u5171\u4ea4\u901a\u7cfb\u7d71\uff1f"],"reference":null}
13 | {"index":12,"question_id":13,"category":"knowledge","turns":["\u653f\u5e9c\u5982\u4f55\u5229\u7528\u8ca1\u653f\u548c\u8ca8\u5e63\u653f\u7b56\u4f86\u5c0d\u6297\u7d93\u6fdf\u8870\u9000?"],"reference":null}
14 | {"index":13,"question_id":14,"category":"knowledge","turns":["\u8a9e\u8a00\u548c\u6587\u5316\u969c\u7919\u5982\u4f55\u5f71\u97ff\u4eba\u5011\u5728\u591a\u5143\u6587\u5316\u793e\u6703\u4e2d\u7684\u6e9d\u901a\u8207\u5efa\u7acb\u95dc\u4fc2\uff1f"],"reference":null}
15 | {"index":14,"question_id":15,"category":"knowledge","turns":["\u63cf\u8ff0\u4e00\u500b\u60c5\u5883\uff0c\u5176\u4e2d\u4eba\u5de5\u667a\u6167\u80fd\u88ab\u7528\u4f86\u63d0\u5347\u91ab\u7642\u670d\u52d9\u7684\u54c1\u8cea\u548c\u6548\u7387\u3002"],"reference":null}
16 | {"index":15,"question_id":16,"category":"knowledge","turns":["\u89e3\u91cb\u4f7f\u7528CRISPR-Cas9\u6280\u8853\u9032\u884c\u57fa\u56e0\u7de8\u8f2f\u7684\u904e\u7a0b\uff0c\u4e26\u8a0e\u8ad6\u5176\u6f5b\u5728\u7684\u61c9\u7528\u4ee5\u53ca\u502b\u7406\u542b\u7fa9\u3002"],"reference":null}
17 | {"index":16,"question_id":17,"category":"knowledge","turns":["\u75ab\u82d7\u662f\u5982\u4f55\u4f5c\u7528\u4ee5\u4fdd\u8b77\u500b\u4eba\u548c\u793e\u5340\u514d\u53d7\u50b3\u67d3\u75c5\u7684\u4fb5\u5bb3\uff0c\u800c\u4f55\u8b02\u7fa4\u9ad4\u514d\u75ab\uff1f"],"reference":null}
18 | {"index":17,"question_id":18,"category":"knowledge","turns":["\u793e\u4ea4\u5a92\u9ad4\u5e73\u53f0\u5982\u4f55\u5f71\u97ff\u4eba\u5011\u6d88\u8cbb\u548c\u5206\u4eab\u65b0\u805e\u7684\u65b9\u5f0f\uff0c\u4ee5\u53ca\u9019\u5c0d\u50b3\u64ad\u932f\u8aa4\u8a0a\u606f\u6709\u4ec0\u9ebc\u53ef\u80fd\u7684\u5f71\u97ff\uff1f"],"reference":null}
19 | {"index":18,"question_id":19,"category":"knowledge","turns":["\u6587\u5316\u3001\u793e\u6703\u548c\u7d93\u6fdf\u56e0\u7d20\u5982\u4f55\u5f71\u97ff\u4eba\u5011\u7684\u98f2\u98df\u9078\u64c7\uff0c\u4ee5\u53ca\u5982\u4f55\u5229\u7528\u9019\u4e9b\u77e5\u8b58\u4f86\u63a8\u5ee3\u66f4\u5065\u5eb7\u7684\u98f2\u98df\uff1f"],"reference":null}
20 | {"index":19,"question_id":20,"category":"knowledge","turns":["\u89e3\u91cb\u81ea\u7136\u9078\u64c7\u7684\u904e\u7a0b\uff0c\u4ee5\u53ca\u5b83\u5982\u4f55\u4fc3\u9032\u7269\u7a2e\u7684\u6f14\u5316\u548c\u9069\u61c9\u3002"],"reference":null}
21 | {"index":20,"question_id":21,"category":"roleplay","turns":["\u4f60\u6703\u5982\u4f55\u4ee5\u4e00\u4f4d\u4e2d\u4e16\u7d00\u9a0e\u58eb\u7684\u8eab\u4efd\u5728\u7687\u5bb6\u5bb4\u6703\u4e2d\u81ea\u6211\u4ecb\u7d39?"],"reference":null}
22 | {"index":21,"question_id":22,"category":"roleplay","turns":["\u4f5c\u70ba\u4e00\u500b\u6d77\u76dc\u8239\u9577\uff0c\u4f60\u6703\u5c0d\u4f60\u7684\u8239\u54e1\u8aaa\u4ec0\u9ebc\u4f86\u6fc0\u52f5\u4ed6\u5011\u5c0b\u627e\u96b1\u85cf\u7684\u5bf6\u85cf\uff1f"],"reference":null}
23 | {"index":22,"question_id":23,"category":"roleplay","turns":["\u5982\u679c\u4f60\u662f\u4e00\u500b\u838e\u58eb\u6bd4\u4e9e\u7684\u89d2\u8272\uff0c\u4f60\u6703\u5982\u4f55\u5728\u7368\u767d\u4e2d\u5c0d\u67d0\u4eba\u8868\u767d\u4f60\u7684\u611b\uff1f"],"reference":null}
24 | {"index":23,"question_id":24,"category":"roleplay","turns":["\u8eab\u70ba\u4e00\u500b\u8d85\u7d1a\u82f1\u96c4\uff0c\u4f60\u6703\u5982\u4f55\u5411\u4e00\u500b\u597d\u5947\u7684\u5c0f\u5b69\u89e3\u91cb\u4f60\u7684\u8d77\u6e90\u6545\u4e8b\uff1f"],"reference":null}
25 | {"index":24,"question_id":25,"category":"roleplay","turns":["\u60f3\u50cf\u4f60\u662f\u4f86\u81ea\u516c\u51433000\u5e74\u7684\u6642\u9593\u65c5\u884c\u8005\u3002\u4f60\u6703\u544a\u8a34\u4eba\u5011\u54ea\u4e9b\u79d1\u6280\u9032\u6b65\u5462\uff1f"],"reference":null}
26 | {"index":25,"question_id":26,"category":"roleplay","turns":["\u4f5c\u70ba\u4e00\u540d\u9ad4\u80b2\u8a55\u8ad6\u54e1\uff0c\u63cf\u8ff0\u5728\u51a0\u8ecd\u8cfd\u6700\u5f8c\u5e7e\u79d2\u4e2d\u7684\u81f4\u52dd\u4e00\u64ca\u3002"],"reference":null}
27 | {"index":26,"question_id":27,"category":"roleplay","turns":["\u5047\u88dd\u4f60\u662f\u4e00\u4f4d\u4e16\u754c\u77e5\u540d\u7684\u5927\u5eda\u3002\u4f60\u6703\u5982\u4f55\u5411\u4e00\u7d44\u8a55\u5be9\u5718\u63cf\u8ff0\u4f60\u7684\u62db\u724c\u83dc\uff1f"],"reference":null}
28 | {"index":27,"question_id":28,"category":"roleplay","turns":["\u4f60\u662f\u4e00\u4f4d\u6500\u767b\u5230\u73e0\u7a46\u6717\u746a\u5cf0\u9802\u5cf0\u7684\u5c71\u5cb3\u6500\u767b\u8005\u3002\u63cf\u8ff0\u4f60\u7684\u60c5\u7dd2\u4ee5\u53ca\u5f9e\u5c71\u9802\u770b\u4e0b\u53bb\u7684\u666f\u8272\u3002"],"reference":null}
29 | {"index":28,"question_id":29,"category":"roleplay","turns":["\u4f5c\u70ba\u706b\u661f\u7684\u592a\u7a7a\u6b96\u6c11\u8005\uff0c\u63cf\u8ff0\u4f60\u7684\u65e5\u5e38\u751f\u6d3b\u4ee5\u53ca\u5728\u5176\u4ed6\u661f\u7403\u751f\u6d3b\u6240\u9762\u81e8\u7684\u6311\u6230\u3002"],"reference":null}
30 | {"index":29,"question_id":30,"category":"roleplay","turns":["\u5047\u88dd\u81ea\u5df1\u662f\u5f8c\u672b\u65e5\u4e16\u754c\u4e2d\u7684\u4e00\u500b\u89d2\u8272\u3002\u63cf\u8ff0\u4f60\u5982\u4f55\u751f\u5b58\u4ee5\u53ca\u4f60\u9047\u5230\u7684\u76df\u53cb\u3002"],"reference":null}
31 | {"index":30,"question_id":31,"category":"common-sense","turns":["\u4f60\u5982\u4f55\u5224\u65b7\u4e00\u5bb6\u9910\u5ef3\u662f\u4e0d\u662f\u7576\u5730\u4eba\u559c\u6b61\uff0c\u9084\u662f\u4e3b\u8981\u5438\u5f15\u89c0\u5149\u5ba2\uff0c\u4e26\u4e14\u9019\u500b\u8cc7\u8a0a\u70ba\u4f55\u53ef\u80fd\u6709\u7528\uff1f"],"reference":null}
32 | {"index":31,"question_id":32,"category":"common-sense","turns":["\u6709\u54ea\u4e9b\u7d30\u5fae\u7684\u7dda\u7d22\u53ef\u4ee5\u6697\u793a\u67d0\u4eba\u5be6\u969b\u4e0a\u5c0d\u4e00\u500b\u4e3b\u984c\u6216\u5c0d\u8a71\u611f\u5230\u56f0\u60d1\u6216\u7121\u77e5\uff0c\u4f46\u4ed6\u5011\u5047\u88dd\u81ea\u5df1\u7406\u89e3\u5462\uff1f"],"reference":null}
33 | {"index":32,"question_id":33,"category":"common-sense","turns":["\u70ba\u4ec0\u9ebc\u6709\u4eba\u6703\u9078\u64c7\u4f7f\u7528\u7d19\u8cea\u5730\u5716\u6216\u8a62\u554f\u8def\u7dda\uff0c\u800c\u4e0d\u662f\u4f9d\u8cf4 GPS \u88dd\u7f6e\u6216\u667a\u6167\u624b\u6a5f\u61c9\u7528\u7a0b\u5f0f\u5462?"],"reference":null}
34 | {"index":33,"question_id":34,"category":"common-sense","turns":["\u4f60\u5982\u4f55\u5224\u65b7\u4e00\u500b\u4eba\u662f\u771f\u5fc3\u5c0d\u4e00\u5834\u5c0d\u8a71\u611f\u8208\u8da3\uff0c\u9084\u662f\u53ea\u662f\u51fa\u65bc\u79ae\u8c8c\uff1f"],"reference":null}
35 | {"index":34,"question_id":35,"category":"common-sense","turns":["\u70ba\u4ec0\u9ebc\u6709\u4eba\u6703\u512a\u5148\u9078\u64c7\u5728\u5c0f\u578b\u3001\u7576\u5730\u64c1\u6709\u7684\u5546\u5e97\u8cfc\u7269\uff0c\u800c\u975e\u5927\u578b\u9023\u9396\u5546\u5e97\uff0c\u5373\u4f7f\u50f9\u683c\u8f03\u9ad8\u5462?"],"reference":null}
36 | {"index":35,"question_id":36,"category":"common-sense","turns":["\u4f60\u5982\u4f55\u8a55\u4f30\u4e00\u500b\u4fe1\u606f\u4f86\u6e90\u7684\u53ef\u4fe1\u5ea6\uff0c\u5982\u65b0\u805e\u6587\u7ae0\u6216\u535a\u5ba2\u6587\u7ae0\uff0c\u800c\u4e0d\u50c5\u50c5\u4f9d\u8cf4\u65bc\u4f5c\u8005\u6216\u51fa\u7248\u8005\u7684\u8072\u8b7d\uff1f"],"reference":null}
37 | {"index":36,"question_id":37,"category":"common-sense","turns":["\u70ba\u4ec0\u9ebc\u6709\u4e9b\u4eba\u6703\u4eab\u53d7\u88ab\u5687\u5230\u7684\u611f\u89ba\uff0c\u50cf\u662f\u770b\u6050\u6016\u96fb\u5f71\u6216\u8005\u73a9\u904e\u5c71\u8eca\uff0c\u800c\u6709\u4e9b\u4eba\u537b\u907f\u514d\u9019\u4e9b\u7d93\u9a57\uff1f"],"reference":null}
38 | {"index":37,"question_id":38,"category":"common-sense","turns":["\u5982\u4f55\u900f\u904e\u89c0\u5bdf\u5176\u4ed6\u4eba\u5728\u793e\u6703\u60c5\u5883\u4e2d\u7684\u884c\u70ba\uff0c\u63d0\u4f9b\u95dc\u65bc\u6587\u5316\u898f\u7bc4\u548c\u671f\u671b\u7684\u7dda\u7d22\uff1f"],"reference":null}
39 | {"index":38,"question_id":39,"category":"common-sense","turns":["\u6211\u5011\u662f\u5426\u6709\u9053\u5fb7\u7fa9\u52d9\u53bb\u63a2\u7d22\u592a\u7a7a\uff0c\u6216\u8005\u6211\u5011\u61c9\u8a72\u5148\u5c08\u6ce8\u65bc\u89e3\u6c7a\u5730\u7403\u7684\u554f\u984c\uff1f"],"reference":null}
40 | {"index":39,"question_id":40,"category":"common-sense","turns":["\u5728\u4e00\u500b\u81ea\u52d5\u5316\u65e5\u76ca\u666e\u904d\u7684\u4e16\u754c\u4e2d\uff0c\u6211\u5011\u662f\u61c9\u8a72\u66f4\u512a\u5148\u8003\u616e\u5275\u9020\u5c31\u696d\u6a5f\u6703\uff0c\u9084\u662f\u79d1\u6280\u9032\u6b65\uff1f"],"reference":null}
41 | {"index":40,"question_id":41,"category":"fermi","turns":["\u4e00\u500b\u4eba\u5e73\u5747\u5728\u4e00\u751f\u4e2d\u6703\u7728\u773c\u591a\u5c11\u6b21\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u4e00\u6b65\u4e00\u6b65\u5730\u5f15\u5c0e\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u3002"],"reference":null}
42 | {"index":41,"question_id":42,"category":"fermi","turns":["\u9e7d\u7c92\u4e2d\u6709\u591a\u5c11\u500b\u539f\u5b50\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u9010\u6b65\u5f15\u9818\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
43 | {"index":42,"question_id":43,"category":"fermi","turns":["\u6bcf\u5929\u5728\u5730\u7403\u4e0a\u767c\u751f\u591a\u5c11\u6b21\u9583\u96fb\u6253\u64ca\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u4e00\u6b65\u6b65\u5f15\u5c0e\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
44 | {"index":43,"question_id":44,"category":"fermi","turns":["\u50cf\u5728\u96fb\u5f71\u300c\u98db\u5c4b\u74b0\u904a\u8a18\u300d\u4e2d\uff0c\u9700\u8981\u591a\u5c11\u6c23\u7403\u624d\u80fd\u628a\u4e00\u68df\u623f\u5b50\u8209\u8d77\u4f86\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u8b93\u8b80\u8005\u9010\u6b65\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
45 | {"index":44,"question_id":45,"category":"fermi","turns":["\u5168\u7403\u6bcf\u5206\u9418\u50b3\u9001\u591a\u5c11\u7c21\u8a0a\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u5f15\u5c0e\u8b80\u8005\u9010\u6b65\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
46 | {"index":45,"question_id":46,"category":"fermi","turns":["\u5730\u7403\u4e0a\u6bcf\u5929\u6709\u591a\u5c11\u5b57\u8a5e\u88ab\u8aaa\u51fa\uff1f\u8a66\u8457\u8aaa\u660e\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u4e00\u6b65\u6b65\u5f15\u5c0e\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
47 | {"index":46,"question_id":47,"category":"fermi","turns":["\u5728\u4e00\u500b\u5178\u578b\u7684\u51ac\u5b63\u6703\u4e0b\u591a\u5c11\u96ea\u82b1\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u4e00\u6b65\u6b65\u5730\u5e36\u9818\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
48 | {"index":47,"question_id":48,"category":"fermi","turns":["\u6240\u6709\u66f8\u7c4d\u7e3d\u5171\u6709\u591a\u5c11\u9801\u6578\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u8b93\u8b80\u8005\u9010\u6b65\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
49 | {"index":48,"question_id":49,"category":"fermi","turns":["\u81ea\u751f\u547d\u958b\u59cb\u4ee5\u4f86\uff0c\u5730\u7403\u7e5e\u592a\u967d\u65cb\u8f49\u4e86\u591a\u5c11\u6b21\uff1f\u8acb\u5617\u8a66\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u4e00\u6b65\u4e00\u6b65\u5f15\u5c0e\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
50 | {"index":49,"question_id":50,"category":"fermi","turns":["\u6b77\u53f2\u4e0a\u5df2\u7d93\u9304\u88fd\u4e86\u591a\u5c11\u9996\u6b4c\u66f2\uff1f\u8a66\u8457\u89e3\u91cb\u4f60\u7684\u7b54\u6848\u3002\u4f60\u7684\u89e3\u91cb\u61c9\u8a72\u4e00\u6b65\u4e00\u6b65\u5730\u5e36\u9818\u8b80\u8005\u7406\u89e3\u4f60\u7684\u63a8\u7406\u904e\u7a0b\u3002"],"reference":null}
51 | {"index":50,"question_id":51,"category":"counterfactual","turns":["\u5982\u679c\u7db2\u969b\u7db2\u8def\u662f\u5728\u6587\u85dd\u5fa9\u8208\u6642\u671f\u88ab\u767c\u660e\u7684\u8a71\u6703\u600e\u6a23\u5462\uff1f"],"reference":null}
52 | {"index":51,"question_id":52,"category":"counterfactual","turns":["\u5982\u679c\u963f\u8332\u7279\u514b\u4eba\u6210\u529f\u62b5\u6297\u4e86\u897f\u73ed\u7259\u5f81\u670d\u8005\u6703\u600e\u6a23\u5462\uff1f"],"reference":null}
53 | {"index":52,"question_id":53,"category":"counterfactual","turns":["\u5982\u679c14\u4e16\u7d00\u7684\u9ed1\u6b7b\u75c5\u6c92\u6709\u767c\u751f\uff0c\u6703\u600e\u9ebc\u6a23\uff1f"],"reference":null}
54 | {"index":53,"question_id":54,"category":"counterfactual","turns":["\u5982\u679c\u827e\u85a9\u514b\u00b7\u725b\u9813\u5c08\u6ce8\u65bc\u751f\u7269\u5b78\u800c\u975e\u7269\u7406\u5b78\uff0c\u6703\u600e\u9ebc\u6a23\u5462?"],"reference":null}
55 | {"index":54,"question_id":55,"category":"counterfactual","turns":["\u5982\u679c\u62ab\u982d\u56db\u6a02\u5718\u5f9e\u672a\u7d44\u6210\u7684\u8a71\u6703\u600e\u6a23\uff1f"],"reference":null}
56 | {"index":55,"question_id":56,"category":"counterfactual","turns":["\u5982\u679c\u827e\u502b\u00b7\u5716\u9748\u5728\u4e8c\u6230\u671f\u9593\u6c92\u6709\u7834\u89e3\u6069\u5c3c\u683c\u746a\u5bc6\u78bc\uff0c\u6703\u600e\u9ebc\u6a23\uff1f"],"reference":null}
57 | {"index":56,"question_id":57,"category":"counterfactual","turns":["\u5982\u679c\u8607\u4f0a\u58eb\u904b\u6cb3\u5f9e\u672a\u88ab\u5efa\u9020\u904e\u5462\uff1f"],"reference":null}
58 | {"index":57,"question_id":58,"category":"counterfactual","turns":["\u5982\u679c\u746a\u96c5\u6587\u660e\u5f9e\u672a\u795e\u79d8\u5d29\u6f70\u6703\u600e\u6a23?"],"reference":null}
59 | {"index":58,"question_id":59,"category":"counterfactual","turns":["\u5982\u679c\u514b\u91cc\u65af\u591a\u798f\u00b7\u54e5\u502b\u5e03\u6c92\u6709\u767c\u73fe\u7f8e\u6d32\u6703\u600e\u6a23\uff1f"],"reference":null}
60 | {"index":59,"question_id":60,"category":"counterfactual","turns":["\u5982\u679c\u6587\u68ee\u7279\u00b7\u68b5\u9ad8\u5728\u4ed6\u7684\u4e00\u751f\u4e2d\u662f\u4e00\u4f4d\u6210\u529f\u7684\u85dd\u8853\u5bb6\u6703\u600e\u6a23\u5462\uff1f"],"reference":null}
61 | {"index":60,"question_id":61,"category":"coding","turns":["\u958b\u767c\u4e00\u500b C++ \u7a0b\u5f0f\uff0c\u9010\u884c\u8b80\u53d6\u6587\u5b57\u6a94\uff0c\u4e26\u8a08\u7b97\u6a94\u6848\u4e2d\u7279\u5b9a\u55ae\u8a5e\u7684\u51fa\u73fe\u6b21\u6578\u3002"],"reference":null}
62 | {"index":61,"question_id":62,"category":"coding","turns":["\u5be6\u4f5c\u4e00\u500b Python \u51fd\u6578\uff0c\u4f7f\u7528\u52d5\u614b\u898f\u5283\u4f86\u627e\u51fa\u5169\u500b\u8f38\u5165\u5b57\u4e32\u7684\u6700\u9577\u5171\u540c\u5b50\u5e8f\u5217\u3002"],"reference":null}
63 | {"index":62,"question_id":63,"category":"coding","turns":["\u5728Python\u4e2d\u5be6\u73fe\u4e00\u500b\u6b63\u5247\u8868\u9054\u5f0f\u4f86\u9a57\u8b49\u96fb\u5b50\u90f5\u4ef6\u5730\u5740\u3002"],"reference":null}
64 | {"index":63,"question_id":64,"category":"coding","turns":["\u5beb\u4e00\u500b\u7a0b\u5f0f\u4f86\u627e\u51fa\u7b2c n \u500b\u8cbb\u6ce2\u90a3\u5951\u6578\u5217\u6578\u5b57\uff0c\u4e26\u4f7f\u7528\u52d5\u614b\u898f\u5283\u3002"],"reference":null}
65 | {"index":64,"question_id":65,"category":"coding","turns":["\u5728\u6392\u5e8f\u7684\u9663\u5217\u4e2d\u5be6\u4f5c\u4e00\u500b\u4e8c\u5143\u641c\u5c0b\u6f14\u7b97\u6cd5\u4f86\u627e\u51fa\u7279\u5b9a\u5143\u7d20\u3002"],"reference":null}
66 | {"index":65,"question_id":66,"category":"coding","turns":["\u5728 Python \u4e2d\u4f7f\u7528\u5169\u500b\u5806\u758a\u5be6\u73fe\u4e00\u500b\u968a\u5217\u8cc7\u6599\u7d50\u69cb\u3002"],"reference":null}
67 | {"index":66,"question_id":67,"category":"coding","turns":["\u5be6\u4f5c\u4e00\u500b\u7a0b\u5f0f\u4f86\u627e\u51fa\u5169\u500b\u9663\u5217\u4e2d\u7684\u5171\u540c\u5143\u7d20\uff0c\u4e14\u4e0d\u4f7f\u7528\u4efb\u4f55\u984d\u5916\u7684\u8cc7\u6599\u7d50\u69cb\u3002"],"reference":null}
68 | {"index":67,"question_id":68,"category":"math","turns":["\u7d66\u5b9a f(x) = 5x^3 - 2x + 3\uff0c\u8acb\u6c42\u51fa f(2) \u7684\u503c\u3002"],"reference":null}
69 | {"index":68,"question_id":69,"category":"math","turns":["\u6c42\u89e3\u65b9\u7a0b\u5f0f 3x + 10 = 5(x - 2) \u4e2d\u7684 x \u503c\u3002"],"reference":null}
70 | {"index":69,"question_id":70,"category":"math","turns":["\u5982\u679c\u4e00\u689d\u7dda\u6bb5\u7684\u7aef\u9ede\u662f (2, -2) \u548c (10, 4)\uff0c\u90a3\u9ebc\u9019\u689d\u7dda\u6bb5\u7684\u9577\u5ea6\u662f\u591a\u5c11?"],"reference":null}
71 | {"index":70,"question_id":71,"category":"writing","turns":["\u4f60\u80fd\u5e6b\u6211\u5beb\u4e00\u5c01\u6b63\u5f0f\u7684\u90f5\u4ef6\u7d66\u4e00\u4f4d\u53ef\u80fd\u7684\u5546\u696d\u5925\u4f34\u63d0\u8b70\u5171\u540c\u4e8b\u696d\u55ce\uff1f"],"reference":null}
72 | {"index":71,"question_id":72,"category":"writing","turns":["\u4f60\u80fd\u5e6b\u6211\u5beb\u4e00\u5c01\u5c0d\u6211\u73fe\u4efb\u96c7\u4e3b\u7684\u8fad\u8077\u4fe1\u55ce\uff1f\u540c\u6642\u4fdd\u6301\u826f\u597d\u7684\u95dc\u4fc2\u4e26\u8868\u9054\u5c0d\u63d0\u4f9b\u7684\u6a5f\u6703\u7684\u611f\u6fc0\u4e4b\u60c5\uff1f"],"reference":null}
73 | {"index":72,"question_id":73,"category":"writing","turns":["\u4f7f\u7528\u9069\u7576\u7684\u683c\u5f0f\u4f86\u7d50\u69cb\u4e00\u5c01\u7d66\u7533\u8acb\u5230\u4e00\u500b\u77e5\u540d\u7684\u8a08\u7b97\u6a5f\u79d1\u5b78\u7814\u7a76\u6240\u5b78\u751f\u7684\u6b63\u5f0f\u63a8\u85a6\u4fe1\u3002"],"reference":null}
74 | {"index":73,"question_id":74,"category":"writing","turns":["\u64b0\u5beb\u4e00\u5c01\u5f15\u4eba\u5165\u52dd\u7684\u7522\u54c1\u767c\u5e03\u516c\u544a\u96fb\u5b50\u90f5\u4ef6\uff0c\u544a\u77e5\u6211\u5011\u7684\u5ba2\u6236\u6211\u5011\u7684\u65b0\u8edf\u9ad4\u89e3\u6c7a\u65b9\u6848\u3002"],"reference":null}
75 | {"index":74,"question_id":75,"category":"writing","turns":["\u8349\u64ec\u4e00\u5c01\u9053\u6b49\u4fe1\u7d66\u4e00\u4f4d\u7d93\u6b77\u8a02\u55ae\u5ef6\u9072\u7684\u5ba2\u6236\uff0c\u4e26\u63d0\u4f9b\u4fdd\u8b49\u8a72\u554f\u984c\u5df2\u7d93\u89e3\u6c7a\u3002"],"reference":null}
76 | {"index":75,"question_id":76,"category":"writing","turns":["\u64b0\u5beb\u4e00\u500b YouTube \u5f71\u7247\u8173\u672c\uff0c\u63a2\u7d22\u7235\u58eb\u6a02\u7684\u6b77\u53f2\u548c\u6587\u5316\u91cd\u8981\u6027\u3002"],"reference":null}
77 | {"index":76,"question_id":77,"category":"writing","turns":["\u64b0\u5beb\u4e00\u7bc7\u95dc\u65bc\u6700\u8fd1\u53bb\u590f\u5a01\u5937\u65c5\u904a\u7684\u5438\u5f15\u4eba\u7684\u65c5\u904a\u90e8\u843d\u683c\u6587\u7ae0\uff0c\u5f37\u8abf\u6587\u5316\u9ad4\u9a57\u548c\u5fc5\u770b\u7684\u666f\u9ede\u3002"],"reference":null}
78 | {"index":77,"question_id":78,"category":"writing","turns":["\u70ba\u4e00\u90e8\u8fd1\u671f\u4e0a\u6620\u7684\u79d1\u5e7b\u96fb\u5f71\u64b0\u5beb\u4e00\u7bc7\u5438\u5f15\u4eba\u7684\u5f71\u8a55\uff0c\u8a0e\u8ad6\u5176\u5287\u60c5\u3001\u89d2\u8272\u548c\u7279\u6548\u3002"],"reference":null}
79 | {"index":78,"question_id":79,"category":"writing","turns":["\u70ba\u4e00\u96c6\u8a0e\u8ad6\u4e32\u6d41\u5e73\u53f0\u5c0d\u97f3\u6a02\u7522\u696d\u5f71\u97ff\u7684\u64ad\u5ba2\u5287\u672c\u67b6\u69cb\u3002"],"reference":null}
80 | {"index":79,"question_id":80,"category":"writing","turns":["\u64b0\u5beb\u4e00\u7bc7\u4ea4\u97ff\u97f3\u6a02\u6703\u8a55\u8ad6\uff0c\u8a0e\u8ad6\u6a02\u5718\u7684\u8868\u73fe\u4ee5\u53ca\u6574\u9ad4\u89c0\u773e\u7684\u9ad4\u9a57\u3002"],"reference":null}
--------------------------------------------------------------------------------
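The questions above are stored with ASCII-escaped Unicode, which pandas decodes on load. A quick inspection sketch using the same loading call as llm_eval.py:

    import pandas as pd

    questions = pd.read_json("zh_tw_bench/question.jsonl", lines=True)
    print(len(questions), "questions")
    print(questions["category"].value_counts())  # generic, knowledge, roleplay, ...
    print(questions.iloc[0]["turns"][0])  # 我該如何提升我的時間管理技巧？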
/evaluation/zh_tw_bench/reference_answer/gpt-4.jsonl:
--------------------------------------------------------------------------------
1 | {"question_id": 61, "answer_id": "YdL4XwENkLCLXMbH65rjKy", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["Here's a simple C++ program that reads a text file line by line and counts the number of occurrences of a specific word in the file:\n\n```cpp\n#include \n#include \n#include \n#include \n\nint countWordOccurrences(const std::string& line, const std::string& word) {\n std::istringstream iss(line);\n std::string currentWord;\n int count = 0;\n\n while (iss >> currentWord) {\n if (currentWord == word) {\n count++;\n }\n }\n\n return count;\n}\n\nint main() {\n std::string fileName, wordToCount;\n std::cout << \"Enter the file name: \";\n std::cin >> fileName;\n std::cout << \"Enter the word to count: \";\n std::cin >> wordToCount;\n\n std::ifstream inputFile(fileName);\n if (!inputFile) {\n std::cerr << \"Error opening the file: \" << fileName << std::endl;\n return 1;\n }\n\n std::string line;\n int totalCount = 0;\n while (std::getline(inputFile, line)) {\n totalCount += countWordOccurrences(line, wordToCount);\n }\n\n inputFile.close();\n\n std::cout << \"The word \\\"\" << wordToCount << \"\\\" occurs \" << totalCount << \" times in the file.\" << std::endl;\n\n return 0;\n}\n```\n\nTo compile and run the program, save it to a file named `word_counter.cpp` and use the following commands:\n\n```sh\ng++ -o word_counter word_counter.cpp\n./word_counter\n```\n\nThe program will prompt you to enter the file name and the word you want to count. After that, it will display the number of occurrences of the specified word in the file."]}], "tstamp": 1686545085.5954556}
2 | {"question_id": 62, "answer_id": "4nk9zQkeRKvfjxEEqfyXmj", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["Here's a Python function that finds the longest common subsequence of two input strings using dynamic programming:\n\n```python\ndef longest_common_subsequence(s1, s2):\n # Create a matrix to store the lengths of common subsequences\n matrix = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]\n\n # Iterate through the strings and fill the matrix\n for i in range(len(s1) - 1, -1, -1):\n for j in range(len(s2) - 1, -1, -1):\n if s1[i] == s2[j]:\n matrix[i][j] = 1 + matrix[i + 1][j + 1]\n else:\n matrix[i][j] = max(matrix[i + 1][j], matrix[i][j + 1])\n\n # Reconstruct the longest common subsequence from the matrix\n lcs = []\n i, j = 0, 0\n while i < len(s1) and j < len(s2):\n if s1[i] == s2[j]:\n lcs.append(s1[i])\n i += 1\n j += 1\n elif matrix[i + 1][j] > matrix[i][j + 1]:\n i += 1\n else:\n j += 1\n\n return ''.join(lcs)\n\n# Example usage:\ns1 = \"ABCDGH\"\ns2 = \"AEDFHR\"\nprint(longest_common_subsequence(s1, s2)) # Output: \"ADH\"\n```\n\nThis function uses dynamic programming to build a matrix that stores the lengths of common subsequences for each pair of substrings. It then reconstructs the longest common subsequence from the matrix."]}], "tstamp": 1686544775.161658}
3 | {"question_id": 63, "answer_id": "2MkXEfsoSuSC72BYGJuqhX", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["To implement a regular expression in Python to validate an email address, you can use the `re` module. Here's a simple example:\n\n```python\nimport re\n\ndef validate_email(email):\n pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'\n return bool(re.match(pattern, email))\n\n# Test cases\nemails = [\n \"test@example.com\",\n \"test@example.co.uk\",\n \"test+123@example.com\",\n \"test@subdomain.example.com\",\n \"test@sub_domain.example.com\",\n \"test@sub-domain.example.com\",\n \"test@example\",\n \"test@.com\",\n \"test@.example.com\",\n \"test@example..com\",\n \"test@.example..com\",\n \"test@example.com.\",\n \".test@example.com\",\n]\n\nfor email in emails:\n print(f\"{email}: {validate_email(email)}\")\n```\n\nThis regular expression checks for the following conditions:\n\n1. The email address starts with one or more alphanumeric characters, dots, underscores, percent signs, plus signs, or hyphens.\n2. Followed by the '@' symbol.\n3. Followed by one or more alphanumeric characters, dots, or hyphens.\n4. Followed by a dot.\n5. Ends with two or more alphabetic characters.\n\nPlease note that this is a simple example and may not cover all edge cases. You can modify the regular expression to better suit your specific requirements."]}], "tstamp": 1686544778.904383}
4 | {"question_id": 64, "answer_id": "jPeywxk9g4LWyLPurjhsd2", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["Here's a Python program to find the nth Fibonacci number using dynamic programming (also known as memoization):\n\n```python\ndef fibonacci(n, memo={}):\n if n <= 0:\n return 0\n elif n == 1:\n return 1\n elif n not in memo:\n memo[n] = fibonacci(n - 1, memo) + fibonacci(n - 2, memo)\n return memo[n]\n\n# Test the function\nn = int(input(\"Enter the value of n: \"))\nresult = fibonacci(n)\nprint(f\"The {n}th Fibonacci number is: {result}\")\n```\n\nThis program defines a function `fibonacci(n, memo={})` that calculates the nth Fibonacci number using dynamic programming. The `memo` dictionary is used to store the results of previously calculated Fibonacci numbers, which helps to avoid redundant calculations and improve the efficiency of the program."]}], "tstamp": 1686544763.1005273}
5 | {"question_id": 65, "answer_id": "hD8NiB3PY5aoWg95mNjnj3", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["Here's a Python implementation of the binary search algorithm to find a specific element in a sorted array:\n\n```python\ndef binary_search(arr, target):\n low, high = 0, len(arr) - 1\n\n while low <= high:\n mid = (low + high) // 2\n mid_val = arr[mid]\n\n if mid_val == target:\n return mid\n elif mid_val < target:\n low = mid + 1\n else:\n high = mid - 1\n\n return -1\n\n# Example usage:\narr = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]\ntarget = 11\n\nresult = binary_search(arr, target)\nif result != -1:\n print(f\"Element {target} found at index {result}\")\nelse:\n print(f\"Element {target} not found in the array\")\n```\n\nThis binary search function takes a sorted array `arr` and a target value `target` as input. It initializes two pointers, `low` and `high`, to the first and last indices of the array, respectively. The function then iteratively updates the pointers and calculates the middle index `mid` until the target value is found or the search space is exhausted. If the target value is found, the function returns the index of the target value; otherwise, it returns -1."]}], "tstamp": 1686544779.33291}
6 | {"question_id": 66, "answer_id": "c4rAXsTfmioefx6icubuFB", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["To implement a queue using two stacks in Python, you can use the following code:\n\n```python\nclass Queue:\n def __init__(self):\n self.stack1 = []\n self.stack2 = []\n\n def enqueue(self, item):\n self.stack1.append(item)\n\n def dequeue(self):\n if not self.stack2:\n while self.stack1:\n self.stack2.append(self.stack1.pop())\n return self.stack2.pop()\n\n def is_empty(self):\n return not self.stack1 and not self.stack2\n\n def size(self):\n return len(self.stack1) + len(self.stack2)\n\n\n# Example usage:\nqueue = Queue()\nqueue.enqueue(1)\nqueue.enqueue(2)\nqueue.enqueue(3)\n\nprint(queue.dequeue()) # Output: 1\nprint(queue.dequeue()) # Output: 2\n\nqueue.enqueue(4)\nqueue.enqueue(5)\n\nprint(queue.dequeue()) # Output: 3\nprint(queue.dequeue()) # Output: 4\nprint(queue.dequeue()) # Output: 5\n```\n\nIn this implementation, we use two stacks (`stack1` and `stack2`) to simulate the behavior of a queue. When enqueuing an item, we simply push it onto `stack1`. When dequeuing an item, we pop from `stack2`. If `stack2` is empty, we first transfer all items from `stack1` to `stack2` (reversing their order) before popping from `stack2`. This ensures that the oldest item in the queue is always at the top of `stack2`."]}], "tstamp": 1686544798.8350058}
7 | {"question_id": 67, "answer_id": "UQ3TFrzVqXtMTWpvhqtGym", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["You can implement a program to find the common elements in two arrays without using any extra data structures by using nested loops. Here's an example in Python:\n\n```python\ndef find_common_elements(arr1, arr2):\n common_elements = []\n\n for i in arr1:\n for j in arr2:\n if i == j:\n common_elements.append(i)\n break\n\n return common_elements\n\n# Example usage:\narr1 = [1, 2, 3, 4, 5]\narr2 = [4, 5, 6, 7, 8]\n\ncommon_elements = find_common_elements(arr1, arr2)\nprint(\"Common elements:\", common_elements)\n```\n\nThis program defines a function `find_common_elements` that takes two arrays as input. It initializes an empty list `common_elements` to store the common elements found. Then, it iterates through each element in the first array using a loop, and for each element, it iterates through the second array using another loop. If an element from the first array is found in the second array, it is added to the `common_elements` list and the inner loop is terminated using the `break` statement.\n\nFinally, the function returns the `common_elements` list. In the example usage, the program finds the common elements between two arrays `arr1` and `arr2`, and prints the result."]}], "tstamp": 1686544805.0822344}
8 | {"question_id": 68, "answer_id": "eBGY6U9KCAsfuedcASSK7Q", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["To find the value of f(2), substitute x = 2 into the function f(x) = 5x^3 - 2x + 3.\n\nf(2) = 5(2)^3 - 2(2) + 3\nf(2) = 5(8) - 4 + 3\nf(2) = 40 - 4 + 3\nf(2) = 36 + 3\nf(2) = 39\n\nSo, the value of f(2) is 39."]}], "tstamp": 1686544893.1474469}
9 | {"question_id": 69, "answer_id": "KsgnWzbpEvoW7RmRwaLgMT", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["To solve for x in the equation 3x + 10 = 5(x - 2), follow these steps:\n\n1. Distribute the 5 on the right side of the equation:\n3x + 10 = 5x - 10\n\n2. Move the x terms to one side by subtracting 3x from both sides:\n10 = 2x - 10\n\n3. Move the constant term to the other side by adding 10 to both sides:\n20 = 2x\n\n4. Divide both sides by 2 to isolate x:\nx = 10\n\nSo, the solution is x = 10."]}], "tstamp": 1686544795.032636}
10 | {"question_id": 70, "answer_id": "BT82nWd8ZtXTzHrvhkg6eb", "model_id": "gpt-4", "choices": [{"index": 0, "turns": ["To find the length of the line segment, we can use the distance formula:\n\nDistance = \u221a((x2 - x1)^2 + (y2 - y1)^2)\n\nHere, (x1, y1) = (2, -2) and (x2, y2) = (10, 4).\n\nPlugging in the values, we get:\n\nDistance = \u221a((10 - 2)^2 + (4 - (-2))^2)\nDistance = \u221a((8)^2 + (6)^2)\nDistance = \u221a(64 + 36)\nDistance = \u221a100\nDistance = 10\n\nSo, the length of the line segment is 10 units."]}], "tstamp": 1686544809.1491597}
11 |
--------------------------------------------------------------------------------
/evaluation/zh_tw_bench/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Source: https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/common.py
3 | """
4 | # Sampling temperature configs for each question category
5 | temperature_config = {
6 | "writing": 0.7,
7 | "roleplay": 0.7,
8 | "extraction": 0.0,
9 | "math": 0.0,
10 | "coding": 0.0,
11 | "reasoning": 0.0,
12 | "stem": 0.1,
13 | "humanities": 0.1,
14 | }
15 |
16 | NEED_REF_CATS = ["math", "reasoning", "coding"]
17 |
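18 | # Usage sketch (not part of the original file): look up the sampling
19 | # temperature for a question's category, falling back to 0.7 for
20 | # categories not listed above (an assumed default):
21 | #   temperature = temperature_config.get(category, 0.7)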
--------------------------------------------------------------------------------
/example_training_config_for_finetuning_twllm.yaml:
--------------------------------------------------------------------------------
1 | base_model: yentinglin/Llama-3-Taiwan-70B-Instruct
2 | trust_remote_code: true
3 | load_in_8bit: false
4 | load_in_4bit: false
5 | strict: false
6 |
7 | hub_model_id: YOUR_USER_NAME/Llama-3-Taiwan-70B-Instruct-MY_FINETUNING
8 | hub_strategy: end
9 | wandb_name: twllm 70b sft
10 |
11 | dataset_processes: 16
12 | datasets:
13 | - path: yentinglin/TaiwanChat
14 | type: sharegpt
15 | conversation: llama3
16 | chat_template: llama3
17 |
18 | dataset_prepared_path: last_run_prepared
19 | val_set_size: 0
20 | output_dir: ./output/tw-llm/70b/sft/
21 |
22 | save_safetensors: true
23 | sequence_len: 8192
24 | sample_packing: true
25 | pad_to_sequence_len: true
26 |
27 | wandb_project: twllm
28 | wandb_watch:
29 | wandb_run_id:
30 | wandb_log_model:
31 |
32 | gradient_accumulation_steps: 1
33 | micro_batch_size: 1
34 | num_epochs: 3
35 | optimizer: adamw_8bit
36 | lr_scheduler: cosine
37 | learning_rate: 5e-5
38 |
39 | train_on_inputs: false
40 | group_by_length: false
41 | bf16: auto
42 | fp16:
43 | tf32: false
44 |
45 | gradient_checkpointing: true
46 | gradient_checkpointing_kwargs:
47 | use_reentrant: false
48 | early_stopping_patience:
49 | resume_from_checkpoint:
50 | logging_steps: 1
51 | xformers_attention:
52 | flash_attention: true
53 |
54 | warmup_steps: 100
55 | evals_per_epoch: 0
56 | eval_table_size:
57 | saves_per_epoch: 1
58 | save_total_limit: 10
59 | save_steps:
60 | debug:
61 | deepspeed: deepspeed_configs/zero3_bf16.json
62 | weight_decay: 0.05
63 | fsdp:
64 | fsdp_config:
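65 |
66 | # Hedged usage note (not part of the original config): this file follows the
67 | # axolotl config schema, so a typical launch would be something like:
68 | #   accelerate launch -m axolotl.cli.train example_training_config_for_finetuning_twllm.yaml
69 | # Verify the exact command against your installed axolotl version before running.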
--------------------------------------------------------------------------------
/images/ollama_run_demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MiuLab/Taiwan-LLM/d7a5b43713f5d818285b886e07f7f0a30a721104/images/ollama_run_demo.png
--------------------------------------------------------------------------------
/images/zhtw_vicuna_bench_chatgptbaseline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MiuLab/Taiwan-LLM/d7a5b43713f5d818285b886e07f7f0a30a721104/images/zhtw_vicuna_bench_chatgptbaseline.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | vllm==0.1.3
2 | langchain==0.0.329
3 | openai==0.27.8
4 | text-generation==0.6.0
5 | pandas
6 | openpyxl
7 | gradio
8 | tqdm
9 | ipdb
10 | black
11 | isort
--------------------------------------------------------------------------------
/start_inference_server.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Define variables from positional arguments, falling back to defaults
4 | model=${1:-"yentinglin/Llama-3-Taiwan-70B-Instruct"}
5 | port=${2:-8000}
6 | gpus=${3:-'"device=0,1"'}  # nested quotes are intentional: docker --gpus expects a value like "device=0,1"
7 | hf_token=${HF_TOKEN:-""}
8 |
9 | # Launch the model inference server via Docker
10 | sudo docker run --gpus $gpus \
11 | -v ~/.cache/huggingface:/root/.cache/huggingface \
12 | --env "HUGGING_FACE_HUB_TOKEN=$hf_token" \
13 | -p $port:8000 \
14 | --ipc=host \
15 | vllm/vllm-openai:latest \
16 | --model $model \
17 | -tp 2  # tensor-parallel size: shard the model across the two GPUs above
18 |
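19 | # Usage sketch (hypothetical values, not part of the original script):
20 | #   HF_TOKEN=hf_xxx ./start_inference_server.sh yentinglin/Llama-3-Taiwan-70B-Instruct 8000 '"device=0,1"'
21 | # The vLLM container then serves an OpenAI-compatible API, e.g.:
22 | #   curl http://localhost:8000/v1/models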
--------------------------------------------------------------------------------