91 | References
92 |
93 |
94 | ```bibtex
95 | @misc{polyglot-ko,
96 | title = {{Polyglot-Ko: Open-Source Korean Autoregressive Language Model}},
97 | author = {Ko, Hyunwoong and Yang, Kichang and Ryu, Minho and Choi, Taekyoon and Yang, Seungmu and Hyun, Jiwung and Park, Sungho},
98 | url = {https://www.github.com/eleutherai/polyglot},
99 | month = {9},
100 | year = {2022},
101 | }
102 | ```
103 |
104 | ```bibtex
105 | @misc{alpaca,
106 | author = {Rohan Taori and Ishaan Gulrajani and Tianyi Zhang and Yann Dubois and Xuechen Li and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto},
107 | title = {Stanford Alpaca: An Instruction-following LLaMA model},
108 | year = {2023},
109 | publisher = {GitHub},
110 | journal = {GitHub repository},
111 | howpublished = {\url{https://github.com/tatsu-lab/stanford_alpaca}},
112 | }
113 | ```
114 |
115 | ```bibtex
116 | @misc{kullm,
117 | author = {NLP \& AI Lab and Human-Inspired AI Research},
118 | title = {KULLM: Korea University Large Language Model Project},
119 | year = {2023},
120 | publisher = {GitHub},
121 | journal = {GitHub repository},
122 | howpublished = {\url{https://github.com/nlpai-lab/kullm}},
123 | }
124 | ```
125 |
126 | ```bibtex
127 | @article{lin2021few,
128 | title={Few-shot learning with multilingual language models},
129 | author={Lin, Xi Victoria and Mihaylov, Todor and Artetxe, Mikel and Wang, Tianlu and Chen, Shuohui and Simig, Daniel and Ott, Myle and Goyal, Naman and Bhosale, Shruti and Du, Jingfei and others},
130 | journal={arXiv preprint arXiv:2112.10668},
131 | year={2021}
132 | }
133 | ```
134 |
135 | ```bibtex
136 | @inproceedings{kim-etal-2021-changes,
137 | title = "What Changes Can Large-scale Language Models Bring? Intensive Study on {H}yper{CLOVA}: Billions-scale {K}orean Generative Pretrained Transformers",
138 | author = "Kim, Boseop and
139 | Kim, HyoungSeok and
140 | Lee, Sang-Woo and
141 | Lee, Gichang and
142 | Kwak, Donghyun and
143 | Jeon, Dong Hyeon and
144 | Park, Sunghyun and
145 | Kim, Sungju and
146 | Kim, Seonhoon and
147 | Seo, Dongpil and
148 | Lee, Heungsub and
149 | Jeong, Minyoung and
150 | Lee, Sungjae and
151 | Kim, Minsub and
152 | Ko, Suk Hyun and
153 | Kim, Seokhun and
154 | Park, Taeyong and
155 | Kim, Jinuk and
156 | Kang, Soyoung and
157 | Ryu, Na-Hyeon and
158 | Yoo, Kang Min and
159 | Chang, Minsuk and
160 | Suh, Soobin and
161 | In, Sookyo and
162 | Park, Jinseong and
163 | Kim, Kyungduk and
164 | Kim, Hiun and
165 | Jeong, Jisu and
166 | Yeo, Yong Goo and
167 | Ham, Donghoon and
168 | Park, Dongju and
169 | Lee, Min Young and
170 | Kang, Jaewook and
171 | Kang, Inho and
172 | Ha, Jung-Woo and
173 | Park, Woomyoung and
174 | Sung, Nako",
175 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
176 | month = nov,
177 | year = "2021",
178 | address = "Online and Punta Cana, Dominican Republic",
179 | publisher = "Association for Computational Linguistics",
180 | url = "https://aclanthology.org/2021.emnlp-main.274",
181 | doi = "10.18653/v1/2021.emnlp-main.274",
182 | pages = "3405--3424",
183 | }
184 | ```
184 |
185 | ```bibtex
186 | @inproceedings{xue-etal-2021-mt5,
187 | title = "m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer",
188 | author = "Xue, Linting and
189 | Constant, Noah and
190 | Roberts, Adam and
191 | Kale, Mihir and
192 | Al-Rfou, Rami and
193 | Siddhant, Aditya and
194 | Barua, Aditya and
195 | Raffel, Colin",
196 | booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
197 | month = jun,
198 | year = "2021",
199 | address = "Online",
200 | publisher = "Association for Computational Linguistics",
201 | url = "https://aclanthology.org/2021.naacl-main.41",
202 | doi = "10.18653/v1/2021.naacl-main.41",
203 | pages = "483--498",
204 | }
205 | ```
206 |
207 | ```bibtex
208 | @article{ouyang2022training,
209 | title={Training language models to follow instructions with human feedback},
210 | author={Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
211 | journal={Advances in Neural Information Processing Systems},
212 | volume={35},
213 | pages={27730--27744},
214 | year={2022}
215 | }
216 | ```
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HAE-RAE
2 | Repository for the HAE-RAE project, an effort to improve the reasoning and instruction-following abilities of Polyglot-Ko. This repository hosts the datasets, blog posts, and code produced along the way.
3 |
--------------------------------------------------------------------------------
/blog/CSAT-QA.md:
--------------------------------------------------------------------------------
1 | # CSAT-QA: How Far Can LLMs Reach in Korean Language Understanding?
2 |
3 | ### Introduction
4 |
5 | In this blog post, we release CSAT-QA, a multiple-choice question answering dataset for the Korean language. The dataset contains questions collected from the College Scholastic Ability Test (CSAT), known in South Korea as the 대학수학능력시험, a standardized test required for university admissions. In total, we have gathered and made available 936 question-and-answer pairs from CSAT exams administered between 2007 and 2022, and these resources are now open source for public use.
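
To make it easy to experiment with the data, the sketch below shows one way the released questions might be loaded with the Hugging Face `datasets` library. The repository id, split, and column names here are assumptions made for illustration and may not match the actual release, so please consult the dataset card before use.

```python
# Minimal sketch of loading CSAT-QA with the Hugging Face `datasets` library.
# NOTE: the repository id, split name, and assumed columns below are
# illustrative guesses; check the official dataset card for the real ones.
from datasets import load_dataset

csat_qa = load_dataset("HAERAE-HUB/csatqa", split="test")  # hypothetical repo id

# Inspect one row; each example is assumed to hold the question text,
# the multiple-choice options, and the index of the gold answer.
example = csat_qa[0]
print(example)
```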
6 |
7 | ### Dataset Collection
8 |
9 | The CSAT-QA dataset encompasses four distinct curricula: the 7th National Curriculum, the 2007 Revised Curriculum, the 2009 Revised Curriculum, and the 2015 Revised Curriculum. We applied the following preprocessing steps to the collected questions:
10 |
11 | First, because publicly available Korean OCR systems proved unreliable, we manually transcribed the CSAT questions to ensure the quality of our dataset.
12 |
13 | Second, we excluded questions on "Middle Korean," a historical form of the language, because most language models cannot encode its vocabulary.
14 |
15 | Third, we manually converted all tables and graphs into LaTeX and translated images into descriptive alternative text, so that complex questions are fully represented in a language-model-friendly format.
16 |
17 | Finally, we introduced four unique token pairs: \