├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── SemEval2024_task8_overview_April.pdf ├── images ├── MBZUAI-logo.png ├── data_statistics.png └── sofia_uni.png ├── subtaskA ├── baseline │ └── transformer_baseline.py ├── format_checker │ └── format_checker.py └── scorer │ └── scorer.py ├── subtaskB ├── baseline │ └── transformer_baseline.py ├── format_checker │ └── format_checker.py └── scorer │ └── scorer.py └── subtaskC ├── baseline ├── requirements.txt ├── run.sh └── transformer_baseline.py ├── format_checker └── format_checker.py └── scorer └── scorer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.jsonl filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | wandb/ 3 | runs/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SemEval-2024 Task 8: Multigenerator, Multidomain, and Multilingual Black-Box Machine-Generated Text Detection 2 | 3 | [![Code License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/subtask_A_and_B/LICENSE) 4 | 5 |

6 | ![MBZUAI](images/MBZUAI-logo.png) ![Sofia University](images/sofia_uni.png)
7 |

8 | 9 | 10 | [News](#news) | [Competition](#competition) | [Subtasks](#subtasks) | [Data Source](#data_source) | [Data Format](#data_format) | [Evaluation Metrics](#scorer_and_official_evaluation_metrics) | [Baselines](#baselines) | [FAQ](#faq) | [Organizers](#organizers) | [Contacts](#contacts)
11 |
12 | Large language models (LLMs) are becoming mainstream and easily accessible, ushering in an explosion of machine-generated content across various channels, such as news, social media, question-answering forums, and educational and even academic contexts. Recent LLMs, such as ChatGPT and GPT-4, generate remarkably fluent responses to a wide variety of user queries. The articulate nature of such generated texts makes LLMs attractive for replacing human labor in many scenarios. However, this has also raised concerns about their potential misuse, such as spreading misinformation and causing disruptions in the education system. Since humans perform only slightly better than chance when classifying machine-generated vs. human-written text, there is a need for automatic systems that identify machine-generated text, with the goal of mitigating its potential misuse.
13 |
14 | We offer three subtasks over two paradigms of text generation: (1) **full text**, where the text is entirely written by a human or entirely generated by a machine; and (2) **mixed text**, where a machine-generated text is refined by a human, or a human-written text is paraphrased by a machine.
15 |
16 | ## NEWS
17 |
18 | ### 22 April 2024
19 | Check out the SemEval shared task overview [paper](https://github.com/mbzuai-nlp/SemEval2024-task8/blob/main/SemEval2024_task8_overview_April.pdf). To appear in the NAACL SemEval-2024 proceedings soon!
20 |
21 | ### 3 Feb 2024
22 | The results of the test phase are published!
23 |
24 | Test results: https://docs.google.com/spreadsheets/d/1BWSb-vcEZHqKmycOHdrEvOiORpN93SqC5KiYILbKxk4/edit?usp=sharing
25 |
26 | Test gold labels: https://drive.google.com/drive/folders/13aFJK4UyY3Gxg_2ceEAWfJvzopB1vkPc?usp=sharing
27 |
28 | ### 13 Jan 2024
29 | Dear participants, we apologize: something went wrong with our CodaBench platform during **10-13 Jan**. We fixed it today and restarted the competition.
30 | You can submit your solutions, and we will **announce** the final test results and ranking by **the end of the evaluation period** (31 Jan).
31 |
32 | PS: For submissions made during 10-13 Jan, we were unfortunately only able to retain your scores, not the submissions themselves.
33 | To be safe against any mistakes, you can resubmit your results.
34 |
35 |
36 | ### Test Sets are Ready, Go!
37 | The SemEval-2024 Task 8 test sets are now available!
38 | We have prepared machine-generated and human-written texts in English, Arabic, German, and Italian.
39 |
40 | Access our test sets via this [Google Drive link](https://drive.google.com/drive/folders/10DKtClzkwIIAatzHBWXZXuQNID-DNGSG?usp=sharing).
41 |
42 | Submit your solution by **31 January 2024** using the CodaBench platform!
43 |
44 | ## Competition
45 |
46 | Our competition is hosted on the CodaBench platform: [https://www.codabench.org/competitions/1752](https://www.codabench.org/competitions/1752).
47 |
48 | ## Subtasks
49 |
50 | - **Subtask A. Binary Human-Written vs. Machine-Generated Text Classification:** Given a full text, determine whether it is human-written or machine-generated. Subtask A has two tracks: monolingual (only English sources) and multilingual.
51 |
52 | - **Subtask B.
Multi-Way Machine-Generated Text Classification:** Given a full text, determine who generated it: a human, or one of several specific language models.
53 |
54 | - **Subtask C. Human-Machine Mixed Text Detection:** Given a mixed text, whose first part is human-written and whose second part is machine-generated, determine the boundary where the change occurs.
55 |
56 | ## Data Restriction
57 | Note that additional training data is **NOT allowed** for any participant.
58 |
59 | ## Data Source
60 | The data for the task is an extension of the M4 dataset. The current statistics of the dataset are shown below.
61 |
62 |

63 | ![Data statistics](images/data_statistics.png)
64 |

65 | 66 | ## Citation 67 | The M4 dataset is described in an [EACL'2024 paper -- Best Resource Paper Award](https://aclanthology.org/2024.eacl-long.83/): 68 | ```bibtex 69 | @inproceedings{wang-etal-2024-m4, 70 | title = "M4: Multi-generator, Multi-domain, and Multi-lingual Black-Box Machine-Generated Text Detection", 71 | author = "Wang, Yuxia and 72 | Mansurov, Jonibek and 73 | Ivanov, Petar and 74 | Su, Jinyan and 75 | Shelmanov, Artem and 76 | Tsvigun, Akim and 77 | Whitehouse, Chenxi and 78 | Mohammed Afzal, Osama and 79 | Mahmoud, Tarek and 80 | Sasaki, Toru and 81 | Arnold, Thomas and 82 | Aji, Alham and 83 | Habash, Nizar and 84 | Gurevych, Iryna and 85 | Nakov, Preslav", 86 | editor = "Graham, Yvette and 87 | Purver, Matthew", 88 | booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", 89 | month = mar, 90 | year = "2024", 91 | address = "St. Julian{'}s, Malta", 92 | publisher = "Association for Computational Linguistics", 93 | url = "https://aclanthology.org/2024.eacl-long.83", 94 | pages = "1369--1407", 95 | abstract = "Large language models (LLMs) have demonstrated remarkable capability to generate fluent responses to a wide variety of user queries. However, this has also raised concerns about the potential misuse of such texts in journalism, education, and academia. In this study, we strive to create automated systems that can detect machine-generated texts and pinpoint potential misuse. We first introduce a large-scale benchmark M4, which is a multi-generator, multi-domain, and multi-lingual corpus for machine-generated text detection. Through an extensive empirical study of this dataset, we show that it is challenging for detectors to generalize well on instances from unseen domains or LLMs. In such cases, detectors tend to misclassify machine-generated text as human-written. These results show that the problem is far from solved and that there is a lot of room for improvement. We believe that our dataset will enable future research towards more robust approaches to this pressing societal problem. The dataset is available at https://github.com/mbzuai-nlp/M4", 96 | } 97 | ``` 98 | 99 | The SemEval-2024 Task 8 bibtex is below: 100 | ```bibtex 101 | @inproceedings{semeval2024task8, 102 | author = {Wang, Yuxia and Mansurov, Jonibek and Ivanov, Petar and su, jinyan and Shelmanov, Artem and Tsvigun, Akim and Mohammed Afzal, Osama and Mahmoud, Tarek and Puccetti, Giovanni and Arnold, Thomas and Whitehouse, Chenxi and Aji, Alham Fikri and Habash, Nizar and Gurevych, Iryna and Nakov, Preslav}, 103 | title = {SemEval-2024 Task 8: Multidomain, Multimodel and Multilingual Machine-Generated Text Detection}, 104 | booktitle = {Proceedings of the 18th International Workshop on Semantic Evaluation (SemEval-2024)}, 105 | month = {June}, 106 | year = {2024}, 107 | address = {Mexico City, Mexico}, 108 | publisher = {Association for Computational Linguistics}, 109 | pages = {2041--2063}, 110 | abstract = {We present the results and the main findings of SemEval-2024 Task 8: Multigenerator, Multidomain, and Multilingual Machine-Generated Text Detection. The task featured three subtasks. Subtask A is a binary classification task determining whether a text is written by a human or generated by a machine. This subtask has two tracks: a monolingual track focused solely on English texts and a multilingual track. 
Subtask B is to detect the exact source of a text, discerning whether it is written by a human or generated by a specific LLM. Subtask C aims to identify the changing point within a text, at which the authorship transitions from human to machine. The task attracted a large number of participants: subtask A monolingual (126), subtask A multilingual (59), subtask B (70), and subtask C (30). In this paper, we present the task, analyze the results, and discuss the system submissions and the methods they used. For all subtasks, the best systems used LLMs.},
111 |     url = {https://aclanthology.org/2024.semeval2024-1.275}
112 | }
113 | ```
114 |
115 | ## Data Format
116 | ### Data Download Instructions
117 |
118 | To download the dataset for this project, follow these steps:
119 |
120 | 1. Install the `gdown` package using pip:
121 |
122 | ```
123 | pip install gdown
124 | ```
125 |
126 | 2. Use `gdown` to download the dataset folders by providing the respective file ID for each subtask:
127 |
128 | | Task          | Google Drive Folder Link | File ID |
129 | |---------------|--------------------------|---------|
130 | | Whole dataset | [Google Drive Folder](https://drive.google.com/drive/folders/14DulzxuH5TDhXtviRVXsH5e2JTY2POLi) | 14DulzxuH5TDhXtviRVXsH5e2JTY2POLi |
131 | | Subtask A     | [Google Drive Folder](https://drive.google.com/drive/folders/1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc) | 1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc |
132 | | Subtask B     | [Google Drive Folder](https://drive.google.com/drive/folders/11YeloR2eTXcTzdwI04Z-M2QVvIeQAU6-) | 11YeloR2eTXcTzdwI04Z-M2QVvIeQAU6- |
133 | | Subtask C     | [Google Drive Folder](https://drive.google.com/drive/folders/16bRUuoeb_LxnCkcKM-ed6X6K5t_1C6mL) | 16bRUuoeb_LxnCkcKM-ed6X6K5t_1C6mL |
134 |
135 | ```
136 | gdown --folder https://drive.google.com/drive/folders/<file_id>
137 | ```
138 | Make sure to replace `<file_id>` with the respective file ID from the table above when running the `gdown` command for the desired dataset.
139 |
140 | 3. After downloading, place the files in their respective subtask folders.
141 |
142 |
143 | The datasets are JSONL files.
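For a quick sanity check after downloading, a file can be loaded with pandas (a minimal sketch; the baseline scripts in this repository read the data the same way, and the path below is the Subtask A monolingual training split listed in the next section):

```python
import pandas as pd

# Each line of a subtask file is a single JSON object (JSONL).
train_df = pd.read_json("subtaskA/data/subtaskA_train_monolingual.jsonl", lines=True)

# Subtask A objects carry: id, label (0 = human, 1 = machine), text, model, source.
print(train_df.columns.tolist())
print(train_df["label"].value_counts())
```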
144 | The data is located in the following folders:
145 | * **Subtask A:**
146 |   * Monolingual track:
147 |     * subtaskA/data/subtaskA_train_monolingual.jsonl
148 |     * subtaskA/data/subtaskA_dev_monolingual.jsonl
149 |   * Multilingual track:
150 |     * subtaskA/data/subtaskA_train_multilingual.jsonl
151 |     * subtaskA/data/subtaskA_dev_multilingual.jsonl
152 | * **Subtask B:**
153 |   * subtaskB/data/subtaskB_train.jsonl
154 |   * subtaskB/data/subtaskB_dev.jsonl
155 | * **Subtask C:**
156 |   * subtaskC/data/subtaskC_train.jsonl
157 |   * subtaskC/data/subtaskC_dev.jsonl
158 |
159 |
160 | ### Statistics
161 | | Subtask                  |  #Train |  #Dev |
162 | |:-------------------------|--------:|------:|
163 | | Subtask A (monolingual)  | 119,757 | 5,000 |
164 | | Subtask A (multilingual) | 172,417 | 4,000 |
165 | | Subtask B                |  71,027 | 3,000 |
166 | | Subtask C                |   3,649 |   505 |
167 |
168 |
169 | ### Input Data Format
170 |
171 | #### Subtask A:
172 | An object in the JSONL file has the following format:
173 | ```
174 | {
175 |   id -> identifier of the example,
176 |   label -> label (human text: 0, machine text: 1),
177 |   text -> text generated by a machine or written by a human,
178 |   model -> model that generated the data,
179 |   source -> source (Wikipedia, Wikihow, Peerread, Reddit, Arxiv) for English, or the language (Arabic, Russian, Chinese, Indonesian, Urdu, Bulgarian, German)
180 | }
181 | ```
182 |
183 | #### Subtask B:
184 | An object in the JSONL file has the following format:
185 | ```
186 | {
187 |   id -> identifier of the example,
188 |   label -> label (human: 0, chatGPT: 1, cohere: 2, davinci: 3, bloomz: 4, dolly: 5),
189 |   text -> text generated by a machine or written by a human,
190 |   model -> name of the model that generated the data,
191 |   source -> source (Wikipedia, Wikihow, Peerread, Reddit, Arxiv), English only
192 | }
193 | ```
194 |
195 |
196 | #### Subtask C:
197 | An object in the JSONL file has the following format:
198 | ```
199 | {
200 |   id -> identifier of the example,
201 |   label -> label (index of the word, with the text split by whitespace, where the change happens),
202 |   text -> text whose first part is written by a human and whose second part is generated by a machine
203 | }
204 | ```
205 |
206 | ### Prediction File Format and Format Checkers
207 |
208 | A prediction file must be a single JSONL file covering all texts. The entry for each text must include the fields "id" and "label".
209 |
210 | The format checkers verify that your prediction file complies with the expected format. They are located in the ```format_checker``` module in each subtask directory.
211 |
212 | #### Subtask A:
213 | ```python
214 | python3 subtaskA/format_checker/format_checker.py --pred_files_path=<path_to_your_results_files>
215 | ```
216 |
217 | #### Subtask B:
218 | ```python
219 | python3 subtaskB/format_checker/format_checker.py --pred_files_path=<path_to_your_results_files>
220 | ```
221 |
222 | #### Subtask C:
223 | To launch it, please run the following command:
224 | ```python
225 | python3 subtaskC/format_checker/format_checker.py --pred_files_path=<path_to_your_results_files>
226 | ```
227 |
228 | Note that the format checkers cannot verify whether the prediction file you submit contains predictions for all test instances, because they do not have access to the test file.
229 |
230 | ## Scorer and Official Evaluation Metrics
231 |
232 | The scorers for the subtasks are located in the ```scorer``` modules in each subtask directory.
233 | The scorer reports the official evaluation metric and other metrics for a given prediction file.
234 |
235 | ### Subtask A:
236 | The **official evaluation metric** for Subtask A is **accuracy**. However, the scorer also reports macro-F1 and micro-F1.
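For intuition, the reported numbers can be reproduced with a few lines of scikit-learn; this is a minimal sketch of what the scorer computes (the actual scorer, shown in subtaskA/scorer/scorer.py below, additionally validates the file format first; the two file names here are placeholders):

```python
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Align predictions with gold labels on the shared "id" field.
pred = pd.read_json("predictions.jsonl", lines=True)[["id", "label"]]
gold = pd.read_json("gold.jsonl", lines=True)[["id", "label"]]
merged = pred.merge(gold, on="id", suffixes=("_pred", "_gold"))

accuracy = accuracy_score(merged["label_gold"], merged["label_pred"])  # official metric
macro_f1 = f1_score(merged["label_gold"], merged["label_pred"], average="macro")
micro_f1 = f1_score(merged["label_gold"], merged["label_pred"], average="micro")
print(f"accuracy={accuracy:.5f}  macro-F1={macro_f1:.5f}  micro-F1={micro_f1:.5f}")
```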
237 |
238 | The scorer is run by the following command:
239 | ```python
240 | python3 subtaskA/scorer/scorer.py --gold_file_path=<path_to_gold_labels> --pred_file_path=<path_to_your_results_file>
241 | ```
242 |
243 | ### Subtask B:
244 | The **official evaluation metric** for Subtask B is **accuracy**. However, the scorer also reports macro-F1 and micro-F1.
245 |
246 | The scorer is run by the following command:
247 | ```python
248 | python3 subtaskB/scorer/scorer.py --gold_file_path=<path_to_gold_labels> --pred_file_path=<path_to_your_results_file>
249 | ```
250 |
251 | ### Subtask C:
252 | The **official evaluation metric** for Subtask C is the **Mean Absolute Error (MAE)**. It measures the absolute distance between the predicted word index and the actual word index where the switch from human to machine occurs.
253 | To launch it, please run the following command:
254 | ```python
255 | python3 subtaskC/scorer/scorer.py --gold_file_path=<path_to_gold_labels> --pred_file_path=<path_to_your_results_file>
256 | ```
257 |
258 | ## Baselines
259 |
260 | ### Task A
261 |
262 | Running the Transformer baseline:
263 | ```
264 | python3 subtaskA/baseline/transformer_baseline.py --train_file_path <path_to_train_file> --test_file_path <path_to_test_file> --prediction_file_path <path_to_predictions_file> --subtask A --model <model_name>
265 | ```
266 |
267 | The average result for the monolingual setup across three runs for RoBERTa is 0.74.
268 |
269 | The average result for the multilingual setup across three runs for XLM-R is 0.72.
270 |
271 | ### Task B
272 |
273 | Running the Transformer baseline:
274 | ```
275 | python3 subtaskB/baseline/transformer_baseline.py --train_file_path <path_to_train_file> --test_file_path <path_to_test_file> --prediction_file_path <path_to_predictions_file> --subtask B --model <model_name>
276 | ```
277 | The average result across three runs for RoBERTa is 0.75.
278 |
279 | ### Task C
280 |
281 | Running the Transformer baseline:
282 | ```
283 | bash subtaskC/baseline/run.sh
284 | ```
285 | The average MAE score across three runs for Longformer is 3.53 ± 0.212.
286 |
287 | To modify the hyperparameters, please edit the corresponding Python command within the run.sh file.
288 |
289 | ## FAQ
290 | #### Q: How many times can we submit? Which submission will be used for the final ranking?
291 | **A:** We do not limit the number of submissions. The **final (last) submission** will be used for the final ranking.
292 |
293 | #### Q: For subtask C, how did we define the gold boundary?
294 | **A:** Simply speaking, given a text of the form human_text_segment + machine_generated_text, the boundary label = len(human_text_segment.split(" ")).
295 | **Note that we use split(" ") with an explicit whitespace argument, rather than split().**
296 |
297 | #### Q: Where should we register for this shared task?
298 | **A:** In our competition on CodaBench: [https://www.codabench.org/competitions/1752](https://www.codabench.org/competitions/1752).
299 |
300 | #### Q: Should we do all subtasks or just one of them?
301 | **A:** You can choose whichever subtasks interest you. Doing only the English (monolingual) track, or only the multilingual track, is also welcome.
302 |
303 | #### Q: Are all of the deadlines aligned with the dates posted here? https://semeval.github.io/SemEval2024/
304 | **A:** Yes, so far all deadlines are aligned with https://semeval.github.io/SemEval2024/; we will make an announcement if there are any changes.
305 |
306 |
307 | #### Q: Could you please tell me what the differences are between our task’s dataset and the M4 dataset? Are they absolutely the same?
308 |
309 | **A:** There are three major differences from the M4 dataset: 1) the task formulation is different; 2) we upsampled the human text for data balance; and 3) new and surprising domains, generators, and languages will appear in the test sets (the real test sets will not include information about generators, domains, and languages).
310 |
311 | #### Q: We noticed significant disproportionality between the training and development sets. For example, in Subtask A for machine-generated texts, the training set does not contain BLOOMz outputs, while the development set contains only them. Could you please clarify the reason for such an intriguing split?
312 |
313 | **A:** We split the data this way because it is more aligned with real application scenarios, where many domains and generators are unseen during training. Besides, such a development set serves as a hint to participants that entirely new domains, generators, and languages will be included in the real test sets (the real test sets will not include information about generators, domains, and languages).
314 |
315 | #### Q: Is it allowed to use additional data?
316 |
317 | **A:** No, it is not allowed to use extra data.
318 |
319 | ## Organizers
320 |
321 | - Yuxia Wang, Mohamed bin Zayed University of Artificial Intelligence
322 | - Alham Fikri Aji, Mohamed bin Zayed University of Artificial Intelligence
323 | - Artem Shelmanov, Mohamed bin Zayed University of Artificial Intelligence
324 | - Akim Tsvigun, Semrush
325 | - Giovanni Puccetti, Institute of Information Science and Technology, A. Faedo (ISTI CNR)
326 | - Chenxi Whitehouse, Mohamed bin Zayed University of Artificial Intelligence
327 | - Petar Ivanov, Sofia University
328 | - Jonibek Mansurov, Mohamed bin Zayed University of Artificial Intelligence
329 | - Jinyan Su, Mohamed bin Zayed University of Artificial Intelligence
330 | - Tarek Mahmoud, Mohamed bin Zayed University of Artificial Intelligence
331 | - Osama Mohammed Afzal, Mohamed bin Zayed University of Artificial Intelligence
332 | - Thomas Arnold, Technical University Darmstadt
333 | - Iryna Gurevych, Mohamed bin Zayed University of Artificial Intelligence
334 | - Nizar Habash, Mohamed bin Zayed University of Artificial Intelligence
335 | - Preslav Nakov, Mohamed bin Zayed University of Artificial Intelligence
336 |
337 | ## Contacts
338 |
339 | Google group: [https://groups.google.com/g/semeval2024-task8/](https://groups.google.com/g/semeval2024-task8/)
340 | Email: semeval2024-task8@googlegroups.com
341 |
-------------------------------------------------------------------------------- /SemEval2024_task8_overview_April.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/SemEval2024_task8_overview_April.pdf -------------------------------------------------------------------------------- /images/MBZUAI-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/images/MBZUAI-logo.png -------------------------------------------------------------------------------- /images/data_statistics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/images/data_statistics.png
-------------------------------------------------------------------------------- /images/sofia_uni.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/images/sofia_uni.png -------------------------------------------------------------------------------- /subtaskA/baseline/transformer_baseline.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | import pandas as pd 3 | import evaluate 4 | import numpy as np 5 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed 6 | import os 7 | from sklearn.model_selection import train_test_split 8 | from scipy.special import softmax 9 | import argparse 10 | import logging 11 | 12 | def preprocess_function(examples, **fn_kwargs): 13 | return fn_kwargs['tokenizer'](examples["text"], truncation=True) 14 | 15 | 16 | def get_data(train_path, test_path, random_seed): 17 | """ 18 | function to read dataframe with columns 19 | """ 20 | 21 | train_df = pd.read_json(train_path, lines=True) 22 | test_df = pd.read_json(test_path, lines=True) 23 | 24 | train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed) 25 | 26 | return train_df, val_df, test_df 27 | 28 | def compute_metrics(eval_pred): 29 | 30 | f1_metric = evaluate.load("f1") 31 | 32 | predictions, labels = eval_pred 33 | predictions = np.argmax(predictions, axis=1) 34 | 35 | results = {} 36 | results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro")) 37 | 38 | return results 39 | 40 | 41 | def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model): 42 | 43 | # pandas dataframe to huggingface Dataset 44 | train_dataset = Dataset.from_pandas(train_df) 45 | valid_dataset = Dataset.from_pandas(valid_df) 46 | 47 | # get tokenizer and model from huggingface 48 | tokenizer = AutoTokenizer.from_pretrained(model) # put your model here 49 | model = AutoModelForSequenceClassification.from_pretrained( 50 | model, num_labels=len(label2id), id2label=id2label, label2id=label2id # put your model here 51 | ) 52 | 53 | # tokenize data for train/valid 54 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 55 | tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 56 | 57 | 58 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 59 | 60 | 61 | # create Trainer 62 | training_args = TrainingArguments( 63 | output_dir=checkpoints_path, 64 | learning_rate=2e-5, 65 | per_device_train_batch_size=16, 66 | per_device_eval_batch_size=16, 67 | num_train_epochs=3, 68 | weight_decay=0.01, 69 | evaluation_strategy="epoch", 70 | save_strategy="epoch", 71 | load_best_model_at_end=True, 72 | ) 73 | 74 | trainer = Trainer( 75 | model=model, 76 | args=training_args, 77 | train_dataset=tokenized_train_dataset, 78 | eval_dataset=tokenized_valid_dataset, 79 | tokenizer=tokenizer, 80 | data_collator=data_collator, 81 | compute_metrics=compute_metrics, 82 | ) 83 | 84 | trainer.train() 85 | 86 | # save best model 87 | best_model_path = checkpoints_path+'/best/' 88 | 89 | if not os.path.exists(best_model_path): 90 | os.makedirs(best_model_path) 91 | 92 | 93 | trainer.save_model(best_model_path) 94 | 95 | 96 | def 
test(test_df, model_path, id2label, label2id): 97 | 98 | # load tokenizer from saved model 99 | tokenizer = AutoTokenizer.from_pretrained(model_path) 100 | 101 | # load best model 102 | model = AutoModelForSequenceClassification.from_pretrained( 103 | model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id 104 | ) 105 | 106 | test_dataset = Dataset.from_pandas(test_df) 107 | 108 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 109 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 110 | 111 | # create Trainer 112 | trainer = Trainer( 113 | model=model, 114 | tokenizer=tokenizer, 115 | data_collator=data_collator, 116 | compute_metrics=compute_metrics, 117 | ) 118 | # get logits from predictions and evaluate results using classification report 119 | predictions = trainer.predict(tokenized_test_dataset) 120 | prob_pred = softmax(predictions.predictions, axis=-1) 121 | preds = np.argmax(predictions.predictions, axis=-1) 122 | metric = evaluate.load("bstrai/classification_report") 123 | results = metric.compute(predictions=preds, references=predictions.label_ids) 124 | 125 | # return dictionary of classification report 126 | return results, preds 127 | 128 | 129 | if __name__ == '__main__': 130 | 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--train_file_path", "-tr", required=True, help="Path to the train file.", type=str) 133 | parser.add_argument("--test_file_path", "-t", required=True, help="Path to the test file.", type=str) 134 | parser.add_argument("--subtask", "-sb", required=True, help="Subtask (A or B).", type=str, choices=['A', 'B']) 135 | parser.add_argument("--model", "-m", required=True, help="Transformer to train and test", type=str) 136 | parser.add_argument("--prediction_file_path", "-p", required=True, help="Path where to save the prediction file.", type=str) 137 | 138 | args = parser.parse_args() 139 | 140 | random_seed = 0 141 | train_path = args.train_file_path # For example 'subtaskA_train_multilingual.jsonl' 142 | test_path = args.test_file_path # For example 'subtaskA_test_multilingual.jsonl' 143 | model = args.model # For example 'xlm-roberta-base' 144 | subtask = args.subtask # For example 'A' 145 | prediction_path = args.prediction_file_path # For example subtaskB_predictions.jsonl 146 | 147 | if not os.path.exists(train_path): 148 | logging.error("File doesnt exists: {}".format(train_path)) 149 | raise ValueError("File doesnt exists: {}".format(train_path)) 150 | 151 | if not os.path.exists(test_path): 152 | logging.error("File doesnt exists: {}".format(train_path)) 153 | raise ValueError("File doesnt exists: {}".format(train_path)) 154 | 155 | 156 | if subtask == 'A': 157 | id2label = {0: "human", 1: "machine"} 158 | label2id = {"human": 0, "machine": 1} 159 | elif subtask == 'B': 160 | id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'} 161 | label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5} 162 | else: 163 | logging.error("Wrong subtask: {}. It should be A or B".format(train_path)) 164 | raise ValueError("Wrong subtask: {}. 
It should be A or B".format(train_path)) 165 | 166 | set_seed(random_seed) 167 | 168 | #get data for train/dev/test sets 169 | train_df, valid_df, test_df = get_data(train_path, test_path, random_seed) 170 | 171 | # train detector model 172 | fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model) 173 | 174 | # test detector model 175 | results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id) 176 | 177 | logging.info(results) 178 | predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions}) 179 | predictions_df.to_json(prediction_path, lines=True, orient='records') 180 | -------------------------------------------------------------------------------- /subtaskA/format_checker/format_checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import json 5 | import pandas as pd 6 | """ 7 | This script checks whether the results format for subtask A and subtask B is correct. 8 | It also provides some warnings about possible errors. 9 | 10 | The submission of the result file should be in jsonl format. 11 | It should be a lines of objects: 12 | { 13 | id -> identifier of the test sample, 14 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 15 | } 16 | 17 | """ 18 | 19 | logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) 20 | COLUMNS = ['id', 'label'] 21 | 22 | 23 | def check_format(file_path): 24 | if not os.path.exists(file_path): 25 | logging.error("File doesnt exists: {}".format(file_path)) 26 | return False 27 | 28 | try: 29 | submission = pd.read_json(file_path, lines=True)[['id', 'label']] 30 | except: 31 | logging.error("File is not a valid json file: {}".format(file_path)) 32 | return False 33 | 34 | for column in COLUMNS: 35 | if submission[column].isna().any(): 36 | logging.error("NA value in file {} in column {}".format(file_path, column)) 37 | return False 38 | 39 | if not submission['label'].isin(range(0, 2)).all(): 40 | logging.error("Unknown Label in file {}".format(file_path)) 41 | logging.error("Unique Labels in the file are {}".format(submission['label'].unique())) 42 | return False 43 | 44 | return True 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--pred_files_path", "-p", nargs='+', required=True, 51 | help="Path to the files you want to check.", type=str) 52 | 53 | args = parser.parse_args() 54 | logging.info("Subtask A and B. Checking files: {}".format(args.pred_files_path)) 55 | 56 | for pred_file_path in args.pred_files_path: 57 | check_result = check_format(pred_file_path) 58 | result = 'Format is correct' if check_result else 'Something wrong in file format' 59 | logging.info("Subtask A and B. Checking file: {}. Result: {}".format(args.pred_files_path, result)) 60 | -------------------------------------------------------------------------------- /subtaskA/scorer/scorer.py: -------------------------------------------------------------------------------- 1 | import logging.handlers 2 | import argparse 3 | from sklearn.metrics import f1_score, accuracy_score 4 | import pandas as pd 5 | import sys 6 | sys.path.append('.') 7 | from subtaskA.format_checker.format_checker import check_format 8 | 9 | """ 10 | Scoring of SEMEVAL-Task-8--subtask-A-and-B with the metrics f1-macro, f1-micro and accuracy. 
11 | """ 12 | 13 | def evaluate(pred_fpath, gold_fpath): 14 | """ 15 | Evaluates the predicted classes w.r.t. a gold file. 16 | Metrics are: f1-macro, f1-micro and accuracy 17 | 18 | :param pred_fpath: a json file with predictions, 19 | :param gold_fpath: the original annotated gold file. 20 | 21 | The submission of the result file should be in jsonl format. 22 | It should be a lines of objects: 23 | { 24 | id -> identifier of the test sample, 25 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 26 | } 27 | """ 28 | 29 | pred_labels = pd.read_json(pred_fpath, lines=True)[['id', 'label']] 30 | gold_labels = pd.read_json(gold_fpath, lines=True)[['id', 'label']] 31 | 32 | merged_df = pred_labels.merge(gold_labels, on='id', suffixes=('_pred', '_gold')) 33 | 34 | macro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="macro", zero_division=0) 35 | micro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="micro", zero_division=0) 36 | accuracy = accuracy_score(merged_df['label_gold'], merged_df['label_pred']) 37 | 38 | return macro_f1, micro_f1, accuracy 39 | 40 | 41 | def validate_files(pred_files): 42 | if not check_format(pred_files): 43 | logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files)) 44 | return False 45 | return True 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( "--gold_file_path", '-g', type=str, required=True, help="Paths to the file with gold annotations.") 51 | parser.add_argument("--pred_file_path", '-p', type=str, required=True, help="Path to the file with predictions") 52 | args = parser.parse_args() 53 | 54 | pred_file_path = args.pred_file_path 55 | gold_file_path = args.gold_file_path 56 | 57 | if validate_files(pred_file_path): 58 | logging.info('Prediction file format is correct') 59 | macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path) 60 | logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy)) 61 | 62 | 63 | -------------------------------------------------------------------------------- /subtaskB/baseline/transformer_baseline.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | import pandas as pd 3 | import evaluate 4 | import numpy as np 5 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed 6 | import os 7 | from sklearn.model_selection import train_test_split 8 | from scipy.special import softmax 9 | import argparse 10 | import logging 11 | 12 | def preprocess_function(examples, **fn_kwargs): 13 | return fn_kwargs['tokenizer'](examples["text"], truncation=True) 14 | 15 | 16 | def get_data(train_path, test_path, random_seed): 17 | """ 18 | function to read dataframe with columns 19 | """ 20 | 21 | train_df = pd.read_json(train_path, lines=True) 22 | test_df = pd.read_json(test_path, lines=True) 23 | 24 | train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed) 25 | 26 | return train_df, val_df, test_df 27 | 28 | def compute_metrics(eval_pred): 29 | 30 | f1_metric = evaluate.load("f1") 31 | 32 | predictions, labels = eval_pred 33 | predictions = np.argmax(predictions, axis=1) 34 | 35 | results = {} 36 | results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro")) 37 | 38 | return results 39 | 40 | 41 | 
def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model): 42 | 43 | # pandas dataframe to huggingface Dataset 44 | train_dataset = Dataset.from_pandas(train_df) 45 | valid_dataset = Dataset.from_pandas(valid_df) 46 | 47 | # get tokenizer and model from huggingface 48 | tokenizer = AutoTokenizer.from_pretrained(model) # put your model here 49 | model = AutoModelForSequenceClassification.from_pretrained( 50 | model, num_labels=len(label2id), id2label=id2label, label2id=label2id # put your model here 51 | ) 52 | 53 | # tokenize data for train/valid 54 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 55 | tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 56 | 57 | 58 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 59 | 60 | 61 | # create Trainer 62 | training_args = TrainingArguments( 63 | output_dir=checkpoints_path, 64 | learning_rate=2e-5, 65 | per_device_train_batch_size=16, 66 | per_device_eval_batch_size=16, 67 | num_train_epochs=3, 68 | weight_decay=0.01, 69 | evaluation_strategy="epoch", 70 | save_strategy="epoch", 71 | load_best_model_at_end=True, 72 | ) 73 | 74 | trainer = Trainer( 75 | model=model, 76 | args=training_args, 77 | train_dataset=tokenized_train_dataset, 78 | eval_dataset=tokenized_valid_dataset, 79 | tokenizer=tokenizer, 80 | data_collator=data_collator, 81 | compute_metrics=compute_metrics, 82 | ) 83 | 84 | trainer.train() 85 | 86 | # save best model 87 | best_model_path = checkpoints_path+'/best/' 88 | 89 | if not os.path.exists(best_model_path): 90 | os.makedirs(best_model_path) 91 | 92 | 93 | trainer.save_model(best_model_path) 94 | 95 | 96 | def test(test_df, model_path, id2label, label2id): 97 | 98 | # load tokenizer from saved model 99 | tokenizer = AutoTokenizer.from_pretrained(model_path) 100 | 101 | # load best model 102 | model = AutoModelForSequenceClassification.from_pretrained( 103 | model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id 104 | ) 105 | 106 | test_dataset = Dataset.from_pandas(test_df) 107 | 108 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 109 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 110 | 111 | # create Trainer 112 | trainer = Trainer( 113 | model=model, 114 | tokenizer=tokenizer, 115 | data_collator=data_collator, 116 | compute_metrics=compute_metrics, 117 | ) 118 | # get logits from predictions and evaluate results using classification report 119 | predictions = trainer.predict(tokenized_test_dataset) 120 | prob_pred = softmax(predictions.predictions, axis=-1) 121 | preds = np.argmax(predictions.predictions, axis=-1) 122 | metric = evaluate.load("bstrai/classification_report") 123 | results = metric.compute(predictions=preds, references=predictions.label_ids) 124 | 125 | # return dictionary of classification report 126 | return results, preds 127 | 128 | 129 | if __name__ == '__main__': 130 | 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--train_file_path", "-tr", required=True, help="Path to the train file.", type=str) 133 | parser.add_argument("--test_file_path", "-t", required=True, help="Path to the test file.", type=str) 134 | parser.add_argument("--subtask", "-sb", required=True, help="Subtask (A or B).", type=str, choices=['A', 'B']) 135 | parser.add_argument("--model", "-m", required=True, help="Transformer to train and test", 
type=str) 136 | parser.add_argument("--prediction_file_path", "-p", required=True, help="Path where to save the prediction file.", type=str) 137 | 138 | args = parser.parse_args() 139 | 140 | random_seed = 0 141 | train_path = args.train_file_path # For example 'subtaskA_train_multilingual.jsonl' 142 | test_path = args.test_file_path # For example 'subtaskA_test_multilingual.jsonl' 143 | model = args.model # For example 'xlm-roberta-base' 144 | subtask = args.subtask # For example 'A' 145 | prediction_path = args.prediction_file_path # For example subtaskB_predictions.jsonl 146 | 147 | if not os.path.exists(train_path): 148 | logging.error("File doesnt exists: {}".format(train_path)) 149 | raise ValueError("File doesnt exists: {}".format(train_path)) 150 | 151 | if not os.path.exists(test_path): 152 | logging.error("File doesnt exists: {}".format(train_path)) 153 | raise ValueError("File doesnt exists: {}".format(train_path)) 154 | 155 | 156 | if subtask == 'A': 157 | id2label = {0: "human", 1: "machine"} 158 | label2id = {"human": 0, "machine": 1} 159 | elif subtask == 'B': 160 | id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'} 161 | label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5} 162 | else: 163 | logging.error("Wrong subtask: {}. It should be A or B".format(train_path)) 164 | raise ValueError("Wrong subtask: {}. It should be A or B".format(train_path)) 165 | 166 | set_seed(random_seed) 167 | 168 | #get data for train/dev/test sets 169 | train_df, valid_df, test_df = get_data(train_path, test_path, random_seed) 170 | 171 | # train detector model 172 | fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model) 173 | 174 | # test detector model 175 | results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id) 176 | 177 | logging.info(results) 178 | predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions}) 179 | predictions_df.to_json(prediction_path, lines=True, orient='records') 180 | -------------------------------------------------------------------------------- /subtaskB/format_checker/format_checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import json 5 | import pandas as pd 6 | """ 7 | This script checks whether the results format for subtask A and subtask B is correct. 8 | It also provides some warnings about possible errors. 9 | 10 | The submission of the result file should be in jsonl format. 
11 | It should be a lines of objects: 12 | { 13 | id -> identifier of the test sample, 14 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 15 | } 16 | 17 | """ 18 | 19 | logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) 20 | COLUMNS = ['id', 'label'] 21 | 22 | 23 | def check_format(file_path): 24 | if not os.path.exists(file_path): 25 | logging.error("File doesnt exists: {}".format(file_path)) 26 | return False 27 | 28 | try: 29 | submission = pd.read_json(file_path, lines=True)[['id', 'label']] 30 | except: 31 | logging.error("File is not a valid json file: {}".format(file_path)) 32 | return False 33 | 34 | for column in COLUMNS: 35 | if submission[column].isna().any(): 36 | logging.error("NA value in file {} in column {}".format(file_path, column)) 37 | return False 38 | 39 | if not submission['label'].isin(range(0, 6)).all(): 40 | logging.error("Unknown Label in file {}".format(file_path)) 41 | logging.error("Unique Labels in the file are {}".format(submission['label'].unique())) 42 | return False 43 | 44 | return True 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--pred_files_path", "-p", nargs='+', required=True, 51 | help="Path to the files you want to check.", type=str) 52 | 53 | args = parser.parse_args() 54 | logging.info("Subtask A and B. Checking files: {}".format(args.pred_files_path)) 55 | 56 | for pred_file_path in args.pred_files_path: 57 | check_result = check_format(pred_file_path) 58 | result = 'Format is correct' if check_result else 'Something wrong in file format' 59 | logging.info("Subtask A and B. Checking file: {}. Result: {}".format(args.pred_files_path, result)) -------------------------------------------------------------------------------- /subtaskB/scorer/scorer.py: -------------------------------------------------------------------------------- 1 | import logging.handlers 2 | import argparse 3 | from sklearn.metrics import f1_score, accuracy_score 4 | import pandas as pd 5 | import sys 6 | sys.path.append('.') 7 | from subtaskB.format_checker.format_checker import check_format 8 | 9 | """ 10 | Scoring of SEMEVAL-Task-8--subtask-A-and-B with the metrics f1-macro, f1-micro and accuracy. 11 | """ 12 | 13 | def evaluate(pred_fpath, gold_fpath): 14 | """ 15 | Evaluates the predicted classes w.r.t. a gold file. 16 | Metrics are: f1-macro, f1-micro and accuracy 17 | 18 | :param pred_fpath: a json file with predictions, 19 | :param gold_fpath: the original annotated gold file. 20 | 21 | The submission of the result file should be in jsonl format. 22 | It should be a lines of objects: 23 | { 24 | id -> identifier of the test sample, 25 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 26 | } 27 | """ 28 | 29 | pred_labels = pd.read_json(pred_fpath, lines=True)[['id', 'label']] 30 | gold_labels = pd.read_json(gold_fpath, lines=True)[['id', 'label']] 31 | 32 | merged_df = pred_labels.merge(gold_labels, on='id', suffixes=('_pred', '_gold')) 33 | 34 | macro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="macro", zero_division=0) 35 | micro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="micro", zero_division=0) 36 | accuracy = accuracy_score(merged_df['label_gold'], merged_df['label_pred']) 37 | 38 | return macro_f1, micro_f1, accuracy 39 | 40 | 41 | def validate_files(pred_files): 42 | if not check_format(pred_files): 43 | logging.error('Bad format for pred file {}. 
Cannot score.'.format(pred_files)) 44 | return False 45 | return True 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( "--gold_file_path", '-g', type=str, required=True, help="Paths to the file with gold annotations.") 51 | parser.add_argument("--pred_file_path", '-p', type=str, required=True, help="Path to the file with predictions") 52 | args = parser.parse_args() 53 | 54 | pred_file_path = args.pred_file_path 55 | gold_file_path = args.gold_file_path 56 | 57 | if validate_files(pred_file_path): 58 | logging.info('Prediction file format is correct') 59 | macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path) 60 | logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy)) 61 | 62 | 63 | -------------------------------------------------------------------------------- /subtaskC/baseline/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.23.0 2 | certifi==2023.7.22 3 | charset-normalizer==3.2.0 4 | cmake==3.27.5 5 | filelock==3.12.4 6 | fsspec==2023.9.1 7 | huggingface-hub==0.17.2 8 | idna==3.4 9 | Jinja2==3.1.2 10 | joblib==1.3.2 11 | lit==16.0.6 12 | MarkupSafe==2.1.3 13 | mpmath==1.3.0 14 | networkx==3.1 15 | numpy==1.26.0 16 | nvidia-cublas-cu11==11.10.3.66 17 | nvidia-cuda-cupti-cu11==11.7.101 18 | nvidia-cuda-nvrtc-cu11==11.7.99 19 | nvidia-cuda-runtime-cu11==11.7.99 20 | nvidia-cudnn-cu11==8.5.0.96 21 | nvidia-cufft-cu11==10.9.0.58 22 | nvidia-curand-cu11==10.2.10.91 23 | nvidia-cusolver-cu11==11.4.0.1 24 | nvidia-cusparse-cu11==11.7.4.91 25 | nvidia-nccl-cu11==2.14.3 26 | nvidia-nvtx-cu11==11.7.91 27 | packaging==23.1 28 | pandas==2.1.0 29 | psutil==5.9.5 30 | python-dateutil==2.8.2 31 | pytz==2023.3.post1 32 | PyYAML==6.0.1 33 | regex==2023.8.8 34 | requests==2.31.0 35 | safetensors==0.3.3 36 | scikit-learn==1.3.0 37 | scipy==1.11.2 38 | six==1.16.0 39 | sympy==1.12 40 | threadpoolctl==3.2.0 41 | tokenizers==0.13.3 42 | torch==2.0.1 43 | tqdm==4.66.1 44 | transformers==4.33.2 45 | triton==2.0.0 46 | typing_extensions==4.8.0 47 | tzdata==2023.3 48 | urllib3==2.0.4 49 | -------------------------------------------------------------------------------- /subtaskC/baseline/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exp_name="exp_1" 3 | seed_value=42 4 | python transformer_baseline.py \ 5 | --model_path "allenai/longformer-base-4096" \ 6 | --train_file "../data/subtaskC_train.jsonl" \ 7 | --load_best_model_at_end True \ 8 | --dev_file "../data/subtaskC_dev.jsonl" \ 9 | --test_files ../data/subtaskC_dev.jsonl \ 10 | --metric_for_best_model "eval_mean_absolute_diff" \ 11 | --greater_is_better False \ 12 | --do_train True \ 13 | --do_predict True \ 14 | --seed $seed_value \ 15 | --output_dir "./runs/$exp_name" \ 16 | --logging_dir "./runs/$exp_name/logs" \ 17 | --num_train_epochs 10 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 32 \ 20 | --auto_find_batch_size True \ 21 | --logging_steps 10 \ 22 | --load_best_model_at_end True \ 23 | --evaluation_strategy "epoch" \ 24 | --save_strategy "epoch" \ 25 | --save_total_limit 2 26 | -------------------------------------------------------------------------------- /subtaskC/baseline/transformer_baseline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | from transformers import AutoTokenizer, 
def validate_files(pred_files):
    if not check_format(pred_files):
        logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files))
        return False
    return True


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold_file_path", '-g', type=str, required=True, help="Path to the file with gold annotations.")
    parser.add_argument("--pred_file_path", '-p', type=str, required=True, help="Path to the file with predictions.")
    args = parser.parse_args()

    pred_file_path = args.pred_file_path
    gold_file_path = args.gold_file_path

    if validate_files(pred_file_path):
        logging.info('Prediction file format is correct')
        macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path)
        logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy))
--------------------------------------------------------------------------------
/subtaskC/baseline/requirements.txt:
--------------------------------------------------------------------------------
accelerate==0.23.0
certifi==2023.7.22
charset-normalizer==3.2.0
cmake==3.27.5
filelock==3.12.4
fsspec==2023.9.1
huggingface-hub==0.17.2
idna==3.4
Jinja2==3.1.2
joblib==1.3.2
lit==16.0.6
MarkupSafe==2.1.3
mpmath==1.3.0
networkx==3.1
numpy==1.26.0
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
packaging==23.1
pandas==2.1.0
psutil==5.9.5
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
regex==2023.8.8
requests==2.31.0
safetensors==0.3.3
scikit-learn==1.3.0
scipy==1.11.2
six==1.16.0
sympy==1.12
threadpoolctl==3.2.0
tokenizers==0.13.3
torch==2.0.1
tqdm==4.66.1
transformers==4.33.2
triton==2.0.0
typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.4
--------------------------------------------------------------------------------
/subtaskC/baseline/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
exp_name="exp_1"
seed_value=42
python transformer_baseline.py \
    --model_path "allenai/longformer-base-4096" \
    --train_file "../data/subtaskC_train.jsonl" \
    --dev_file "../data/subtaskC_dev.jsonl" \
    --test_files ../data/subtaskC_dev.jsonl \
    --metric_for_best_model "eval_mean_absolute_diff" \
    --greater_is_better False \
    --do_train True \
    --do_predict True \
    --seed $seed_value \
    --output_dir "./runs/$exp_name" \
    --logging_dir "./runs/$exp_name/logs" \
    --num_train_epochs 10 \
    --per_device_train_batch_size 32 \
    --per_device_eval_batch_size 32 \
    --auto_find_batch_size True \
    --logging_steps 10 \
    --load_best_model_at_end True \
    --evaluation_strategy "epoch" \
    --save_strategy "epoch" \
    --save_total_limit 2
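# Inference-only variant (a sketch; assumes a completed training run has
# already populated ./runs/$exp_name with at least one checkpoint). The script
# then loads the best checkpoint recorded in trainer_state.json:
#
# python transformer_baseline.py \
#     --model_path "allenai/longformer-base-4096" \
#     --train_file "../data/subtaskC_train.jsonl" \
#     --dev_file "../data/subtaskC_dev.jsonl" \
#     --test_files ../data/subtaskC_dev.jsonl \
#     --do_train False \
#     --do_predict True \
#     --output_dir "./runs/$exp_name"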
--------------------------------------------------------------------------------
/subtaskC/baseline/transformer_baseline.py:
--------------------------------------------------------------------------------
import torch
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.trainer_callback import TrainerState
import transformers
import pandas as pd
import numpy as np

from dataclasses import dataclass, field
from typing import Any, List, Optional
import logging
import glob
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


@dataclass
class ModelConfig:
    model_path: str = "allenai/longformer-base-4096"


@dataclass
class DatasetConfig:
    train_file: str = field(default=None, metadata={"help": "Path to train jsonl file"})
    dev_file: str = field(default=None, metadata={"help": "Path to dev jsonl file"})
    test_files: List[str] = field(
        default=None, metadata={"help": "Paths to test jsonl files"}
    )


@dataclass
class TrainingArgsConfig(transformers.TrainingArguments):
    seed: int = 42
    output_dir: str = "./runs/exp_3"
    num_train_epochs: int = 10
    per_device_train_batch_size: int = 32
    per_device_eval_batch_size: int = 32
    auto_find_batch_size: bool = True
    logging_dir: str = "./runs/exp_3/logs"
    logging_steps: int = 10
    load_best_model_at_end: bool = True
    evaluation_strategy: str = "epoch"
    save_strategy: str = "epoch"
    save_total_limit: int = 2


class Semeval_Data(torch.utils.data.Dataset):
    def __init__(self, data_path, max_length=1024, inference=False, debug=False):
        with open(data_path, "r") as f:
            self.data = [json.loads(line) for line in f]
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        id = self.data[idx]["id"]
        label = None
        labels_available = "label" in self.data[idx]

        if labels_available:
            label = self.data[idx]["label"]

        if self.debug and not self.inference:
            print("Original Human Position: ", label)

        labels = []
        corresponding_word = []
        tokens = []
        input_ids = []
        attention_mask = []

        for jdx, word in enumerate(text.split(" ")):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)

            if labels_available:
                is_machine_text = 1 if jdx >= label else 0
                labels.extend([is_machine_text] * sub_words)

            corresponding_word.extend([jdx] * sub_words)
            tokens.extend(word_encoded)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))
            attention_mask.extend([1] * sub_words)

        # Pad everything to max_length; labels are padded with -100 so that
        # padded positions are ignored by the loss.
        if len(input_ids) < self.max_length - 2:
            input_ids = (
                [0] + input_ids + [2] + [1] * (self.max_length - len(input_ids) - 2)
            )
            if labels_available:
                labels = [-100] + labels + [-100] * (self.max_length - len(labels) - 1)

            attention_mask = (
                [1]
                + attention_mask
                + [1]
                + [0] * (self.max_length - len(attention_mask) - 2)
            )
            corresponding_word = (
                [-100]
                + corresponding_word
                + [-100] * (self.max_length - len(corresponding_word) - 1)
            )
            tokens = (
                ["<s>"]
                + tokens
                + ["</s>"]
                + ["<pad>"] * (self.max_length - len(tokens) - 2)
            )
        else:
            # Truncate to max_length; the CLS and SEP positions get -100 labels
            input_ids = [0] + input_ids[: self.max_length - 2] + [2]

            if labels_available:
                labels = [-100] + labels[: self.max_length - 2] + [-100]

            corresponding_word = (
                [-100] + corresponding_word[: self.max_length - 2] + [-100]
            )
            attention_mask = [1] + attention_mask[: self.max_length - 2] + [1]
            tokens = ["<s>"] + tokens[: self.max_length - 2] + ["</s>"]

        encoded = {}
        if labels_available:
            encoded["labels"] = torch.tensor(labels)

        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)

        if labels_available:
            if encoded["input_ids"].shape != encoded["labels"].shape:
                print("Input IDs Shape: ", encoded["input_ids"].shape)
                print("Labels Shape: ", encoded["labels"].shape)
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            print("Tokenized Human Position: ", labels.index(1))
            print("Original Human Position: ", label)
            print("Full Human Text:", text)
            print("\n")
            print("Human Text Truncated:", text.split(" ")[:label])
            print("\n")
            encoded["partial_human_review"] = " ".join(text.split(" ")[:label])

        if self.inference:
            encoded["text"] = text
            encoded["id"] = id
            encoded["corresponding_word"] = corresponding_word

        return encoded
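# Worked example (a sketch with a made-up sample): for text "a b c d" and
# label 2 (machine-generated text starts at word index 2), the word-level
# flags are [0, 0, 1, 1]. Each flag is repeated once per sub-word token the
# tokenizer produces for that word, and the special <s>/</s> positions as
# well as any padding are labeled -100 so they are ignored by the loss.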
def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.

    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.

    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """
    Get the start position from a sequence of labels or predictions.

    Args:
    - sequence (np.array): A sequence of labels or predictions.
    - mapping (np.array): Mapping from index to word for the sequence.
    - token_level (bool): If True, return positional indices; else, return word mappings.

    Returns:
    - int or str: Start position in the sequence.
    """
    # Locate the position of label '1'

    if mapping is not None:
        mask = mapping != -100
        sequence = sequence[mask]
        mapping = mapping[mask]

    index = np.where(sequence == 1)[0]
    value = index[0] if index.size else (len(sequence) - 1)

    if not token_level:
        value = mapping[value]

    return value
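# Example (a sketch): for sequence [0, 0, 1, 1] with mapping [-100, 0, 1, 1],
# the -100 entry is masked out first, leaving [0, 1, 1]. The first '1' sits at
# token index 1; with token_level=False this is mapped back to word index
# mapping[1] = 1. If no '1' is found at all, the last position is returned.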
def evaluate_machine_start_position(
    labels, predictions, idx2word=None, token_level=False
):
    """
    Evaluate the starting position of machine-generated text in both predicted and actual sequences.

    Args:
    - labels (np.array): Actual labels.
    - predictions (np.array): Predicted labels.
    - idx2word (np.array): Mapping from index to word for each sequence in the batch.
    - token_level (bool): Flag to determine if evaluation is at token level. If True, return positional indices; else, return word mappings.

    Returns:
    - float: Mean absolute difference between the start positions in predictions and actual labels.
    """
    predicted_positions = predictions.argmax(axis=-1)

    actual_starts = []
    predicted_starts = []

    if not token_level and idx2word is None:
        raise ValueError(
            "idx2word must be provided if evaluation is at word level (token_level=False)"
        )

    for idx in range(labels.shape[0]):
        # Remove padding
        mask = labels[idx] != -100
        predict, label, mapping = (
            predicted_positions[idx][mask],
            labels[idx][mask],
            idx2word[idx][mask] if not token_level else None,
        )

        # If token_level is True, just use the index; otherwise, map to word
        predicted_value = get_start_position(predict, mapping, token_level)
        actual_value = get_start_position(label, mapping, token_level)

        predicted_starts.append(predicted_value)
        actual_starts.append(actual_value)

    position_differences = [
        evaluate_position_difference(actual, predict)
        for actual, predict in zip(actual_starts, predicted_starts)
    ]
    mean_position_difference = np.mean(position_differences)

    return mean_position_difference


def compute_metrics(p):
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)

    return {
        "mean_absolute_diff": mean_absolute_diff,
    }


if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelConfig, DatasetConfig, TrainingArgsConfig)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("Model Arguments: ", model_args)
    print("Data Arguments: ", data_args)
    print("Training Arguments: ", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    model_path = model_args.model_path
    if (
        training_args.do_eval or training_args.do_predict
    ) and not training_args.do_train:
        output_dir = training_args.output_dir
        if not os.path.exists(output_dir):
            raise ValueError(
                f"Output directory ({output_dir}) does not exist. Please train the model first."
            )

        # Find the best model checkpoint
        ckpt_paths = sorted(
            glob.glob(os.path.join(output_dir, "checkpoint-*")),
            key=lambda x: int(x.split("-")[-1]),
        )

        if not ckpt_paths:
            raise ValueError(
                f"Output directory ({output_dir}) does not contain any checkpoint. Please train the model first."
            )

        state = TrainerState.load_from_json(
            os.path.join(ckpt_paths[-1], "trainer_state.json")
        )
        best_model_path = state.best_model_checkpoint or model_args.model_path
        if state.best_model_checkpoint is None:
            logger.info(
                "No best model checkpoint found. Using the default model checkpoint."
            )
        print(f"Best model path: {best_model_path}")
        model_path = best_model_path

    # Load model
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=2, trust_remote_code=True
    )

    train_set = Semeval_Data(data_args.train_file)
    dev_set = Semeval_Data(data_args.dev_file)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=dev_set,
        tokenizer=train_set.tokenizer,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        logger.info("Training...")
        logger.info("*** Train Dataset ***")
        logger.info(f"Number of samples: {len(train_set)}")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        trainer.train()

        logger.info("Training completed!")

    if training_args.do_eval:
        logger.info("Evaluating...")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        metrics = trainer.evaluate()
        logger.info(f"Metrics: {metrics}")
        trainer.save_metrics("eval", metrics)

        logger.info("Evaluation completed!")

    if training_args.do_predict:
        test_sets = []
        for test_file in data_args.test_files:
            test_set = Semeval_Data(test_file, inference=True)
            test_sets.append(test_set)
        logger.info("Predicting...")
        logger.info("*** Test Datasets ***")
        logger.info(f"Number of test datasets: {len(test_sets)}")

        for idx, test_set in enumerate(test_sets):
            logger.info(f"Test Dataset {idx + 1}")
            logger.info(f"Number of samples: {len(test_set)}")

            predictions, _, _ = trainer.predict(test_set)
            logger.info("Predictions completed!")

            df = pd.DataFrame(
                {
                    "id": [i["id"] for i in test_set],
                    "label": [
                        get_start_position(
                            i[0],
                            np.array(i[1]["corresponding_word"]),
                            token_level=False,
                        )
                        for i in list(zip(predictions.argmax(axis=-1), test_set))
                    ],
                }
            )

            file_name = os.path.basename(data_args.test_files[idx])
            file_dirs = os.path.join(training_args.output_dir, "predictions")
            os.makedirs(file_dirs, exist_ok=True)
            file_path = os.path.join(file_dirs, file_name)
            records = df.to_dict("records")
            with open(file_path, "w") as f:
                for record in records:
                    f.write(json.dumps(record) + "\n")
--------------------------------------------------------------------------------
/subtaskC/format_checker/format_checker.py:
--------------------------------------------------------------------------------
import os
import argparse
import logging
import json
import pandas as pd

"""
This script checks whether the results format for subtask C is correct.
It also provides some warnings about possible errors.

The submission of the result file should be in JSONL format.
Each line is an object with the fields:
{
    "id" -> identifier of the test sample,
    "label" -> predicted start position of machine-generated text,
}
"""
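# Illustrative example (made up, not from the official data): a valid
# prediction file would contain one object per line, e.g.:
#   {"id": "abc123", "label": 42}
# where 42 is the predicted word index at which machine-generated text begins.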
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)
COLUMNS = ["id", "label"]


def check_format(file_path):
    if not os.path.exists(file_path):
        logging.error("File doesn't exist: {}".format(file_path))
        return False

    try:
        submission = pd.read_json(file_path, lines=True)[["id", "label"]]
    except Exception as e:
        logging.error("File is not a valid jsonl file: {}".format(file_path))
        logging.error(e)
        return False

    for column in COLUMNS:
        if submission[column].isna().any():
            logging.error("NA value in file {} in column {}".format(file_path, column))
            return False

    if not submission["label"].dtypes == "int64":
        logging.error("Unknown datatype in file {} for column label".format(file_path))
        return False

    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pred_files_path",
        "-p",
        nargs="+",
        required=True,
        help="Paths to the files you want to check.",
        type=str,
    )

    args = parser.parse_args()
    logging.info("Subtask C. Checking files: {}".format(args.pred_files_path))

    for pred_file_path in args.pred_files_path:
        check_result = check_format(pred_file_path)
        result = (
            "Format is correct" if check_result else "Something wrong in file format"
        )
        logging.info(
            "Subtask C. Checking file: {}. Result: {}".format(
                pred_file_path, result
            )
        )
--------------------------------------------------------------------------------
/subtaskC/scorer/scorer.py:
--------------------------------------------------------------------------------
import logging
import argparse
import pandas as pd
import os
import numpy as np

"""
Scoring of SEMEVAL-Task-8--subtask-C with the metric Mean Absolute Error (MAE)
"""
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)
COLUMNS = ["id", "label"]


def check_format(file_path):
    if not os.path.exists(file_path):
        logging.error("File doesn't exist: {}".format(file_path))
        return False

    try:
        submission = pd.read_json(file_path, lines=True)[["id", "label"]]
    except Exception as e:
        logging.error("File is not a valid jsonl file: {}".format(file_path))
        logging.error(e)
        return False

    for column in COLUMNS:
        if submission[column].isna().any():
            logging.error("NA value in file {} in column {}".format(file_path, column))
            return False

    if not submission["label"].dtypes == "int64":
        logging.error("Unknown datatype in file {} for column label".format(file_path))
        return False

    return True


def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.

    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.

    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def evaluate(pred_fpath, gold_fpath):
    """
    Evaluates the predicted start positions w.r.t. a gold file.
    Metric is: Mean Absolute Error (MAE)

    :param pred_fpath: a jsonl file with predictions,
    :param gold_fpath: the original annotated jsonl file.

    The submission of the result file should be in jsonl format.
    It should consist of one JSON object per line:
    {
        id -> identifier of the test sample,
        label -> predicted start position of machine-generated text,
    }
    """

    pred_labels = pd.read_json(pred_fpath, lines=True)[["id", "label"]]
    gold_labels = pd.read_json(gold_fpath, lines=True)[["id", "label"]]

    merged_df = pred_labels.merge(gold_labels, on="id", suffixes=("_pred", "_gold"))

    # Compute the absolute difference between the actual and predicted start positions.
    out = merged_df.apply(
        lambda row: evaluate_position_difference(row["label_gold"], row["label_pred"]),
        axis=1,
    ).values
    logging.info(f"Number of samples: {len(merged_df)}")
    # Compute the mean absolute error (MAE)
    mae = np.mean(out)
    return mae
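# Worked example (a sketch with made-up values): for gold start positions
# [10, 5, 20] and predictions [12, 5, 15], the absolute differences are
# [2, 0, 5], so MAE = (2 + 0 + 5) / 3 = 2.33 (lower is better).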
def validate_files(pred_files):
    if not check_format(pred_files):
        logging.error("Bad format for pred file {}. Cannot score.".format(pred_files))
        return False
    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gold_file_path",
        "-g",
        type=str,
        required=True,
        help="Path to the jsonl file with gold annotations.",
    )
    parser.add_argument(
        "--pred_file_path",
        "-p",
        type=str,
        required=True,
        help="Path to the jsonl file with predictions.",
    )
    args = parser.parse_args()

    pred_file_path = args.pred_file_path
    gold_file_path = args.gold_file_path

    if validate_files(pred_file_path):
        logging.info("Prediction file format is correct")
        mae = evaluate(pred_file_path, gold_file_path)
        logging.info(f"Mean Absolute Error={mae:.5f}")
--------------------------------------------------------------------------------