├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── SemEval2024_task8_overview_April.pdf ├── images ├── MBZUAI-logo.png ├── data_statistics.png └── sofia_uni.png ├── subtaskA ├── baseline │ └── transformer_baseline.py ├── format_checker │ └── format_checker.py └── scorer │ └── scorer.py ├── subtaskB ├── baseline │ └── transformer_baseline.py ├── format_checker │ └── format_checker.py └── scorer │ └── scorer.py └── subtaskC ├── baseline ├── requirements.txt ├── run.sh └── transformer_baseline.py ├── format_checker └── format_checker.py └── scorer └── scorer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.jsonl filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | wandb/ 3 | runs/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SemEval-2024 Task 8: Multigenerator, Multidomain, and Multilingual Black-Box Machine-Generated Text Detection 2 | 3 | [![Code License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/subtask_A_and_B/LICENSE) 4 | 5 |

6 | ![MBZUAI](images/MBZUAI-logo.png) ![Sofia University](images/sofia_uni.png)
7 |

8 | 9 | 10 | [News](#news) | [Competition](#competition) | [Subtasks](#subtasks) | [Data Source](#data_source) | [Data Format](#data_format) | [Evaluation Metrics](#scorer_and_official_evaluation_metrics) | [Baselines](#baselines) | [FAQ](#faq) | [Organizers](#organizers) | [Contacts](#contacts)
11 |
12 | Large language models (LLMs) are becoming mainstream and easily accessible, ushering in an explosion of machine-generated content across various channels, such as news, social media, question-answering forums, and educational and even academic contexts. Recent LLMs, such as ChatGPT and GPT-4, generate remarkably fluent responses to a wide variety of user queries. The articulate nature of such generated texts makes LLMs attractive for replacing human labor in many scenarios. However, this has also raised concerns about their potential misuse, such as spreading misinformation and causing disruptions in the education system. Since humans perform only slightly better than chance when classifying machine-generated vs. human-written text, there is a need for automatic systems that identify machine-generated text, with the goal of mitigating its potential misuse.
13 |
14 | We offer three subtasks over two paradigms of text generation: (1) **full text**, where the text is entirely written by a human or entirely generated by a machine; and (2) **mixed text**, where a machine-generated text is refined by a human, or a human-written text is paraphrased by a machine.
15 |
16 | ## NEWS
17 |
18 | ### 22 April 2024
19 | Check out the SemEval shared task overview [paper](https://github.com/mbzuai-nlp/SemEval2024-task8/blob/main/SemEval2024_task8_overview_April.pdf). To appear in the NAACL SemEval-2024 proceedings soon!
20 |
21 | ### 3 Feb 2024
22 | The results of the test phase are published!
23 |
24 | Test results: https://docs.google.com/spreadsheets/d/1BWSb-vcEZHqKmycOHdrEvOiORpN93SqC5KiYILbKxk4/edit?usp=sharing
25 |
26 | Test gold labels: https://drive.google.com/drive/folders/13aFJK4UyY3Gxg_2ceEAWfJvzopB1vkPc?usp=sharing
27 |
28 | ### 13 Jan 2024
29 | Dear participants, we apologize: something went wrong with our CodaBench platform during **10-13 Jan**. We fixed it today and restarted the competition.
30 | You can submit your solutions, and we will **announce** the final test results and ranking by **the end of the evaluation period** (31 Jan).
31 |
32 | PS: For submissions made during 10-13 Jan, we were unfortunately only able to retain your scores, not the submissions themselves.
33 | To be safe against any mistakes, you can resubmit your results.
34 |
35 |
36 | ### Test Sets are Ready, Go!
37 | The SemEval-2024 Task 8 test sets are now available!
38 | We have prepared machine-generated and human-written texts in English, Arabic, German, and Italian.
39 |
40 | Access our test sets via this [Google Drive link](https://drive.google.com/drive/folders/10DKtClzkwIIAatzHBWXZXuQNID-DNGSG?usp=sharing).
41 |
42 | Submit your solution by **31 January 2024** using the CodaBench platform!
43 |
44 | ## Competition
45 |
46 | Our competition is hosted on the CodaBench platform: [https://www.codabench.org/competitions/1752](https://www.codabench.org/competitions/1752).
47 |
48 | ## Subtasks
49 |
50 | - **Subtask A. Binary Human-Written vs. Machine-Generated Text Classification:** Given a full text, determine whether it is human-written or machine-generated. Subtask A has two tracks: monolingual (only English sources) and multilingual.
51 |
52 | - **Subtask B.
Multi-Way Machine-Generated Text Classification:** Given a full text, determine who generated it: a human, or one of several specific language models.
53 |
54 | - **Subtask C. Human-Machine Mixed Text Detection:** Given a mixed text, whose first part is human-written and whose second part is machine-generated, determine the boundary where the change occurs.
55 |
56 | ## Data Restriction
57 | Note that additional training data is **NOT allowed** for any participant.
58 |
59 | ## Data Source
60 | The data for the task is an extension of the M4 dataset. The current statistics of the dataset are shown below.
61 |
62 |

63 | ![Data statistics](images/data_statistics.png)
64 |

65 | 66 | ## Citation 67 | The M4 dataset is described in an [EACL'2024 paper -- Best Resource Paper Award](https://aclanthology.org/2024.eacl-long.83/): 68 | ```bibtex 69 | @inproceedings{wang-etal-2024-m4, 70 | title = "M4: Multi-generator, Multi-domain, and Multi-lingual Black-Box Machine-Generated Text Detection", 71 | author = "Wang, Yuxia and 72 | Mansurov, Jonibek and 73 | Ivanov, Petar and 74 | Su, Jinyan and 75 | Shelmanov, Artem and 76 | Tsvigun, Akim and 77 | Whitehouse, Chenxi and 78 | Mohammed Afzal, Osama and 79 | Mahmoud, Tarek and 80 | Sasaki, Toru and 81 | Arnold, Thomas and 82 | Aji, Alham and 83 | Habash, Nizar and 84 | Gurevych, Iryna and 85 | Nakov, Preslav", 86 | editor = "Graham, Yvette and 87 | Purver, Matthew", 88 | booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", 89 | month = mar, 90 | year = "2024", 91 | address = "St. Julian{'}s, Malta", 92 | publisher = "Association for Computational Linguistics", 93 | url = "https://aclanthology.org/2024.eacl-long.83", 94 | pages = "1369--1407", 95 | abstract = "Large language models (LLMs) have demonstrated remarkable capability to generate fluent responses to a wide variety of user queries. However, this has also raised concerns about the potential misuse of such texts in journalism, education, and academia. In this study, we strive to create automated systems that can detect machine-generated texts and pinpoint potential misuse. We first introduce a large-scale benchmark M4, which is a multi-generator, multi-domain, and multi-lingual corpus for machine-generated text detection. Through an extensive empirical study of this dataset, we show that it is challenging for detectors to generalize well on instances from unseen domains or LLMs. In such cases, detectors tend to misclassify machine-generated text as human-written. These results show that the problem is far from solved and that there is a lot of room for improvement. We believe that our dataset will enable future research towards more robust approaches to this pressing societal problem. The dataset is available at https://github.com/mbzuai-nlp/M4", 96 | } 97 | ``` 98 | 99 | The SemEval-2024 Task 8 bibtex is below: 100 | ```bibtex 101 | @inproceedings{semeval2024task8, 102 | author = {Wang, Yuxia and Mansurov, Jonibek and Ivanov, Petar and su, jinyan and Shelmanov, Artem and Tsvigun, Akim and Mohammed Afzal, Osama and Mahmoud, Tarek and Puccetti, Giovanni and Arnold, Thomas and Whitehouse, Chenxi and Aji, Alham Fikri and Habash, Nizar and Gurevych, Iryna and Nakov, Preslav}, 103 | title = {SemEval-2024 Task 8: Multidomain, Multimodel and Multilingual Machine-Generated Text Detection}, 104 | booktitle = {Proceedings of the 18th International Workshop on Semantic Evaluation (SemEval-2024)}, 105 | month = {June}, 106 | year = {2024}, 107 | address = {Mexico City, Mexico}, 108 | publisher = {Association for Computational Linguistics}, 109 | pages = {2041--2063}, 110 | abstract = {We present the results and the main findings of SemEval-2024 Task 8: Multigenerator, Multidomain, and Multilingual Machine-Generated Text Detection. The task featured three subtasks. Subtask A is a binary classification task determining whether a text is written by a human or generated by a machine. This subtask has two tracks: a monolingual track focused solely on English texts and a multilingual track. 
Subtask B is to detect the exact source of a text, discerning whether it is written by a human or generated by a specific LLM. Subtask C aims to identify the changing point within a text, at which the authorship transitions from human to machine. The task attracted a large number of participants: subtask A monolingual (126), subtask A multilingual (59), subtask B (70), and subtask C (30). In this paper, we present the task, analyze the results, and discuss the system submissions and the methods they used. For all subtasks, the best systems used LLMs.},
111 |     url = {https://aclanthology.org/2024.semeval2024-1.275}
112 | }
113 | ```
114 |
115 | ## Data Format
116 | ### Data Download Instructions
117 |
118 | To download the dataset for this project, follow these steps:
119 |
120 | 1. Install the `gdown` package using pip:
121 |
122 | ```
123 | pip install gdown
124 | ```
125 |
126 | 2. Use `gdown` to download the dataset folders by providing the respective file ID for each subtask:
127 |
128 | | Task          | Google Drive Folder Link | File ID |
129 | |---------------|--------------------------|---------|
130 | | Whole dataset | [Google Drive Folder](https://drive.google.com/drive/folders/14DulzxuH5TDhXtviRVXsH5e2JTY2POLi) | 14DulzxuH5TDhXtviRVXsH5e2JTY2POLi |
131 | | Subtask A     | [Google Drive Folder](https://drive.google.com/drive/folders/1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc) | 1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc |
132 | | Subtask B     | [Google Drive Folder](https://drive.google.com/drive/folders/11YeloR2eTXcTzdwI04Z-M2QVvIeQAU6-) | 11YeloR2eTXcTzdwI04Z-M2QVvIeQAU6- |
133 | | Subtask C     | [Google Drive Folder](https://drive.google.com/drive/folders/16bRUuoeb_LxnCkcKM-ed6X6K5t_1C6mL) | 16bRUuoeb_LxnCkcKM-ed6X6K5t_1C6mL |
134 |
135 | ```
136 | gdown --folder https://drive.google.com/drive/folders/<file_id>
137 | ```
138 | Make sure to replace `<file_id>` with the respective file ID from the table above when running the `gdown` command for the desired dataset.
139 |
140 | 3. After downloading, place the files in their respective subtask folders.
141 |
142 |
143 | The datasets are JSONL files.
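For a quick sanity check after downloading, a file can be loaded with pandas (a minimal sketch; the baseline scripts in this repository read the data the same way, and the path below is the Subtask A monolingual training split listed in the next section):

```python
import pandas as pd

# Each line of a subtask file is a single JSON object (JSONL).
train_df = pd.read_json("subtaskA/data/subtaskA_train_monolingual.jsonl", lines=True)

# Subtask A objects carry: id, label (0 = human, 1 = machine), text, model, source.
print(train_df.columns.tolist())
print(train_df["label"].value_counts())
```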
144 | The data is located in the following folders:
145 | * **Subtask A:**
146 |   * Monolingual track:
147 |     * subtaskA/data/subtaskA_train_monolingual.jsonl
148 |     * subtaskA/data/subtaskA_dev_monolingual.jsonl
149 |   * Multilingual track:
150 |     * subtaskA/data/subtaskA_train_multilingual.jsonl
151 |     * subtaskA/data/subtaskA_dev_multilingual.jsonl
152 | * **Subtask B:**
153 |   * subtaskB/data/subtaskB_train.jsonl
154 |   * subtaskB/data/subtaskB_dev.jsonl
155 | * **Subtask C:**
156 |   * subtaskC/data/subtaskC_train.jsonl
157 |   * subtaskC/data/subtaskC_dev.jsonl
158 |
159 |
160 | ### Statistics
161 | | Subtask                  |  #Train |  #Dev |
162 | |:-------------------------|--------:|------:|
163 | | Subtask A (monolingual)  | 119,757 | 5,000 |
164 | | Subtask A (multilingual) | 172,417 | 4,000 |
165 | | Subtask B                |  71,027 | 3,000 |
166 | | Subtask C                |   3,649 |   505 |
167 |
168 |
169 | ### Input Data Format
170 |
171 | #### Subtask A:
172 | An object in the JSONL file has the following format:
173 | ```
174 | {
175 |   id -> identifier of the example,
176 |   label -> label (human text: 0, machine text: 1),
177 |   text -> text generated by a machine or written by a human,
178 |   model -> model that generated the data,
179 |   source -> source (Wikipedia, Wikihow, Peerread, Reddit, Arxiv) for English, or the language (Arabic, Russian, Chinese, Indonesian, Urdu, Bulgarian, German)
180 | }
181 | ```
182 |
183 | #### Subtask B:
184 | An object in the JSONL file has the following format:
185 | ```
186 | {
187 |   id -> identifier of the example,
188 |   label -> label (human: 0, chatGPT: 1, cohere: 2, davinci: 3, bloomz: 4, dolly: 5),
189 |   text -> text generated by a machine or written by a human,
190 |   model -> name of the model that generated the data,
191 |   source -> source (Wikipedia, Wikihow, Peerread, Reddit, Arxiv), English only
192 | }
193 | ```
194 |
195 |
196 | #### Subtask C:
197 | An object in the JSONL file has the following format:
198 | ```
199 | {
200 |   id -> identifier of the example,
201 |   label -> label (index of the word, with the text split by whitespace, where the change happens),
202 |   text -> text whose first part is written by a human and whose second part is generated by a machine
203 | }
204 | ```
205 |
206 | ### Prediction File Format and Format Checkers
207 |
208 | A prediction file must be a single JSONL file covering all texts. The entry for each text must include the fields "id" and "label".
209 |
210 | The format checkers verify that your prediction file complies with the expected format. They are located in the ```format_checker``` module in each subtask directory.
211 |
212 | #### Subtask A:
213 | ```python
214 | python3 subtaskA/format_checker/format_checker.py --pred_files_path=<path_to_your_results_files>
215 | ```
216 |
217 | #### Subtask B:
218 | ```python
219 | python3 subtaskB/format_checker/format_checker.py --pred_files_path=<path_to_your_results_files>
220 | ```
221 |
222 | #### Subtask C:
223 | To launch it, please run the following command:
224 | ```python
225 | python3 subtaskC/format_checker/format_checker.py --pred_files_path=<path_to_your_results_files>
226 | ```
227 |
228 | Note that the format checkers cannot verify whether the prediction file you submit contains predictions for all test instances, because they do not have access to the test file.
229 |
230 | ## Scorer and Official Evaluation Metrics
231 |
232 | The scorers for the subtasks are located in the ```scorer``` modules in each subtask directory.
233 | The scorer reports the official evaluation metric and other metrics for a given prediction file.
234 |
235 | ### Subtask A:
236 | The **official evaluation metric** for Subtask A is **accuracy**. However, the scorer also reports macro-F1 and micro-F1.
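For intuition, the reported numbers can be reproduced with a few lines of scikit-learn; this is a minimal sketch of what the scorer computes (the actual scorer, shown in subtaskA/scorer/scorer.py below, additionally validates the file format first; the two file names here are placeholders):

```python
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Align predictions with gold labels on the shared "id" field.
pred = pd.read_json("predictions.jsonl", lines=True)[["id", "label"]]
gold = pd.read_json("gold.jsonl", lines=True)[["id", "label"]]
merged = pred.merge(gold, on="id", suffixes=("_pred", "_gold"))

accuracy = accuracy_score(merged["label_gold"], merged["label_pred"])  # official metric
macro_f1 = f1_score(merged["label_gold"], merged["label_pred"], average="macro")
micro_f1 = f1_score(merged["label_gold"], merged["label_pred"], average="micro")
print(f"accuracy={accuracy:.5f}  macro-F1={macro_f1:.5f}  micro-F1={micro_f1:.5f}")
```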
237 |
238 | The scorer is run by the following command:
239 | ```python
240 | python3 subtaskA/scorer/scorer.py --gold_file_path=<path_to_gold_labels> --pred_file_path=<path_to_your_results_file>
241 | ```
242 |
243 | ### Subtask B:
244 | The **official evaluation metric** for Subtask B is **accuracy**. However, the scorer also reports macro-F1 and micro-F1.
245 |
246 | The scorer is run by the following command:
247 | ```python
248 | python3 subtaskB/scorer/scorer.py --gold_file_path=<path_to_gold_labels> --pred_file_path=<path_to_your_results_file>
249 | ```
250 |
251 | ### Subtask C:
252 | The **official evaluation metric** for Subtask C is the **Mean Absolute Error (MAE)**. It measures the absolute distance between the predicted word index and the actual word index where the switch from human to machine occurs.
253 | To launch it, please run the following command:
254 | ```python
255 | python3 subtaskC/scorer/scorer.py --gold_file_path=<path_to_gold_labels> --pred_file_path=<path_to_your_results_file>
256 | ```
257 |
258 | ## Baselines
259 |
260 | ### Task A
261 |
262 | Running the Transformer baseline:
263 | ```
264 | python3 subtaskA/baseline/transformer_baseline.py --train_file_path <path_to_train_file> --test_file_path <path_to_test_file> --prediction_file_path <path_to_predictions_file> --subtask A --model <model_name>
265 | ```
266 |
267 | The average result for the monolingual setup across three runs for RoBERTa is 0.74.
268 |
269 | The average result for the multilingual setup across three runs for XLM-R is 0.72.
270 |
271 | ### Task B
272 |
273 | Running the Transformer baseline:
274 | ```
275 | python3 subtaskB/baseline/transformer_baseline.py --train_file_path <path_to_train_file> --test_file_path <path_to_test_file> --prediction_file_path <path_to_predictions_file> --subtask B --model <model_name>
276 | ```
277 | The average result across three runs for RoBERTa is 0.75.
278 |
279 | ### Task C
280 |
281 | Running the Transformer baseline:
282 | ```
283 | bash subtaskC/baseline/run.sh
284 | ```
285 | The average MAE score across three runs for Longformer is 3.53 ± 0.212.
286 |
287 | To modify the hyperparameters, please edit the corresponding Python command within the run.sh file.
288 |
289 | ## FAQ
290 | #### Q: How many times can we submit? Which submission will be used for the final ranking?
291 | **A:** We do not limit the number of submissions. The **final (last) submission** will be used for the final ranking.
292 |
293 | #### Q: For subtask C, how did we define the gold boundary?
294 | **A:** Simply speaking, given a text of the form human_text_segment + machine_generated_text, the boundary label = len(human_text_segment.split(" ")).
295 | **Note that we use split(" ") with an explicit whitespace argument, rather than split().**
296 |
297 | #### Q: Where should we register for this shared task?
298 | **A:** In our competition on CodaBench: [https://www.codabench.org/competitions/1752](https://www.codabench.org/competitions/1752).
299 |
300 | #### Q: Should we do all subtasks or just one of them?
301 | **A:** You can choose whichever subtasks interest you. Doing only the English (monolingual) track, or only the multilingual track, is also welcome.
302 |
303 | #### Q: Are all of the deadlines aligned with the dates posted here? https://semeval.github.io/SemEval2024/
304 | **A:** Yes, so far all deadlines are aligned with https://semeval.github.io/SemEval2024/; we will make an announcement if there are any changes.
305 |
306 |
307 | #### Q: Could you please tell me what the differences are between our task’s dataset and the M4 dataset? Are they absolutely the same?
308 |
309 | **A:** There are three major differences from the M4 dataset: 1) the task formulation is different; 2) we upsampled the human text for data balance; and 3) new and surprising domains, generators, and languages will appear in the test sets (the real test sets will not include information about generators, domains, and languages).
310 |
311 | #### Q: We noticed significant disproportionality between the training and development sets. For example, in Subtask A for machine-generated texts, the training set does not contain BLOOMz outputs, while the development set contains only them. Could you please clarify the reason for such an intriguing split?
312 |
313 | **A:** We split the data this way because it is more aligned with real application scenarios, where many domains and generators are unseen during training. Besides, such a development set serves as a hint to participants that entirely new domains, generators, and languages will be included in the real test sets (the real test sets will not include information about generators, domains, and languages).
314 |
315 | #### Q: Is it allowed to use additional data?
316 |
317 | **A:** No, it is not allowed to use extra data.
318 |
319 | ## Organizers
320 |
321 | - Yuxia Wang, Mohamed bin Zayed University of Artificial Intelligence
322 | - Alham Fikri Aji, Mohamed bin Zayed University of Artificial Intelligence
323 | - Artem Shelmanov, Mohamed bin Zayed University of Artificial Intelligence
324 | - Akim Tsvigun, Semrush
325 | - Giovanni Puccetti, Institute of Information Science and Technology, A. Faedo (ISTI CNR)
326 | - Chenxi Whitehouse, Mohamed bin Zayed University of Artificial Intelligence
327 | - Petar Ivanov, Sofia University
328 | - Jonibek Mansurov, Mohamed bin Zayed University of Artificial Intelligence
329 | - Jinyan Su, Mohamed bin Zayed University of Artificial Intelligence
330 | - Tarek Mahmoud, Mohamed bin Zayed University of Artificial Intelligence
331 | - Osama Mohammed Afzal, Mohamed bin Zayed University of Artificial Intelligence
332 | - Thomas Arnold, Technical University Darmstadt
333 | - Iryna Gurevych, Mohamed bin Zayed University of Artificial Intelligence
334 | - Nizar Habash, Mohamed bin Zayed University of Artificial Intelligence
335 | - Preslav Nakov, Mohamed bin Zayed University of Artificial Intelligence
336 |
337 | ## Contacts
338 |
339 | Google group: [https://groups.google.com/g/semeval2024-task8/](https://groups.google.com/g/semeval2024-task8/)
340 | Email: semeval2024-task8@googlegroups.com
341 |
-------------------------------------------------------------------------------- /SemEval2024_task8_overview_April.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/SemEval2024_task8_overview_April.pdf -------------------------------------------------------------------------------- /images/MBZUAI-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/images/MBZUAI-logo.png -------------------------------------------------------------------------------- /images/data_statistics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/images/data_statistics.png
-------------------------------------------------------------------------------- /images/sofia_uni.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbzuai-nlp/SemEval2024-task8/d8350c840bc505eaba06b4baf69993c2d18fef5e/images/sofia_uni.png -------------------------------------------------------------------------------- /subtaskA/baseline/transformer_baseline.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | import pandas as pd 3 | import evaluate 4 | import numpy as np 5 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed 6 | import os 7 | from sklearn.model_selection import train_test_split 8 | from scipy.special import softmax 9 | import argparse 10 | import logging 11 | 12 | def preprocess_function(examples, **fn_kwargs): 13 | return fn_kwargs['tokenizer'](examples["text"], truncation=True) 14 | 15 | 16 | def get_data(train_path, test_path, random_seed): 17 | """ 18 | function to read dataframe with columns 19 | """ 20 | 21 | train_df = pd.read_json(train_path, lines=True) 22 | test_df = pd.read_json(test_path, lines=True) 23 | 24 | train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed) 25 | 26 | return train_df, val_df, test_df 27 | 28 | def compute_metrics(eval_pred): 29 | 30 | f1_metric = evaluate.load("f1") 31 | 32 | predictions, labels = eval_pred 33 | predictions = np.argmax(predictions, axis=1) 34 | 35 | results = {} 36 | results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro")) 37 | 38 | return results 39 | 40 | 41 | def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model): 42 | 43 | # pandas dataframe to huggingface Dataset 44 | train_dataset = Dataset.from_pandas(train_df) 45 | valid_dataset = Dataset.from_pandas(valid_df) 46 | 47 | # get tokenizer and model from huggingface 48 | tokenizer = AutoTokenizer.from_pretrained(model) # put your model here 49 | model = AutoModelForSequenceClassification.from_pretrained( 50 | model, num_labels=len(label2id), id2label=id2label, label2id=label2id # put your model here 51 | ) 52 | 53 | # tokenize data for train/valid 54 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 55 | tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 56 | 57 | 58 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 59 | 60 | 61 | # create Trainer 62 | training_args = TrainingArguments( 63 | output_dir=checkpoints_path, 64 | learning_rate=2e-5, 65 | per_device_train_batch_size=16, 66 | per_device_eval_batch_size=16, 67 | num_train_epochs=3, 68 | weight_decay=0.01, 69 | evaluation_strategy="epoch", 70 | save_strategy="epoch", 71 | load_best_model_at_end=True, 72 | ) 73 | 74 | trainer = Trainer( 75 | model=model, 76 | args=training_args, 77 | train_dataset=tokenized_train_dataset, 78 | eval_dataset=tokenized_valid_dataset, 79 | tokenizer=tokenizer, 80 | data_collator=data_collator, 81 | compute_metrics=compute_metrics, 82 | ) 83 | 84 | trainer.train() 85 | 86 | # save best model 87 | best_model_path = checkpoints_path+'/best/' 88 | 89 | if not os.path.exists(best_model_path): 90 | os.makedirs(best_model_path) 91 | 92 | 93 | trainer.save_model(best_model_path) 94 | 95 | 96 | def 
test(test_df, model_path, id2label, label2id): 97 | 98 | # load tokenizer from saved model 99 | tokenizer = AutoTokenizer.from_pretrained(model_path) 100 | 101 | # load best model 102 | model = AutoModelForSequenceClassification.from_pretrained( 103 | model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id 104 | ) 105 | 106 | test_dataset = Dataset.from_pandas(test_df) 107 | 108 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 109 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 110 | 111 | # create Trainer 112 | trainer = Trainer( 113 | model=model, 114 | tokenizer=tokenizer, 115 | data_collator=data_collator, 116 | compute_metrics=compute_metrics, 117 | ) 118 | # get logits from predictions and evaluate results using classification report 119 | predictions = trainer.predict(tokenized_test_dataset) 120 | prob_pred = softmax(predictions.predictions, axis=-1) 121 | preds = np.argmax(predictions.predictions, axis=-1) 122 | metric = evaluate.load("bstrai/classification_report") 123 | results = metric.compute(predictions=preds, references=predictions.label_ids) 124 | 125 | # return dictionary of classification report 126 | return results, preds 127 | 128 | 129 | if __name__ == '__main__': 130 | 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--train_file_path", "-tr", required=True, help="Path to the train file.", type=str) 133 | parser.add_argument("--test_file_path", "-t", required=True, help="Path to the test file.", type=str) 134 | parser.add_argument("--subtask", "-sb", required=True, help="Subtask (A or B).", type=str, choices=['A', 'B']) 135 | parser.add_argument("--model", "-m", required=True, help="Transformer to train and test", type=str) 136 | parser.add_argument("--prediction_file_path", "-p", required=True, help="Path where to save the prediction file.", type=str) 137 | 138 | args = parser.parse_args() 139 | 140 | random_seed = 0 141 | train_path = args.train_file_path # For example 'subtaskA_train_multilingual.jsonl' 142 | test_path = args.test_file_path # For example 'subtaskA_test_multilingual.jsonl' 143 | model = args.model # For example 'xlm-roberta-base' 144 | subtask = args.subtask # For example 'A' 145 | prediction_path = args.prediction_file_path # For example subtaskB_predictions.jsonl 146 | 147 | if not os.path.exists(train_path): 148 | logging.error("File doesnt exists: {}".format(train_path)) 149 | raise ValueError("File doesnt exists: {}".format(train_path)) 150 | 151 | if not os.path.exists(test_path): 152 | logging.error("File doesnt exists: {}".format(train_path)) 153 | raise ValueError("File doesnt exists: {}".format(train_path)) 154 | 155 | 156 | if subtask == 'A': 157 | id2label = {0: "human", 1: "machine"} 158 | label2id = {"human": 0, "machine": 1} 159 | elif subtask == 'B': 160 | id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'} 161 | label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5} 162 | else: 163 | logging.error("Wrong subtask: {}. It should be A or B".format(train_path)) 164 | raise ValueError("Wrong subtask: {}. 
It should be A or B".format(train_path)) 165 | 166 | set_seed(random_seed) 167 | 168 | #get data for train/dev/test sets 169 | train_df, valid_df, test_df = get_data(train_path, test_path, random_seed) 170 | 171 | # train detector model 172 | fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model) 173 | 174 | # test detector model 175 | results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id) 176 | 177 | logging.info(results) 178 | predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions}) 179 | predictions_df.to_json(prediction_path, lines=True, orient='records') 180 | -------------------------------------------------------------------------------- /subtaskA/format_checker/format_checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import json 5 | import pandas as pd 6 | """ 7 | This script checks whether the results format for subtask A and subtask B is correct. 8 | It also provides some warnings about possible errors. 9 | 10 | The submission of the result file should be in jsonl format. 11 | It should be a lines of objects: 12 | { 13 | id -> identifier of the test sample, 14 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 15 | } 16 | 17 | """ 18 | 19 | logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) 20 | COLUMNS = ['id', 'label'] 21 | 22 | 23 | def check_format(file_path): 24 | if not os.path.exists(file_path): 25 | logging.error("File doesnt exists: {}".format(file_path)) 26 | return False 27 | 28 | try: 29 | submission = pd.read_json(file_path, lines=True)[['id', 'label']] 30 | except: 31 | logging.error("File is not a valid json file: {}".format(file_path)) 32 | return False 33 | 34 | for column in COLUMNS: 35 | if submission[column].isna().any(): 36 | logging.error("NA value in file {} in column {}".format(file_path, column)) 37 | return False 38 | 39 | if not submission['label'].isin(range(0, 2)).all(): 40 | logging.error("Unknown Label in file {}".format(file_path)) 41 | logging.error("Unique Labels in the file are {}".format(submission['label'].unique())) 42 | return False 43 | 44 | return True 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--pred_files_path", "-p", nargs='+', required=True, 51 | help="Path to the files you want to check.", type=str) 52 | 53 | args = parser.parse_args() 54 | logging.info("Subtask A and B. Checking files: {}".format(args.pred_files_path)) 55 | 56 | for pred_file_path in args.pred_files_path: 57 | check_result = check_format(pred_file_path) 58 | result = 'Format is correct' if check_result else 'Something wrong in file format' 59 | logging.info("Subtask A and B. Checking file: {}. Result: {}".format(args.pred_files_path, result)) 60 | -------------------------------------------------------------------------------- /subtaskA/scorer/scorer.py: -------------------------------------------------------------------------------- 1 | import logging.handlers 2 | import argparse 3 | from sklearn.metrics import f1_score, accuracy_score 4 | import pandas as pd 5 | import sys 6 | sys.path.append('.') 7 | from subtaskA.format_checker.format_checker import check_format 8 | 9 | """ 10 | Scoring of SEMEVAL-Task-8--subtask-A-and-B with the metrics f1-macro, f1-micro and accuracy. 
11 | """ 12 | 13 | def evaluate(pred_fpath, gold_fpath): 14 | """ 15 | Evaluates the predicted classes w.r.t. a gold file. 16 | Metrics are: f1-macro, f1-micro and accuracy 17 | 18 | :param pred_fpath: a json file with predictions, 19 | :param gold_fpath: the original annotated gold file. 20 | 21 | The submission of the result file should be in jsonl format. 22 | It should be a lines of objects: 23 | { 24 | id -> identifier of the test sample, 25 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 26 | } 27 | """ 28 | 29 | pred_labels = pd.read_json(pred_fpath, lines=True)[['id', 'label']] 30 | gold_labels = pd.read_json(gold_fpath, lines=True)[['id', 'label']] 31 | 32 | merged_df = pred_labels.merge(gold_labels, on='id', suffixes=('_pred', '_gold')) 33 | 34 | macro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="macro", zero_division=0) 35 | micro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="micro", zero_division=0) 36 | accuracy = accuracy_score(merged_df['label_gold'], merged_df['label_pred']) 37 | 38 | return macro_f1, micro_f1, accuracy 39 | 40 | 41 | def validate_files(pred_files): 42 | if not check_format(pred_files): 43 | logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files)) 44 | return False 45 | return True 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( "--gold_file_path", '-g', type=str, required=True, help="Paths to the file with gold annotations.") 51 | parser.add_argument("--pred_file_path", '-p', type=str, required=True, help="Path to the file with predictions") 52 | args = parser.parse_args() 53 | 54 | pred_file_path = args.pred_file_path 55 | gold_file_path = args.gold_file_path 56 | 57 | if validate_files(pred_file_path): 58 | logging.info('Prediction file format is correct') 59 | macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path) 60 | logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy)) 61 | 62 | 63 | -------------------------------------------------------------------------------- /subtaskB/baseline/transformer_baseline.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | import pandas as pd 3 | import evaluate 4 | import numpy as np 5 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed 6 | import os 7 | from sklearn.model_selection import train_test_split 8 | from scipy.special import softmax 9 | import argparse 10 | import logging 11 | 12 | def preprocess_function(examples, **fn_kwargs): 13 | return fn_kwargs['tokenizer'](examples["text"], truncation=True) 14 | 15 | 16 | def get_data(train_path, test_path, random_seed): 17 | """ 18 | function to read dataframe with columns 19 | """ 20 | 21 | train_df = pd.read_json(train_path, lines=True) 22 | test_df = pd.read_json(test_path, lines=True) 23 | 24 | train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed) 25 | 26 | return train_df, val_df, test_df 27 | 28 | def compute_metrics(eval_pred): 29 | 30 | f1_metric = evaluate.load("f1") 31 | 32 | predictions, labels = eval_pred 33 | predictions = np.argmax(predictions, axis=1) 34 | 35 | results = {} 36 | results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro")) 37 | 38 | return results 39 | 40 | 41 | 
def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model): 42 | 43 | # pandas dataframe to huggingface Dataset 44 | train_dataset = Dataset.from_pandas(train_df) 45 | valid_dataset = Dataset.from_pandas(valid_df) 46 | 47 | # get tokenizer and model from huggingface 48 | tokenizer = AutoTokenizer.from_pretrained(model) # put your model here 49 | model = AutoModelForSequenceClassification.from_pretrained( 50 | model, num_labels=len(label2id), id2label=id2label, label2id=label2id # put your model here 51 | ) 52 | 53 | # tokenize data for train/valid 54 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 55 | tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 56 | 57 | 58 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 59 | 60 | 61 | # create Trainer 62 | training_args = TrainingArguments( 63 | output_dir=checkpoints_path, 64 | learning_rate=2e-5, 65 | per_device_train_batch_size=16, 66 | per_device_eval_batch_size=16, 67 | num_train_epochs=3, 68 | weight_decay=0.01, 69 | evaluation_strategy="epoch", 70 | save_strategy="epoch", 71 | load_best_model_at_end=True, 72 | ) 73 | 74 | trainer = Trainer( 75 | model=model, 76 | args=training_args, 77 | train_dataset=tokenized_train_dataset, 78 | eval_dataset=tokenized_valid_dataset, 79 | tokenizer=tokenizer, 80 | data_collator=data_collator, 81 | compute_metrics=compute_metrics, 82 | ) 83 | 84 | trainer.train() 85 | 86 | # save best model 87 | best_model_path = checkpoints_path+'/best/' 88 | 89 | if not os.path.exists(best_model_path): 90 | os.makedirs(best_model_path) 91 | 92 | 93 | trainer.save_model(best_model_path) 94 | 95 | 96 | def test(test_df, model_path, id2label, label2id): 97 | 98 | # load tokenizer from saved model 99 | tokenizer = AutoTokenizer.from_pretrained(model_path) 100 | 101 | # load best model 102 | model = AutoModelForSequenceClassification.from_pretrained( 103 | model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id 104 | ) 105 | 106 | test_dataset = Dataset.from_pandas(test_df) 107 | 108 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) 109 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 110 | 111 | # create Trainer 112 | trainer = Trainer( 113 | model=model, 114 | tokenizer=tokenizer, 115 | data_collator=data_collator, 116 | compute_metrics=compute_metrics, 117 | ) 118 | # get logits from predictions and evaluate results using classification report 119 | predictions = trainer.predict(tokenized_test_dataset) 120 | prob_pred = softmax(predictions.predictions, axis=-1) 121 | preds = np.argmax(predictions.predictions, axis=-1) 122 | metric = evaluate.load("bstrai/classification_report") 123 | results = metric.compute(predictions=preds, references=predictions.label_ids) 124 | 125 | # return dictionary of classification report 126 | return results, preds 127 | 128 | 129 | if __name__ == '__main__': 130 | 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--train_file_path", "-tr", required=True, help="Path to the train file.", type=str) 133 | parser.add_argument("--test_file_path", "-t", required=True, help="Path to the test file.", type=str) 134 | parser.add_argument("--subtask", "-sb", required=True, help="Subtask (A or B).", type=str, choices=['A', 'B']) 135 | parser.add_argument("--model", "-m", required=True, help="Transformer to train and test", 
type=str) 136 | parser.add_argument("--prediction_file_path", "-p", required=True, help="Path where to save the prediction file.", type=str) 137 | 138 | args = parser.parse_args() 139 | 140 | random_seed = 0 141 | train_path = args.train_file_path # For example 'subtaskA_train_multilingual.jsonl' 142 | test_path = args.test_file_path # For example 'subtaskA_test_multilingual.jsonl' 143 | model = args.model # For example 'xlm-roberta-base' 144 | subtask = args.subtask # For example 'A' 145 | prediction_path = args.prediction_file_path # For example subtaskB_predictions.jsonl 146 | 147 | if not os.path.exists(train_path): 148 | logging.error("File doesnt exists: {}".format(train_path)) 149 | raise ValueError("File doesnt exists: {}".format(train_path)) 150 | 151 | if not os.path.exists(test_path): 152 | logging.error("File doesnt exists: {}".format(train_path)) 153 | raise ValueError("File doesnt exists: {}".format(train_path)) 154 | 155 | 156 | if subtask == 'A': 157 | id2label = {0: "human", 1: "machine"} 158 | label2id = {"human": 0, "machine": 1} 159 | elif subtask == 'B': 160 | id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'} 161 | label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5} 162 | else: 163 | logging.error("Wrong subtask: {}. It should be A or B".format(train_path)) 164 | raise ValueError("Wrong subtask: {}. It should be A or B".format(train_path)) 165 | 166 | set_seed(random_seed) 167 | 168 | #get data for train/dev/test sets 169 | train_df, valid_df, test_df = get_data(train_path, test_path, random_seed) 170 | 171 | # train detector model 172 | fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model) 173 | 174 | # test detector model 175 | results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id) 176 | 177 | logging.info(results) 178 | predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions}) 179 | predictions_df.to_json(prediction_path, lines=True, orient='records') 180 | -------------------------------------------------------------------------------- /subtaskB/format_checker/format_checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | import json 5 | import pandas as pd 6 | """ 7 | This script checks whether the results format for subtask A and subtask B is correct. 8 | It also provides some warnings about possible errors. 9 | 10 | The submission of the result file should be in jsonl format. 
11 | It should be a lines of objects: 12 | { 13 | id -> identifier of the test sample, 14 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 15 | } 16 | 17 | """ 18 | 19 | logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) 20 | COLUMNS = ['id', 'label'] 21 | 22 | 23 | def check_format(file_path): 24 | if not os.path.exists(file_path): 25 | logging.error("File doesnt exists: {}".format(file_path)) 26 | return False 27 | 28 | try: 29 | submission = pd.read_json(file_path, lines=True)[['id', 'label']] 30 | except: 31 | logging.error("File is not a valid json file: {}".format(file_path)) 32 | return False 33 | 34 | for column in COLUMNS: 35 | if submission[column].isna().any(): 36 | logging.error("NA value in file {} in column {}".format(file_path, column)) 37 | return False 38 | 39 | if not submission['label'].isin(range(0, 6)).all(): 40 | logging.error("Unknown Label in file {}".format(file_path)) 41 | logging.error("Unique Labels in the file are {}".format(submission['label'].unique())) 42 | return False 43 | 44 | return True 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--pred_files_path", "-p", nargs='+', required=True, 51 | help="Path to the files you want to check.", type=str) 52 | 53 | args = parser.parse_args() 54 | logging.info("Subtask A and B. Checking files: {}".format(args.pred_files_path)) 55 | 56 | for pred_file_path in args.pred_files_path: 57 | check_result = check_format(pred_file_path) 58 | result = 'Format is correct' if check_result else 'Something wrong in file format' 59 | logging.info("Subtask A and B. Checking file: {}. Result: {}".format(args.pred_files_path, result)) -------------------------------------------------------------------------------- /subtaskB/scorer/scorer.py: -------------------------------------------------------------------------------- 1 | import logging.handlers 2 | import argparse 3 | from sklearn.metrics import f1_score, accuracy_score 4 | import pandas as pd 5 | import sys 6 | sys.path.append('.') 7 | from subtaskB.format_checker.format_checker import check_format 8 | 9 | """ 10 | Scoring of SEMEVAL-Task-8--subtask-A-and-B with the metrics f1-macro, f1-micro and accuracy. 11 | """ 12 | 13 | def evaluate(pred_fpath, gold_fpath): 14 | """ 15 | Evaluates the predicted classes w.r.t. a gold file. 16 | Metrics are: f1-macro, f1-micro and accuracy 17 | 18 | :param pred_fpath: a json file with predictions, 19 | :param gold_fpath: the original annotated gold file. 20 | 21 | The submission of the result file should be in jsonl format. 22 | It should be a lines of objects: 23 | { 24 | id -> identifier of the test sample, 25 | labels -> labels (0 or 1 for subtask A and from 0 to 5 for subtask B), 26 | } 27 | """ 28 | 29 | pred_labels = pd.read_json(pred_fpath, lines=True)[['id', 'label']] 30 | gold_labels = pd.read_json(gold_fpath, lines=True)[['id', 'label']] 31 | 32 | merged_df = pred_labels.merge(gold_labels, on='id', suffixes=('_pred', '_gold')) 33 | 34 | macro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="macro", zero_division=0) 35 | micro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="micro", zero_division=0) 36 | accuracy = accuracy_score(merged_df['label_gold'], merged_df['label_pred']) 37 | 38 | return macro_f1, micro_f1, accuracy 39 | 40 | 41 | def validate_files(pred_files): 42 | if not check_format(pred_files): 43 | logging.error('Bad format for pred file {}. 
Cannot score.'.format(pred_files)) 44 | return False 45 | return True 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( "--gold_file_path", '-g', type=str, required=True, help="Paths to the file with gold annotations.") 51 | parser.add_argument("--pred_file_path", '-p', type=str, required=True, help="Path to the file with predictions") 52 | args = parser.parse_args() 53 | 54 | pred_file_path = args.pred_file_path 55 | gold_file_path = args.gold_file_path 56 | 57 | if validate_files(pred_file_path): 58 | logging.info('Prediction file format is correct') 59 | macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path) 60 | logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy)) 61 | 62 | 63 | -------------------------------------------------------------------------------- /subtaskC/baseline/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.23.0 2 | certifi==2023.7.22 3 | charset-normalizer==3.2.0 4 | cmake==3.27.5 5 | filelock==3.12.4 6 | fsspec==2023.9.1 7 | huggingface-hub==0.17.2 8 | idna==3.4 9 | Jinja2==3.1.2 10 | joblib==1.3.2 11 | lit==16.0.6 12 | MarkupSafe==2.1.3 13 | mpmath==1.3.0 14 | networkx==3.1 15 | numpy==1.26.0 16 | nvidia-cublas-cu11==11.10.3.66 17 | nvidia-cuda-cupti-cu11==11.7.101 18 | nvidia-cuda-nvrtc-cu11==11.7.99 19 | nvidia-cuda-runtime-cu11==11.7.99 20 | nvidia-cudnn-cu11==8.5.0.96 21 | nvidia-cufft-cu11==10.9.0.58 22 | nvidia-curand-cu11==10.2.10.91 23 | nvidia-cusolver-cu11==11.4.0.1 24 | nvidia-cusparse-cu11==11.7.4.91 25 | nvidia-nccl-cu11==2.14.3 26 | nvidia-nvtx-cu11==11.7.91 27 | packaging==23.1 28 | pandas==2.1.0 29 | psutil==5.9.5 30 | python-dateutil==2.8.2 31 | pytz==2023.3.post1 32 | PyYAML==6.0.1 33 | regex==2023.8.8 34 | requests==2.31.0 35 | safetensors==0.3.3 36 | scikit-learn==1.3.0 37 | scipy==1.11.2 38 | six==1.16.0 39 | sympy==1.12 40 | threadpoolctl==3.2.0 41 | tokenizers==0.13.3 42 | torch==2.0.1 43 | tqdm==4.66.1 44 | transformers==4.33.2 45 | triton==2.0.0 46 | typing_extensions==4.8.0 47 | tzdata==2023.3 48 | urllib3==2.0.4 49 | -------------------------------------------------------------------------------- /subtaskC/baseline/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exp_name="exp_1" 3 | seed_value=42 4 | python transformer_baseline.py \ 5 | --model_path "allenai/longformer-base-4096" \ 6 | --train_file "../data/subtaskC_train.jsonl" \ 7 | --load_best_model_at_end True \ 8 | --dev_file "../data/subtaskC_dev.jsonl" \ 9 | --test_files ../data/subtaskC_dev.jsonl \ 10 | --metric_for_best_model "eval_mean_absolute_diff" \ 11 | --greater_is_better False \ 12 | --do_train True \ 13 | --do_predict True \ 14 | --seed $seed_value \ 15 | --output_dir "./runs/$exp_name" \ 16 | --logging_dir "./runs/$exp_name/logs" \ 17 | --num_train_epochs 10 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 32 \ 20 | --auto_find_batch_size True \ 21 | --logging_steps 10 \ 22 | --load_best_model_at_end True \ 23 | --evaluation_strategy "epoch" \ 24 | --save_strategy "epoch" \ 25 | --save_total_limit 2 26 | -------------------------------------------------------------------------------- /subtaskC/baseline/transformer_baseline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | from transformers import AutoTokenizer, 
def validate_files(pred_files):
    if not check_format(pred_files):
        logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files))
        return False
    return True


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold_file_path", '-g', type=str, required=True, help="Path to the file with gold annotations.")
    parser.add_argument("--pred_file_path", '-p', type=str, required=True, help="Path to the file with predictions.")
    args = parser.parse_args()

    pred_file_path = args.pred_file_path
    gold_file_path = args.gold_file_path

    if validate_files(pred_file_path):
        logging.info('Prediction file format is correct')
        macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path)
        logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy))
--------------------------------------------------------------------------------
/subtaskC/baseline/requirements.txt:
--------------------------------------------------------------------------------
accelerate==0.23.0
certifi==2023.7.22
charset-normalizer==3.2.0
cmake==3.27.5
filelock==3.12.4
fsspec==2023.9.1
huggingface-hub==0.17.2
idna==3.4
Jinja2==3.1.2
joblib==1.3.2
lit==16.0.6
MarkupSafe==2.1.3
mpmath==1.3.0
networkx==3.1
numpy==1.26.0
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
packaging==23.1
pandas==2.1.0
psutil==5.9.5
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
regex==2023.8.8
requests==2.31.0
safetensors==0.3.3
scikit-learn==1.3.0
scipy==1.11.2
six==1.16.0
sympy==1.12
threadpoolctl==3.2.0
tokenizers==0.13.3
torch==2.0.1
tqdm==4.66.1
transformers==4.33.2
triton==2.0.0
typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.4
--------------------------------------------------------------------------------
/subtaskC/baseline/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
exp_name="exp_1"
seed_value=42
python transformer_baseline.py \
    --model_path "allenai/longformer-base-4096" \
    --train_file "../data/subtaskC_train.jsonl" \
    --dev_file "../data/subtaskC_dev.jsonl" \
    --test_files ../data/subtaskC_dev.jsonl \
    --metric_for_best_model "eval_mean_absolute_diff" \
    --greater_is_better False \
    --do_train True \
    --do_predict True \
    --seed $seed_value \
    --output_dir "./runs/$exp_name" \
    --logging_dir "./runs/$exp_name/logs" \
    --num_train_epochs 10 \
    --per_device_train_batch_size 32 \
    --per_device_eval_batch_size 32 \
    --auto_find_batch_size True \
    --logging_steps 10 \
    --load_best_model_at_end True \
    --evaluation_strategy "epoch" \
    --save_strategy "epoch" \
    --save_total_limit 2
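# Inference-only variant (a sketch; assumes a completed training run has
# already populated ./runs/$exp_name with at least one checkpoint). The script
# then loads the best checkpoint recorded in trainer_state.json:
#
# python transformer_baseline.py \
#     --model_path "allenai/longformer-base-4096" \
#     --train_file "../data/subtaskC_train.jsonl" \
#     --dev_file "../data/subtaskC_dev.jsonl" \
#     --test_files ../data/subtaskC_dev.jsonl \
#     --do_train False \
#     --do_predict True \
#     --output_dir "./runs/$exp_name"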
--------------------------------------------------------------------------------
/subtaskC/baseline/transformer_baseline.py:
--------------------------------------------------------------------------------
import torch
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.trainer_callback import TrainerState
import transformers
import pandas as pd
import numpy as np

from dataclasses import dataclass, field
from typing import Any, List, Optional
import logging
import glob
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


@dataclass
class ModelConfig:
    model_path: str = "allenai/longformer-base-4096"


@dataclass
class DatasetConfig:
    train_file: str = field(default=None, metadata={"help": "Path to train jsonl file"})
    dev_file: str = field(default=None, metadata={"help": "Path to dev jsonl file"})
    test_files: List[str] = field(
        default=None, metadata={"help": "Paths to test jsonl files"}
    )


@dataclass
class TrainingArgsConfig(transformers.TrainingArguments):
    seed: int = 42
    output_dir: str = "./runs/exp_3"
    num_train_epochs: int = 10
    per_device_train_batch_size: int = 32
    per_device_eval_batch_size: int = 32
    auto_find_batch_size: bool = True
    logging_dir: str = "./runs/exp_3/logs"
    logging_steps: int = 10
    load_best_model_at_end: bool = True
    evaluation_strategy: str = "epoch"
    save_strategy: str = "epoch"
    save_total_limit: int = 2


class Semeval_Data(torch.utils.data.Dataset):
    def __init__(self, data_path, max_length=1024, inference=False, debug=False):
        with open(data_path, "r") as f:
            self.data = [json.loads(line) for line in f]
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        id = self.data[idx]["id"]
        label = None
        labels_available = "label" in self.data[idx]

        if labels_available:
            label = self.data[idx]["label"]

        if self.debug and not self.inference:
            print("Original Human Position: ", label)

        labels = []
        corresponding_word = []
        tokens = []
        input_ids = []
        attention_mask = []

        for jdx, word in enumerate(text.split(" ")):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)

            if labels_available:
                is_machine_text = 1 if jdx >= label else 0
                labels.extend([is_machine_text] * sub_words)

            corresponding_word.extend([jdx] * sub_words)
            tokens.extend(word_encoded)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))
            attention_mask.extend([1] * sub_words)

        # Pad everything to max_length; labels are padded with -100 so that
        # padded positions are ignored by the loss.
        if len(input_ids) < self.max_length - 2:
            input_ids = (
                [0] + input_ids + [2] + [1] * (self.max_length - len(input_ids) - 2)
            )
            if labels_available:
                labels = [-100] + labels + [-100] * (self.max_length - len(labels) - 1)

            attention_mask = (
                [1]
                + attention_mask
                + [1]
                + [0] * (self.max_length - len(attention_mask) - 2)
            )
            corresponding_word = (
                [-100]
                + corresponding_word
                + [-100] * (self.max_length - len(corresponding_word) - 1)
            )
            tokens = (
                ["<s>"]
                + tokens
                + ["</s>"]
                + ["<pad>"] * (self.max_length - len(tokens) - 2)
            )
        else:
            # Truncate to max_length; the CLS and SEP positions get -100 labels
            input_ids = [0] + input_ids[: self.max_length - 2] + [2]

            if labels_available:
                labels = [-100] + labels[: self.max_length - 2] + [-100]

            corresponding_word = (
                [-100] + corresponding_word[: self.max_length - 2] + [-100]
            )
            attention_mask = [1] + attention_mask[: self.max_length - 2] + [1]
            tokens = ["<s>"] + tokens[: self.max_length - 2] + ["</s>"]

        encoded = {}
        if labels_available:
            encoded["labels"] = torch.tensor(labels)

        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)

        if labels_available:
            if encoded["input_ids"].shape != encoded["labels"].shape:
                print("Input IDs Shape: ", encoded["input_ids"].shape)
                print("Labels Shape: ", encoded["labels"].shape)
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            print("Tokenized Human Position: ", labels.index(1))
            print("Original Human Position: ", label)
            print("Full Human Text:", text)
            print("\n")
            print("Human Text Truncated:", text.split(" ")[:label])
            print("\n")
            encoded["partial_human_review"] = " ".join(text.split(" ")[:label])

        if self.inference:
            encoded["text"] = text
            encoded["id"] = id
            encoded["corresponding_word"] = corresponding_word

        return encoded
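# Worked example (a sketch with a made-up sample): for text "a b c d" and
# label 2 (machine-generated text starts at word index 2), the word-level
# flags are [0, 0, 1, 1]. Each flag is repeated once per sub-word token the
# tokenizer produces for that word, and the special <s>/</s> positions as
# well as any padding are labeled -100 so they are ignored by the loss.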
def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.

    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.

    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """
    Get the start position from a sequence of labels or predictions.

    Args:
    - sequence (np.array): A sequence of labels or predictions.
    - mapping (np.array): Mapping from index to word for the sequence.
    - token_level (bool): If True, return positional indices; else, return word mappings.

    Returns:
    - int or str: Start position in the sequence.
    """
    # Locate the position of label '1'

    if mapping is not None:
        mask = mapping != -100
        sequence = sequence[mask]
        mapping = mapping[mask]

    index = np.where(sequence == 1)[0]
    value = index[0] if index.size else (len(sequence) - 1)

    if not token_level:
        value = mapping[value]

    return value
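# Example (a sketch): for sequence [0, 0, 1, 1] with mapping [-100, 0, 1, 1],
# the -100 entry is masked out first, leaving [0, 1, 1]. The first '1' sits at
# token index 1; with token_level=False this is mapped back to word index
# mapping[1] = 1. If no '1' is found at all, the last position is returned.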
def evaluate_machine_start_position(
    labels, predictions, idx2word=None, token_level=False
):
    """
    Evaluate the starting position of machine-generated text in both predicted and actual sequences.

    Args:
    - labels (np.array): Actual labels.
    - predictions (np.array): Predicted labels.
    - idx2word (np.array): Mapping from index to word for each sequence in the batch.
    - token_level (bool): Flag to determine if evaluation is at token level. If True, return positional indices; else, return word mappings.

    Returns:
    - float: Mean absolute difference between the start positions in predictions and actual labels.
    """
    predicted_positions = predictions.argmax(axis=-1)

    actual_starts = []
    predicted_starts = []

    if not token_level and idx2word is None:
        raise ValueError(
            "idx2word must be provided if evaluation is at word level (token_level=False)"
        )

    for idx in range(labels.shape[0]):
        # Remove padding
        mask = labels[idx] != -100
        predict, label, mapping = (
            predicted_positions[idx][mask],
            labels[idx][mask],
            idx2word[idx][mask] if not token_level else None,
        )

        # If token_level is True, just use the index; otherwise, map to word
        predicted_value = get_start_position(predict, mapping, token_level)
        actual_value = get_start_position(label, mapping, token_level)

        predicted_starts.append(predicted_value)
        actual_starts.append(actual_value)

    position_differences = [
        evaluate_position_difference(actual, predict)
        for actual, predict in zip(actual_starts, predicted_starts)
    ]
    mean_position_difference = np.mean(position_differences)

    return mean_position_difference


def compute_metrics(p):
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)

    return {
        "mean_absolute_diff": mean_absolute_diff,
    }


if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelConfig, DatasetConfig, TrainingArgsConfig)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("Model Arguments: ", model_args)
    print("Data Arguments: ", data_args)
    print("Training Arguments: ", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    model_path = model_args.model_path
    if (
        training_args.do_eval or training_args.do_predict
    ) and not training_args.do_train:
        output_dir = training_args.output_dir
        if not os.path.exists(output_dir):
            raise ValueError(
                f"Output directory ({output_dir}) does not exist. Please train the model first."
            )

        # Find the best model checkpoint
        ckpt_paths = sorted(
            glob.glob(os.path.join(output_dir, "checkpoint-*")),
            key=lambda x: int(x.split("-")[-1]),
        )

        if not ckpt_paths:
            raise ValueError(
                f"Output directory ({output_dir}) does not contain any checkpoint. Please train the model first."
            )

        state = TrainerState.load_from_json(
            os.path.join(ckpt_paths[-1], "trainer_state.json")
        )
        best_model_path = state.best_model_checkpoint or model_args.model_path
        if state.best_model_checkpoint is None:
            logger.info(
                "No best model checkpoint found. Using the default model checkpoint."
            )
        print(f"Best model path: {best_model_path}")
        model_path = best_model_path

    # Load model
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=2, trust_remote_code=True
    )

    train_set = Semeval_Data(data_args.train_file)
    dev_set = Semeval_Data(data_args.dev_file)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=dev_set,
        tokenizer=train_set.tokenizer,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        logger.info("Training...")
        logger.info("*** Train Dataset ***")
        logger.info(f"Number of samples: {len(train_set)}")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        trainer.train()

        logger.info("Training completed!")

    if training_args.do_eval:
        logger.info("Evaluating...")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        metrics = trainer.evaluate()
        logger.info(f"Metrics: {metrics}")
        trainer.save_metrics("eval", metrics)

        logger.info("Evaluation completed!")

    if training_args.do_predict:
        test_sets = []
        for test_file in data_args.test_files:
            test_set = Semeval_Data(test_file, inference=True)
            test_sets.append(test_set)
        logger.info("Predicting...")
        logger.info("*** Test Datasets ***")
        logger.info(f"Number of test datasets: {len(test_sets)}")

        for idx, test_set in enumerate(test_sets):
            logger.info(f"Test Dataset {idx + 1}")
            logger.info(f"Number of samples: {len(test_set)}")

            predictions, _, _ = trainer.predict(test_set)
            logger.info("Predictions completed!")

            df = pd.DataFrame(
                {
                    "id": [i["id"] for i in test_set],
                    "label": [
                        get_start_position(
                            i[0],
                            np.array(i[1]["corresponding_word"]),
                            token_level=False,
                        )
                        for i in list(zip(predictions.argmax(axis=-1), test_set))
                    ],
                }
            )

            file_name = os.path.basename(data_args.test_files[idx])
            file_dirs = os.path.join(training_args.output_dir, "predictions")
            os.makedirs(file_dirs, exist_ok=True)
            file_path = os.path.join(file_dirs, file_name)
            records = df.to_dict("records")
            with open(file_path, "w") as f:
                for record in records:
                    f.write(json.dumps(record) + "\n")
--------------------------------------------------------------------------------
/subtaskC/format_checker/format_checker.py:
--------------------------------------------------------------------------------
import os
import argparse
import logging
import json
import pandas as pd

"""
This script checks whether the results format for subtask C is correct.
It also provides some warnings about possible errors.

The submission of the result file should be in JSONL format.
Each line is an object with the fields:
{
    "id" -> identifier of the test sample,
    "label" -> predicted start position of machine-generated text,
}
"""
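# Illustrative example (made up, not from the official data): a valid
# prediction file would contain one object per line, e.g.:
#   {"id": "abc123", "label": 42}
# where 42 is the predicted word index at which machine-generated text begins.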
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)
COLUMNS = ["id", "label"]


def check_format(file_path):
    if not os.path.exists(file_path):
        logging.error("File doesn't exist: {}".format(file_path))
        return False

    try:
        submission = pd.read_json(file_path, lines=True)[["id", "label"]]
    except Exception as e:
        logging.error("File is not a valid jsonl file: {}".format(file_path))
        logging.error(e)
        return False

    for column in COLUMNS:
        if submission[column].isna().any():
            logging.error("NA value in file {} in column {}".format(file_path, column))
            return False

    if not submission["label"].dtypes == "int64":
        logging.error("Unknown datatype in file {} for column label".format(file_path))
        return False

    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pred_files_path",
        "-p",
        nargs="+",
        required=True,
        help="Paths to the files you want to check.",
        type=str,
    )

    args = parser.parse_args()
    logging.info("Subtask C. Checking files: {}".format(args.pred_files_path))

    for pred_file_path in args.pred_files_path:
        check_result = check_format(pred_file_path)
        result = (
            "Format is correct" if check_result else "Something wrong in file format"
        )
        logging.info(
            "Subtask C. Checking file: {}. Result: {}".format(
                pred_file_path, result
            )
        )
--------------------------------------------------------------------------------
/subtaskC/scorer/scorer.py:
--------------------------------------------------------------------------------
import logging
import argparse
import pandas as pd
import os
import numpy as np

"""
Scoring of SEMEVAL-Task-8--subtask-C with the metric Mean Absolute Error (MAE)
"""
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)
COLUMNS = ["id", "label"]


def check_format(file_path):
    if not os.path.exists(file_path):
        logging.error("File doesn't exist: {}".format(file_path))
        return False

    try:
        submission = pd.read_json(file_path, lines=True)[["id", "label"]]
    except Exception as e:
        logging.error("File is not a valid jsonl file: {}".format(file_path))
        logging.error(e)
        return False

    for column in COLUMNS:
        if submission[column].isna().any():
            logging.error("NA value in file {} in column {}".format(file_path, column))
            return False

    if not submission["label"].dtypes == "int64":
        logging.error("Unknown datatype in file {} for column label".format(file_path))
        return False

    return True


def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.

    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.

    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def evaluate(pred_fpath, gold_fpath):
    """
    Evaluates the predicted start positions w.r.t. a gold file.
    Metric is: Mean Absolute Error (MAE)

    :param pred_fpath: a jsonl file with predictions,
    :param gold_fpath: the original annotated jsonl file.

    The submission of the result file should be in jsonl format.
    It should consist of one JSON object per line:
    {
        id -> identifier of the test sample,
        label -> predicted start position of machine-generated text,
    }
    """

    pred_labels = pd.read_json(pred_fpath, lines=True)[["id", "label"]]
    gold_labels = pd.read_json(gold_fpath, lines=True)[["id", "label"]]

    merged_df = pred_labels.merge(gold_labels, on="id", suffixes=("_pred", "_gold"))

    # Compute the absolute difference between the actual and predicted start positions.
    out = merged_df.apply(
        lambda row: evaluate_position_difference(row["label_gold"], row["label_pred"]),
        axis=1,
    ).values
    logging.info(f"Number of samples: {len(merged_df)}")
    # Compute the mean absolute error (MAE)
    mae = np.mean(out)
    return mae
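# Worked example (a sketch with made-up values): for gold start positions
# [10, 5, 20] and predictions [12, 5, 15], the absolute differences are
# [2, 0, 5], so MAE = (2 + 0 + 5) / 3 = 2.33 (lower is better).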
def validate_files(pred_files):
    if not check_format(pred_files):
        logging.error("Bad format for pred file {}. Cannot score.".format(pred_files))
        return False
    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gold_file_path",
        "-g",
        type=str,
        required=True,
        help="Path to the jsonl file with gold annotations.",
    )
    parser.add_argument(
        "--pred_file_path",
        "-p",
        type=str,
        required=True,
        help="Path to the jsonl file with predictions.",
    )
    args = parser.parse_args()

    pred_file_path = args.pred_file_path
    gold_file_path = args.gold_file_path

    if validate_files(pred_file_path):
        logging.info("Prediction file format is correct")
        mae = evaluate(pred_file_path, gold_file_path)
        logging.info(f"Mean Absolute Error={mae:.5f}")
--------------------------------------------------------------------------------