├── .gitignore ├── LICENSE ├── LICENSE_FOR_SAMPLE_TEXTS ├── Makefile ├── README.md ├── annotable.py ├── color_manager.py ├── conll2jsonlines.py ├── conll2sacr.py ├── conll_transform.py ├── docs ├── imgs │ ├── glozz_annotation.png │ ├── notebook_join.png │ ├── notebook_part_of_speech_of_first_mentions.png │ ├── notebook_pivot.png │ ├── notebook_pivot_chart.png │ ├── notebook_sentence_lengths.png │ ├── notebook_singletons.png │ ├── pict01.png │ ├── pict02.png │ ├── pict03.png │ ├── pict04.png │ └── pict05.png ├── sample_notebook.html └── sample_notebook.ipynb ├── glozz2sacr.pl ├── jsonlines2conll.py ├── jsonlines2text.py ├── mypy.ini ├── pyproject.toml ├── requirements.txt ├── sacr2ann.py ├── sacr2annotable.py ├── sacr2conll.py ├── sacr2df.py ├── sacr2glozz.pl ├── sacr_parser.py ├── sacr_parser2.py ├── setup.cfg ├── standoff2inline.py ├── testing ├── aesop.sacr ├── caesar.sacr ├── cicero.sacr ├── docs.jsonlines ├── lucian_speakers.sacr ├── pliny.sacr ├── simple.sacr ├── singe.conll ├── singe.jsonlines └── testing_sacr2conll.conll ├── testing_conll2sacr ├── _aesop.sacr___part_000 ├── _ceasar.sacr___part_000 ├── _cicero.sacr___part_000 ├── _pliny.sacr___part_000 └── _simple.sacr___part_000 ├── tests ├── test_annotable.py ├── test_sacr2ann.py ├── test_sacr2annotable.py └── test_sacr_parser.py └── text2jsonlines.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .idea/ 3 | venv/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 
128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. 
Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /LICENSE_FOR_SAMPLE_TEXTS: -------------------------------------------------------------------------------- 1 | The sample texts have been downloaded from wikipedia. They are distributed 2 | under the terms of the CC BY-SA-3.0 licence, which is reproduced below. 3 | 4 | https://creativecommons.org/licenses/by-sa/3.0/ 5 | 6 | ************************************************************************ 7 | 8 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. 9 | 10 | License 11 | 12 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). 
THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 13 | 14 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 15 | 16 | 1. Definitions 17 | 18 | "Adaptation" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered an Adaptation for the purpose of this License. 19 | "Collection" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(f) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined below) for the purposes of this License. 20 | "Creative Commons Compatible License" means a license that is listed at https://creativecommons.org/compatiblelicenses that has been approved by Creative Commons as being essentially equivalent to this License, including, at a minimum, because that license: (i) contains terms that have the same purpose, meaning and effect as the License Elements of this License; and, (ii) explicitly permits the relicensing of adaptations of works made available under that license under this License or a Creative Commons jurisdiction license with the same License Elements as this License. 21 | "Distribute" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership. 22 | "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike. 23 | "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License. 
24 | "Original Author" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast. 25 | "Work" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work. 26 | "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation. 27 | "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images. 28 | "Reproduce" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium. 29 | 30 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws. 31 | 32 | 3. License Grant. 
Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below: 33 | 34 | to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections; 35 | to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked "The original work was translated from English to Spanish," or a modification could indicate "The original work has been modified."; 36 | to Distribute and Publicly Perform the Work including as incorporated in Collections; and, 37 | to Distribute and Publicly Perform Adaptations. 38 | 39 | For the avoidance of doubt: 40 | Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; 41 | Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and, 42 | Voluntary License Schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License. 43 | 44 | The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved. 45 | 46 | 4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions: 47 | 48 | You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. 
If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(c), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(c), as requested. 49 | You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; (iii) a Creative Commons jurisdiction license (either this or a later license version) that contains the same License Elements as this License (e.g., Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible License. If you license the Adaptation under one of the licenses mentioned in (iv), you must comply with the terms of that license. If you license the Adaptation under the terms of any of the licenses mentioned in (i), (ii) or (iii) (the "Applicable License"), you must comply with the terms of the Applicable License generally and the following provisions: (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform; (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License; (III) You must keep intact all notices that refer to the Applicable License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform; (IV) when You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License. 50 | If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution ("Attribution Parties") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and (iv) , consistent with Ssection 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). 
The credit required by this Section 4(c) may be implemented in any reasonable manner; provided, however, that in the case of a Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties. 51 | Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise. 52 | 53 | 5. Representations, Warranties and Disclaimer 54 | 55 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 56 | 57 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 58 | 59 | 7. Termination 60 | 61 | This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License. 62 | Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). 
Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above. 63 | 64 | 8. Miscellaneous 65 | 66 | Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License. 67 | Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License. 68 | If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. 69 | No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent. 70 | This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You. 71 | The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law. 72 | 73 | Creative Commons Notice 74 | 75 | Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor. 
76 | 77 | Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of the License. 78 | 79 | Creative Commons may be contacted at https://creativecommons.org/. 80 | 81 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | check-lint: 2 | . venv/bin/activate && (isort --check .; black --check .; flake8 .; mypy --strict .) 3 | 4 | lint: 5 | . venv/bin/activate && (isort .; black .; flake8 .; mypy --strict .) 6 | 7 | test: 8 | . venv/bin/activate && pytest 9 | -------------------------------------------------------------------------------- /color_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module defines a ColorManager. 3 | """ 4 | 5 | # 2020 Bruno Oberle, MPL 2.0, see the LICENSE file. 6 | 7 | __version__ = '1.1.0' 8 | 9 | 10 | class ColorManager: 11 | """Generate colors based on HSL. 12 | 13 | The hue is the main iterator, so that successive colors are significantly different from 14 | each other. 15 | 16 | Example: 17 | -------- 18 | 19 | >>> cm = ColorManager(50, 20, 20) 20 | >>> for i in range(5): 21 | ... color = cm.get_next_color() 22 | ... print(color) 23 | hsl(0, 100%, 80%) 24 | hsl(50, 100%, 80%) 25 | hsl(100, 100%, 80%) 26 | hsl(150, 100%, 80%) 27 | hsl(200, 100%, 80%) 28 | 29 | Class Attributes 30 | ---------------- 31 | gray: str 32 | The gray color. 33 | 34 | Attributes 35 | ---------- 36 | hue_step: `int` (default: 25) 37 | Hue step. 38 | saturation_step: `int` (default: 25) 39 | Saturation step. 40 | lightness_step: `int` (default: 10) 41 | Lightness step. 42 | gray: `str` (default `rgb(125, 125, 125)`) 43 | Value returned when there is no more color available, and `repeat` is 44 | `False`. 45 | repeat: bool (default: `True`) 46 | If `True`, start over when all the colors have been used (otherwise yield `gray`). 47 | 48 | Note 49 | ---- 50 | Use `len(cm)` to get the number of available colors. 51 | """ 52 | 53 | gray = "rgb(125, 125, 125)" 54 | 55 | def __init__(self, hue_step=25, saturation_step=25, lightness_step=10, 56 | repeat=True): 57 | self.hue_step = hue_step 58 | self.saturation_step = saturation_step 59 | self.lightness_step = lightness_step 60 | self.gray = "rgb(125, 125, 125)" 61 | self.reset_iterator() 62 | self.repeat = repeat 63 | 64 | def __len__(self): 65 | """Return the number of available colors.""" 66 | hue = 360 // self.hue_step + 1 67 | saturation = 100 // self.saturation_step 68 | lightness = 70 // self.lightness_step # because ]10;80] 69 | return hue * saturation * lightness 70 | 71 | def reset_iterator(self): 72 | """Reset the iterator.""" 73 | self._iter = self.iter_color() 74 | 75 | def get_next_color(self): 76 | """Return the next color.""" 77 | return next(self._iter) 78 | 79 | def iter_color(self): 80 | """Generator that goes through all the colors. 81 | 82 | It is used as the iterator. 83 | 84 | When there are no more colors (and `repeat` is `False`), yield the `gray` instance attribute. 85 | Never raises StopIteration.
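
For instance, with the default steps, the first colors produced should be
(the hue varies fastest, then the lightness, then the saturation):

    >>> cm = ColorManager()
    >>> print(cm.get_next_color())
    hsl(0, 100%, 80%)
    >>> print(cm.get_next_color())
    hsl(25, 100%, 80%)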
86 | """ 87 | while True: 88 | for s in range(100, -1, -self.saturation_step): 89 | for l in range(80, 9, -self.lightness_step): 90 | for h in range(0, 361, self.hue_step): 91 | yield "hsl(%d, %d%%, %d%%)" % (h, s, l) 92 | if not self.repeat: 93 | while True: 94 | yield self.gray 95 | 96 | 97 | 98 | class CommonColorManager: 99 | """Generate colors based on named html colors. 100 | """ 101 | 102 | gray = "gray" 103 | 104 | colors = [ 105 | "red", 106 | "maroon", 107 | "yellow", 108 | "olive", 109 | "lime", 110 | "green", 111 | "aqua", 112 | "teal", 113 | "blue", 114 | "navy", 115 | "fuchsia", 116 | "purple", 117 | ] 118 | 119 | def __init__(self, remove_yellow=True, repeat=True): 120 | self.repeat = repeat 121 | self.colors = self.__class__.colors.copy() 122 | if remove_yellow: 123 | self.colors.remove("yellow") 124 | self.reset_iterator() 125 | 126 | def __len__(self): 127 | """Return the number of available colors.""" 128 | return len(self.colors) 129 | 130 | def reset_iterator(self): 131 | """Reset the iterator.""" 132 | self._iter = self.iter_color() 133 | 134 | def get_next_color(self): 135 | """Return the next color.""" 136 | return next(self._iter) 137 | 138 | def iter_color(self): 139 | """Generator that goes through all the colors.""" 140 | while True: 141 | for color in self.colors: 142 | yield color 143 | if not self.repeat: 144 | while True: 145 | yield self.__class__.gray 146 | 147 | 148 | -------------------------------------------------------------------------------- /conll2jsonlines.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Convert conll format (2012 or U or X) into jsonlines format. 3 | 4 | The jsonlines format stores data for 5 | several texts (a corpus). Each line is a valid json document, as follows: 6 | 7 | { 8 | "clusters": [], 9 | "doc_key": "nw:docname", 10 | "sentences": [["This", "is", "the", "first", "sentence", "."], 11 | ["This", "is", "the", "second", "."]], 12 | "speakers": [["spk1", "spk1", "spk1", "spk1", "spk1", "spk1"], 13 | ["spk2", "spk2", "spk2", "spk2", "spk2"]] 14 | } 15 | 16 | It is used for some coreference resolution systems, such as: 17 | 18 | - https://github.com/kentonl/e2e-coref 19 | - https://github.com/kkjawz/coref-ee 20 | - https://github.com/boberle/cofr 21 | 22 | To convert from the original CoNLL2012 format into jsonlines format: 23 | 24 | python3 conll2jsonlines.py \ 25 | --token-col 3 \ 26 | --speaker-col 9 \ 27 | INPUT_FILE \ 28 | OUTPUT_FILE 29 | 30 | To convert from the StanfordNLP format into jsonlines format: 31 | 32 | python3 conll2jsonlines.py \ 33 | --skip-singletons \ 34 | --skip-empty-documents \ 35 | --tab \ 36 | --ignore-double-indices 0 \ 37 | --token-col 1 \ 38 | --speaker-col "_" \ 39 | --no-coref \ 40 | INPUT_FILE \ 41 | OUTPUT_FILE 42 | 43 | To convert from the Democrat corpus in CoNLL format (with a column for 44 | paragraphs at position 11): 45 | 46 | python3 conll2jsonlines.py \ 47 | --tab \ 48 | --ignore-double-indices 0 \ 49 | --token-col 1 \ 50 | --speaker-col "_" \ 51 | --par-col 11 \ 52 | testing/singe.conll \ 53 | testing/singe.jsonlines 54 | 55 | Note that you may have to change document keys in the CoNLL files before 56 | running this script if you want to transform them.
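
As a quick sanity check, the resulting file can be read back with the
standard library only; this is just a sketch (the "paragraphs" key is
present only when `--par-col` is used):

    import json

    with open("OUTPUT_FILE") as fh:
        for line in fh:
            doc = json.loads(line)
            print(doc["doc_key"], len(doc["sentences"]), len(doc["clusters"]))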
57 | """ 58 | 59 | import json 60 | import os 61 | import argparse 62 | 63 | import conll_transform 64 | 65 | 66 | def conll2jsonlines( 67 | infpath, outfpath, 68 | sep=None, token_col=3, speaker_col=9, add_coref=True, par_col=0, 69 | ignore_double_indices=None, 70 | skip_empty_documents=False, skip_singletons=False): 71 | 72 | docs = conll_transform.read_files( 73 | infpath, 74 | sep=sep, 75 | ignore_double_indices=ignore_double_indices, 76 | ) 77 | 78 | with open(outfpath, 'w') as fh: 79 | 80 | for doc_key, doc in docs.items(): 81 | 82 | print("Doing %s" % doc_key) 83 | 84 | if add_coref: 85 | clusters = conll_transform.compute_chains(doc) 86 | clusters = [ 87 | [ list(mention) for mention in cluster] 88 | for cluster in clusters 89 | ] 90 | for cluster in clusters: 91 | conll_transform.sentpos2textpos(cluster, doc) 92 | if skip_singletons: 93 | clusters = list(filter(lambda c: len(c) > 1, clusters)) 94 | if skip_empty_documents and not clusters: 95 | print("Skipping %s because no cluster" % doc_key) 96 | continue 97 | else: 98 | clusters = [] 99 | 100 | tokens = [t for sent in doc for t in sent] 101 | 102 | sentences = [ 103 | [token[token_col] for token in sent] for sent in doc 104 | ] 105 | 106 | if par_col: 107 | start = 0 108 | length = 0 109 | current = -1 110 | paragraphs = [] 111 | for sent in doc: 112 | length += len(sent) 113 | if int(sent[0][par_col]) != current: 114 | current = int(sent[0][par_col]) 115 | paragraphs.append([start, start+length-1]) 116 | start += length 117 | length = 0 118 | else: 119 | #paragraphs = [[0, len(tokens)]] 120 | paragraphs = None 121 | 122 | if speaker_col.isdigit(): 123 | speakers = [ 124 | [token[int(speaker_col)] for token in sent] for sent in doc 125 | ] 126 | else: 127 | speakers = [ 128 | [speaker_col for token in sent] for sent in sentences 129 | ] 130 | 131 | 132 | dic = dict( 133 | doc_key=doc_key, 134 | clusters=clusters, 135 | sentences=sentences, 136 | speakers=speakers, 137 | ) 138 | if paragraphs is not None: 139 | dic['paragraphs'] = paragraphs 140 | fh.write(json.dumps(dic) + "\n") 141 | 142 | 143 | 144 | def parse_args(): 145 | # definition 146 | parser = argparse.ArgumentParser(prog="conll2jsonlines", 147 | #description="convert conll file to jsonlines", 148 | description=__doc__, 149 | formatter_class=argparse.RawDescriptionHelpFormatter) 150 | # arguments (not options) 151 | parser.add_argument("input_fpath", default="", help="input file") 152 | parser.add_argument("output_fpath", default="", help="output file") 153 | # options 154 | parser.add_argument("--skip-singletons", dest="skip_singletons", 155 | default=False, action="store_true", help="skip singletons") 156 | parser.add_argument("--skip-empty-documents", dest="skip_empty_documents", 157 | default=False, action="store_true", help="skip empty documents") 158 | parser.add_argument("--no-coref", dest="add_coref", 159 | default=True, action="store_false", 160 | help="ignore coreference information") 161 | parser.add_argument("--tab", dest="sep_is_tab", 162 | default=False, action="store_true", 163 | help="separator is tab and no a bunch of spaces as in the original " 164 | "conll 2012 format") 165 | parser.add_argument("--token-col", dest="token_col", type=int, 166 | default=3, help="col index for tokens, def 3") 167 | parser.add_argument("--speaker-col", dest="speaker_col", default="9", 168 | help="col index for speakers, def 9. Use a char (ex. 
_) if you want " 169 | "the speaker col to be filled with that char, e.g. if there is no " 170 | "speaker column)") 171 | parser.add_argument("--ignore-double-indices", dest="ignore_double_indices", 172 | type=int, default=None, 173 | help="ignore lines containing a hyphen in the given column") 174 | parser.add_argument("--par-col", dest="par_col", type=int, 175 | default=0, help="paragraph column, def 0 (= no paragraph information)") 176 | # reading 177 | args = parser.parse_args() 178 | return args 179 | 180 | 181 | 182 | def main(): 183 | args = parse_args() 184 | conll2jsonlines( 185 | infpath=args.input_fpath, 186 | outfpath=args.output_fpath, 187 | skip_empty_documents=args.skip_empty_documents, 188 | skip_singletons=args.skip_singletons, 189 | add_coref=args.add_coref, 190 | token_col=args.token_col, 191 | speaker_col=args.speaker_col, 192 | sep="\t" if args.sep_is_tab else None, 193 | ignore_double_indices=args.ignore_double_indices, 194 | par_col=args.par_col, 195 | ) 196 | 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | 202 | -------------------------------------------------------------------------------- /conll2sacr.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Convert a CoNLL-2012 or CoNLL-U file into a SACR file, which you can 3 | open with the SACR program (http://boberle.com/projects/sacr). In this way, 4 | you can check and edit coreference annotation. To convert back, use the 5 | `sacr2conll.py` script. 6 | 7 | To convert from conll-2012 (space separated columns, word column is 3): 8 | 9 | python3 conll2sacr.py --output-dir DIR INPUT_FILE.conll 10 | 11 | This will convert every document in `INPUT_FILE.conll` into a document in `DIR` 12 | (the name of the file is based on the document name in the conll file). 13 | 14 | To convert from conll-u (tabulation separated columns, word column is 1): 15 | 16 | python3 conll2sacr.py --output-dir DIR \ 17 | --tab \ 18 | --token-col 1 \ 19 | INPUT_FILE.conll 20 | 21 | Use the `--ignore-double-indices` option if you want to ignore French amalgams 22 | (`du -> de le`) decomposed by some corpora and software (such as StanfordNLP).
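
For instance (hypothetical file names), a CoNLL-U file produced by StanfordNLP,
with amalgam indices such as `3-4` in the first column, could be converted with:

    python3 conll2sacr.py --output-dir DIR \
        --tab \
        --token-col 1 \
        --ignore-double-indices 0 \
        INPUT_FILE.conll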
23 | """ 24 | 25 | 26 | import re 27 | import argparse 28 | import os 29 | 30 | import conll_transform 31 | from standoff2inline import Standoff2Inline 32 | 33 | def convert(doc, doc_key, dpath, token_col): 34 | 35 | res = "" 36 | 37 | for sent in doc: 38 | inliner = Standoff2Inline(kind='sacr') 39 | mentions = conll_transform.compute_mentions([t[-1] for t in sent]) 40 | for (start, stop), chain in mentions: 41 | inliner.add((start, (f"C{chain}", dict())), stop-1) 42 | res += inliner.apply(tokens=[t[token_col] for t in sent]) 43 | res += "\n\n" 44 | 45 | if not isinstance(doc_key, str): 46 | doc_key = "_".join(str(x) for x in doc_key) 47 | fname = re.sub(r'[^-\w.]', r'_', doc_key) 48 | fpath = os.path.join(dpath, fname) 49 | open(fpath, 'w').write(res) 50 | 51 | 52 | def parse_args(): 53 | # definition 54 | parser = argparse.ArgumentParser(prog="conll2sacr", 55 | #description="convert conll to sacr", 56 | description=__doc__, 57 | formatter_class=argparse.RawDescriptionHelpFormatter) 58 | # arguments (not options) 59 | parser.add_argument("infpath", default="", help="input file") 60 | #parser.add_argument("outfpath", default="", help="output file") 61 | # options 62 | parser.add_argument("--output-dir", dest="outdpath", required=True, 63 | help="output directory") 64 | parser.add_argument("--token-col", dest="token_col", type=int, 65 | default=3, help="col index for tokens, def 3") 66 | parser.add_argument("--ignore-double-indices", dest="ignore_double_indices", 67 | type=int, default=None, 68 | help="ignore line containing a hyphen in the given column") 69 | parser.add_argument("--tab", dest="tab_sep", default=False, 70 | action="store_true", help="use tabulation as separator (conllu)") 71 | # reading 72 | args = parser.parse_args() 73 | return args 74 | 75 | 76 | 77 | def main(): 78 | 79 | args = parse_args() 80 | 81 | docs = conll_transform.read_file( 82 | args.infpath, 83 | sep="\t" if args.tab_sep else None, 84 | ignore_double_indices=args.ignore_double_indices, 85 | ) 86 | for doc_key, doc in docs.items(): 87 | print(f"Doing {doc_key}") 88 | convert(doc=doc, doc_key=doc_key, dpath=args.outdpath, 89 | token_col=args.token_col) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /docs/imgs/glozz_annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/glozz_annotation.png -------------------------------------------------------------------------------- /docs/imgs/notebook_join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_join.png -------------------------------------------------------------------------------- /docs/imgs/notebook_part_of_speech_of_first_mentions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_part_of_speech_of_first_mentions.png -------------------------------------------------------------------------------- /docs/imgs/notebook_pivot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_pivot.png -------------------------------------------------------------------------------- /docs/imgs/notebook_pivot_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_pivot_chart.png -------------------------------------------------------------------------------- /docs/imgs/notebook_sentence_lengths.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_sentence_lengths.png -------------------------------------------------------------------------------- /docs/imgs/notebook_singletons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_singletons.png -------------------------------------------------------------------------------- /docs/imgs/pict01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict01.png -------------------------------------------------------------------------------- /docs/imgs/pict02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict02.png -------------------------------------------------------------------------------- /docs/imgs/pict03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict03.png -------------------------------------------------------------------------------- /docs/imgs/pict04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict04.png -------------------------------------------------------------------------------- /docs/imgs/pict05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict05.png -------------------------------------------------------------------------------- /glozz2sacr.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings FATAL=>'all'; 4 | use open ':utf8'; 5 | use utf8; 6 | 7 | use Data::Dumper; 8 | #use XML::LibXML::Simple qw(XMLin); 9 | use XML::Simple qw(XMLin); 10 | 11 | 12 | ######################################################################## 13 | # Global variables 14 | ######################################################################## 15 | 16 | my $INPUT_CORPUS = ''; 17 | my $INPUT_ANNOTATIONS = ''; 18 | my $OUTPUT_FILE = ''; 19 | my $REFNAME_FIELD = ''; # REF, refname, etc. 20 | my $UNIT_TYPE = ''; # maillon, MENTION, etc. 
21 | my $RESET_REFNAME_FIELD = ''; 22 | 23 | $Data::Dumper::Terse = 1; 24 | $Data::Dumper::Indent = 1; 25 | 26 | 27 | ######################################################################## 28 | # Get the CLI parameters 29 | ######################################################################## 30 | 31 | my $HELP =<<"END"; 32 | USAGE 33 | $0 [OPTIONS] INPUT_AND_OUTPUT_FILES 34 | 35 | EXAMPLE: 36 | $0 test.aa output 37 | 38 | DESCRIPTION 39 | Convert a couple of Glozz files to a SACR file. You can give the FILES 40 | in any order: the .ac and .aa files will be determined by their 41 | extensions. You can give only one of the Glozz file: the other will be 42 | found (if in the same directory). If no output file is specified, print 43 | on STDOUT. 44 | 45 | OPTIONS (-o value --opt value) 46 | -h Print help. 47 | --ref-field Name of the field where the referent is store (REF, refname, 48 | etc.). Default is REF. 49 | --unit-type Type of the unit (maillon, MENTION, etc.). Default is MENTION. 50 | --reset Get a new name for referent (useful if the name used in the 51 | glozz file contains non standard characters). 52 | END 53 | 54 | sub get_cl_parameters { 55 | 56 | # default 57 | $INPUT_CORPUS = ''; 58 | $INPUT_ANNOTATIONS = ''; 59 | $OUTPUT_FILE = ''; 60 | $REFNAME_FIELD = 'REF'; 61 | $UNIT_TYPE = 'MENTION'; 62 | $RESET_REFNAME_FIELD = ''; 63 | 64 | my $pending = ''; 65 | 66 | for (@ARGV) { 67 | print $HELP and exit if m/^(?:-h|--?help)$/; 68 | last if $pending and m/^-/; 69 | # pending 70 | if ($pending eq '--ref-name') { 71 | $REFNAME_FIELD = $_; 72 | $pending = ''; 73 | } elsif ($pending eq '--unit-type') { 74 | $UNIT_TYPE = $_; 75 | $pending = ''; 76 | # end of pending 77 | } elsif ($pending) { 78 | last; 79 | # options with value waiting for next element in @ARGV 80 | } elsif (m/^(?:--ref-name)$/) { 81 | $pending = '--ref-name'; 82 | } elsif (m/^--unit-type$/) { 83 | $pending = '--unit-type'; 84 | # switch (= options with no value) 85 | } elsif (m/^--reset$/) { 86 | $RESET_REFNAME_FIELD = 1; 87 | # end of options 88 | } elsif (m/^-.+/) { 89 | die "$0: *** option '$_' doesn't exists ***\n"; 90 | # arguments, not options 91 | } elsif (!$INPUT_CORPUS and m/^(.+?\.ac)$/ and -f $1) { 92 | $INPUT_CORPUS = $1; 93 | (my $tmp = $INPUT_CORPUS) =~ s/\.ac$/.aa/; 94 | if (!$INPUT_ANNOTATIONS and -f $tmp) { 95 | $INPUT_ANNOTATIONS = $tmp; 96 | } 97 | } elsif (!$INPUT_ANNOTATIONS and m/^(.+?\.aa)$/ and -f $1) { 98 | $INPUT_ANNOTATIONS = $1; 99 | (my $tmp = $INPUT_ANNOTATIONS) =~ s/\.aa$/.ac/; 100 | if (!$INPUT_CORPUS and -f $tmp) { 101 | $INPUT_CORPUS = $tmp; 102 | } 103 | } elsif (!$OUTPUT_FILE and m/^([^-].*+)$/) { 104 | $OUTPUT_FILE = $1; 105 | } else { 106 | die "$0: *** bad option '$_' ***\n"; 107 | } 108 | } # for 109 | 110 | die "$0: *** no glozz file specified ***\n" 111 | unless $INPUT_ANNOTATIONS and $INPUT_CORPUS; 112 | 113 | } 114 | 115 | 116 | 117 | ######################################################################## 118 | # parser 119 | # 120 | # ex: 121 | # 'characterisation' => { 122 | # 'featureSet' => { 123 | # 'feature' => { 124 | # 'gender' => { 125 | # 'content' => 'fem' 126 | # }, 127 | # 'gramcat' => { 128 | # 'content' => 'definite' 129 | # }, 130 | # $REFNAME_FIELD => {}, 131 | # 'number' => { 132 | # 'content' => 'sg' 133 | # } 134 | # } 135 | # }, 136 | # 137 | # ex: 138 | # 'characterisation' => { 139 | # 'featureSet' => { 140 | # 'feature' => { 141 | # 'name' => $REFNAME_FIELD 142 | # } 143 | # }, 144 | # 'type' => $UNIT_TYPE 145 | # }, 146 | 
######################################################################## 147 | 148 | sub parse { 149 | 150 | my $xml = shift; 151 | my $corpus = shift; 152 | 153 | my $data = XMLin $xml, ForceArray=>'', KeyAttr => { feature=>'name' }; 154 | #print Dumper $data; die; 155 | 156 | my @annotations = (); 157 | my @paragraphs = (); 158 | 159 | for my $r_unit_hash (@{$data->{unit}}) { 160 | if ($r_unit_hash->{characterisation}->{type} eq 'paragraph') { 161 | push @paragraphs, { 162 | start => $r_unit_hash->{positioning}->{start}->{singlePosition}->{index}, 163 | end => $r_unit_hash->{positioning}->{end}->{singlePosition}->{index} }; 164 | } elsif ($r_unit_hash->{characterisation}->{type} eq $UNIT_TYPE) { 165 | push @annotations, { 166 | start => $r_unit_hash->{positioning}->{start}->{singlePosition}->{index}, 167 | end => $r_unit_hash->{positioning}->{end}->{singlePosition}->{index}, 168 | props => $r_unit_hash->{characterisation}->{featureSet}->{feature} }; 169 | } else { 170 | die "$0: *** don't know unit type '$r_unit_hash->{characterisation}->{type}' ***\n"; 171 | } 172 | } 173 | 174 | my %chain_names = (); 175 | my $name_counter = 0; 176 | 177 | # props is of the form: { $REFNAME_FIELD=>{content=>'value'}, prop=>{content=>'value'}} 178 | for my $r_annotation (@annotations) { 179 | my %props = (); 180 | #print Dumper $r_annotation; 181 | #print substr($corpus, $r_annotation->{start}, $r_annotation->{end}-$r_annotation->{start}), "\n"; 182 | for my $key (keys %{$r_annotation->{props}}) { 183 | # only one feature, which has no content: 184 | # 'characterisation' => { 185 | # 'featureSet' => { 186 | # 'feature' => { 187 | # 'name' => $REFNAME_FIELD 188 | # } 189 | # }, 190 | # 'type' => $UNIT_TYPE 191 | # }, 192 | if ($key eq 'name' and not ref $r_annotation->{props}->{$key}) { 193 | unless ($r_annotation->{props}->{$key} eq $REFNAME_FIELD) { 194 | $props{$r_annotation->{props}->{$key}} = ''; 195 | } 196 | # otherwise 197 | # 'characterisation' => { 198 | # 'featureSet' => { 199 | # 'feature' => { 200 | # 'gender' => { 201 | # 'content' => 'fem' 202 | # }, 203 | # 'gramcat' => { 204 | # 'content' => 'definite' 205 | # }, 206 | # $REFNAME_FIELD => {}, 207 | # 'number' => { 208 | # 'content' => 'sg' 209 | # } 210 | # } 211 | # }, 212 | # 'type' => $UNIT_TYPE 213 | # }, 214 | } else { 215 | if ($key eq $REFNAME_FIELD) { 216 | $r_annotation->{$REFNAME_FIELD} = $r_annotation->{props}->{$key}->{content}; 217 | } else { 218 | $props{$key} = $r_annotation->{props}->{$key}->{content}; 219 | } 220 | } 221 | } 222 | $r_annotation->{props} = \%props; 223 | $r_annotation->{$REFNAME_FIELD} = 'TODO' unless $r_annotation->{$REFNAME_FIELD}; 224 | if ($RESET_REFNAME_FIELD) { 225 | if ($r_annotation->{$REFNAME_FIELD} eq 'SI') { 226 | $r_annotation->{$REFNAME_FIELD} = "L".$name_counter; 227 | } else { 228 | if (not exists $chain_names{$r_annotation->{$REFNAME_FIELD}}) { 229 | $chain_names{$r_annotation->{$REFNAME_FIELD}} = "C".$name_counter; 230 | $name_counter++; 231 | } 232 | $r_annotation->{$REFNAME_FIELD} = $chain_names{$r_annotation->{$REFNAME_FIELD}}; 233 | } 234 | } 235 | } 236 | 237 | #print Dumper \@annotations; 238 | #print Dumper \@paragraphs; 239 | 240 | # NOTE: this is very important, otherwise nested annotations go 241 | # wrong! 
242 | @annotations = sort{$a->{start}<=>$b->{start} 243 | or $b->{end}-$b->{start} <=> $a->{end}-$a->{start}} @annotations; 244 | 245 | # test that there are no overlapping annotations 246 | for my $i (@annotations) { 247 | for my $j (@annotations) { 248 | next if $i == $j; 249 | if ($i->{start} < $j->{start} 250 | and $j->{start} < $i->{end} 251 | and $i->{end} < $j->{end}) { 252 | $i->{end} = $j->{end}; 253 | print sprintf "Correcting overlapping annotations: '%s' (%d,%d) and '%s' (%d,%d)\n", 254 | substr($corpus, $i->{start}, $i->{end}-$i->{start}), 255 | $i->{start}, $i->{end}, 256 | substr($corpus, $j->{start}, $j->{end}-$j->{start}), 257 | $j->{start}, $j->{end}; 258 | } 259 | } 260 | } 261 | for my $i (@annotations) { 262 | for my $j (@annotations) { 263 | next if $i == $j; 264 | if ($i->{start} < $j->{start} 265 | and $j->{start} < $i->{end} 266 | and $i->{end} < $j->{end}) { 267 | die sprintf "$0: overlapping annotations: '%s' (%d,%d) and '%s' (%d,%d) ***\n", 268 | substr($corpus, $i->{start}, $i->{end}-$i->{start}), 269 | $i->{start}, $i->{end}, 270 | substr($corpus, $j->{start}, $j->{end}-$j->{start}), 271 | $j->{start}, $j->{end}; 272 | } 273 | } 274 | } 275 | 276 | my $result = ''; 277 | 278 | my @pending_annotations = (); 279 | for my $r_par (sort{$a->{start} <=> $b->{start}} @paragraphs) { 280 | my $par_text = substr($corpus, $r_par->{start}, $r_par->{end}-$r_par->{start}); 281 | my $len = length $par_text; 282 | for (my $i= 0; $i<$len; $i++) { 283 | while (@pending_annotations 284 | and $pending_annotations[0]->{end}-$r_par->{start} == $i and $i > 0) { 285 | $result .= '}'; 286 | shift @pending_annotations; 287 | } 288 | while (@annotations 289 | and $annotations[0]->{start}-$r_par->{start} == $i) { 290 | #DEBUG: my $props_string = ''; 291 | if (exists $annotations[0]->{props}->{headpos} 292 | and exists $annotations[0]->{props}->{headstring}) { 293 | $annotations[0]->{props}->{head} = "$annotations[0]->{props}->{headpos}: $annotations[0]->{props}->{headstring}"; 294 | delete $annotations[0]->{props}->{headpos}; 295 | delete $annotations[0]->{props}->{headstring}; 296 | } 297 | my @props_strings = (); 298 | for my $key (sort keys %{$annotations[0]->{props}}) { 299 | my $val = $annotations[0]->{props}->{$key}; 300 | if (not defined $val) { 301 | $val = ""; 302 | } 303 | push @props_strings, "$key=\"$val\""; 304 | } 305 | #my $props_string = join(',', map{"$_=\"$annotations[0]->{props}->{$_}\""} sort keys %{$annotations[0]->{props}}); 306 | my $props_string = join(',', @props_strings); 307 | $props_string = ":$props_string" if $props_string; 308 | #print Dumper $r_annotation; 309 | $result .= "{$annotations[0]->{$REFNAME_FIELD}$props_string "; 310 | unshift @pending_annotations, shift @annotations; 311 | } 312 | $result .= substr($par_text, $i, 1); 313 | } 314 | # closing at the end of the paragraph 315 | while (@pending_annotations) { 316 | $result .= '}'; 317 | shift @pending_annotations; 318 | } 319 | $result .= "\n\n"; 320 | } 321 | 322 | if (@annotations) { 323 | print Dumper \@annotations; 324 | die "$0: *** some annotations left ***\n"; 325 | } 326 | 327 | return $result; 328 | 329 | 330 | } 331 | 332 | 333 | ######################################################################## 334 | # check comment line 335 | ######################################################################## 336 | 337 | sub check_comment_line { 338 | 339 | my @lines = split /\n/, shift; 340 | 341 | for (@lines) { 342 | s/^#\s*(title|source|NOTE)\s*:/#$1:/; 343 | if 
(m/^#\s*(COLOR|TOKENIZATION-TYPE|textid|part-heading)\s*:/) { 344 | $_ =~ s/ //g; 345 | } 346 | } 347 | 348 | return join("\n", @lines); 349 | 350 | } 351 | 352 | 353 | ######################################################################## 354 | # main() 355 | ######################################################################## 356 | 357 | sub confirm_yn { 358 | 359 | my $message = shift || 'Confirm ? (y|n) '; 360 | my $default = shift; 361 | 362 | ITER: { 363 | print $message; 364 | my $ans = ; 365 | print "\n" unless -t STDIN; 366 | return 1 if $ans =~ m/^\s*+y(?:es)?\s*+$/; 367 | return 0 if $ans =~ m/^\s*+n(?:o)?\s*+$/; 368 | return $default if (defined($default) and $ans =~ m/^\s*+$/); 369 | redo ITER; 370 | } 371 | 372 | } 373 | 374 | sub read_file { 375 | my $file = shift; 376 | open my $fh, $file or die "$0: *** can't open $file ***\n"; 377 | local $/ = undef; 378 | my $content = <$fh>; 379 | close $fh or die "$0: *** can't close $file ***\n"; 380 | return $content; 381 | } 382 | 383 | 384 | sub write_file { 385 | my $file = shift; 386 | my $content = shift; 387 | open my $fh, ">", $file or die "$0: *** can't open $file ***\n"; 388 | print $fh $content; 389 | close $fh or die "$0: *** can't close $file ***\n"; 390 | } 391 | 392 | 393 | sub main { 394 | 395 | get_cl_parameters(); 396 | 397 | if (-e $OUTPUT_FILE) { 398 | return '' unless (confirm_yn("File $OUTPUT_FILE exists. Overwrite [Y/n]?", 1)); 399 | } 400 | 401 | my $sacr = parse( 402 | read_file($INPUT_ANNOTATIONS), 403 | read_file($INPUT_CORPUS) ); 404 | 405 | $sacr = check_comment_line($sacr); 406 | 407 | if ($OUTPUT_FILE) { 408 | write_file($OUTPUT_FILE, $sacr); 409 | } else { 410 | print $sacr; 411 | } 412 | 413 | return $OUTPUT_FILE; 414 | } 415 | 416 | 417 | main() 418 | and print "$0: done!\n"; 419 | 420 | 421 | -------------------------------------------------------------------------------- /jsonlines2conll.py: -------------------------------------------------------------------------------- 1 | """Script to convert a jsonlines file to a CoNLL file. 2 | 3 | Use the `-h` and `--help` switches to get detailed help on the options. 4 | 5 | Example command (output uses spaces): 6 | 7 | python3 jsonlines2conll.py -g testing/singe.jsonlines -o ouput.conll 8 | 9 | #begin document (ge/articleswiki_singe.xml); part 000 10 | Singe (0) 11 | 12 | Les (0 13 | singes 0) 14 | sont - 15 | des (0 16 | mammifères - 17 | de - 18 | l' (1 19 | ordre - 20 | des - 21 | de - 22 | les (2 23 | primates 1)|2) 24 | ... 25 | #end document 26 | 27 | 28 | Example command (merging coreference information with an existing conll 29 | file, for example to add predicted coreference): 30 | 31 | python3 jsonlines2conll.py -g testing/singe.jsonlines -o ouput.conll \ 32 | -c testing/singe.conll 33 | 34 | #begin document (ge/articleswiki_singe.xml); part 000 35 | 1 Singe Singe NOUN ... 36 | 37 | 1 Les le DET ... 38 | 2 singes singe NOUN ... 39 | 3 sont être AUX ... 40 | 4 des un DET ... 41 | 5 mammifères mammifère NOUN ... 42 | 6 de de ADP ... 43 | 7 l' le DET ... 44 | 8 ordre ordre NOUN ... 45 | 9-10 des _ _ ... 46 | 9 de de ADP ... 47 | 10 les le DET ... 48 | 11 primates primate NOUN ... 49 | ... 
50 | #end document 51 | 52 | 53 | Example command (merging + output uses tabulation): 54 | 55 | python3 jsonlines2conll.py -g testing/singe.jsonlines -o ouput.conll -c testing/singe.conll -T 56 | """ 57 | 58 | import argparse 59 | import json 60 | 61 | import conll_transform 62 | 63 | 64 | def jsonlines2conll(*fpaths, cols=None, predicted_clusters=True, 65 | merge_with=None, outfpath=None, tabsep=False): 66 | 67 | if cols is None: 68 | cols = ['sentences'] 69 | 70 | docs = dict() 71 | 72 | for line in (l for fpath in fpaths for l in open(fpath)): 73 | 74 | data = json.loads(line) 75 | doc_key = data["doc_key"] 76 | 77 | sents = [ 78 | # token is just right: a tuple of col 79 | [list(token) for token in zip(*sent)] 80 | # sent is: [ sent1_tokens, sent2_speakers,... ] 81 | for sent in zip(*[iter(data[col]) for col in cols]) 82 | ] 83 | 84 | chains = data['predicted_clusters' 85 | if predicted_clusters else 'clusters'] 86 | 87 | mentions = [ m for chain in chains for m in chain ] 88 | conll_transform.textpos2sentpos(mentions, sents) 89 | 90 | conll_transform.write_chains(sents, chains, append=True) 91 | 92 | docs[doc_key] = sents 93 | 94 | if merge_with: 95 | conll_transform.replace_coref_col(docs, merge_with) 96 | docs = merge_with 97 | 98 | if outfpath: 99 | conll_transform.write_file(outfpath, docs, sep="\t" if tabsep else None) 100 | 101 | return docs 102 | 103 | 104 | def parse_args(): 105 | # definition 106 | parser = argparse.ArgumentParser(prog="jsonlines2conll", 107 | description="convert jsonlines to conll", 108 | #description=__doc__, 109 | formatter_class=argparse.RawDescriptionHelpFormatter) 110 | # arguments (not options) 111 | parser.add_argument("infpaths", nargs="+", help="input files") 112 | # options 113 | parser.add_argument("-g", "--gold", dest="gold_clusters", 114 | default=False, action="store_true", 115 | help="use gold clusters instead of predicted clusters") 116 | parser.add_argument("-t", "--in-tab-sep", dest="intabsep", 117 | default=False, action="store_true", 118 | help="input conll files use tab as separator") 119 | parser.add_argument("-T", "--out-tab-sep", dest="outtabsep", 120 | default=False, action="store_true", 121 | help="output conll files use tab as separator") 122 | parser.add_argument("-o", dest="outfpath", required=True, 123 | help="output file") 124 | parser.add_argument("-c", "--conll", dest="conll_files", action="append", 125 | default=[], 126 | help="conll files to merge with, may be repeated") 127 | parser.add_argument("--cols", dest="cols", default='sentences', 128 | help="comma separated list of cols to include, in order " 129 | "(default: 'sentences')") 130 | # reading 131 | args = parser.parse_args() 132 | return args 133 | 134 | 135 | def main(): 136 | 137 | args = parse_args() 138 | 139 | if args.conll_files: 140 | merge_with = conll_transform.read_files(*args.conll_files, 141 | sep="\t" if args.intabsep else None) 142 | else: 143 | merge_with = None 144 | 145 | jsonlines2conll( 146 | *args.infpaths, 147 | outfpath=args.outfpath, 148 | predicted_clusters=not args.gold_clusters, 149 | merge_with=merge_with, 150 | cols=args.cols.split(','), 151 | tabsep=args.outtabsep, 152 | ) 153 | 154 | 155 | if __name__ == '__main__': 156 | main() 157 | 158 | -------------------------------------------------------------------------------- /jsonlines2text.py: -------------------------------------------------------------------------------- 1 | """Script to convert from a jsonlines file to a text representation of 2 | coreference annotation. 
The output is html. Mentions are surrounded by 3 | brackets. Coreference chains are represented by colors (each chain has 4 | a specific color) and, if requested by a switch, an index (1, 2, 3...). 5 | Singletons may be hidden or shown in a specific color (gray by default), 6 | without any index. 7 | 8 | If your jsonlines file contains several documents, you may show the 9 | document name by using the `--heading` option. 10 | 11 | Here is a minimal example: 12 | 13 | python3 jsonlines2text.py testing/docs.jsonlines -o output.html 14 | 15 | Use the `-h` and `--help` switches to get a detailed list of options. 16 | """ 17 | 18 | 19 | import argparse 20 | import json 21 | 22 | from standoff2inline import Highlighter, highlight 23 | from color_manager import ColorManager, CommonColorManager 24 | 25 | 26 | 27 | def sort_mentions(clusters): 28 | res = [] 29 | for cluster in clusters: 30 | cluster = sorted(cluster, key=lambda x: x[1], reverse=True) 31 | cluster = sorted(cluster, key=lambda x: x[0]) 32 | res.append(cluster) 33 | return res 34 | 35 | 36 | 37 | def sort_clusters(clusters): 38 | clusters = sorted(clusters, key=lambda x: x[0][1], reverse=True) 39 | clusters = sorted(clusters, key=lambda x: x[0][0]) 40 | return clusters 41 | 42 | 43 | 44 | 45 | def highlight_clusters(tokens, clusters, paragraphs, *, singleton_color, 46 | color_manager, add_indices): 47 | 48 | clusters = sort_mentions(clusters) 49 | clusters = sort_clusters(clusters) 50 | 51 | if color_manager == "complete": 52 | cm = ColorManager(hue_step=25, saturation_step=25, lightness_step=10) 53 | elif color_manager == "common": 54 | cm = CommonColorManager() 55 | else: 56 | cm = None 57 | 58 | hls = [] 59 | 60 | if paragraphs: 61 | hl = Highlighter( 62 | prefix="
<p>", 63 | suffix="</p>
" 64 | ) 65 | for start, end in paragraphs: 66 | hl.add_mark(start, end) 67 | hls.append(hl) 68 | 69 | counter = 1 70 | 71 | for i, cluster in enumerate(clusters, start=1): 72 | hl = None 73 | if len(cluster) == 1: 74 | if singleton_color == "": 75 | pass 76 | else: 77 | color = (cm.gray if cm else 'gray') \ 78 | if singleton_color is None else singleton_color 79 | start_span = f'' 80 | end_span = "" 81 | hl = Highlighter( 82 | prefix=f'{start_span}[{end_span}', 83 | suffix=f'{start_span}]{end_span}') 84 | else: 85 | color = cm.get_next_color() if cm else "black" 86 | start_span = f'' 87 | end_span = "" 88 | index = f"{counter}{end_span}" if add_indices else "" 89 | hl = Highlighter( 90 | prefix=f"{start_span}[{end_span}", 91 | suffix=f"{start_span}]{index}" 92 | ) 93 | counter += 1 94 | if hl is not None: # None if only singletons, and they must not be 95 | # marked, or empty document 96 | for start, end in cluster: 97 | hl.add_mark(start, end) 98 | hls.append(hl) 99 | 100 | res = highlight(tokens, *hls) 101 | 102 | return res 103 | 104 | 105 | 106 | 107 | 108 | def filter_tokens(tokens, clusters, n): 109 | tokens = tokens[:n] 110 | new_clusters = [] 111 | for cluster in clusters: 112 | new_cluster = [] 113 | for mention in cluster: 114 | if mention[0] < n and mention[1] < n: 115 | new_cluster.append(mention) 116 | if new_cluster: 117 | new_clusters.append(new_cluster) 118 | return tokens, new_clusters 119 | 120 | 121 | 122 | def convert(doc, gold, n, **kwargs): 123 | tokens = [t for sent in doc['sentences'] for t in sent] 124 | if gold: 125 | clusters = doc.get('clusters', list()) 126 | else: 127 | clusters = doc.get('predicted_clusters', doc.get('clusters', list())) 128 | if n: 129 | tokens, clusters = filter_tokens(tokens, clusters, n) 130 | paragraphs = doc.get('paragraphs') 131 | res = highlight_clusters(tokens, clusters, paragraphs, **kwargs) 132 | return res 133 | 134 | 135 | 136 | def parse_args(): 137 | # definition 138 | parser = argparse.ArgumentParser(prog="jsonlines2text", 139 | description=__doc__, 140 | formatter_class=argparse.RawDescriptionHelpFormatter) 141 | # arguments (not options) 142 | #parser.add_argument("infpaths", nargs="+", help="input files") 143 | parser.add_argument("infpath", default="", help="input file") 144 | #parser.add_argument("outfpath", default="", help="output file") 145 | # options 146 | parser.add_argument("-o", dest="outfpath", help="output file") 147 | parser.add_argument("--cm", "--color-manager", dest="color_manager", 148 | default="complete", 149 | help="color manager: \"\", \"complete\" (the default), \"common\"") 150 | parser.add_argument("--sing-color", dest="singleton_color", 151 | help="singleton color: COLOR (default is 'gray') or \"\" to hide " 152 | "singleton markers", default=None), 153 | parser.add_argument("-i", "--add-indices", dest="add_indices", 154 | default=False, action="store_true", 155 | help="add indices to each chain and mention") 156 | parser.add_argument("-g", "--gold", dest="gold", default=False, 157 | action="store_true", 158 | help="use the 'clusters' key even if a 'predicted_clusters' key is " 159 | "present") 160 | parser.add_argument("-n", dest="n", default=0, type=int, 161 | help="number of tokens to consider from the beginning of the text") 162 | parser.add_argument("--heading", dest="heading", default="
<h1>%s</h1>", 163 | help="template for text name, default is '<h1>%s</h1>
'. Leave " 164 | "blank to ignore doc name") 165 | # reading 166 | args = parser.parse_args() 167 | return args 168 | 169 | 170 | 171 | def main(): 172 | args = parse_args() 173 | res = "" 174 | for line in open(args.infpath): 175 | doc = json.loads(line) 176 | if args.heading: 177 | if "%s" in args.heading: 178 | res += args.heading % doc['doc_key'] 179 | else: 180 | res += args.heading 181 | res += convert(doc, n=args.n, gold=args.gold, 182 | singleton_color=args.singleton_color, 183 | color_manager=args.color_manager, add_indices=args.add_indices 184 | ) 185 | if args.outfpath: 186 | open(args.outfpath, 'w').write(res) 187 | else: 188 | print(res) 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | 195 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | exclude=^(color_manager|conll2jsonlines|conll2sacr|conll_transform|jsonlines2conll|jsonlines2text|sacr2conll|sacr_parser|text2jsonlines|standoff2inline)\.py$ 3 | strict=true 4 | disable_error_code=override 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile="black" 3 | skip_gitignore=true 4 | skip=["color_manager.py","conll2jsonlines.py","conll2sacr.py","conll_transform.py","jsonlines2conll.py","jsonlines2text.py","sacr2conll.py","sacr_parser.py","text2jsonlines.py","standoff2inline.py"] 5 | 6 | [tool.black] 7 | extend-exclude='(color_manager|conll2jsonlines|conll2sacr|conll_transform|jsonlines2conll|jsonlines2text|sacr2conll|sacr_parser|text2jsonlines|standoff2inline).py' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black==23.3.0 2 | flake8==6.0.0 3 | isort==5.12.0 4 | pytest==7.2.2 5 | mypy==1.1.1 6 | coverage==7.2.3 7 | pandas==2.0.0 8 | -------------------------------------------------------------------------------- /sacr2ann.py: -------------------------------------------------------------------------------- 1 | """Convert a sacr file to an ann/txt files (BRAT standoff annotations). 2 | 3 | The script will produce two files, one for the text and one for the annotations. 4 | 5 | Annotations are of the form: 6 | 7 | T1 Person 0 9 A Peasant 8 | T2 Animal 16 43 an Eagle captured in a trap 9 | T3 Object 37 43 a trap 10 | T4 Animal 62 70 the bird 11 | R1 Coreference Arg1:T2 Arg2:T4 12 | T5 Person 76 79 him 13 | R2 Coreference Arg1:T1 Arg2:T5 14 | 15 | Note that only a subset of the BRAT format is implemented for now, namely 16 | the text-bound annotations and the relations. 17 | 18 | Please consult the README file for more information. 
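For example, assuming an input file named `input.sacr` (the file names here
are only placeholders), a typical invocation is:

    python3 sacr2ann.py input.sacr --txt input.txt --ann input.ann

When `--txt` or `--ann` is omitted, the corresponding output path defaults to
the input file name with `.txt` or `.ann` appended.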
19 | """ 20 | 21 | from __future__ import annotations 22 | 23 | import argparse 24 | from argparse import Namespace 25 | from collections import defaultdict 26 | from dataclasses import dataclass 27 | from pathlib import Path 28 | 29 | from sacr_parser2 import ( 30 | MentionEnd, 31 | MentionStart, 32 | ParagraphEnd, 33 | SacrParser, 34 | Spaces, 35 | Word, 36 | ) 37 | 38 | DEFAULT_MENTION_TYPE = "Mention" 39 | DEFAULT_RELATION_TYPE = "Coreference" 40 | 41 | 42 | @dataclass 43 | class Annotation: 44 | index: int 45 | kind: str 46 | 47 | def __eq__(self, other: Annotation) -> bool: 48 | return self.index == other.index and self.kind == other.kind 49 | 50 | 51 | @dataclass 52 | class TextAnnotation(Annotation): 53 | start: int 54 | end: int 55 | 56 | def __eq__(self, other: TextAnnotation) -> bool: 57 | return ( 58 | super().__eq__(other) 59 | and self.start == other.start 60 | and self.end == other.end 61 | ) 62 | 63 | 64 | @dataclass 65 | class RelationAnnotation(Annotation): 66 | source: Annotation 67 | target: Annotation 68 | 69 | def __eq__(self, other: RelationAnnotation) -> bool: 70 | return ( 71 | super().__eq__(other) 72 | and self.source == other.source 73 | and self.target == other.target 74 | ) 75 | 76 | 77 | class Sacr2AnnConverter: 78 | def __init__(self, type_property_name: str | None = None): 79 | self.type_property_name = type_property_name 80 | self._text: str | None = None 81 | self._annotations: list[Annotation] | None = None 82 | 83 | def convert(self, source: str | Path) -> None: 84 | parser = SacrParser(source=source) 85 | 86 | text: str = "" 87 | annotations: list[Annotation] = [] 88 | text_annotation_count: int = 0 89 | relation_annotation_count: int = 0 90 | 91 | chains: dict[int, list[TextAnnotation]] = defaultdict(list) 92 | filo: list[TextAnnotation] = [] 93 | 94 | for token in parser.parse(): 95 | start_position = len(text) 96 | 97 | if isinstance(token, (Word, Spaces)): 98 | text += token.value 99 | elif isinstance(token, ParagraphEnd): 100 | text += "\n\n" 101 | 102 | elif isinstance(token, MentionStart): 103 | text_annotation_count += 1 104 | if self.type_property_name: 105 | kind = token.features.get( 106 | self.type_property_name, DEFAULT_MENTION_TYPE 107 | ) 108 | else: 109 | kind = DEFAULT_MENTION_TYPE 110 | text_annotation = TextAnnotation( 111 | index=text_annotation_count, 112 | kind=kind, 113 | start=start_position, 114 | end=0, 115 | ) 116 | filo.append(text_annotation) 117 | annotations.append(text_annotation) 118 | 119 | if token.chain_index in chains: 120 | relation_annotation_count += 1 121 | relation_annotation = RelationAnnotation( 122 | index=relation_annotation_count, 123 | kind=DEFAULT_RELATION_TYPE, 124 | source=chains[token.chain_index][-1], 125 | target=text_annotation, 126 | ) 127 | annotations.append(relation_annotation) 128 | 129 | chains[token.chain_index].append(text_annotation) 130 | 131 | elif isinstance(token, MentionEnd): 132 | text_annotation = filo.pop() 133 | text_annotation.end = len(text) 134 | 135 | self._text = text 136 | self._annotations = annotations 137 | 138 | @property 139 | def text(self) -> str: 140 | if self._text is None: 141 | raise RuntimeError("You need to parse before reading the text property") 142 | return self._text 143 | 144 | @property 145 | def annotations(self) -> list[Annotation]: 146 | if self._annotations is None: 147 | raise RuntimeError( 148 | "You need to parse before reading the annotations property" 149 | ) 150 | return self._annotations 151 | 152 | def write_text_to_file(self, file: Path) -> 
None: 153 | file.write_text(self.text) 154 | 155 | @staticmethod 156 | def _convert_annotations_as_string(text: str, annotations: list[Annotation]) -> str: 157 | string = "" 158 | for annotation in annotations: 159 | if isinstance(annotation, TextAnnotation): 160 | span = text[annotation.start : annotation.end] 161 | string += f"T{annotation.index}\t{annotation.kind} {annotation.start} {annotation.end}\t{span}\n" 162 | elif isinstance(annotation, RelationAnnotation): 163 | string += f"R{annotation.index}\t{annotation.kind} Arg1:T{annotation.source.index} Arg2:T{annotation.target.index}\n" 164 | else: 165 | raise RuntimeError( 166 | "unknown annotation type: " + annotation.__class__.__name__ 167 | ) 168 | return string 169 | 170 | @property 171 | def annotations_as_string(self) -> str: 172 | return self._convert_annotations_as_string(self.text, self.annotations) 173 | 174 | def write_annotations_to_file(self, file: Path) -> None: 175 | file.write_text(self.annotations_as_string) 176 | 177 | 178 | def convert( 179 | input_file: Path, txt_output: Path, ann_output: Path, type_property_name: str 180 | ) -> None: 181 | converter = Sacr2AnnConverter(type_property_name=type_property_name) 182 | converter.convert(input_file) 183 | converter.write_text_to_file(txt_output) 184 | converter.write_annotations_to_file(ann_output) 185 | 186 | 187 | def parse_args() -> Namespace: 188 | parser = argparse.ArgumentParser( 189 | prog="sacr2ann", 190 | description=__doc__, 191 | formatter_class=argparse.RawDescriptionHelpFormatter, 192 | ) 193 | parser.add_argument("input", help="input file") 194 | parser.add_argument( 195 | "--txt", 196 | dest="txt_output", 197 | default=None, 198 | help="output file, default is input file name + .txt", 199 | ) 200 | parser.add_argument( 201 | "--ann", 202 | dest="ann_output", 203 | default=None, 204 | help="output file, default is input file name + .ann", 205 | ) 206 | parser.add_argument( 207 | "--type-property-name", 208 | default=None, 209 | help=f"name of the property where to find the type of text annotation. If not given, '{DEFAULT_MENTION_TYPE}' is used as the type", 210 | ) 211 | args = parser.parse_args() 212 | return args 213 | 214 | 215 | def main() -> None: 216 | args = parse_args() 217 | convert( 218 | input_file=Path(args.input), 219 | txt_output=Path(args.txt_output or (args.input + ".txt")), 220 | ann_output=Path(args.ann_output or (args.input + ".ann")), 221 | type_property_name=args.type_property_name, 222 | ) 223 | 224 | 225 | if __name__ == "__main__": 226 | main() 227 | -------------------------------------------------------------------------------- /sacr2annotable.py: -------------------------------------------------------------------------------- 1 | """Convert a corpus of SACR texts into a Corpus (from annotable.py) that can be 2 | used to output dataframes. 3 | 4 | It is a class which should be used as follows: 5 | 6 | files = [ 7 | Path("file1.sacr"), 8 | Path("file2.sacr"), 9 | Path("file3.sacr"), 10 | # ... 
11 | ] 12 | 13 | converter = Sacr2AnnotableConverter() 14 | for file in files: 15 | converter.convert_text(file) 16 | corpus = converter.corpus 17 | 18 | dataframes = corpus.get_dataframes() 19 | """ 20 | 21 | from __future__ import annotations 22 | 23 | import re 24 | from pathlib import Path 25 | 26 | from annotable import Corpus, Mention, Paragraph, Sentence, Text, Token 27 | from sacr_parser2 import ( 28 | Comment, 29 | MentionEnd, 30 | MentionStart, 31 | ParagraphEnd, 32 | SacrParser, 33 | SentenceChange, 34 | Spaces, 35 | TextID, 36 | Word, 37 | ) 38 | 39 | TEXT_METADATA_PATTERN = re.compile(r"textmetadata\s*:\s*(\w+)\s*=\s*(.*)") 40 | 41 | 42 | class Sacr2AnnotableConverter: 43 | def __init__(self) -> None: 44 | self.corpus: Corpus = Corpus() 45 | 46 | def convert_text(self, source: str | Path) -> None: 47 | parser = SacrParser(source=source) 48 | 49 | text: Text = Text() 50 | current_paragraph: Paragraph = Paragraph() 51 | current_sentence: Sentence = Sentence() 52 | filo: list[Mention] = [] 53 | 54 | for token in parser.parse(): 55 | if isinstance(token, Spaces): 56 | for mention in filo: 57 | mention.string += token.value 58 | 59 | elif isinstance(token, Word): 60 | t = Token(token.start, token.end, token.value) 61 | for mention in filo: 62 | mention.add_token(t) 63 | mention.string += token.value 64 | current_sentence.add_token(t) 65 | 66 | elif isinstance(token, TextID): 67 | text.name = token.text_id 68 | 69 | elif isinstance(token, ParagraphEnd): 70 | if current_sentence.token_count: 71 | current_paragraph.add_sentence(current_sentence) 72 | current_sentence = Sentence() 73 | text.add_paragraph(current_paragraph) 74 | current_paragraph = Paragraph() 75 | 76 | elif isinstance(token, SentenceChange): 77 | if current_sentence.token_count: 78 | current_paragraph.add_sentence(current_sentence) 79 | current_sentence = Sentence() 80 | 81 | elif isinstance(token, MentionStart): 82 | mention = Mention(chain_name=token.chain_name, string="") 83 | for k, v in token.features.items(): 84 | mention[k] = v 85 | current_sentence.add_mention(mention) 86 | filo.append(mention) 87 | 88 | elif isinstance(token, MentionEnd): 89 | filo.pop() 90 | 91 | elif isinstance(token, Comment): 92 | if m := TEXT_METADATA_PATTERN.fullmatch(token.value): 93 | text.metadata[m.group(1)] = m.group(2) 94 | 95 | if current_sentence.token_count: 96 | current_paragraph.add_sentence(current_sentence) 97 | if current_paragraph.sentence_count: 98 | text.add_paragraph(current_paragraph) 99 | 100 | self.corpus.add_text(text) 101 | -------------------------------------------------------------------------------- /sacr2conll.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert a SACR file (http://boberle.com/projects/sacr) to a conll file. The 3 | conll format produced is tabulation separated with three columns: index, word 4 | and coreference. 5 | 6 | To convert: 7 | 8 | python3 sacr2conll.py -o OUTPUT.conll INPUT.sacr 9 | 10 | You can specify the document name (or key) with the `--docname` option. 11 | Otherwise, it will be `#textid`, if any, otherwise the file name. 12 | 13 | With the --speaker switch, you can add a 4th column, which will be placed 14 | before the coreference columns. In the SACR file, the speaker can be mentionned 15 | as a comment prefixed with `#speaker:` before each line, like this: 16 | 17 | #title: Lucian, Dialogues of the Dead, 4: Hermes and Charon 18 | 19 | #speaker: Hermes 20 | Ferryman, what do you say to settling up accounts? 
It will prevent any 21 | unpleasantness later on. 22 | 23 | #speaker: Charon 24 | Very good. It does save trouble to get these things straight. 25 | 26 | You can remove the speaker for a paragraph by setting: 27 | 28 | #speaker: 29 | ... the text of the narrator ... 30 | """ 31 | 32 | 33 | import os 34 | import argparse 35 | import re 36 | 37 | import sacr_parser 38 | 39 | __version__ = "1.0.0" 40 | 41 | 42 | def read_file(fpath, index, docname=None, part_is_index=True, include_speaker=False): 43 | 44 | parser = sacr_parser.SacrParser( 45 | fpath=fpath, 46 | tokenization_mode=sacr_parser.WORD_TOKENIZATION, 47 | ) 48 | 49 | tokens = [] 50 | starts = dict() # start -> {ids} 51 | ends = dict() # end -> {ids} 52 | sentences = set() # index of last tokens 53 | 54 | filo = [] 55 | 56 | textid = None 57 | speaker = "" 58 | 59 | for item, params in parser.parse(): 60 | 61 | if item == "text_id": 62 | textid = params 63 | 64 | elif item in ("par_start", "par_end", "sentence_change"): 65 | if tokens: 66 | sentences.add(len(tokens)) 67 | 68 | elif item == "mention_start": 69 | chain = params[0] 70 | l = len(tokens) 71 | if l not in starts: 72 | starts[l] = [] 73 | starts[l].append(chain) 74 | filo.append(chain) 75 | 76 | elif item == "comment": 77 | if params.startswith("speaker:"): 78 | speaker = params[8:].strip().replace(" ", "_") 79 | 80 | elif item == "mention_end": 81 | chain = filo.pop() 82 | l = len(tokens) - 1 83 | if l not in ends: 84 | ends[l] = [] 85 | ends[l].append(chain) 86 | 87 | elif item == "token": 88 | tokens.append((params, speaker)) 89 | 90 | lines = [] 91 | 92 | counter = 0 93 | for i, (token, speaker) in enumerate(tokens): 94 | if i in sentences: 95 | lines.append("") 96 | counter = 0 97 | corefcol = "_".join( 98 | # ["(%d)" % x for x in (starts[i] 99 | # if (i in starts and i in ends) else [])] 100 | # + ["(%d" % x for x in (starts[i] 101 | ["(%d" % x for x in (starts[i] if i in starts else [])] 102 | + ["%d)" % x for x in (ends[i] if i in ends else [])] 103 | ) 104 | corefcol = re.sub(r"\((\d+)_\1\)", r"(\1)", corefcol) 105 | if not corefcol: 106 | corefcol = "-" 107 | if include_speaker: 108 | cols = [str(counter), token, speaker, corefcol] 109 | else: 110 | cols = [str(counter), token, corefcol] 111 | lines.append("\t".join(cols)) 112 | counter += 1 113 | 114 | if not docname: 115 | docname = textid if textid else os.path.basename(fpath) 116 | res = "#begin document (%s); part %03d\n" % (docname, index if part_is_index else 0) 117 | res += "\n".join(lines) 118 | res += "\n#end document\n" 119 | return res 120 | 121 | 122 | def parse_args(): 123 | # definition 124 | parser = argparse.ArgumentParser( 125 | prog="sacr2conll", 126 | # description="convert sacr files to conll file", 127 | description=__doc__, 128 | formatter_class=argparse.RawDescriptionHelpFormatter, 129 | ) 130 | # arguments (not options) 131 | parser.add_argument("infpaths", nargs="+", help="input files") 132 | # options 133 | parser.add_argument( 134 | "-o", dest="outfpath", default="", help="output file, default is stdout" 135 | ) 136 | parser.add_argument( 137 | "-n", 138 | "--docname", 139 | dest="docname", 140 | default="", 141 | help="document name; otherwise #textid; otherwise file name", 142 | ) 143 | parser.add_argument( 144 | "-i", 145 | "--index", 146 | dest="part_is_index", 147 | default=False, 148 | action="store_true", 149 | help="document part is file index (otherwise the part is 0; " 150 | "this is implied by --docname", 151 | ) 152 | parser.add_argument( 153 | "-s", 154 | "--speaker", 155 | 
default=False, 156 | action="store_true", 157 | help="include a column 'speaker' before the coref column", 158 | ) 159 | # special options 160 | parser.add_argument( 161 | "--version", action="version", version="%(prog)s " + __version__ 162 | ) 163 | # reading 164 | args = parser.parse_args() 165 | # check 166 | if args.docname: 167 | args.part_is_index = True 168 | return args 169 | 170 | 171 | def main(): 172 | args = parse_args() 173 | res = [] 174 | for i, fpath in enumerate(args.infpaths): 175 | res.append( 176 | read_file( 177 | fpath, index=i, docname=args.docname, part_is_index=args.part_is_index, include_speaker=args.speaker 178 | ) 179 | ) 180 | res = "\n\n".join(res) 181 | if args.outfpath: 182 | open(args.outfpath, "w").write(res) 183 | else: 184 | print(res) 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /sacr2df.py: -------------------------------------------------------------------------------- 1 | """Convert a corpus of texts annotated with SACR in a series of dataframes or 2 | CSV files. 3 | 4 | This dataframes/files model: 5 | - corpus, texts, sentences, tokens 6 | - mentions, chains and relations 7 | 8 | Please see the README file for a detail description. 9 | 10 | You can use the script in the CLI: 11 | 12 | python3 sacr2df.py text1.sacr text2.sacr ... -o output_file.zip 13 | 14 | or as a library, for example in a Jupyter notebook: 15 | 16 | from sacr2df import convert_sacr_files_to_dataframes 17 | from pathlib import Path 18 | 19 | dfs = convert_sacr_files_to_dataframes( 20 | Path("testing/aesop.sacr"), 21 | Path("testing/caesar.sacr"), 22 | Path("testing/cicero.sacr"), 23 | Path("testing/pliny.sacr"), 24 | ) 25 | 26 | # then do something with the dfs: 27 | print(dfs.texts.head()) 28 | print(dfs.paragraphs.head()) 29 | print(dfs.sentences.head()) 30 | print(dfs.tokens.head()) 31 | print(dfs.text_chains.head()) 32 | print(dfs.text_mentions.head()) 33 | print(dfs.text_consecutive_relations.head()) 34 | print(dfs.text_to_first_relations.head()) 35 | """ 36 | 37 | import argparse 38 | from argparse import Namespace 39 | from pathlib import Path 40 | 41 | from annotable import DataFrameSet 42 | from sacr2annotable import Sacr2AnnotableConverter 43 | 44 | 45 | def convert_sacr_files_to_dataframes( 46 | *files: Path, output_file: Path | None = None 47 | ) -> DataFrameSet: 48 | conv = Sacr2AnnotableConverter() 49 | for file in files: 50 | conv.convert_text(file) 51 | corpus = conv.corpus 52 | 53 | if output_file: 54 | corpus.save_csv_as_zip(output_file) 55 | 56 | return corpus.get_dataframes() 57 | 58 | 59 | def parse_args() -> Namespace: 60 | parser = argparse.ArgumentParser( 61 | prog="sacr2df", 62 | description=__doc__, 63 | formatter_class=argparse.RawDescriptionHelpFormatter, 64 | ) 65 | parser.add_argument("input_files", nargs="+", help="input files") 66 | parser.add_argument( 67 | "--output_file", 68 | "-o", 69 | required=True, 70 | help="output file. 
This is a zip file containing the csv", 71 | ) 72 | args = parser.parse_args() 73 | return args 74 | 75 | 76 | def main() -> None: 77 | args = parse_args() 78 | convert_sacr_files_to_dataframes( 79 | *[Path(f) for f in args.input_files], 80 | output_file=Path(args.output_file), 81 | ) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /sacr2glozz.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings FATAL=>'all'; 4 | use open ':utf8'; 5 | use utf8; 6 | 7 | use Data::Dumper; 8 | 9 | ######################################################################## 10 | # Global variables 11 | ######################################################################## 12 | 13 | my $USER = 'me'; 14 | my $MIN_NB_OF_LINKS = 0; 15 | my $REFNAME_PROPERTY = ''; # if empty, don't use 16 | my $USE_SCHEMATA = ''; 17 | my $INPUT_FILE = ''; 18 | my $OUTPUT_FILE_CORPUS = ''; 19 | my $OUTPUT_FILE_ANNOTATIONS = ''; 20 | my $OUTPUT_FILE_MODEL = ''; 21 | my $DONT_KEEP_COMMENTS = ''; 22 | my $EMPTY_REFNAME_FIELD = ''; 23 | my $EMPTIED_REFNAME_FIELD_VALUE = ''; 24 | my $BUILD_GLOZZ_MODEL = ''; 25 | my $EXPLODE_HEAD = ''; 26 | my $LINK_NAME = ''; 27 | my %FILTER = (); 28 | 29 | $Data::Dumper::Terse = 1; 30 | $Data::Dumper::Indent = 1; 31 | 32 | ######################################################################## 33 | # Get the CLI parameters 34 | ######################################################################## 35 | 36 | my $HELP =<<"END"; 37 | USAGE 38 | $0 [OPTIONS] INPUT OUTPUT 39 | 40 | DESCRIPTION 41 | Convert a SACR file to a couple of Glozz files (.ac and .aa). 42 | Do not specify the extensions (.ac/.aa) for the output file. 43 | 44 | OPTIONS (-o value --opt value) 45 | -h Print help. 46 | -m --min VALUE The minimum length of a chain. If -e AND -p are set, then 47 | the chains with less links have the value specified in -e. 48 | Otherwise, they are excluded. 49 | Default is 0 (all links are included). 50 | -e VALUE Put VALUE in the the PROP_NAME property (if the -p option is 51 | used) for chains with less than -m. (E.g. "" or "SI" for 52 | SIngleton.) 53 | -p PROP_NAME Include a property PROP_NAME with the name of the referent. 54 | If empty string, don't use. 55 | -s --schema Include schemata. 56 | -K Don't keep comments. 57 | -e Explode head property into 'headpos' and 'headstring'. 58 | -f REFNAME Include only REFNAME (this option can be repeated). 59 | --model Build a Glozz annotation model (.aam). 60 | --link-name VAL Name of the link (like 'link', 'mention', 'markable', etc.). 61 | Default is 'MENTION'. 
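EXAMPLE
    A typical call (file names are placeholders): keep the referent name in a
    REF property, include schemata and build the annotation model:

    $0 -p REF -s --model myfile.sacr myfile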
62 | END 63 | 64 | sub get_cl_parameters { 65 | 66 | # default 67 | $MIN_NB_OF_LINKS = 0; 68 | $REFNAME_PROPERTY = ''; 69 | $USE_SCHEMATA = ''; 70 | $INPUT_FILE = ''; 71 | $OUTPUT_FILE_CORPUS = ''; 72 | $OUTPUT_FILE_ANNOTATIONS = ''; 73 | $OUTPUT_FILE_MODEL = ''; 74 | $EXPLODE_HEAD = ''; 75 | $DONT_KEEP_COMMENTS = ''; 76 | $EMPTY_REFNAME_FIELD = ''; 77 | $EMPTIED_REFNAME_FIELD_VALUE = ''; 78 | $BUILD_GLOZZ_MODEL = ''; 79 | $LINK_NAME = 'MENTION'; 80 | 81 | my $pending = ''; 82 | 83 | for (@ARGV) { 84 | print $HELP and exit if m/^(?:-h|--?help)$/; 85 | last if $pending and m/^-/; 86 | # pending 87 | if ($pending eq '-m' and m/^\d++$/) { 88 | $MIN_NB_OF_LINKS = $_; 89 | $pending = ''; 90 | } elsif ($pending eq '-f') { 91 | $FILTER{$_} = 1; 92 | $pending = ''; 93 | } elsif ($pending eq '-e') { 94 | $EMPTY_REFNAME_FIELD = 1; 95 | $EMPTIED_REFNAME_FIELD_VALUE = $_; 96 | $pending = ''; 97 | } elsif ($pending eq '-p') { 98 | $REFNAME_PROPERTY = $_; 99 | $pending = ''; 100 | } elsif ($pending eq '--link-name') { 101 | $LINK_NAME = $_; 102 | $pending = ''; 103 | # end of pending 104 | } elsif ($pending) { 105 | last; 106 | # options with value waiting for next element in @ARGV 107 | } elsif (m/^(?:-m|--min)$/) { 108 | $pending = '-m'; 109 | } elsif (m/^-f$/) { 110 | $pending = '-f'; 111 | } elsif (m/^--link-name$/) { 112 | $pending = '--link-name'; 113 | } elsif (m/^(?:-p|--property)$/) { 114 | $pending = '-p'; 115 | } elsif (m/^(?:-e)$/) { 116 | $pending = '-e'; 117 | # switch (= options with no value) 118 | } elsif (m/^(?:-s|--schema)$/) { 119 | $USE_SCHEMATA = 1; 120 | } elsif (m/^--model$/) { 121 | $BUILD_GLOZZ_MODEL = 1; 122 | } elsif (m/^-K$/) { 123 | $DONT_KEEP_COMMENTS = 1; 124 | } elsif (m/^-e$/) { 125 | $EXPLODE_HEAD = 1; 126 | # end of options 127 | } elsif (m/^-.+/) { 128 | die "$0: *** option '$_' doesn't exists ***\n"; 129 | # arguments, not options 130 | } elsif (!$INPUT_FILE) { 131 | $INPUT_FILE = $_; 132 | } elsif (!$OUTPUT_FILE_CORPUS) { 133 | $OUTPUT_FILE_CORPUS = "$_.ac"; 134 | $OUTPUT_FILE_ANNOTATIONS = "$_.aa"; 135 | $OUTPUT_FILE_MODEL = "$_.aam"; 136 | } else { 137 | die "$0: *** bad argument '$_' ***\n"; 138 | } 139 | } # for 140 | 141 | die "$0: *** missing value for option '$pending' ***\n" if $pending; 142 | 143 | die "$0: *** no input/output file ***\n" 144 | unless $INPUT_FILE and $OUTPUT_FILE_ANNOTATIONS and $OUTPUT_FILE_CORPUS 145 | and $OUTPUT_FILE_MODEL; 146 | 147 | die "$0: *** file '$INPUT_FILE' doesn't exist ***\n" 148 | unless -f-r $INPUT_FILE; 149 | 150 | } 151 | 152 | 153 | ######################################################################## 154 | # parser 155 | ######################################################################## 156 | 157 | sub parse { 158 | 159 | my $content = shift; 160 | my $r_filter = shift; 161 | 162 | my $corpus = ''; 163 | my @paragraphs = (); # format: { start=>0, end=>0 } 164 | my @annotations = (); # format: [ start=>0, end=>0, offset=>LEN_CORPUS, name=>NAME props => {props} ] 165 | my @filoAnnotations = (); 166 | 167 | # each line is a paragraph 168 | for my $line (split /\n/, $content) { 169 | chomp $line; 170 | if ($line =~ m/^\s*+$/) { 171 | # nothing 172 | } elsif ($line =~ m/^\s*+#.*+$/ or $line =~ m/^\*++$/) { 173 | unless ($DONT_KEEP_COMMENTS) { 174 | push @paragraphs, { start=>length($corpus), end=>length($corpus)+length($line) }; 175 | $corpus .= $line; 176 | } else { 177 | # nothing 178 | } 179 | } else { 180 | my $plain_text = ''; 181 | pos($line) = 0; 182 | while (pos($line) < length $line) { 183 | if ($line =~ 
m/\G\{([-_0-9a-zA-Z]++)/gc) { 184 | my $refname = $1; 185 | my %props = (); 186 | if ($line =~ m/\G:/gc) { 187 | while ($line =~ m/\G([-_0-9a-zA-Z]++)=(?:"([^"]*+)"|([-_0-9a-zA-Z]++)),?/gc) { 188 | #print "DEBUG: $1\n"; 189 | if ($1 eq 'head' and $EXPLODE_HEAD and 190 | $2 =~ m/^\s*+(\d++)\s*+:\s*+(.++)$/) { 191 | $props{headpos} = $1; 192 | $props{headstring} = $2; 193 | } else { 194 | if (length $2) { 195 | $props{$1} = $2; 196 | } elsif (length $3) { 197 | $props{$1} = $3; 198 | } else { 199 | $props{$1} = ""; 200 | } 201 | } 202 | } 203 | } 204 | $props{$REFNAME_PROPERTY} = $refname if $REFNAME_PROPERTY; 205 | unless ($line =~ m/\G\s/gc) { 206 | die "$0: *** ill formed line: $line (no space after properties) ***\n"; 207 | } 208 | push @filoAnnotations, { 209 | start => length $plain_text, 210 | end => undef, 211 | name => $refname, 212 | props => { %props }, 213 | offset => length $corpus 214 | }; 215 | } elsif ($line =~ m/\G\}/gc) { 216 | die "$0: *** too many {'s ***\n" unless @filoAnnotations; 217 | $filoAnnotations[-1]->{end} = length $plain_text; 218 | push @annotations, pop @filoAnnotations; 219 | } elsif ($line =~ m/\G(.)/gc) { 220 | $plain_text .= $1; 221 | } 222 | } # while 223 | die "$0: *** filo not empty for line: $line ***\n" if @filoAnnotations; 224 | die "$0: *** string not completed ***\n" unless pos($line) == length($line); 225 | # set the paragraph 226 | push @paragraphs, { start=>length($corpus), end=>length($corpus)+length($plain_text) }; 227 | $corpus .= $plain_text; 228 | } # if 229 | } # for 230 | 231 | my $counter = time(); 232 | my $xml = ''; 233 | for (@paragraphs) { 234 | $xml .= "\n"; 235 | $xml .= "$USER$counter\n"; 236 | $xml .= "paragraph\n"; 237 | $xml .= sprintf ''."\n", 238 | $_->{start}, $_->{end}; 239 | $xml .= "\n"; 240 | $counter++; 241 | } 242 | 243 | my %property_list = (); 244 | for my $annot (@annotations) { 245 | for my $prop (keys %{$annot->{props}}) { 246 | $property_list{$prop} = 1; 247 | } 248 | } 249 | 250 | my %chains = (); 251 | for (@annotations) { 252 | if (exists $chains{$_->{name}}) { 253 | $chains{$_->{name}}++; 254 | } else { 255 | $chains{$_->{name}} = 0; 256 | } 257 | } 258 | 259 | my %schemata = (); # format: REFNAME => [ IDCOUNTER, IDCOUNTER, ... 
] 260 | for (@annotations) { 261 | next if %$r_filter and not $r_filter->{$_->{name}}; 262 | if ($chains{$_->{name}} < $MIN_NB_OF_LINKS) { 263 | if ($EMPTY_REFNAME_FIELD and $REFNAME_PROPERTY) { 264 | $_->{props}->{$REFNAME_PROPERTY} = $EMPTIED_REFNAME_FIELD_VALUE; 265 | } else { 266 | next; 267 | } 268 | } 269 | $xml .= "\n"; 270 | $xml .= "$USER$counter\n"; 271 | $xml .= "\n"; 272 | if ($_->{name} =~ m/^_/) { 273 | (my $name = $_->{name}) =~ s/^_//; 274 | $xml .= "$name\n", ; 275 | } else { 276 | $xml .= "$LINK_NAME\n"; 277 | } 278 | $xml .= "\n"; 279 | for my $k (keys %{$_->{props}}) { 280 | my $val = $_->{props}->{$k}; 281 | $xml .= "$val\n"; 282 | } 283 | $xml .= "\n"; 284 | $xml .= "\n"; 285 | $xml .= sprintf ''."\n", 286 | $_->{start}+$_->{offset}, $_->{end}+$_->{offset}; 287 | $xml .= "\n"; 288 | if (exists $schemata{$_->{name}}) { 289 | # for the format of the ID, see embedded-unit below 290 | push @{$schemata{$_->{name}}}, "${USER}_$counter"; 291 | } else { 292 | $schemata{$_->{name}} = [ "${USER}_$counter" ]; 293 | } 294 | $counter++; 295 | } 296 | 297 | if ($USE_SCHEMATA) { 298 | for my $k (keys %schemata) { 299 | next if $k =~ m/^_/; 300 | next if scalar @{$schemata{$k}} < $MIN_NB_OF_LINKS; 301 | $xml .= "\n"; 302 | $xml .= "$USER$counter\n"; 303 | $xml .= "cr\n"; 304 | $xml .= "\n"; 305 | $xml .= "$k\n"; 306 | $xml .= "\n"; 307 | $xml .= "\n"; 308 | $xml .= "\n"; 309 | for my $id (@{$schemata{$k}}) { 310 | # NOTE: 'id' is not the id of the unit! It is in fact 311 | # "AUTHOR_CREATIONDATE" of the unit, and the 'id' of the 312 | # unit is in fact not used! 313 | $xml .= "\n"; 314 | } 315 | $xml .= "\n"; 316 | $xml .= "\n"; 317 | $counter++; 318 | } 319 | } 320 | 321 | my $model = ""; 322 | 323 | if ($BUILD_GLOZZ_MODEL) { 324 | $model = "\n"; 325 | $model .= "\n"; 326 | $model .= "\n"; 327 | $model .= "\n"; 328 | for my $property (keys %property_list) { 329 | $model .= "\n"; 330 | $model .= "\n"; 331 | $model .= "\n"; 332 | $model .= "\n"; 333 | $model .= "\n"; 334 | 335 | } 336 | $model .= "\n"; 337 | $model .= "\n"; 338 | $model .= "\n"; 339 | $model .= "\n"; 340 | $model .= "\n"; 341 | $model .= "\n"; 342 | $model .= "\n"; 343 | } 344 | 345 | return ($corpus, 346 | "\n\n$xml\n", 347 | $model); 348 | 349 | } 350 | 351 | 352 | ######################################################################## 353 | # Helper functions 354 | ######################################################################## 355 | 356 | 357 | sub confirm_yn { 358 | 359 | my $message = shift || 'Confirm ? 
(y|n) '; 360 | my $default = shift; 361 | 362 | ITER: { 363 | print $message; 364 | my $ans = ; 365 | print "\n" unless -t STDIN; 366 | return 1 if $ans =~ m/^\s*+y(?:es)?\s*+$/; 367 | return 0 if $ans =~ m/^\s*+n(?:o)?\s*+$/; 368 | return $default if (defined($default) and $ans =~ m/^\s*+$/); 369 | redo ITER; 370 | } 371 | 372 | } 373 | 374 | sub read_file { 375 | 376 | my $file = shift; 377 | 378 | open my $fh, $file or die "$0: *** can't open $file ***\n"; 379 | 380 | local $/ = undef; 381 | 382 | my $content = <$fh>; 383 | 384 | close $fh or die "$0: *** can't close $file ***\n"; 385 | 386 | return $content; 387 | 388 | } 389 | 390 | sub write_file { 391 | 392 | my $file = shift; 393 | my $content = shift; 394 | 395 | open my $fh, ">", $file or die "$0: *** can't open $file ***\n"; 396 | print $fh $content; 397 | close $fh or die "$0: *** can't close $file ***\n"; 398 | 399 | } 400 | 401 | 402 | ######################################################################## 403 | # main() 404 | ######################################################################## 405 | 406 | sub main { 407 | 408 | get_cl_parameters(); 409 | 410 | if (-e $OUTPUT_FILE_ANNOTATIONS or -e $OUTPUT_FILE_CORPUS or -e 411 | $OUTPUT_FILE_MODEL) { 412 | return unless (confirm_yn("Output files exist. Overwrite [Y/n]?", 1)); 413 | } 414 | 415 | my $content = read_file($INPUT_FILE); 416 | 417 | my ($corpus, $xml, $model) = parse($content, \%FILTER); 418 | write_file($OUTPUT_FILE_CORPUS, $corpus); 419 | write_file($OUTPUT_FILE_ANNOTATIONS, $xml); 420 | if ($OUTPUT_FILE_MODEL) { 421 | write_file($OUTPUT_FILE_MODEL, $model); 422 | } 423 | 424 | return 1; 425 | 426 | } 427 | 428 | 429 | main() 430 | and print "$0: done!\n"; 431 | 432 | 433 | -------------------------------------------------------------------------------- /sacr_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module offers a parser for the SACR ("Script d'Annotation des Chaînes de 3 | Référence") format. 4 | 5 | Synopsis 6 | -------- 7 | 8 | The parser yields the following elements: 9 | * ('text_id', ) 10 | * ('comment', , , )) 14 | * ('mention_end', None) 15 | * ('token', ) 16 | * ('sentence_change', None) 17 | 18 | Note that spaces are not yielded as token. 19 | 20 | import sacr_parser 21 | import annotable 22 | 23 | corpus = annotatble.Corpus() 24 | 25 | for fpath in fpaths: 26 | parser = sacr_parser.SacrParser( 27 | fpath=fpath, 28 | tokenization_mode=sacr_parser.WORD_TOKENIZATION, 29 | ) 30 | text = annotable.Text(id_=fpath) 31 | self.corpus.add_text(text) 32 | for item, params in parser.parse(): 33 | if item == 'text_id': 34 | text.id_ = params 35 | elif item == 'par_start': 36 | ... 37 | elif item == 'par_end': 38 | ... 39 | elif item == 'sentence_change': 40 | ... 41 | elif item == 'mention_start': 42 | ... 43 | elif item == 'token': 44 | ... 45 | elif item == 'mention_end': 46 | ... 47 | """ 48 | 49 | __version__ = "1.0.0" 50 | 51 | import re 52 | 53 | WORD_TOKENIZATION = 1 54 | CHAR_TOKENIZATION = 2 55 | 56 | 57 | def escape_regex(string): 58 | """Escape a string so it can be literally search for in a regex. 59 | 60 | Used for additional_tokens. 61 | """ 62 | return re.sub(r"([-{}\[\]().])", r"\\\1", string) 63 | 64 | 65 | class SacrParser: 66 | """Parse a file in the SACR format. 67 | 68 | Attribute 69 | --------- 70 | tokenization_mode: int 71 | The tokenization mode, use the constants: `WORD_TOKENIZATION` and 72 | `CHAR_TOKENIZATION` 73 | fpath: str 74 | Path of the file to parse. 
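    Minimal usage sketch (the path is a placeholder):

        parser = SacrParser(fpath="myfile.sacr",
                            tokenization_mode=WORD_TOKENIZATION)
        for item, params in parser.parse():
            ...  # handle 'text_id', 'par_start', 'mention_start', etc.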
75 | """ 76 | 77 | @staticmethod 78 | def get_word_regex(additional_tokens=None): 79 | """Compute the regex to match words, including additional_tokens.""" 80 | if not additional_tokens: 81 | addtional_tokens = [] 82 | additional_tokens = sorted( 83 | [escape_regex(w) for w in additional_tokens], key=lambda x: len(x) 84 | ) 85 | token_str = "[a-zßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿœα-ω0-9_]+'?|[-+±]?[.,]?[0-9]+" 86 | if additional_tokens: 87 | return re.compile( 88 | "([%d]+|%s)" % (token_str, "|".join(additional_tokens)), re.I 89 | ) 90 | else: 91 | return re.compile("(%s)" % token_str, re.I) 92 | 93 | def __init__(self, fpath, tokenization_mode=WORD_TOKENIZATION): 94 | self.tokenization_mode = tokenization_mode 95 | self.fpath = fpath 96 | 97 | def parse(self): 98 | """Parse the file and yields elements. See the module description.""" 99 | content = open(self.fpath).read() 100 | additional_tokens = [] 101 | pos = 0 102 | chains = dict() 103 | open_mention_counter = 0 104 | # patterns: 105 | additional_tokens_pattern = re.compile(r"#additional_?token:\s*(.+)\s*\n\n+") 106 | text_id_pattern = re.compile(r"#text_?id:\s*(.+)\s*\n\n*") 107 | comment_pattern = re.compile(r"(?:#(.*)\n+|\*{5,})") 108 | end_par_pattern = re.compile(r"\n\n+") 109 | space_pattern = re.compile(r"\s+") 110 | new_line_pattern = re.compile(r"\n") 111 | open_mention_pattern = re.compile(r"\{(\w+)(:| )") 112 | feature_pattern = re.compile(r'(\w+)=(?:(\w+)|"([^"]*)")(,| )') 113 | close_mention_pattern = re.compile(r"\}") 114 | sentence_end_pattern = re.compile(r'(?:\.+"?|\!|\?)') 115 | if self.tokenization_mode == WORD_TOKENIZATION: 116 | word_pattern = self.__class__.get_word_regex(additional_tokens) 117 | else: 118 | word_pattern = re.compile(r".") 119 | # eat leading blank lines 120 | m = re.compile(r"\s+").match(content, pos) 121 | if m: 122 | # print('eat leading spaces') 123 | pos += len(m.group(0)) 124 | while pos < len(content): 125 | m = additional_tokens_pattern.match(content, pos) 126 | if m: 127 | # print('add word') 128 | pos += len(m.group(0)) 129 | additional_tokens.append(m.group(1)) 130 | word_pattern = SacrParser.get_word_regex(additional_tokens) 131 | continue 132 | m = text_id_pattern.match(content, pos) 133 | if m: 134 | # print('textid') 135 | pos += len(m.group(0)) 136 | yield "text_id", m.group(1) 137 | continue 138 | m = comment_pattern.match(content, pos) 139 | if m: 140 | # print('comment', m.group(0)) 141 | pos += len(m.group(0)) 142 | comment = m.group(1).strip() 143 | if comment: 144 | yield "comment", comment 145 | continue 146 | # paragraph of text 147 | yield "par_start", None 148 | while pos < len(content): 149 | # print("%d, %d" % (pos, len(content))) 150 | # print(content[pos]) 151 | m = end_par_pattern.match(content, pos) 152 | if m: 153 | # print('end par') 154 | pos += len(m.group(0)) 155 | yield "par_end", None 156 | break 157 | m = space_pattern.match(content, pos) 158 | if m: 159 | # print('space') 160 | pos += len(m.group(0)) 161 | continue 162 | m = new_line_pattern.match(content, pos) 163 | if m: 164 | # print('newline') 165 | pos += len(m.group(0)) 166 | continue 167 | m = open_mention_pattern.match(content, pos) 168 | if m: 169 | # print('mention') 170 | pos += len(m.group(0)) 171 | open_mention_counter += 1 172 | if m.group(1) not in chains: 173 | chains[m.group(1)] = len(chains) 174 | chain_index = chains[m.group(1)] 175 | chain_name = m.group(1) 176 | features = dict() 177 | if m.group(2) == ":": 178 | while pos < len(content): 179 | m = feature_pattern.match(content, pos) 180 
| if m: 181 | key = m.group(1) 182 | value = m.group(2) if m.group(2) is not None else m.group(3) 183 | features[key] = value 184 | pos += len(m.group(0)) 185 | if m.group(4) == " ": 186 | break 187 | else: 188 | raise SyntaxError( 189 | "can't understand '%s' near %d" % (content, pos) 190 | ) 191 | yield "mention_start", (chain_index, chain_name, features) 192 | continue 193 | m = close_mention_pattern.match(content, pos) 194 | if m: 195 | # print('end mention') 196 | pos += len(m.group(0)) 197 | open_mention_counter -= 1 198 | yield "mention_end", None 199 | continue 200 | m = word_pattern.match(content, pos) 201 | if m: 202 | # print('token: %s' % m.group(0)) 203 | pos += len(m.group(0)) 204 | yield "token", m.group(0) 205 | continue 206 | if open_mention_counter == 0: 207 | m = sentence_end_pattern.match(content, pos) 208 | if m: 209 | # print('token: %s' % m.group(0)) 210 | pos += len(m.group(0)) 211 | yield "token", m.group(0) 212 | yield "sentence_change", None 213 | continue 214 | m = re.compile(r".").match(content, pos) 215 | if m: 216 | # print('token: %s' % m.group(0)) 217 | pos += len(m.group(0)) 218 | yield "token", m.group(0) 219 | continue 220 | assert False 221 | -------------------------------------------------------------------------------- /sacr_parser2.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import Generator 7 | 8 | 9 | @dataclass 10 | class Token: 11 | start: int 12 | end: int 13 | 14 | def __eq__(self, other: Token) -> bool: 15 | return self.start == other.start and self.end == other.end 16 | 17 | 18 | @dataclass 19 | class TextID(Token): 20 | text_id: str 21 | 22 | def __eq__(self, other: TextID) -> bool: 23 | return super().__eq__(other) and self.text_id == other.text_id 24 | 25 | 26 | @dataclass 27 | class Comment(Token): 28 | value: str 29 | 30 | def __eq__(self, other: Comment) -> bool: 31 | return super().__eq__(other) and self.value == other.value 32 | 33 | 34 | @dataclass 35 | class ParagraphStart(Token): 36 | ... 37 | 38 | 39 | @dataclass 40 | class ParagraphEnd(Token): 41 | ... 42 | 43 | 44 | @dataclass 45 | class MentionStart(Token): 46 | chain_index: int 47 | chain_name: str 48 | features: dict[str, str] 49 | 50 | def __eq__(self, other: MentionStart) -> bool: 51 | return ( 52 | super().__eq__(other) 53 | and self.chain_index == other.chain_index 54 | and self.chain_name == other.chain_name 55 | and self.features == other.features 56 | ) 57 | 58 | 59 | @dataclass 60 | class MentionEnd(Token): 61 | ... 62 | 63 | 64 | @dataclass 65 | class Spaces(Token): 66 | value: str 67 | 68 | def __eq__(self, other: Spaces) -> bool: 69 | return super().__eq__(other) and self.value == other.value 70 | 71 | 72 | @dataclass 73 | class NewLineInsideParagraph(Token): 74 | value: str 75 | 76 | def __eq__(self, other: NewLineInsideParagraph) -> bool: 77 | return super().__eq__(other) and self.value == other.value 78 | 79 | 80 | @dataclass 81 | class Word(Token): 82 | value: str 83 | 84 | def __eq__(self, other: Word) -> bool: 85 | return super().__eq__(other) and self.value == other.value 86 | 87 | 88 | @dataclass 89 | class SentenceChange(Token): 90 | ... 91 | 92 | 93 | def escape_regex(string: str) -> str: 94 | """Escape a string so it can be literally searched for in a regex. 95 | 96 | Used for `additional_tokens`. 
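For example (a small doctest-style illustration; the input string is invented, not taken from the repository's test data):

    >>> print(escape_regex("M. Smith-Jones (Dr.)"))
    M\. Smith\-Jones \(Dr\.\)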
97 | """ 98 | return re.sub(r"([-{}\[\]().])", r"\\\1", string) 99 | 100 | 101 | class SacrParser: 102 | """Parse a file in the SACR format.""" 103 | 104 | def __init__(self, source: str | Path): 105 | if isinstance(source, str): 106 | self.content = source 107 | else: 108 | self.content = source.read_text() 109 | 110 | @staticmethod 111 | def get_word_pattern(additional_tokens: list[str] | None = None) -> re.Pattern[str]: 112 | """Compute the regex to match words, including additional_tokens.""" 113 | if not additional_tokens: 114 | additional_tokens = [] 115 | additional_tokens = sorted( 116 | [escape_regex(w) for w in additional_tokens], key=lambda x: len(x) 117 | ) 118 | token_str = "[a-zßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿœα-ω0-9_]+'?|[-+±]?[.,]?[0-9]+" 119 | if additional_tokens: 120 | return re.compile( 121 | "(%s|%s)" % (token_str, "|".join(additional_tokens)), re.IGNORECASE 122 | ) 123 | else: 124 | return re.compile("(%s)" % token_str, re.IGNORECASE) 125 | 126 | def parse(self) -> Generator[Token, None, None]: 127 | """Parse the file and yields elements.""" 128 | content = self.content 129 | additional_tokens: list[str] = [] 130 | pos = 0 131 | chains: dict[str, int] = dict() 132 | open_mention_counter = 0 133 | 134 | # patterns 135 | additional_tokens_pattern = re.compile(r"#additional_?token:\s*(.+)\s*\n\n+") 136 | text_id_pattern = re.compile(r"#text_?id:\s*(.+)\s*\n\n*") 137 | comment_pattern = re.compile(r"(?:#(.*)(?:\n+|$)|\*{5,})") 138 | end_par_pattern = re.compile(r"\n\n+") 139 | space_pattern = re.compile(r"\s+") 140 | new_line_pattern = re.compile(r"\n") 141 | open_mention_pattern = re.compile(r"\{(\w+)(:| )") 142 | feature_pattern = re.compile(r'(\w+)=(?:(\w+)|"([^"]*)")(,| )') 143 | close_mention_pattern = re.compile(r"\}") 144 | sentence_end_pattern = re.compile(r'(?:\.+"?|\!|\?)') 145 | word_pattern = self.get_word_pattern(additional_tokens) 146 | 147 | # eat leading blank lines 148 | if m := re.compile(r"\s+").match(content, pos): 149 | pos += len(m.group(0)) 150 | 151 | while pos < len(content): 152 | if m := additional_tokens_pattern.match(content, pos): 153 | pos += len(m.group(0)) 154 | additional_tokens.append(m.group(1)) 155 | word_pattern = SacrParser.get_word_pattern(additional_tokens) 156 | continue 157 | 158 | if m := text_id_pattern.match(content, pos): 159 | length = len(m.group(0)) 160 | yield TextID(pos, pos + length, m.group(1)) 161 | pos += length 162 | continue 163 | 164 | if m := comment_pattern.match(content, pos): 165 | length = len(m.group(0)) 166 | if m.group(1): # no group 0 if ****** 167 | comment = m.group(1).strip() 168 | if comment: 169 | yield Comment(pos, pos + length, comment) 170 | pos += length 171 | continue 172 | 173 | # parsing a paragraph 174 | 175 | yield ParagraphStart(pos, pos) 176 | 177 | while pos < len(content): 178 | if m := end_par_pattern.match(content, pos): 179 | length = len(m.group(0)) 180 | yield ParagraphEnd(pos, pos + length) 181 | pos += length 182 | break 183 | 184 | if m := new_line_pattern.match(content, pos): 185 | length = len(m.group(0)) 186 | yield NewLineInsideParagraph(pos, pos + length, m.group(0)) 187 | pos += length 188 | continue 189 | 190 | if m := space_pattern.match(content, pos): 191 | length = len(m.group(0)) 192 | yield Spaces(pos, pos + length, m.group(0)) 193 | pos += length 194 | continue 195 | 196 | if m := open_mention_pattern.match(content, pos): 197 | start = pos 198 | pos += len(m.group(0)) 199 | open_mention_counter += 1 200 | 201 | if m.group(1) not in chains: 202 | chains[m.group(1)] = 
len(chains) 203 | chain_index = chains[m.group(1)] 204 | chain_name = m.group(1) 205 | 206 | features = dict() 207 | if m.group(2) == ":": 208 | while pos < len(content): 209 | if m := feature_pattern.match(content, pos): 210 | key = m.group(1) 211 | value = ( 212 | m.group(2) if m.group(2) is not None else m.group(3) 213 | ) 214 | features[key] = value 215 | pos += len(m.group(0)) 216 | if m.group(4) == " ": 217 | break 218 | else: 219 | raise SyntaxError( 220 | "can't understand '%s' near %d" % (content, pos) 221 | ) 222 | yield MentionStart( 223 | start, 224 | pos, 225 | chain_index=chain_index, 226 | chain_name=chain_name, 227 | features=features, 228 | ) 229 | continue 230 | 231 | if m := close_mention_pattern.match(content, pos): 232 | length = len(m.group(0)) 233 | yield MentionEnd(pos, pos + length) 234 | pos += length 235 | open_mention_counter -= 1 236 | continue 237 | 238 | if m := word_pattern.match(content, pos): 239 | length = len(m.group(0)) 240 | yield Word(pos, pos + length, m.group(0)) 241 | pos += length 242 | continue 243 | 244 | if open_mention_counter == 0: 245 | if m := sentence_end_pattern.match(content, pos): 246 | length = len(m.group(0)) 247 | yield Word(pos, pos + length, m.group(0)) 248 | yield SentenceChange(pos, pos + length) 249 | pos += length 250 | continue 251 | 252 | if m := re.compile(r".").match(content, pos): 253 | length = len(m.group(0)) 254 | yield Word(pos, pos + length, m.group(0)) 255 | pos += length 256 | continue 257 | assert False 258 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=E203,E501,W503 3 | exclude=.git,__pycache__,venv,.idea,color_manager.py,conll2jsonlines.py,conll2sacr.py,conll_transform.py,jsonlines2conll.py,jsonlines2text.py,sacr2conll.py,sacr_parser.py,text2jsonlines.py,standoff2inline.py -------------------------------------------------------------------------------- /standoff2inline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converting standoff annotations to inline annotations. 3 | 4 | For example, in the sentence: 5 | 6 | The little cat drinks milk. 7 | 8 | you know that the third word, between the 12th and 14th characters, is a noun. 9 | You may want to surround it with some tags, like `<noun>` and `</noun>`: 10 | 11 | The little <noun>cat</noun> drinks milk. 12 | 13 | This module offers classes and functions to: 14 | * add inline annotations, like xml annotations, counting in characters or 15 | tokens, 16 | * highlight some chunks of text, for example with styled `<span>` tags, 17 | * remove parts without annotations and replace them with something like 18 | `[...]`. 19 | 20 | A quick preview: 21 | 22 | ```python 23 | from standoff2inline import Standoff2Inline 24 | 25 | string = "The little cat drinks milk." 26 | inliner = Standoff2Inline() 27 | inliner.add((0, ""), (26, "")) 28 | inliner.add((0, ""), (13, "")) 29 | inliner.add((11, ""), (13, "")) 30 | inliner.add((22, ""), (25, "")) 31 | inliner.add((0, ""), (2, "")) 32 | inliner.apply(string) 33 | ``` 34 | 35 | which gives: 36 | 37 | ``` 38 | The little cat drinks 39 | milk. 40 | ``` 41 | 42 | Please read the user guide and play with the module in the Jupyter notebook. 43 | 44 | ************************************************************************ 45 | 46 | (c) Bruno Oberle 2019 - boberle.com 47 | 48 | Distributed under the terms of the Mozilla Public License 2. See the LICENSE 49 | file.
50 | 51 | Version 1.0.0 52 | 53 | """ 54 | 55 | 56 | 57 | class Standoff2Inline: 58 | """Conversion from standoff annotation to inline annotations. 59 | 60 | Constructor: 61 | * `kind` (opt): one of `xml|sacr`: predefined annotation scheme. 62 | * `end_is_stop`: the "end" position is the position of the next token or 63 | character, not the last. 64 | """ 65 | 66 | 67 | def __init__(self, kind=None, end_is_stop=False): 68 | self.kind = kind 69 | self._elements = [] 70 | self._sorted = False 71 | self.end_is_stop = end_is_stop 72 | 73 | 74 | 75 | def add(self, start, end=None, stop=None): 76 | """Add an annotation. 77 | 78 | Annotations are given as a tuple `(position, string)`, where position 79 | may be in characters or tokens. 80 | 81 | The `start` annotation is required, `end|stop` annotation is optional. 82 | 83 | You give either an `end` or `stop` annotation. `stop` works like 84 | Python's `range` function: the annotation is introduced *before* the 85 | next element. 86 | """ 87 | 88 | if stop is not None: 89 | if isinstance(stop, int): 90 | stop = (stop, None) 91 | stop, value = stop 92 | end = (stop-1, value) 93 | if isinstance(end, int): 94 | end = (end, None) 95 | if end is None: 96 | end = (-1, None) 97 | self._elements.append((start, end)) 98 | self._sorted = False 99 | 100 | 101 | 102 | def _iter_elements(self, elements): 103 | if self.kind is None: 104 | yield from self._get_strings(elements) 105 | elif self.kind == 'xml': 106 | yield from self._get_xml_strings(elements) 107 | elif self.kind == 'sacr': 108 | yield from self._get_sacr_strings(elements) 109 | else: 110 | assert False, self.kind 111 | 112 | 113 | 114 | def _get_xml_strings(self, elements): 115 | for (start, start_val), (end, end_val) in elements: 116 | if isinstance(start_val, str): 117 | tagname, dic = start_val, dict() 118 | else: 119 | tagname, dic = start_val 120 | attrs = " ".join('%s="%s"' % (k, v) for k, v in dic.items()) 121 | if attrs: 122 | attrs = " " + attrs 123 | start_val = "<%s%s>" % (tagname, attrs) 124 | end_val = "" % tagname 125 | yield (start, start_val), (end, end_val) 126 | 127 | 128 | 129 | def _get_sacr_strings(self, elements): 130 | for (start, start_val), (end, end_val) in elements: 131 | tagname, dic = start_val 132 | attrs = " ".join('%s="%s"' % (k, v) for k, v in dic.items()) 133 | if attrs: 134 | attrs = ":" + attrs 135 | start_val = "{%s%s " % (tagname, attrs) 136 | end_val = "}" 137 | yield (start, start_val), (end, end_val) 138 | 139 | 140 | 141 | def _get_strings(self, elements): 142 | for (start, start_val), (end, end_val) in elements: 143 | if end_val is None: 144 | end_val = "" 145 | yield (start, start_val), (end, end_val) 146 | 147 | 148 | 149 | def _tokens2string(self, tokens): 150 | """Convert a list of tokens into a string and compute new positions. 151 | 152 | Return a tuple `(string, elements)`, where `elements` is like 153 | `self.elements`, but with position in the string rather than in the 154 | token list. 155 | """ 156 | 157 | string = "" 158 | t2s = [] 159 | for i, token in enumerate(tokens): 160 | start = len(string) 161 | t2s.append((start, start+len(token)-1)) 162 | string += token + " " 163 | elements = [] 164 | for (start, start_val), (end, end_val) in self._elements: 165 | start = t2s[start][0] 166 | end = t2s[end][1] 167 | elements.append(((start, start_val), (end, end_val))) 168 | return string, elements 169 | 170 | 171 | 172 | def apply(self, string=None, tokens=None): 173 | """Insert the annotations and return a string with inline annotations. 
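A minimal sketch of typical usage (the tag strings are illustrative, not imposed by the API; the first call gives an `end` position, the second a `stop` position):

    >>> inliner = Standoff2Inline()
    >>> inliner.add((0, "<name>"), (4, "</name>"))
    >>> inliner.add((6, "<verb>"), stop=(11, "</verb>"))
    >>> inliner.apply("Julia spoke.")
    '<name>Julia</name> <verb>spoke</verb>.'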
174 | 175 | Specify either a `string` or a list of `tokens`. 176 | """ 177 | 178 | return "".join( 179 | x[1] for x in self.iter_result(string=string, tokens=tokens)) 180 | 181 | 182 | 183 | def iter_result(self, string=None, tokens=None, return_tokens=False): 184 | """Iterate over `prefix|string|suffix`. 185 | 186 | Each iteration yields a tuple `(kind, string)` where `kind` is one of 187 | `prefix|string|suffix` and `string` is either the annotation value or 188 | a chunk of text. 189 | """ 190 | 191 | assert (string or tokens) and not (string and tokens) 192 | def yield_(k, v): 193 | if len(v): 194 | yield k, v 195 | if not self._sorted: 196 | self._elements.sort(key=lambda e: e[1][0], reverse=True) 197 | self._elements.sort(key=lambda e: e[0][0]) 198 | self._sorted = True 199 | if tokens and not return_tokens: 200 | string, elements = self._tokens2string(tokens) 201 | else: 202 | elements = self._elements 203 | res = "" 204 | pos = 0 205 | filo = [] 206 | move_one = 0 if self.end_is_stop else 1 207 | for (start, start_val), end_data in self._iter_elements(elements): 208 | while filo and filo[-1][0] < start: 209 | end, end_val = filo.pop() 210 | if tokens and return_tokens: 211 | yield from yield_('string', tokens[pos:end+move_one]) 212 | else: 213 | yield from yield_('string', string[pos:end+move_one]) 214 | pos = end + move_one 215 | yield 'suffix', end_val 216 | if tokens and return_tokens: 217 | yield from yield_('string', tokens[pos:start]) 218 | else: 219 | yield from yield_('string', string[pos:start]) 220 | yield 'prefix', start_val 221 | pos = start 222 | if end_data[0] != -1: 223 | filo.append(end_data) 224 | while filo: 225 | end, end_val = filo.pop() 226 | if tokens and return_tokens: 227 | yield from yield_('string', tokens[pos:end+move_one]) 228 | else: 229 | yield from yield_('string', string[pos:end+move_one]) 230 | pos = end + move_one 231 | yield from yield_('suffix', end_val) 232 | if tokens and return_tokens: 233 | yield from yield_('string', tokens[pos:]) 234 | else: 235 | yield from yield_('string', string[pos:]) 236 | return res 237 | 238 | 239 | 240 | class Highlighter: 241 | 242 | 243 | def __init__(self, marks=None, prefix=None, suffix=None): 244 | self.marks = marks if marks is not None else list() 245 | self.prefix = prefix 246 | self.suffix = suffix 247 | 248 | 249 | def _get_affix(self, current, value): 250 | if current is None: 251 | return value 252 | if isinstance(current, list): 253 | current.append(value) 254 | return current 255 | return [current, value] 256 | 257 | 258 | def set_style(self, underline=False, bold=False, italic=False, 259 | color=None): 260 | res = "" 261 | if underline: 262 | res += "text-decoration: underline; " 263 | if bold: 264 | res += "font-weight: bold; " 265 | if italic: 266 | res += "font-style: italic; " 267 | if color is not None: 268 | res += "color: %s; " % color 269 | if res: 270 | self.prefix = '<span style="%s">%s' % ( 271 | res, self.prefix if self.prefix else "") 272 | self.suffix = '%s</span>' % (self.suffix if self.suffix else "") 273 | 274 | 275 | def add_mark(self, start, end, prefix=None, suffix=None): 276 | self.marks.append((start, end)) 277 | if prefix is not None: 278 | self.prefix = self._get_affix(self.prefix, prefix) 279 | if suffix is not None: 280 | self.suffix = self._get_affix(self.suffix, suffix) 281 | 282 | 283 | def add_marks(self, marks): 284 | for start, end in marks: 285 | self.add_mark(start, end) 286 | 287 | 288 | 289 | def highlight_characters(text, *highlighters, end_is_stop=False): 290 | inliner =
Standoff2Inline(end_is_stop=end_is_stop) 291 | for hl in highlighters: 292 | for i in range(len(hl.marks)): 293 | start, end = hl.marks[i] 294 | prefix = hl.prefix[i] if isinstance(hl.prefix, list) else hl.prefix 295 | suffix = hl.suffix[i] if isinstance(hl.suffix, list) else hl.suffix 296 | inliner.add( 297 | (start, prefix), 298 | (end, suffix), 299 | ) 300 | return inliner.apply(text) 301 | 302 | 303 | 304 | def highlight(text, *highlighters, margin=0, max_gap=0, ellipsis=" [...] ", 305 | char=False, end_is_stop=False): 306 | inliner = Standoff2Inline(end_is_stop=end_is_stop) 307 | for hl in highlighters: 308 | for i in range(len(hl.marks)): 309 | start, end = hl.marks[i] 310 | prefix = hl.prefix[i] if isinstance(hl.prefix, list) else hl.prefix 311 | suffix = hl.suffix[i] if isinstance(hl.suffix, list) else hl.suffix 312 | inliner.add( 313 | (start, prefix), 314 | (end, suffix), 315 | ) 316 | #return inliner.apply(tokens=text) 317 | chunks = [ 318 | [a, b] for a, b in inliner.iter_result( 319 | string=text if char else None, 320 | tokens=text if not char else None, 321 | return_tokens=True 322 | ) 323 | ] 324 | if not char: 325 | ellipsis = ellipsis.strip() 326 | if margin and chunks[0][0] == 'string' and len(chunks[0][1]) > margin: 327 | chunks[0][1] = [ellipsis] + chunks[0][1][-1*margin:] 328 | if margin and chunks[-1][0] == 'string' and len(chunks[-1][1]) > margin: 329 | chunks[-1][1] = chunks[-1][1][:margin] + [ellipsis] 330 | level = 1 if chunks[0][0] == 'prefix' else 0 331 | if max_gap: 332 | for i in range(1, len(chunks)-1): 333 | kind, string = chunks[i] 334 | if kind == 'prefix': 335 | level += 1 336 | chunks[i][1] = chunks[i][1] 337 | if kind == 'suffix': 338 | level -= 1 339 | chunks[i][1] = chunks[i][1] 340 | if kind == 'string' and level == 0: 341 | if len(string) > max_gap: 342 | chunks[i][1] = chunks[i][1][:margin] \ 343 | + [ellipsis] + chunks[i][1][-1*margin:] 344 | res = "" 345 | need_space = False 346 | for kind, chunk in chunks: 347 | if kind == "string": 348 | if need_space and not char: 349 | res += " " 350 | res += chunk if char else " ".join(chunk) 351 | need_space = True 352 | else: 353 | if kind == "prefix" and need_space and not char: 354 | res += " " 355 | need_space = False 356 | res += chunk 357 | return res.rstrip() 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /testing/aesop.sacr: -------------------------------------------------------------------------------- 1 | #title:The Peasant and the Eagle (Aesop), translated by G. F. Townsend (1887) 2 | 3 | #textid:aesop 4 | 5 | #textmetadata:work=literature 6 | 7 | {Peasant:function="s subject",head="1",partofspeech="i noun with indefinite article" A Peasant} found {Eagle:function="o object",head="1",partofspeech="i noun with indefinite article" an Eagle captured in {M3:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a trap}}, and much admiring {Eagle:function="o object",head="1",partofspeech="d noun with definite article" the bird}, set {Peasant:function="o object",head="0",partofspeech="s personnal pronoun" him} free. 
{Eagle:function="s subject",head="1",partofspeech="d noun with definite article" The Eagle} did not prove ungrateful to {Peasant:function="o object",head="1",partofspeech="n noun with determiner" {Eagle:function="t other",head="0",partofspeech="e possessive adjective" his} deliverer}, for seeing {Peasant:function="o object",head="1",partofspeech="d noun with definite article" the Peasant sitting under {Wall:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a wall {Wall:function="s subject",head="0",partofspeech="r relative pronoun" which} was not safe}}, {Eagle:function="s subject",head="0",partofspeech="s personnal pronoun" he} flew toward {Peasant:function="a adverbial",head="0",partofspeech="s personnal pronoun" him} and with {M14:function="a adverbial",head="1",partofspeech="n noun with determiner" {Eagle:function="t other",head="0",partofspeech="e possessive adjective" his} talons} snatched {Bundle:function="o object",head="1",partofspeech="i noun with indefinite article" a bundle} from {M17:function="a adverbial",head="1",partofspeech="t noun without determiner" {Peasant:function="t other",head="0",partofspeech="e possessive adjective" his} head}. When {Peasant:function="s subject",head="1",partofspeech="d noun with definite article" the Peasant} rose in pursuit, {Eagle:function="s subject",head="1",partofspeech="d noun with definite article" the Eagle} let {Bundle:function="o object",head="1",partofspeech="d noun with definite article" the bundle} fall again. Taking {Bundle:function="o object",head="0",partofspeech="s personnal pronoun" it} up, {Peasant:function="s subject",head="1",partofspeech="d noun with definite article" the man} returned to {M24:function="a adverbial",head="2",partofspeech="d noun with definite article" the same place}, to find that {Wall:function="s subject",head="1",partofspeech="d noun with definite article" the wall under {Wall:function="a adverbial",head="0",partofspeech="r relative pronoun" which} {Peasant:function="s subject",head="0",partofspeech="s personnal pronoun" he} had been sitting} had fallen to pieces; and {Peasant:function="s subject",head="0",partofspeech="s personnal pronoun" he} marveled at {M29:function="o object",head="1",partofspeech="d noun with definite article" the service} rendered {Peasant:function="o object",head="0",partofspeech="s personnal pronoun" him} by {Eagle:function="a adverbial",head="1",partofspeech="d noun with definite article" the Eagle}. 
8 | 9 | 10 | 11 | #COLOR:Peasant=hsl(25, 100%, 80%) 12 | #COLOR:Eagle=hsl(0, 100%, 80%) 13 | #COLOR:Wall=hsl(50, 100%, 80%) 14 | #COLOR:Bundle=hsl(75, 100%, 80%) 15 | 16 | #TOKENIZATION-TYPE:1 17 | 18 | -------------------------------------------------------------------------------- /testing/caesar.sacr: -------------------------------------------------------------------------------- 1 | #textmetadata:work=politics 2 | 3 | #textid:caesar 4 | 5 | {Caesar:function="s subject",head="0",partofspeech="a name" Gaius Julius Caesar} ({M2:function="t other",head="0",partofspeech="t noun without determiner" 12 or 13 July 100 BC} – {M3:function="t other",head="0",partofspeech="t noun without determiner" 15 March 44 BC}), known by {M6:function="o object",head="1",partofspeech="n noun with determiner" {M5:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} nomen} and {M7:function="o object",head="0",partofspeech="n noun with determiner" cognomen} Julius Caesar}, was {Caesar:function="o object",head="2",partofspeech="i noun with indefinite article" a Roman politician, military general, and historian {Caesar:function="s subject",head="0",partofspeech="r relative pronoun" who} played {M10:function="o object",head="2",partofspeech="i noun with indefinite article" a critical role} in {M12:function="a adverbial",head="1",partofspeech="d noun with definite article" the events {M12:function="s subject",head="0",partofspeech="r relative pronoun" that} led to {M11:function="o object",head="1",partofspeech="d noun with definite article" the demise of {M13:function="m noun modifier",head="2",partofspeech="d noun with definite article" the Roman Republic}} and {M14:function="o object",head="1",partofspeech="d noun with definite article" the rise of {M15:function="m noun modifier",head="2",partofspeech="d noun with definite article" the Roman Empire}}}}. {Caesar:function="s subject",head="0",partofspeech="s personnal pronoun" He} also wrote {M17:function="o object",head="1",partofspeech="t noun without determiner" Latin prose}. 6 | 7 | In {M1:function="a adverbial",head="0",partofspeech="t noun without determiner" 60 BC}, {M23:function="s subject",head="0",partofspeech="a name" {Caesar:function="s subject",head="0",partofspeech="a name" Caesar}, {Crassus:function="s subject",head="0",partofspeech="a name" Crassus} and {Pompey:function="s subject",head="0",partofspeech="a name" Pompey}} formed {M21:function="o object",head="1",partofspeech="d noun with definite article" the First Triumvirate, a political alliance {M21:function="s subject",head="0",partofspeech="r relative pronoun" that} dominated {M20:function="o object",head="1",partofspeech="t noun without determiner" Roman politics} for {M22:function="a adverbial",head="1",partofspeech="t noun without determiner" several years}}. 
{M23:function="s subject",head="0",partofspeech="s personnal pronoun" Their} attempts to amass power as {M8:function="t other",head="0",partofspeech="a name" Populares} were opposed by {M25:function="o object",head="1",partofspeech="a name" the Optimates} within {TheSenate:function="a adverbial",head="2",partofspeech="a name" the Roman Senate}, among {M25:function="a adverbial",head="0",partofspeech="s personnal pronoun" them} {M19:function="a adverbial",head="0",partofspeech="a name" Cato the Younger} with {M26:function="a adverbial",head="2",partofspeech="d noun with definite article" the frequent support of {M27:function="m noun modifier",head="0",partofspeech="a name" Cicero}}. {Caesar:function="s subject",head="0",partofspeech="a name" Caesar} rose to become one of {M28:function="o object",head="3",partofspeech="d noun with definite article" the most powerful politicians in {M29:function="a adverbial",head="2",partofspeech="d noun with definite article" the Roman Republic}} through {M40:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a number of {M31:function="m noun modifier",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="s personnal pronoun" his} accomplishments}}, notably {M33:function="m noun modifier",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="s personnal pronoun" his} victories in {M24:function="a adverbial",head="2",partofspeech="d noun with definite article" the Gallic Wars}, completed by {M35:function="a adverbial",head="0",partofspeech="t noun without determiner" 51 BC}}. During this time, {Caesar:function="s subject",head="0",partofspeech="a name" Caesar} became {Caesar:function="o object",head="3",partofspeech="d noun with definite article" the first Roman general} to cross both {M38:function="o object",head="2",partofspeech="d noun with definite article" {M37:function="o object",head="2",partofspeech="d noun with definite article" the English Channel} and {TheRhine:function="o object",head="2",partofspeech="d noun with definite article" the Rhine River}}, when {Caesar:function="s subject",head="0",partofspeech="s personnal pronoun" he} built {M34:function="o object",head="1",partofspeech="i noun with indefinite article" a bridge across {TheRhine:function="a adverbial",head="1",partofspeech="d noun with definite article" the Rhine}} and crossed {M37:function="o object",head="1",partofspeech="d noun with definite article" the Channel} to invade {Britain:function="o object",head="0",partofspeech="a name" Britain}. {M39:function="s subject",head="2",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="a name" Caesar'}s wars} extended {M42:function="o object",head="2",partofspeech="n noun with determiner" {Rome:function="m noun modifier",head="0",partofspeech="a name" Rome'}s territory} to {Britain:function="a adverbial",head="0",partofspeech="a name" Britain} and past {M50:function="a adverbial",head="0",partofspeech="a name" Gaul}. 
{M40:function="s subject",head="1",partofspeech="n noun with determiner" These achievements} granted {Caesar:function="o object",head="0",partofspeech="s personnal pronoun" him} unmatched {M30:function="o object",head="1",partofspeech="t noun without determiner" military power} and threatened to eclipse {M18:function="o object",head="1",partofspeech="d noun with definite article" the standing of {Pompey:function="m noun modifier",head="0",partofspeech="a name" Pompey, {Pompey:function="s subject",head="0",partofspeech="r relative pronoun" who} had realigned {Pompey:function="o object",head="0",partofspeech="s personnal pronoun" himself} with {TheSenate:function="o object",head="1",partofspeech="d noun with definite article" the Senate} after {M47:function="a adverbial",head="1",partofspeech="d noun with definite article" the death of {Crassus:function="m noun modifier",head="0",partofspeech="a name" Crassus}} in {M9:function="a adverbial",head="0",partofspeech="t noun without determiner" 53 BC}}}. With {M24:function="a adverbial",head="2",partofspeech="d noun with definite article" the Gallic Wars} concluded, {TheSenate:function="s subject",head="1",partofspeech="d noun with definite article" the Senate} ordered {Caesar:function="o object",head="0",partofspeech="a name" Caesar} to step down from {M32:function="o object",head="2",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} military command} and return to {Rome:function="o object",head="0",partofspeech="a name" Rome}. Leaving {M41:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} command} in {M50:function="a adverbial",head="0",partofspeech="a name" Gaul} meant losing {M44:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} immunity from being charged as a criminal for waging {M51:function="o object",head="1",partofspeech="t noun without determiner" unsanctioned wars}}. As a result, {Caesar:function="s subject",head="0",partofspeech="a name" Caesar} found himself with no other options but to cross {M52:function="o object",head="1",partofspeech="d noun with definite article" the Rubicon} with {M53:function="a adverbial",head="3",partofspeech="d noun with definite article" the 13th Legion}, leaving {M54:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} province} and illegally entering {M55:function="o object",head="1",partofspeech="a name" Roman Italy} under arms. 
{M56:function="s subject",head="0",partofspeech="s personnal pronoun" This} began {M59:function="o object",head="3",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="a name" Caesar'}s civil war}, and {M58:function="s subject",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} victory in {M59:function="a adverbial",head="1",partofspeech="d noun with definite article" the war}} put {Caesar:function="o object",head="0",partofspeech="s personnal pronoun" him} in {M57:function="a adverbial",head="2",partofspeech="i noun with indefinite article" an unrivaled position of {M60:function="m noun modifier",head="0",partofspeech="t noun without determiner" {M61:function="m noun modifier",head="0",partofspeech="t noun without determiner" power} and {M62:function="m noun modifier",head="0",partofspeech="t noun without determiner" influence}}}. 8 | 9 | 10 | 11 | #COLOR:Caesar=hsl(0, 100%, 80%) 12 | #COLOR:M12=hsl(25, 100%, 80%) 13 | #COLOR:Crassus=hsl(275, 100%, 80%) 14 | #COLOR:Pompey=hsl(225, 100%, 80%) 15 | #COLOR:M23=hsl(75, 100%, 80%) 16 | #COLOR:M21=hsl(50, 100%, 80%) 17 | #COLOR:M25=hsl(100, 100%, 80%) 18 | #COLOR:TheSenate=hsl(250, 100%, 80%) 19 | #COLOR:M40=hsl(200, 100%, 80%) 20 | #COLOR:M24=hsl(300, 100%, 80%) 21 | #COLOR:M37=hsl(150, 100%, 80%) 22 | #COLOR:TheRhine=hsl(125, 100%, 80%) 23 | #COLOR:Britain=hsl(175, 100%, 80%) 24 | #COLOR:Rome=hsl(325, 100%, 80%) 25 | #COLOR:M50=hsl(350, 100%, 80%) 26 | #COLOR:M59=hsl(0, 100%, 70%) 27 | 28 | #TOKENIZATION-TYPE:1 29 | 30 | -------------------------------------------------------------------------------- /testing/cicero.sacr: -------------------------------------------------------------------------------- 1 | #textmetadata:work=politics 2 | 3 | #textid:cicero 4 | 5 | {Cicero:function="s subject",head="0",partofspeech="a name" Marcus Tullius Cicero}({M2:function="t other",head="0",partofspeech="t noun without determiner" 106 BC} – {M3:function="t other",head="0",partofspeech="t noun without determiner" 7 December 43 BC}) was {Cicero:function="o object",head="2",partofspeech="i noun with indefinite article" a Roman {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" statesman}, {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" orator}, {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" lawyer} and {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" philosopher}, {Cicero:function="s subject",head="0",partofspeech="r relative pronoun" who} served as {Cicero:function="o object",head="0",partofspeech="t noun without determiner" consul} in {M10:function="a adverbial",head="1",partofspeech="d noun with definite article" the year 63 BC}}. 
{Cicero:function="s subject",head="0",partofspeech="s personnal pronoun" He} came from {M1:function="o object",head="3",partofspeech="i noun with indefinite article" a wealthy municipal family of {M5:function="m noun modifier",head="3",partofspeech="d noun with definite article" the Roman equestrian order}}, and is considered {Cicero:function="o object",head="0",partofspeech="n noun with determiner" one of {M7:function="o object",head="4",partofspeech="n noun with determiner" {M11:function="o object",head="3",partofspeech="n noun with determiner" {M9:function="m noun modifier",head="0",partofspeech="a name" Rome'}s greatest orators} and {M8:function="o object",head="1",partofspeech="n noun with determiner" prose stylists}}}. 6 | 7 | {Cicero:function="s subject",head="0",partofspeech="s personnal pronoun" His} influence on {Latin:function="o object",head="2",partofspeech="d noun with definite article" the Latin language} was so immense that {M14:function="s subject",head="2",partofspeech="d noun with definite article" the subsequent history of {M15:function="m noun modifier",head="0",partofspeech="t noun without determiner" prose}}, not only in {Latin:function="a adverbial",head="0",partofspeech="t noun without determiner" Latin} but in {M17:function="a adverbial",head="1",partofspeech="t noun without determiner" European languages} up to {M18:function="a adverbial",head="3",partofspeech="d noun with definite article" the 19th century}, was said to be either {M19:function="o object",head="1",partofspeech="i noun with indefinite article" {M20:function="o object",head="1",partofspeech="i noun with indefinite article" a reaction against} or {M21:function="o object",head="1",partofspeech="i noun with indefinite article" a return to {M23:function="o object",head="1",partofspeech="n noun with determiner" {Cicero:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} style}}}. {Cicero:function="s subject",head="0",partofspeech="a name" Cicero} introduced the {M12:function="o object",head="0",partofspeech="a name" Romans} to {M22:function="o object",head="2",partofspeech="d noun with definite article" the chief schools of {M32:function="m noun modifier",head="1",partofspeech="t noun without determiner" Greek philosophy}} and created {M4:function="o object",head="3",partofspeech="i noun with indefinite article" a Latin philosophical vocabulary} (with {M31:function="a adverbial",head="0",partofspeech="i noun with indefinite article" {M25:function="a adverbial",head="0",partofspeech="t noun without determiner" neologisms} such as {M26:function="a adverbial",head="0",partofspeech="t noun without determiner" evidentia}, {M27:function="a adverbial",head="0",partofspeech="t noun without determiner" humanitas}, {M28:function="a adverbial",head="0",partofspeech="t noun without determiner" qualitas}, {M29:function="a adverbial",head="0",partofspeech="t noun without determiner" quantitas}, and {M30:function="a adverbial",head="0",partofspeech="t noun without determiner" essentia}}) distinguishing {Cicero:function="o object",head="0",partofspeech="s personnal pronoun" himself} as {Cicero:function="o object",head="1",partofspeech="i noun with indefinite article" a {Cicero:function="o object",head="0",partofspeech="t noun without determiner" translator} and {Cicero:function="o object",head="0",partofspeech="t noun without determiner" philosopher}}. 
8 | 9 | 10 | 11 | #COLOR:Cicero=hsl(0, 100%, 80%) 12 | #COLOR:Latin=hsl(25, 100%, 80%) 13 | 14 | #TOKENIZATION-TYPE:1 15 | 16 | -------------------------------------------------------------------------------- /testing/lucian_speakers.sacr: -------------------------------------------------------------------------------- 1 | #title: Lucian, Dialogues of the Dead, 4: Hermes and Charon 2 | 3 | #speaker: Hermes 4 | Ferryman, what do you say to settling up accounts? It will prevent any 5 | unpleasantness later on. 6 | 7 | #speaker: Charon 8 | Very good. It does save trouble to get these things straight. 9 | 10 | #speaker: Hermes 11 | One anchor, to your order, five shillings. 12 | 13 | #speaker: Charon 14 | That is a lot of money. 15 | 16 | #speaker: Hermes 17 | So help me Pluto, it is what I had to pay. One rowlock-strap, fourpence. 18 | 19 | #speaker: Charon 20 | Five and four; put that down. 21 | 22 | #speaker: Hermes 23 | Then there was a needle, for mending the sail; tenpence. 24 | 25 | #speaker: Charon 26 | Down with it. 27 | 28 | -------------------------------------------------------------------------------- /testing/pliny.sacr: -------------------------------------------------------------------------------- 1 | #textmetadata:work=science 2 | 3 | #textid:pliny 4 | 5 | {PlinyTheElder:function="s subject",head="0",partofspeech="a name" Pliny the Elder} (AD {PlinyTheElder:function="t other",head="0",partofspeech="t noun without determiner" 23}–{PlinyTheElder:function="t other",head="0",partofspeech="t noun without determiner" 79}) was {PlinyTheElder:function="o object",head="2",partofspeech="i noun with indefinite article" {PlinyTheElder:function="o object",head="2",partofspeech="i noun with indefinite article" a Roman author}, {PlinyTheElder:function="o object",head="0",partofspeech="i noun with indefinite article" naturalist} and {PlinyTheElder:function="o object",head="1",partofspeech="i noun with indefinite article" natural philosopher}, {PlinyTheElder:function="o object",head="4",partofspeech="i noun with indefinite article" a naval and army commander of {PlinyTheElder:function="m noun modifier",head="3",partofspeech="d noun with definite article" the early Roman Empire}}, and {PlinyTheElder:function="o object",head="0",partofspeech="t noun without determiner" friend of {PlinyTheElder:function="m noun modifier",head="1",partofspeech="a name" emperor Vespasian}}}. 6 | 7 | Spending most of {M12:function="a adverbial",head="2",partofspeech="n noun with determiner" {PlinyTheElder:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} spare time} studying, writing, and investigating {M2:function="o object",head="3",partofspeech="t noun without determiner" natural and geographic phenomena} in {M3:function="a adverbial",head="1",partofspeech="d noun with definite article" the field}, {PlinyTheElder:function="s subject",head="0",partofspeech="a name" Pliny} wrote {M7:function="o object",head="1",partofspeech="d noun with definite article" the encyclopedic Naturalis Historia (Natural History), {M7:function="s subject",head="0",partofspeech="r relative pronoun" which} became {M6:function="o object",head="2",partofspeech="i noun with indefinite article" an editorial model for {M5:function="m noun modifier",head="0",partofspeech="t noun without determiner" encyclopedias}}}. 
{PlinyTheYounger:function="s subject",head="1",partofspeech="n noun with determiner" {PlinyTheElder:function="m noun modifier",head="0",partofspeech="e possessive adjective" His} nephew}, {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="a name" Pliny the Younger}, wrote of {PlinyTheElder:function="o object",head="0",partofspeech="s personnal pronoun" him} in {M1:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a letter to {Tacitus:function="a adverbial",head="2",partofspeech="d noun with definite article" the historian Tacitus}}: For {M10:function="a adverbial",head="1",partofspeech="n noun with determiner" {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="e possessive adjective" my} part} {PlinyTheYounger:function="s subject",head="0",partofspeech="s personnal pronoun" I} deem {M14:function="o object",head="0",partofspeech="r relative pronoun" those blessed to {M14:function="o object",head="0",partofspeech="r relative pronoun" whom}, by favour of {M13:function="m noun modifier",head="1",partofspeech="d noun with definite article" the gods}, it has been granted either to do {M15:function="o object",head="0",partofspeech="r relative pronoun" what is worth writing of}, or to write {M11:function="o object",head="0",partofspeech="r relative pronoun" what is worth reading}}; above measure blessed {M17:function="o object",head="0",partofspeech="r relative pronoun" those on {M17:function="o object",head="0",partofspeech="r relative pronoun" whom} both gifts have been conferred}. In {M16:function="m noun modifier",head="2",partofspeech="d noun with definite article" the latter number} will be {PlinyTheElder:function="o object",head="1",partofspeech="n noun with determiner" {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="e possessive adjective" my} uncle}, by virtue of {M18:function="m noun modifier",head="1",partofspeech="n noun with determiner" {M4:function="",head="",partofspeech="" {PlinyTheElder:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} own} and of {M9:function="m noun modifier",head="1",partofspeech="n noun with determiner" {Tacitus:function="m noun modifier",head="0",partofspeech="e possessive adjective" your} compositions}}. 8 | 9 | {PlinyTheYounger:function="s subject",head="0",partofspeech="a name" Pliny the Younger} refers to {M20:function="o object",head="2",partofspeech="n noun with determiner" {Tacitus:function="m noun modifier",head="0",partofspeech="a name" Tacitus}’s reliance} upon {M21:function="a adverbial",head="3",partofspeech="n noun with determiner" {PlinyTheElder:function="m noun modifier",head="1",partofspeech="n noun with determiner" {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} uncle'}s book}, {M22:function="m noun modifier",head="1",partofspeech="d noun with definite article" the History of the German Wars}. 10 | 11 | 12 | 13 | #COLOR:PlinyTheElder=hsl(0, 100%, 80%) 14 | #COLOR:M7=hsl(25, 100%, 80%) 15 | #COLOR:PlinyTheYounger=hsl(50, 100%, 80%) 16 | #COLOR:Tacitus=hsl(125, 100%, 80%) 17 | #COLOR:M14=hsl(75, 100%, 80%) 18 | #COLOR:M17=hsl(100, 100%, 80%) 19 | 20 | #TOKENIZATION-TYPE:1 21 | 22 | -------------------------------------------------------------------------------- /testing/simple.sacr: -------------------------------------------------------------------------------- 1 | {one {one His} head} hurts. 
2 | -------------------------------------------------------------------------------- /testing/testing_sacr2conll.conll: -------------------------------------------------------------------------------- 1 | #begin document (aesop.sacr); part 000 2 | 0 A (0 3 | 1 Peasant 0) 4 | 2 found - 5 | 3 an (1 6 | 4 Eagle - 7 | 5 captured - 8 | 6 in - 9 | 7 a (2 10 | 8 trap 2)_1) 11 | 9 , - 12 | 10 and - 13 | 11 much - 14 | 12 admiring - 15 | 13 the (1 16 | 14 bird 1) 17 | 15 , - 18 | 16 set - 19 | 17 him (0) 20 | 18 free - 21 | 19 . - 22 | 23 | 0 The (1 24 | 1 Eagle 1) 25 | 2 did - 26 | 3 not - 27 | 4 prove - 28 | 5 ungrateful - 29 | 6 to - 30 | 7 his (0_(1) 31 | 8 deliverer 0) 32 | 9 , - 33 | 10 for - 34 | 11 seeing - 35 | 12 the (0 36 | 13 Peasant - 37 | 14 sitting - 38 | 15 under - 39 | 16 a (3 40 | 17 wall - 41 | 18 which (3) 42 | 19 was - 43 | 20 not - 44 | 21 safe 3)_0) 45 | 22 , - 46 | 23 he (1) 47 | 24 flew - 48 | 25 toward - 49 | 26 him (0) 50 | 27 and - 51 | 28 with - 52 | 29 his (4_(1) 53 | 30 talons 4) 54 | 31 snatched - 55 | 32 a (5 56 | 33 bundle 5) 57 | 34 from - 58 | 35 his (6_(0) 59 | 36 head 6) 60 | 37 . - 61 | 62 | 0 When - 63 | 1 the (0 64 | 2 Peasant 0) 65 | 3 rose - 66 | 4 in - 67 | 5 pursuit - 68 | 6 , - 69 | 7 the (1 70 | 8 Eagle 1) 71 | 9 let - 72 | 10 the (5 73 | 11 bundle 5) 74 | 12 fall - 75 | 13 again - 76 | 14 . - 77 | 78 | 0 Taking - 79 | 1 it (5) 80 | 2 up - 81 | 3 , - 82 | 4 the (0 83 | 5 man 0) 84 | 6 returned - 85 | 7 to - 86 | 8 the (7 87 | 9 same - 88 | 10 place 7) 89 | 11 , - 90 | 12 to - 91 | 13 find - 92 | 14 that - 93 | 15 the (3 94 | 16 wall - 95 | 17 under - 96 | 18 which (3) 97 | 19 he (0) 98 | 20 had - 99 | 21 been - 100 | 22 sitting 3) 101 | 23 had - 102 | 24 fallen - 103 | 25 to - 104 | 26 pieces - 105 | 27 ; - 106 | 28 and - 107 | 29 he (0) 108 | 30 marveled - 109 | 31 at - 110 | 32 the (8 111 | 33 service 8) 112 | 34 rendered - 113 | 35 him (0) 114 | 36 by - 115 | 37 the (1 116 | 38 Eagle 1) 117 | 39 . - 118 | #end document 119 | 120 | 121 | #begin document (caesar.sacr); part 000 122 | 0 Gaius (0 123 | 1 Julius - 124 | 2 Caesar 0) 125 | 3 ( - 126 | 4 12 (1 127 | 5 or - 128 | 6 13 - 129 | 7 July - 130 | 8 100 - 131 | 9 BC 1) 132 | 10 – - 133 | 11 15 (2 134 | 12 March - 135 | 13 44 - 136 | 14 BC 2) 137 | 15 ) - 138 | 16 , - 139 | 17 known - 140 | 18 by - 141 | 19 his (3_(4_(0) 142 | 20 nomen 4) 143 | 21 and - 144 | 22 cognomen (5) 145 | 23 Julius - 146 | 24 Caesar 3) 147 | 25 , - 148 | 26 was - 149 | 27 a (0 150 | 28 Roman - 151 | 29 politician - 152 | 30 , - 153 | 31 military - 154 | 32 general - 155 | 33 , - 156 | 34 and - 157 | 35 historian - 158 | 36 who (0) 159 | 37 played - 160 | 38 a (6 161 | 39 critical - 162 | 40 role 6) 163 | 41 in - 164 | 42 the (7 165 | 43 events - 166 | 44 that (7) 167 | 45 led - 168 | 46 to - 169 | 47 the (8 170 | 48 demise - 171 | 49 of - 172 | 50 the (9 173 | 51 Roman - 174 | 52 Republic 9)_8) 175 | 53 and - 176 | 54 the (10 177 | 55 rise - 178 | 56 of - 179 | 57 the (11 180 | 58 Roman - 181 | 59 Empire 11)_10)_7)_0) 182 | 60 . - 183 | 184 | 0 He (0) 185 | 1 also - 186 | 2 wrote - 187 | 3 Latin (12 188 | 4 prose 12) 189 | 5 . 
- 190 | 191 | 0 In - 192 | 1 60 (13 193 | 2 BC 13) 194 | 3 , - 195 | 4 Caesar (14_(0) 196 | 5 , - 197 | 6 Crassus (15) 198 | 7 and - 199 | 8 Pompey (16)_14) 200 | 9 formed - 201 | 10 the (17 202 | 11 First - 203 | 12 Triumvirate - 204 | 13 , - 205 | 14 a - 206 | 15 political - 207 | 16 alliance - 208 | 17 that (17) 209 | 18 dominated - 210 | 19 Roman (18 211 | 20 politics 18) 212 | 21 for - 213 | 22 several (19 214 | 23 years 19)_17) 215 | 24 . - 216 | 217 | 0 Their (14) 218 | 1 attempts - 219 | 2 to - 220 | 3 amass - 221 | 4 power - 222 | 5 as - 223 | 6 Populares (20) 224 | 7 were - 225 | 8 opposed - 226 | 9 by - 227 | 10 the (21 228 | 11 Optimates 21) 229 | 12 within - 230 | 13 the (22 231 | 14 Roman - 232 | 15 Senate 22) 233 | 16 , - 234 | 17 among - 235 | 18 them (21) 236 | 19 Cato (23 237 | 20 the - 238 | 21 Younger 23) 239 | 22 with - 240 | 23 the (24 241 | 24 frequent - 242 | 25 support - 243 | 26 of - 244 | 27 Cicero (25)_24) 245 | 28 . - 246 | 247 | 0 Caesar (0) 248 | 1 rose - 249 | 2 to - 250 | 3 become - 251 | 4 one - 252 | 5 of - 253 | 6 the (26 254 | 7 most - 255 | 8 powerful - 256 | 9 politicians - 257 | 10 in - 258 | 11 the (27 259 | 12 Roman - 260 | 13 Republic 27)_26) 261 | 14 through - 262 | 15 a (28 263 | 16 number - 264 | 17 of - 265 | 18 his (29_(0) 266 | 19 accomplishments 29)_28) 267 | 20 , - 268 | 21 notably - 269 | 22 his (30_(0) 270 | 23 victories - 271 | 24 in - 272 | 25 the (31 273 | 26 Gallic - 274 | 27 Wars 31) 275 | 28 , - 276 | 29 completed - 277 | 30 by - 278 | 31 51 (32 279 | 32 BC 32)_30) 280 | 33 . - 281 | 282 | 0 During - 283 | 1 this - 284 | 2 time - 285 | 3 , - 286 | 4 Caesar (0) 287 | 5 became - 288 | 6 the (0 289 | 7 first - 290 | 8 Roman - 291 | 9 general 0) 292 | 10 to - 293 | 11 cross - 294 | 12 both - 295 | 13 the (33_(34 296 | 14 English - 297 | 15 Channel 34) 298 | 16 and - 299 | 17 the (35 300 | 18 Rhine - 301 | 19 River 35)_33) 302 | 20 , - 303 | 21 when - 304 | 22 he (0) 305 | 23 built - 306 | 24 a (36 307 | 25 bridge - 308 | 26 across - 309 | 27 the (35 310 | 28 Rhine 35)_36) 311 | 29 and - 312 | 30 crossed - 313 | 31 the (34 314 | 32 Channel 34) 315 | 33 to - 316 | 34 invade - 317 | 35 Britain (37) 318 | 36 . - 319 | 320 | 0 Caesar' (38_(0) 321 | 1 s - 322 | 2 wars 38) 323 | 3 extended - 324 | 4 Rome' (39_(40) 325 | 5 s - 326 | 6 territory 39) 327 | 7 to - 328 | 8 Britain (37) 329 | 9 and - 330 | 10 past - 331 | 11 Gaul (41) 332 | 12 . - 333 | 334 | 0 These (28 335 | 1 achievements 28) 336 | 2 granted - 337 | 3 him (0) 338 | 4 unmatched - 339 | 5 military (42 340 | 6 power 42) 341 | 7 and - 342 | 8 threatened - 343 | 9 to - 344 | 10 eclipse - 345 | 11 the (43 346 | 12 standing - 347 | 13 of - 348 | 14 Pompey (16 349 | 15 , - 350 | 16 who (16) 351 | 17 had - 352 | 18 realigned - 353 | 19 himself (16) 354 | 20 with - 355 | 21 the (22 356 | 22 Senate 22) 357 | 23 after - 358 | 24 the (44 359 | 25 death - 360 | 26 of - 361 | 27 Crassus (15)_44) 362 | 28 in - 363 | 29 53 (45 364 | 30 BC 45)_16)_43) 365 | 31 . - 366 | 367 | 0 With - 368 | 1 the (31 369 | 2 Gallic - 370 | 3 Wars 31) 371 | 4 concluded - 372 | 5 , - 373 | 6 the (22 374 | 7 Senate 22) 375 | 8 ordered - 376 | 9 Caesar (0) 377 | 10 to - 378 | 11 step - 379 | 12 down - 380 | 13 from - 381 | 14 his (46_(0) 382 | 15 military - 383 | 16 command 46) 384 | 17 and - 385 | 18 return - 386 | 19 to - 387 | 20 Rome (40) 388 | 21 . 
- 389 | 390 | 0 Leaving - 391 | 1 his (47_(0) 392 | 2 command 47) 393 | 3 in - 394 | 4 Gaul (41) 395 | 5 meant - 396 | 6 losing - 397 | 7 his (48_(0) 398 | 8 immunity - 399 | 9 from - 400 | 10 being - 401 | 11 charged - 402 | 12 as - 403 | 13 a - 404 | 14 criminal - 405 | 15 for - 406 | 16 waging - 407 | 17 unsanctioned (49 408 | 18 wars 49)_48) 409 | 19 . - 410 | 411 | 0 As - 412 | 1 a - 413 | 2 result - 414 | 3 , - 415 | 4 Caesar (0) 416 | 5 found - 417 | 6 himself - 418 | 7 with - 419 | 8 no - 420 | 9 other - 421 | 10 options - 422 | 11 but - 423 | 12 to - 424 | 13 cross - 425 | 14 the (50 426 | 15 Rubicon 50) 427 | 16 with - 428 | 17 the (51 429 | 18 13th - 430 | 19 Legion 51) 431 | 20 , - 432 | 21 leaving - 433 | 22 his (52_(0) 434 | 23 province 52) 435 | 24 and - 436 | 25 illegally - 437 | 26 entering - 438 | 27 Roman (53 439 | 28 Italy 53) 440 | 29 under - 441 | 30 arms - 442 | 31 . - 443 | 444 | 0 This (54) 445 | 1 began - 446 | 2 Caesar' (55_(0) 447 | 3 s - 448 | 4 civil - 449 | 5 war 55) 450 | 6 , - 451 | 7 and - 452 | 8 his (56_(0) 453 | 9 victory - 454 | 10 in - 455 | 11 the (55 456 | 12 war 55)_56) 457 | 13 put - 458 | 14 him (0) 459 | 15 in - 460 | 16 an (57 461 | 17 unrivaled - 462 | 18 position - 463 | 19 of - 464 | 20 power (58_(59) 465 | 21 and - 466 | 22 influence (60)_58)_57) 467 | 23 . - 468 | #end document 469 | 470 | 471 | #begin document (cicero.sacr); part 000 472 | 0 Marcus (0 473 | 1 Tullius - 474 | 2 Cicero 0) 475 | 3 ( - 476 | 4 106 (1 477 | 5 BC 1) 478 | 6 – - 479 | 7 7 (2 480 | 8 December - 481 | 9 43 - 482 | 10 BC 2) 483 | 11 ) - 484 | 12 was - 485 | 13 a (0 486 | 14 Roman - 487 | 15 statesman (0) 488 | 16 , - 489 | 17 orator (0) 490 | 18 , - 491 | 19 lawyer (0) 492 | 20 and - 493 | 21 philosopher (0) 494 | 22 , - 495 | 23 who (0) 496 | 24 served - 497 | 25 as - 498 | 26 consul (0) 499 | 27 in - 500 | 28 the (3 501 | 29 year - 502 | 30 63 - 503 | 31 BC 3)_0) 504 | 32 . - 505 | 506 | 0 He (0) 507 | 1 came - 508 | 2 from - 509 | 3 a (4 510 | 4 wealthy - 511 | 5 municipal - 512 | 6 family - 513 | 7 of - 514 | 8 the (5 515 | 9 Roman - 516 | 10 equestrian - 517 | 11 order 5)_4) 518 | 12 , - 519 | 13 and - 520 | 14 is - 521 | 15 considered - 522 | 16 one (0 523 | 17 of - 524 | 18 Rome' (6_(7_(8) 525 | 19 s - 526 | 20 greatest - 527 | 21 orators 7) 528 | 22 and - 529 | 23 prose (9 530 | 24 stylists 9)_6)_0) 531 | 25 . - 532 | 533 | 0 His (0) 534 | 1 influence - 535 | 2 on - 536 | 3 the (10 537 | 4 Latin - 538 | 5 language 10) 539 | 6 was - 540 | 7 so - 541 | 8 immense - 542 | 9 that - 543 | 10 the (11 544 | 11 subsequent - 545 | 12 history - 546 | 13 of - 547 | 14 prose (12)_11) 548 | 15 , - 549 | 16 not - 550 | 17 only - 551 | 18 in - 552 | 19 Latin (10) 553 | 20 but - 554 | 21 in - 555 | 22 European (13 556 | 23 languages 13) 557 | 24 up - 558 | 25 to - 559 | 26 the (14 560 | 27 19th - 561 | 28 century 14) 562 | 29 , - 563 | 30 was - 564 | 31 said - 565 | 32 to - 566 | 33 be - 567 | 34 either - 568 | 35 a (15_(16 569 | 36 reaction - 570 | 37 against 16) 571 | 38 or - 572 | 39 a (17 573 | 40 return - 574 | 41 to - 575 | 42 his (18_(0) 576 | 43 style 18)_17)_15) 577 | 44 . 
- 578 | 579 | 0 Cicero (0) 580 | 1 introduced - 581 | 2 the - 582 | 3 Romans (19) 583 | 4 to - 584 | 5 the (20 585 | 6 chief - 586 | 7 schools - 587 | 8 of - 588 | 9 Greek (21 589 | 10 philosophy 21)_20) 590 | 11 and - 591 | 12 created - 592 | 13 a (22 593 | 14 Latin - 594 | 15 philosophical - 595 | 16 vocabulary 22) 596 | 17 ( - 597 | 18 with - 598 | 19 neologisms (23_(24) 599 | 20 such - 600 | 21 as - 601 | 22 evidentia (25) 602 | 23 , - 603 | 24 humanitas (26) 604 | 25 , - 605 | 26 qualitas (27) 606 | 27 , - 607 | 28 quantitas (28) 608 | 29 , - 609 | 30 and - 610 | 31 essentia (29)_23) 611 | 32 ) - 612 | 33 distinguishing - 613 | 34 himself (0) 614 | 35 as - 615 | 36 a (0 616 | 37 translator (0) 617 | 38 and - 618 | 39 philosopher (0)_0) 619 | 40 . - 620 | #end document 621 | 622 | 623 | #begin document (pliny.sacr); part 000 624 | 0 Pliny (0 625 | 1 the - 626 | 2 Elder 0) 627 | 3 ( - 628 | 4 AD - 629 | 5 23 (0) 630 | 6 – - 631 | 7 79 (0) 632 | 8 ) - 633 | 9 was - 634 | 10 a (0_(0 635 | 11 Roman - 636 | 12 author 0) 637 | 13 , - 638 | 14 naturalist (0) 639 | 15 and - 640 | 16 natural (0 641 | 17 philosopher 0) 642 | 18 , - 643 | 19 a (0 644 | 20 naval - 645 | 21 and - 646 | 22 army - 647 | 23 commander - 648 | 24 of - 649 | 25 the (0 650 | 26 early - 651 | 27 Roman - 652 | 28 Empire 0)_0) 653 | 29 , - 654 | 30 and - 655 | 31 friend (0 656 | 32 of - 657 | 33 emperor (0 658 | 34 Vespasian 0)_0)_0) 659 | 35 . - 660 | 661 | 0 Spending - 662 | 1 most - 663 | 2 of - 664 | 3 his (1_(0) 665 | 4 spare - 666 | 5 time 1) 667 | 6 studying - 668 | 7 , - 669 | 8 writing - 670 | 9 , - 671 | 10 and - 672 | 11 investigating - 673 | 12 natural (2 674 | 13 and - 675 | 14 geographic - 676 | 15 phenomena 2) 677 | 16 in - 678 | 17 the (3 679 | 18 field 3) 680 | 19 , - 681 | 20 Pliny (0) 682 | 21 wrote - 683 | 22 the (4 684 | 23 encyclopedic - 685 | 24 Naturalis - 686 | 25 Historia - 687 | 26 ( - 688 | 27 Natural - 689 | 28 History - 690 | 29 ) - 691 | 30 , - 692 | 31 which (4) 693 | 32 became - 694 | 33 an (5 695 | 34 editorial - 696 | 35 model - 697 | 36 for - 698 | 37 encyclopedias (6)_5)_4) 699 | 38 . - 700 | 701 | 0 His (7_(0) 702 | 1 nephew 7) 703 | 2 , - 704 | 3 Pliny (7 705 | 4 the - 706 | 5 Younger 7) 707 | 6 , - 708 | 7 wrote - 709 | 8 of - 710 | 9 him (0) 711 | 10 in - 712 | 11 a (8 713 | 12 letter - 714 | 13 to - 715 | 14 the (9 716 | 15 historian - 717 | 16 Tacitus 9)_8) 718 | 17 : - 719 | 18 For - 720 | 19 my (10_(7) 721 | 20 part 10) 722 | 21 I (7) 723 | 22 deem - 724 | 23 those (11 725 | 24 blessed - 726 | 25 to - 727 | 26 whom (11) 728 | 27 , - 729 | 28 by - 730 | 29 favour - 731 | 30 of - 732 | 31 the (12 733 | 32 gods 12) 734 | 33 , - 735 | 34 it - 736 | 35 has - 737 | 36 been - 738 | 37 granted - 739 | 38 either - 740 | 39 to - 741 | 40 do - 742 | 41 what (13 743 | 42 is - 744 | 43 worth - 745 | 44 writing - 746 | 45 of 13) 747 | 46 , - 748 | 47 or - 749 | 48 to - 750 | 49 write - 751 | 50 what (14 752 | 51 is - 753 | 52 worth - 754 | 53 reading 14)_11) 755 | 54 ; - 756 | 55 above - 757 | 56 measure - 758 | 57 blessed - 759 | 58 those (15 760 | 59 on - 761 | 60 whom (15) 762 | 61 both - 763 | 62 gifts - 764 | 63 have - 765 | 64 been - 766 | 65 conferred 15) 767 | 66 . - 768 | 769 | 0 In - 770 | 1 the (16 771 | 2 latter - 772 | 3 number 16) 773 | 4 will - 774 | 5 be - 775 | 6 my (0_(7) 776 | 7 uncle 0) 777 | 8 , - 778 | 9 by - 779 | 10 virtue - 780 | 11 of - 781 | 12 his (17_(18_(0) 782 | 13 own 18) 783 | 14 and - 784 | 15 of - 785 | 16 your (19_(9) 786 | 17 compositions 19)_17) 787 | 18 . 
- 788 | 789 | 0 Pliny (7 790 | 1 the - 791 | 2 Younger 7) 792 | 3 refers - 793 | 4 to - 794 | 5 Tacitus (20_(9) 795 | 6 ’ - 796 | 7 s - 797 | 8 reliance 20) 798 | 9 upon - 799 | 10 his (21_(0_(7) 800 | 11 uncle' 0) 801 | 12 s - 802 | 13 book 21) 803 | 14 , - 804 | 15 the (22 805 | 16 History - 806 | 17 of - 807 | 18 the - 808 | 19 German - 809 | 20 Wars 22) 810 | 21 . - 811 | #end document 812 | 813 | 814 | #begin document (simple.sacr); part 000 815 | 0 His (0_(0) 816 | 1 head 0) 817 | 2 hurts - 818 | 3 . - 819 | #end document 820 | -------------------------------------------------------------------------------- /testing_conll2sacr/_aesop.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 A Peasant} found {C1 an Eagle captured in {C2 a trap}} , and much admiring {C1 the bird} , set {C0 him} free . 2 | 3 | {C1 The Eagle} did not prove ungrateful to {C0 {C1 his} deliverer} , for seeing {C0 the Peasant sitting under {C3 a wall {C3 which} was not safe}} , {C1 he} flew toward {C0 him} and with {C4 {C1 his} talons} snatched {C5 a bundle} from {C6 {C0 his} head} . 4 | 5 | When {C0 the Peasant} rose in pursuit , {C1 the Eagle} let {C5 the bundle} fall again . 6 | 7 | Taking {C5 it} up , {C0 the man} returned to {C7 the same place} , to find that {C3 the wall under {C3 which} {C0 he} had been sitting} had fallen to pieces ; and {C0 he} marveled at {C8 the service} rendered {C0 him} by {C1 the Eagle} . 8 | 9 | -------------------------------------------------------------------------------- /testing_conll2sacr/_ceasar.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 Gaius Julius Caesar} ( {C1 12 or 13 July 100 BC} – {C2 15 March 44 BC} ) , known by {C3 {C4 {C0 his} nomen} and {C5 cognomen} Julius Caesar} , was {C0 a Roman politician , military general , and historian {C0 who} played {C6 a critical role} in {C7 the events {C7 that} led to {C8 the demise of {C9 the Roman Republic}} and {C10 the rise of {C11 the Roman Empire}}}} . 2 | 3 | {C0 He} also wrote {C12 Latin prose} . 4 | 5 | In {C13 60 BC} , {C14 {C0 Caesar} , {C15 Crassus} and {C16 Pompey}} formed {C17 the First Triumvirate , a political alliance {C17 that} dominated {C18 Roman politics} for {C19 several years}} . 6 | 7 | {C14 Their} attempts to amass power as {C20 Populares} were opposed by {C21 the Optimates} within {C22 the Roman Senate} , among {C21 them} {C23 Cato the Younger} with {C24 the frequent support of {C25 Cicero}} . 8 | 9 | {C0 Caesar} rose to become one of {C26 the most powerful politicians in {C27 the Roman Republic}} through {C28 a number of {C29 {C0 his} accomplishments}} , notably {C30 {C0 his} victories in {C31 the Gallic Wars} , completed by {C32 51 BC}} . 10 | 11 | During this time , {C0 Caesar} became {C0 the first Roman general} to cross both {C33 {C34 the English Channel} and {C35 the Rhine River}} , when {C0 he} built {C36 a bridge across {C35 the Rhine}} and crossed {C34 the Channel} to invade {C37 Britain} . 12 | 13 | {C38 {C0 Caesar'} s wars} extended {C39 {C40 Rome'} s territory} to {C37 Britain} and past {C41 Gaul} . 14 | 15 | {C28 These achievements} granted {C0 him} unmatched {C42 military power} and threatened to eclipse {C43 the standing of {C16 Pompey , {C16 who} had realigned {C16 himself} with {C22 the Senate} after {C44 the death of {C15 Crassus}} in {C45 53 BC}}} . 
16 | 17 | With {C31 the Gallic Wars} concluded , {C22 the Senate} ordered {C0 Caesar} to step down from {C46 {C0 his} military command} and return to {C40 Rome} . 18 | 19 | Leaving {C47 {C0 his} command} in {C41 Gaul} meant losing {C48 {C0 his} immunity from being charged as a criminal for waging {C49 unsanctioned wars}} . 20 | 21 | As a result , {C0 Caesar} found himself with no other options but to cross {C50 the Rubicon} with {C51 the 13th Legion} , leaving {C52 {C0 his} province} and illegally entering {C53 Roman Italy} under arms . 22 | 23 | {C54 This} began {C55 {C0 Caesar'} s civil war} , and {C56 {C0 his} victory in {C55 the war}} put {C0 him} in {C57 an unrivaled position of {C58 {C59 power} and {C60 influence}}} . 24 | 25 | -------------------------------------------------------------------------------- /testing_conll2sacr/_cicero.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 Marcus Tullius Cicero} ( {C1 106 BC} – {C2 7 December 43 BC} ) was {C0 a Roman {C0 statesman} , {C0 orator} , {C0 lawyer} and {C0 philosopher} , {C0 who} served as {C0 consul} in {C3 the year 63 BC}} . 2 | 3 | {C0 He} came from {C4 a wealthy municipal family of {C5 the Roman equestrian order}} , and is considered {C0 one of {C6 {C7 {C8 Rome'} s greatest orators} and {C9 prose stylists}}} . 4 | 5 | {C0 His} influence on {C10 the Latin language} was so immense that {C11 the subsequent history of {C12 prose}} , not only in {C10 Latin} but in {C13 European languages} up to {C14 the 19th century} , was said to be either {C15 {C16 a reaction against} or {C17 a return to {C18 {C0 his} style}}} . 6 | 7 | {C0 Cicero} introduced the {C19 Romans} to {C20 the chief schools of {C21 Greek philosophy}} and created {C22 a Latin philosophical vocabulary} ( with {C23 {C24 neologisms} such as {C25 evidentia} , {C26 humanitas} , {C27 qualitas} , {C28 quantitas} , and {C29 essentia}} ) distinguishing {C0 himself} as {C0 a {C0 translator} and {C0 philosopher}} . 8 | 9 | -------------------------------------------------------------------------------- /testing_conll2sacr/_pliny.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 Pliny the Elder} ( AD {C0 23} – {C0 79} ) was {C0 {C0 a Roman author} , {C0 naturalist} and {C0 natural philosopher} , {C0 a naval and army commander of {C0 the early Roman Empire}} , and {C0 friend of {C0 emperor Vespasian}}} . 2 | 3 | Spending most of {C1 {C0 his} spare time} studying , writing , and investigating {C2 natural and geographic phenomena} in {C3 the field} , {C0 Pliny} wrote {C4 the encyclopedic Naturalis Historia ( Natural History ) , {C4 which} became {C5 an editorial model for {C6 encyclopedias}}} . 4 | 5 | {C7 {C0 His} nephew} , {C7 Pliny the Younger} , wrote of {C0 him} in {C8 a letter to {C9 the historian Tacitus}} : For {C10 {C7 my} part} {C7 I} deem {C11 those blessed to {C11 whom} , by favour of {C12 the gods} , it has been granted either to do {C13 what is worth writing of} , or to write {C14 what is worth reading}} ; above measure blessed {C15 those on {C15 whom} both gifts have been conferred} . 6 | 7 | In {C16 the latter number} will be {C0 {C7 my} uncle} , by virtue of {C17 {C18 {C0 his} own} and of {C19 {C9 your} compositions}} . 8 | 9 | {C7 Pliny the Younger} refers to {C20 {C9 Tacitus} ’ s reliance} upon {C21 {C0 {C7 his} uncle'} s book} , {C22 the History of the German Wars} . 
10 | 11 | -------------------------------------------------------------------------------- /testing_conll2sacr/_simple.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 {C0 His} head} hurts . 2 | 3 | -------------------------------------------------------------------------------- /tests/test_sacr2ann.py: -------------------------------------------------------------------------------- 1 | from sacr2ann import ( 2 | DEFAULT_MENTION_TYPE, 3 | DEFAULT_RELATION_TYPE, 4 | Annotation, 5 | RelationAnnotation, 6 | Sacr2AnnConverter, 7 | TextAnnotation, 8 | ) 9 | 10 | 11 | def test_sacr2ann_1_annotation() -> None: 12 | text = """hello {chain1:a=1,b=2,type=WORD world}!""" 13 | converter = Sacr2AnnConverter(type_property_name="type") 14 | converter.convert(text) 15 | 16 | assert converter.text == "hello world!" 17 | 18 | ann = converter.annotations 19 | assert len(ann) == 1 20 | assert ann[0] == TextAnnotation(index=1, kind="WORD", start=6, end=11) 21 | 22 | 23 | def test_sacr2ann_2_annotations_in_1_paragraph_in_2_chains() -> None: 24 | text = """hello {chain1:a=1,b=2,type=ABC world}! It'{chain2:type=DEF s} sunny""" 25 | converter = Sacr2AnnConverter(type_property_name="type") 26 | converter.convert(text) 27 | 28 | assert converter.text == "hello world! It's sunny" 29 | 30 | ann = converter.annotations 31 | assert len(ann) == 2 32 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 33 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 34 | 35 | 36 | def test_sacr2ann_2_annotations_in_1_paragraph_in_1_chain() -> None: 37 | text = "hello {chain1:a=1,b=2,type=ABC world}! " "It'{chain1:type=DEF s} sunny" 38 | converter = Sacr2AnnConverter(type_property_name="type") 39 | converter.convert(text) 40 | 41 | assert converter.text == "hello world! It's sunny" 42 | 43 | ann = converter.annotations 44 | assert len(ann) == 3 45 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 46 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 47 | assert ann[2] == RelationAnnotation( 48 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 49 | ) 50 | 51 | 52 | def test_sacr2ann_3_annotations_in_1_paragraph_in_1_chain() -> None: 53 | text = ( 54 | "hello {chain1:a=1,b=2,type=ABC world}! " 55 | "It'{chain1:type=DEF s} sunny. " 56 | "It's not {chain1:type=ABC rainy}" 57 | ) 58 | converter = Sacr2AnnConverter(type_property_name="type") 59 | converter.convert(text) 60 | 61 | assert converter.text == "hello world! It's sunny. It's not rainy" 62 | 63 | ann = converter.annotations 64 | assert len(ann) == 5 65 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 66 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 67 | assert ann[2] == RelationAnnotation( 68 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 69 | ) 70 | assert ann[3] == TextAnnotation(index=3, kind="ABC", start=34, end=39) 71 | assert ann[4] == RelationAnnotation( 72 | index=2, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[3] 73 | ) 74 | 75 | 76 | def test_sacr2ann_3_annotations_in_1_paragraph_in_2_chains() -> None: 77 | text = ( 78 | "hello {chain1:a=1,b=2,type=ABC world}! " 79 | "It'{chain2:type=DEF s} sunny. " 80 | "It's not {chain2:type=ABC rainy}" 81 | ) 82 | converter = Sacr2AnnConverter(type_property_name="type") 83 | converter.convert(text) 84 | 85 | assert converter.text == "hello world! It's sunny. 
It's not rainy" 86 | 87 | ann = converter.annotations 88 | assert len(ann) == 4 89 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 90 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 91 | assert ann[2] == TextAnnotation(index=3, kind="ABC", start=34, end=39) 92 | assert ann[3] == RelationAnnotation( 93 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[2] 94 | ) 95 | 96 | 97 | def test_sacr2ann_4_annotations_in_1_paragraph_in_2_chains() -> None: 98 | text = ( 99 | "hello {chain1:a=1,b=2,type=ABC world}! " 100 | "It'{chain2:type=DEF s} sunny. " 101 | "It's not {chain2:type=ABC rainy}. " 102 | "{chain1:type=GHI It}'s hot" 103 | ) 104 | converter = Sacr2AnnConverter(type_property_name="type") 105 | converter.convert(text) 106 | 107 | assert converter.text == "hello world! It's sunny. It's not rainy. It's hot" 108 | 109 | ann = converter.annotations 110 | assert len(ann) == 6 111 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 112 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 113 | assert ann[2] == TextAnnotation(index=3, kind="ABC", start=34, end=39) 114 | assert ann[3] == RelationAnnotation( 115 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[2] 116 | ) 117 | assert ann[4] == TextAnnotation(index=4, kind="GHI", start=41, end=43) 118 | assert ann[5] == RelationAnnotation( 119 | index=2, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[4] 120 | ) 121 | 122 | 123 | def test_sacr2ann_2_annotations_in_2_paragraphs_in_2_chains() -> None: 124 | text = ( 125 | "hello {chain1:a=1,b=2,type=ABC world}!\n\n\n\n" "It'{chain2:type=DEF s} sunny" 126 | ) 127 | converter = Sacr2AnnConverter(type_property_name="type") 128 | converter.convert(text) 129 | 130 | assert converter.text == "hello world!\n\nIt's sunny" 131 | 132 | ann = converter.annotations 133 | assert len(ann) == 2 134 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 135 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=17, end=18) 136 | 137 | 138 | def test_sacr2ann_3_annotations_in_3_paragraphs_in_3_chains() -> None: 139 | text = ( 140 | "hello {chain1:a=1,b=2,type=ABC world}!\n\n\n\n" 141 | "It'{chain2:type=DEF s} sunny.\n\n" 142 | "It's not {chain3:type=ABC rainy}" 143 | ) 144 | converter = Sacr2AnnConverter(type_property_name="type") 145 | converter.convert(text) 146 | 147 | assert converter.text == "hello world!\n\nIt's sunny.\n\nIt's not rainy" 148 | 149 | ann = converter.annotations 150 | assert len(ann) == 3 151 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 152 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=17, end=18) 153 | assert ann[2] == TextAnnotation(index=3, kind="ABC", start=36, end=41) 154 | 155 | 156 | def test_sacr2ann_2_nested_annotations() -> None: 157 | text = "{c1:type=ABC {c1:type=DEF abc} def} ghi jkl mno" 158 | converter = Sacr2AnnConverter(type_property_name="type") 159 | converter.convert(text) 160 | 161 | assert converter.text == "abc def ghi jkl mno" 162 | 163 | ann = converter.annotations 164 | assert len(ann) == 3 165 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=0, end=7) 166 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=0, end=3) 167 | assert ann[2] == RelationAnnotation( 168 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 169 | ) 170 | 171 | 172 | def test_sacr2ann_3_nested_annotations() -> None: 173 | text = "{c1:type=ABC {c1:type=DEF abc def {c1:type=GHI 
ghi}}} jkl mno" 174 | converter = Sacr2AnnConverter(type_property_name="type") 175 | converter.convert(text) 176 | 177 | assert converter.text == "abc def ghi jkl mno" 178 | 179 | ann = converter.annotations 180 | assert len(ann) == 5 181 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=0, end=11) 182 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=0, end=11) 183 | assert ann[2] == RelationAnnotation( 184 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 185 | ) 186 | assert ann[3] == TextAnnotation(index=3, kind="GHI", start=8, end=11) 187 | assert ann[4] == RelationAnnotation( 188 | index=2, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[3] 189 | ) 190 | 191 | 192 | def test_sacr2ann_annotations_with_leading_comments() -> None: 193 | text = "# my comment\n\n# my other comment\n\n\n" "abc {c1 def}\n\n" "{c2 ghi}" 194 | converter = Sacr2AnnConverter(type_property_name="type") 195 | converter.convert(text) 196 | 197 | assert converter.text == "abc def\n\nghi" 198 | 199 | ann = converter.annotations 200 | assert len(ann) == 2 201 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=4, end=7) 202 | assert ann[1] == TextAnnotation(index=2, kind=DEFAULT_MENTION_TYPE, start=9, end=12) 203 | 204 | 205 | def test_sacr2ann_annotations_with_middle_comments() -> None: 206 | text = ( 207 | "# my comment\n\n# my other comment\n\n\n" 208 | "abc {c1 def}\n\n" 209 | "# the middle comment\n\n" 210 | "{c2 ghi}" 211 | ) 212 | converter = Sacr2AnnConverter(type_property_name="type") 213 | converter.convert(text) 214 | 215 | assert converter.text == "abc def\n\nghi" 216 | 217 | ann = converter.annotations 218 | assert len(ann) == 2 219 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=4, end=7) 220 | assert ann[1] == TextAnnotation(index=2, kind=DEFAULT_MENTION_TYPE, start=9, end=12) 221 | 222 | 223 | def test_sacr2ann_annotations_with_trailing_comments() -> None: 224 | text = ( 225 | "# my comment\n\n# my other comment\n\n\n" 226 | "abc {c1 def}\n\n" 227 | "# the middle comment\n\n" 228 | "{c2 ghi}\n\n" 229 | "# end of text" 230 | ) 231 | converter = Sacr2AnnConverter(type_property_name="type") 232 | converter.convert(text) 233 | 234 | assert converter.text == "abc def\n\nghi\n\n" 235 | 236 | ann = converter.annotations 237 | assert len(ann) == 2 238 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=4, end=7) 239 | assert ann[1] == TextAnnotation(index=2, kind=DEFAULT_MENTION_TYPE, start=9, end=12) 240 | 241 | 242 | def test_sacr2ann_annotations_with_spaces() -> None: 243 | text = "# my comment\n\n# my other comment\n\n\n" " abc {c1 def ghi}" 244 | converter = Sacr2AnnConverter(type_property_name="type") 245 | converter.convert(text) 246 | 247 | assert converter.text == " abc def ghi" 248 | 249 | ann = converter.annotations 250 | assert len(ann) == 1 251 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=7, end=15) 252 | 253 | 254 | def test_sacr2ann_type_property_name() -> None: 255 | text = "abc {c1:type=foo def}" 256 | converter = Sacr2AnnConverter(type_property_name="type") 257 | converter.convert(text) 258 | 259 | assert converter.text == "abc def" 260 | 261 | ann = converter.annotations 262 | assert len(ann) == 1 263 | assert ann[0] == TextAnnotation(index=1, kind="foo", start=4, end=7) 264 | 265 | 266 | def test_convert_annotations() -> None: 267 | text = "hello world! It's sunny. It's not rainy. 
It's hot" 268 | 269 | annotations: list[Annotation] = [] 270 | annotations.append(TextAnnotation(index=1, kind="ABC", start=6, end=11)) 271 | annotations.append(TextAnnotation(index=2, kind="DEF", start=16, end=17)) 272 | annotations.append(TextAnnotation(index=3, kind="ABC", start=34, end=39)) 273 | annotations.append( 274 | RelationAnnotation( 275 | index=1, 276 | kind=DEFAULT_RELATION_TYPE, 277 | source=annotations[1], 278 | target=annotations[2], 279 | ) 280 | ) 281 | annotations.append(TextAnnotation(index=4, kind="GHI", start=41, end=43)) 282 | annotations.append( 283 | RelationAnnotation( 284 | index=2, 285 | kind=DEFAULT_RELATION_TYPE, 286 | source=annotations[0], 287 | target=annotations[4], 288 | ) 289 | ) 290 | 291 | string = Sacr2AnnConverter._convert_annotations_as_string(text, annotations) 292 | assert string == ( 293 | "T1\tABC 6 11\tworld\n" 294 | "T2\tDEF 16 17\ts\n" 295 | "T3\tABC 34 39\trainy\n" 296 | f"R1\t{DEFAULT_RELATION_TYPE} Arg1:T2 Arg2:T3\n" 297 | "T4\tGHI 41 43\tIt\n" 298 | f"R2\t{DEFAULT_RELATION_TYPE} Arg1:T1 Arg2:T4\n" 299 | ) 300 | -------------------------------------------------------------------------------- /tests/test_sacr2annotable.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sacr2annotable import Sacr2AnnotableConverter 4 | 5 | 6 | @pytest.fixture 7 | def text1() -> str: 8 | return ( 9 | "{c1:prop1=a,prop2=b {c2:prop1=cc,prop2=dd abc def} ghi}. jkl {c1:prop1=eee,prop2=fff mno}.\n\n" 10 | "pqr stu {c1:prop1=gggg,prop2=hhhh vwx}\n\n" 11 | ) 12 | 13 | 14 | @pytest.fixture 15 | def text2() -> str: 16 | return ( 17 | "#textid:mytext\n\n" 18 | "#textmetadata:type=literature\n" 19 | "#textmetadata:genre=\n" 20 | "#textmetadata:century=19\n\n" 21 | "ABC {c2:prop1=A,prop2=B DEF {c2:prop1=CC,prop2=DD GHI}} ! JKL MNO\n\n" 22 | "PRQ ? {c3:prop1=EEE,prop2=FFF STU}.\n\n" 23 | "# comment 1\n" 24 | "# comment 2\n\n" 25 | "{c2:prop1=GGGG,prop2=HHHH VWX}" 26 | ) 27 | 28 | 29 | def test_sacr2annotable_converter(text1: str, text2: str) -> None: 30 | conv = Sacr2AnnotableConverter() 31 | conv.convert_text(text1) 32 | conv.convert_text(text2) 33 | corpus = conv.corpus 34 | 35 | t1 = corpus._texts[0] 36 | t2 = corpus._texts[1] 37 | 38 | assert t1.name is None 39 | assert t2.name == "mytext" 40 | 41 | expected_tokens = "abc def ghi . jkl mno . pqr stu vwx".split() 42 | assert t1.token_count == len(expected_tokens) 43 | assert [t.value for t in t1.tokens] == expected_tokens 44 | 45 | expected_tokens = "ABC DEF GHI ! JKL MNO PRQ ? STU . 
VWX".split() 46 | assert t2.token_count == len(expected_tokens) 47 | assert [t.value for t in t2.tokens] == expected_tokens 48 | 49 | assert t1.sentence_count == 3 50 | assert t2.sentence_count == 5 51 | assert t1.paragraph_count == 2 52 | assert t2.paragraph_count == 3 53 | assert t1.mention_count == 4 54 | assert t2.mention_count == 4 55 | assert t1.chain_count == 2 56 | assert t2.chain_count == 2 57 | 58 | mentions = list(corpus.iter_text_mentions_as_dict()) 59 | assert mentions[0]["string"] == "abc def ghi" 60 | assert mentions[3]["string"] == "abc def" 61 | 62 | assert mentions[4]["string"] == "DEF GHI" 63 | assert mentions[5]["string"] == "GHI" 64 | 65 | assert mentions[0]["index_of_mention_in_the_text"] == 0 66 | assert mentions[1]["index_of_mention_in_the_text"] == 2 67 | assert mentions[2]["index_of_mention_in_the_text"] == 3 68 | assert mentions[3]["index_of_mention_in_the_text"] == 1 69 | assert mentions[4]["index_of_mention_in_the_text"] == 0 70 | assert mentions[5]["index_of_mention_in_the_text"] == 1 71 | assert mentions[6]["index_of_mention_in_the_text"] == 3 72 | assert mentions[7]["index_of_mention_in_the_text"] == 2 73 | 74 | assert mentions[1]["prop1"] == "eee" 75 | assert mentions[1]["prop2"] == "fff" 76 | assert mentions[5]["prop1"] == "CC" 77 | assert mentions[5]["prop2"] == "DD" 78 | assert mentions[7]["is_singleton"] is True 79 | 80 | chains = list(corpus.iter_text_chains_as_dict()) 81 | assert chains[0]["size"] == 3 82 | assert chains[2]["size"] == 3 83 | assert chains[3]["size"] == 1 84 | 85 | 86 | def test_sacr2annotable_text_metadata(text1: str, text2: str) -> None: 87 | conv = Sacr2AnnotableConverter() 88 | conv.convert_text(text1) 89 | conv.convert_text(text2) 90 | corpus = conv.corpus 91 | 92 | with pytest.raises(KeyError): 93 | _ = corpus._texts[0].metadata["type"] 94 | 95 | with pytest.raises(KeyError): 96 | _ = corpus._texts[0].metadata["genre"] 97 | 98 | with pytest.raises(KeyError): 99 | _ = corpus._texts[0].metadata["century"] 100 | 101 | assert corpus._texts[1].metadata["type"] == "literature" 102 | assert corpus._texts[1].metadata["genre"] == "" 103 | assert corpus._texts[1].metadata["century"] == "19" 104 | -------------------------------------------------------------------------------- /tests/test_sacr_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sacr_parser2 import ( 4 | Comment, 5 | NewLineInsideParagraph, 6 | ParagraphEnd, 7 | ParagraphStart, 8 | SacrParser, 9 | Spaces, 10 | TextID, 11 | Token, 12 | Word, 13 | ) 14 | 15 | text1 = """#textid:abc-123 16 | 17 | # my comment 18 | # my other comment 19 | 20 | abc def ghi 21 | klm nop 22 | qrs 23 | 24 | # comment between 25 | 26 | ABC DEF GHI 27 | 28 | xyz 29 | XYZ 30 | 31 | # end comment""" 32 | 33 | 34 | tokens1 = [ 35 | TextID(start=0, end=17, text_id="abc-123"), 36 | Comment(start=17, end=30, value="my comment"), 37 | Comment(start=30, end=50, value="my other comment"), 38 | ParagraphStart(start=50, end=50), 39 | Word(start=50, end=53, value="abc"), 40 | Spaces(start=53, end=54, value=" "), 41 | Word(start=54, end=57, value="def"), 42 | Spaces(start=57, end=58, value=" "), 43 | Word(start=58, end=61, value="ghi"), 44 | NewLineInsideParagraph(start=61, end=62, value="\n"), 45 | Word(start=62, end=65, value="klm"), 46 | Spaces(start=65, end=66, value=" "), 47 | Word(start=66, end=69, value="nop"), 48 | NewLineInsideParagraph(start=69, end=70, value="\n"), 49 | Word(start=70, end=73, value="qrs"), 50 | ParagraphEnd(start=73, 
end=75), 51 | Comment(start=75, end=94, value="comment between"), 52 | ParagraphStart(start=94, end=94), 53 | Word(start=94, end=97, value="ABC"), 54 | Spaces(start=97, end=98, value=" "), 55 | Word(start=98, end=101, value="DEF"), 56 | Spaces(start=101, end=102, value=" "), 57 | Word(start=102, end=105, value="GHI"), 58 | ParagraphEnd(start=105, end=107), 59 | ParagraphStart(start=107, end=107), 60 | Word(start=107, end=110, value="xyz"), 61 | NewLineInsideParagraph(start=110, end=111, value="\n"), 62 | Word(start=111, end=114, value="XYZ"), 63 | ParagraphEnd(start=114, end=116), 64 | Comment(start=116, end=129, value="end comment"), 65 | ] 66 | 67 | 68 | text2 = """abc def 69 | ghi""" 70 | 71 | 72 | tokens2 = [ 73 | ParagraphStart(start=0, end=0), 74 | Word(start=0, end=3, value="abc"), 75 | Spaces(start=3, end=4, value=" "), 76 | Word(start=4, end=7, value="def"), 77 | NewLineInsideParagraph(start=7, end=8, value="\n"), 78 | Word(start=8, end=11, value="ghi"), 79 | ] 80 | 81 | 82 | text3 = """#hello 83 | #textid:abc-123 84 | abc def 85 | # not a comment 86 | """ 87 | 88 | 89 | tokens3 = [ 90 | Comment(start=0, end=7, value="hello"), 91 | TextID(start=7, end=23, text_id="abc-123"), 92 | ParagraphStart(start=23, end=23), 93 | Word(start=23, end=26, value="abc"), 94 | Spaces(start=26, end=27, value=" "), 95 | Word(start=27, end=30, value="def"), 96 | NewLineInsideParagraph(start=30, end=31, value="\n"), 97 | Word(start=31, end=32, value="#"), 98 | Spaces(start=32, end=33, value=" "), 99 | Word(start=33, end=36, value="not"), 100 | Spaces(start=36, end=37, value=" "), 101 | Word(start=37, end=38, value="a"), 102 | Spaces(start=38, end=39, value=" "), 103 | Word(start=39, end=46, value="comment"), 104 | NewLineInsideParagraph(start=46, end=47, value="\n"), 105 | ] 106 | 107 | text4 = """#comment 108 | abc def 109 | 110 | # comment 111 | """ 112 | 113 | tokens4 = [ 114 | Comment(start=0, end=9, value="comment"), 115 | ParagraphStart(start=9, end=9), 116 | Word(start=9, end=12, value="abc"), 117 | Spaces(start=12, end=13, value=" "), 118 | Word(start=13, end=16, value="def"), 119 | ParagraphEnd(start=16, end=18), 120 | Comment(start=18, end=28, value="comment"), 121 | ] 122 | 123 | 124 | @pytest.mark.parametrize( 125 | "text, tokens", 126 | [ 127 | (text1, tokens1), 128 | (text2, tokens2), 129 | (text3, tokens3), 130 | (text4, tokens4), 131 | ], 132 | ) 133 | def test_parse_texts(text: str, tokens: list[Token]) -> None: 134 | parser = SacrParser(text) 135 | actual_tokens = list(parser.parse()) 136 | assert len(actual_tokens) == len(tokens) 137 | for a_t, t in zip(actual_tokens, tokens): 138 | assert a_t == t 139 | -------------------------------------------------------------------------------- /text2jsonlines.py: -------------------------------------------------------------------------------- 1 | """Convert plain text to jsonlines. The jsonlines format stores data for 2 | several texts (a corpus). 
Each line is a valid json document, as follows: 3 | 4 | { 5 | "clusters": [], 6 | "doc_key": "nw:docname", 7 | "sentences": [["This", "is", "the", "first", "sentence", "."], 8 | ["This", "is", "the", "second", "."]], 9 | "speakers": [["spk1", "spk1", "spk1", "spk1", "spk1", "spk1"], 10 | ["spk2", "spk2", "spk2", "spk2", "spk2"]], 11 | "pos": [["DET", "V", "DET", "ADJ", "NOUN", "PUNCT"], 12 | ["DET", "V", "DET", "ADJ", "PUNCT"]], 13 | } 14 | 15 | It is used for some coreference resolution systems, such as: 16 | 17 | - https://github.com/kentonl/e2e-coref 18 | - https://github.com/kkjawz/coref-ee 19 | - https://github.com/boberle/cofr 20 | 21 | Tokenization is done with StanfordNLP 22 | (https://github.com/stanfordnlp/stanfordnlp) (Qi, Dozat, Zhang, Manning 2018). 23 | 24 | You need to install StanfordNLP via pip and then download the models, for example 25 | for French models (use "en" for English models): 26 | 27 | python3 -c "import stanfordnlp; stanfordnlp.download('fr')" 28 | 29 | Notes: 30 | - the doc key is the concatenation of `--genre` and the file path, 31 | - speaker data are left blank ("_") 32 | """ 33 | 34 | # (C) Bruno Oberle 2020 - Mozilla Public Licence 2.0 35 | 36 | 37 | import argparse 38 | import json 39 | import re 40 | 41 | import stanfordnlp 42 | from stanfordnlp.models.common.conll import CoNLLFile 43 | 44 | # download French models: 45 | #stanfordnlp.download('fr') 46 | 47 | 48 | def tokenize(fpath, lang): 49 | 50 | content = open(fpath).read() 51 | paragraphs = re.split(r'\n+', content) 52 | res_sents = [] 53 | res_pars = [] 54 | res_pos = [] 55 | start_par = 0 56 | for par in paragraphs: 57 | par = par.strip() 58 | if not par: 59 | continue 60 | doc = stanfordnlp.Document(par) 61 | nlp = stanfordnlp.Pipeline(lang=lang, processors="tokenize,mwt,pos") 62 | doc = nlp(doc) 63 | #print(doc.conll_file.conll_as_string()) 64 | #print(doc.conll_file.sents) 65 | sents = [ 66 | [ token[1] for token in sent if '-' not in token[0] ] 67 | for sent in doc.conll_file.sents 68 | ] 69 | pos = [ 70 | [ token[3] for token in sent if '-' not in token[0] ] 71 | for sent in doc.conll_file.sents 72 | ] 73 | res_sents.extend(sents) 74 | res_pos.extend(pos) 75 | length = sum((len(s) for s in sents)) 76 | res_pars.append([start_par, start_par+length-1]) 77 | start_par = start_par+length 78 | return res_sents, res_pos, res_pars 79 | 80 | 81 | def make_jsonlines(sents, pos, pars, fpath, genre): 82 | doc = dict( 83 | doc_key = f"{genre[:2]}:{fpath}", 84 | sentences = sents, 85 | speakers = [ [ "_" for tok in sent ] for sent in sents ], 86 | clusters = [], 87 | pos = pos, 88 | paragraphs = pars, 89 | ) 90 | return json.dumps(doc) 91 | 92 | 93 | 94 | def make_conll(sents, fpath, genre): 95 | res = f"#begin document {genre[:2]}:{fpath}\n" 96 | for sent in sents: 97 | for i, token in enumerate(sent): 98 | res += f"{i+1}\t{token}\n" 99 | res += "\n" 100 | res += "#end document" 101 | return res 102 | 103 | 104 | 105 | def parse_args(): 106 | # definition 107 | parser = argparse.ArgumentParser(prog="text2jsonlines", 108 | description=__doc__, 109 | formatter_class=argparse.RawDescriptionHelpFormatter) 110 | # arguments (not options) 111 | parser.add_argument("infpath", default="", help="input file") 112 | # options 113 | parser.add_argument("--conll", dest="export_conll", default=False, 114 | action="store_true", 115 | help="export conll and not jsonlines (for debugging)") 116 | parser.add_argument("--genre", dest="genre", default="ge", 117 | help="genre (default is 'ge')") 118 | 
parser.add_argument("--lang", dest="lang", default="en", 119 | help="lang: en, fr, etc. (default is 'en')") 120 | parser.add_argument("-o", dest="outfpath", required=False, 121 | default=None, help="output file (default to stdout)") 122 | # reading 123 | args = parser.parse_args() 124 | return args 125 | 126 | 127 | 128 | def main(): 129 | args = parse_args() 130 | sents, pos, pars = tokenize(args.infpath, lang=args.lang) 131 | if args.export_conll: 132 | code = make_conll(sents, fpath=args.infpath, genre=args.genre) 133 | else: 134 | code = make_jsonlines(sents, pos, pars, 135 | fpath=args.infpath, genre=args.genre) 136 | if args.outfpath: 137 | open(args.outfpath, 'w').write(code + "\n") 138 | else: 139 | print(code) 140 | 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | --------------------------------------------------------------------------------