├── .gitignore ├── LICENSE ├── LICENSE_FOR_SAMPLE_TEXTS ├── Makefile ├── README.md ├── annotable.py ├── color_manager.py ├── conll2jsonlines.py ├── conll2sacr.py ├── conll_transform.py ├── docs ├── imgs │ ├── glozz_annotation.png │ ├── notebook_join.png │ ├── notebook_part_of_speech_of_first_mentions.png │ ├── notebook_pivot.png │ ├── notebook_pivot_chart.png │ ├── notebook_sentence_lengths.png │ ├── notebook_singletons.png │ ├── pict01.png │ ├── pict02.png │ ├── pict03.png │ ├── pict04.png │ └── pict05.png ├── sample_notebook.html └── sample_notebook.ipynb ├── glozz2sacr.pl ├── jsonlines2conll.py ├── jsonlines2text.py ├── mypy.ini ├── pyproject.toml ├── requirements.txt ├── sacr2ann.py ├── sacr2annotable.py ├── sacr2conll.py ├── sacr2df.py ├── sacr2glozz.pl ├── sacr_parser.py ├── sacr_parser2.py ├── setup.cfg ├── standoff2inline.py ├── testing ├── aesop.sacr ├── caesar.sacr ├── cicero.sacr ├── docs.jsonlines ├── lucian_speakers.sacr ├── pliny.sacr ├── simple.sacr ├── singe.conll ├── singe.jsonlines └── testing_sacr2conll.conll ├── testing_conll2sacr ├── _aesop.sacr___part_000 ├── _ceasar.sacr___part_000 ├── _cicero.sacr___part_000 ├── _pliny.sacr___part_000 └── _simple.sacr___part_000 ├── tests ├── test_annotable.py ├── test_sacr2ann.py ├── test_sacr2annotable.py └── test_sacr_parser.py └── text2jsonlines.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .idea/ 3 | venv/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 
128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. 
Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /LICENSE_FOR_SAMPLE_TEXTS: -------------------------------------------------------------------------------- 1 | The sample texts have been downloaded from wikipedia. They are distributed 2 | under the terms of the CC BY-SA-3.0 licence, which is reproduced below. 3 | 4 | https://creativecommons.org/licenses/by-sa/3.0/ 5 | 6 | ************************************************************************ 7 | 8 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. 9 | 10 | License 11 | 12 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). 
THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 13 | 14 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 15 | 16 | 1. Definitions 17 | 18 | "Adaptation" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered an Adaptation for the purpose of this License. 19 | "Collection" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(f) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined below) for the purposes of this License. 20 | "Creative Commons Compatible License" means a license that is listed at https://creativecommons.org/compatiblelicenses that has been approved by Creative Commons as being essentially equivalent to this License, including, at a minimum, because that license: (i) contains terms that have the same purpose, meaning and effect as the License Elements of this License; and, (ii) explicitly permits the relicensing of adaptations of works made available under that license under this License or a Creative Commons jurisdiction license with the same License Elements as this License. 21 | "Distribute" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership. 22 | "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike. 23 | "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License. 
24 | "Original Author" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast. 25 | "Work" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work. 26 | "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation. 27 | "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images. 28 | "Reproduce" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium. 29 | 30 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws. 31 | 32 | 3. License Grant. 
Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below: 33 | 34 | to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections; 35 | to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked "The original work was translated from English to Spanish," or a modification could indicate "The original work has been modified."; 36 | to Distribute and Publicly Perform the Work including as incorporated in Collections; and, 37 | to Distribute and Publicly Perform Adaptations. 38 | 39 | For the avoidance of doubt: 40 | Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; 41 | Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and, 42 | Voluntary License Schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License. 43 | 44 | The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved. 45 | 46 | 4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions: 47 | 48 | You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. 
If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(c), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(c), as requested. 49 | You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; (iii) a Creative Commons jurisdiction license (either this or a later license version) that contains the same License Elements as this License (e.g., Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible License. If you license the Adaptation under one of the licenses mentioned in (iv), you must comply with the terms of that license. If you license the Adaptation under the terms of any of the licenses mentioned in (i), (ii) or (iii) (the "Applicable License"), you must comply with the terms of the Applicable License generally and the following provisions: (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform; (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License; (III) You must keep intact all notices that refer to the Applicable License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform; (IV) when You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License. 50 | If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution ("Attribution Parties") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and (iv) , consistent with Ssection 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). 
The credit required by this Section 4(c) may be implemented in any reasonable manner; provided, however, that in the case of a Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties. 51 | Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise. 52 | 53 | 5. Representations, Warranties and Disclaimer 54 | 55 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 56 | 57 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 58 | 59 | 7. Termination 60 | 61 | This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License. 62 | Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). 
Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above. 63 | 64 | 8. Miscellaneous 65 | 66 | Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License. 67 | Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License. 68 | If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. 69 | No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent. 70 | This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You. 71 | The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law. 72 | 73 | Creative Commons Notice 74 | 75 | Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor. 
76 | 77 | Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of the License. 78 | 79 | Creative Commons may be contacted at https://creativecommons.org/. 80 | 81 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | check-lint: 2 | . venv/bin/activate && (isort --check .; black --check .; flake8 .; mypy --strict .) 3 | 4 | lint: 5 | . venv/bin/activate && (isort .; black .; flake8 .; mypy --strict .) 6 | 7 | test: 8 | . venv/bin/activate && pytest 9 | -------------------------------------------------------------------------------- /color_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module defines a ColorManager. 3 | """ 4 | 5 | # 2020 Bruno Oberle, MPL 2.0, see the LICENSE file. 6 | 7 | __version__ = '1.1.0' 8 | 9 | 10 | class ColorManager: 11 | """Generate colors based on HSL. 12 | 13 | The hue is the main iterator, so that successive colors are significantly different from 14 | each other. 15 | 16 | Example: 17 | -------- 18 | 19 | >>> cm = ColorManager(50, 20, 20) 20 | >>> for i in range(5): 21 | ... color = cm.get_next_color() 22 | ... print(color) 23 | hsl(0, 100%, 80%) 24 | hsl(50, 100%, 80%) 25 | hsl(100, 100%, 80%) 26 | hsl(150, 100%, 80%) 27 | hsl(200, 100%, 80%) 28 | 29 | Class Attributes 30 | ---------------- 31 | gray: str 32 | The gray color. 33 | 34 | Attributes 35 | ---------- 36 | hue_step: `int` (default: 25) 37 | Hue step. 38 | saturation_step: `int` (default: 25) 39 | Saturation step. 40 | lightness_step: `int` (default: 10) 41 | Lightness step. 42 | gray: `str` (default `rgb(125, 125, 125)`) 43 | Value returned when there is no more color available, and `repeat` is 44 | `False`. 45 | repeat: bool (default: `True`) 46 | If `True`, start over when all the colors have been used (otherwise yield `gray`). 47 | 48 | Note 49 | ---- 50 | Use `len(cm)` to get the number of available colors. 51 | """ 52 | 53 | gray = "rgb(125, 125, 125)" 54 | 55 | def __init__(self, hue_step=25, saturation_step=25, lightness_step=10, 56 | repeat=True): 57 | self.hue_step = hue_step 58 | self.saturation_step = saturation_step 59 | self.lightness_step = lightness_step 60 | self.gray = "rgb(125, 125, 125)" 61 | self.reset_iterator() 62 | self.repeat = repeat 63 | 64 | def __len__(self): 65 | """Return the number of available colors.""" 66 | hue = 360 // self.hue_step + 1 67 | saturation = 100 // self.saturation_step 68 | lightness = 70 // self.lightness_step # because ]10;80] 69 | return hue * saturation * lightness 70 | 71 | def reset_iterator(self): 72 | """Reset the iterator.""" 73 | self._iter = self.iter_color() 74 | 75 | def get_next_color(self): 76 | """Return the next color.""" 77 | return next(self._iter) 78 | 79 | def iter_color(self): 80 | """Generator that goes through all the colors. 81 | 82 | It is used as the iterator. 83 | 84 | When there are no more colors (and `repeat` is `False`), yield the `gray` instance attribute. 85 | Never raises StopIteration.
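
For instance, with the default steps, the first colors produced should be
(the hue varies fastest, then the lightness, then the saturation):

    >>> cm = ColorManager()
    >>> print(cm.get_next_color())
    hsl(0, 100%, 80%)
    >>> print(cm.get_next_color())
    hsl(25, 100%, 80%)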
86 | """ 87 | while True: 88 | for s in range(100, -1, -self.saturation_step): 89 | for l in range(80, 9, -self.lightness_step): 90 | for h in range(0, 361, self.hue_step): 91 | yield "hsl(%d, %d%%, %d%%)" % (h, s, l) 92 | if not self.repeat: 93 | while True: 94 | yield self.gray 95 | 96 | 97 | 98 | class CommonColorManager: 99 | """Generate colors based on named html colors. 100 | """ 101 | 102 | gray = "gray" 103 | 104 | colors = [ 105 | "red", 106 | "maroon", 107 | "yellow", 108 | "olive", 109 | "lime", 110 | "green", 111 | "aqua", 112 | "teal", 113 | "blue", 114 | "navy", 115 | "fuchsia", 116 | "purple", 117 | ] 118 | 119 | def __init__(self, remove_yellow=True, repeat=True): 120 | self.repeat = repeat 121 | self.colors = self.__class__.colors.copy() 122 | if remove_yellow: 123 | self.colors.remove("yellow") 124 | self.reset_iterator() 125 | 126 | def __len__(self): 127 | """Return the number of available colors.""" 128 | return len(self.colors) 129 | 130 | def reset_iterator(self): 131 | """Reset the iterator.""" 132 | self._iter = self.iter_color() 133 | 134 | def get_next_color(self): 135 | """Return the next color.""" 136 | return next(self._iter) 137 | 138 | def iter_color(self): 139 | """Generator that goes through all the colors.""" 140 | while True: 141 | for color in self.colors: 142 | yield color 143 | if not self.repeat: 144 | while True: 145 | yield self.__class__.gray 146 | 147 | 148 | -------------------------------------------------------------------------------- /conll2jsonlines.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Convert conll format (2012 or U or X) into jsonlines format. 3 | 4 | The jsonlines format stores data for 5 | several texts (a corpus). Each line is a valid json document, as follows: 6 | 7 | { 8 | "clusters": [], 9 | "doc_key": "nw:docname", 10 | "sentences": [["This", "is", "the", "first", "sentence", "."], 11 | ["This", "is", "the", "second", "."]], 12 | "speakers": [["spk1", "spk1", "spk1", "spk1", "spk1", "spk1"], 13 | ["spk2", "spk2", "spk2", "spk2", "spk2"]] 14 | } 15 | 16 | It is used for some coreference resolution systems, such as: 17 | 18 | - https://github.com/kentonl/e2e-coref 19 | - https://github.com/kkjawz/coref-ee 20 | - https://github.com/boberle/cofr 21 | 22 | To convert from the original CoNLL2012 format into jsonlines format: 23 | 24 | python3 conll2jsonlines.py \ 25 | --token-col 3 \ 26 | --speaker-col 9 \ 27 | INPUT_FILE \ 28 | OUTPUT_FILE 29 | 30 | To convert from the StanfordNLP format into jsonlines format: 31 | 32 | python3 conll2jsonlines.py \ 33 | --skip-singletons \ 34 | --skip-empty-documents \ 35 | --tab \ 36 | --ignore-double-indices 0 \ 37 | --token-col 1 \ 38 | --speaker-col "_" \ 39 | --no-coref \ 40 | INPUT_FILE \ 41 | OUTPUT_FILE 42 | 43 | To convert from the Democrat corpus in CoNLL format (with a column for 44 | paragraphs at position 11): 45 | 46 | python3 conll2jsonlines.py \ 47 | --tab \ 48 | --ignore-double-indices 0 \ 49 | --token-col 1 \ 50 | --speaker-col "_" \ 51 | --par-col 11 \ 52 | testing/singe.conll \ 53 | testing/singe.jsonlines 54 | 55 | Note that you may have to change document keys in the CoNLL files before 56 | running this script if you want to transform them.
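
As a quick sanity check, the resulting file can be read back with the
standard library only; this is just a sketch (the "paragraphs" key is
present only when `--par-col` is used):

    import json

    with open("OUTPUT_FILE") as fh:
        for line in fh:
            doc = json.loads(line)
            print(doc["doc_key"], len(doc["sentences"]), len(doc["clusters"]))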
57 | """ 58 | 59 | import json 60 | import os 61 | import argparse 62 | 63 | import conll_transform 64 | 65 | 66 | def conll2jsonlines( 67 | infpath, outfpath, 68 | sep=None, token_col=3, speaker_col=9, add_coref=True, par_col=0, 69 | ignore_double_indices=None, 70 | skip_empty_documents=False, skip_singletons=False): 71 | 72 | docs = conll_transform.read_files( 73 | infpath, 74 | sep=sep, 75 | ignore_double_indices=ignore_double_indices, 76 | ) 77 | 78 | with open(outfpath, 'w') as fh: 79 | 80 | for doc_key, doc in docs.items(): 81 | 82 | print("Doing %s" % doc_key) 83 | 84 | if add_coref: 85 | clusters = conll_transform.compute_chains(doc) 86 | clusters = [ 87 | [ list(mention) for mention in cluster] 88 | for cluster in clusters 89 | ] 90 | for cluster in clusters: 91 | conll_transform.sentpos2textpos(cluster, doc) 92 | if skip_singletons: 93 | clusters = list(filter(lambda c: len(c) > 1, clusters)) 94 | if skip_empty_documents and not clusters: 95 | print("Skipping %s because no cluster" % doc_key) 96 | continue 97 | else: 98 | clusters = [] 99 | 100 | tokens = [t for sent in doc for t in sent] 101 | 102 | sentences = [ 103 | [token[token_col] for token in sent] for sent in doc 104 | ] 105 | 106 | if par_col: 107 | start = 0 108 | length = 0 109 | current = -1 110 | paragraphs = [] 111 | for sent in doc: 112 | length += len(sent) 113 | if int(sent[0][par_col]) != current: 114 | current = int(sent[0][par_col]) 115 | paragraphs.append([start, start+length-1]) 116 | start += length 117 | length = 0 118 | else: 119 | #paragraphs = [[0, len(tokens)]] 120 | paragraphs = None 121 | 122 | if speaker_col.isdigit(): 123 | speakers = [ 124 | [token[int(speaker_col)] for token in sent] for sent in doc 125 | ] 126 | else: 127 | speakers = [ 128 | [speaker_col for token in sent] for sent in sentences 129 | ] 130 | 131 | 132 | dic = dict( 133 | doc_key=doc_key, 134 | clusters=clusters, 135 | sentences=sentences, 136 | speakers=speakers, 137 | ) 138 | if paragraphs is not None: 139 | dic['paragraphs'] = paragraphs 140 | fh.write(json.dumps(dic) + "\n") 141 | 142 | 143 | 144 | def parse_args(): 145 | # definition 146 | parser = argparse.ArgumentParser(prog="conll2jsonlines", 147 | #description="convert conll file to jsonlines", 148 | description=__doc__, 149 | formatter_class=argparse.RawDescriptionHelpFormatter) 150 | # arguments (not options) 151 | parser.add_argument("input_fpath", default="", help="input file") 152 | parser.add_argument("output_fpath", default="", help="output file") 153 | # options 154 | parser.add_argument("--skip-singletons", dest="skip_singletons", 155 | default=False, action="store_true", help="skip singletons") 156 | parser.add_argument("--skip-empty-documents", dest="skip_empty_documents", 157 | default=False, action="store_true", help="skip empty documents") 158 | parser.add_argument("--no-coref", dest="add_coref", 159 | default=True, action="store_false", 160 | help="ignore coreference information") 161 | parser.add_argument("--tab", dest="sep_is_tab", 162 | default=False, action="store_true", 163 | help="separator is tab and no a bunch of spaces as in the original " 164 | "conll 2012 format") 165 | parser.add_argument("--token-col", dest="token_col", type=int, 166 | default=3, help="col index for tokens, def 3") 167 | parser.add_argument("--speaker-col", dest="speaker_col", default="9", 168 | help="col index for speakers, def 9. Use a char (ex. 
_) if you want " 169 | "the speaker col to be filled with that char, e.g. if there is no " 170 | "speaker column)") 171 | parser.add_argument("--ignore-double-indices", dest="ignore_double_indices", 172 | type=int, default=None, 173 | help="ignore lines containing a hyphen in the given column") 174 | parser.add_argument("--par-col", dest="par_col", type=int, 175 | default=0, help="paragraph column, def 0 (= no paragraph information)") 176 | # reading 177 | args = parser.parse_args() 178 | return args 179 | 180 | 181 | 182 | def main(): 183 | args = parse_args() 184 | conll2jsonlines( 185 | infpath=args.input_fpath, 186 | outfpath=args.output_fpath, 187 | skip_empty_documents=args.skip_empty_documents, 188 | skip_singletons=args.skip_singletons, 189 | add_coref=args.add_coref, 190 | token_col=args.token_col, 191 | speaker_col=args.speaker_col, 192 | sep="\t" if args.sep_is_tab else None, 193 | ignore_double_indices=args.ignore_double_indices, 194 | par_col=args.par_col, 195 | ) 196 | 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | 202 | -------------------------------------------------------------------------------- /conll2sacr.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Convert a CoNLL-2012 or CoNLL-U file into a SACR file, which you can 3 | open with the SACR program (http://boberle.com/projects/sacr). In this way, 4 | you can check and edit coreference annotation. To convert back, use the 5 | `sacr2conll.py` script. 6 | 7 | To convert from conll-2012 (space separated columns, word column is 3): 8 | 9 | python3 conll2sacr.py --output-dir DIR INPUT_FILE.conll 10 | 11 | This will convert every document in `INPUT_FILE.conll` into a document in `DIR` 12 | (the name of the file is based on the document name in the conll file). 13 | 14 | To convert from conll-u (tabulation separated columns, word column is 1): 15 | 16 | python3 conll2sacr.py --output-dir DIR \ 17 | --tab \ 18 | --token-col 1 \ 19 | INPUT_FILE.conll 20 | 21 | Use the `--ignore-double-indices` option if you want to ignore French amalgams 22 | (`du -> de le`) decomposed by some corpora and software (such as StanfordNLP).
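
For instance (hypothetical file names), a CoNLL-U file produced by StanfordNLP,
with amalgam indices such as `3-4` in the first column, could be converted with:

    python3 conll2sacr.py --output-dir DIR \
        --tab \
        --token-col 1 \
        --ignore-double-indices 0 \
        INPUT_FILE.conll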
23 | """ 24 | 25 | 26 | import re 27 | import argparse 28 | import os 29 | 30 | import conll_transform 31 | from standoff2inline import Standoff2Inline 32 | 33 | def convert(doc, doc_key, dpath, token_col): 34 | 35 | res = "" 36 | 37 | for sent in doc: 38 | inliner = Standoff2Inline(kind='sacr') 39 | mentions = conll_transform.compute_mentions([t[-1] for t in sent]) 40 | for (start, stop), chain in mentions: 41 | inliner.add((start, (f"C{chain}", dict())), stop-1) 42 | res += inliner.apply(tokens=[t[token_col] for t in sent]) 43 | res += "\n\n" 44 | 45 | if not isinstance(doc_key, str): 46 | doc_key = "_".join(str(x) for x in doc_key) 47 | fname = re.sub(r'[^-\w.]', r'_', doc_key) 48 | fpath = os.path.join(dpath, fname) 49 | open(fpath, 'w').write(res) 50 | 51 | 52 | def parse_args(): 53 | # definition 54 | parser = argparse.ArgumentParser(prog="conll2sacr", 55 | #description="convert conll to sacr", 56 | description=__doc__, 57 | formatter_class=argparse.RawDescriptionHelpFormatter) 58 | # arguments (not options) 59 | parser.add_argument("infpath", default="", help="input file") 60 | #parser.add_argument("outfpath", default="", help="output file") 61 | # options 62 | parser.add_argument("--output-dir", dest="outdpath", required=True, 63 | help="output directory") 64 | parser.add_argument("--token-col", dest="token_col", type=int, 65 | default=3, help="col index for tokens, def 3") 66 | parser.add_argument("--ignore-double-indices", dest="ignore_double_indices", 67 | type=int, default=None, 68 | help="ignore line containing a hyphen in the given column") 69 | parser.add_argument("--tab", dest="tab_sep", default=False, 70 | action="store_true", help="use tabulation as separator (conllu)") 71 | # reading 72 | args = parser.parse_args() 73 | return args 74 | 75 | 76 | 77 | def main(): 78 | 79 | args = parse_args() 80 | 81 | docs = conll_transform.read_file( 82 | args.infpath, 83 | sep="\t" if args.tab_sep else None, 84 | ignore_double_indices=args.ignore_double_indices, 85 | ) 86 | for doc_key, doc in docs.items(): 87 | print(f"Doing {doc_key}") 88 | convert(doc=doc, doc_key=doc_key, dpath=args.outdpath, 89 | token_col=args.token_col) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /docs/imgs/glozz_annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/glozz_annotation.png -------------------------------------------------------------------------------- /docs/imgs/notebook_join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_join.png -------------------------------------------------------------------------------- /docs/imgs/notebook_part_of_speech_of_first_mentions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_part_of_speech_of_first_mentions.png -------------------------------------------------------------------------------- /docs/imgs/notebook_pivot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_pivot.png -------------------------------------------------------------------------------- /docs/imgs/notebook_pivot_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_pivot_chart.png -------------------------------------------------------------------------------- /docs/imgs/notebook_sentence_lengths.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_sentence_lengths.png -------------------------------------------------------------------------------- /docs/imgs/notebook_singletons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/notebook_singletons.png -------------------------------------------------------------------------------- /docs/imgs/pict01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict01.png -------------------------------------------------------------------------------- /docs/imgs/pict02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict02.png -------------------------------------------------------------------------------- /docs/imgs/pict03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict03.png -------------------------------------------------------------------------------- /docs/imgs/pict04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict04.png -------------------------------------------------------------------------------- /docs/imgs/pict05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boberle/corefconversion/f04490d461e4dc117ffea2448ee6dea2ba51bb3e/docs/imgs/pict05.png -------------------------------------------------------------------------------- /glozz2sacr.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings FATAL=>'all'; 4 | use open ':utf8'; 5 | use utf8; 6 | 7 | use Data::Dumper; 8 | #use XML::LibXML::Simple qw(XMLin); 9 | use XML::Simple qw(XMLin); 10 | 11 | 12 | ######################################################################## 13 | # Global variables 14 | ######################################################################## 15 | 16 | my $INPUT_CORPUS = ''; 17 | my $INPUT_ANNOTATIONS = ''; 18 | my $OUTPUT_FILE = ''; 19 | my $REFNAME_FIELD = ''; # REF, refname, etc. 20 | my $UNIT_TYPE = ''; # maillon, MENTION, etc. 
21 | my $RESET_REFNAME_FIELD = ''; 22 | 23 | $Data::Dumper::Terse = 1; 24 | $Data::Dumper::Indent = 1; 25 | 26 | 27 | ######################################################################## 28 | # Get the CLI parameters 29 | ######################################################################## 30 | 31 | my $HELP =<<"END"; 32 | USAGE 33 | $0 [OPTIONS] INPUT_AND_OUTPUT_FILES 34 | 35 | EXAMPLE: 36 | $0 test.aa output 37 | 38 | DESCRIPTION 39 | Convert a couple of Glozz files to a SACR file. You can give the FILES 40 | in any order: the .ac and .aa files will be determined by their 41 | extensions. You can give only one of the Glozz file: the other will be 42 | found (if in the same directory). If no output file is specified, print 43 | on STDOUT. 44 | 45 | OPTIONS (-o value --opt value) 46 | -h Print help. 47 | --ref-field Name of the field where the referent is store (REF, refname, 48 | etc.). Default is REF. 49 | --unit-type Type of the unit (maillon, MENTION, etc.). Default is MENTION. 50 | --reset Get a new name for referent (useful if the name used in the 51 | glozz file contains non standard characters). 52 | END 53 | 54 | sub get_cl_parameters { 55 | 56 | # default 57 | $INPUT_CORPUS = ''; 58 | $INPUT_ANNOTATIONS = ''; 59 | $OUTPUT_FILE = ''; 60 | $REFNAME_FIELD = 'REF'; 61 | $UNIT_TYPE = 'MENTION'; 62 | $RESET_REFNAME_FIELD = ''; 63 | 64 | my $pending = ''; 65 | 66 | for (@ARGV) { 67 | print $HELP and exit if m/^(?:-h|--?help)$/; 68 | last if $pending and m/^-/; 69 | # pending 70 | if ($pending eq '--ref-name') { 71 | $REFNAME_FIELD = $_; 72 | $pending = ''; 73 | } elsif ($pending eq '--unit-type') { 74 | $UNIT_TYPE = $_; 75 | $pending = ''; 76 | # end of pending 77 | } elsif ($pending) { 78 | last; 79 | # options with value waiting for next element in @ARGV 80 | } elsif (m/^(?:--ref-name)$/) { 81 | $pending = '--ref-name'; 82 | } elsif (m/^--unit-type$/) { 83 | $pending = '--unit-type'; 84 | # switch (= options with no value) 85 | } elsif (m/^--reset$/) { 86 | $RESET_REFNAME_FIELD = 1; 87 | # end of options 88 | } elsif (m/^-.+/) { 89 | die "$0: *** option '$_' doesn't exists ***\n"; 90 | # arguments, not options 91 | } elsif (!$INPUT_CORPUS and m/^(.+?\.ac)$/ and -f $1) { 92 | $INPUT_CORPUS = $1; 93 | (my $tmp = $INPUT_CORPUS) =~ s/\.ac$/.aa/; 94 | if (!$INPUT_ANNOTATIONS and -f $tmp) { 95 | $INPUT_ANNOTATIONS = $tmp; 96 | } 97 | } elsif (!$INPUT_ANNOTATIONS and m/^(.+?\.aa)$/ and -f $1) { 98 | $INPUT_ANNOTATIONS = $1; 99 | (my $tmp = $INPUT_ANNOTATIONS) =~ s/\.aa$/.ac/; 100 | if (!$INPUT_CORPUS and -f $tmp) { 101 | $INPUT_CORPUS = $tmp; 102 | } 103 | } elsif (!$OUTPUT_FILE and m/^([^-].*+)$/) { 104 | $OUTPUT_FILE = $1; 105 | } else { 106 | die "$0: *** bad option '$_' ***\n"; 107 | } 108 | } # for 109 | 110 | die "$0: *** no glozz file specified ***\n" 111 | unless $INPUT_ANNOTATIONS and $INPUT_CORPUS; 112 | 113 | } 114 | 115 | 116 | 117 | ######################################################################## 118 | # parser 119 | # 120 | # ex: 121 | # 'characterisation' => { 122 | # 'featureSet' => { 123 | # 'feature' => { 124 | # 'gender' => { 125 | # 'content' => 'fem' 126 | # }, 127 | # 'gramcat' => { 128 | # 'content' => 'definite' 129 | # }, 130 | # $REFNAME_FIELD => {}, 131 | # 'number' => { 132 | # 'content' => 'sg' 133 | # } 134 | # } 135 | # }, 136 | # 137 | # ex: 138 | # 'characterisation' => { 139 | # 'featureSet' => { 140 | # 'feature' => { 141 | # 'name' => $REFNAME_FIELD 142 | # } 143 | # }, 144 | # 'type' => $UNIT_TYPE 145 | # }, 146 | 
######################################################################## 147 | 148 | sub parse { 149 | 150 | my $xml = shift; 151 | my $corpus = shift; 152 | 153 | my $data = XMLin $xml, ForceArray=>'', KeyAttr => { feature=>'name' }; 154 | #print Dumper $data; die; 155 | 156 | my @annotations = (); 157 | my @paragraphs = (); 158 | 159 | for my $r_unit_hash (@{$data->{unit}}) { 160 | if ($r_unit_hash->{characterisation}->{type} eq 'paragraph') { 161 | push @paragraphs, { 162 | start => $r_unit_hash->{positioning}->{start}->{singlePosition}->{index}, 163 | end => $r_unit_hash->{positioning}->{end}->{singlePosition}->{index} }; 164 | } elsif ($r_unit_hash->{characterisation}->{type} eq $UNIT_TYPE) { 165 | push @annotations, { 166 | start => $r_unit_hash->{positioning}->{start}->{singlePosition}->{index}, 167 | end => $r_unit_hash->{positioning}->{end}->{singlePosition}->{index}, 168 | props => $r_unit_hash->{characterisation}->{featureSet}->{feature} }; 169 | } else { 170 | die "$0: *** don't know unit type '$r_unit_hash->{characterisation}->{type}' ***\n"; 171 | } 172 | } 173 | 174 | my %chain_names = (); 175 | my $name_counter = 0; 176 | 177 | # props is of the form: { $REFNAME_FIELD=>{content=>'value'}, prop=>{content=>'value'}} 178 | for my $r_annotation (@annotations) { 179 | my %props = (); 180 | #print Dumper $r_annotation; 181 | #print substr($corpus, $r_annotation->{start}, $r_annotation->{end}-$r_annotation->{start}), "\n"; 182 | for my $key (keys %{$r_annotation->{props}}) { 183 | # only one feature, which has no content: 184 | # 'characterisation' => { 185 | # 'featureSet' => { 186 | # 'feature' => { 187 | # 'name' => $REFNAME_FIELD 188 | # } 189 | # }, 190 | # 'type' => $UNIT_TYPE 191 | # }, 192 | if ($key eq 'name' and not ref $r_annotation->{props}->{$key}) { 193 | unless ($r_annotation->{props}->{$key} eq $REFNAME_FIELD) { 194 | $props{$r_annotation->{props}->{$key}} = ''; 195 | } 196 | # otherwise 197 | # 'characterisation' => { 198 | # 'featureSet' => { 199 | # 'feature' => { 200 | # 'gender' => { 201 | # 'content' => 'fem' 202 | # }, 203 | # 'gramcat' => { 204 | # 'content' => 'definite' 205 | # }, 206 | # $REFNAME_FIELD => {}, 207 | # 'number' => { 208 | # 'content' => 'sg' 209 | # } 210 | # } 211 | # }, 212 | # 'type' => $UNIT_TYPE 213 | # }, 214 | } else { 215 | if ($key eq $REFNAME_FIELD) { 216 | $r_annotation->{$REFNAME_FIELD} = $r_annotation->{props}->{$key}->{content}; 217 | } else { 218 | $props{$key} = $r_annotation->{props}->{$key}->{content}; 219 | } 220 | } 221 | } 222 | $r_annotation->{props} = \%props; 223 | $r_annotation->{$REFNAME_FIELD} = 'TODO' unless $r_annotation->{$REFNAME_FIELD}; 224 | if ($RESET_REFNAME_FIELD) { 225 | if ($r_annotation->{$REFNAME_FIELD} eq 'SI') { 226 | $r_annotation->{$REFNAME_FIELD} = "L".$name_counter; 227 | } else { 228 | if (not exists $chain_names{$r_annotation->{$REFNAME_FIELD}}) { 229 | $chain_names{$r_annotation->{$REFNAME_FIELD}} = "C".$name_counter; 230 | $name_counter++; 231 | } 232 | $r_annotation->{$REFNAME_FIELD} = $chain_names{$r_annotation->{$REFNAME_FIELD}}; 233 | } 234 | } 235 | } 236 | 237 | #print Dumper \@annotations; 238 | #print Dumper \@paragraphs; 239 | 240 | # NOTE: this is very important, otherwise nested annotations go 241 | # wrong! 
242 | @annotations = sort{$a->{start}<=>$b->{start} 243 | or $b->{end}-$b->{start} <=> $a->{end}-$a->{start}} @annotations; 244 | 245 | # test that there are no overlapping annotations 246 | for my $i (@annotations) { 247 | for my $j (@annotations) { 248 | next if $i == $j; 249 | if ($i->{start} < $j->{start} 250 | and $j->{start} < $i->{end} 251 | and $i->{end} < $j->{end}) { 252 | $i->{end} = $j->{end}; 253 | print sprintf "Correcting overlapping annotations: '%s' (%d,%d) and '%s' (%d,%d)\n", 254 | substr($corpus, $i->{start}, $i->{end}-$i->{start}), 255 | $i->{start}, $i->{end}, 256 | substr($corpus, $j->{start}, $j->{end}-$j->{start}), 257 | $j->{start}, $j->{end}; 258 | } 259 | } 260 | } 261 | for my $i (@annotations) { 262 | for my $j (@annotations) { 263 | next if $i == $j; 264 | if ($i->{start} < $j->{start} 265 | and $j->{start} < $i->{end} 266 | and $i->{end} < $j->{end}) { 267 | die sprintf "$0: overlapping annotations: '%s' (%d,%d) and '%s' (%d,%d) ***\n", 268 | substr($corpus, $i->{start}, $i->{end}-$i->{start}), 269 | $i->{start}, $i->{end}, 270 | substr($corpus, $j->{start}, $j->{end}-$j->{start}), 271 | $j->{start}, $j->{end}; 272 | } 273 | } 274 | } 275 | 276 | my $result = ''; 277 | 278 | my @pending_annotations = (); 279 | for my $r_par (sort{$a->{start} <=> $b->{start}} @paragraphs) { 280 | my $par_text = substr($corpus, $r_par->{start}, $r_par->{end}-$r_par->{start}); 281 | my $len = length $par_text; 282 | for (my $i= 0; $i<$len; $i++) { 283 | while (@pending_annotations 284 | and $pending_annotations[0]->{end}-$r_par->{start} == $i and $i > 0) { 285 | $result .= '}'; 286 | shift @pending_annotations; 287 | } 288 | while (@annotations 289 | and $annotations[0]->{start}-$r_par->{start} == $i) { 290 | #DEBUG: my $props_string = ''; 291 | if (exists $annotations[0]->{props}->{headpos} 292 | and exists $annotations[0]->{props}->{headstring}) { 293 | $annotations[0]->{props}->{head} = "$annotations[0]->{props}->{headpos}: $annotations[0]->{props}->{headstring}"; 294 | delete $annotations[0]->{props}->{headpos}; 295 | delete $annotations[0]->{props}->{headstring}; 296 | } 297 | my @props_strings = (); 298 | for my $key (sort keys %{$annotations[0]->{props}}) { 299 | my $val = $annotations[0]->{props}->{$key}; 300 | if (not defined $val) { 301 | $val = ""; 302 | } 303 | push @props_strings, "$key=\"$val\""; 304 | } 305 | #my $props_string = join(',', map{"$_=\"$annotations[0]->{props}->{$_}\""} sort keys %{$annotations[0]->{props}}); 306 | my $props_string = join(',', @props_strings); 307 | $props_string = ":$props_string" if $props_string; 308 | #print Dumper $r_annotation; 309 | $result .= "{$annotations[0]->{$REFNAME_FIELD}$props_string "; 310 | unshift @pending_annotations, shift @annotations; 311 | } 312 | $result .= substr($par_text, $i, 1); 313 | } 314 | # closing at the end of the paragraph 315 | while (@pending_annotations) { 316 | $result .= '}'; 317 | shift @pending_annotations; 318 | } 319 | $result .= "\n\n"; 320 | } 321 | 322 | if (@annotations) { 323 | print Dumper \@annotations; 324 | die "$0: *** some annotations left ***\n"; 325 | } 326 | 327 | return $result; 328 | 329 | 330 | } 331 | 332 | 333 | ######################################################################## 334 | # check comment line 335 | ######################################################################## 336 | 337 | sub check_comment_line { 338 | 339 | my @lines = split /\n/, shift; 340 | 341 | for (@lines) { 342 | s/^#\s*(title|source|NOTE)\s*:/#$1:/; 343 | if 
(m/^#\s*(COLOR|TOKENIZATION-TYPE|textid|part-heading)\s*:/) { 344 | $_ =~ s/ //g; 345 | } 346 | } 347 | 348 | return join("\n", @lines); 349 | 350 | } 351 | 352 | 353 | ######################################################################## 354 | # main() 355 | ######################################################################## 356 | 357 | sub confirm_yn { 358 | 359 | my $message = shift || 'Confirm ? (y|n) '; 360 | my $default = shift; 361 | 362 | ITER: { 363 | print $message; 364 | my $ans = ; 365 | print "\n" unless -t STDIN; 366 | return 1 if $ans =~ m/^\s*+y(?:es)?\s*+$/; 367 | return 0 if $ans =~ m/^\s*+n(?:o)?\s*+$/; 368 | return $default if (defined($default) and $ans =~ m/^\s*+$/); 369 | redo ITER; 370 | } 371 | 372 | } 373 | 374 | sub read_file { 375 | my $file = shift; 376 | open my $fh, $file or die "$0: *** can't open $file ***\n"; 377 | local $/ = undef; 378 | my $content = <$fh>; 379 | close $fh or die "$0: *** can't close $file ***\n"; 380 | return $content; 381 | } 382 | 383 | 384 | sub write_file { 385 | my $file = shift; 386 | my $content = shift; 387 | open my $fh, ">", $file or die "$0: *** can't open $file ***\n"; 388 | print $fh $content; 389 | close $fh or die "$0: *** can't close $file ***\n"; 390 | } 391 | 392 | 393 | sub main { 394 | 395 | get_cl_parameters(); 396 | 397 | if (-e $OUTPUT_FILE) { 398 | return '' unless (confirm_yn("File $OUTPUT_FILE exists. Overwrite [Y/n]?", 1)); 399 | } 400 | 401 | my $sacr = parse( 402 | read_file($INPUT_ANNOTATIONS), 403 | read_file($INPUT_CORPUS) ); 404 | 405 | $sacr = check_comment_line($sacr); 406 | 407 | if ($OUTPUT_FILE) { 408 | write_file($OUTPUT_FILE, $sacr); 409 | } else { 410 | print $sacr; 411 | } 412 | 413 | return $OUTPUT_FILE; 414 | } 415 | 416 | 417 | main() 418 | and print "$0: done!\n"; 419 | 420 | 421 | -------------------------------------------------------------------------------- /jsonlines2conll.py: -------------------------------------------------------------------------------- 1 | """Script to convert a jsonlines file to a CoNLL file. 2 | 3 | Use the `-h` and `--help` switches to get detailed help on the options. 4 | 5 | Example command (output uses spaces): 6 | 7 | python3 jsonlines2conll.py -g testing/singe.jsonlines -o ouput.conll 8 | 9 | #begin document (ge/articleswiki_singe.xml); part 000 10 | Singe (0) 11 | 12 | Les (0 13 | singes 0) 14 | sont - 15 | des (0 16 | mammifères - 17 | de - 18 | l' (1 19 | ordre - 20 | des - 21 | de - 22 | les (2 23 | primates 1)|2) 24 | ... 25 | #end document 26 | 27 | 28 | Example command (merging coreference information with an existing conll 29 | file, for example to add predicted coreference): 30 | 31 | python3 jsonlines2conll.py -g testing/singe.jsonlines -o ouput.conll \ 32 | -c testing/singe.conll 33 | 34 | #begin document (ge/articleswiki_singe.xml); part 000 35 | 1 Singe Singe NOUN ... 36 | 37 | 1 Les le DET ... 38 | 2 singes singe NOUN ... 39 | 3 sont être AUX ... 40 | 4 des un DET ... 41 | 5 mammifères mammifère NOUN ... 42 | 6 de de ADP ... 43 | 7 l' le DET ... 44 | 8 ordre ordre NOUN ... 45 | 9-10 des _ _ ... 46 | 9 de de ADP ... 47 | 10 les le DET ... 48 | 11 primates primate NOUN ... 49 | ... 
50 | #end document 51 | 52 | 53 | Example command (merging + output uses tabulation): 54 | 55 | python3 jsonlines2conll.py -g testing/singe.jsonlines -o ouput.conll -c testing/singe.conll -T 56 | """ 57 | 58 | import argparse 59 | import json 60 | 61 | import conll_transform 62 | 63 | 64 | def jsonlines2conll(*fpaths, cols=None, predicted_clusters=True, 65 | merge_with=None, outfpath=None, tabsep=False): 66 | 67 | if cols is None: 68 | cols = ['sentences'] 69 | 70 | docs = dict() 71 | 72 | for line in (l for fpath in fpaths for l in open(fpath)): 73 | 74 | data = json.loads(line) 75 | doc_key = data["doc_key"] 76 | 77 | sents = [ 78 | # token is just right: a tuple of col 79 | [list(token) for token in zip(*sent)] 80 | # sent is: [ sent1_tokens, sent2_speakers,... ] 81 | for sent in zip(*[iter(data[col]) for col in cols]) 82 | ] 83 | 84 | chains = data['predicted_clusters' 85 | if predicted_clusters else 'clusters'] 86 | 87 | mentions = [ m for chain in chains for m in chain ] 88 | conll_transform.textpos2sentpos(mentions, sents) 89 | 90 | conll_transform.write_chains(sents, chains, append=True) 91 | 92 | docs[doc_key] = sents 93 | 94 | if merge_with: 95 | conll_transform.replace_coref_col(docs, merge_with) 96 | docs = merge_with 97 | 98 | if outfpath: 99 | conll_transform.write_file(outfpath, docs, sep="\t" if tabsep else None) 100 | 101 | return docs 102 | 103 | 104 | def parse_args(): 105 | # definition 106 | parser = argparse.ArgumentParser(prog="jsonlines2conll", 107 | description="convert jsonlines to conll", 108 | #description=__doc__, 109 | formatter_class=argparse.RawDescriptionHelpFormatter) 110 | # arguments (not options) 111 | parser.add_argument("infpaths", nargs="+", help="input files") 112 | # options 113 | parser.add_argument("-g", "--gold", dest="gold_clusters", 114 | default=False, action="store_true", 115 | help="use gold clusters instead of predicted clusters") 116 | parser.add_argument("-t", "--in-tab-sep", dest="intabsep", 117 | default=False, action="store_true", 118 | help="input conll files use tab as separator") 119 | parser.add_argument("-T", "--out-tab-sep", dest="outtabsep", 120 | default=False, action="store_true", 121 | help="output conll files use tab as separator") 122 | parser.add_argument("-o", dest="outfpath", required=True, 123 | help="output file") 124 | parser.add_argument("-c", "--conll", dest="conll_files", action="append", 125 | default=[], 126 | help="conll files to merge with, may be repeated") 127 | parser.add_argument("--cols", dest="cols", default='sentences', 128 | help="comma separated list of cols to include, in order " 129 | "(default: 'sentences')") 130 | # reading 131 | args = parser.parse_args() 132 | return args 133 | 134 | 135 | def main(): 136 | 137 | args = parse_args() 138 | 139 | if args.conll_files: 140 | merge_with = conll_transform.read_files(*args.conll_files, 141 | sep="\t" if args.intabsep else None) 142 | else: 143 | merge_with = None 144 | 145 | jsonlines2conll( 146 | *args.infpaths, 147 | outfpath=args.outfpath, 148 | predicted_clusters=not args.gold_clusters, 149 | merge_with=merge_with, 150 | cols=args.cols.split(','), 151 | tabsep=args.outtabsep, 152 | ) 153 | 154 | 155 | if __name__ == '__main__': 156 | main() 157 | 158 | -------------------------------------------------------------------------------- /jsonlines2text.py: -------------------------------------------------------------------------------- 1 | """Script to convert from a jsonlines file to a text representation of 2 | coreference annotation. 
The output is html. Mentions are surrounded by 3 | brackets. Coreference chains are represented by colors (each chain has 4 | a specific color) and, if requested by a switch, an index (1, 2, 3...). 5 | Singletons may be hidden or shown in a specific color (gray by default), 6 | without any index. 7 | 8 | If your jsonlines file contains several documents, you may show the 9 | document name by using the `--heading` option. 10 | 11 | Here is a minimal example: 12 | 13 | python3 jsonlines2text.py testing/docs.jsonlines -o output.html 14 | 15 | Use the `-h` and `--help` switches to get a detailed list of options. 16 | """ 17 | 18 | 19 | import argparse 20 | import json 21 | 22 | from standoff2inline import Highlighter, highlight 23 | from color_manager import ColorManager, CommonColorManager 24 | 25 | 26 | 27 | def sort_mentions(clusters): 28 | res = [] 29 | for cluster in clusters: 30 | cluster = sorted(cluster, key=lambda x: x[1], reverse=True) 31 | cluster = sorted(cluster, key=lambda x: x[0]) 32 | res.append(cluster) 33 | return res 34 | 35 | 36 | 37 | def sort_clusters(clusters): 38 | clusters = sorted(clusters, key=lambda x: x[0][1], reverse=True) 39 | clusters = sorted(clusters, key=lambda x: x[0][0]) 40 | return clusters 41 | 42 | 43 | 44 | 45 | def highlight_clusters(tokens, clusters, paragraphs, *, singleton_color, 46 | color_manager, add_indices): 47 | 48 | clusters = sort_mentions(clusters) 49 | clusters = sort_clusters(clusters) 50 | 51 | if color_manager == "complete": 52 | cm = ColorManager(hue_step=25, saturation_step=25, lightness_step=10) 53 | elif color_manager == "common": 54 | cm = CommonColorManager() 55 | else: 56 | cm = None 57 | 58 | hls = [] 59 | 60 | if paragraphs: 61 | hl = Highlighter( 62 | prefix="
<p>", 63 | suffix="</p>
" 64 | ) 65 | for start, end in paragraphs: 66 | hl.add_mark(start, end) 67 | hls.append(hl) 68 | 69 | counter = 1 70 | 71 | for i, cluster in enumerate(clusters, start=1): 72 | hl = None 73 | if len(cluster) == 1: 74 | if singleton_color == "": 75 | pass 76 | else: 77 | color = (cm.gray if cm else 'gray') \ 78 | if singleton_color is None else singleton_color 79 | start_span = f'' 80 | end_span = "" 81 | hl = Highlighter( 82 | prefix=f'{start_span}[{end_span}', 83 | suffix=f'{start_span}]{end_span}') 84 | else: 85 | color = cm.get_next_color() if cm else "black" 86 | start_span = f'' 87 | end_span = "" 88 | index = f"{counter}{end_span}" if add_indices else "" 89 | hl = Highlighter( 90 | prefix=f"{start_span}[{end_span}", 91 | suffix=f"{start_span}]{index}" 92 | ) 93 | counter += 1 94 | if hl is not None: # None if only singletons, and they must not be 95 | # marked, or empty document 96 | for start, end in cluster: 97 | hl.add_mark(start, end) 98 | hls.append(hl) 99 | 100 | res = highlight(tokens, *hls) 101 | 102 | return res 103 | 104 | 105 | 106 | 107 | 108 | def filter_tokens(tokens, clusters, n): 109 | tokens = tokens[:n] 110 | new_clusters = [] 111 | for cluster in clusters: 112 | new_cluster = [] 113 | for mention in cluster: 114 | if mention[0] < n and mention[1] < n: 115 | new_cluster.append(mention) 116 | if new_cluster: 117 | new_clusters.append(new_cluster) 118 | return tokens, new_clusters 119 | 120 | 121 | 122 | def convert(doc, gold, n, **kwargs): 123 | tokens = [t for sent in doc['sentences'] for t in sent] 124 | if gold: 125 | clusters = doc.get('clusters', list()) 126 | else: 127 | clusters = doc.get('predicted_clusters', doc.get('clusters', list())) 128 | if n: 129 | tokens, clusters = filter_tokens(tokens, clusters, n) 130 | paragraphs = doc.get('paragraphs') 131 | res = highlight_clusters(tokens, clusters, paragraphs, **kwargs) 132 | return res 133 | 134 | 135 | 136 | def parse_args(): 137 | # definition 138 | parser = argparse.ArgumentParser(prog="jsonlines2text", 139 | description=__doc__, 140 | formatter_class=argparse.RawDescriptionHelpFormatter) 141 | # arguments (not options) 142 | #parser.add_argument("infpaths", nargs="+", help="input files") 143 | parser.add_argument("infpath", default="", help="input file") 144 | #parser.add_argument("outfpath", default="", help="output file") 145 | # options 146 | parser.add_argument("-o", dest="outfpath", help="output file") 147 | parser.add_argument("--cm", "--color-manager", dest="color_manager", 148 | default="complete", 149 | help="color manager: \"\", \"complete\" (the default), \"common\"") 150 | parser.add_argument("--sing-color", dest="singleton_color", 151 | help="singleton color: COLOR (default is 'gray') or \"\" to hide " 152 | "singleton markers", default=None), 153 | parser.add_argument("-i", "--add-indices", dest="add_indices", 154 | default=False, action="store_true", 155 | help="add indices to each chain and mention") 156 | parser.add_argument("-g", "--gold", dest="gold", default=False, 157 | action="store_true", 158 | help="use the 'clusters' key even if a 'predicted_clusters' key is " 159 | "present") 160 | parser.add_argument("-n", dest="n", default=0, type=int, 161 | help="number of tokens to consider from the beginning of the text") 162 | parser.add_argument("--heading", dest="heading", default="
<h1>%s</h1>", 163 | help="template for text name, default is '<h1>%s</h1>
'. Leave " 164 | "blank to ignore doc name") 165 | # reading 166 | args = parser.parse_args() 167 | return args 168 | 169 | 170 | 171 | def main(): 172 | args = parse_args() 173 | res = "" 174 | for line in open(args.infpath): 175 | doc = json.loads(line) 176 | if args.heading: 177 | if "%s" in args.heading: 178 | res += args.heading % doc['doc_key'] 179 | else: 180 | res += args.heading 181 | res += convert(doc, n=args.n, gold=args.gold, 182 | singleton_color=args.singleton_color, 183 | color_manager=args.color_manager, add_indices=args.add_indices 184 | ) 185 | if args.outfpath: 186 | open(args.outfpath, 'w').write(res) 187 | else: 188 | print(res) 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | 195 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | exclude=^(color_manager|conll2jsonlines|conll2sacr|conll_transform|jsonlines2conll|jsonlines2text|sacr2conll|sacr_parser|text2jsonlines|standoff2inline)\.py$ 3 | strict=true 4 | disable_error_code=override 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile="black" 3 | skip_gitignore=true 4 | skip=["color_manager.py","conll2jsonlines.py","conll2sacr.py","conll_transform.py","jsonlines2conll.py","jsonlines2text.py","sacr2conll.py","sacr_parser.py","text2jsonlines.py","standoff2inline.py"] 5 | 6 | [tool.black] 7 | extend-exclude='(color_manager|conll2jsonlines|conll2sacr|conll_transform|jsonlines2conll|jsonlines2text|sacr2conll|sacr_parser|text2jsonlines|standoff2inline).py' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black==23.3.0 2 | flake8==6.0.0 3 | isort==5.12.0 4 | pytest==7.2.2 5 | mypy==1.1.1 6 | coverage==7.2.3 7 | pandas==2.0.0 8 | -------------------------------------------------------------------------------- /sacr2ann.py: -------------------------------------------------------------------------------- 1 | """Convert a sacr file to an ann/txt files (BRAT standoff annotations). 2 | 3 | The script will produce two files, one for the text and one for the annotations. 4 | 5 | Annotations are of the form: 6 | 7 | T1 Person 0 9 A Peasant 8 | T2 Animal 16 43 an Eagle captured in a trap 9 | T3 Object 37 43 a trap 10 | T4 Animal 62 70 the bird 11 | R1 Coreference Arg1:T2 Arg2:T4 12 | T5 Person 76 79 him 13 | R2 Coreference Arg1:T1 Arg2:T5 14 | 15 | Note that only a subset of the BRAT format is implemented for now, namely 16 | the text-bound annotations and the relations. 17 | 18 | Please consult the README file for more information. 
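For example, assuming an input file named `input.sacr` (the file names here
are only placeholders), a typical invocation is:

    python3 sacr2ann.py input.sacr --txt input.txt --ann input.ann

When `--txt` or `--ann` is omitted, the corresponding output path defaults to
the input file name with `.txt` or `.ann` appended.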
19 | """ 20 | 21 | from __future__ import annotations 22 | 23 | import argparse 24 | from argparse import Namespace 25 | from collections import defaultdict 26 | from dataclasses import dataclass 27 | from pathlib import Path 28 | 29 | from sacr_parser2 import ( 30 | MentionEnd, 31 | MentionStart, 32 | ParagraphEnd, 33 | SacrParser, 34 | Spaces, 35 | Word, 36 | ) 37 | 38 | DEFAULT_MENTION_TYPE = "Mention" 39 | DEFAULT_RELATION_TYPE = "Coreference" 40 | 41 | 42 | @dataclass 43 | class Annotation: 44 | index: int 45 | kind: str 46 | 47 | def __eq__(self, other: Annotation) -> bool: 48 | return self.index == other.index and self.kind == other.kind 49 | 50 | 51 | @dataclass 52 | class TextAnnotation(Annotation): 53 | start: int 54 | end: int 55 | 56 | def __eq__(self, other: TextAnnotation) -> bool: 57 | return ( 58 | super().__eq__(other) 59 | and self.start == other.start 60 | and self.end == other.end 61 | ) 62 | 63 | 64 | @dataclass 65 | class RelationAnnotation(Annotation): 66 | source: Annotation 67 | target: Annotation 68 | 69 | def __eq__(self, other: RelationAnnotation) -> bool: 70 | return ( 71 | super().__eq__(other) 72 | and self.source == other.source 73 | and self.target == other.target 74 | ) 75 | 76 | 77 | class Sacr2AnnConverter: 78 | def __init__(self, type_property_name: str | None = None): 79 | self.type_property_name = type_property_name 80 | self._text: str | None = None 81 | self._annotations: list[Annotation] | None = None 82 | 83 | def convert(self, source: str | Path) -> None: 84 | parser = SacrParser(source=source) 85 | 86 | text: str = "" 87 | annotations: list[Annotation] = [] 88 | text_annotation_count: int = 0 89 | relation_annotation_count: int = 0 90 | 91 | chains: dict[int, list[TextAnnotation]] = defaultdict(list) 92 | filo: list[TextAnnotation] = [] 93 | 94 | for token in parser.parse(): 95 | start_position = len(text) 96 | 97 | if isinstance(token, (Word, Spaces)): 98 | text += token.value 99 | elif isinstance(token, ParagraphEnd): 100 | text += "\n\n" 101 | 102 | elif isinstance(token, MentionStart): 103 | text_annotation_count += 1 104 | if self.type_property_name: 105 | kind = token.features.get( 106 | self.type_property_name, DEFAULT_MENTION_TYPE 107 | ) 108 | else: 109 | kind = DEFAULT_MENTION_TYPE 110 | text_annotation = TextAnnotation( 111 | index=text_annotation_count, 112 | kind=kind, 113 | start=start_position, 114 | end=0, 115 | ) 116 | filo.append(text_annotation) 117 | annotations.append(text_annotation) 118 | 119 | if token.chain_index in chains: 120 | relation_annotation_count += 1 121 | relation_annotation = RelationAnnotation( 122 | index=relation_annotation_count, 123 | kind=DEFAULT_RELATION_TYPE, 124 | source=chains[token.chain_index][-1], 125 | target=text_annotation, 126 | ) 127 | annotations.append(relation_annotation) 128 | 129 | chains[token.chain_index].append(text_annotation) 130 | 131 | elif isinstance(token, MentionEnd): 132 | text_annotation = filo.pop() 133 | text_annotation.end = len(text) 134 | 135 | self._text = text 136 | self._annotations = annotations 137 | 138 | @property 139 | def text(self) -> str: 140 | if self._text is None: 141 | raise RuntimeError("You need to parse before reading the text property") 142 | return self._text 143 | 144 | @property 145 | def annotations(self) -> list[Annotation]: 146 | if self._annotations is None: 147 | raise RuntimeError( 148 | "You need to parse before reading the annotations property" 149 | ) 150 | return self._annotations 151 | 152 | def write_text_to_file(self, file: Path) -> 
None: 153 | file.write_text(self.text) 154 | 155 | @staticmethod 156 | def _convert_annotations_as_string(text: str, annotations: list[Annotation]) -> str: 157 | string = "" 158 | for annotation in annotations: 159 | if isinstance(annotation, TextAnnotation): 160 | span = text[annotation.start : annotation.end] 161 | string += f"T{annotation.index}\t{annotation.kind} {annotation.start} {annotation.end}\t{span}\n" 162 | elif isinstance(annotation, RelationAnnotation): 163 | string += f"R{annotation.index}\t{annotation.kind} Arg1:T{annotation.source.index} Arg2:T{annotation.target.index}\n" 164 | else: 165 | raise RuntimeError( 166 | "unknown annotation type: " + annotation.__class__.__name__ 167 | ) 168 | return string 169 | 170 | @property 171 | def annotations_as_string(self) -> str: 172 | return self._convert_annotations_as_string(self.text, self.annotations) 173 | 174 | def write_annotations_to_file(self, file: Path) -> None: 175 | file.write_text(self.annotations_as_string) 176 | 177 | 178 | def convert( 179 | input_file: Path, txt_output: Path, ann_output: Path, type_property_name: str 180 | ) -> None: 181 | converter = Sacr2AnnConverter(type_property_name=type_property_name) 182 | converter.convert(input_file) 183 | converter.write_text_to_file(txt_output) 184 | converter.write_annotations_to_file(ann_output) 185 | 186 | 187 | def parse_args() -> Namespace: 188 | parser = argparse.ArgumentParser( 189 | prog="sacr2ann", 190 | description=__doc__, 191 | formatter_class=argparse.RawDescriptionHelpFormatter, 192 | ) 193 | parser.add_argument("input", help="input file") 194 | parser.add_argument( 195 | "--txt", 196 | dest="txt_output", 197 | default=None, 198 | help="output file, default is input file name + .txt", 199 | ) 200 | parser.add_argument( 201 | "--ann", 202 | dest="ann_output", 203 | default=None, 204 | help="output file, default is input file name + .ann", 205 | ) 206 | parser.add_argument( 207 | "--type-property-name", 208 | default=None, 209 | help=f"name of the property where to find the type of text annotation. If not given, '{DEFAULT_MENTION_TYPE}' is used as the type", 210 | ) 211 | args = parser.parse_args() 212 | return args 213 | 214 | 215 | def main() -> None: 216 | args = parse_args() 217 | convert( 218 | input_file=Path(args.input), 219 | txt_output=Path(args.txt_output or (args.input + ".txt")), 220 | ann_output=Path(args.ann_output or (args.input + ".ann")), 221 | type_property_name=args.type_property_name, 222 | ) 223 | 224 | 225 | if __name__ == "__main__": 226 | main() 227 | -------------------------------------------------------------------------------- /sacr2annotable.py: -------------------------------------------------------------------------------- 1 | """Convert a corpus of SACR texts into a Corpus (from annotable.py) that can be 2 | used to output dataframes. 3 | 4 | It is a class which should be used as follows: 5 | 6 | files = [ 7 | Path("file1.sacr"), 8 | Path("file2.sacr"), 9 | Path("file3.sacr"), 10 | # ... 
11 | ] 12 | 13 | converter = Sacr2AnnotableConverter() 14 | for file in files: 15 | converter.convert_text(file) 16 | corpus = converter.corpus 17 | 18 | dataframes = corpus.get_dataframes() 19 | """ 20 | 21 | from __future__ import annotations 22 | 23 | import re 24 | from pathlib import Path 25 | 26 | from annotable import Corpus, Mention, Paragraph, Sentence, Text, Token 27 | from sacr_parser2 import ( 28 | Comment, 29 | MentionEnd, 30 | MentionStart, 31 | ParagraphEnd, 32 | SacrParser, 33 | SentenceChange, 34 | Spaces, 35 | TextID, 36 | Word, 37 | ) 38 | 39 | TEXT_METADATA_PATTERN = re.compile(r"textmetadata\s*:\s*(\w+)\s*=\s*(.*)") 40 | 41 | 42 | class Sacr2AnnotableConverter: 43 | def __init__(self) -> None: 44 | self.corpus: Corpus = Corpus() 45 | 46 | def convert_text(self, source: str | Path) -> None: 47 | parser = SacrParser(source=source) 48 | 49 | text: Text = Text() 50 | current_paragraph: Paragraph = Paragraph() 51 | current_sentence: Sentence = Sentence() 52 | filo: list[Mention] = [] 53 | 54 | for token in parser.parse(): 55 | if isinstance(token, Spaces): 56 | for mention in filo: 57 | mention.string += token.value 58 | 59 | elif isinstance(token, Word): 60 | t = Token(token.start, token.end, token.value) 61 | for mention in filo: 62 | mention.add_token(t) 63 | mention.string += token.value 64 | current_sentence.add_token(t) 65 | 66 | elif isinstance(token, TextID): 67 | text.name = token.text_id 68 | 69 | elif isinstance(token, ParagraphEnd): 70 | if current_sentence.token_count: 71 | current_paragraph.add_sentence(current_sentence) 72 | current_sentence = Sentence() 73 | text.add_paragraph(current_paragraph) 74 | current_paragraph = Paragraph() 75 | 76 | elif isinstance(token, SentenceChange): 77 | if current_sentence.token_count: 78 | current_paragraph.add_sentence(current_sentence) 79 | current_sentence = Sentence() 80 | 81 | elif isinstance(token, MentionStart): 82 | mention = Mention(chain_name=token.chain_name, string="") 83 | for k, v in token.features.items(): 84 | mention[k] = v 85 | current_sentence.add_mention(mention) 86 | filo.append(mention) 87 | 88 | elif isinstance(token, MentionEnd): 89 | filo.pop() 90 | 91 | elif isinstance(token, Comment): 92 | if m := TEXT_METADATA_PATTERN.fullmatch(token.value): 93 | text.metadata[m.group(1)] = m.group(2) 94 | 95 | if current_sentence.token_count: 96 | current_paragraph.add_sentence(current_sentence) 97 | if current_paragraph.sentence_count: 98 | text.add_paragraph(current_paragraph) 99 | 100 | self.corpus.add_text(text) 101 | -------------------------------------------------------------------------------- /sacr2conll.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert a SACR file (http://boberle.com/projects/sacr) to a conll file. The 3 | conll format produced is tabulation separated with three columns: index, word 4 | and coreference. 5 | 6 | To convert: 7 | 8 | python3 sacr2conll.py -o OUTPUT.conll INPUT.sacr 9 | 10 | You can specify the document name (or key) with the `--docname` option. 11 | Otherwise, it will be `#textid`, if any, otherwise the file name. 12 | 13 | With the --speaker switch, you can add a 4th column, which will be placed 14 | before the coreference columns. In the SACR file, the speaker can be mentionned 15 | as a comment prefixed with `#speaker:` before each line, like this: 16 | 17 | #title: Lucian, Dialogues of the Dead, 4: Hermes and Charon 18 | 19 | #speaker: Hermes 20 | Ferryman, what do you say to settling up accounts? 
It will prevent any 21 | unpleasantness later on. 22 | 23 | #speaker: Charon 24 | Very good. It does save trouble to get these things straight. 25 | 26 | You can remove the speaker for a paragraph by setting: 27 | 28 | #speaker: 29 | ... the text of the narrator ... 30 | """ 31 | 32 | 33 | import os 34 | import argparse 35 | import re 36 | 37 | import sacr_parser 38 | 39 | __version__ = "1.0.0" 40 | 41 | 42 | def read_file(fpath, index, docname=None, part_is_index=True, include_speaker=False): 43 | 44 | parser = sacr_parser.SacrParser( 45 | fpath=fpath, 46 | tokenization_mode=sacr_parser.WORD_TOKENIZATION, 47 | ) 48 | 49 | tokens = [] 50 | starts = dict() # start -> {ids} 51 | ends = dict() # end -> {ids} 52 | sentences = set() # index of last tokens 53 | 54 | filo = [] 55 | 56 | textid = None 57 | speaker = "" 58 | 59 | for item, params in parser.parse(): 60 | 61 | if item == "text_id": 62 | textid = params 63 | 64 | elif item in ("par_start", "par_end", "sentence_change"): 65 | if tokens: 66 | sentences.add(len(tokens)) 67 | 68 | elif item == "mention_start": 69 | chain = params[0] 70 | l = len(tokens) 71 | if l not in starts: 72 | starts[l] = [] 73 | starts[l].append(chain) 74 | filo.append(chain) 75 | 76 | elif item == "comment": 77 | if params.startswith("speaker:"): 78 | speaker = params[8:].strip().replace(" ", "_") 79 | 80 | elif item == "mention_end": 81 | chain = filo.pop() 82 | l = len(tokens) - 1 83 | if l not in ends: 84 | ends[l] = [] 85 | ends[l].append(chain) 86 | 87 | elif item == "token": 88 | tokens.append((params, speaker)) 89 | 90 | lines = [] 91 | 92 | counter = 0 93 | for i, (token, speaker) in enumerate(tokens): 94 | if i in sentences: 95 | lines.append("") 96 | counter = 0 97 | corefcol = "_".join( 98 | # ["(%d)" % x for x in (starts[i] 99 | # if (i in starts and i in ends) else [])] 100 | # + ["(%d" % x for x in (starts[i] 101 | ["(%d" % x for x in (starts[i] if i in starts else [])] 102 | + ["%d)" % x for x in (ends[i] if i in ends else [])] 103 | ) 104 | corefcol = re.sub(r"\((\d+)_\1\)", r"(\1)", corefcol) 105 | if not corefcol: 106 | corefcol = "-" 107 | if include_speaker: 108 | cols = [str(counter), token, speaker, corefcol] 109 | else: 110 | cols = [str(counter), token, corefcol] 111 | lines.append("\t".join(cols)) 112 | counter += 1 113 | 114 | if not docname: 115 | docname = textid if textid else os.path.basename(fpath) 116 | res = "#begin document (%s); part %03d\n" % (docname, index if part_is_index else 0) 117 | res += "\n".join(lines) 118 | res += "\n#end document\n" 119 | return res 120 | 121 | 122 | def parse_args(): 123 | # definition 124 | parser = argparse.ArgumentParser( 125 | prog="sacr2conll", 126 | # description="convert sacr files to conll file", 127 | description=__doc__, 128 | formatter_class=argparse.RawDescriptionHelpFormatter, 129 | ) 130 | # arguments (not options) 131 | parser.add_argument("infpaths", nargs="+", help="input files") 132 | # options 133 | parser.add_argument( 134 | "-o", dest="outfpath", default="", help="output file, default is stdout" 135 | ) 136 | parser.add_argument( 137 | "-n", 138 | "--docname", 139 | dest="docname", 140 | default="", 141 | help="document name; otherwise #textid; otherwise file name", 142 | ) 143 | parser.add_argument( 144 | "-i", 145 | "--index", 146 | dest="part_is_index", 147 | default=False, 148 | action="store_true", 149 | help="document part is file index (otherwise the part is 0; " 150 | "this is implied by --docname", 151 | ) 152 | parser.add_argument( 153 | "-s", 154 | "--speaker", 155 | 
default=False, 156 | action="store_true", 157 | help="include a column 'speaker' before the coref column", 158 | ) 159 | # special options 160 | parser.add_argument( 161 | "--version", action="version", version="%(prog)s " + __version__ 162 | ) 163 | # reading 164 | args = parser.parse_args() 165 | # check 166 | if args.docname: 167 | args.part_is_index = True 168 | return args 169 | 170 | 171 | def main(): 172 | args = parse_args() 173 | res = [] 174 | for i, fpath in enumerate(args.infpaths): 175 | res.append( 176 | read_file( 177 | fpath, index=i, docname=args.docname, part_is_index=args.part_is_index, include_speaker=args.speaker 178 | ) 179 | ) 180 | res = "\n\n".join(res) 181 | if args.outfpath: 182 | open(args.outfpath, "w").write(res) 183 | else: 184 | print(res) 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /sacr2df.py: -------------------------------------------------------------------------------- 1 | """Convert a corpus of texts annotated with SACR in a series of dataframes or 2 | CSV files. 3 | 4 | This dataframes/files model: 5 | - corpus, texts, sentences, tokens 6 | - mentions, chains and relations 7 | 8 | Please see the README file for a detail description. 9 | 10 | You can use the script in the CLI: 11 | 12 | python3 sacr2df.py text1.sacr text2.sacr ... -o output_file.zip 13 | 14 | or as a library, for example in a Jupyter notebook: 15 | 16 | from sacr2df import convert_sacr_files_to_dataframes 17 | from pathlib import Path 18 | 19 | dfs = convert_sacr_files_to_dataframes( 20 | Path("testing/aesop.sacr"), 21 | Path("testing/caesar.sacr"), 22 | Path("testing/cicero.sacr"), 23 | Path("testing/pliny.sacr"), 24 | ) 25 | 26 | # then do something with the dfs: 27 | print(dfs.texts.head()) 28 | print(dfs.paragraphs.head()) 29 | print(dfs.sentences.head()) 30 | print(dfs.tokens.head()) 31 | print(dfs.text_chains.head()) 32 | print(dfs.text_mentions.head()) 33 | print(dfs.text_consecutive_relations.head()) 34 | print(dfs.text_to_first_relations.head()) 35 | """ 36 | 37 | import argparse 38 | from argparse import Namespace 39 | from pathlib import Path 40 | 41 | from annotable import DataFrameSet 42 | from sacr2annotable import Sacr2AnnotableConverter 43 | 44 | 45 | def convert_sacr_files_to_dataframes( 46 | *files: Path, output_file: Path | None = None 47 | ) -> DataFrameSet: 48 | conv = Sacr2AnnotableConverter() 49 | for file in files: 50 | conv.convert_text(file) 51 | corpus = conv.corpus 52 | 53 | if output_file: 54 | corpus.save_csv_as_zip(output_file) 55 | 56 | return corpus.get_dataframes() 57 | 58 | 59 | def parse_args() -> Namespace: 60 | parser = argparse.ArgumentParser( 61 | prog="sacr2df", 62 | description=__doc__, 63 | formatter_class=argparse.RawDescriptionHelpFormatter, 64 | ) 65 | parser.add_argument("input_files", nargs="+", help="input files") 66 | parser.add_argument( 67 | "--output_file", 68 | "-o", 69 | required=True, 70 | help="output file. 
This is a zip file containing the csv", 71 | ) 72 | args = parser.parse_args() 73 | return args 74 | 75 | 76 | def main() -> None: 77 | args = parse_args() 78 | convert_sacr_files_to_dataframes( 79 | *[Path(f) for f in args.input_files], 80 | output_file=Path(args.output_file), 81 | ) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /sacr2glozz.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings FATAL=>'all'; 4 | use open ':utf8'; 5 | use utf8; 6 | 7 | use Data::Dumper; 8 | 9 | ######################################################################## 10 | # Global variables 11 | ######################################################################## 12 | 13 | my $USER = 'me'; 14 | my $MIN_NB_OF_LINKS = 0; 15 | my $REFNAME_PROPERTY = ''; # if empty, don't use 16 | my $USE_SCHEMATA = ''; 17 | my $INPUT_FILE = ''; 18 | my $OUTPUT_FILE_CORPUS = ''; 19 | my $OUTPUT_FILE_ANNOTATIONS = ''; 20 | my $OUTPUT_FILE_MODEL = ''; 21 | my $DONT_KEEP_COMMENTS = ''; 22 | my $EMPTY_REFNAME_FIELD = ''; 23 | my $EMPTIED_REFNAME_FIELD_VALUE = ''; 24 | my $BUILD_GLOZZ_MODEL = ''; 25 | my $EXPLODE_HEAD = ''; 26 | my $LINK_NAME = ''; 27 | my %FILTER = (); 28 | 29 | $Data::Dumper::Terse = 1; 30 | $Data::Dumper::Indent = 1; 31 | 32 | ######################################################################## 33 | # Get the CLI parameters 34 | ######################################################################## 35 | 36 | my $HELP =<<"END"; 37 | USAGE 38 | $0 [OPTIONS] INPUT OUTPUT 39 | 40 | DESCRIPTION 41 | Convert a SACR file to a couple of Glozz files (.ac and .aa). 42 | Do not specify the extensions (.ac/.aa) for the output file. 43 | 44 | OPTIONS (-o value --opt value) 45 | -h Print help. 46 | -m --min VALUE The minimum length of a chain. If -e AND -p are set, then 47 | the chains with less links have the value specified in -e. 48 | Otherwise, they are excluded. 49 | Default is 0 (all links are included). 50 | -e VALUE Put VALUE in the the PROP_NAME property (if the -p option is 51 | used) for chains with less than -m. (E.g. "" or "SI" for 52 | SIngleton.) 53 | -p PROP_NAME Include a property PROP_NAME with the name of the referent. 54 | If empty string, don't use. 55 | -s --schema Include schemata. 56 | -K Don't keep comments. 57 | -e Explode head property into 'headpos' and 'headstring'. 58 | -f REFNAME Include only REFNAME (this option can be repeated). 59 | --model Build a Glozz annotation model (.aam). 60 | --link-name VAL Name of the link (like 'link', 'mention', 'markable', etc.). 61 | Default is 'MENTION'. 
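EXAMPLE
    A typical call (file names are placeholders): keep the referent name in a
    REF property, include schemata and build the annotation model:

    $0 -p REF -s --model myfile.sacr myfile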
62 | END 63 | 64 | sub get_cl_parameters { 65 | 66 | # default 67 | $MIN_NB_OF_LINKS = 0; 68 | $REFNAME_PROPERTY = ''; 69 | $USE_SCHEMATA = ''; 70 | $INPUT_FILE = ''; 71 | $OUTPUT_FILE_CORPUS = ''; 72 | $OUTPUT_FILE_ANNOTATIONS = ''; 73 | $OUTPUT_FILE_MODEL = ''; 74 | $EXPLODE_HEAD = ''; 75 | $DONT_KEEP_COMMENTS = ''; 76 | $EMPTY_REFNAME_FIELD = ''; 77 | $EMPTIED_REFNAME_FIELD_VALUE = ''; 78 | $BUILD_GLOZZ_MODEL = ''; 79 | $LINK_NAME = 'MENTION'; 80 | 81 | my $pending = ''; 82 | 83 | for (@ARGV) { 84 | print $HELP and exit if m/^(?:-h|--?help)$/; 85 | last if $pending and m/^-/; 86 | # pending 87 | if ($pending eq '-m' and m/^\d++$/) { 88 | $MIN_NB_OF_LINKS = $_; 89 | $pending = ''; 90 | } elsif ($pending eq '-f') { 91 | $FILTER{$_} = 1; 92 | $pending = ''; 93 | } elsif ($pending eq '-e') { 94 | $EMPTY_REFNAME_FIELD = 1; 95 | $EMPTIED_REFNAME_FIELD_VALUE = $_; 96 | $pending = ''; 97 | } elsif ($pending eq '-p') { 98 | $REFNAME_PROPERTY = $_; 99 | $pending = ''; 100 | } elsif ($pending eq '--link-name') { 101 | $LINK_NAME = $_; 102 | $pending = ''; 103 | # end of pending 104 | } elsif ($pending) { 105 | last; 106 | # options with value waiting for next element in @ARGV 107 | } elsif (m/^(?:-m|--min)$/) { 108 | $pending = '-m'; 109 | } elsif (m/^-f$/) { 110 | $pending = '-f'; 111 | } elsif (m/^--link-name$/) { 112 | $pending = '--link-name'; 113 | } elsif (m/^(?:-p|--property)$/) { 114 | $pending = '-p'; 115 | } elsif (m/^(?:-e)$/) { 116 | $pending = '-e'; 117 | # switch (= options with no value) 118 | } elsif (m/^(?:-s|--schema)$/) { 119 | $USE_SCHEMATA = 1; 120 | } elsif (m/^--model$/) { 121 | $BUILD_GLOZZ_MODEL = 1; 122 | } elsif (m/^-K$/) { 123 | $DONT_KEEP_COMMENTS = 1; 124 | } elsif (m/^-e$/) { 125 | $EXPLODE_HEAD = 1; 126 | # end of options 127 | } elsif (m/^-.+/) { 128 | die "$0: *** option '$_' doesn't exists ***\n"; 129 | # arguments, not options 130 | } elsif (!$INPUT_FILE) { 131 | $INPUT_FILE = $_; 132 | } elsif (!$OUTPUT_FILE_CORPUS) { 133 | $OUTPUT_FILE_CORPUS = "$_.ac"; 134 | $OUTPUT_FILE_ANNOTATIONS = "$_.aa"; 135 | $OUTPUT_FILE_MODEL = "$_.aam"; 136 | } else { 137 | die "$0: *** bad argument '$_' ***\n"; 138 | } 139 | } # for 140 | 141 | die "$0: *** missing value for option '$pending' ***\n" if $pending; 142 | 143 | die "$0: *** no input/output file ***\n" 144 | unless $INPUT_FILE and $OUTPUT_FILE_ANNOTATIONS and $OUTPUT_FILE_CORPUS 145 | and $OUTPUT_FILE_MODEL; 146 | 147 | die "$0: *** file '$INPUT_FILE' doesn't exist ***\n" 148 | unless -f-r $INPUT_FILE; 149 | 150 | } 151 | 152 | 153 | ######################################################################## 154 | # parser 155 | ######################################################################## 156 | 157 | sub parse { 158 | 159 | my $content = shift; 160 | my $r_filter = shift; 161 | 162 | my $corpus = ''; 163 | my @paragraphs = (); # format: { start=>0, end=>0 } 164 | my @annotations = (); # format: [ start=>0, end=>0, offset=>LEN_CORPUS, name=>NAME props => {props} ] 165 | my @filoAnnotations = (); 166 | 167 | # each line is a paragraph 168 | for my $line (split /\n/, $content) { 169 | chomp $line; 170 | if ($line =~ m/^\s*+$/) { 171 | # nothing 172 | } elsif ($line =~ m/^\s*+#.*+$/ or $line =~ m/^\*++$/) { 173 | unless ($DONT_KEEP_COMMENTS) { 174 | push @paragraphs, { start=>length($corpus), end=>length($corpus)+length($line) }; 175 | $corpus .= $line; 176 | } else { 177 | # nothing 178 | } 179 | } else { 180 | my $plain_text = ''; 181 | pos($line) = 0; 182 | while (pos($line) < length $line) { 183 | if ($line =~ 
m/\G\{([-_0-9a-zA-Z]++)/gc) { 184 | my $refname = $1; 185 | my %props = (); 186 | if ($line =~ m/\G:/gc) { 187 | while ($line =~ m/\G([-_0-9a-zA-Z]++)=(?:"([^"]*+)"|([-_0-9a-zA-Z]++)),?/gc) { 188 | #print "DEBUG: $1\n"; 189 | if ($1 eq 'head' and $EXPLODE_HEAD and 190 | $2 =~ m/^\s*+(\d++)\s*+:\s*+(.++)$/) { 191 | $props{headpos} = $1; 192 | $props{headstring} = $2; 193 | } else { 194 | if (length $2) { 195 | $props{$1} = $2; 196 | } elsif (length $3) { 197 | $props{$1} = $3; 198 | } else { 199 | $props{$1} = ""; 200 | } 201 | } 202 | } 203 | } 204 | $props{$REFNAME_PROPERTY} = $refname if $REFNAME_PROPERTY; 205 | unless ($line =~ m/\G\s/gc) { 206 | die "$0: *** ill formed line: $line (no space after properties) ***\n"; 207 | } 208 | push @filoAnnotations, { 209 | start => length $plain_text, 210 | end => undef, 211 | name => $refname, 212 | props => { %props }, 213 | offset => length $corpus 214 | }; 215 | } elsif ($line =~ m/\G\}/gc) { 216 | die "$0: *** too many {'s ***\n" unless @filoAnnotations; 217 | $filoAnnotations[-1]->{end} = length $plain_text; 218 | push @annotations, pop @filoAnnotations; 219 | } elsif ($line =~ m/\G(.)/gc) { 220 | $plain_text .= $1; 221 | } 222 | } # while 223 | die "$0: *** filo not empty for line: $line ***\n" if @filoAnnotations; 224 | die "$0: *** string not completed ***\n" unless pos($line) == length($line); 225 | # set the paragraph 226 | push @paragraphs, { start=>length($corpus), end=>length($corpus)+length($plain_text) }; 227 | $corpus .= $plain_text; 228 | } # if 229 | } # for 230 | 231 | my $counter = time(); 232 | my $xml = ''; 233 | for (@paragraphs) { 234 | $xml .= "\n"; 235 | $xml .= "$USER$counter\n"; 236 | $xml .= "paragraph\n"; 237 | $xml .= sprintf ''."\n", 238 | $_->{start}, $_->{end}; 239 | $xml .= "\n"; 240 | $counter++; 241 | } 242 | 243 | my %property_list = (); 244 | for my $annot (@annotations) { 245 | for my $prop (keys %{$annot->{props}}) { 246 | $property_list{$prop} = 1; 247 | } 248 | } 249 | 250 | my %chains = (); 251 | for (@annotations) { 252 | if (exists $chains{$_->{name}}) { 253 | $chains{$_->{name}}++; 254 | } else { 255 | $chains{$_->{name}} = 0; 256 | } 257 | } 258 | 259 | my %schemata = (); # format: REFNAME => [ IDCOUNTER, IDCOUNTER, ... 
] 260 | for (@annotations) { 261 | next if %$r_filter and not $r_filter->{$_->{name}}; 262 | if ($chains{$_->{name}} < $MIN_NB_OF_LINKS) { 263 | if ($EMPTY_REFNAME_FIELD and $REFNAME_PROPERTY) { 264 | $_->{props}->{$REFNAME_PROPERTY} = $EMPTIED_REFNAME_FIELD_VALUE; 265 | } else { 266 | next; 267 | } 268 | } 269 | $xml .= "\n"; 270 | $xml .= "$USER$counter\n"; 271 | $xml .= "\n"; 272 | if ($_->{name} =~ m/^_/) { 273 | (my $name = $_->{name}) =~ s/^_//; 274 | $xml .= "$name\n", ; 275 | } else { 276 | $xml .= "$LINK_NAME\n"; 277 | } 278 | $xml .= "\n"; 279 | for my $k (keys %{$_->{props}}) { 280 | my $val = $_->{props}->{$k}; 281 | $xml .= "$val\n"; 282 | } 283 | $xml .= "\n"; 284 | $xml .= "\n"; 285 | $xml .= sprintf ''."\n", 286 | $_->{start}+$_->{offset}, $_->{end}+$_->{offset}; 287 | $xml .= "\n"; 288 | if (exists $schemata{$_->{name}}) { 289 | # for the format of the ID, see embedded-unit below 290 | push @{$schemata{$_->{name}}}, "${USER}_$counter"; 291 | } else { 292 | $schemata{$_->{name}} = [ "${USER}_$counter" ]; 293 | } 294 | $counter++; 295 | } 296 | 297 | if ($USE_SCHEMATA) { 298 | for my $k (keys %schemata) { 299 | next if $k =~ m/^_/; 300 | next if scalar @{$schemata{$k}} < $MIN_NB_OF_LINKS; 301 | $xml .= "\n"; 302 | $xml .= "$USER$counter\n"; 303 | $xml .= "cr\n"; 304 | $xml .= "\n"; 305 | $xml .= "$k\n"; 306 | $xml .= "\n"; 307 | $xml .= "\n"; 308 | $xml .= "\n"; 309 | for my $id (@{$schemata{$k}}) { 310 | # NOTE: 'id' is not the id of the unit! It is in fact 311 | # "AUTHOR_CREATIONDATE" of the unit, and the 'id' of the 312 | # unit is in fact not used! 313 | $xml .= "\n"; 314 | } 315 | $xml .= "\n"; 316 | $xml .= "\n"; 317 | $counter++; 318 | } 319 | } 320 | 321 | my $model = ""; 322 | 323 | if ($BUILD_GLOZZ_MODEL) { 324 | $model = "\n"; 325 | $model .= "\n"; 326 | $model .= "\n"; 327 | $model .= "\n"; 328 | for my $property (keys %property_list) { 329 | $model .= "\n"; 330 | $model .= "\n"; 331 | $model .= "\n"; 332 | $model .= "\n"; 333 | $model .= "\n"; 334 | 335 | } 336 | $model .= "\n"; 337 | $model .= "\n"; 338 | $model .= "\n"; 339 | $model .= "\n"; 340 | $model .= "\n"; 341 | $model .= "\n"; 342 | $model .= "\n"; 343 | } 344 | 345 | return ($corpus, 346 | "\n\n$xml\n", 347 | $model); 348 | 349 | } 350 | 351 | 352 | ######################################################################## 353 | # Helper functions 354 | ######################################################################## 355 | 356 | 357 | sub confirm_yn { 358 | 359 | my $message = shift || 'Confirm ? 
(y|n) '; 360 | my $default = shift; 361 | 362 | ITER: { 363 | print $message; 364 | my $ans = ; 365 | print "\n" unless -t STDIN; 366 | return 1 if $ans =~ m/^\s*+y(?:es)?\s*+$/; 367 | return 0 if $ans =~ m/^\s*+n(?:o)?\s*+$/; 368 | return $default if (defined($default) and $ans =~ m/^\s*+$/); 369 | redo ITER; 370 | } 371 | 372 | } 373 | 374 | sub read_file { 375 | 376 | my $file = shift; 377 | 378 | open my $fh, $file or die "$0: *** can't open $file ***\n"; 379 | 380 | local $/ = undef; 381 | 382 | my $content = <$fh>; 383 | 384 | close $fh or die "$0: *** can't close $file ***\n"; 385 | 386 | return $content; 387 | 388 | } 389 | 390 | sub write_file { 391 | 392 | my $file = shift; 393 | my $content = shift; 394 | 395 | open my $fh, ">", $file or die "$0: *** can't open $file ***\n"; 396 | print $fh $content; 397 | close $fh or die "$0: *** can't close $file ***\n"; 398 | 399 | } 400 | 401 | 402 | ######################################################################## 403 | # main() 404 | ######################################################################## 405 | 406 | sub main { 407 | 408 | get_cl_parameters(); 409 | 410 | if (-e $OUTPUT_FILE_ANNOTATIONS or -e $OUTPUT_FILE_CORPUS or -e 411 | $OUTPUT_FILE_MODEL) { 412 | return unless (confirm_yn("Output files exist. Overwrite [Y/n]?", 1)); 413 | } 414 | 415 | my $content = read_file($INPUT_FILE); 416 | 417 | my ($corpus, $xml, $model) = parse($content, \%FILTER); 418 | write_file($OUTPUT_FILE_CORPUS, $corpus); 419 | write_file($OUTPUT_FILE_ANNOTATIONS, $xml); 420 | if ($OUTPUT_FILE_MODEL) { 421 | write_file($OUTPUT_FILE_MODEL, $model); 422 | } 423 | 424 | return 1; 425 | 426 | } 427 | 428 | 429 | main() 430 | and print "$0: done!\n"; 431 | 432 | 433 | -------------------------------------------------------------------------------- /sacr_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module offers a parser for the SACR ("Script d'Annotation des Chaînes de 3 | Référence") format. 4 | 5 | Synopsis 6 | -------- 7 | 8 | The parser yields the following elements: 9 | * ('text_id', ) 10 | * ('comment', , , )) 14 | * ('mention_end', None) 15 | * ('token', ) 16 | * ('sentence_change', None) 17 | 18 | Note that spaces are not yielded as token. 19 | 20 | import sacr_parser 21 | import annotable 22 | 23 | corpus = annotatble.Corpus() 24 | 25 | for fpath in fpaths: 26 | parser = sacr_parser.SacrParser( 27 | fpath=fpath, 28 | tokenization_mode=sacr_parser.WORD_TOKENIZATION, 29 | ) 30 | text = annotable.Text(id_=fpath) 31 | self.corpus.add_text(text) 32 | for item, params in parser.parse(): 33 | if item == 'text_id': 34 | text.id_ = params 35 | elif item == 'par_start': 36 | ... 37 | elif item == 'par_end': 38 | ... 39 | elif item == 'sentence_change': 40 | ... 41 | elif item == 'mention_start': 42 | ... 43 | elif item == 'token': 44 | ... 45 | elif item == 'mention_end': 46 | ... 47 | """ 48 | 49 | __version__ = "1.0.0" 50 | 51 | import re 52 | 53 | WORD_TOKENIZATION = 1 54 | CHAR_TOKENIZATION = 2 55 | 56 | 57 | def escape_regex(string): 58 | """Escape a string so it can be literally search for in a regex. 59 | 60 | Used for additional_tokens. 61 | """ 62 | return re.sub(r"([-{}\[\]().])", r"\\\1", string) 63 | 64 | 65 | class SacrParser: 66 | """Parse a file in the SACR format. 67 | 68 | Attribute 69 | --------- 70 | tokenization_mode: int 71 | The tokenization mode, use the constants: `WORD_TOKENIZATION` and 72 | `CHAR_TOKENIZATION` 73 | fpath: str 74 | Path of the file to parse. 
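    Minimal usage sketch (the path is a placeholder):

        parser = SacrParser(fpath="myfile.sacr",
                            tokenization_mode=WORD_TOKENIZATION)
        for item, params in parser.parse():
            ...  # handle 'text_id', 'par_start', 'mention_start', etc.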
75 | """ 76 | 77 | @staticmethod 78 | def get_word_regex(additional_tokens=None): 79 | """Compute the regex to match words, including additional_tokens.""" 80 | if not additional_tokens: 81 | addtional_tokens = [] 82 | additional_tokens = sorted( 83 | [escape_regex(w) for w in additional_tokens], key=lambda x: len(x) 84 | ) 85 | token_str = "[a-zßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿœα-ω0-9_]+'?|[-+±]?[.,]?[0-9]+" 86 | if additional_tokens: 87 | return re.compile( 88 | "([%d]+|%s)" % (token_str, "|".join(additional_tokens)), re.I 89 | ) 90 | else: 91 | return re.compile("(%s)" % token_str, re.I) 92 | 93 | def __init__(self, fpath, tokenization_mode=WORD_TOKENIZATION): 94 | self.tokenization_mode = tokenization_mode 95 | self.fpath = fpath 96 | 97 | def parse(self): 98 | """Parse the file and yields elements. See the module description.""" 99 | content = open(self.fpath).read() 100 | additional_tokens = [] 101 | pos = 0 102 | chains = dict() 103 | open_mention_counter = 0 104 | # patterns: 105 | additional_tokens_pattern = re.compile(r"#additional_?token:\s*(.+)\s*\n\n+") 106 | text_id_pattern = re.compile(r"#text_?id:\s*(.+)\s*\n\n*") 107 | comment_pattern = re.compile(r"(?:#(.*)\n+|\*{5,})") 108 | end_par_pattern = re.compile(r"\n\n+") 109 | space_pattern = re.compile(r"\s+") 110 | new_line_pattern = re.compile(r"\n") 111 | open_mention_pattern = re.compile(r"\{(\w+)(:| )") 112 | feature_pattern = re.compile(r'(\w+)=(?:(\w+)|"([^"]*)")(,| )') 113 | close_mention_pattern = re.compile(r"\}") 114 | sentence_end_pattern = re.compile(r'(?:\.+"?|\!|\?)') 115 | if self.tokenization_mode == WORD_TOKENIZATION: 116 | word_pattern = self.__class__.get_word_regex(additional_tokens) 117 | else: 118 | word_pattern = re.compile(r".") 119 | # eat leading blank lines 120 | m = re.compile(r"\s+").match(content, pos) 121 | if m: 122 | # print('eat leading spaces') 123 | pos += len(m.group(0)) 124 | while pos < len(content): 125 | m = additional_tokens_pattern.match(content, pos) 126 | if m: 127 | # print('add word') 128 | pos += len(m.group(0)) 129 | additional_tokens.append(m.group(1)) 130 | word_pattern = SacrParser.get_word_regex(additional_tokens) 131 | continue 132 | m = text_id_pattern.match(content, pos) 133 | if m: 134 | # print('textid') 135 | pos += len(m.group(0)) 136 | yield "text_id", m.group(1) 137 | continue 138 | m = comment_pattern.match(content, pos) 139 | if m: 140 | # print('comment', m.group(0)) 141 | pos += len(m.group(0)) 142 | comment = m.group(1).strip() 143 | if comment: 144 | yield "comment", comment 145 | continue 146 | # paragraph of text 147 | yield "par_start", None 148 | while pos < len(content): 149 | # print("%d, %d" % (pos, len(content))) 150 | # print(content[pos]) 151 | m = end_par_pattern.match(content, pos) 152 | if m: 153 | # print('end par') 154 | pos += len(m.group(0)) 155 | yield "par_end", None 156 | break 157 | m = space_pattern.match(content, pos) 158 | if m: 159 | # print('space') 160 | pos += len(m.group(0)) 161 | continue 162 | m = new_line_pattern.match(content, pos) 163 | if m: 164 | # print('newline') 165 | pos += len(m.group(0)) 166 | continue 167 | m = open_mention_pattern.match(content, pos) 168 | if m: 169 | # print('mention') 170 | pos += len(m.group(0)) 171 | open_mention_counter += 1 172 | if m.group(1) not in chains: 173 | chains[m.group(1)] = len(chains) 174 | chain_index = chains[m.group(1)] 175 | chain_name = m.group(1) 176 | features = dict() 177 | if m.group(2) == ":": 178 | while pos < len(content): 179 | m = feature_pattern.match(content, pos) 180 
| if m: 181 | key = m.group(1) 182 | value = m.group(2) if m.group(2) is not None else m.group(3) 183 | features[key] = value 184 | pos += len(m.group(0)) 185 | if m.group(4) == " ": 186 | break 187 | else: 188 | raise SyntaxError( 189 | "can't understand '%s' near %d" % (content, pos) 190 | ) 191 | yield "mention_start", (chain_index, chain_name, features) 192 | continue 193 | m = close_mention_pattern.match(content, pos) 194 | if m: 195 | # print('end mention') 196 | pos += len(m.group(0)) 197 | open_mention_counter -= 1 198 | yield "mention_end", None 199 | continue 200 | m = word_pattern.match(content, pos) 201 | if m: 202 | # print('token: %s' % m.group(0)) 203 | pos += len(m.group(0)) 204 | yield "token", m.group(0) 205 | continue 206 | if open_mention_counter == 0: 207 | m = sentence_end_pattern.match(content, pos) 208 | if m: 209 | # print('token: %s' % m.group(0)) 210 | pos += len(m.group(0)) 211 | yield "token", m.group(0) 212 | yield "sentence_change", None 213 | continue 214 | m = re.compile(r".").match(content, pos) 215 | if m: 216 | # print('token: %s' % m.group(0)) 217 | pos += len(m.group(0)) 218 | yield "token", m.group(0) 219 | continue 220 | assert False 221 | -------------------------------------------------------------------------------- /sacr_parser2.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import Generator 7 | 8 | 9 | @dataclass 10 | class Token: 11 | start: int 12 | end: int 13 | 14 | def __eq__(self, other: Token) -> bool: 15 | return self.start == other.start and self.end == other.end 16 | 17 | 18 | @dataclass 19 | class TextID(Token): 20 | text_id: str 21 | 22 | def __eq__(self, other: TextID) -> bool: 23 | return super().__eq__(other) and self.text_id == other.text_id 24 | 25 | 26 | @dataclass 27 | class Comment(Token): 28 | value: str 29 | 30 | def __eq__(self, other: Comment) -> bool: 31 | return super().__eq__(other) and self.value == other.value 32 | 33 | 34 | @dataclass 35 | class ParagraphStart(Token): 36 | ... 37 | 38 | 39 | @dataclass 40 | class ParagraphEnd(Token): 41 | ... 42 | 43 | 44 | @dataclass 45 | class MentionStart(Token): 46 | chain_index: int 47 | chain_name: str 48 | features: dict[str, str] 49 | 50 | def __eq__(self, other: MentionStart) -> bool: 51 | return ( 52 | super().__eq__(other) 53 | and self.chain_index == other.chain_index 54 | and self.chain_name == other.chain_name 55 | and self.features == other.features 56 | ) 57 | 58 | 59 | @dataclass 60 | class MentionEnd(Token): 61 | ... 62 | 63 | 64 | @dataclass 65 | class Spaces(Token): 66 | value: str 67 | 68 | def __eq__(self, other: Spaces) -> bool: 69 | return super().__eq__(other) and self.value == other.value 70 | 71 | 72 | @dataclass 73 | class NewLineInsideParagraph(Token): 74 | value: str 75 | 76 | def __eq__(self, other: NewLineInsideParagraph) -> bool: 77 | return super().__eq__(other) and self.value == other.value 78 | 79 | 80 | @dataclass 81 | class Word(Token): 82 | value: str 83 | 84 | def __eq__(self, other: Word) -> bool: 85 | return super().__eq__(other) and self.value == other.value 86 | 87 | 88 | @dataclass 89 | class SentenceChange(Token): 90 | ... 91 | 92 | 93 | def escape_regex(string: str) -> str: 94 | """Escape a string so it can be literally searched for in a regex. 95 | 96 | Used for `additional_tokens`. 
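For example (a small doctest-style illustration; the input string is invented, not taken from the repository's test data):

    >>> print(escape_regex("M. Smith-Jones (Dr.)"))
    M\. Smith\-Jones \(Dr\.\)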
97 | """ 98 | return re.sub(r"([-{}\[\]().])", r"\\\1", string) 99 | 100 | 101 | class SacrParser: 102 | """Parse a file in the SACR format.""" 103 | 104 | def __init__(self, source: str | Path): 105 | if isinstance(source, str): 106 | self.content = source 107 | else: 108 | self.content = source.read_text() 109 | 110 | @staticmethod 111 | def get_word_pattern(additional_tokens: list[str] | None = None) -> re.Pattern[str]: 112 | """Compute the regex to match words, including additional_tokens.""" 113 | if not additional_tokens: 114 | additional_tokens = [] 115 | additional_tokens = sorted( 116 | [escape_regex(w) for w in additional_tokens], key=lambda x: len(x) 117 | ) 118 | token_str = "[a-zßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿœα-ω0-9_]+'?|[-+±]?[.,]?[0-9]+" 119 | if additional_tokens: 120 | return re.compile( 121 | "(%s|%s)" % (token_str, "|".join(additional_tokens)), re.IGNORECASE 122 | ) 123 | else: 124 | return re.compile("(%s)" % token_str, re.IGNORECASE) 125 | 126 | def parse(self) -> Generator[Token, None, None]: 127 | """Parse the file and yields elements.""" 128 | content = self.content 129 | additional_tokens: list[str] = [] 130 | pos = 0 131 | chains: dict[str, int] = dict() 132 | open_mention_counter = 0 133 | 134 | # patterns 135 | additional_tokens_pattern = re.compile(r"#additional_?token:\s*(.+)\s*\n\n+") 136 | text_id_pattern = re.compile(r"#text_?id:\s*(.+)\s*\n\n*") 137 | comment_pattern = re.compile(r"(?:#(.*)(?:\n+|$)|\*{5,})") 138 | end_par_pattern = re.compile(r"\n\n+") 139 | space_pattern = re.compile(r"\s+") 140 | new_line_pattern = re.compile(r"\n") 141 | open_mention_pattern = re.compile(r"\{(\w+)(:| )") 142 | feature_pattern = re.compile(r'(\w+)=(?:(\w+)|"([^"]*)")(,| )') 143 | close_mention_pattern = re.compile(r"\}") 144 | sentence_end_pattern = re.compile(r'(?:\.+"?|\!|\?)') 145 | word_pattern = self.get_word_pattern(additional_tokens) 146 | 147 | # eat leading blank lines 148 | if m := re.compile(r"\s+").match(content, pos): 149 | pos += len(m.group(0)) 150 | 151 | while pos < len(content): 152 | if m := additional_tokens_pattern.match(content, pos): 153 | pos += len(m.group(0)) 154 | additional_tokens.append(m.group(1)) 155 | word_pattern = SacrParser.get_word_pattern(additional_tokens) 156 | continue 157 | 158 | if m := text_id_pattern.match(content, pos): 159 | length = len(m.group(0)) 160 | yield TextID(pos, pos + length, m.group(1)) 161 | pos += length 162 | continue 163 | 164 | if m := comment_pattern.match(content, pos): 165 | length = len(m.group(0)) 166 | if m.group(1): # no group 0 if ****** 167 | comment = m.group(1).strip() 168 | if comment: 169 | yield Comment(pos, pos + length, comment) 170 | pos += length 171 | continue 172 | 173 | # parsing a paragraph 174 | 175 | yield ParagraphStart(pos, pos) 176 | 177 | while pos < len(content): 178 | if m := end_par_pattern.match(content, pos): 179 | length = len(m.group(0)) 180 | yield ParagraphEnd(pos, pos + length) 181 | pos += length 182 | break 183 | 184 | if m := new_line_pattern.match(content, pos): 185 | length = len(m.group(0)) 186 | yield NewLineInsideParagraph(pos, pos + length, m.group(0)) 187 | pos += length 188 | continue 189 | 190 | if m := space_pattern.match(content, pos): 191 | length = len(m.group(0)) 192 | yield Spaces(pos, pos + length, m.group(0)) 193 | pos += length 194 | continue 195 | 196 | if m := open_mention_pattern.match(content, pos): 197 | start = pos 198 | pos += len(m.group(0)) 199 | open_mention_counter += 1 200 | 201 | if m.group(1) not in chains: 202 | chains[m.group(1)] = 
len(chains) 203 | chain_index = chains[m.group(1)] 204 | chain_name = m.group(1) 205 | 206 | features = dict() 207 | if m.group(2) == ":": 208 | while pos < len(content): 209 | if m := feature_pattern.match(content, pos): 210 | key = m.group(1) 211 | value = ( 212 | m.group(2) if m.group(2) is not None else m.group(3) 213 | ) 214 | features[key] = value 215 | pos += len(m.group(0)) 216 | if m.group(4) == " ": 217 | break 218 | else: 219 | raise SyntaxError( 220 | "can't understand '%s' near %d" % (content, pos) 221 | ) 222 | yield MentionStart( 223 | start, 224 | pos, 225 | chain_index=chain_index, 226 | chain_name=chain_name, 227 | features=features, 228 | ) 229 | continue 230 | 231 | if m := close_mention_pattern.match(content, pos): 232 | length = len(m.group(0)) 233 | yield MentionEnd(pos, pos + length) 234 | pos += length 235 | open_mention_counter -= 1 236 | continue 237 | 238 | if m := word_pattern.match(content, pos): 239 | length = len(m.group(0)) 240 | yield Word(pos, pos + length, m.group(0)) 241 | pos += length 242 | continue 243 | 244 | if open_mention_counter == 0: 245 | if m := sentence_end_pattern.match(content, pos): 246 | length = len(m.group(0)) 247 | yield Word(pos, pos + length, m.group(0)) 248 | yield SentenceChange(pos, pos + length) 249 | pos += length 250 | continue 251 | 252 | if m := re.compile(r".").match(content, pos): 253 | length = len(m.group(0)) 254 | yield Word(pos, pos + length, m.group(0)) 255 | pos += length 256 | continue 257 | assert False 258 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=E203,E501,W503 3 | exclude=.git,__pycache__,venv,.idea,color_manager.py,conll2jsonlines.py,conll2sacr.py,conll_transform.py,jsonlines2conll.py,jsonlines2text.py,sacr2conll.py,sacr_parser.py,text2jsonlines.py,standoff2inline.py -------------------------------------------------------------------------------- /standoff2inline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converting standoff annotations to inline annotations. 3 | 4 | For example, in the sentence: 5 | 6 | The little cat drinks milk. 7 | 8 | you know that the third word, between the 12th and 14th characters, is a noun. 9 | You may want to surround it with some tags, like `<noun>` and `</noun>`: 10 | 11 | The little <noun>cat</noun> drinks milk. 12 | 13 | This module offers classes and functions to: 14 | * add inline annotations, like xml annotations, counting in characters or 15 | tokens, 16 | * highlight some chunks of text, for example with styled `<span>` tags, 17 | * remove parts without annotations and replace them with something like 18 | `[...]`. 19 | 20 | A quick preview: 21 | 22 | ```python 23 | from standoff2inline import Standoff2Inline 24 | 25 | string = "The little cat drinks milk." 26 | inliner = Standoff2Inline() 27 | inliner.add((0, ""), (26, "")) 28 | inliner.add((0, ""), (13, "")) 29 | inliner.add((11, ""), (13, "")) 30 | inliner.add((22, ""), (25, "")) 31 | inliner.add((0, ""), (2, "")) 32 | inliner.apply(string) 33 | ``` 34 | 35 | which gives: 36 | 37 | ``` 38 | The little cat drinks 39 | milk. 40 | ``` 41 | 42 | Please read the user guide and play with the module in the Jupyter notebook. 43 | 44 | ************************************************************************ 45 | 46 | (c) Bruno Oberle 2019 - boberle.com 47 | 48 | Distributed under the terms of the Mozilla Public License 2. See the LICENSE 49 | file.
50 | 51 | Version 1.0.0 52 | 53 | """ 54 | 55 | 56 | 57 | class Standoff2Inline: 58 | """Conversion from standoff annotation to inline annotations. 59 | 60 | Constructor: 61 | * `kind` (opt): one of `xml|sacr`: predefined annotation scheme. 62 | * `end_is_stop`: the "end" position is the position of the next token or 63 | character, not the last. 64 | """ 65 | 66 | 67 | def __init__(self, kind=None, end_is_stop=False): 68 | self.kind = kind 69 | self._elements = [] 70 | self._sorted = False 71 | self.end_is_stop = end_is_stop 72 | 73 | 74 | 75 | def add(self, start, end=None, stop=None): 76 | """Add an annotation. 77 | 78 | Annotations are given as a tuple `(position, string)`, where position 79 | may be in characters or tokens. 80 | 81 | The `start` annotation is required, `end|stop` annotation is optional. 82 | 83 | You give either an `end` or `stop` annotation. `stop` works like 84 | Python's `range` function: the annotation is introduced *before* the 85 | next element. 86 | """ 87 | 88 | if stop is not None: 89 | if isinstance(stop, int): 90 | stop = (stop, None) 91 | stop, value = stop 92 | end = (stop-1, value) 93 | if isinstance(end, int): 94 | end = (end, None) 95 | if end is None: 96 | end = (-1, None) 97 | self._elements.append((start, end)) 98 | self._sorted = False 99 | 100 | 101 | 102 | def _iter_elements(self, elements): 103 | if self.kind is None: 104 | yield from self._get_strings(elements) 105 | elif self.kind == 'xml': 106 | yield from self._get_xml_strings(elements) 107 | elif self.kind == 'sacr': 108 | yield from self._get_sacr_strings(elements) 109 | else: 110 | assert False, self.kind 111 | 112 | 113 | 114 | def _get_xml_strings(self, elements): 115 | for (start, start_val), (end, end_val) in elements: 116 | if isinstance(start_val, str): 117 | tagname, dic = start_val, dict() 118 | else: 119 | tagname, dic = start_val 120 | attrs = " ".join('%s="%s"' % (k, v) for k, v in dic.items()) 121 | if attrs: 122 | attrs = " " + attrs 123 | start_val = "<%s%s>" % (tagname, attrs) 124 | end_val = "" % tagname 125 | yield (start, start_val), (end, end_val) 126 | 127 | 128 | 129 | def _get_sacr_strings(self, elements): 130 | for (start, start_val), (end, end_val) in elements: 131 | tagname, dic = start_val 132 | attrs = " ".join('%s="%s"' % (k, v) for k, v in dic.items()) 133 | if attrs: 134 | attrs = ":" + attrs 135 | start_val = "{%s%s " % (tagname, attrs) 136 | end_val = "}" 137 | yield (start, start_val), (end, end_val) 138 | 139 | 140 | 141 | def _get_strings(self, elements): 142 | for (start, start_val), (end, end_val) in elements: 143 | if end_val is None: 144 | end_val = "" 145 | yield (start, start_val), (end, end_val) 146 | 147 | 148 | 149 | def _tokens2string(self, tokens): 150 | """Convert a list of tokens into a string and compute new positions. 151 | 152 | Return a tuple `(string, elements)`, where `elements` is like 153 | `self.elements`, but with position in the string rather than in the 154 | token list. 155 | """ 156 | 157 | string = "" 158 | t2s = [] 159 | for i, token in enumerate(tokens): 160 | start = len(string) 161 | t2s.append((start, start+len(token)-1)) 162 | string += token + " " 163 | elements = [] 164 | for (start, start_val), (end, end_val) in self._elements: 165 | start = t2s[start][0] 166 | end = t2s[end][1] 167 | elements.append(((start, start_val), (end, end_val))) 168 | return string, elements 169 | 170 | 171 | 172 | def apply(self, string=None, tokens=None): 173 | """Insert the annotations and return a string with inline annotations. 
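A minimal sketch of typical usage (the tag strings are illustrative, not imposed by the API; the first call gives an `end` position, the second a `stop` position):

    >>> inliner = Standoff2Inline()
    >>> inliner.add((0, "<name>"), (4, "</name>"))
    >>> inliner.add((6, "<verb>"), stop=(11, "</verb>"))
    >>> inliner.apply("Julia spoke.")
    '<name>Julia</name> <verb>spoke</verb>.'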
174 | 175 | Specify either a `string` or a list of `tokens`. 176 | """ 177 | 178 | return "".join( 179 | x[1] for x in self.iter_result(string=string, tokens=tokens)) 180 | 181 | 182 | 183 | def iter_result(self, string=None, tokens=None, return_tokens=False): 184 | """Iterate over `prefix|string|suffix`. 185 | 186 | Each iteration yields a tuple `(kind, string)` where `kind` is one of 187 | `prefix|string|suffix` and `string` is either the annotation value or 188 | a chunk of text. 189 | """ 190 | 191 | assert (string or tokens) and not (string and tokens) 192 | def yield_(k, v): 193 | if len(v): 194 | yield k, v 195 | if not self._sorted: 196 | self._elements.sort(key=lambda e: e[1][0], reverse=True) 197 | self._elements.sort(key=lambda e: e[0][0]) 198 | self._sorted = True 199 | if tokens and not return_tokens: 200 | string, elements = self._tokens2string(tokens) 201 | else: 202 | elements = self._elements 203 | res = "" 204 | pos = 0 205 | filo = [] 206 | move_one = 0 if self.end_is_stop else 1 207 | for (start, start_val), end_data in self._iter_elements(elements): 208 | while filo and filo[-1][0] < start: 209 | end, end_val = filo.pop() 210 | if tokens and return_tokens: 211 | yield from yield_('string', tokens[pos:end+move_one]) 212 | else: 213 | yield from yield_('string', string[pos:end+move_one]) 214 | pos = end + move_one 215 | yield 'suffix', end_val 216 | if tokens and return_tokens: 217 | yield from yield_('string', tokens[pos:start]) 218 | else: 219 | yield from yield_('string', string[pos:start]) 220 | yield 'prefix', start_val 221 | pos = start 222 | if end_data[0] != -1: 223 | filo.append(end_data) 224 | while filo: 225 | end, end_val = filo.pop() 226 | if tokens and return_tokens: 227 | yield from yield_('string', tokens[pos:end+move_one]) 228 | else: 229 | yield from yield_('string', string[pos:end+move_one]) 230 | pos = end + move_one 231 | yield from yield_('suffix', end_val) 232 | if tokens and return_tokens: 233 | yield from yield_('string', tokens[pos:]) 234 | else: 235 | yield from yield_('string', string[pos:]) 236 | return res 237 | 238 | 239 | 240 | class Highlighter: 241 | 242 | 243 | def __init__(self, marks=None, prefix=None, suffix=None): 244 | self.marks = marks if marks is not None else list() 245 | self.prefix = prefix 246 | self.suffix = suffix 247 | 248 | 249 | def _get_affix(self, current, value): 250 | if current is None: 251 | return value 252 | if isinstance(current, list): 253 | current.append(value) 254 | return current 255 | return [current, value] 256 | 257 | 258 | def set_style(self, underline=False, bold=False, italic=False, 259 | color=None): 260 | res = "" 261 | if underline: 262 | res += "text-decoration: underline; " 263 | if bold: 264 | res += "font-weight: bold; " 265 | if italic: 266 | res += "font-style: italic; " 267 | if color is not None: 268 | res += "color: %s; " % color 269 | if res: 270 | self.prefix = '<span style="%s">%s' % ( 271 | res, self.prefix if self.prefix else "") 272 | self.suffix = '%s</span>' % (self.suffix if self.suffix else "") 273 | 274 | 275 | def add_mark(self, start, end, prefix=None, suffix=None): 276 | self.marks.append((start, end)) 277 | if prefix is not None: 278 | self.prefix = self._get_affix(self.prefix, prefix) 279 | if suffix is not None: 280 | self.suffix = self._get_affix(self.suffix, suffix) 281 | 282 | 283 | def add_marks(self, marks): 284 | for start, end in marks: 285 | self.add_mark(start, end) 286 | 287 | 288 | 289 | def highlight_characters(text, *highlighters, end_is_stop=False): 290 | inliner =
Standoff2Inline(end_is_stop=end_is_stop) 291 | for hl in highlighters: 292 | for i in range(len(hl.marks)): 293 | start, end = hl.marks[i] 294 | prefix = hl.prefix[i] if isinstance(hl.prefix, list) else hl.prefix 295 | suffix = hl.suffix[i] if isinstance(hl.suffix, list) else hl.suffix 296 | inliner.add( 297 | (start, prefix), 298 | (end, suffix), 299 | ) 300 | return inliner.apply(text) 301 | 302 | 303 | 304 | def highlight(text, *highlighters, margin=0, max_gap=0, ellipsis=" [...] ", 305 | char=False, end_is_stop=False): 306 | inliner = Standoff2Inline(end_is_stop=end_is_stop) 307 | for hl in highlighters: 308 | for i in range(len(hl.marks)): 309 | start, end = hl.marks[i] 310 | prefix = hl.prefix[i] if isinstance(hl.prefix, list) else hl.prefix 311 | suffix = hl.suffix[i] if isinstance(hl.suffix, list) else hl.suffix 312 | inliner.add( 313 | (start, prefix), 314 | (end, suffix), 315 | ) 316 | #return inliner.apply(tokens=text) 317 | chunks = [ 318 | [a, b] for a, b in inliner.iter_result( 319 | string=text if char else None, 320 | tokens=text if not char else None, 321 | return_tokens=True 322 | ) 323 | ] 324 | if not char: 325 | ellipsis = ellipsis.strip() 326 | if margin and chunks[0][0] == 'string' and len(chunks[0][1]) > margin: 327 | chunks[0][1] = [ellipsis] + chunks[0][1][-1*margin:] 328 | if margin and chunks[-1][0] == 'string' and len(chunks[-1][1]) > margin: 329 | chunks[-1][1] = chunks[-1][1][:margin] + [ellipsis] 330 | level = 1 if chunks[0][0] == 'prefix' else 0 331 | if max_gap: 332 | for i in range(1, len(chunks)-1): 333 | kind, string = chunks[i] 334 | if kind == 'prefix': 335 | level += 1 336 | chunks[i][1] = chunks[i][1] 337 | if kind == 'suffix': 338 | level -= 1 339 | chunks[i][1] = chunks[i][1] 340 | if kind == 'string' and level == 0: 341 | if len(string) > max_gap: 342 | chunks[i][1] = chunks[i][1][:margin] \ 343 | + [ellipsis] + chunks[i][1][-1*margin:] 344 | res = "" 345 | need_space = False 346 | for kind, chunk in chunks: 347 | if kind == "string": 348 | if need_space and not char: 349 | res += " " 350 | res += chunk if char else " ".join(chunk) 351 | need_space = True 352 | else: 353 | if kind == "prefix" and need_space and not char: 354 | res += " " 355 | need_space = False 356 | res += chunk 357 | return res.rstrip() 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /testing/aesop.sacr: -------------------------------------------------------------------------------- 1 | #title:The Peasant and the Eagle (Aesop), translated by G. F. Townsend (1887) 2 | 3 | #textid:aesop 4 | 5 | #textmetadata:work=literature 6 | 7 | {Peasant:function="s subject",head="1",partofspeech="i noun with indefinite article" A Peasant} found {Eagle:function="o object",head="1",partofspeech="i noun with indefinite article" an Eagle captured in {M3:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a trap}}, and much admiring {Eagle:function="o object",head="1",partofspeech="d noun with definite article" the bird}, set {Peasant:function="o object",head="0",partofspeech="s personnal pronoun" him} free. 
{Eagle:function="s subject",head="1",partofspeech="d noun with definite article" The Eagle} did not prove ungrateful to {Peasant:function="o object",head="1",partofspeech="n noun with determiner" {Eagle:function="t other",head="0",partofspeech="e possessive adjective" his} deliverer}, for seeing {Peasant:function="o object",head="1",partofspeech="d noun with definite article" the Peasant sitting under {Wall:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a wall {Wall:function="s subject",head="0",partofspeech="r relative pronoun" which} was not safe}}, {Eagle:function="s subject",head="0",partofspeech="s personnal pronoun" he} flew toward {Peasant:function="a adverbial",head="0",partofspeech="s personnal pronoun" him} and with {M14:function="a adverbial",head="1",partofspeech="n noun with determiner" {Eagle:function="t other",head="0",partofspeech="e possessive adjective" his} talons} snatched {Bundle:function="o object",head="1",partofspeech="i noun with indefinite article" a bundle} from {M17:function="a adverbial",head="1",partofspeech="t noun without determiner" {Peasant:function="t other",head="0",partofspeech="e possessive adjective" his} head}. When {Peasant:function="s subject",head="1",partofspeech="d noun with definite article" the Peasant} rose in pursuit, {Eagle:function="s subject",head="1",partofspeech="d noun with definite article" the Eagle} let {Bundle:function="o object",head="1",partofspeech="d noun with definite article" the bundle} fall again. Taking {Bundle:function="o object",head="0",partofspeech="s personnal pronoun" it} up, {Peasant:function="s subject",head="1",partofspeech="d noun with definite article" the man} returned to {M24:function="a adverbial",head="2",partofspeech="d noun with definite article" the same place}, to find that {Wall:function="s subject",head="1",partofspeech="d noun with definite article" the wall under {Wall:function="a adverbial",head="0",partofspeech="r relative pronoun" which} {Peasant:function="s subject",head="0",partofspeech="s personnal pronoun" he} had been sitting} had fallen to pieces; and {Peasant:function="s subject",head="0",partofspeech="s personnal pronoun" he} marveled at {M29:function="o object",head="1",partofspeech="d noun with definite article" the service} rendered {Peasant:function="o object",head="0",partofspeech="s personnal pronoun" him} by {Eagle:function="a adverbial",head="1",partofspeech="d noun with definite article" the Eagle}. 
8 | 9 | 10 | 11 | #COLOR:Peasant=hsl(25, 100%, 80%) 12 | #COLOR:Eagle=hsl(0, 100%, 80%) 13 | #COLOR:Wall=hsl(50, 100%, 80%) 14 | #COLOR:Bundle=hsl(75, 100%, 80%) 15 | 16 | #TOKENIZATION-TYPE:1 17 | 18 | -------------------------------------------------------------------------------- /testing/caesar.sacr: -------------------------------------------------------------------------------- 1 | #textmetadata:work=politics 2 | 3 | #textid:caesar 4 | 5 | {Caesar:function="s subject",head="0",partofspeech="a name" Gaius Julius Caesar} ({M2:function="t other",head="0",partofspeech="t noun without determiner" 12 or 13 July 100 BC} – {M3:function="t other",head="0",partofspeech="t noun without determiner" 15 March 44 BC}), known by {M6:function="o object",head="1",partofspeech="n noun with determiner" {M5:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} nomen} and {M7:function="o object",head="0",partofspeech="n noun with determiner" cognomen} Julius Caesar}, was {Caesar:function="o object",head="2",partofspeech="i noun with indefinite article" a Roman politician, military general, and historian {Caesar:function="s subject",head="0",partofspeech="r relative pronoun" who} played {M10:function="o object",head="2",partofspeech="i noun with indefinite article" a critical role} in {M12:function="a adverbial",head="1",partofspeech="d noun with definite article" the events {M12:function="s subject",head="0",partofspeech="r relative pronoun" that} led to {M11:function="o object",head="1",partofspeech="d noun with definite article" the demise of {M13:function="m noun modifier",head="2",partofspeech="d noun with definite article" the Roman Republic}} and {M14:function="o object",head="1",partofspeech="d noun with definite article" the rise of {M15:function="m noun modifier",head="2",partofspeech="d noun with definite article" the Roman Empire}}}}. {Caesar:function="s subject",head="0",partofspeech="s personnal pronoun" He} also wrote {M17:function="o object",head="1",partofspeech="t noun without determiner" Latin prose}. 6 | 7 | In {M1:function="a adverbial",head="0",partofspeech="t noun without determiner" 60 BC}, {M23:function="s subject",head="0",partofspeech="a name" {Caesar:function="s subject",head="0",partofspeech="a name" Caesar}, {Crassus:function="s subject",head="0",partofspeech="a name" Crassus} and {Pompey:function="s subject",head="0",partofspeech="a name" Pompey}} formed {M21:function="o object",head="1",partofspeech="d noun with definite article" the First Triumvirate, a political alliance {M21:function="s subject",head="0",partofspeech="r relative pronoun" that} dominated {M20:function="o object",head="1",partofspeech="t noun without determiner" Roman politics} for {M22:function="a adverbial",head="1",partofspeech="t noun without determiner" several years}}. 
{M23:function="s subject",head="0",partofspeech="s personnal pronoun" Their} attempts to amass power as {M8:function="t other",head="0",partofspeech="a name" Populares} were opposed by {M25:function="o object",head="1",partofspeech="a name" the Optimates} within {TheSenate:function="a adverbial",head="2",partofspeech="a name" the Roman Senate}, among {M25:function="a adverbial",head="0",partofspeech="s personnal pronoun" them} {M19:function="a adverbial",head="0",partofspeech="a name" Cato the Younger} with {M26:function="a adverbial",head="2",partofspeech="d noun with definite article" the frequent support of {M27:function="m noun modifier",head="0",partofspeech="a name" Cicero}}. {Caesar:function="s subject",head="0",partofspeech="a name" Caesar} rose to become one of {M28:function="o object",head="3",partofspeech="d noun with definite article" the most powerful politicians in {M29:function="a adverbial",head="2",partofspeech="d noun with definite article" the Roman Republic}} through {M40:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a number of {M31:function="m noun modifier",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="s personnal pronoun" his} accomplishments}}, notably {M33:function="m noun modifier",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="s personnal pronoun" his} victories in {M24:function="a adverbial",head="2",partofspeech="d noun with definite article" the Gallic Wars}, completed by {M35:function="a adverbial",head="0",partofspeech="t noun without determiner" 51 BC}}. During this time, {Caesar:function="s subject",head="0",partofspeech="a name" Caesar} became {Caesar:function="o object",head="3",partofspeech="d noun with definite article" the first Roman general} to cross both {M38:function="o object",head="2",partofspeech="d noun with definite article" {M37:function="o object",head="2",partofspeech="d noun with definite article" the English Channel} and {TheRhine:function="o object",head="2",partofspeech="d noun with definite article" the Rhine River}}, when {Caesar:function="s subject",head="0",partofspeech="s personnal pronoun" he} built {M34:function="o object",head="1",partofspeech="i noun with indefinite article" a bridge across {TheRhine:function="a adverbial",head="1",partofspeech="d noun with definite article" the Rhine}} and crossed {M37:function="o object",head="1",partofspeech="d noun with definite article" the Channel} to invade {Britain:function="o object",head="0",partofspeech="a name" Britain}. {M39:function="s subject",head="2",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="a name" Caesar'}s wars} extended {M42:function="o object",head="2",partofspeech="n noun with determiner" {Rome:function="m noun modifier",head="0",partofspeech="a name" Rome'}s territory} to {Britain:function="a adverbial",head="0",partofspeech="a name" Britain} and past {M50:function="a adverbial",head="0",partofspeech="a name" Gaul}. 
{M40:function="s subject",head="1",partofspeech="n noun with determiner" These achievements} granted {Caesar:function="o object",head="0",partofspeech="s personnal pronoun" him} unmatched {M30:function="o object",head="1",partofspeech="t noun without determiner" military power} and threatened to eclipse {M18:function="o object",head="1",partofspeech="d noun with definite article" the standing of {Pompey:function="m noun modifier",head="0",partofspeech="a name" Pompey, {Pompey:function="s subject",head="0",partofspeech="r relative pronoun" who} had realigned {Pompey:function="o object",head="0",partofspeech="s personnal pronoun" himself} with {TheSenate:function="o object",head="1",partofspeech="d noun with definite article" the Senate} after {M47:function="a adverbial",head="1",partofspeech="d noun with definite article" the death of {Crassus:function="m noun modifier",head="0",partofspeech="a name" Crassus}} in {M9:function="a adverbial",head="0",partofspeech="t noun without determiner" 53 BC}}}. With {M24:function="a adverbial",head="2",partofspeech="d noun with definite article" the Gallic Wars} concluded, {TheSenate:function="s subject",head="1",partofspeech="d noun with definite article" the Senate} ordered {Caesar:function="o object",head="0",partofspeech="a name" Caesar} to step down from {M32:function="o object",head="2",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} military command} and return to {Rome:function="o object",head="0",partofspeech="a name" Rome}. Leaving {M41:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} command} in {M50:function="a adverbial",head="0",partofspeech="a name" Gaul} meant losing {M44:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} immunity from being charged as a criminal for waging {M51:function="o object",head="1",partofspeech="t noun without determiner" unsanctioned wars}}. As a result, {Caesar:function="s subject",head="0",partofspeech="a name" Caesar} found himself with no other options but to cross {M52:function="o object",head="1",partofspeech="d noun with definite article" the Rubicon} with {M53:function="a adverbial",head="3",partofspeech="d noun with definite article" the 13th Legion}, leaving {M54:function="o object",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} province} and illegally entering {M55:function="o object",head="1",partofspeech="a name" Roman Italy} under arms. 
{M56:function="s subject",head="0",partofspeech="s personnal pronoun" This} began {M59:function="o object",head="3",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="a name" Caesar'}s civil war}, and {M58:function="s subject",head="1",partofspeech="n noun with determiner" {Caesar:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} victory in {M59:function="a adverbial",head="1",partofspeech="d noun with definite article" the war}} put {Caesar:function="o object",head="0",partofspeech="s personnal pronoun" him} in {M57:function="a adverbial",head="2",partofspeech="i noun with indefinite article" an unrivaled position of {M60:function="m noun modifier",head="0",partofspeech="t noun without determiner" {M61:function="m noun modifier",head="0",partofspeech="t noun without determiner" power} and {M62:function="m noun modifier",head="0",partofspeech="t noun without determiner" influence}}}. 8 | 9 | 10 | 11 | #COLOR:Caesar=hsl(0, 100%, 80%) 12 | #COLOR:M12=hsl(25, 100%, 80%) 13 | #COLOR:Crassus=hsl(275, 100%, 80%) 14 | #COLOR:Pompey=hsl(225, 100%, 80%) 15 | #COLOR:M23=hsl(75, 100%, 80%) 16 | #COLOR:M21=hsl(50, 100%, 80%) 17 | #COLOR:M25=hsl(100, 100%, 80%) 18 | #COLOR:TheSenate=hsl(250, 100%, 80%) 19 | #COLOR:M40=hsl(200, 100%, 80%) 20 | #COLOR:M24=hsl(300, 100%, 80%) 21 | #COLOR:M37=hsl(150, 100%, 80%) 22 | #COLOR:TheRhine=hsl(125, 100%, 80%) 23 | #COLOR:Britain=hsl(175, 100%, 80%) 24 | #COLOR:Rome=hsl(325, 100%, 80%) 25 | #COLOR:M50=hsl(350, 100%, 80%) 26 | #COLOR:M59=hsl(0, 100%, 70%) 27 | 28 | #TOKENIZATION-TYPE:1 29 | 30 | -------------------------------------------------------------------------------- /testing/cicero.sacr: -------------------------------------------------------------------------------- 1 | #textmetadata:work=politics 2 | 3 | #textid:cicero 4 | 5 | {Cicero:function="s subject",head="0",partofspeech="a name" Marcus Tullius Cicero}({M2:function="t other",head="0",partofspeech="t noun without determiner" 106 BC} – {M3:function="t other",head="0",partofspeech="t noun without determiner" 7 December 43 BC}) was {Cicero:function="o object",head="2",partofspeech="i noun with indefinite article" a Roman {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" statesman}, {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" orator}, {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" lawyer} and {Cicero:function="o object",head="0",partofspeech="i noun with indefinite article" philosopher}, {Cicero:function="s subject",head="0",partofspeech="r relative pronoun" who} served as {Cicero:function="o object",head="0",partofspeech="t noun without determiner" consul} in {M10:function="a adverbial",head="1",partofspeech="d noun with definite article" the year 63 BC}}. 
{Cicero:function="s subject",head="0",partofspeech="s personnal pronoun" He} came from {M1:function="o object",head="3",partofspeech="i noun with indefinite article" a wealthy municipal family of {M5:function="m noun modifier",head="3",partofspeech="d noun with definite article" the Roman equestrian order}}, and is considered {Cicero:function="o object",head="0",partofspeech="n noun with determiner" one of {M7:function="o object",head="4",partofspeech="n noun with determiner" {M11:function="o object",head="3",partofspeech="n noun with determiner" {M9:function="m noun modifier",head="0",partofspeech="a name" Rome'}s greatest orators} and {M8:function="o object",head="1",partofspeech="n noun with determiner" prose stylists}}}. 6 | 7 | {Cicero:function="s subject",head="0",partofspeech="s personnal pronoun" His} influence on {Latin:function="o object",head="2",partofspeech="d noun with definite article" the Latin language} was so immense that {M14:function="s subject",head="2",partofspeech="d noun with definite article" the subsequent history of {M15:function="m noun modifier",head="0",partofspeech="t noun without determiner" prose}}, not only in {Latin:function="a adverbial",head="0",partofspeech="t noun without determiner" Latin} but in {M17:function="a adverbial",head="1",partofspeech="t noun without determiner" European languages} up to {M18:function="a adverbial",head="3",partofspeech="d noun with definite article" the 19th century}, was said to be either {M19:function="o object",head="1",partofspeech="i noun with indefinite article" {M20:function="o object",head="1",partofspeech="i noun with indefinite article" a reaction against} or {M21:function="o object",head="1",partofspeech="i noun with indefinite article" a return to {M23:function="o object",head="1",partofspeech="n noun with determiner" {Cicero:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} style}}}. {Cicero:function="s subject",head="0",partofspeech="a name" Cicero} introduced the {M12:function="o object",head="0",partofspeech="a name" Romans} to {M22:function="o object",head="2",partofspeech="d noun with definite article" the chief schools of {M32:function="m noun modifier",head="1",partofspeech="t noun without determiner" Greek philosophy}} and created {M4:function="o object",head="3",partofspeech="i noun with indefinite article" a Latin philosophical vocabulary} (with {M31:function="a adverbial",head="0",partofspeech="i noun with indefinite article" {M25:function="a adverbial",head="0",partofspeech="t noun without determiner" neologisms} such as {M26:function="a adverbial",head="0",partofspeech="t noun without determiner" evidentia}, {M27:function="a adverbial",head="0",partofspeech="t noun without determiner" humanitas}, {M28:function="a adverbial",head="0",partofspeech="t noun without determiner" qualitas}, {M29:function="a adverbial",head="0",partofspeech="t noun without determiner" quantitas}, and {M30:function="a adverbial",head="0",partofspeech="t noun without determiner" essentia}}) distinguishing {Cicero:function="o object",head="0",partofspeech="s personnal pronoun" himself} as {Cicero:function="o object",head="1",partofspeech="i noun with indefinite article" a {Cicero:function="o object",head="0",partofspeech="t noun without determiner" translator} and {Cicero:function="o object",head="0",partofspeech="t noun without determiner" philosopher}}. 
8 | 9 | 10 | 11 | #COLOR:Cicero=hsl(0, 100%, 80%) 12 | #COLOR:Latin=hsl(25, 100%, 80%) 13 | 14 | #TOKENIZATION-TYPE:1 15 | 16 | -------------------------------------------------------------------------------- /testing/lucian_speakers.sacr: -------------------------------------------------------------------------------- 1 | #title: Lucian, Dialogues of the Dead, 4: Hermes and Charon 2 | 3 | #speaker: Hermes 4 | Ferryman, what do you say to settling up accounts? It will prevent any 5 | unpleasantness later on. 6 | 7 | #speaker: Charon 8 | Very good. It does save trouble to get these things straight. 9 | 10 | #speaker: Hermes 11 | One anchor, to your order, five shillings. 12 | 13 | #speaker: Charon 14 | That is a lot of money. 15 | 16 | #speaker: Hermes 17 | So help me Pluto, it is what I had to pay. One rowlock-strap, fourpence. 18 | 19 | #speaker: Charon 20 | Five and four; put that down. 21 | 22 | #speaker: Hermes 23 | Then there was a needle, for mending the sail; tenpence. 24 | 25 | #speaker: Charon 26 | Down with it. 27 | 28 | -------------------------------------------------------------------------------- /testing/pliny.sacr: -------------------------------------------------------------------------------- 1 | #textmetadata:work=science 2 | 3 | #textid:pliny 4 | 5 | {PlinyTheElder:function="s subject",head="0",partofspeech="a name" Pliny the Elder} (AD {PlinyTheElder:function="t other",head="0",partofspeech="t noun without determiner" 23}–{PlinyTheElder:function="t other",head="0",partofspeech="t noun without determiner" 79}) was {PlinyTheElder:function="o object",head="2",partofspeech="i noun with indefinite article" {PlinyTheElder:function="o object",head="2",partofspeech="i noun with indefinite article" a Roman author}, {PlinyTheElder:function="o object",head="0",partofspeech="i noun with indefinite article" naturalist} and {PlinyTheElder:function="o object",head="1",partofspeech="i noun with indefinite article" natural philosopher}, {PlinyTheElder:function="o object",head="4",partofspeech="i noun with indefinite article" a naval and army commander of {PlinyTheElder:function="m noun modifier",head="3",partofspeech="d noun with definite article" the early Roman Empire}}, and {PlinyTheElder:function="o object",head="0",partofspeech="t noun without determiner" friend of {PlinyTheElder:function="m noun modifier",head="1",partofspeech="a name" emperor Vespasian}}}. 6 | 7 | Spending most of {M12:function="a adverbial",head="2",partofspeech="n noun with determiner" {PlinyTheElder:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} spare time} studying, writing, and investigating {M2:function="o object",head="3",partofspeech="t noun without determiner" natural and geographic phenomena} in {M3:function="a adverbial",head="1",partofspeech="d noun with definite article" the field}, {PlinyTheElder:function="s subject",head="0",partofspeech="a name" Pliny} wrote {M7:function="o object",head="1",partofspeech="d noun with definite article" the encyclopedic Naturalis Historia (Natural History), {M7:function="s subject",head="0",partofspeech="r relative pronoun" which} became {M6:function="o object",head="2",partofspeech="i noun with indefinite article" an editorial model for {M5:function="m noun modifier",head="0",partofspeech="t noun without determiner" encyclopedias}}}. 
{PlinyTheYounger:function="s subject",head="1",partofspeech="n noun with determiner" {PlinyTheElder:function="m noun modifier",head="0",partofspeech="e possessive adjective" His} nephew}, {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="a name" Pliny the Younger}, wrote of {PlinyTheElder:function="o object",head="0",partofspeech="s personnal pronoun" him} in {M1:function="a adverbial",head="1",partofspeech="i noun with indefinite article" a letter to {Tacitus:function="a adverbial",head="2",partofspeech="d noun with definite article" the historian Tacitus}}: For {M10:function="a adverbial",head="1",partofspeech="n noun with determiner" {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="e possessive adjective" my} part} {PlinyTheYounger:function="s subject",head="0",partofspeech="s personnal pronoun" I} deem {M14:function="o object",head="0",partofspeech="r relative pronoun" those blessed to {M14:function="o object",head="0",partofspeech="r relative pronoun" whom}, by favour of {M13:function="m noun modifier",head="1",partofspeech="d noun with definite article" the gods}, it has been granted either to do {M15:function="o object",head="0",partofspeech="r relative pronoun" what is worth writing of}, or to write {M11:function="o object",head="0",partofspeech="r relative pronoun" what is worth reading}}; above measure blessed {M17:function="o object",head="0",partofspeech="r relative pronoun" those on {M17:function="o object",head="0",partofspeech="r relative pronoun" whom} both gifts have been conferred}. In {M16:function="m noun modifier",head="2",partofspeech="d noun with definite article" the latter number} will be {PlinyTheElder:function="o object",head="1",partofspeech="n noun with determiner" {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="e possessive adjective" my} uncle}, by virtue of {M18:function="m noun modifier",head="1",partofspeech="n noun with determiner" {M4:function="",head="",partofspeech="" {PlinyTheElder:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} own} and of {M9:function="m noun modifier",head="1",partofspeech="n noun with determiner" {Tacitus:function="m noun modifier",head="0",partofspeech="e possessive adjective" your} compositions}}. 8 | 9 | {PlinyTheYounger:function="s subject",head="0",partofspeech="a name" Pliny the Younger} refers to {M20:function="o object",head="2",partofspeech="n noun with determiner" {Tacitus:function="m noun modifier",head="0",partofspeech="a name" Tacitus}’s reliance} upon {M21:function="a adverbial",head="3",partofspeech="n noun with determiner" {PlinyTheElder:function="m noun modifier",head="1",partofspeech="n noun with determiner" {PlinyTheYounger:function="m noun modifier",head="0",partofspeech="e possessive adjective" his} uncle'}s book}, {M22:function="m noun modifier",head="1",partofspeech="d noun with definite article" the History of the German Wars}. 10 | 11 | 12 | 13 | #COLOR:PlinyTheElder=hsl(0, 100%, 80%) 14 | #COLOR:M7=hsl(25, 100%, 80%) 15 | #COLOR:PlinyTheYounger=hsl(50, 100%, 80%) 16 | #COLOR:Tacitus=hsl(125, 100%, 80%) 17 | #COLOR:M14=hsl(75, 100%, 80%) 18 | #COLOR:M17=hsl(100, 100%, 80%) 19 | 20 | #TOKENIZATION-TYPE:1 21 | 22 | -------------------------------------------------------------------------------- /testing/simple.sacr: -------------------------------------------------------------------------------- 1 | {one {one His} head} hurts. 
2 | -------------------------------------------------------------------------------- /testing/testing_sacr2conll.conll: -------------------------------------------------------------------------------- 1 | #begin document (aesop.sacr); part 000 2 | 0 A (0 3 | 1 Peasant 0) 4 | 2 found - 5 | 3 an (1 6 | 4 Eagle - 7 | 5 captured - 8 | 6 in - 9 | 7 a (2 10 | 8 trap 2)_1) 11 | 9 , - 12 | 10 and - 13 | 11 much - 14 | 12 admiring - 15 | 13 the (1 16 | 14 bird 1) 17 | 15 , - 18 | 16 set - 19 | 17 him (0) 20 | 18 free - 21 | 19 . - 22 | 23 | 0 The (1 24 | 1 Eagle 1) 25 | 2 did - 26 | 3 not - 27 | 4 prove - 28 | 5 ungrateful - 29 | 6 to - 30 | 7 his (0_(1) 31 | 8 deliverer 0) 32 | 9 , - 33 | 10 for - 34 | 11 seeing - 35 | 12 the (0 36 | 13 Peasant - 37 | 14 sitting - 38 | 15 under - 39 | 16 a (3 40 | 17 wall - 41 | 18 which (3) 42 | 19 was - 43 | 20 not - 44 | 21 safe 3)_0) 45 | 22 , - 46 | 23 he (1) 47 | 24 flew - 48 | 25 toward - 49 | 26 him (0) 50 | 27 and - 51 | 28 with - 52 | 29 his (4_(1) 53 | 30 talons 4) 54 | 31 snatched - 55 | 32 a (5 56 | 33 bundle 5) 57 | 34 from - 58 | 35 his (6_(0) 59 | 36 head 6) 60 | 37 . - 61 | 62 | 0 When - 63 | 1 the (0 64 | 2 Peasant 0) 65 | 3 rose - 66 | 4 in - 67 | 5 pursuit - 68 | 6 , - 69 | 7 the (1 70 | 8 Eagle 1) 71 | 9 let - 72 | 10 the (5 73 | 11 bundle 5) 74 | 12 fall - 75 | 13 again - 76 | 14 . - 77 | 78 | 0 Taking - 79 | 1 it (5) 80 | 2 up - 81 | 3 , - 82 | 4 the (0 83 | 5 man 0) 84 | 6 returned - 85 | 7 to - 86 | 8 the (7 87 | 9 same - 88 | 10 place 7) 89 | 11 , - 90 | 12 to - 91 | 13 find - 92 | 14 that - 93 | 15 the (3 94 | 16 wall - 95 | 17 under - 96 | 18 which (3) 97 | 19 he (0) 98 | 20 had - 99 | 21 been - 100 | 22 sitting 3) 101 | 23 had - 102 | 24 fallen - 103 | 25 to - 104 | 26 pieces - 105 | 27 ; - 106 | 28 and - 107 | 29 he (0) 108 | 30 marveled - 109 | 31 at - 110 | 32 the (8 111 | 33 service 8) 112 | 34 rendered - 113 | 35 him (0) 114 | 36 by - 115 | 37 the (1 116 | 38 Eagle 1) 117 | 39 . - 118 | #end document 119 | 120 | 121 | #begin document (caesar.sacr); part 000 122 | 0 Gaius (0 123 | 1 Julius - 124 | 2 Caesar 0) 125 | 3 ( - 126 | 4 12 (1 127 | 5 or - 128 | 6 13 - 129 | 7 July - 130 | 8 100 - 131 | 9 BC 1) 132 | 10 – - 133 | 11 15 (2 134 | 12 March - 135 | 13 44 - 136 | 14 BC 2) 137 | 15 ) - 138 | 16 , - 139 | 17 known - 140 | 18 by - 141 | 19 his (3_(4_(0) 142 | 20 nomen 4) 143 | 21 and - 144 | 22 cognomen (5) 145 | 23 Julius - 146 | 24 Caesar 3) 147 | 25 , - 148 | 26 was - 149 | 27 a (0 150 | 28 Roman - 151 | 29 politician - 152 | 30 , - 153 | 31 military - 154 | 32 general - 155 | 33 , - 156 | 34 and - 157 | 35 historian - 158 | 36 who (0) 159 | 37 played - 160 | 38 a (6 161 | 39 critical - 162 | 40 role 6) 163 | 41 in - 164 | 42 the (7 165 | 43 events - 166 | 44 that (7) 167 | 45 led - 168 | 46 to - 169 | 47 the (8 170 | 48 demise - 171 | 49 of - 172 | 50 the (9 173 | 51 Roman - 174 | 52 Republic 9)_8) 175 | 53 and - 176 | 54 the (10 177 | 55 rise - 178 | 56 of - 179 | 57 the (11 180 | 58 Roman - 181 | 59 Empire 11)_10)_7)_0) 182 | 60 . - 183 | 184 | 0 He (0) 185 | 1 also - 186 | 2 wrote - 187 | 3 Latin (12 188 | 4 prose 12) 189 | 5 . 
- 190 | 191 | 0 In - 192 | 1 60 (13 193 | 2 BC 13) 194 | 3 , - 195 | 4 Caesar (14_(0) 196 | 5 , - 197 | 6 Crassus (15) 198 | 7 and - 199 | 8 Pompey (16)_14) 200 | 9 formed - 201 | 10 the (17 202 | 11 First - 203 | 12 Triumvirate - 204 | 13 , - 205 | 14 a - 206 | 15 political - 207 | 16 alliance - 208 | 17 that (17) 209 | 18 dominated - 210 | 19 Roman (18 211 | 20 politics 18) 212 | 21 for - 213 | 22 several (19 214 | 23 years 19)_17) 215 | 24 . - 216 | 217 | 0 Their (14) 218 | 1 attempts - 219 | 2 to - 220 | 3 amass - 221 | 4 power - 222 | 5 as - 223 | 6 Populares (20) 224 | 7 were - 225 | 8 opposed - 226 | 9 by - 227 | 10 the (21 228 | 11 Optimates 21) 229 | 12 within - 230 | 13 the (22 231 | 14 Roman - 232 | 15 Senate 22) 233 | 16 , - 234 | 17 among - 235 | 18 them (21) 236 | 19 Cato (23 237 | 20 the - 238 | 21 Younger 23) 239 | 22 with - 240 | 23 the (24 241 | 24 frequent - 242 | 25 support - 243 | 26 of - 244 | 27 Cicero (25)_24) 245 | 28 . - 246 | 247 | 0 Caesar (0) 248 | 1 rose - 249 | 2 to - 250 | 3 become - 251 | 4 one - 252 | 5 of - 253 | 6 the (26 254 | 7 most - 255 | 8 powerful - 256 | 9 politicians - 257 | 10 in - 258 | 11 the (27 259 | 12 Roman - 260 | 13 Republic 27)_26) 261 | 14 through - 262 | 15 a (28 263 | 16 number - 264 | 17 of - 265 | 18 his (29_(0) 266 | 19 accomplishments 29)_28) 267 | 20 , - 268 | 21 notably - 269 | 22 his (30_(0) 270 | 23 victories - 271 | 24 in - 272 | 25 the (31 273 | 26 Gallic - 274 | 27 Wars 31) 275 | 28 , - 276 | 29 completed - 277 | 30 by - 278 | 31 51 (32 279 | 32 BC 32)_30) 280 | 33 . - 281 | 282 | 0 During - 283 | 1 this - 284 | 2 time - 285 | 3 , - 286 | 4 Caesar (0) 287 | 5 became - 288 | 6 the (0 289 | 7 first - 290 | 8 Roman - 291 | 9 general 0) 292 | 10 to - 293 | 11 cross - 294 | 12 both - 295 | 13 the (33_(34 296 | 14 English - 297 | 15 Channel 34) 298 | 16 and - 299 | 17 the (35 300 | 18 Rhine - 301 | 19 River 35)_33) 302 | 20 , - 303 | 21 when - 304 | 22 he (0) 305 | 23 built - 306 | 24 a (36 307 | 25 bridge - 308 | 26 across - 309 | 27 the (35 310 | 28 Rhine 35)_36) 311 | 29 and - 312 | 30 crossed - 313 | 31 the (34 314 | 32 Channel 34) 315 | 33 to - 316 | 34 invade - 317 | 35 Britain (37) 318 | 36 . - 319 | 320 | 0 Caesar' (38_(0) 321 | 1 s - 322 | 2 wars 38) 323 | 3 extended - 324 | 4 Rome' (39_(40) 325 | 5 s - 326 | 6 territory 39) 327 | 7 to - 328 | 8 Britain (37) 329 | 9 and - 330 | 10 past - 331 | 11 Gaul (41) 332 | 12 . - 333 | 334 | 0 These (28 335 | 1 achievements 28) 336 | 2 granted - 337 | 3 him (0) 338 | 4 unmatched - 339 | 5 military (42 340 | 6 power 42) 341 | 7 and - 342 | 8 threatened - 343 | 9 to - 344 | 10 eclipse - 345 | 11 the (43 346 | 12 standing - 347 | 13 of - 348 | 14 Pompey (16 349 | 15 , - 350 | 16 who (16) 351 | 17 had - 352 | 18 realigned - 353 | 19 himself (16) 354 | 20 with - 355 | 21 the (22 356 | 22 Senate 22) 357 | 23 after - 358 | 24 the (44 359 | 25 death - 360 | 26 of - 361 | 27 Crassus (15)_44) 362 | 28 in - 363 | 29 53 (45 364 | 30 BC 45)_16)_43) 365 | 31 . - 366 | 367 | 0 With - 368 | 1 the (31 369 | 2 Gallic - 370 | 3 Wars 31) 371 | 4 concluded - 372 | 5 , - 373 | 6 the (22 374 | 7 Senate 22) 375 | 8 ordered - 376 | 9 Caesar (0) 377 | 10 to - 378 | 11 step - 379 | 12 down - 380 | 13 from - 381 | 14 his (46_(0) 382 | 15 military - 383 | 16 command 46) 384 | 17 and - 385 | 18 return - 386 | 19 to - 387 | 20 Rome (40) 388 | 21 . 
- 389 | 390 | 0 Leaving - 391 | 1 his (47_(0) 392 | 2 command 47) 393 | 3 in - 394 | 4 Gaul (41) 395 | 5 meant - 396 | 6 losing - 397 | 7 his (48_(0) 398 | 8 immunity - 399 | 9 from - 400 | 10 being - 401 | 11 charged - 402 | 12 as - 403 | 13 a - 404 | 14 criminal - 405 | 15 for - 406 | 16 waging - 407 | 17 unsanctioned (49 408 | 18 wars 49)_48) 409 | 19 . - 410 | 411 | 0 As - 412 | 1 a - 413 | 2 result - 414 | 3 , - 415 | 4 Caesar (0) 416 | 5 found - 417 | 6 himself - 418 | 7 with - 419 | 8 no - 420 | 9 other - 421 | 10 options - 422 | 11 but - 423 | 12 to - 424 | 13 cross - 425 | 14 the (50 426 | 15 Rubicon 50) 427 | 16 with - 428 | 17 the (51 429 | 18 13th - 430 | 19 Legion 51) 431 | 20 , - 432 | 21 leaving - 433 | 22 his (52_(0) 434 | 23 province 52) 435 | 24 and - 436 | 25 illegally - 437 | 26 entering - 438 | 27 Roman (53 439 | 28 Italy 53) 440 | 29 under - 441 | 30 arms - 442 | 31 . - 443 | 444 | 0 This (54) 445 | 1 began - 446 | 2 Caesar' (55_(0) 447 | 3 s - 448 | 4 civil - 449 | 5 war 55) 450 | 6 , - 451 | 7 and - 452 | 8 his (56_(0) 453 | 9 victory - 454 | 10 in - 455 | 11 the (55 456 | 12 war 55)_56) 457 | 13 put - 458 | 14 him (0) 459 | 15 in - 460 | 16 an (57 461 | 17 unrivaled - 462 | 18 position - 463 | 19 of - 464 | 20 power (58_(59) 465 | 21 and - 466 | 22 influence (60)_58)_57) 467 | 23 . - 468 | #end document 469 | 470 | 471 | #begin document (cicero.sacr); part 000 472 | 0 Marcus (0 473 | 1 Tullius - 474 | 2 Cicero 0) 475 | 3 ( - 476 | 4 106 (1 477 | 5 BC 1) 478 | 6 – - 479 | 7 7 (2 480 | 8 December - 481 | 9 43 - 482 | 10 BC 2) 483 | 11 ) - 484 | 12 was - 485 | 13 a (0 486 | 14 Roman - 487 | 15 statesman (0) 488 | 16 , - 489 | 17 orator (0) 490 | 18 , - 491 | 19 lawyer (0) 492 | 20 and - 493 | 21 philosopher (0) 494 | 22 , - 495 | 23 who (0) 496 | 24 served - 497 | 25 as - 498 | 26 consul (0) 499 | 27 in - 500 | 28 the (3 501 | 29 year - 502 | 30 63 - 503 | 31 BC 3)_0) 504 | 32 . - 505 | 506 | 0 He (0) 507 | 1 came - 508 | 2 from - 509 | 3 a (4 510 | 4 wealthy - 511 | 5 municipal - 512 | 6 family - 513 | 7 of - 514 | 8 the (5 515 | 9 Roman - 516 | 10 equestrian - 517 | 11 order 5)_4) 518 | 12 , - 519 | 13 and - 520 | 14 is - 521 | 15 considered - 522 | 16 one (0 523 | 17 of - 524 | 18 Rome' (6_(7_(8) 525 | 19 s - 526 | 20 greatest - 527 | 21 orators 7) 528 | 22 and - 529 | 23 prose (9 530 | 24 stylists 9)_6)_0) 531 | 25 . - 532 | 533 | 0 His (0) 534 | 1 influence - 535 | 2 on - 536 | 3 the (10 537 | 4 Latin - 538 | 5 language 10) 539 | 6 was - 540 | 7 so - 541 | 8 immense - 542 | 9 that - 543 | 10 the (11 544 | 11 subsequent - 545 | 12 history - 546 | 13 of - 547 | 14 prose (12)_11) 548 | 15 , - 549 | 16 not - 550 | 17 only - 551 | 18 in - 552 | 19 Latin (10) 553 | 20 but - 554 | 21 in - 555 | 22 European (13 556 | 23 languages 13) 557 | 24 up - 558 | 25 to - 559 | 26 the (14 560 | 27 19th - 561 | 28 century 14) 562 | 29 , - 563 | 30 was - 564 | 31 said - 565 | 32 to - 566 | 33 be - 567 | 34 either - 568 | 35 a (15_(16 569 | 36 reaction - 570 | 37 against 16) 571 | 38 or - 572 | 39 a (17 573 | 40 return - 574 | 41 to - 575 | 42 his (18_(0) 576 | 43 style 18)_17)_15) 577 | 44 . 
- 578 | 579 | 0 Cicero (0) 580 | 1 introduced - 581 | 2 the - 582 | 3 Romans (19) 583 | 4 to - 584 | 5 the (20 585 | 6 chief - 586 | 7 schools - 587 | 8 of - 588 | 9 Greek (21 589 | 10 philosophy 21)_20) 590 | 11 and - 591 | 12 created - 592 | 13 a (22 593 | 14 Latin - 594 | 15 philosophical - 595 | 16 vocabulary 22) 596 | 17 ( - 597 | 18 with - 598 | 19 neologisms (23_(24) 599 | 20 such - 600 | 21 as - 601 | 22 evidentia (25) 602 | 23 , - 603 | 24 humanitas (26) 604 | 25 , - 605 | 26 qualitas (27) 606 | 27 , - 607 | 28 quantitas (28) 608 | 29 , - 609 | 30 and - 610 | 31 essentia (29)_23) 611 | 32 ) - 612 | 33 distinguishing - 613 | 34 himself (0) 614 | 35 as - 615 | 36 a (0 616 | 37 translator (0) 617 | 38 and - 618 | 39 philosopher (0)_0) 619 | 40 . - 620 | #end document 621 | 622 | 623 | #begin document (pliny.sacr); part 000 624 | 0 Pliny (0 625 | 1 the - 626 | 2 Elder 0) 627 | 3 ( - 628 | 4 AD - 629 | 5 23 (0) 630 | 6 – - 631 | 7 79 (0) 632 | 8 ) - 633 | 9 was - 634 | 10 a (0_(0 635 | 11 Roman - 636 | 12 author 0) 637 | 13 , - 638 | 14 naturalist (0) 639 | 15 and - 640 | 16 natural (0 641 | 17 philosopher 0) 642 | 18 , - 643 | 19 a (0 644 | 20 naval - 645 | 21 and - 646 | 22 army - 647 | 23 commander - 648 | 24 of - 649 | 25 the (0 650 | 26 early - 651 | 27 Roman - 652 | 28 Empire 0)_0) 653 | 29 , - 654 | 30 and - 655 | 31 friend (0 656 | 32 of - 657 | 33 emperor (0 658 | 34 Vespasian 0)_0)_0) 659 | 35 . - 660 | 661 | 0 Spending - 662 | 1 most - 663 | 2 of - 664 | 3 his (1_(0) 665 | 4 spare - 666 | 5 time 1) 667 | 6 studying - 668 | 7 , - 669 | 8 writing - 670 | 9 , - 671 | 10 and - 672 | 11 investigating - 673 | 12 natural (2 674 | 13 and - 675 | 14 geographic - 676 | 15 phenomena 2) 677 | 16 in - 678 | 17 the (3 679 | 18 field 3) 680 | 19 , - 681 | 20 Pliny (0) 682 | 21 wrote - 683 | 22 the (4 684 | 23 encyclopedic - 685 | 24 Naturalis - 686 | 25 Historia - 687 | 26 ( - 688 | 27 Natural - 689 | 28 History - 690 | 29 ) - 691 | 30 , - 692 | 31 which (4) 693 | 32 became - 694 | 33 an (5 695 | 34 editorial - 696 | 35 model - 697 | 36 for - 698 | 37 encyclopedias (6)_5)_4) 699 | 38 . - 700 | 701 | 0 His (7_(0) 702 | 1 nephew 7) 703 | 2 , - 704 | 3 Pliny (7 705 | 4 the - 706 | 5 Younger 7) 707 | 6 , - 708 | 7 wrote - 709 | 8 of - 710 | 9 him (0) 711 | 10 in - 712 | 11 a (8 713 | 12 letter - 714 | 13 to - 715 | 14 the (9 716 | 15 historian - 717 | 16 Tacitus 9)_8) 718 | 17 : - 719 | 18 For - 720 | 19 my (10_(7) 721 | 20 part 10) 722 | 21 I (7) 723 | 22 deem - 724 | 23 those (11 725 | 24 blessed - 726 | 25 to - 727 | 26 whom (11) 728 | 27 , - 729 | 28 by - 730 | 29 favour - 731 | 30 of - 732 | 31 the (12 733 | 32 gods 12) 734 | 33 , - 735 | 34 it - 736 | 35 has - 737 | 36 been - 738 | 37 granted - 739 | 38 either - 740 | 39 to - 741 | 40 do - 742 | 41 what (13 743 | 42 is - 744 | 43 worth - 745 | 44 writing - 746 | 45 of 13) 747 | 46 , - 748 | 47 or - 749 | 48 to - 750 | 49 write - 751 | 50 what (14 752 | 51 is - 753 | 52 worth - 754 | 53 reading 14)_11) 755 | 54 ; - 756 | 55 above - 757 | 56 measure - 758 | 57 blessed - 759 | 58 those (15 760 | 59 on - 761 | 60 whom (15) 762 | 61 both - 763 | 62 gifts - 764 | 63 have - 765 | 64 been - 766 | 65 conferred 15) 767 | 66 . - 768 | 769 | 0 In - 770 | 1 the (16 771 | 2 latter - 772 | 3 number 16) 773 | 4 will - 774 | 5 be - 775 | 6 my (0_(7) 776 | 7 uncle 0) 777 | 8 , - 778 | 9 by - 779 | 10 virtue - 780 | 11 of - 781 | 12 his (17_(18_(0) 782 | 13 own 18) 783 | 14 and - 784 | 15 of - 785 | 16 your (19_(9) 786 | 17 compositions 19)_17) 787 | 18 . 
- 788 | 789 | 0 Pliny (7 790 | 1 the - 791 | 2 Younger 7) 792 | 3 refers - 793 | 4 to - 794 | 5 Tacitus (20_(9) 795 | 6 ’ - 796 | 7 s - 797 | 8 reliance 20) 798 | 9 upon - 799 | 10 his (21_(0_(7) 800 | 11 uncle' 0) 801 | 12 s - 802 | 13 book 21) 803 | 14 , - 804 | 15 the (22 805 | 16 History - 806 | 17 of - 807 | 18 the - 808 | 19 German - 809 | 20 Wars 22) 810 | 21 . - 811 | #end document 812 | 813 | 814 | #begin document (simple.sacr); part 000 815 | 0 His (0_(0) 816 | 1 head 0) 817 | 2 hurts - 818 | 3 . - 819 | #end document 820 | -------------------------------------------------------------------------------- /testing_conll2sacr/_aesop.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 A Peasant} found {C1 an Eagle captured in {C2 a trap}} , and much admiring {C1 the bird} , set {C0 him} free . 2 | 3 | {C1 The Eagle} did not prove ungrateful to {C0 {C1 his} deliverer} , for seeing {C0 the Peasant sitting under {C3 a wall {C3 which} was not safe}} , {C1 he} flew toward {C0 him} and with {C4 {C1 his} talons} snatched {C5 a bundle} from {C6 {C0 his} head} . 4 | 5 | When {C0 the Peasant} rose in pursuit , {C1 the Eagle} let {C5 the bundle} fall again . 6 | 7 | Taking {C5 it} up , {C0 the man} returned to {C7 the same place} , to find that {C3 the wall under {C3 which} {C0 he} had been sitting} had fallen to pieces ; and {C0 he} marveled at {C8 the service} rendered {C0 him} by {C1 the Eagle} . 8 | 9 | -------------------------------------------------------------------------------- /testing_conll2sacr/_ceasar.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 Gaius Julius Caesar} ( {C1 12 or 13 July 100 BC} – {C2 15 March 44 BC} ) , known by {C3 {C4 {C0 his} nomen} and {C5 cognomen} Julius Caesar} , was {C0 a Roman politician , military general , and historian {C0 who} played {C6 a critical role} in {C7 the events {C7 that} led to {C8 the demise of {C9 the Roman Republic}} and {C10 the rise of {C11 the Roman Empire}}}} . 2 | 3 | {C0 He} also wrote {C12 Latin prose} . 4 | 5 | In {C13 60 BC} , {C14 {C0 Caesar} , {C15 Crassus} and {C16 Pompey}} formed {C17 the First Triumvirate , a political alliance {C17 that} dominated {C18 Roman politics} for {C19 several years}} . 6 | 7 | {C14 Their} attempts to amass power as {C20 Populares} were opposed by {C21 the Optimates} within {C22 the Roman Senate} , among {C21 them} {C23 Cato the Younger} with {C24 the frequent support of {C25 Cicero}} . 8 | 9 | {C0 Caesar} rose to become one of {C26 the most powerful politicians in {C27 the Roman Republic}} through {C28 a number of {C29 {C0 his} accomplishments}} , notably {C30 {C0 his} victories in {C31 the Gallic Wars} , completed by {C32 51 BC}} . 10 | 11 | During this time , {C0 Caesar} became {C0 the first Roman general} to cross both {C33 {C34 the English Channel} and {C35 the Rhine River}} , when {C0 he} built {C36 a bridge across {C35 the Rhine}} and crossed {C34 the Channel} to invade {C37 Britain} . 12 | 13 | {C38 {C0 Caesar'} s wars} extended {C39 {C40 Rome'} s territory} to {C37 Britain} and past {C41 Gaul} . 14 | 15 | {C28 These achievements} granted {C0 him} unmatched {C42 military power} and threatened to eclipse {C43 the standing of {C16 Pompey , {C16 who} had realigned {C16 himself} with {C22 the Senate} after {C44 the death of {C15 Crassus}} in {C45 53 BC}}} . 
16 | 17 | With {C31 the Gallic Wars} concluded , {C22 the Senate} ordered {C0 Caesar} to step down from {C46 {C0 his} military command} and return to {C40 Rome} . 18 | 19 | Leaving {C47 {C0 his} command} in {C41 Gaul} meant losing {C48 {C0 his} immunity from being charged as a criminal for waging {C49 unsanctioned wars}} . 20 | 21 | As a result , {C0 Caesar} found himself with no other options but to cross {C50 the Rubicon} with {C51 the 13th Legion} , leaving {C52 {C0 his} province} and illegally entering {C53 Roman Italy} under arms . 22 | 23 | {C54 This} began {C55 {C0 Caesar'} s civil war} , and {C56 {C0 his} victory in {C55 the war}} put {C0 him} in {C57 an unrivaled position of {C58 {C59 power} and {C60 influence}}} . 24 | 25 | -------------------------------------------------------------------------------- /testing_conll2sacr/_cicero.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 Marcus Tullius Cicero} ( {C1 106 BC} – {C2 7 December 43 BC} ) was {C0 a Roman {C0 statesman} , {C0 orator} , {C0 lawyer} and {C0 philosopher} , {C0 who} served as {C0 consul} in {C3 the year 63 BC}} . 2 | 3 | {C0 He} came from {C4 a wealthy municipal family of {C5 the Roman equestrian order}} , and is considered {C0 one of {C6 {C7 {C8 Rome'} s greatest orators} and {C9 prose stylists}}} . 4 | 5 | {C0 His} influence on {C10 the Latin language} was so immense that {C11 the subsequent history of {C12 prose}} , not only in {C10 Latin} but in {C13 European languages} up to {C14 the 19th century} , was said to be either {C15 {C16 a reaction against} or {C17 a return to {C18 {C0 his} style}}} . 6 | 7 | {C0 Cicero} introduced the {C19 Romans} to {C20 the chief schools of {C21 Greek philosophy}} and created {C22 a Latin philosophical vocabulary} ( with {C23 {C24 neologisms} such as {C25 evidentia} , {C26 humanitas} , {C27 qualitas} , {C28 quantitas} , and {C29 essentia}} ) distinguishing {C0 himself} as {C0 a {C0 translator} and {C0 philosopher}} . 8 | 9 | -------------------------------------------------------------------------------- /testing_conll2sacr/_pliny.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 Pliny the Elder} ( AD {C0 23} – {C0 79} ) was {C0 {C0 a Roman author} , {C0 naturalist} and {C0 natural philosopher} , {C0 a naval and army commander of {C0 the early Roman Empire}} , and {C0 friend of {C0 emperor Vespasian}}} . 2 | 3 | Spending most of {C1 {C0 his} spare time} studying , writing , and investigating {C2 natural and geographic phenomena} in {C3 the field} , {C0 Pliny} wrote {C4 the encyclopedic Naturalis Historia ( Natural History ) , {C4 which} became {C5 an editorial model for {C6 encyclopedias}}} . 4 | 5 | {C7 {C0 His} nephew} , {C7 Pliny the Younger} , wrote of {C0 him} in {C8 a letter to {C9 the historian Tacitus}} : For {C10 {C7 my} part} {C7 I} deem {C11 those blessed to {C11 whom} , by favour of {C12 the gods} , it has been granted either to do {C13 what is worth writing of} , or to write {C14 what is worth reading}} ; above measure blessed {C15 those on {C15 whom} both gifts have been conferred} . 6 | 7 | In {C16 the latter number} will be {C0 {C7 my} uncle} , by virtue of {C17 {C18 {C0 his} own} and of {C19 {C9 your} compositions}} . 8 | 9 | {C7 Pliny the Younger} refers to {C20 {C9 Tacitus} ’ s reliance} upon {C21 {C0 {C7 his} uncle'} s book} , {C22 the History of the German Wars} . 
10 | 11 | -------------------------------------------------------------------------------- /testing_conll2sacr/_simple.sacr___part_000: -------------------------------------------------------------------------------- 1 | {C0 {C0 His} head} hurts . 2 | 3 | -------------------------------------------------------------------------------- /tests/test_sacr2ann.py: -------------------------------------------------------------------------------- 1 | from sacr2ann import ( 2 | DEFAULT_MENTION_TYPE, 3 | DEFAULT_RELATION_TYPE, 4 | Annotation, 5 | RelationAnnotation, 6 | Sacr2AnnConverter, 7 | TextAnnotation, 8 | ) 9 | 10 | 11 | def test_sacr2ann_1_annotation() -> None: 12 | text = """hello {chain1:a=1,b=2,type=WORD world}!""" 13 | converter = Sacr2AnnConverter(type_property_name="type") 14 | converter.convert(text) 15 | 16 | assert converter.text == "hello world!" 17 | 18 | ann = converter.annotations 19 | assert len(ann) == 1 20 | assert ann[0] == TextAnnotation(index=1, kind="WORD", start=6, end=11) 21 | 22 | 23 | def test_sacr2ann_2_annotations_in_1_paragraph_in_2_chains() -> None: 24 | text = """hello {chain1:a=1,b=2,type=ABC world}! It'{chain2:type=DEF s} sunny""" 25 | converter = Sacr2AnnConverter(type_property_name="type") 26 | converter.convert(text) 27 | 28 | assert converter.text == "hello world! It's sunny" 29 | 30 | ann = converter.annotations 31 | assert len(ann) == 2 32 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 33 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 34 | 35 | 36 | def test_sacr2ann_2_annotations_in_1_paragraph_in_1_chain() -> None: 37 | text = "hello {chain1:a=1,b=2,type=ABC world}! " "It'{chain1:type=DEF s} sunny" 38 | converter = Sacr2AnnConverter(type_property_name="type") 39 | converter.convert(text) 40 | 41 | assert converter.text == "hello world! It's sunny" 42 | 43 | ann = converter.annotations 44 | assert len(ann) == 3 45 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 46 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 47 | assert ann[2] == RelationAnnotation( 48 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 49 | ) 50 | 51 | 52 | def test_sacr2ann_3_annotations_in_1_paragraph_in_1_chain() -> None: 53 | text = ( 54 | "hello {chain1:a=1,b=2,type=ABC world}! " 55 | "It'{chain1:type=DEF s} sunny. " 56 | "It's not {chain1:type=ABC rainy}" 57 | ) 58 | converter = Sacr2AnnConverter(type_property_name="type") 59 | converter.convert(text) 60 | 61 | assert converter.text == "hello world! It's sunny. It's not rainy" 62 | 63 | ann = converter.annotations 64 | assert len(ann) == 5 65 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 66 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 67 | assert ann[2] == RelationAnnotation( 68 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 69 | ) 70 | assert ann[3] == TextAnnotation(index=3, kind="ABC", start=34, end=39) 71 | assert ann[4] == RelationAnnotation( 72 | index=2, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[3] 73 | ) 74 | 75 | 76 | def test_sacr2ann_3_annotations_in_1_paragraph_in_2_chains() -> None: 77 | text = ( 78 | "hello {chain1:a=1,b=2,type=ABC world}! " 79 | "It'{chain2:type=DEF s} sunny. " 80 | "It's not {chain2:type=ABC rainy}" 81 | ) 82 | converter = Sacr2AnnConverter(type_property_name="type") 83 | converter.convert(text) 84 | 85 | assert converter.text == "hello world! It's sunny. 
It's not rainy" 86 | 87 | ann = converter.annotations 88 | assert len(ann) == 4 89 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 90 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 91 | assert ann[2] == TextAnnotation(index=3, kind="ABC", start=34, end=39) 92 | assert ann[3] == RelationAnnotation( 93 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[2] 94 | ) 95 | 96 | 97 | def test_sacr2ann_4_annotations_in_1_paragraph_in_2_chains() -> None: 98 | text = ( 99 | "hello {chain1:a=1,b=2,type=ABC world}! " 100 | "It'{chain2:type=DEF s} sunny. " 101 | "It's not {chain2:type=ABC rainy}. " 102 | "{chain1:type=GHI It}'s hot" 103 | ) 104 | converter = Sacr2AnnConverter(type_property_name="type") 105 | converter.convert(text) 106 | 107 | assert converter.text == "hello world! It's sunny. It's not rainy. It's hot" 108 | 109 | ann = converter.annotations 110 | assert len(ann) == 6 111 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 112 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=16, end=17) 113 | assert ann[2] == TextAnnotation(index=3, kind="ABC", start=34, end=39) 114 | assert ann[3] == RelationAnnotation( 115 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[2] 116 | ) 117 | assert ann[4] == TextAnnotation(index=4, kind="GHI", start=41, end=43) 118 | assert ann[5] == RelationAnnotation( 119 | index=2, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[4] 120 | ) 121 | 122 | 123 | def test_sacr2ann_2_annotations_in_2_paragraphs_in_2_chains() -> None: 124 | text = ( 125 | "hello {chain1:a=1,b=2,type=ABC world}!\n\n\n\n" "It'{chain2:type=DEF s} sunny" 126 | ) 127 | converter = Sacr2AnnConverter(type_property_name="type") 128 | converter.convert(text) 129 | 130 | assert converter.text == "hello world!\n\nIt's sunny" 131 | 132 | ann = converter.annotations 133 | assert len(ann) == 2 134 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 135 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=17, end=18) 136 | 137 | 138 | def test_sacr2ann_3_annotations_in_3_paragraphs_in_3_chains() -> None: 139 | text = ( 140 | "hello {chain1:a=1,b=2,type=ABC world}!\n\n\n\n" 141 | "It'{chain2:type=DEF s} sunny.\n\n" 142 | "It's not {chain3:type=ABC rainy}" 143 | ) 144 | converter = Sacr2AnnConverter(type_property_name="type") 145 | converter.convert(text) 146 | 147 | assert converter.text == "hello world!\n\nIt's sunny.\n\nIt's not rainy" 148 | 149 | ann = converter.annotations 150 | assert len(ann) == 3 151 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=6, end=11) 152 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=17, end=18) 153 | assert ann[2] == TextAnnotation(index=3, kind="ABC", start=36, end=41) 154 | 155 | 156 | def test_sacr2ann_2_nested_annotations() -> None: 157 | text = "{c1:type=ABC {c1:type=DEF abc} def} ghi jkl mno" 158 | converter = Sacr2AnnConverter(type_property_name="type") 159 | converter.convert(text) 160 | 161 | assert converter.text == "abc def ghi jkl mno" 162 | 163 | ann = converter.annotations 164 | assert len(ann) == 3 165 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=0, end=7) 166 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=0, end=3) 167 | assert ann[2] == RelationAnnotation( 168 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 169 | ) 170 | 171 | 172 | def test_sacr2ann_3_nested_annotations() -> None: 173 | text = "{c1:type=ABC {c1:type=DEF abc def {c1:type=GHI 
ghi}}} jkl mno" 174 | converter = Sacr2AnnConverter(type_property_name="type") 175 | converter.convert(text) 176 | 177 | assert converter.text == "abc def ghi jkl mno" 178 | 179 | ann = converter.annotations 180 | assert len(ann) == 5 181 | assert ann[0] == TextAnnotation(index=1, kind="ABC", start=0, end=11) 182 | assert ann[1] == TextAnnotation(index=2, kind="DEF", start=0, end=11) 183 | assert ann[2] == RelationAnnotation( 184 | index=1, kind=DEFAULT_RELATION_TYPE, source=ann[0], target=ann[1] 185 | ) 186 | assert ann[3] == TextAnnotation(index=3, kind="GHI", start=8, end=11) 187 | assert ann[4] == RelationAnnotation( 188 | index=2, kind=DEFAULT_RELATION_TYPE, source=ann[1], target=ann[3] 189 | ) 190 | 191 | 192 | def test_sacr2ann_annotations_with_leading_comments() -> None: 193 | text = "# my comment\n\n# my other comment\n\n\n" "abc {c1 def}\n\n" "{c2 ghi}" 194 | converter = Sacr2AnnConverter(type_property_name="type") 195 | converter.convert(text) 196 | 197 | assert converter.text == "abc def\n\nghi" 198 | 199 | ann = converter.annotations 200 | assert len(ann) == 2 201 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=4, end=7) 202 | assert ann[1] == TextAnnotation(index=2, kind=DEFAULT_MENTION_TYPE, start=9, end=12) 203 | 204 | 205 | def test_sacr2ann_annotations_with_middle_comments() -> None: 206 | text = ( 207 | "# my comment\n\n# my other comment\n\n\n" 208 | "abc {c1 def}\n\n" 209 | "# the middle comment\n\n" 210 | "{c2 ghi}" 211 | ) 212 | converter = Sacr2AnnConverter(type_property_name="type") 213 | converter.convert(text) 214 | 215 | assert converter.text == "abc def\n\nghi" 216 | 217 | ann = converter.annotations 218 | assert len(ann) == 2 219 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=4, end=7) 220 | assert ann[1] == TextAnnotation(index=2, kind=DEFAULT_MENTION_TYPE, start=9, end=12) 221 | 222 | 223 | def test_sacr2ann_annotations_with_trailing_comments() -> None: 224 | text = ( 225 | "# my comment\n\n# my other comment\n\n\n" 226 | "abc {c1 def}\n\n" 227 | "# the middle comment\n\n" 228 | "{c2 ghi}\n\n" 229 | "# end of text" 230 | ) 231 | converter = Sacr2AnnConverter(type_property_name="type") 232 | converter.convert(text) 233 | 234 | assert converter.text == "abc def\n\nghi\n\n" 235 | 236 | ann = converter.annotations 237 | assert len(ann) == 2 238 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=4, end=7) 239 | assert ann[1] == TextAnnotation(index=2, kind=DEFAULT_MENTION_TYPE, start=9, end=12) 240 | 241 | 242 | def test_sacr2ann_annotations_with_spaces() -> None: 243 | text = "# my comment\n\n# my other comment\n\n\n" " abc {c1 def ghi}" 244 | converter = Sacr2AnnConverter(type_property_name="type") 245 | converter.convert(text) 246 | 247 | assert converter.text == " abc def ghi" 248 | 249 | ann = converter.annotations 250 | assert len(ann) == 1 251 | assert ann[0] == TextAnnotation(index=1, kind=DEFAULT_MENTION_TYPE, start=7, end=15) 252 | 253 | 254 | def test_sacr2ann_type_property_name() -> None: 255 | text = "abc {c1:type=foo def}" 256 | converter = Sacr2AnnConverter(type_property_name="type") 257 | converter.convert(text) 258 | 259 | assert converter.text == "abc def" 260 | 261 | ann = converter.annotations 262 | assert len(ann) == 1 263 | assert ann[0] == TextAnnotation(index=1, kind="foo", start=4, end=7) 264 | 265 | 266 | def test_convert_annotations() -> None: 267 | text = "hello world! It's sunny. It's not rainy. 
It's hot" 268 | 269 | annotations: list[Annotation] = [] 270 | annotations.append(TextAnnotation(index=1, kind="ABC", start=6, end=11)) 271 | annotations.append(TextAnnotation(index=2, kind="DEF", start=16, end=17)) 272 | annotations.append(TextAnnotation(index=3, kind="ABC", start=34, end=39)) 273 | annotations.append( 274 | RelationAnnotation( 275 | index=1, 276 | kind=DEFAULT_RELATION_TYPE, 277 | source=annotations[1], 278 | target=annotations[2], 279 | ) 280 | ) 281 | annotations.append(TextAnnotation(index=4, kind="GHI", start=41, end=43)) 282 | annotations.append( 283 | RelationAnnotation( 284 | index=2, 285 | kind=DEFAULT_RELATION_TYPE, 286 | source=annotations[0], 287 | target=annotations[4], 288 | ) 289 | ) 290 | 291 | string = Sacr2AnnConverter._convert_annotations_as_string(text, annotations) 292 | assert string == ( 293 | "T1\tABC 6 11\tworld\n" 294 | "T2\tDEF 16 17\ts\n" 295 | "T3\tABC 34 39\trainy\n" 296 | f"R1\t{DEFAULT_RELATION_TYPE} Arg1:T2 Arg2:T3\n" 297 | "T4\tGHI 41 43\tIt\n" 298 | f"R2\t{DEFAULT_RELATION_TYPE} Arg1:T1 Arg2:T4\n" 299 | ) 300 | -------------------------------------------------------------------------------- /tests/test_sacr2annotable.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sacr2annotable import Sacr2AnnotableConverter 4 | 5 | 6 | @pytest.fixture 7 | def text1() -> str: 8 | return ( 9 | "{c1:prop1=a,prop2=b {c2:prop1=cc,prop2=dd abc def} ghi}. jkl {c1:prop1=eee,prop2=fff mno}.\n\n" 10 | "pqr stu {c1:prop1=gggg,prop2=hhhh vwx}\n\n" 11 | ) 12 | 13 | 14 | @pytest.fixture 15 | def text2() -> str: 16 | return ( 17 | "#textid:mytext\n\n" 18 | "#textmetadata:type=literature\n" 19 | "#textmetadata:genre=\n" 20 | "#textmetadata:century=19\n\n" 21 | "ABC {c2:prop1=A,prop2=B DEF {c2:prop1=CC,prop2=DD GHI}} ! JKL MNO\n\n" 22 | "PRQ ? {c3:prop1=EEE,prop2=FFF STU}.\n\n" 23 | "# comment 1\n" 24 | "# comment 2\n\n" 25 | "{c2:prop1=GGGG,prop2=HHHH VWX}" 26 | ) 27 | 28 | 29 | def test_sacr2annotable_converter(text1: str, text2: str) -> None: 30 | conv = Sacr2AnnotableConverter() 31 | conv.convert_text(text1) 32 | conv.convert_text(text2) 33 | corpus = conv.corpus 34 | 35 | t1 = corpus._texts[0] 36 | t2 = corpus._texts[1] 37 | 38 | assert t1.name is None 39 | assert t2.name == "mytext" 40 | 41 | expected_tokens = "abc def ghi . jkl mno . pqr stu vwx".split() 42 | assert t1.token_count == len(expected_tokens) 43 | assert [t.value for t in t1.tokens] == expected_tokens 44 | 45 | expected_tokens = "ABC DEF GHI ! JKL MNO PRQ ? STU . 
VWX".split() 46 | assert t2.token_count == len(expected_tokens) 47 | assert [t.value for t in t2.tokens] == expected_tokens 48 | 49 | assert t1.sentence_count == 3 50 | assert t2.sentence_count == 5 51 | assert t1.paragraph_count == 2 52 | assert t2.paragraph_count == 3 53 | assert t1.mention_count == 4 54 | assert t2.mention_count == 4 55 | assert t1.chain_count == 2 56 | assert t2.chain_count == 2 57 | 58 | mentions = list(corpus.iter_text_mentions_as_dict()) 59 | assert mentions[0]["string"] == "abc def ghi" 60 | assert mentions[3]["string"] == "abc def" 61 | 62 | assert mentions[4]["string"] == "DEF GHI" 63 | assert mentions[5]["string"] == "GHI" 64 | 65 | assert mentions[0]["index_of_mention_in_the_text"] == 0 66 | assert mentions[1]["index_of_mention_in_the_text"] == 2 67 | assert mentions[2]["index_of_mention_in_the_text"] == 3 68 | assert mentions[3]["index_of_mention_in_the_text"] == 1 69 | assert mentions[4]["index_of_mention_in_the_text"] == 0 70 | assert mentions[5]["index_of_mention_in_the_text"] == 1 71 | assert mentions[6]["index_of_mention_in_the_text"] == 3 72 | assert mentions[7]["index_of_mention_in_the_text"] == 2 73 | 74 | assert mentions[1]["prop1"] == "eee" 75 | assert mentions[1]["prop2"] == "fff" 76 | assert mentions[5]["prop1"] == "CC" 77 | assert mentions[5]["prop2"] == "DD" 78 | assert mentions[7]["is_singleton"] is True 79 | 80 | chains = list(corpus.iter_text_chains_as_dict()) 81 | assert chains[0]["size"] == 3 82 | assert chains[2]["size"] == 3 83 | assert chains[3]["size"] == 1 84 | 85 | 86 | def test_sacr2annotable_text_metadata(text1: str, text2: str) -> None: 87 | conv = Sacr2AnnotableConverter() 88 | conv.convert_text(text1) 89 | conv.convert_text(text2) 90 | corpus = conv.corpus 91 | 92 | with pytest.raises(KeyError): 93 | _ = corpus._texts[0].metadata["type"] 94 | 95 | with pytest.raises(KeyError): 96 | _ = corpus._texts[0].metadata["genre"] 97 | 98 | with pytest.raises(KeyError): 99 | _ = corpus._texts[0].metadata["century"] 100 | 101 | assert corpus._texts[1].metadata["type"] == "literature" 102 | assert corpus._texts[1].metadata["genre"] == "" 103 | assert corpus._texts[1].metadata["century"] == "19" 104 | -------------------------------------------------------------------------------- /tests/test_sacr_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sacr_parser2 import ( 4 | Comment, 5 | NewLineInsideParagraph, 6 | ParagraphEnd, 7 | ParagraphStart, 8 | SacrParser, 9 | Spaces, 10 | TextID, 11 | Token, 12 | Word, 13 | ) 14 | 15 | text1 = """#textid:abc-123 16 | 17 | # my comment 18 | # my other comment 19 | 20 | abc def ghi 21 | klm nop 22 | qrs 23 | 24 | # comment between 25 | 26 | ABC DEF GHI 27 | 28 | xyz 29 | XYZ 30 | 31 | # end comment""" 32 | 33 | 34 | tokens1 = [ 35 | TextID(start=0, end=17, text_id="abc-123"), 36 | Comment(start=17, end=30, value="my comment"), 37 | Comment(start=30, end=50, value="my other comment"), 38 | ParagraphStart(start=50, end=50), 39 | Word(start=50, end=53, value="abc"), 40 | Spaces(start=53, end=54, value=" "), 41 | Word(start=54, end=57, value="def"), 42 | Spaces(start=57, end=58, value=" "), 43 | Word(start=58, end=61, value="ghi"), 44 | NewLineInsideParagraph(start=61, end=62, value="\n"), 45 | Word(start=62, end=65, value="klm"), 46 | Spaces(start=65, end=66, value=" "), 47 | Word(start=66, end=69, value="nop"), 48 | NewLineInsideParagraph(start=69, end=70, value="\n"), 49 | Word(start=70, end=73, value="qrs"), 50 | ParagraphEnd(start=73, 
end=75), 51 | Comment(start=75, end=94, value="comment between"), 52 | ParagraphStart(start=94, end=94), 53 | Word(start=94, end=97, value="ABC"), 54 | Spaces(start=97, end=98, value=" "), 55 | Word(start=98, end=101, value="DEF"), 56 | Spaces(start=101, end=102, value=" "), 57 | Word(start=102, end=105, value="GHI"), 58 | ParagraphEnd(start=105, end=107), 59 | ParagraphStart(start=107, end=107), 60 | Word(start=107, end=110, value="xyz"), 61 | NewLineInsideParagraph(start=110, end=111, value="\n"), 62 | Word(start=111, end=114, value="XYZ"), 63 | ParagraphEnd(start=114, end=116), 64 | Comment(start=116, end=129, value="end comment"), 65 | ] 66 | 67 | 68 | text2 = """abc def 69 | ghi""" 70 | 71 | 72 | tokens2 = [ 73 | ParagraphStart(start=0, end=0), 74 | Word(start=0, end=3, value="abc"), 75 | Spaces(start=3, end=4, value=" "), 76 | Word(start=4, end=7, value="def"), 77 | NewLineInsideParagraph(start=7, end=8, value="\n"), 78 | Word(start=8, end=11, value="ghi"), 79 | ] 80 | 81 | 82 | text3 = """#hello 83 | #textid:abc-123 84 | abc def 85 | # not a comment 86 | """ 87 | 88 | 89 | tokens3 = [ 90 | Comment(start=0, end=7, value="hello"), 91 | TextID(start=7, end=23, text_id="abc-123"), 92 | ParagraphStart(start=23, end=23), 93 | Word(start=23, end=26, value="abc"), 94 | Spaces(start=26, end=27, value=" "), 95 | Word(start=27, end=30, value="def"), 96 | NewLineInsideParagraph(start=30, end=31, value="\n"), 97 | Word(start=31, end=32, value="#"), 98 | Spaces(start=32, end=33, value=" "), 99 | Word(start=33, end=36, value="not"), 100 | Spaces(start=36, end=37, value=" "), 101 | Word(start=37, end=38, value="a"), 102 | Spaces(start=38, end=39, value=" "), 103 | Word(start=39, end=46, value="comment"), 104 | NewLineInsideParagraph(start=46, end=47, value="\n"), 105 | ] 106 | 107 | text4 = """#comment 108 | abc def 109 | 110 | # comment 111 | """ 112 | 113 | tokens4 = [ 114 | Comment(start=0, end=9, value="comment"), 115 | ParagraphStart(start=9, end=9), 116 | Word(start=9, end=12, value="abc"), 117 | Spaces(start=12, end=13, value=" "), 118 | Word(start=13, end=16, value="def"), 119 | ParagraphEnd(start=16, end=18), 120 | Comment(start=18, end=28, value="comment"), 121 | ] 122 | 123 | 124 | @pytest.mark.parametrize( 125 | "text, tokens", 126 | [ 127 | (text1, tokens1), 128 | (text2, tokens2), 129 | (text3, tokens3), 130 | (text4, tokens4), 131 | ], 132 | ) 133 | def test_parse_texts(text: str, tokens: list[Token]) -> None: 134 | parser = SacrParser(text) 135 | actual_tokens = list(parser.parse()) 136 | assert len(actual_tokens) == len(tokens) 137 | for a_t, t in zip(actual_tokens, tokens): 138 | assert a_t == t 139 | -------------------------------------------------------------------------------- /text2jsonlines.py: -------------------------------------------------------------------------------- 1 | """Convert plain text to jsonlines. The jsonlines format stores data for 2 | several texts (a corpus). 
Each line is a valid json document, as follows: 3 | 4 | { 5 | "clusters": [], 6 | "doc_key": "nw:docname", 7 | "sentences": [["This", "is", "the", "first", "sentence", "."], 8 | ["This", "is", "the", "second", "."]], 9 | "speakers": [["spk1", "spk1", "spk1", "spk1", "spk1", "spk1"], 10 | ["spk2", "spk2", "spk2", "spk2", "spk2"]], 11 | "pos": [["DET", "V", "DET", "ADJ", "NOUN", "PUNCT"], 12 | ["DET", "V", "DET", "ADJ", "PUNCT"]], 13 | } 14 | 15 | It is used for some coreference resolution systems, such as: 16 | 17 | - https://github.com/kentonl/e2e-coref 18 | - https://github.com/kkjawz/coref-ee 19 | - https://github.com/boberle/cofr 20 | 21 | Tokenization is done with StanfordNLP 22 | (https://github.com/stanfordnlp/stanfordnlp) (Qi, Dozat, Zhang, Manning 2018). 23 | 24 | You need to install StanfordNLP via pip and then download the models, for example 25 | for French models (use "en" for English models): 26 | 27 | python3 -c "import stanfordnlp; stanfordnlp.download('fr')" 28 | 29 | Notes: 30 | - the doc key is the concatenation of `--genre` and the file path, 31 | - speaker data are left blank ("_") 32 | """ 33 | 34 | # (C) Bruno Oberle 2020 - Mozilla Public Licence 2.0 35 | 36 | 37 | import argparse 38 | import json 39 | import re 40 | 41 | import stanfordnlp 42 | from stanfordnlp.models.common.conll import CoNLLFile 43 | 44 | # download French models: 45 | #stanfordnlp.download('fr') 46 | 47 | 48 | def tokenize(fpath, lang): 49 | 50 | content = open(fpath).read() 51 | paragraphs = re.split(r'\n+', content) 52 | res_sents = [] 53 | res_pars = [] 54 | res_pos = [] 55 | start_par = 0 56 | for par in paragraphs: 57 | par = par.strip() 58 | if not par: 59 | continue 60 | doc = stanfordnlp.Document(par) 61 | nlp = stanfordnlp.Pipeline(lang=lang, processors="tokenize,mwt,pos") 62 | doc = nlp(doc) 63 | #print(doc.conll_file.conll_as_string()) 64 | #print(doc.conll_file.sents) 65 | sents = [ 66 | [ token[1] for token in sent if '-' not in token[0] ] 67 | for sent in doc.conll_file.sents 68 | ] 69 | pos = [ 70 | [ token[3] for token in sent if '-' not in token[0] ] 71 | for sent in doc.conll_file.sents 72 | ] 73 | res_sents.extend(sents) 74 | res_pos.extend(pos) 75 | length = sum((len(s) for s in sents)) 76 | res_pars.append([start_par, start_par+length-1]) 77 | start_par = start_par+length 78 | return res_sents, res_pos, res_pars 79 | 80 | 81 | def make_jsonlines(sents, pos, pars, fpath, genre): 82 | doc = dict( 83 | doc_key = f"{genre[:2]}:{fpath}", 84 | sentences = sents, 85 | speakers = [ [ "_" for tok in sent ] for sent in sents ], 86 | clusters = [], 87 | pos = pos, 88 | paragraphs = pars, 89 | ) 90 | return json.dumps(doc) 91 | 92 | 93 | 94 | def make_conll(sents, fpath, genre): 95 | res = f"#begin document {genre[:2]}:{fpath}\n" 96 | for sent in sents: 97 | for i, token in enumerate(sent): 98 | res += f"{i+1}\t{token}\n" 99 | res += "\n" 100 | res += "#end document" 101 | return res 102 | 103 | 104 | 105 | def parse_args(): 106 | # definition 107 | parser = argparse.ArgumentParser(prog="text2jsonlines", 108 | description=__doc__, 109 | formatter_class=argparse.RawDescriptionHelpFormatter) 110 | # arguments (not options) 111 | parser.add_argument("infpath", default="", help="input file") 112 | # options 113 | parser.add_argument("--conll", dest="export_conll", default=False, 114 | action="store_true", 115 | help="export conll and not jsonlines (for debugging)") 116 | parser.add_argument("--genre", dest="genre", default="ge", 117 | help="genre (default is 'ge')") 118 | 
parser.add_argument("--lang", dest="lang", default="en", 119 | help="lang: en, fr, etc. (default is 'en')") 120 | parser.add_argument("-o", dest="outfpath", required=False, 121 | default=None, help="output file (default to stdout)") 122 | # reading 123 | args = parser.parse_args() 124 | return args 125 | 126 | 127 | 128 | def main(): 129 | args = parse_args() 130 | sents, pos, pars = tokenize(args.infpath, lang=args.lang) 131 | if args.export_conll: 132 | code = make_conll(sents, fpath=args.infpath, genre=args.genre) 133 | else: 134 | code = make_jsonlines(sents, pos, pars, 135 | fpath=args.infpath, genre=args.genre) 136 | if args.outfpath: 137 | open(args.outfpath, 'w').write(code + "\n") 138 | else: 139 | print(code) 140 | 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | --------------------------------------------------------------------------------