├── setup.cfg ├── src └── poioapi │ ├── VERSION │ ├── tests │ ├── sample_files │ │ ├── tcf_graf │ │ │ ├── corpus.txt │ │ │ ├── test_write.tcf │ │ │ ├── corpus.hdr │ │ │ ├── corpus-sentences.xml │ │ │ ├── corpus-POStags.xml │ │ │ ├── corpus-lemmas.xml │ │ │ └── corpus-tokens.xml │ │ ├── mapper │ │ │ ├── example_export.json │ │ │ └── example.json │ │ ├── toolbox_graf │ │ │ ├── toolbox.txt │ │ │ ├── toolbox-itmGroup.xml │ │ │ ├── toolbox.hdr │ │ │ └── toolbox_latex.txt │ │ ├── balochi_graf │ │ │ ├── balochi-comment.xml │ │ │ ├── balochi.hdr │ │ │ ├── balochi-graid2.xml │ │ │ └── balochi.txt │ │ ├── brat_graf │ │ │ ├── annotation.conf │ │ │ └── dict-aleman2000-9-69.hdr │ │ ├── elan_graf │ │ │ ├── example-gestures.xml │ │ │ ├── example-gesture_meaning.xml │ │ │ ├── example.hdr │ │ │ ├── example-gesture_phases.xml │ │ │ ├── example-utterance.xml │ │ │ └── example-phonetic_transcription.xml │ │ ├── typecraft_graf │ │ │ ├── typecraft_example.hdr │ │ │ ├── typecraft_example-description.xml │ │ │ ├── typecraft_example-phrase.xml │ │ │ └── typecraft_example-translation.xml │ │ ├── mandinka │ │ │ ├── mandinka_latex.txt │ │ │ └── mandinka.txt │ │ ├── latex │ │ │ ├── mandinka_latex_expected.tex │ │ │ └── toolbox_latex_expected.tex │ │ ├── shoebox_graf │ │ │ └── shoebox.xml │ │ └── odin │ │ │ └── odin_test.xml │ ├── io │ │ ├── __init__.py │ │ ├── test_brat.py │ │ ├── test_latex.py │ │ ├── test_obt.py │ │ ├── test_shoebox.py │ │ ├── test_mandinka.py │ │ ├── test_toolbox.py │ │ ├── test_toolboxxml.py │ │ ├── test_memory.py │ │ ├── test_elan.py │ │ ├── test_odin.py │ │ ├── test_tcf.py │ │ ├── test_typecraft.py │ │ └── test_graf.py │ ├── __init__.py │ ├── test_mapper.py │ ├── test_annotationgraph.py │ └── test_data.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ ├── wikipedia_extractor.py │ ├── brat.py │ ├── shoebox.py │ ├── toolboxxml.py │ ├── obt.py │ └── memory.py │ ├── mappings │ ├── MANDINKA_TYPECRAFT.json │ └── ODIN_TYPECRAFT.json │ └── corpus.py ├── requirements.txt ├── doc ├── _static │ ├── Thumbs.db │ ├── graf_schema.png │ ├── typecraft_import.png │ ├── elan_tier_attributes.png │ ├── elan_tier_hierarchy.png │ ├── excel_screenshit_hinuq.jpg │ ├── calc_settings_csvexport.png │ └── excel_screenshit_hinuq_small.jpg ├── data.rst ├── io.elan.rst ├── io.graf.rst ├── io.typecraft.rst ├── annotationtree.rst ├── annotationgraph.rst ├── io.pickle.rst ├── modules.rst ├── index.rst ├── parserwriter.rst ├── brat.rst ├── Makefile ├── make.bat └── analysis.rst ├── example_data ├── toolbox.txt └── balochi_graf │ ├── balochi-comment.xml │ ├── balochi.hdr │ ├── balochi-graid2.xml │ └── balochi.txt ├── MANIFEST.in ├── .gitignore ├── README.rst ├── examples ├── filter.py └── poio_converter.py └── setup.py /setup.cfg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/poioapi/VERSION: -------------------------------------------------------------------------------- 1 | 0.3.6 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | regex 3 | -------------------------------------------------------------------------------- /doc/_static/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/Thumbs.db -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/corpus.txt: -------------------------------------------------------------------------------- 1 | Peter aß eine Käsepizza. Sie schmeckte ihm. -------------------------------------------------------------------------------- /example_data/toolbox.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/example_data/toolbox.txt -------------------------------------------------------------------------------- /doc/_static/graf_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/graf_schema.png -------------------------------------------------------------------------------- /doc/_static/typecraft_import.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/typecraft_import.png -------------------------------------------------------------------------------- /doc/_static/elan_tier_attributes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/elan_tier_attributes.png -------------------------------------------------------------------------------- /doc/_static/elan_tier_hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/elan_tier_hierarchy.png -------------------------------------------------------------------------------- /doc/_static/excel_screenshit_hinuq.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/excel_screenshit_hinuq.jpg -------------------------------------------------------------------------------- /doc/_static/calc_settings_csvexport.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/calc_settings_csvexport.png -------------------------------------------------------------------------------- /doc/data.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | poioapi.data 3 | ============ 4 | 5 | .. automodule:: poioapi.data 6 | :members: 7 | 8 | 9 | -------------------------------------------------------------------------------- /doc/io.elan.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | poioapi.io.elan 3 | =============== 4 | 5 | .. automodule:: poioapi.io.elan 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /doc/io.graf.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | poioapi.io.graf 3 | =============== 4 | 5 | .. automodule:: poioapi.io.graf 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/mapper/example_export.json: -------------------------------------------------------------------------------- 1 | { 2 | "gloss": { 3 | "3PL": "", 4 | "4PL": "" 5 | } 6 | } -------------------------------------------------------------------------------- /doc/_static/excel_screenshit_hinuq_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/doc/_static/excel_screenshit_hinuq_small.jpg -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.rst MANIFEST.in requirements.txt 2 | include setup.py distribute_setup.py 3 | include src/poioapi/VERSION 4 | 5 | -------------------------------------------------------------------------------- /doc/io.typecraft.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | poioapi.io.typecraft 3 | ==================== 4 | 5 | .. automodule:: poioapi.io.typecraft 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/toolbox_graf/toolbox.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cidles/poio-api/HEAD/src/poioapi/tests/sample_files/toolbox_graf/toolbox.txt -------------------------------------------------------------------------------- /doc/annotationtree.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | poioapi.annotationtree 3 | ====================== 4 | 5 | .. automodule:: poioapi.annotationtree 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /doc/annotationgraph.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | poioapi.annotationgraph 3 | ======================= 4 | 5 | .. automodule:: poioapi.annotationgraph 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | *.pyc 3 | .idea 4 | .vscode 5 | dist/ 6 | build/ 7 | doc/_build/ 8 | src/Poio_API.egg-info/ 9 | src/poio_api.egg-info/ 10 | graf_python.egg-info/ 11 | src/graf_python.egg-info/ 12 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/mapper/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "tier_mapping": { 3 | "gloss": ["gloss"], 4 | "part of speech": ["pos"] 5 | }, 6 | "gloss": { 7 | "1SG": "1SG", 8 | "SINGLE": "ANNOTATION", 9 | "MULTI, TAG": "TEST" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /doc/io.pickle.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | poioapi.io.pickle 3 | ================= 4 | 5 | .. automodule:: poioapi.io.pickle 6 | :members: 7 | 8 | Indices and tables 9 | ================== 10 | 11 | * :ref:`genindex` 12 | * :ref:`modindex` 13 | * :ref:`search` -------------------------------------------------------------------------------- /src/poioapi/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | -------------------------------------------------------------------------------- /src/poioapi/io/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT -------------------------------------------------------------------------------- /src/poioapi/tests/io/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Antonio Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | -------------------------------------------------------------------------------- /src/poioapi/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Antonio Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | -------------------------------------------------------------------------------- /doc/modules.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | PoioAPI Package 3 | *************** 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | 8 | data 9 | annotationtree 10 | annotationgraph 11 | 12 | ****************** 13 | PoioAPI IO Package 14 | ****************** 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | 19 | io.elan 20 | io.graf 21 | io.pickle 22 | io.typecraft 23 | -------------------------------------------------------------------------------- /example_data/balochi_graf/balochi-comment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/balochi_graf/balochi-comment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/test_write.tcf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Poio API conversion 5 | 6 | 7 | file:///D:/Data/elan/elan-example1.mpg 8 | h:\ProjectsWin\git-github\poio-api\src\poioapi\tests\sample_files\elan_graf\example.eaf 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/brat_graf/annotation.conf: -------------------------------------------------------------------------------- 1 | [entities] 2 | formatting 3 | italic 4 | tab 5 | newline 6 | bold 7 | underline 8 | superscript 9 | smallcaps 10 | hyphen 11 | pagebreak 12 | dictinterpretation 13 | head 14 | pos 15 | translation 16 | crossreference 17 | counterpart 18 | footnote 19 | stratum 20 | phonology 21 | boundary 22 | dialectidentification 23 | headorth 24 | typo 25 | iso-639-3 26 | spa 27 | des 28 | doculect 29 | Desano 30 | Espan_ol 31 | 32 | [relations] 33 | # To Arg1:, Arg2: 34 | Arg1:, Arg2:, : 35 | 36 | [events] 37 | # none 38 | 39 | [attributes] 40 | # none 41 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/elan_graf/example-gestures.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | R Gesture Unit 1 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Poio API 2 | ======== 3 | Poio API is a free and open source Python library to access and search data from 4 | language documentation in your linguistic analysis workflow. It converts file 5 | formats like Elan's EAF, Toolbox files, Typecraft XML and others into annotation 6 | graphs as defined in ISO 24612. Those graphs, for which we use an implementation 7 | called "Graph Annotation Framework" (GrAF), allow unified access to linguistic 8 | data from a wide range sources. 9 | 10 | For documentation, please visit http://media.cidles.eu/poio/poio-api/ 11 | 12 | License 13 | ------- 14 | 15 | Poio API source code is distributed under the Apache 2.0 License. 16 | 17 | Poio API documentation is distributed under the Creative Commons Attribution 18 | 3.0 Unported. 19 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/elan_graf/example-gesture_meaning.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | a roundabout 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. Poio API documentation master file, created by 2 | sphinx-quickstart on Tue Feb 21 11:21:42 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Poio API's documentation! 7 | ************************************ 8 | 9 | Poio API provides access to language documentation data and a wide range of annotations schemes stored in different file formats. 10 | 11 | The project's homepage is: http://media.cidles.eu/poio/poio-api/ 12 | 13 | 14 | Contents 15 | ======== 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | introduction 21 | conversion 22 | parserwriter 23 | analysis 24 | 25 | 26 | API documentation 27 | ================= 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | modules 33 | 34 | 35 | Indices and tables 36 | ================== 37 | 38 | * :ref:`genindex` 39 | * :ref:`search` 40 | 41 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/corpus.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | TCF Example 6 | 7 | 8 | Documentation Place 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/toolbox_graf/toolbox-itmGroup.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | itmGroup..n013 13 | itmGroup..n014 14 | 15 | 16 | 17 | 18 | 19 | Wak nmatu—The pig wife 20 | Kalsarap Namaf 21 | 22 | 23 | 24 | 25 | 26 | Kalsarap on coconuts 27 | Kalsarap Namaf 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/corpus-sentences.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | sentences..ns1 13 | sentences..ns2 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | t1 t2 t3 t4 t5 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | t6 t7 t8 t9 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/toolbox_graf/toolbox.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Toolbox XML Example 6 | 7 | 8 | Documentation Place 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /example_data/balochi_graf/balochi.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Pickle Example 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /example_data/balochi_graf/balochi-graid2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 0.h:a 19 | 20 | 21 | 22 | 23 | 24 | 25 | nc 26 | 27 | 28 | 29 | 30 | 31 | 32 | 0:s 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/balochi_graf/balochi.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Pickle Example 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/balochi_graf/balochi-graid2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 0.h:a 19 | 20 | 21 | 22 | 23 | 24 | 25 | nc 26 | 27 | 28 | 29 | 30 | 31 | 32 | 0:s 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/typecraft_graf/typecraft_example.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Typecraft Example 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/elan_graf/example.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Elan Example 6 | 7 | 8 | Documentation Place 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /doc/parserwriter.rst: -------------------------------------------------------------------------------- 1 | Parser and Writer classes to map from and to file formats 2 | ********************************************************* 3 | 4 | This chapter explains how the Parser and Writer classes in Poio API work. You 5 | will learn how to write your own parsers and writers to support a custom file 6 | format. Poio API already support a lot of file formats out of the box, which 7 | are explained in the following sections. In any case the parser 8 | class is used by a general `Converter` class to map the file format onto 9 | a GrAF object. The user may then modify the GrAF object and write back the 10 | changes to any of the supported file format (or a custom format, if you 11 | implemented a writer). The following Python code demonstrates how one 12 | file format can be convert to another one with support of an existing parser 13 | and writer class: 14 | 15 | .. code-block:: python 16 | 17 | parser = poioapi.io.wikipedia_extractor.Parser("Wikipedia.xml") 18 | writer = poioapi.io.graf.Writer() 19 | 20 | converter = poioapi.io.graf.GrAFConverter(parser, writer) 21 | converter.parse() 22 | converter.write("Wikipedia.hdr") 23 | 24 | This code parses from the XML output of the `Wikipedia Extractor 25 | `_ and writes the content 26 | as GrAF files. 27 | 28 | **Contents** 29 | 30 | .. toctree:: 31 | :maxdepth: 2 32 | 33 | parserwriter_howto 34 | excel 35 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/mandinka/mandinka_latex.txt: -------------------------------------------------------------------------------- 1 | Musu-kéebâa níŋ a lá maañóo le táa-tá lóo-ñín-óo la. 2 | femme-âgé.D avec 3SG GEN jeune_épouse.D FOC aller-ACPP bois-chercher-D OBL 3 | Une vieille femme et sa jeune co-épouse étaient allées chercher du bois. 4 | Kabíríŋ i yé i la lôo lá-ndi, 5 | quand 3PL ACPP 3PL GEN bois.D être_posé-CAUS 6 | Quand elles ont posé leur bois [avant de le charger], 7 | saa mínín-tá maañóo la lóo-sít-óo la, 8 | serpent.D s’enrouler-ACPP jeune_épouse.D GEN bois-attacher-D OBL 9 | un serpent s’est enroulé autour du fagot de la jeune épouse, 10 | barí wo máŋ a lóŋ. 11 | mais DEM ACPN 3SG savoir 12 | mais celle-ci ne s’en est pas aperçue. 13 | Musu-keebaa-mâa ñáa be sǎa kaŋ, 14 | femme-âgé-SELECT.D œil.D COPLOC serpent.D sur 15 | La vieille avait son regard fixé sur le serpent 16 | míŋ be mínín-diŋ a la lóo-sít-ôo bála, 17 | REL COPLOC s’enrouler-RES 3SG GEN bois-attacher-D CONT 18 | qui était enroulé au fagot de la jeune co-épouse, 19 | a yé a fó a ye kó, 20 | 3SG ACPP 3SG dire 3SG BEN QUOT 21 | elle lui a répondu, 22 | « Níŋ yunduyónd-óo son-ta, 23 | si youndouyondo-D être_d’accord-ACPP 24 | « Si le youndouyondo est d’accord, 25 | ŋ si ŋ́ kuu janníŋ ŋ be tábí-r-ôo ké-la. » 26 | 1PL POT REFL laver avant_que 1PL COPLOC cuire-ANTIP-D faire-INF 27 | nous pourrons nous laver avant de faire à manger. » 28 | -------------------------------------------------------------------------------- /src/poioapi/mappings/MANDINKA_TYPECRAFT.json: -------------------------------------------------------------------------------- 1 | { 2 | "tier_mapping": { 3 | "gloss": ["gloss"], 4 | "part of speech": ["pos"] 5 | }, 6 | "gloss": { 7 | "1SG": "1SG", 8 | "1PL": "1PL", 9 | "2PL": "2PL", 10 | "2SG": "2SG", 11 | "3PL": "3PL", 12 | "3SG": "3SG", 13 | "ABSTR": "ABSTR", 14 | "ACPN": "CMPL.NEG", 15 | "ACPP": "CMPL", 16 | "ANTIP": "APASS", 17 | "APPART": "", 18 | "ASSOC": "ASSOC", 19 | "BEN": "BEN", 20 | "CAUS": "CAUS", 21 | "CONT": "", 22 | "CONTR": "CONTR", 23 | "CTRP ": "", 24 | "D, DEF": "DEF", 25 | "DEM": "DEM", 26 | "DEST": "", 27 | "EMPH": "EMPH", 28 | "FOC": "FOC", 29 | "GEN": "GEN", 30 | "GER": "GERDV", 31 | "HAB": "HAB", 32 | "INAC": "NCOMPL", 33 | "INACN": "NCOMPL.NEG", 34 | "INDEF": "INDEF", 35 | "INF": "INF", 36 | "INT": "INTS", 37 | "LOC": "LOC", 38 | "MAN": "MAN", 39 | "MTV": "MTV", 40 | "NMAG": "V>Nagt", 41 | "NMINS": "V>Ninstr", 42 | "OBL": "OBL", 43 | "OBLIG": "OBLIG", 44 | "OPT": "OPT", 45 | "ORN": "ORN", 46 | "PAS": "PASS", 47 | "PL": "PL", 48 | "PLASS": "Plassc", 49 | "POT": "POT", 50 | "PREDS": "", 51 | "PRIV": "PRIV", 52 | "PROG": "PROG", 53 | "Q": "Q", 54 | "QUOT": "QUOT", 55 | "RECIP": "RECP", 56 | "REFL": "REFL", 57 | "REL": "REL", 58 | "RES": "RESLT", 59 | "RESID": "LOCREL", 60 | "SELECT": "SLCT", 61 | "SPHP": "", 62 | "SUBJN": "SBJV.NEG", 63 | "SUBJP": "SBJV", 64 | "ADVCL": ["pos", "ADV"], 65 | "COPID": ["pos", "COPident"], 66 | "COPLOC": ["pos", "COPloc"], 67 | "COPN": ["pos", "COPneg"], 68 | "INTERJ": ["pos", "INTRJCT"], 69 | "ORD": ["pos", "ORD"] 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /examples/filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import codecs 11 | import poioapi.annotationgraph 12 | 13 | # Load an annotation graph from example file at: 14 | # http://tla.mpi.nl/tools/tla-tools/elan/download 15 | ag = poioapi.annotationgraph.AnnotationGraph() 16 | ag.from_elan("elan-example3.eaf") 17 | 18 | # Set one of the tier hierarchies as our "working" hierarchy 19 | tier_hierarchy = None 20 | for t in ag.tier_hierarchies: 21 | if t[0] == "utterance..W-Spch": 22 | tier_hierarchy = t 23 | ag.structure_type_handler = poioapi.data.DataStructureType( 24 | tier_hierarchy) 25 | 26 | # Create filter manually 27 | af = poioapi.annotationgraph.AnnotationGraphFilter(ag) 28 | af.set_filter_for_tier("words..W-Words", "follow") 29 | af.set_filter_for_tier("part_of_speech..W-POS", r"\bpro\b") 30 | 31 | ag.append_filter(af) 32 | 33 | print("Filtered root nodes:") 34 | print(ag.filtered_node_ids) 35 | 36 | # Remove filter again 37 | ag.pop_filter() 38 | 39 | # Create filter from dict 40 | search_terms = { 41 | "words..W-Words": "follow", 42 | "part_of_speech..W-POS": r"\bpro\b" 43 | } 44 | af = ag.create_filter_for_dict(search_terms) 45 | 46 | ag.append_filter(af) 47 | 48 | print("Filtered root nodes:") 49 | print(ag.filtered_node_ids) 50 | 51 | # write result as HTML 52 | html = ag.as_html_table(True) 53 | f = codecs.open("test.html", "w", "utf-8") 54 | f.write(html) 55 | f.close() 56 | 57 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/brat_graf/dict-aleman2000-9-69.hdr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | QuantHistLing digitization of: Tulio Alemán M., Reinaldo López H., Marion Miller. 2000. Wirã ya, Peamasa ya wererituri 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | Wirã ya, Peamasa ya wererituri 15 | Tulio Alemán M., Reinaldo López H., Marion Miller 16 | 2000 17 | None 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_brat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import absolute_import 11 | 12 | import os 13 | import codecs 14 | 15 | import poioapi.io.brat 16 | 17 | import graf 18 | 19 | 20 | class TestBrat: 21 | """ 22 | This class contain the test methods to the 23 | class io.brat.py. 24 | 25 | """ 26 | 27 | def setup(self): 28 | filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 29 | "brat_graf", "dict-aleman2000-9-69.hdr") 30 | 31 | self.outputfile = os.path.join(os.path.dirname(__file__), "..", "sample_files", 32 | "brat_graf", "result.ann") 33 | 34 | parser = graf.io.GraphParser() 35 | self.graf = parser.parse(filename) 36 | self.writer = poioapi.io.brat.Writer("dictinterpretation", "substring") 37 | 38 | def test_write(self): 39 | converter = poioapi.io.graf.GrAFConverter(None, self.writer) 40 | converter.graf = self.graf 41 | converter.write(self.outputfile) 42 | 43 | annotations = os.path.join(os.path.dirname(__file__), "..", "sample_files", 44 | "brat_graf", "dict-aleman2000-9-69.ann") 45 | 46 | annotations_res = os.path.join(os.path.dirname(__file__), "..", "sample_files", 47 | "brat_graf", "result.ann") 48 | 49 | file_ann = codecs.open(annotations, "r", "utf-8") 50 | file_ann_res = codecs.open(annotations_res, "r", "utf-8") 51 | 52 | assert len(file_ann.readlines()) == len(file_ann_res.readlines()) -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/latex/mandinka_latex_expected.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,11pt]{article} 2 | \usepackage{ucs} 3 | \usepackage[utf8x]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \usepackage{gb4n} 6 | 7 | \usepackage{latexsym} 8 | 9 | \begin{document} 10 | \ea 11 | \glll 12 | Musukéebâa níŋ a lá maañóo le táatá lóoñínóo la\\ 13 | Musu-kéebâa níŋ a lá maañóo le táa-tá lóo-ñín-óo la\\ 14 | femme-âgé.D avec 3SG GEN jeune\_épouse.D FOC aller-ACPP bois-chercher-D OBL\\ 15 | \glt{} Une vieille femme et sa jeune co-épouse étaient allées chercher du bois.\\ 16 | \z 17 | \ea 18 | \glll 19 | Kabíríŋ i yé i la lôo lándi sa□a míníntá maañóo la lóosítóo la barí wo máŋ a lóŋ\\ 20 | Kabíríŋ i yé i la lôo lá-ndi sa□a mínín-tá maañóo la lóo-sít-óo la barí wo máŋ a lóŋ\\ 21 | quand 3PL ACPP 3PL GEN bois.D être\_posé-CAUS serpent.D s’enrouler-ACPP jeune\_épouse.D GEN bois-attacher-D OBL mais DEM ACPN 3SG savoir\\ 22 | \glt{} Quand elles ont posé leur bois [avant de le charger], un serpent s’est enroulé autour du fagot de la jeune épouse, mais celle-ci ne s’en est pas aperçue.\\ 23 | \z 24 | \ea 25 | \glll 26 | Musukeebaamâa ñáa be sǎa kaŋ míŋ be míníndiŋ a la lóosítôo bála a yé a fó a ye kó Níŋ yunduyóndóo sonta ŋ si ŋ kuu janníŋ ŋ be tábírôo kéla\\ 27 | Musu-keebaa-mâa ñáa be sǎa kaŋ míŋ be mínín-diŋ a la lóo-sít-ôo bála a yé a fó a ye kó Níŋ yunduyónd-óo son-ta ŋ si ŋ kuu janníŋ ŋ be tábí-r-ôo ké-la\\ 28 | femme-âgé-SELECT.D œil.D COPLOC serpent.D sur REL COPLOC s’enrouler-RES 3SG GEN bois-attacher-D CONT 3SG ACPP 3SG dire 3SG BEN QUOT si youndouyondo-D être\_d’accord-ACPP 1PL POT REFL laver avant\_que 1PL COPLOC cuire-ANTIP-D faire-INF\\ 29 | \glt{} La vieille avait son regard fixé sur le serpent qui était enroulé au fagot de la jeune co-épouse, elle lui a répondu, «Si le youndouyondo est d’accord, nous pourrons nous laver avant de faire à manger.»\\ 30 | \z 31 | \end{document} -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/elan_graf/example-gesture_phases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | preparation 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | stroke 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | retraction 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_latex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Pedro Manha 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import poioapi.io.latex 11 | import poioapi.annotationgraph 12 | 13 | import os.path 14 | import filecmp 15 | 16 | 17 | class TestWriter(): 18 | 19 | def test_write_with_mandinka(self): 20 | input = os.path.join(os.path.dirname(__file__), "..", 21 | "sample_files", "mandinka", "mandinka_latex.txt") 22 | 23 | output = os.path.join(os.path.dirname(__file__), "..", 24 | "sample_files", "latex", "mandinka_latex.tex") 25 | 26 | expected = os.path.join(os.path.dirname(__file__), "..", 27 | "sample_files", "latex", "mandinka_latex_expected.tex") 28 | 29 | ag = poioapi.annotationgraph.AnnotationGraph() 30 | ag = ag.from_mandinka(input) 31 | writer = poioapi.io.latex.Writer() 32 | writer.write(output, ag) 33 | 34 | assert(os.path.getsize(output) == os.path.getsize(expected)) 35 | assert(filecmp.cmp(output, expected, False)) 36 | 37 | def test_write_with_toolbox(self): 38 | input = os.path.join(os.path.dirname(__file__), "..", 39 | "sample_files", "toolbox_graf", "toolbox_latex.txt") 40 | 41 | output = os.path.join(os.path.dirname(__file__), "..", 42 | "sample_files", "latex", "toolbox_latex.tex") 43 | 44 | expected = os.path.join(os.path.dirname(__file__), "..", 45 | "sample_files", "latex", "toolbox_latex_expected.tex") 46 | 47 | ag = poioapi.annotationgraph.AnnotationGraph() 48 | ag = ag.from_toolbox(input) 49 | writer = poioapi.io.latex.Writer() 50 | writer.write(output, ag) 51 | 52 | assert(os.path.getsize(output) == os.path.getsize(expected)) 53 | assert(filecmp.cmp(output, expected, False)) -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_obt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | import poioapi.io.obt 14 | 15 | class TestParser: 16 | """ 17 | This class contain the test methods to the 18 | class io/obt.py. 19 | 20 | """ 21 | 22 | def setup(self): 23 | self.filename = os.path.join(os.path.dirname(__file__), "..", 24 | "sample_files", "obt", "suite_fotball.xml") 25 | 26 | self.parser = poioapi.io.obt.Parser(self.filename) 27 | 28 | def test_get_root_tiers(self): 29 | root_tiers = self.parser.get_root_tiers() 30 | assert len(root_tiers) == 1 31 | 32 | def test_get_child_tiers_for_tier(self): 33 | root_tiers = self.parser.get_root_tiers() 34 | 35 | child_tiers = self.parser.get_child_tiers_for_tier(root_tiers[0]) 36 | assert len(child_tiers) == 1 37 | 38 | child_tiers = self.parser.get_child_tiers_for_tier( 39 | poioapi.io.graf.Tier('word')) 40 | assert len(child_tiers) == 1 41 | 42 | def test_get_annotations_for_tier(self): 43 | root_tiers = self.parser.get_root_tiers() 44 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 45 | 46 | assert len(root_annotations) == 101 47 | 48 | tier = poioapi.io.graf.Tier("word") 49 | annotation_parent = root_annotations[0] 50 | assert annotation_parent.value == \ 51 | "I Strømsgodset-stallen er det bare to spillere som kan skilte med millioninntekt fra 2012 ." 52 | assert annotation_parent.id == "a0" 53 | 54 | tier_annotations = self.parser.get_annotations_for_tier( 55 | tier, annotation_parent) 56 | 57 | assert len(tier_annotations) == 15 58 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/typecraft_graf/typecraft_example-description.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | firi: can firi inflect like a verb? 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_shoebox.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | 14 | import poioapi.io.shoebox 15 | import poioapi.io.graf 16 | 17 | 18 | class TestParser: 19 | """ 20 | This class contain the test methods to the 21 | class io.shoebox.py. 22 | 23 | """ 24 | 25 | def setup(self): 26 | self.filename = os.path.join(os.path.dirname(__file__), "..", 27 | "sample_files", "shoebox_graf", "shoebox.xml") 28 | self.parser = poioapi.io.shoebox.Parser(self.filename) 29 | 30 | def test_get_root_tiers(self): 31 | root_tiers = self.parser.get_root_tiers() 32 | assert len(root_tiers) == 1 33 | 34 | def test_get_child_tiers_for_tier(self): 35 | root_tiers = self.parser.get_root_tiers() 36 | 37 | child_tiers = self.parser.get_child_tiers_for_tier(root_tiers[0]) 38 | assert len(child_tiers) == 1 39 | 40 | child_tiers = self.parser.get_child_tiers_for_tier( 41 | poioapi.io.graf.Tier('m')) 42 | assert len(child_tiers) == 1 43 | 44 | 45 | def test_get_annotations_for_tier(self): 46 | root_tiers = self.parser.get_root_tiers() 47 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 48 | 49 | assert len(root_annotations) == 1 50 | 51 | tier = poioapi.io.graf.Tier("t") 52 | annotation_parent = root_annotations[0] 53 | assert annotation_parent.value == "Baho katali difisi na sungula hawowa mbuya kamei sungula kamgamba, chigende nhambo. " 54 | assert annotation_parent.id == "mjs3001revised" 55 | 56 | tier_annotations = self.parser.get_annotations_for_tier( 57 | tier, annotation_parent) 58 | 59 | assert len(tier_annotations) == 12 60 | 61 | tier = poioapi.io.graf.Tier("p") 62 | annotation_parent = tier_annotations[0] 63 | assert annotation_parent.value == "Baho" 64 | 65 | tier_annotations = self.parser.get_annotations_for_tier( 66 | tier, annotation_parent) 67 | 68 | assert len(tier_annotations) == 1 69 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_mandinka.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | 14 | import poioapi.io.mandinka 15 | import poioapi.data 16 | import poioapi.mapper 17 | 18 | tier_map = { 19 | poioapi.data.TIER_UTTERANCE: "utterance_gen", 20 | poioapi.data.TIER_WORD: "t", 21 | poioapi.data.TIER_MORPHEME: "m", 22 | poioapi.data.TIER_POS: "p", 23 | poioapi.data.TIER_GLOSS: "g", 24 | poioapi.data.TIER_TRANSLATION: "f", 25 | poioapi.data.TIER_COMMENT: "nt" 26 | } 27 | 28 | class TestParser: 29 | """ 30 | This class contain the test methods to the 31 | class io/mandinka.py. 32 | 33 | """ 34 | 35 | def setup(self): 36 | self.filename = os.path.join(os.path.dirname(__file__), "..", 37 | "sample_files", "mandinka", "mandinka.txt") 38 | 39 | self.parser = poioapi.io.mandinka.Parser(self.filename, None) 40 | 41 | def test_get_root_tiers(self): 42 | root_tiers = self.parser.get_root_tiers() 43 | assert len(root_tiers) == 1 44 | 45 | def test_get_child_tiers_for_tier(self): 46 | root_tiers = self.parser.get_root_tiers() 47 | 48 | child_tiers = self.parser.get_child_tiers_for_tier(root_tiers[0]) 49 | assert len(child_tiers) == 2 50 | 51 | child_tiers = self.parser.get_child_tiers_for_tier( 52 | poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_WORD])) 53 | assert len(child_tiers) == 1 54 | 55 | def test_get_annotations_for_tier(self): 56 | root_tiers = self.parser.get_root_tiers() 57 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 58 | assert len(root_annotations) == 7 59 | 60 | tier = poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_WORD]) 61 | annotation_parent = root_annotations[0] 62 | result = 'Musukéebâa níŋ a lá maañóo le táatá lóoñínóo la.' 63 | #result = codecs.encode(result) 64 | 65 | assert annotation_parent.value == result 66 | assert annotation_parent.id == "a0" 67 | 68 | tier_annotations = self.parser.get_annotations_for_tier( 69 | tier, annotation_parent) 70 | 71 | assert len(tier_annotations) == 9 -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_toolbox.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | 14 | import poioapi.io.toolbox 15 | import poioapi.io.graf 16 | 17 | class TestParser: 18 | """ 19 | This class contain the test methods to the 20 | class io/toolbox.py. 21 | 22 | """ 23 | 24 | def setup(self): 25 | self.filename = os.path.join(os.path.dirname(__file__), "..", 26 | "sample_files", "toolbox_graf", "toolbox.txt") 27 | self.parser = poioapi.io.toolbox.Parser(self.filename, "ref") 28 | 29 | def test_tier_hierachy(self): 30 | assert self.parser.tier_hierarchy.data_hierarchy == \ 31 | ['ref', [ 'utterance_gen', ['tx', ['mb', ['ge', 'ps']]], 32 | ['rf', 'rt', 'graid', 'ft', 'nt']], 33 | ['id', 'dt']] 34 | 35 | def test_get_root_tiers(self): 36 | root_tiers = self.parser.get_root_tiers() 37 | assert len(root_tiers) == 1 38 | 39 | def test_get_child_tiers_for_tier(self): 40 | root_tiers = self.parser.get_root_tiers() 41 | 42 | child_tiers = self.parser.get_child_tiers_for_tier(root_tiers[0]) 43 | assert len(child_tiers) == 3 44 | 45 | child_tiers = self.parser.get_child_tiers_for_tier( 46 | poioapi.io.graf.Tier('tx')) 47 | assert len(child_tiers) == 1 48 | 49 | 50 | def test_get_annotations_for_tier(self): 51 | root_tiers = self.parser.get_root_tiers() 52 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 53 | 54 | assert len(root_annotations) == 295 55 | 56 | tier = poioapi.io.graf.Tier("utterance_gen") 57 | annotation_parent = root_annotations[0] 58 | assert annotation_parent.value == "Pear_Madi.001" 59 | assert annotation_parent.id == "a0" 60 | 61 | tier_annotations = self.parser.get_annotations_for_tier( 62 | tier, annotation_parent) 63 | 64 | assert len(tier_annotations) == 1 65 | 66 | tier = poioapi.io.graf.Tier("tx") 67 | annotation_parent = tier_annotations[0] 68 | assert annotation_parent.value == "diž yikes . diž bikes čeq čeqi rekʼe" 69 | 70 | tier_annotations = self.parser.get_annotations_for_tier( 71 | tier, annotation_parent) 72 | 73 | assert len(tier_annotations) == 8 74 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/corpus-POStags.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | NE 19 | 20 | 21 | 22 | 23 | 24 | 25 | VVFIN 26 | 27 | 28 | 29 | 30 | 31 | 32 | ART 33 | 34 | 35 | 36 | 37 | 38 | 39 | NE 40 | 41 | 42 | 43 | 44 | 45 | 46 | $. 47 | 48 | 49 | 50 | 51 | 52 | 53 | PPER 54 | 55 | 56 | 57 | 58 | 59 | 60 | VVFIN 61 | 62 | 63 | 64 | 65 | 66 | 67 | PPER 68 | 69 | 70 | 71 | 72 | 73 | 74 | $. 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/corpus-lemmas.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Peter 19 | 20 | 21 | 22 | 23 | 24 | 25 | essen 26 | 27 | 28 | 29 | 30 | 31 | 32 | ein 33 | 34 | 35 | 36 | 37 | 38 | 39 | Käsepizza 40 | 41 | 42 | 43 | 44 | 45 | 46 | . 47 | 48 | 49 | 50 | 51 | 52 | 53 | sie 54 | 55 | 56 | 57 | 58 | 59 | 60 | schmecken 61 | 62 | 63 | 64 | 65 | 66 | 67 | er 68 | 69 | 70 | 71 | 72 | 73 | 74 | . 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/tcf_graf/corpus-tokens.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Peter 19 | 20 | 21 | 22 | 23 | 24 | 25 | ass 26 | 27 | 28 | 29 | 30 | 31 | 32 | eine 33 | 34 | 35 | 36 | 37 | 38 | 39 | Käsepizza 40 | 41 | 42 | 43 | 44 | 45 | 46 | . 47 | 48 | 49 | 50 | 51 | 52 | 53 | Sie 54 | 55 | 56 | 57 | 58 | 59 | 60 | schmeckte 61 | 62 | 63 | 64 | 65 | 66 | 67 | ihm 68 | 69 | 70 | 71 | 72 | 73 | 74 | . 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/typecraft_graf/typecraft_example-phrase.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | VALID 16 | Akosua hwiee nsuo no faa tokuro no mu 17 | 18 | 19 | 20 | 21 | 22 | VALID 23 | Mɪ twii kaa no firii fie fa sukuu no mu kɔ asɔrɪ 24 | 25 | 26 | 27 | 28 | 29 | VALID 30 | ɔ fa sukuu no mu kɔ dwaso 31 | 32 | 33 | 34 | 35 | 36 | VALID 37 | Abain ɛyɛ nyuɔma a ɛbe bua nsrahwɛ ewɔ ɔmain mu. 38 | 39 | 40 | 41 | 42 | 43 | VALID 44 | Me fa afuo no mu kɔ sukuu 45 | 46 | 47 | 48 | 49 | 50 | VALID 51 | Na sukku mu na mepɛ sɛ me de kɔ 52 | 53 | 54 | 55 | 56 | 57 | VALID 58 | bere ahorow dumbienu a εka bɔ mu yε afe baako; 59 | 60 | 61 | 62 | 63 | 64 | VALID 65 | bere ahorow dumbienu a εka bɔ mu yε afe baako; 66 | 67 | 68 | 69 | 70 | 71 | VALID 72 | Kwantwakɔsansuo a yɛ hu wo ekorasi mu 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/latex/toolbox_latex_expected.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,11pt]{article} 2 | \usepackage{ucs} 3 | \usepackage[utf8x]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \usepackage{gb4n} 6 | 7 | \usepackage{tipa} 8 | 9 | \begin{document} 10 | \ea 11 | \ref Pear\_Madi.001\\ 12 | \glll 13 | diž yikes . diž bikes čeq čeqi rek'e\\ 14 | diž y--ike--s . diž b--ike--s čeq čeq--i rek'e\\ 15 | I.DAT.pn II-.aff--see.v--PST.-aff 1sec.break I.DAT.pn III-.aff--see.v--PST.-aff forest.n forest.n--ESS.II.-aff person.n\\ 16 | \glt{} I saw . I saw a forest, a person in the forest\\ 17 | \z 18 | \ea 19 | \ref Pear\_Madi.002\\ 20 | \glll 21 | rek'e bak'arzi bak'arziruho zoq'es .... bak'arziruho zoq'es\\ 22 | rek'e bak'arzi bak'arziru--ho zoq'e--s .---.---.---. bak'arziru--ho zoq'e--s\\ 23 | person.n collect.v collect.III/IV/V.PL.v--PRS.-aff be.v--PST.-aff 1sec.break--.--1sec.break--.--1sec.break--.--1sec.break collect.III/IV/V.PL.v--PRS.-aff be.v--PST.-aff\\ 24 | \glt{} The person collected (them).\\ 25 | \z 26 | \ea 27 | \ref Pear\_Madi.003\\ 28 | \glll 29 | rikes gruša i hałoy haw sobiratno bun tohobito aq'es k'onk'a\textturny 'o rek'e uži\\ 30 | r--ike--s gruša *i hało--y haw *sobirat--no b--u--n tohobito aq'e--s k'onk'a--\textturny 'o rek'e uži\\ 31 | V?-.aff--see.v--PST.-aff pear(r).n ***.***.pn he.OBL--ERG.-aff that.pn ***.***--and.-aff III-.aff--do,.make.v--PFT.-aff on.the.other.side.adv come.v--PST.-aff bike.n--ESS.III.-aff person.n boy.n\\ 32 | \glt{} (I? / He?) saw the pear and he collected it, from the other side came a person, a boy on the bicycle.\\ 33 | \z 34 | \ea 35 | \ref Pear\_Madi.004\\ 36 | \glll 37 | hałoy haw bik'ek'iš .. karzina\\ 38 | hało--y haw b--ik'ek'--iš .---. karžina\\ 39 | this.OBL.pn--ERG.-aff this.pn III-.aff--steal.v--PST.-aff 1sec.break--.--1sec.break basket(r).n\\ 40 | \glt{} He stole this .. basket.\\ 41 | \z 42 | \ea 43 | \ref Pear\_Madi.005 \#\#\#\#\#\#\#\# 51.5\\ 44 | \glll 45 | bik'ek'no oxes oxeya gamač' keziyiqno\\ 46 | b--ik'ek'--no ox--es gamač' keziyiq--no\\ 47 | III-.aff--steal.v--PFT?.-aff leave.v--PST.-aff stone.n meet.II.v--PFT?.-aff\\ 48 | \z 49 | \ea 50 | \ref Pear\_Madi.006 \#\#\#\#\#\#\#\#\#\#\#\\ 51 | \glll 52 | gamač'lis k'onk'a\textturny 'o ? bekin hago rede ne\textturny in\\ 53 | gamač'---*l--i--s k'onk'a--\textturny 'o *? b--*ek--in hago rede ne\textturny --in\\ 54 | stone.n--.--***.***--ESS.II.-aff--GEN1.-aff bike.n--ESS.III.-aff ***.*** I/II.PL-.aff--***.***--PFT.-aff this.pn wood.n give.v--PFT.-aff\\ 55 | \z 56 | \ea 57 | \ref Pear\_Madi.007\\ 58 | \glll 59 | hezodoy haw grušan sadaq bosiš\\ 60 | hezodoy haw gruša--n sadaq b--os--iš\\ 61 | then.adv this.pn pear(r).n--and.-clit all.adv III-.aff--fall.v--PST.-aff\\ 62 | \z 63 | \end{document} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2001-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os 11 | 12 | # Use the VERSION file to get version 13 | version_file = os.path.join(os.path.dirname(__file__), 'src', 'poioapi', 'VERSION') 14 | with open(version_file) as fh: 15 | poioapi_version = fh.read().strip() 16 | 17 | #import distribute_setup 18 | #distribute_setup.use_setuptools() 19 | 20 | from setuptools import setup, find_packages 21 | 22 | setup( 23 | name = "poio-api", 24 | description = "A Python Library to access and manipulate linguistically annotated corpus files.", 25 | version = poioapi_version, 26 | url = "http://media.cidles.eu/poio/poio-api/", 27 | # download_url = "https://s3.amazonaws.com/cidles/downloads/poio-api/poio-api-{0}.tar.gz".format(poioapi_version), 28 | #long_description = "Python implementation of the Graph Annotation Framework. (http://www.americannationalcorpus.org/graf-wiki)", 29 | license = "Apache License, Version 2.0", 30 | keywords = ['NLP', 'CL', 'natural language processing', 31 | 'computational linguistics', 'parsing', 'tagging', 32 | 'annotation', 'linguistics', 'language', 33 | 'natural language', "language documentation"], 34 | maintainer = "Peter Bouda", 35 | maintainer_email = "pbouda@cidles.eu", 36 | author = "Peter Bouda", 37 | author_email = "pbouda@cidles.eu", 38 | classifiers = [ 39 | 'Development Status :: 4 - Beta', 40 | 'Intended Audience :: Developers', 41 | 'Intended Audience :: Education', 42 | 'Intended Audience :: Information Technology', 43 | 'Intended Audience :: Science/Research', 44 | 'License :: OSI Approved :: Apache Software License', 45 | 'Operating System :: OS Independent', 46 | 'Programming Language :: Python :: 2.6', 47 | 'Programming Language :: Python :: 2.7', 48 | 'Programming Language :: Python :: 3.2', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Topic :: Scientific/Engineering', 51 | 'Topic :: Scientific/Engineering :: Human Machine Interfaces', 52 | 'Topic :: Scientific/Engineering :: Information Analysis', 53 | 'Topic :: Text Processing', 54 | 'Topic :: Text Processing :: General', 55 | 'Topic :: Text Processing :: Indexing', 56 | 'Topic :: Text Processing :: Linguistic', 57 | ], 58 | packages = [ 'poioapi', 'poioapi.io' ], 59 | package_dir = { '': 'src' }, 60 | package_data = { 'poioapi': ['VERSION', 'mappings/*.json'] }, 61 | #install_requires=['PyYAML>=3.09'], 62 | #test_suite = 'graf.test.simple', 63 | ) 64 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_toolboxxml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os 11 | 12 | import poioapi.io.toolboxxml 13 | import poioapi.io.graf 14 | 15 | 16 | class TestParser: 17 | """ 18 | This class contain the test methods to the 19 | class io.toolboxxml.py. 20 | 21 | """ 22 | 23 | def setup(self): 24 | self.filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 25 | "toolbox_graf", "toolbox.xml") 26 | 27 | self.parser = poioapi.io.toolboxxml.Parser(self.filename) 28 | 29 | def test_get_root_tiers(self): 30 | root_tiers = self.parser.get_root_tiers() 31 | 32 | assert len(root_tiers) == 1 33 | 34 | def test_get_child_tiers_for_tier(self): 35 | root_tiers = self.parser.get_root_tiers() 36 | child_tier = self.parser.get_child_tiers_for_tier(root_tiers[0]) 37 | 38 | assert len(child_tier) == 1 39 | assert child_tier[0].name == "idGroup" 40 | 41 | tier = poioapi.io.graf.Tier("tx") 42 | child_tiers = self.parser.get_child_tiers_for_tier(tier) 43 | 44 | assert len(child_tiers) == 2 45 | assert child_tiers[0].name == "mr" 46 | assert child_tiers[1].name == "mg" 47 | 48 | def test_get_annotations_for_tier(self): 49 | root_tiers = self.parser.get_root_tiers() 50 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 51 | 52 | assert len(root_annotations) == 2 53 | 54 | tier = poioapi.io.graf.Tier("idGroup") 55 | annoation_parent = root_annotations[0] 56 | 57 | tier_annotations = self.parser.get_annotations_for_tier(tier, annoation_parent) 58 | 59 | assert len(tier_annotations) == 29 60 | 61 | def test_tier_has_regions(self): 62 | tier = poioapi.io.graf.Tier("tx") 63 | has_regions = self.parser.tier_has_regions(tier) 64 | 65 | assert not has_regions 66 | 67 | tier = poioapi.io.graf.Tier("idGroup") 68 | has_regions = self.parser.tier_has_regions(tier) 69 | 70 | assert has_regions 71 | 72 | def test_region_for_annotation(self): 73 | root_tiers = self.parser.get_root_tiers() 74 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 75 | 76 | tier = poioapi.io.graf.Tier("idGroup") 77 | annoation_parent = root_annotations[0] 78 | annotations = self.parser.get_annotations_for_tier(tier, annoation_parent) 79 | 80 | regions = self.parser.region_for_annotation(annotations[0]) 81 | expected_regions = ('905.88', '917.4') 82 | 83 | assert regions == expected_regions 84 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_memory.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import poioapi.io.memory 11 | import poioapi.io.graf 12 | 13 | class SimpleParser(poioapi.io.graf.BaseParser): 14 | tiers = ["utterance", "word", "wfw", "graid"] 15 | 16 | utterance_tier = ["this is a test", "this is another test"] 17 | word_tier = [['this', 'is', 'a', 'test'], ['this', 'is', 'another', 'test']] 18 | wfw_tier = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'] 19 | graid_tier = ['i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'] 20 | 21 | def __init__(self): 22 | pass 23 | 24 | def get_root_tiers(self): 25 | return [poioapi.io.graf.Tier("utterance")] 26 | 27 | def get_child_tiers_for_tier(self, tier): 28 | if tier.name == "utterance": 29 | return [poioapi.io.graf.Tier("word")] 30 | if tier.name == "word": 31 | return [poioapi.io.graf.Tier("graid"), poioapi.io.graf.Tier("wfw")] 32 | 33 | return None 34 | 35 | def get_annotations_for_tier(self, tier, annotation_parent=None): 36 | if tier.name == "utterance": 37 | return [poioapi.io.graf.Annotation(i, v) for i, v in enumerate(self.utterance_tier)] 38 | 39 | if tier.name == "word": 40 | return [poioapi.io.graf.Annotation(2 + 4 * annotation_parent.id + i, v) for i, v 41 | in enumerate(self.word_tier[annotation_parent.id])] 42 | 43 | if tier.name == "graid": 44 | return [poioapi.io.graf.Annotation(annotation_parent.id + 10, self.graid_tier[annotation_parent.id - 2])] 45 | 46 | if tier.name == "wfw": 47 | return [poioapi.io.graf.Annotation(annotation_parent.id + 12, self.wfw_tier[annotation_parent.id - 2])] 48 | 49 | return [] 50 | 51 | def tier_has_regions(self, tier): 52 | if tier.name == "utterance": 53 | return True 54 | return False 55 | 56 | def region_for_annotation(self, annotation): 57 | if annotation.id == 0: 58 | return (0, 100) 59 | elif annotation.id == 1: 60 | return (101, 200) 61 | 62 | def get_primary_data(self): 63 | pass 64 | 65 | class TestGrAFConverter: 66 | def setup(self): 67 | self.parser = SimpleParser() 68 | self.converter = poioapi.io.memory.MemoryConverter(self.parser) 69 | self.converter.parse() 70 | 71 | def test_tier_hierarchies(self): 72 | assert( 73 | self.converter.tier_hierarchies == \ 74 | [['utterance', ['word', ['graid'], ['wfw']]]]) 75 | 76 | def test_region_for_annotations(self): 77 | assert(self.converter.region_for_annotation == \ 78 | {0: (0, 100), 1: (101, 200)}) 79 | -------------------------------------------------------------------------------- /example_data/balochi_graf/balochi.txt: -------------------------------------------------------------------------------- 1 | guš-īt: 2 | ki yag bādišā=(y)ē=at 3 | ē bādišā bi=m-ē wat-ī šār-ay wasat-(t)ā yakk tīr=i barγ-ē dāšt 4 | ki harčī am-ē tilīpun-ān-ī sīm=at-ant 5 | bi am-ēš-ī wasl=at-ant 6 | harka ki arz-ē b-dāšt-ēn 7 | am-ēširā ki takān dāt-ēn 8 | bādišā ōdā sī=(y)a būt 9 | yag rōč-ē dīst 10 | ta am-ē aždiyā-(y)ē āt-a=u 11 | am-ēš-ā takān=a dant 12 | bādšā dēm dāt yakk-ē-rā 13 | ki ē čī-(y)ē 14 | āt-ant 15 | gušt 16 | bādšā sāib aždiyā-(y)ē 17 | gu 18 | hā… ē aždiyā amr-ē dār-īt 19 | arz-ē dār-īt 20 | ēš-ī arz-ā kay pa=(m)man b-(y)ār-īt 21 | man ki na-zān-īn 22 | ē bē-zuwān-ē 23 | yakk naǰǰār-ē gu 24 | man=a zān-īn 25 | gušt=ī 26 | ki zān-ay 27 | ēš-ī arz-ā āwurt-ay 28 | man trā inka xalāit=(t)a day-īn 29 | gu 30 | xayli xub 31 | ē naǰǰār ki āt gō tēγ=u arrag-ān 32 | ē aždiyā wayl kurt=u 33 | rāda būt=u 34 | aždīyā naǰǰār-ā išāra dāt=u išāra=u 35 | bi kō-ay tā šut 36 | ōdā ki šut 37 | ta uhō… ē aždiyā diga mās-ē dār-īt 38 | ē mās=ay ša=m-ē kōh-ay pāčin na-(w)ant mazan šāx 39 | ša=m-ēš-ān šikār kurt-a=u 40 | am-ē šāx=ay ēš-ī guṭṭ-ā gīr kurt-ag=ant=u 41 | napas tank=int 42 | āt-a idā 43 | bi bādšā arz kurt-a 44 | išāra kurt=u 45 | ē šāx-ān-ā pāčin-ē-ān-ā arra kurt=u 46 | ā alās ūt 47 | āzāt būt 48 | ē aždiyā diga ǰā-(y)ē šut=u 49 | ēš-ā mazan-ēn xalāit-(t)ē dāt ǰawāir 50 | guṛā ē ṭukkur-ē ki šut 51 | padā ēš-ā išāra-(y)ē ku 52 | ōštāt 53 | am-inkas-ēn tōm-ē zurt=u 54 | ša wat-ī dap-ay tā prēnt bi am-ē ǰawāir-ān-ī tā ēš-ī tūrag-ay tā 55 | ki āt 56 | bādšā gu 57 | ē čē arz-ē dāšt 58 | gušt=ī 59 | b-(y)ā ki ē rang ēš-ī mās ē rang guṭṭō=at=ō 60 | āt=u 61 | man āzāt kurt-un=u 62 | guṛā mnā inka ǰawāir dāt=u 63 | am-ē tōm-ā=um am-ēš-ā=um mnā ē dā 64 | ša wat-ī dap-ay tā=ē prēnt 65 | gušt=ī 66 | ī dgar=u ǰawāir=ant 67 | am-ēš-ā mašmā na-zān-an 68 | ē maššōra kurt-ant 69 | gušt-ant 70 | b-(y)ā ēš-ā p-kiš-an mašmā 71 | ēš-ā ǰwān-ēn ǰā-(y)ē bāg-ē ḍigār-ay tā kišt-ant=u ǰwān-ēn narm-ēn ḍigār-ē=u 72 | wall-ē sabz kurt=u 73 | ēš činkas galaw ku 74 | ša truss-ā kass-ē ē galaw-ān-ā na-wārt 75 | ki ša aždiyā-(y)ay dap-ay tā=ant 76 | mār-ay zār=ant 77 | ē činka tōm kurt-ant=u 78 | šut-ant 79 | ṭū-(y)ēn pālēz-ē bādšā kišt=u 80 | yakk pīramard-ē-rā sawzwān ku 81 | ē laggit ē pālēz bi galaw-ā=ō 82 | bēxī galaw ku 83 | pīramard=am awal čē=(w)a na-ku 84 | tawkal=a na-kurt=u 85 | ar-uk-(k)ē dāšt 86 | am-ē galaw-ān-ā dēmā bi=m-ē ar-uk-(k)ay dēmā kōṭit=u 87 | dāt 88 | ē ar=am pazzōr ūt 89 | pīramard šurū ku wārtin-ā 90 | ta ē či čīz=ant 91 | pīramard=am pazzōr ūt 92 | bādšā yag rōč-ē ki āt 93 | gušt=ī 94 | ki pīramard bābā ē čē=(w)ant 95 | gušt=ī 96 | ē bēxī ǰwān-ēn čīz=ant 97 | man=um wārt-a, 98 | ē ar-ā=um dāt-a=un=ō 99 | šumā=um bōr-it. 100 | wazīr gu 101 | na dēmā man=a war-īn 102 | ta ay bādšā ma-war 103 | wazīr wārt 104 | ta ē aǰab-ēn xušmizzag-ēn čīz=ant 105 | wārt-ant=ō 106 | ki wārt-ant 107 | nūn gušt-ant 108 | b-(y)ā mašmā p=ēšī nām=ē b-ill-an 109 | nām=ay guṛā galaw-ā išt-ant xarmizza 110 | ki mizzag-ay awal xar burt 111 | xarmizza š=ōdā mant 112 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/balochi_graf/balochi.txt: -------------------------------------------------------------------------------- 1 | guš-īt: 2 | ki yag bādišā=(y)ē=at 3 | ē bādišā bi=m-ē wat-ī šār-ay wasat-(t)ā yakk tīr=i barγ-ē dāšt 4 | ki harčī am-ē tilīpun-ān-ī sīm=at-ant 5 | bi am-ēš-ī wasl=at-ant 6 | harka ki arz-ē b-dāšt-ēn 7 | am-ēširā ki takān dāt-ēn 8 | bādišā ōdā sī=(y)a būt 9 | yag rōč-ē dīst 10 | ta am-ē aždiyā-(y)ē āt-a=u 11 | am-ēš-ā takān=a dant 12 | bādšā dēm dāt yakk-ē-rā 13 | ki ē čī-(y)ē 14 | āt-ant 15 | gušt 16 | bādšā sāib aždiyā-(y)ē 17 | gu 18 | hā… ē aždiyā amr-ē dār-īt 19 | arz-ē dār-īt 20 | ēš-ī arz-ā kay pa=(m)man b-(y)ār-īt 21 | man ki na-zān-īn 22 | ē bē-zuwān-ē 23 | yakk naǰǰār-ē gu 24 | man=a zān-īn 25 | gušt=ī 26 | ki zān-ay 27 | ēš-ī arz-ā āwurt-ay 28 | man trā inka xalāit=(t)a day-īn 29 | gu 30 | xayli xub 31 | ē naǰǰār ki āt gō tēγ=u arrag-ān 32 | ē aždiyā wayl kurt=u 33 | rāda būt=u 34 | aždīyā naǰǰār-ā išāra dāt=u išāra=u 35 | bi kō-ay tā šut 36 | ōdā ki šut 37 | ta uhō… ē aždiyā diga mās-ē dār-īt 38 | ē mās=ay ša=m-ē kōh-ay pāčin na-(w)ant mazan šāx 39 | ša=m-ēš-ān šikār kurt-a=u 40 | am-ē šāx=ay ēš-ī guṭṭ-ā gīr kurt-ag=ant=u 41 | napas tank=int 42 | āt-a idā 43 | bi bādšā arz kurt-a 44 | išāra kurt=u 45 | ē šāx-ān-ā pāčin-ē-ān-ā arra kurt=u 46 | ā alās ūt 47 | āzāt būt 48 | ē aždiyā diga ǰā-(y)ē šut=u 49 | ēš-ā mazan-ēn xalāit-(t)ē dāt ǰawāir 50 | guṛā ē ṭukkur-ē ki šut 51 | padā ēš-ā išāra-(y)ē ku 52 | ōštāt 53 | am-inkas-ēn tōm-ē zurt=u 54 | ša wat-ī dap-ay tā prēnt bi am-ē ǰawāir-ān-ī tā ēš-ī tūrag-ay tā 55 | ki āt 56 | bādšā gu 57 | ē čē arz-ē dāšt 58 | gušt=ī 59 | b-(y)ā ki ē rang ēš-ī mās ē rang guṭṭō=at=ō 60 | āt=u 61 | man āzāt kurt-un=u 62 | guṛā mnā inka ǰawāir dāt=u 63 | am-ē tōm-ā=um am-ēš-ā=um mnā ē dā 64 | ša wat-ī dap-ay tā=ē prēnt 65 | gušt=ī 66 | ī dgar=u ǰawāir=ant 67 | am-ēš-ā mašmā na-zān-an 68 | ē maššōra kurt-ant 69 | gušt-ant 70 | b-(y)ā ēš-ā p-kiš-an mašmā 71 | ēš-ā ǰwān-ēn ǰā-(y)ē bāg-ē ḍigār-ay tā kišt-ant=u ǰwān-ēn narm-ēn ḍigār-ē=u 72 | wall-ē sabz kurt=u 73 | ēš činkas galaw ku 74 | ša truss-ā kass-ē ē galaw-ān-ā na-wārt 75 | ki ša aždiyā-(y)ay dap-ay tā=ant 76 | mār-ay zār=ant 77 | ē činka tōm kurt-ant=u 78 | šut-ant 79 | ṭū-(y)ēn pālēz-ē bādšā kišt=u 80 | yakk pīramard-ē-rā sawzwān ku 81 | ē laggit ē pālēz bi galaw-ā=ō 82 | bēxī galaw ku 83 | pīramard=am awal čē=(w)a na-ku 84 | tawkal=a na-kurt=u 85 | ar-uk-(k)ē dāšt 86 | am-ē galaw-ān-ā dēmā bi=m-ē ar-uk-(k)ay dēmā kōṭit=u 87 | dāt 88 | ē ar=am pazzōr ūt 89 | pīramard šurū ku wārtin-ā 90 | ta ē či čīz=ant 91 | pīramard=am pazzōr ūt 92 | bādšā yag rōč-ē ki āt 93 | gušt=ī 94 | ki pīramard bābā ē čē=(w)ant 95 | gušt=ī 96 | ē bēxī ǰwān-ēn čīz=ant 97 | man=um wārt-a, 98 | ē ar-ā=um dāt-a=un=ō 99 | šumā=um bōr-it. 100 | wazīr gu 101 | na dēmā man=a war-īn 102 | ta ay bādšā ma-war 103 | wazīr wārt 104 | ta ē aǰab-ēn xušmizzag-ēn čīz=ant 105 | wārt-ant=ō 106 | ki wārt-ant 107 | nūn gušt-ant 108 | b-(y)ā mašmā p=ēšī nām=ē b-ill-an 109 | nām=ay guṛā galaw-ā išt-ant xarmizza 110 | ki mizzag-ay awal xar burt 111 | xarmizza š=ōdā mant 112 | -------------------------------------------------------------------------------- /doc/brat.rst: -------------------------------------------------------------------------------- 1 | GrAF to brat conversion 2 | ======================= 3 | 4 | brat is a web-based tool for text annotation (http://brat.nlplab.org/). 5 | It works quite simple, through a annotation file with a **same name** of a text file, using the tokens in it, parses the 6 | text file in order to find the annotations using the token ranges. 7 | The annotations configuration are specified in a file name "annotation.conf", this file is also required otherwise the 8 | brat will through warnings and errors about the annotations. 9 | 10 | Our convert will be based in the data from QuantHistLing project (http://www.quanthistling.info/data). The annotation 11 | file should be like this: 12 | 13 | .. code-block:: xml 14 | 15 | [entities] 16 | formatting 17 | italic 18 | tab 19 | newline 20 | bold 21 | underline 22 | superscript 23 | smallcaps 24 | hyphen 25 | pagebreak 26 | dictinterpretation 27 | head 28 | pos 29 | translation 30 | crossreference 31 | counterpart 32 | footnote 33 | stratum 34 | phonology 35 | boundary 36 | dialectidentification 37 | headorth 38 | typo 39 | iso-639-3 40 | spa 41 | des 42 | doculect 43 | Desano 44 | Espan_ol 45 | 46 | [relations] 47 | # To Arg1:, Arg2: 48 | Arg1:, Arg2:, : 49 | 50 | [events] 51 | # none 52 | 53 | [attributes] 54 | # none 55 | 56 | For this demonstration we will use the GrAF files from the Aleman2000 dictionary. 57 | 58 | To convert a GrAF file to brat first is need to have a GrAF object: 59 | 60 | .. code-block:: python 61 | 62 | parser = graf.io.GraphParser() 63 | graf_graph = parser.parse("dict-aleman2000-9-69.hdr") 64 | 65 | Once we get the graph object is need to set the brat writer. 66 | The brat writer is defined with two paremeters: annotation_space and feature_name. 67 | 68 | * The annotation_space serves to filter what annotations are wanted from the graph object to write in brat annotation file. 69 | * The feature_name é a feature key that contains the real value of each annotation. 70 | 71 | .. code-block:: python 72 | 73 | brat = poioapi.io.brat.Writer("dictinterpretation", feature_name="substring") 74 | 75 | In our case we want go get only the annotations from "dictinterpretation" and that contain the feature "substring": 76 | 77 | .. code-block:: python 78 | 79 | brat.write(outputfilename="dict-aleman2000-9-69.ann", graf_graph) 80 | 81 | The result should be a file named "dict-aleman2000-9-69.ann". 82 | 83 | .. code-block:: python 84 | 85 | T1 head 0 6 áriri 86 | #1 AnnotatorNotes T1 NodeID = aleman2000/9/7/annotation/2 87 | T2 Desano 0 6 áriri 88 | #2 AnnotatorNotes T2 NodeID = aleman2000/9/7/annotation/2 89 | T3 des 0 6 áriri 90 | [...] 91 | 92 | **Note:** In order to brat works properly the result file (filename.ann) should have the same name as the text file. -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/mandinka/mandinka.txt: -------------------------------------------------------------------------------- 1 | Conte 1 2 | 3 | Musu-kéebâa níŋ a lá maañóo le táa-tá lóo-ñín-óo la. 4 | femme-âgé.D avec 3SG GEN jeune_épouse.D FOC aller-ACPP bois-chercher-D OBL 5 | Une vieille femme et sa jeune co-épouse étaient allées chercher du bois. 6 | Kabíríŋ i yé i la lôo lá-ndi, 7 | quand 3PL ACPP 3PL GEN bois.D être_posé-CAUS 8 | Quand elles ont posé leur bois [avant de le charger], 9 | saa mínín-tá maañóo la lóo-sít-óo la, 10 | serpent.D s’enrouler-ACPP jeune_épouse.D GEN bois-attacher-D OBL 11 | un serpent s’est enroulé autour du fagot de la jeune épouse, 12 | barí wo máŋ a lóŋ. 13 | mais DEM ACPN 3SG savoir 14 | mais celle-ci ne s’en est pas aperçue. 15 | Moo wó moo ye lóo-sít-ôo cíká 16 | personne INDEF personne ACPP bois-attacher-D soulever 17 | Chacune des deux a soulevé son fagot de bois 18 | a yé a láa a kuŋ-ó to, i be súw-o wálín-na. 19 | 3SG ACPP 3SG poser 3SG tête-D LOC 3PL COPLOC maison-D se_diriger_vers-INF 20 | et l’a mis sur sa tête, et les voilà qui se dirigent vers la maison. 21 | Maañôo kó musu-kéebáa ye kó, « Níŋ ŋ futa-ta, 22 | jeune_épouse.D QUOT femme-âgé.D BEN QUOT si 1PL arriver-ACPP 23 | La jeune co-épouse a dit à la vieille, « En arrivant, 24 | ŋ ŋa ŋ́ kuu fólóo janníŋ ŋ be dómó-r-ôo ké-la. » 25 | 1PL SUBJP REFL laver d’abord avant_que 1PL COPLOC manger-ANTIP-D faire-INF 26 | il faudra d’abord nous laver avant de manger. » 27 | Musu-keebaa-mâa ñáa be sǎa kaŋ, 28 | femme-âgé-SELECT.D œil.D COPLOC serpent.D sur 29 | La vieille avait son regard fixé sur le serpent 30 | míŋ be mínín-diŋ a la lóo-sít-ôo bála, 31 | REL COPLOC s’enrouler-RES 3SG GEN bois-attacher-D CONT 32 | qui était enroulé au fagot de la jeune co-épouse, 33 | a yé a fó a ye kó, 34 | 3SG ACPP 3SG dire 3SG BEN QUOT 35 | elle lui a répondu, 36 | « Níŋ yunduyónd-óo son-ta, 37 | si youndouyondo-D être_d’accord-ACPP 38 | « Si le youndouyondo est d’accord, 39 | ŋ si ŋ́ kuu janníŋ ŋ be tábí-r-ôo ké-la. » 40 | 1PL POT REFL laver avant_que 1PL COPLOC cuire-ANTIP-D faire-INF 41 | nous pourrons nous laver avant de faire à manger. » 42 | I táa-tá wǒ le ñáama fő ... súw-o kóno, 43 | 3PL aller-ACPP DEM FOC à_la_façon jusqu’à maison-D dans 44 | Elles sont allées comme ça jusqu’à la maison, 45 | i bée ye i la lóo-sít-óo boyi-ndi. 46 | 3PL tous ACPP 3PL GEN bois-attacher-D tomber-CAUS 47 | et toutes les deux ont déposé leur fagot de bois. 48 | Saa míŋ be maañóo la lóo-sít-óo kaŋ, 49 | serpent.D REL COPLOC jeune_épouse.D GEN bois-attacher-D sur 50 | Alors le serpent qui était sur le fagot de la jeune co-épouse 51 | a murum-murun-tá naŋ, a yé musu-keebaa-máa kiŋ, ca̋pa̋t, 52 | 3SG tourner-tourner-ACPP CTRP 3SG ACPP femme-âgé-SELECT.D mordre ADVCL 53 | s’est retourné, il a piqué la vieille, 54 | wǒ faa-ta. 55 | DEM mourir-ACPP 56 | et celle-ci est morte. 57 | -------------------------------------------------------------------------------- /src/poioapi/corpus.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | """ 10 | The corpus module contains classes to handle collections of work items 11 | (currently: annotation trees; later: annotation graphs). It connects those 12 | work items to files on disk to keep track of the corpus the user works with. 13 | Each class provides a simple list of items to go through all work items for 14 | queries and updates. The queries and updates are handle by the classes of the 15 | work items. 16 | """ 17 | 18 | from __future__ import unicode_literals 19 | 20 | import poioapi.data 21 | import poioapi.annotationtree 22 | import poioapi.annotationgraph 23 | 24 | class CorpusTrees(): 25 | 26 | def __init__(self, data_structure_type): 27 | self.items = [] 28 | self.data_structure_type = data_structure_type 29 | 30 | def add_item(self, filepath, filetype): 31 | if filetype == poioapi.data.TREEPICKLE: 32 | annotation_tree = poioapi.annotationtree.AnnotationTree( 33 | poioapi.data.data_structure_handler_for_type( 34 | self.data_structure_type 35 | ) 36 | ) 37 | annotation_tree.load_tree_from_pickle(filepath) 38 | if annotation_tree.structure_type_handler != self.data_structure_type: 39 | raise( 40 | poioapi.data.DataStructureTypeNotCompatible( 41 | "Data structure type {0} not compatible with corpus" 42 | "data type {1}".format( 43 | annotation_tree.structure_type_handler, 44 | self.data_structure_type))) 45 | 46 | annotation_tree.init_filters() 47 | self.items.append( (filepath, annotation_tree) ) 48 | else: 49 | raise poioapi.data.UnknownFileFormatError() 50 | 51 | 52 | class CorpusGraphs(list): 53 | 54 | def add_item(self, filepath, filetype): 55 | annotation_graph = poioapi.annotationgraph.AnnotationGraph(None) 56 | if filetype == poioapi.data.EAF: 57 | annotation_graph.from_elan(filepath) 58 | if filetype == poioapi.data.EAFFROMTOOLBOX: 59 | annotation_graph.from_elan(filepath) 60 | elif filetype == poioapi.data.TYPECRAFT: 61 | annotation_graph.from_typecraft(filepath) 62 | else: 63 | raise poioapi.data.UnknownFileFormatError() 64 | 65 | annotation_graph.structure_type_handler = \ 66 | poioapi.data.DataStructureType( 67 | annotation_graph.tier_hierarchies[0] 68 | ) 69 | 70 | self.append( (filepath, annotation_graph) ) 71 | 72 | @property 73 | def tier_names(self): 74 | result = set() 75 | for _, ag in self: 76 | for tier_name in ag.structure_type_handler.flat_data_hierarchy: 77 | result.add(tier_name) 78 | return result 79 | -------------------------------------------------------------------------------- /src/poioapi/io/wikipedia_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import absolute_import, unicode_literals 11 | 12 | import os 13 | 14 | import xml.etree.ElementTree as ET 15 | 16 | import poioapi.io.graf 17 | 18 | 19 | class Parser(poioapi.io.graf.BaseParser): 20 | 21 | def __init__(self, filepath): 22 | self.filepath = filepath 23 | (self.basedirname, _) = os.path.splitext(os.path.abspath(self.filepath)) 24 | 25 | self.parse() 26 | 27 | def parse(self): 28 | self.root = ET.parse(self.filepath).getroot() 29 | self.documents_map = {} 30 | self.documents = [] 31 | 32 | def get_root_tiers(self): 33 | return [poioapi.io.graf.Tier('doc')] 34 | 35 | def get_child_tiers_for_tier(self, tier): 36 | pass 37 | 38 | def get_annotations_for_tier(self, tier, annotation_parent=None): 39 | annotations = [] 40 | last_position = 0 41 | 42 | if tier.name == "doc": 43 | for a, annotation in enumerate(self.root): 44 | text = annotation.text 45 | id = annotation.attrib["id"] 46 | 47 | features = {"title":annotation.attrib["title"], 48 | "url":annotation.attrib["url"]} 49 | 50 | annotations.append(poioapi.io.graf.Annotation(id, 51 | None, features)) 52 | 53 | if len(annotation) is not 0: 54 | text += annotation[0].tail 55 | 56 | self.documents_map[id] = (last_position, last_position + 57 | len(text) + 1) 58 | self.documents.append(text) 59 | 60 | last_position += len(text) + 1 61 | 62 | return annotations 63 | 64 | def region_for_annotation(self, annotation): 65 | return self.documents_map[annotation.id] 66 | 67 | def tier_has_regions(self, tier): 68 | if tier.name == 'doc': 69 | return True 70 | 71 | return False 72 | 73 | # def write_raw_file(self): 74 | # file = os.path.abspath(self.basedirname + '.txt') 75 | # 76 | # if sys.version_info > (2, 7): 77 | # f = codecs.open(file, 'w', 'utf-8') 78 | # else: 79 | # f = open(file, 'w') 80 | # 81 | # for text in self.documents: 82 | # f.write(text) 83 | # 84 | # f.close() 85 | 86 | def get_primary_data(self): 87 | """This method gets the information about 88 | the source data file. 89 | 90 | Returns 91 | ------- 92 | primary_data : object 93 | PrimaryData object. 94 | 95 | """ 96 | 97 | primary_data = poioapi.io.graf.PrimaryData() 98 | primary_data.type = poioapi.io.graf.TEXT 99 | primary_data.content = "\n".join(self.documents) 100 | 101 | return primary_data 102 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/shoebox_graf/shoebox.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | DEM 10 |

prn

11 |
12 |
13 | 14 | 15 | long 16 | time 17 | ago 18 |

adv

19 |
20 |
21 | 22 | 23 | 5- 24 |

ncp-

25 |
26 | 27 | hyena:5/6 28 |

n

29 |
30 |
31 | 32 | 33 | CONJ 34 |

conj

35 |
36 |
37 | 38 | 39 | hare:9/10 40 |

n

41 |
42 |
43 | 44 | 45 | PAST- 46 |

tm-

47 |
48 | 49 | 2- 50 |

ncp-

51 |
52 | 53 | be 54 |

v

55 |
56 | 57 | -FV 58 |

-fv

59 |
60 |
61 | 62 | 63 | friend:1a,9/2,10 64 |

n

65 |
66 |
67 | 68 | 69 | then 70 |

adv

71 |
72 |
73 | 74 | 75 | hare:9/10 76 |

n

77 |
78 |
79 | 80 | 81 | 1.PAST- 82 |

sm-

83 |
84 | 85 | 1- 86 |

ncp-

87 |
88 | 89 | speak 90 |

v

91 |
92 | 93 | -FV7- 94 |

-fvacp-

95 |
96 |
97 | 98 | 99 | 100 |

101 | 102 | 103 | go 104 |

v

105 |
106 | 107 | -FV 108 |

-fv

109 |
110 |
111 | 112 | 113 | 9/10- 114 |

ncp-

115 |
116 | 117 | journey:9/10 118 |

n

119 |
120 |
121 |
122 | They stayed there for many days 123 |
124 |
125 | 126 |
127 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/typecraft_graf/typecraft_example-translation.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Travelling in search of water has become dangerous in the villages. 19 | 20 | 21 | 22 | 23 | 24 | 25 | Akosua poured water through the hole 26 | 27 | 28 | 29 | 30 | 31 | 32 | I drove the car from the house through the school to church 33 | 34 | 35 | 36 | 37 | 38 | 39 | She passes through the school to the market. 40 | 41 | 42 | 43 | 44 | 45 | 46 | Government is doing things to enhance tourism in the country. 47 | 48 | 49 | 50 | 51 | 52 | 53 | twelve periods that add up to one year 54 | 55 | 56 | 57 | 58 | 59 | 60 | I pass through the farm to school 61 | 62 | 63 | 64 | 65 | 66 | 67 | It is school that I wanted to send it 68 | 69 | 70 | 71 | 72 | 73 | 74 | twelve periods which add up to a year 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/toolbox_graf/toolbox_latex.txt: -------------------------------------------------------------------------------- 1 | \_sh v3.0 1801 Text 2 | 3 | \id Pear.Madi 4 | \dt aufgenommen: 27/Aug/2006 5 | \dt glossiert: 29/Aug/2006 6 | 7 | \nt Пахрудинова Мадина Шейховна (1993, 13 Jahre) 8 | \sen ungeeignet 9 | 10 | \ref Pear_Madi.001 11 | \tx diž yikes . diž bikes čeq čeqi rekʼe 12 | \mb diž y- ike -s . diž b- ike -s čeq čeq -i rekʼe 13 | \ge I.DAT II- see -PST 1sec I.DAT III- see -PST forest forest -ESS.II person 14 | \ps pn aff- v -aff break pn aff- v -aff n n -aff n 15 | 16 | \ft I saw . I saw a forest, a person in the forest 17 | \rt 18 | 19 | \ref Pear_Madi.002 20 | \tx rekʼe bakʼarzi bakʼarziruho zoqʼes .... bakʼarziruho zoqʼes 21 | \mb rekʼe bakʼarzi bakʼarziru -ho zoqʼe -s . - . - . - . bakʼarziru -ho zoqʼe -s 22 | \ge person collect collect.III/IV/V.PL -PRS be -PST 1sec - 1sec - 1sec - 1sec collect.III/IV/V.PL -PRS be -PST 23 | \ps n v v -aff v -aff break - break - break - break v -aff v -aff 24 | 25 | \ft The person collected (them). 26 | \rt 27 | 28 | \nt 17 sec Pause, Madina überlegt, was Birne auf Ginukh heisst 29 | 30 | \ref Pear_Madi.003 31 | \tx rikes ɡruša i haɬoy haw sobiratno bun tohobito aqʼes kʼonkʼaʎʼo rekʼe uži 32 | \mb r- ike -s ɡruša *i haɬo -y haw *sobirat -no b- u -n tohobito aqʼe -s kʼonkʼa -ʎʼo rekʼe uži 33 | \ge V?- see -PST pear(r) *** he.OBL -ERG that *** -and III- do, make -PFT on.the.other.side come -PST bike -ESS.III person boy 34 | \ps aff- v -aff n *** pn -aff pn *** -aff aff- v -aff adv v -aff n -aff n n 35 | 36 | \ft (I? / He?) saw the pear and he collected it, from the other side came a person, a boy on the bicycle. 37 | \rt 38 | 39 | \ref Pear_Madi.004 40 | \tx haɬoy haw bikʼekʼiš .. karzina 41 | \mb haɬo -y haw b- ikʼekʼ -iš . - . karžina 42 | \ge this.OBL -ERG this III- steal -PST 1sec - 1sec basket(r) 43 | \ps pn -aff pn aff- v -aff break - break n 44 | 45 | \ft He stole this .. basket. 46 | \rt 47 | 48 | \ref Pear_Madi.005 ######## 51.5 49 | \tx bikʼekʼno oxes oxeya ɡamačʼ keziyiqno 50 | \mb b- ikʼekʼ -no ox -es ɡamačʼ keziyiq -no 51 | \ge III- steal -PFT? leave -PST stone meet.II -PFT? 52 | \ps aff- v -aff v -aff n v -aff 53 | 54 | \ft 55 | \ref Pear_Madi.006 ########### 56 | \tx ɡamačʼlis kʼonkʼaʎʼo ? bekin haɡo rede neʎin 57 | \mb ɡamačʼ - *l -i -s kʼonkʼa -ʎʼo *? b- *ek -in haɡo rede neʎ -in 58 | \ge stone - *** -ESS.II -GEN1 bike -ESS.III *** I/II.PL- *** -PFT this wood give -PFT 59 | \ps n - *** -aff -aff n -aff *** aff- *** -aff pn n v -aff 60 | 61 | \ft 62 | \ref Pear_Madi.007 63 | \tx hezodoy haw ɡrušan sadaq bosiš 64 | \mb hezodoy haw ɡruša -n sadaq b- os -iš 65 | \ge then this pear(r) -and all III- fall -PST 66 | \ps adv pn n -clit adv aff- v -aff 67 | 68 | \ft 69 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_elan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os 11 | 12 | import poioapi.io.elan 13 | import poioapi.io.graf 14 | 15 | class TestElan: 16 | """ 17 | This class contain the test methods to the 18 | class io.elan.py. 19 | 20 | """ 21 | 22 | def setup(self): 23 | self.filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 24 | "elan_graf", "example.eaf") 25 | 26 | self.basedirname = os.path.dirname(self.filename) 27 | 28 | self.metafile = os.path.join(os.path.dirname(__file__), "..", "sample_files", 29 | "elan_graf", "example-extinfo.xml") 30 | 31 | self.elan = poioapi.io.elan.Parser(self.filename) 32 | 33 | def test_get_root_tiers(self): 34 | root_tiers = self.elan.get_root_tiers() 35 | 36 | assert len(root_tiers) == 4 37 | 38 | def test_get_child_tiers_for_tier(self): 39 | # Get the root tiers 40 | root_tiers = self.elan.get_root_tiers() 41 | 42 | # Select the W-Spch tier 43 | tier = root_tiers[1] 44 | 45 | child_tier = self.elan.get_child_tiers_for_tier(tier) 46 | 47 | assert len(child_tier) == 2 48 | 49 | def test_get_annotations_for_tier(self): 50 | root_tier = self.elan.get_root_tiers()[1] # W-Spch 51 | root_tier_annotations = self.elan.get_annotations_for_tier(root_tier) 52 | assert len(root_tier_annotations) == 15 53 | 54 | annotation = root_tier_annotations[0] # a8 55 | child_tier = self.elan.get_child_tiers_for_tier(root_tier)[0] # W-Words 56 | child_tier_annotations = self.elan.get_annotations_for_tier(child_tier, annotation) 57 | assert len(child_tier_annotations) == 12 58 | 59 | 60 | def test_get_annotations_for_tier_with_parent(self): 61 | root_tiers = self.elan.get_root_tiers() 62 | 63 | child_tiers = self.elan.get_child_tiers_for_tier(root_tiers[1]) 64 | 65 | parent_annotation = poioapi.io.graf.Annotation('a8', 'ann_value') 66 | 67 | child_tier_annotations = self.elan.get_annotations_for_tier(child_tiers[1], parent_annotation) 68 | 69 | assert len(child_tier_annotations) == 1 70 | assert child_tier_annotations[0].id == "a217" 71 | 72 | def test_tier_has_regions(self): 73 | root_tiers = self.elan.get_root_tiers() 74 | 75 | child_tiers = self.elan.get_child_tiers_for_tier(root_tiers[1]) 76 | 77 | tier = child_tiers[0] # W-Words 78 | 79 | has_regions = self.elan.tier_has_regions(tier) 80 | 81 | assert has_regions == True 82 | 83 | def test_region_for_annotation(self): 84 | root_tier = self.elan.get_root_tiers()[1] # W-Spch 85 | root_tier_annotations = self.elan.get_annotations_for_tier(root_tier) 86 | 87 | annotation = root_tier_annotations[0] 88 | 89 | regions = self.elan.region_for_annotation(annotation) 90 | 91 | expected_regions = (780, 4090) 92 | 93 | assert regions == expected_regions 94 | 95 | def test__annotation_for_region(self): 96 | annotation = self.elan._annotation_for_region("W-Spch", 780, 1340) 97 | assert annotation.attrib["ANNOTATION_ID"] == "a8" -------------------------------------------------------------------------------- /src/poioapi/io/brat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | import os 17 | import codecs 18 | 19 | 20 | class Writer(): 21 | def __init__(self, annotation_space, feature_name="annotation_value"): 22 | self.annotation_space = annotation_space 23 | self.feature_name = feature_name 24 | 25 | def write(self, outputfile, converter): # graf_graph, tier_hierarchies=None, meta_information=None): 26 | ann_file = codecs.open(outputfile, "w", "utf-8") 27 | t = 1 28 | n = 1 29 | relation_map = {} 30 | 31 | label_list = ["head", "translation", "pos", "italic", "bold"] 32 | 33 | for annotation in converter.graf.annotation_spaces[self.annotation_space]: 34 | if annotation.label in label_list: 35 | if self.feature_name in annotation.features: 36 | annotation_value = annotation.features[self.feature_name] 37 | 38 | if annotation_value: 39 | node = annotation.element 40 | 41 | for feature, value in annotation.features.items(): 42 | if value: 43 | if feature != self.feature_name: 44 | annotation_type = value 45 | else: 46 | annotation_type = annotation.label 47 | 48 | if node.links: 49 | anchors = node.links[0][0].anchors 50 | line = "T{0}\t{1} {2} {3}\t{4}\n".\ 51 | format(t, annotation_type, anchors[0], anchors[1], annotation_value) 52 | note = "#{0}\tAnnotatorNotes T{1}\t{2}\n".format(n, t, node) 53 | relation_map[node.id] = "T{0}".format(t) 54 | 55 | ann_file.write(line) 56 | ann_file.write(note) 57 | t += 1 58 | n += 1 59 | 60 | ann_file.close() 61 | 62 | # def create_relations(self, graf_graph, relation_map, ann_file): 63 | # r = 1 64 | 65 | # for node_id, text_bound in relation_map.items(): 66 | # for edge in graf_graph.edges: 67 | # if node_id == edge.from_node.id: 68 | # line = "R{0} To Arg1:{2} Arg2:{1}\n".\ 69 | # format(r, relation_map[edge.from_node.id], 70 | # relation_map[edge.to_node.id]) 71 | # r += 1 72 | # ann_file.write(line) 73 | 74 | # return ann_file 75 | 76 | # def create_conf_file(self, graf_graph, outputfile): 77 | # basedirname = os.path.dirname(outputfile) 78 | 79 | # annotation_conf = open(basedirname+"/annotation.conf", "w") 80 | 81 | # annotation_conf.write("[entities]\n") 82 | 83 | # for entity in graf_graph.header.annotation_spaces: 84 | # annotation_conf.write(entity+"\n") 85 | 86 | # annotation_conf.write("\n[relations]\n# To Arg1:, Arg2:" 87 | # "\n Arg1:, Arg2:, :" 88 | # "\n\n[events]\n# none\n\n[attributes]\n# none") 89 | 90 | # annotation_conf.close() -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_odin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Pedro Manha 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | # from __future__ import unicode_literals 11 | 12 | import os 13 | 14 | import poioapi.io.odin 15 | import poioapi.io.graf 16 | import poioapi.data 17 | import poioapi.mapper 18 | 19 | 20 | class TestParser: 21 | 22 | def __init__(self): 23 | self._samples_dir = os.path.join(os.path.dirname(__file__), '..', 24 | 'sample_files', 'odin') 25 | self._inputfile = '' 26 | 27 | self._root_tier = None 28 | 29 | self._parser = None 30 | 31 | def setup(self): 32 | self._inputfile = os.path.join(self._samples_dir, 'odin_test.xml') 33 | self._parser = poioapi.io.odin.Parser(self._inputfile, None) 34 | 35 | self._root_tier = poioapi.io.graf.Tier('source') 36 | 37 | def test_get_root_tiers(self): 38 | root_tiers = self._parser.get_root_tiers() 39 | assert len(root_tiers) == 1 40 | assert root_tiers[0].name == 'source' 41 | 42 | def test_get_child_tiers_for_tier(self): 43 | root_tiers = self._parser.get_root_tiers() 44 | 45 | phrase_tiers = self._parser.get_child_tiers_for_tier(root_tiers[0]) 46 | assert len(phrase_tiers) == 1 47 | 48 | child_tiers = self._parser.get_child_tiers_for_tier(phrase_tiers[0]) 49 | assert len(child_tiers) == 2 50 | ct_name_list = [a.name for a in child_tiers] 51 | assert self._parser.tier_labels.tier_label(poioapi.data.TIER_WORD) in ct_name_list 52 | assert self._parser.tier_labels.tier_label(poioapi.data.TIER_TRANSLATION) in ct_name_list 53 | 54 | tier = poioapi.io.graf.Tier(poioapi.data.tier_labels[poioapi.data.TIER_WORD]) 55 | child_tiers = self._parser.get_child_tiers_for_tier(tier) 56 | assert len(child_tiers) == 1 57 | assert child_tiers[0].name == self._parser.tier_labels.tier_label( 58 | poioapi.data.TIER_MORPHEME) 59 | 60 | def test_get_annotations_for_tier(self): 61 | root_annot = self._parser.get_annotations_for_tier(self._root_tier) 62 | utter = poioapi.io.graf.Tier(self._parser.tier_labels.tier_label( 63 | poioapi.data.TIER_UTTERANCE)) 64 | 65 | utter_annots = self._parser.get_annotations_for_tier(utter, root_annot[0]) 66 | 67 | assert len(utter_annots) == 18 68 | 69 | def test_gloss_special_case(self): 70 | utter_tier = poioapi.io.graf.Tier(self._parser.tier_labels.tier_label( 71 | poioapi.data.TIER_UTTERANCE)) 72 | word_tier = poioapi.io.graf.Tier(self._parser.tier_labels.tier_label( 73 | poioapi.data.TIER_WORD)) 74 | m_tier = poioapi.io.graf.Tier(self._parser.tier_labels.tier_label( 75 | poioapi.data.TIER_MORPHEME)) 76 | g_tier = poioapi.io.graf.Tier(self._parser.tier_labels.tier_label( 77 | poioapi.data.TIER_GLOSS)) 78 | 79 | root = self._parser.get_annotations_for_tier(self._root_tier)[0] 80 | utter = self._parser.get_annotations_for_tier(utter_tier, root)[0] 81 | words = self._parser.get_annotations_for_tier(word_tier, utter) 82 | morphemes = [] 83 | 84 | for w in words: 85 | morphemes.extend(self._parser.get_annotations_for_tier(m_tier, w)) 86 | 87 | glosses = [] 88 | for m in morphemes: 89 | glosses.extend(self._parser.get_annotations_for_tier(g_tier, m)) 90 | 91 | assert len(glosses) == 7 92 | 93 | expected = ['the', 'Paulo', 'worked', 'more', 'than', 'what', 'nobody'] 94 | assert set(expected) == set([a.value for a in glosses]) -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_tcf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os 11 | import tempfile 12 | import difflib 13 | 14 | import poioapi.io.tcf 15 | import poioapi.io.elan 16 | import poioapi.io.graf 17 | 18 | 19 | class TestParser: 20 | """ 21 | This class contain the test methods to the 22 | class io.tcf.py. 23 | 24 | """ 25 | 26 | def setup(self): 27 | self.filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 28 | "tcf_graf", "corpus.xml") 29 | 30 | self.basedirname = os.path.dirname(self.filename) 31 | 32 | self.parser = poioapi.io.tcf.Parser(self.filename) 33 | 34 | def test_get_root_tiers(self): 35 | root_tiers = self.parser.get_root_tiers() 36 | 37 | assert len(root_tiers) == 1 38 | 39 | def test_get_child_tiers_for_tier(self): 40 | root_tiers = self.parser.get_root_tiers() 41 | root_child_tier = self.parser.get_child_tiers_for_tier(root_tiers[0]) 42 | assert root_child_tier[0].name == "tokens" 43 | 44 | children_tier = self.parser.get_child_tiers_for_tier(root_child_tier[0]) 45 | assert len(children_tier) == 2 46 | 47 | def test_get_annotations_for_tier(self): 48 | root_tiers = self.parser.get_root_tiers() 49 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 50 | assert len(root_annotations) == 2 51 | 52 | parent_annotation = poioapi.io.graf.Annotation("s1", "t1 t2 t3 t4 t5") 53 | token_tier = self.parser.get_child_tiers_for_tier(root_tiers[0]) 54 | token_annotations = self.parser.get_annotations_for_tier(token_tier[0], parent_annotation) 55 | assert len(token_annotations) == 5 56 | 57 | def test_tier_has_regions(self): 58 | root_tiers = self.parser.get_root_tiers() 59 | assert self.parser.tier_has_regions(root_tiers[0]) 60 | 61 | def test_region_for_annotation(self): 62 | root_tiers = self.parser.get_root_tiers() 63 | root_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 64 | region = self.parser.region_for_annotation(root_annotations[0]) 65 | assert region == ('1', '20') 66 | 67 | class TestWriter: 68 | 69 | def setup(self): 70 | self.filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 71 | "elan_graf", "example.eaf") 72 | 73 | self.parser = poioapi.io.elan.Parser(self.filename) 74 | self.writer = poioapi.io.tcf.Writer() 75 | 76 | def test_write(self): 77 | outputfile = tempfile.TemporaryFile() 78 | self.converter = poioapi.io.graf.GrAFConverter(self.parser, self.writer) 79 | self.converter.parse() 80 | self.converter.write(outputfile) 81 | 82 | testfile = os.path.join(os.path.dirname(__file__), "..", "sample_files", 83 | "tcf_graf", "test_write.tcf") 84 | 85 | #outputfile2 = os.path.join(os.path.dirname(__file__), "..", "sample_files", 86 | # "tcf_graf", "test_write.tcf") 87 | #self.converter.write(outputfile2) 88 | 89 | outputfile.seek(0) 90 | fromlines = outputfile.readlines() 91 | fromlines = [l.decode("utf-8") for l in fromlines] 92 | tolines = open(testfile, 'U').readlines() 93 | 94 | diff = difflib.unified_diff(fromlines, tolines) 95 | for line in diff: 96 | if not line.startswith("---") and not line.startswith("+++") and \ 97 | (line.startswith("+") or line.startswith("-")): 98 | assert "ed:phoneticsegmentation" in line 99 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_typecraft.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os 11 | import os.path 12 | import re 13 | import filecmp 14 | 15 | import xml.etree.ElementTree as ET 16 | 17 | import poioapi.io.typecraft 18 | import poioapi.io.graf 19 | import poioapi.annotationgraph 20 | 21 | class TestParser: 22 | """ 23 | This class contain the test methods to the 24 | class io.typecraft.py. 25 | 26 | """ 27 | 28 | def setup(self): 29 | self.filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 30 | "typecraft_graf", "typecraft_example.xml") 31 | 32 | self.basedirname = os.path.dirname(self.filename) 33 | 34 | self.parser = poioapi.io.typecraft.Parser(self.filename) 35 | 36 | self.converter = poioapi.io.graf.GrAFConverter(self.parser) 37 | self.converter.parse() 38 | 39 | self.graph = self.converter.graf 40 | 41 | tree = ET.parse(self.filename) 42 | self.root = tree.getroot() 43 | 44 | self.xml_namespace = re.search('\{(.*)\}', self.root.tag).group() 45 | 46 | def test_phrase_nodes(self): 47 | nodes_number = len(self.root.findall(self.xml_namespace+"phrase")) - 1 48 | 49 | expected_nodes_number = 0 50 | 51 | for nodes in self.graph.nodes: 52 | if "phrase" in nodes.id: 53 | expected_nodes_number += 1 54 | 55 | assert(nodes_number == expected_nodes_number) 56 | 57 | def test_phrase_annotation_features(self): 58 | node_phrase = self.root.find(self.xml_namespace+"phrase") 59 | 60 | expected_features_number = len(node_phrase.attrib) - 3 61 | 62 | for elements in node_phrase: 63 | key = str(elements.tag).split(self.xml_namespace) 64 | if key[1] != "word" and key[1] != "globaltags": 65 | expected_features_number += 1 66 | 67 | node = self.graph.nodes["phrase..n9764"] 68 | 69 | node_annotations = node.annotations._elements 70 | 71 | features_number = len(node_annotations[0].features) 72 | 73 | assert(features_number == expected_features_number) 74 | 75 | def test_get_root_tiers(self): 76 | root_tiers = self.parser.get_root_tiers() 77 | 78 | assert len(root_tiers) == 1 79 | 80 | def test_get_child_tiers_for_tier(self): 81 | root_tiers = self.parser.get_root_tiers() 82 | 83 | tier = root_tiers[0] 84 | 85 | child_tier = self.parser.get_child_tiers_for_tier(tier) 86 | 87 | assert len(child_tier) == 3 88 | 89 | def test_get_annotations_for_tier(self): 90 | root_tiers = self.parser.get_root_tiers() 91 | child_tier_annotations = self.parser.get_annotations_for_tier(root_tiers[0]) 92 | 93 | assert len(child_tier_annotations) == 10 94 | 95 | 96 | class TestWriter: 97 | 98 | def setup(self): 99 | self._inputfile = os.path.join(os.path.dirname(__file__), "..", 100 | "sample_files", "typecraft_graf", 101 | "typecraft_example.xml") 102 | self._outputfile = os.path.join(os.path.dirname(__file__), "..", 103 | "sample_files", "mandinka", 104 | "mandinka_typecraft.xml") 105 | 106 | def test_conversion(self): 107 | inputfile = os.path.join(os.path.dirname(__file__), "..", "sample_files", 108 | "mandinka", "mandinka.txt") 109 | outputfile = os.path.join(os.path.dirname(__file__), "..", "sample_files", 110 | "mandinka", "mandinka_typecraft.xml") 111 | originalfile = os.path.join(os.path.dirname(__file__), "..", "sample_files", 112 | "mandinka", "mandinka_typecraft_original.xml") 113 | ag = poioapi.annotationgraph.AnnotationGraph.from_mandinka(inputfile) 114 | writer = poioapi.io.typecraft.Writer() 115 | writer.write(outputfile, ag) 116 | assert os.path.getsize(outputfile) == os.path.getsize(originalfile) 117 | assert filecmp.cmp(outputfile, originalfile, shallow=False) -------------------------------------------------------------------------------- /src/poioapi/tests/test_mapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os.path 11 | import filecmp 12 | 13 | import poioapi.mapper 14 | import poioapi.data 15 | 16 | 17 | class TestTierMapper: 18 | 19 | _tm = None 20 | _sample_file = '' 21 | 22 | def setup(self): 23 | self._sample_file = os.path.join(os.path.dirname(__file__), 24 | 'sample_files', 'mapper', 25 | 'example.json') 26 | 27 | def test_load_mapping(self): 28 | self._tm = poioapi.mapper.TierMapper() 29 | self._tm.load_mapping(self._sample_file) 30 | 31 | assert len(self._tm._tier_mapping) == 2 32 | 33 | gloss_tier_labels = self._tm.tier_labels(poioapi.data.TIER_GLOSS) 34 | pos_tier_labels = self._tm.tier_labels(poioapi.data.TIER_POS) 35 | 36 | assert len(gloss_tier_labels) == 1 37 | assert len(pos_tier_labels) == 1 38 | 39 | def test_tier_labels(self): 40 | self._tm = poioapi.mapper.TierMapper() 41 | self._tm.load_mapping(self._sample_file) 42 | 43 | tiers_to_succeed = ['gloss'] 44 | tiers_to_test = self._tm.tier_labels(poioapi.data.TIER_GLOSS) 45 | 46 | assert set(tiers_to_succeed) == set(tiers_to_test) 47 | 48 | no_tier_type = [] 49 | no_type_to_test = self._tm.tier_labels(poioapi.data.TIER_TRANSLATION) 50 | 51 | assert no_tier_type == no_type_to_test 52 | 53 | def test_tier_label(self): 54 | self._tm = poioapi.mapper.TierMapper() 55 | self._tm.load_mapping(self._sample_file) 56 | 57 | tier_to_succeed = 'pos' 58 | tier_to_test = self._tm.tier_label(poioapi.data.TIER_POS, 0) 59 | 60 | assert tier_to_succeed == tier_to_test 61 | 62 | def test_append(self): 63 | self._tm = poioapi.mapper.TierMapper() 64 | self._tm.load_mapping(self._sample_file) 65 | 66 | expected = ['gloss', 'test'] 67 | self._tm.append_to_tier_labels(poioapi.data.TIER_GLOSS, ['test']) 68 | to_test = self._tm.tier_labels(poioapi.data.TIER_GLOSS) 69 | 70 | assert expected == to_test 71 | 72 | def test_exists(self): 73 | self._tm = poioapi.mapper.TierMapper() 74 | self._tm.load_mapping(self._sample_file) 75 | tag_exists = self._tm.tier_label_exists('pos') 76 | tag_not_exists = self._tm.tier_label_exists('test') 77 | 78 | assert tag_exists is True 79 | assert tag_not_exists is False 80 | 81 | 82 | 83 | class TestAnnotationMapper: 84 | 85 | _am = None 86 | _sample_file = '' 87 | 88 | def setup(self): 89 | self._sample_file = os.path.join(os.path.dirname(__file__), 90 | "sample_files", "mapper", 91 | "example.json") 92 | 93 | def test_load_default(self): 94 | self._am = poioapi.mapper.AnnotationMapper(poioapi.data.MANDINKA, poioapi.data.TYPECRAFT) 95 | assert(len(self._am.annotation_mappings) == 1) 96 | assert(len(self._am.annotation_mappings[poioapi.data.TIER_GLOSS]) == 63) 97 | 98 | def test_load_user_mapping(self): 99 | self._am = poioapi.mapper.AnnotationMapper(poioapi.data.MANDINKA, poioapi.data.TYPECRAFT) 100 | self._am.load_mappings(self._sample_file) 101 | 102 | assert(len(self._am.annotation_mappings[poioapi.data.TIER_GLOSS]) == 65) 103 | 104 | def test_validate_tag(self): 105 | tag_to_succeed = '1SG' 106 | multitag_to_succeed = 'TAG' 107 | 108 | self._am = poioapi.mapper.AnnotationMapper(poioapi.data.MANDINKA, poioapi.data.TYPECRAFT) 109 | self._am.load_mappings(self._sample_file) 110 | 111 | assert(self._am.validate_tag(poioapi.data.TIER_GLOSS, tag_to_succeed) == '1SG') 112 | assert(self._am.validate_tag(poioapi.data.TIER_GLOSS, multitag_to_succeed) == 'TEST') 113 | 114 | def test_export(self): 115 | self._am = poioapi.mapper.AnnotationMapper(poioapi.data.MANDINKA, poioapi.data.TYPECRAFT) 116 | self._am.load_mappings(self._sample_file) 117 | 118 | self._am.add_to_missing(poioapi.data.TIER_GLOSS, '3PL') 119 | self._am.add_to_missing(poioapi.data.TIER_GLOSS, '4PL') 120 | filename = os.path.join(os.path.dirname(__file__), 'sample_files', 'mapper', 'example_export_test.json') 121 | expected_filename = os.path.join(os.path.dirname(__file__), 'sample_files', 'mapper', 'example_export.json') 122 | self._am.export_missing_tags(filename) 123 | 124 | assert(os.path.getsize(filename) == os.path.getsize(expected_filename)) 125 | assert(filecmp.cmp(filename, expected_filename, False) is True) -------------------------------------------------------------------------------- /examples/poio_converter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import sys 11 | import optparse 12 | import codecs 13 | import os 14 | 15 | import poioapi.annotationgraph 16 | import poioapi.data 17 | import poioapi.io.typecraft 18 | import poioapi.io.latex 19 | 20 | 21 | def main(argv): 22 | usage = "usage: %prog [options] inputfile outputfile" 23 | parser = optparse.OptionParser(usage=usage) 24 | parser.add_option("-i", "--inputtype", dest="inputtype", 25 | help="Type of the input file (elan|toolbox|shoebox|mandinka|odin)") 26 | parser.add_option("-o", "--outputtype", dest="outputtype", 27 | help="Type of the output file (html|graf|typecraft|latex)") 28 | parser.add_option("-r", "--roottier", dest="roottier", 29 | help="Root tier for html output, is the record marker in Toolbox") 30 | parser.add_option("-t", "--map-file", dest="mapping", 31 | help="A JSON file containing the tier and tag mapping.") 32 | parser.add_option("-m", "--missing-tags", action='store_true', dest="missing_tags", default=False, 33 | help="If any missing tags are found, writes them to the output file, in JSON format. " 34 | "If this flag is omitted, but missing tags are found, they are ignored.") 35 | parser.add_option('-l', '--language-code', dest='language_code', default='und', 36 | help='The language of the source text. Use the ISO 639-3 code for the language as the value' 37 | ' of this parameter.') 38 | (options, files) = parser.parse_args() 39 | 40 | if len(files) != 2: 41 | parser.print_usage() 42 | sys.exit(0) 43 | 44 | if options.inputtype not in ['toolbox', 'elan', 'shoebox', 'obt', 45 | 'mandinka', 'odin']: 46 | parser.print_usage() 47 | sys.exit(0) 48 | 49 | if options.outputtype not in ['html', 'graf', 'typecraft', 'latex']: 50 | parser.print_usage() 51 | sys.exit(0) 52 | mapping = None 53 | if options.mapping: 54 | if os.path.exists(options.mapping): 55 | mapping = options.mapping 56 | else: 57 | print('The file {0} does not exist.'.format(options.mapping)) 58 | parser.print_help() 59 | sys.exit(0) 60 | 61 | # Load the data from files 62 | ag = None 63 | if options.inputtype == "elan": 64 | ag = poioapi.annotationgraph.AnnotationGraph.from_elan(files[0]) 65 | elif options.inputtype == "mandinka": 66 | ag = poioapi.annotationgraph.AnnotationGraph.from_mandinka(files[0]) 67 | elif options.inputtype == "obt": 68 | ag = poioapi.annotationgraph.AnnotationGraph.from_obt(files[0]) 69 | elif options.inputtype == "shoebox": 70 | ag = poioapi.annotationgraph.AnnotationGraph.from_shoebox(files[0]) 71 | elif options.inputtype == "toolbox": 72 | if not options.roottier: 73 | print("No record marker specified (argument \"-r\"). Assuming \"ref\" as record marker.") 74 | 75 | ag = poioapi.annotationgraph.AnnotationGraph.from_toolbox(files[0]) 76 | elif options.inputtype == 'odin': 77 | ag = poioapi.annotationgraph.AnnotationGraph.from_odin(files[0]) 78 | 79 | 80 | # Set the structure type for hierarchical/interlinear output 81 | root_found = False 82 | if options.roottier: 83 | for th in ag.tier_hierarchies: 84 | if options.roottier == th[0] or th[0].endswith('..' + options.roottier): 85 | ag.structure_type_handler = poioapi.data.DataStructureType(th) 86 | root_found = True 87 | 88 | if not root_found: 89 | print("Could not find root tier in file or root tier was not specified. Will use the first tier hierarchy.") 90 | 91 | if options.outputtype == "html": 92 | # Output as html 93 | f = codecs.open(files[1], "w", "utf-8") 94 | f.write(ag.as_html_table(False, True)) 95 | f.close() 96 | elif options.outputtype == "graf": 97 | writer = poioapi.io.graf.Writer() 98 | writer.write(files[1], ag) 99 | elif options.outputtype == "typecraft": 100 | missing_tags = options.missing_tags 101 | 102 | typecraft = poioapi.io.typecraft.Writer() 103 | if missing_tags: 104 | typecraft.missing_tags(files[1], ag, additional_map_path=mapping) 105 | else: 106 | typecraft.write(files[1], ag, extra_tag_map=mapping, language=options.language_code) 107 | elif options.outputtype == 'latex': 108 | latex = poioapi.io.latex.Writer() 109 | latex.write(files[1], ag) 110 | 111 | if __name__ == "__main__": 112 | main(sys.argv) -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyAnnotation.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyAnnotation.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PyAnnotation" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PyAnnotation" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." 131 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/odin/odin_test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | http://www.ub.uni-konstanz.de/kops/volltexte/2003/1103/pdf/ap114.pdf 10 | http://odin.linguistlist.org/igt_raw.php?id=1052&langcode=por 11 | (2002). . </p> 12 | 13 | 14 | (1) O Paulo trabalhou mais do que ninguém. 15 | the Paulo worked more than_what nobody 16 | `Paulo has worked harder than anybody (else).' 17 | 18 | 19 | (2) O Paulo não trabalhou mais do que ninguém. 20 | the Paulo not worked more than_what nobody 21 | `Paulo has not worked harder than anybody (else).' 22 | 23 | 24 | (3) Desta vez, o Paulo correu menos do que nunca. 25 | on_thistime, the Paulo ran less than_what never 26 | `This time, Paulo ran less than ever.' 27 | 28 | 29 | (1) O Paulo trabalhou mais do que ninguém. 30 | the Paulo worked more than_what nobody 31 | `Paulo has worked harder than anybody.' 32 | 33 | 34 | (13) Todos os lugares onde ninguém esteve foram ignorados. 35 | all the places where nobody was were ignored 36 | `All the places where nobody was were ignored.' 37 | 38 | 39 | (16) A montanha mais alta que o Paulo nunca escalou é o Monte Everest. 40 | the mountain more high that the Paulo never climbed is the Mount Everest 41 | `The highest mountain that Paulo never climbed is Mount Everest.' 42 | 43 | 44 | (24) O Paulo correu mais do que nunca. 45 | the Paulo ran more than_ what never 46 | `Paulo has run faster than ever.' 47 | 48 | 49 | (25) O Paulo correu menos do que nunca. 50 | the Paulo ran less than_what never 51 | `Paulo has run less than ever.' 52 | 53 | 54 | (26) O Paulo é (muito) mais alto do que é a Ana. 55 | the Paulo is (much) more tall than_what is the Ana 56 | `Paulo is (much) taller than Ana is' 57 | 58 | 59 | (27) O Paulo é (muito) mais alto do que a Ana. 60 | the Paulo is (much) more tall than_what the Ana 61 | `Paulo is (much) taller than Ana' 62 | 63 | 64 | (37) O Paulo é tão alto como a Ana. 65 | the Paulo is as tall as the Ana. 66 | `Paulo is as tall as Ana.' 67 | 68 | 69 | (41) O Paulo trabalhou mais do que ninguém. 70 | the Paulo worked more than_what nobody 71 | `Paulo has worked harder than anybody.' 72 | 73 | 74 | (42) O Paulo correu menos do que nunca. 75 | the Paulo ran less than_what never 76 | `This time, Paulo ran less than ever.' 77 | 78 | 79 | (43) *O Paulo correu tanto como nunca. 80 | the Paulo ran as-much as never 81 | `Paulo ran as fast as ever.' 82 | 83 | 84 | (44) O Paulo é mais alto do que a maioria dos seus colegas. 85 | the Paulo is more tall than_what the majority of_his colleagues 86 | `Paulo is taller that most of his colleagues.' 87 | 88 | 89 | (57) O Paulo trabalhou mais do que ninguém. 90 | the Paulo worked more than_what nobody 91 | `Paulo has worked harder than anybody.' 92 | 93 | 94 | (58) O Paulo não trabalhou mais do que ninguém. 95 | the Paulo not worked more than_what nobody 96 | `Paulo has not worked harder than anybody.' 97 | 98 | 99 | (62) Ninguém viu nada. 100 | nobody saw nothing 101 | `Nobody saw anything.' 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/elan_graf/example-utterance.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | and you follow then the sign Kleef 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | that's the oranje single 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | then you follow the sign kleef 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | you come down 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | you know eh after this trajanus plein 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | you come down to the 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | rhine eh valley 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | yeah that's another eh 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | kind of rotunde 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | and then you follow the signs kleef 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | go down 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | and then you go the this way 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | eh ja to kleef 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | so you go out of the Institute to the Saint Anna Straat. 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | and then you go the other, Saint Anna Straat to this to the center of the town, to this big rotunde. 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/poioapi/tests/test_annotationgraph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | 14 | from poioapi import data 15 | import poioapi.annotationgraph 16 | 17 | class TestAnnotationGraph: 18 | """ 19 | This class contain the test methods to the 20 | class annotationgraph.py. 21 | 22 | """ 23 | 24 | def setup(self): 25 | filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ), 26 | '..', '..', '..', 'example_data', 'turkish.eaf')) 27 | 28 | self.annotation_graph = \ 29 | poioapi.annotationgraph.AnnotationGraph.from_elan(filename) 30 | 31 | self.annotation_graph.init_filters() 32 | 33 | self.anngraphfilter = poioapi.annotationgraph.AnnotationGraphFilter( 34 | self.annotation_graph) 35 | 36 | def test_root_nodes(self): 37 | root_nodes = self.annotation_graph.root_nodes() 38 | assert(len(root_nodes) == 9) 39 | 40 | def test_nodes_for_tier(self): 41 | root_nodes = self.annotation_graph.root_nodes() 42 | nodes = self.annotation_graph.nodes_for_tier("Äußerung", root_nodes[0]) 43 | 44 | assert(len(nodes) == 0) 45 | 46 | def test_annotations_for_tier(self): 47 | node = self.annotation_graph.graf.nodes["Glosse..P-Gloss..na262"] 48 | annotations = self.annotation_graph.annotations_for_tier("Glosse", node) 49 | assert(len(annotations) == 1) 50 | 51 | def test_annotation_value_for_annotation(self): 52 | node = self.annotation_graph.graf.nodes["Glosse..P-Gloss..na262"] 53 | annotations = self.annotation_graph.annotations_for_tier("Glosse", node) 54 | value = self.annotation_graph.annotation_value_for_annotation( 55 | annotations[0]) 56 | assert(value=="REPPAST") 57 | 58 | def test_annotation_value_for_node(self): 59 | node = self.annotation_graph.graf.nodes["Glosse..P-Gloss..na262"] 60 | value = self.annotation_graph.annotation_value_for_node(node) 61 | assert(value=="REPPAST") 62 | 63 | def test_as_html_table(self): 64 | html = self.annotation_graph.as_html_table() 65 | assert(len(html) > 0) 66 | 67 | def test_append_filter(self): 68 | self.anngraphfilter.set_filter_for_tier("Glosse..P-Gloss", "ANOM") 69 | self.annotation_graph.append_filter(self.anngraphfilter) 70 | self.anngraphfilter.reset_match_object() 71 | 72 | assert self.annotation_graph.filtered_node_ids[-1] == \ 73 | ['Äußerung..P-Spch..na2', 'Äußerung..P-Spch..na9'] 74 | 75 | def test_reset_filters(self): 76 | self.anngraphfilter.set_filter_for_tier("Glosse..P-Gloss", "ANOM") 77 | self.annotation_graph.append_filter(self.anngraphfilter) 78 | self.anngraphfilter.reset_match_object() 79 | 80 | assert self.annotation_graph.filtered_node_ids[-1] == \ 81 | ['Äußerung..P-Spch..na2', 'Äußerung..P-Spch..na9'] 82 | 83 | def test_create_filter_for_dict(self): 84 | search_terms = { "Glosse..P-Gloss": "yesterday" } 85 | self.anngraphfilter = self.annotation_graph.create_filter_for_dict( 86 | search_terms) 87 | self.annotation_graph.append_filter(self.anngraphfilter) 88 | self.anngraphfilter.reset_match_object() 89 | 90 | assert self.annotation_graph.filtered_node_ids[-1] == \ 91 | ['Äußerung..P-Spch..na1'] 92 | 93 | # there was a bug where, for any type of tier, when one of the possible 94 | # names was a subset of another name of the same tier, duplicates 95 | # were being created in the AnnotationGraph. 96 | def test_for_node_duplicates(self): 97 | inputfile = os.path.join(os.path.dirname(__file__), 'sample_files', 98 | 'toolbox_graf', 'toolbox.txt') 99 | ag = poioapi.annotationgraph.AnnotationGraph.from_toolbox(inputfile) 100 | for tier_type in data.tier_labels.keys(): 101 | original = [] 102 | for marker in ag.tier_mapper.tier_labels(tier_type): 103 | ids = [n.id for n in ag.nodes_for_tier(marker)] 104 | original.extend(ids) 105 | 106 | trimmed = set(original) 107 | assert len(original) == len(trimmed) 108 | 109 | class TestAnnotationGraphFilter: 110 | 111 | def setup(self): 112 | filename = os.path.join(os.path.dirname(__file__), "sample_files", 113 | "balochi_graf", "balochi.hdr") 114 | self.annotation_graph = \ 115 | poioapi.annotationgraph.AnnotationGraph.from_graf(filename) 116 | 117 | self.annotation_graph.structure_type_handler = \ 118 | data.DataStructureTypeGraid() 119 | self.anngraphfilter = poioapi.annotationgraph.AnnotationGraphFilter( 120 | self.annotation_graph) 121 | 122 | def test_element_passes_filter(self): 123 | self.anngraphfilter.set_filter_for_tier("graid2", "nc") 124 | 125 | element = self.annotation_graph.graf.nodes['utterance..na898'] 126 | expected_result = False 127 | 128 | assert(self.anngraphfilter.element_passes_filter(element) 129 | == expected_result) 130 | 131 | element = self.annotation_graph.graf.nodes['utterance..na6'] 132 | expected_result = True 133 | 134 | assert(self.anngraphfilter.element_passes_filter(element) 135 | == expected_result) 136 | 137 | #element = self.annotation_graph.graf.nodes['utterance..na89'] 138 | self.anngraphfilter.set_filter_for_tier("graid2", "") 139 | self.anngraphfilter.set_filter_for_tier("clause_unit", "nc") 140 | expected_result = False 141 | 142 | assert(self.anngraphfilter.element_passes_filter(element) 143 | == expected_result) 144 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | set PATH=%PATH%;C:\Python27\scripts 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set BUILDDIR=_build 11 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 12 | set I18NSPHINXOPTS=%SPHINXOPTS% . 13 | if NOT "%PAPER%" == "" ( 14 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 15 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 16 | ) 17 | 18 | if "%1" == "" goto help 19 | 20 | if "%1" == "help" ( 21 | :help 22 | echo.Please use `make ^` where ^ is one of 23 | echo. html to make standalone HTML files 24 | echo. dirhtml to make HTML files named index.html in directories 25 | echo. singlehtml to make a single large HTML file 26 | echo. pickle to make pickle files 27 | echo. json to make JSON files 28 | echo. htmlhelp to make HTML files and a HTML help project 29 | echo. qthelp to make HTML files and a qthelp project 30 | echo. devhelp to make HTML files and a Devhelp project 31 | echo. epub to make an epub 32 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 33 | echo. text to make text files 34 | echo. man to make manual pages 35 | echo. texinfo to make Texinfo files 36 | echo. gettext to make PO message catalogs 37 | echo. changes to make an overview over all changed/added/deprecated items 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | if "%1" == "html" ( 50 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 51 | if errorlevel 1 exit /b 1 52 | echo. 53 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 54 | goto end 55 | ) 56 | 57 | if "%1" == "dirhtml" ( 58 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 59 | if errorlevel 1 exit /b 1 60 | echo. 61 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 62 | goto end 63 | ) 64 | 65 | if "%1" == "singlehtml" ( 66 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 67 | if errorlevel 1 exit /b 1 68 | echo. 69 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 70 | goto end 71 | ) 72 | 73 | if "%1" == "pickle" ( 74 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 75 | if errorlevel 1 exit /b 1 76 | echo. 77 | echo.Build finished; now you can process the pickle files. 78 | goto end 79 | ) 80 | 81 | if "%1" == "json" ( 82 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 83 | if errorlevel 1 exit /b 1 84 | echo. 85 | echo.Build finished; now you can process the JSON files. 86 | goto end 87 | ) 88 | 89 | if "%1" == "htmlhelp" ( 90 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 91 | if errorlevel 1 exit /b 1 92 | echo. 93 | echo.Build finished; now you can run HTML Help Workshop with the ^ 94 | .hhp project file in %BUILDDIR%/htmlhelp. 95 | goto end 96 | ) 97 | 98 | if "%1" == "qthelp" ( 99 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 100 | if errorlevel 1 exit /b 1 101 | echo. 102 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 103 | .qhcp project file in %BUILDDIR%/qthelp, like this: 104 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Poio-api.qhcp 105 | echo.To view the help file: 106 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Poio-api.ghc 107 | goto end 108 | ) 109 | 110 | if "%1" == "devhelp" ( 111 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 112 | if errorlevel 1 exit /b 1 113 | echo. 114 | echo.Build finished. 115 | goto end 116 | ) 117 | 118 | if "%1" == "epub" ( 119 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 120 | if errorlevel 1 exit /b 1 121 | echo. 122 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 123 | goto end 124 | ) 125 | 126 | if "%1" == "latex" ( 127 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 131 | goto end 132 | ) 133 | 134 | if "%1" == "text" ( 135 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 136 | if errorlevel 1 exit /b 1 137 | echo. 138 | echo.Build finished. The text files are in %BUILDDIR%/text. 139 | goto end 140 | ) 141 | 142 | if "%1" == "man" ( 143 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 144 | if errorlevel 1 exit /b 1 145 | echo. 146 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 147 | goto end 148 | ) 149 | 150 | if "%1" == "texinfo" ( 151 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 152 | if errorlevel 1 exit /b 1 153 | echo. 154 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 155 | goto end 156 | ) 157 | 158 | if "%1" == "gettext" ( 159 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 160 | if errorlevel 1 exit /b 1 161 | echo. 162 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 163 | goto end 164 | ) 165 | 166 | if "%1" == "changes" ( 167 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 168 | if errorlevel 1 exit /b 1 169 | echo. 170 | echo.The overview file is in %BUILDDIR%/changes. 171 | goto end 172 | ) 173 | 174 | if "%1" == "linkcheck" ( 175 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 176 | if errorlevel 1 exit /b 1 177 | echo. 178 | echo.Link check complete; look for any errors in the above output ^ 179 | or in %BUILDDIR%/linkcheck/output.txt. 180 | goto end 181 | ) 182 | 183 | if "%1" == "doctest" ( 184 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 185 | if errorlevel 1 exit /b 1 186 | echo. 187 | echo.Testing of doctests in the sources finished, look at the ^ 188 | results in %BUILDDIR%/doctest/output.txt. 189 | goto end 190 | ) 191 | 192 | :end 193 | -------------------------------------------------------------------------------- /src/poioapi/io/shoebox.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | 12 | """ 13 | 14 | from __future__ import absolute_import 15 | 16 | import re 17 | 18 | import xml.etree.ElementTree as ET 19 | 20 | import poioapi.io.graf 21 | 22 | 23 | class Parser(poioapi.io.graf.BaseParser): 24 | 25 | def __init__(self, filepath): 26 | """Class's constructor. 27 | 28 | Parameters 29 | ---------- 30 | filepath : str 31 | Path of the Toolbox XML file. 32 | 33 | """ 34 | 35 | self.filepath = filepath 36 | self.parse() 37 | 38 | def parse(self): 39 | """This method will parse the input file. 40 | 41 | """ 42 | 43 | root = ET.parse(self.filepath) 44 | tree = root.getroot() 45 | self._current_id = 0 46 | self._elements_map = {"ref": [], "t": {}, "m": {}, 47 | "g": {}, "p": {}, "f": {}} 48 | 49 | self.parse_element_tree(tree) 50 | 51 | def parse_element_tree(self, tree): 52 | """ 53 | tag name and value represent the title 54 | ref represents the 55 | """ 56 | for t in tree: 57 | if t.tag == "ref": 58 | self._current_ref = t.attrib['value'] 59 | self._elements_map["ref"].append({"id":self._current_ref, "value":""}) 60 | 61 | elif t.tag == "t": 62 | self._current_t = self._next_id() 63 | self._add_elment_to_elements(t, self._current_t, self._current_ref, 64 | t.attrib['value']) 65 | self._add_phrase(t.attrib['value']) 66 | 67 | elif t.tag == "p": 68 | if t.text and "-" not in t.text: 69 | self._add_elment_to_elements(t, self._next_id(), self._current_t, 70 | t.text) 71 | 72 | elif t.tag == "m": 73 | self._current_m = self._next_id() 74 | self._add_elment_to_elements(t, self._current_m, self._current_t, 75 | t.attrib['value']) 76 | 77 | elif t.tag == "g": 78 | self._add_elment_to_elements(t, self._next_id(), self._current_m, t.text) 79 | 80 | elif t.tag == "name": 81 | self.meta_information = t.attrib["value"] 82 | 83 | if len(t.getchildren()) > 0: 84 | self.parse_element_tree(t) 85 | 86 | def _add_phrase(self, value): 87 | for ref in self._elements_map["ref"]: 88 | if ref["id"] == self._current_ref: 89 | ref["value"] += value + " " 90 | 91 | 92 | def _add_elment_to_elements(self, t, id, parent=None, value=None, features=None, region=None): 93 | if (t.tag, parent) in self._elements_map: 94 | self._elements_map[(t.tag, parent)].append( 95 | {"id": id, "value": value, "region": region, "features": features}) 96 | else: 97 | self._elements_map[(t.tag, parent)] = [{"id": id, "value": value, 98 | "region": region, 99 | "features": features}] 100 | 101 | def get_root_tiers(self): 102 | return [poioapi.io.graf.Tier("ref")] 103 | 104 | def get_child_tiers_for_tier(self, tier): 105 | if tier.name == "ref": 106 | return [poioapi.io.graf.Tier("t")] 107 | if tier.name == "t": 108 | return [poioapi.io.graf.Tier("p"), 109 | poioapi.io.graf.Tier("m")] 110 | if tier.name == "m": 111 | return [poioapi.io.graf.Tier("g")] 112 | 113 | def get_annotations_for_tier(self, tier, annotation_parent=None): 114 | if tier.name == "ref": 115 | return [poioapi.io.graf.Annotation(e["id"], e['value']) 116 | for e in self._elements_map[tier.name]] 117 | 118 | else: 119 | if (tier.name, annotation_parent.id) in self._elements_map: 120 | return [poioapi.io.graf.Annotation(e["id"], e["value"], 121 | e["features"]) 122 | for e in self._elements_map[(tier.name, annotation_parent.id)]] 123 | else: 124 | return [] 125 | 126 | def tier_has_regions(self, tier): 127 | #if tier.name == "t": 128 | # return True 129 | 130 | return False 131 | 132 | def region_for_annotation(self, annotation): 133 | idGroup = [value for key, value in self._elements_map.items() 134 | if "idGroup" in key] 135 | 136 | for elements in idGroup: 137 | for e in elements: 138 | if e["id"] == annotation.id: 139 | return e["region"] 140 | 141 | return None 142 | 143 | def get_primary_data(self): 144 | """This method gets the information about 145 | the source data file. 146 | 147 | Returns 148 | ------- 149 | primary_data : object 150 | PrimaryData object. 151 | 152 | """ 153 | 154 | primary_data = poioapi.io.graf.PrimaryData() 155 | primary_data.type = poioapi.io.graf.NONE 156 | primary_data.filename = "unknown" 157 | 158 | return primary_data 159 | 160 | def _next_id(self): 161 | current_id = str(int(self._current_id) + 1) 162 | self._current_id = current_id 163 | 164 | return current_id 165 | 166 | def _split_region(self, element): 167 | try: 168 | aud = element.find("aud").text 169 | results = re.findall("\d*\.\d+|\d+", aud) 170 | 171 | region = (results[-2], results[-1]) 172 | value = aud.split(results[-2])[0] 173 | except: 174 | value = None 175 | region = None 176 | 177 | return value, region -------------------------------------------------------------------------------- /src/poioapi/io/toolboxxml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | 12 | """ 13 | 14 | from __future__ import absolute_import 15 | 16 | import re 17 | 18 | import xml.etree.ElementTree as ET 19 | 20 | import poioapi.io.graf 21 | 22 | 23 | class Parser(poioapi.io.graf.BaseParser): 24 | 25 | def __init__(self, filepath): 26 | """Class's constructor. 27 | 28 | Parameters 29 | ---------- 30 | filepath : str 31 | Path of the Toolbox XML file. 32 | 33 | """ 34 | 35 | self.filepath = filepath 36 | self.parse() 37 | 38 | def parse(self): 39 | """This method will parse the input file. 40 | 41 | """ 42 | 43 | root = ET.parse(self.filepath) 44 | tree = root.getroot() 45 | self._current_id = 0 46 | self._elements_map = {"itmGroup": [], "idGroup": {}, "txGroup": {}, 47 | "tx": {}, "mr": {}, "mg": {}} 48 | 49 | self.parse_element_tree(tree) 50 | 51 | def parse_element_tree(self, tree): 52 | for t in tree: 53 | if t.tag == "itmGroup": 54 | self._current_itmGroup = t.find("itm").text 55 | self._elements_map["itmGroup"].append( 56 | {"id": self._current_itmGroup, "value": t.find("ti").text, 57 | "features": {'sp': t.find("sp").text}}) 58 | 59 | elif t.tag == "idGroup": 60 | self._current_idGroup = t.find("id").text 61 | value, region = self._split_region(t) 62 | fg = None 63 | 64 | if len(t.find("fg")) > 0: 65 | fg = t.find("fg").text 66 | 67 | self._add_elment_to_elements(t, self._current_idGroup, 68 | self._current_itmGroup, value, {"fg": fg}, region) 69 | 70 | elif t.tag == "txGroup": 71 | self._current_txGroup = self._next_id() 72 | self._add_elment_to_elements(t, self._current_txGroup, self._current_idGroup) 73 | 74 | elif t.tag == "tx": 75 | self._current_tx = self._next_id() 76 | self._add_elment_to_elements(t, self._current_tx, self._current_txGroup, t.text) 77 | 78 | elif t.tag == "mg" or t.tag == "mr": 79 | self._add_elment_to_elements(t, self._next_id(), self._current_tx, t.text) 80 | 81 | if len(t.getchildren()) > 0: 82 | self.parse_element_tree(t) 83 | 84 | def _add_elment_to_elements(self, t, id, parent=None, value=None, features=None, region=None): 85 | if (t.tag, parent) in self._elements_map: 86 | self._elements_map[(t.tag, parent)].append( 87 | {"id": id, "value": value, "region": region, "features": features}) 88 | else: 89 | self._elements_map[(t.tag, parent)] = [{"id": id, "value": value, 90 | "region": region, 91 | "features": features}] 92 | 93 | def get_root_tiers(self): 94 | return [poioapi.io.graf.Tier("itmGroup")] 95 | 96 | def get_child_tiers_for_tier(self, tier): 97 | if tier.name == "itmGroup": 98 | return [poioapi.io.graf.Tier("idGroup")] 99 | if tier.name == "idGroup": 100 | return [poioapi.io.graf.Tier("txGroup")] 101 | if tier.name == "txGroup": 102 | return [poioapi.io.graf.Tier("tx")] 103 | if tier.name == "tx": 104 | return [poioapi.io.graf.Tier("mr"), 105 | poioapi.io.graf.Tier("mg")] 106 | 107 | def get_annotations_for_tier(self, tier, annotation_parent=None): 108 | if tier.name == "itmGroup": 109 | return [poioapi.io.graf.Annotation(e["id"], e["value"], 110 | e["features"]) 111 | for e in self._elements_map[tier.name]] 112 | 113 | else: 114 | if (tier.name, annotation_parent.id) in self._elements_map: 115 | return [poioapi.io.graf.Annotation(e["id"], e["value"], 116 | e["features"]) 117 | for e in self._elements_map[(tier.name, annotation_parent.id)]] 118 | else: 119 | return [] 120 | 121 | def tier_has_regions(self, tier): 122 | if tier.name == "idGroup": 123 | return True 124 | 125 | return False 126 | 127 | def region_for_annotation(self, annotation): 128 | idGroup = [value for key, value in self._elements_map.items() 129 | if "idGroup" in key] 130 | 131 | for elements in idGroup: 132 | for e in elements: 133 | if e["id"] == annotation.id: 134 | return e["region"] 135 | 136 | return None 137 | 138 | def get_primary_data(self): 139 | """This method gets the information about 140 | the source data file. 141 | 142 | Returns 143 | ------- 144 | primary_data : object 145 | PrimaryData object. 146 | 147 | """ 148 | 149 | primary_data = poioapi.io.graf.PrimaryData() 150 | primary_data.type = poioapi.io.graf.NONE 151 | primary_data.filename = "unknown" 152 | 153 | return primary_data 154 | 155 | def _next_id(self): 156 | current_id = str(int(self._current_id) + 1) 157 | self._current_id = current_id 158 | 159 | return current_id 160 | 161 | def _split_region(self, element): 162 | try: 163 | aud = element.find("aud").text 164 | results = re.findall("\d*\.\d+|\d+", aud) 165 | 166 | region = (results[-2], results[-1]) 167 | value = aud.split(results[-2])[0] 168 | except: 169 | value = None 170 | region = None 171 | 172 | return value, region 173 | -------------------------------------------------------------------------------- /src/poioapi/tests/sample_files/elan_graf/example-phonetic_transcription.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | səʋ juː ɡɔ aut əf ðə ɪnstɪtjuːt tʊ zə sant ana straːt 19 | 20 | 21 | 22 | 23 | 24 | 25 | ənd ðen juː ɡəʊ ðɪ ɔθɛ sant ana straːt tʊ ðɪs to ðə sɛntə əf ðə taun tu ðɪs bɪɡ rɔtundə 26 | 27 | 28 | 29 | 30 | 31 | 32 | ænd juː fɔləʊ ðən ðə saɪn kleːf 33 | 34 | 35 | 36 | 37 | 38 | 39 | ðæts ðə ɔranjə sɪŋl 40 | 41 | 42 | 43 | 44 | 45 | 46 | ðen juː fɔləʊ ðə saɪn kleːf 47 | 48 | 49 | 50 | 51 | 52 | 53 | juː kʌm daʊn 54 | 55 | 56 | 57 | 58 | 59 | 60 | juː nɔː ə aftə ðɪs trajanus pleːɪn 61 | 62 | 63 | 64 | 65 | 66 | 67 | juː kʌm daʊn tʊ ðə 68 | 69 | 70 | 71 | 72 | 73 | 74 | raɪn ə vælɪ 75 | 76 | 77 | 78 | 79 | 80 | 81 | jɛː ðæts ənaðə ə 82 | 83 | 84 | 85 | 86 | 87 | 88 | kaɪnd əv rotʊndə 89 | 90 | 91 | 92 | 93 | 94 | 95 | ənd ðen juː fɔləʊ ðə saɪns kleːf 96 | 97 | 98 | 99 | 100 | 101 | 102 | jə ɡɔ daʊn 103 | 104 | 105 | 106 | 107 | 108 | 109 | ənd ðen juː ɡɔ ðə ðɪs weɪ 110 | 111 | 112 | 113 | 114 | 115 | 116 | ə jaː tʊ kleːf 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /src/poioapi/tests/io/test_graf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import os 11 | 12 | import poioapi.io.elan 13 | import poioapi.io.graf 14 | 15 | import xml.etree.ElementTree 16 | 17 | 18 | class TestBaseParser: 19 | 20 | def setup(self): 21 | self.filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 22 | "elan_graf", "example.eaf") 23 | 24 | tree = xml.etree.ElementTree.parse(self.filename) 25 | self.root = tree.getroot() 26 | 27 | def test_get_root_tiers(self): 28 | 29 | tiers = self.root.findall('TIER') 30 | 31 | root_tiers = [] 32 | 33 | for tier in tiers: 34 | if not 'PARENT_REF' in tier.attrib: 35 | root_tiers.append(tier) 36 | 37 | assert(len(root_tiers) == 4) 38 | 39 | def test_get_child_tiers_for_tier(self): 40 | 41 | root_tier = "W-Spch" 42 | tier_childs = self.root.findall("TIER[@PARENT_REF='"+root_tier+"']") 43 | 44 | assert(len(tier_childs) == 2) 45 | 46 | def test_get_annotations_for_tier(self): 47 | 48 | root_tier = "W-Spch" 49 | tier_annotations = self.root.findall("TIER[@TIER_ID='"+root_tier+"']/ANNOTATION") 50 | 51 | assert(len(tier_annotations) == 15) 52 | 53 | def test_create_data_structure(self): 54 | 55 | depends_on_dict = dict() 56 | 57 | structure_elements = ['utterance','words','part_of_speech'] 58 | 59 | parent = '_parent_linguistic_type_ref' 60 | son = '_linguistic_type_ref' 61 | 62 | if parent in depends_on_dict: 63 | if son not in depends_on_dict[parent]: 64 | depends_on_dict[parent].append(son) 65 | else: 66 | depends_on_dict[parent] = [son] 67 | 68 | pass 69 | 70 | 71 | class SimpleParser(poioapi.io.graf.BaseParser): 72 | tiers = ["utterance", "word", "wfw", "graid"] 73 | 74 | utterance_tier = ["this is a test", "this is another test"] 75 | word_tier = [['this', 'is', 'a', 'test'], ['this', 'is', 'another', 'test']] 76 | wfw_tier = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'] 77 | graid_tier = ['i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'] 78 | 79 | def __init__(self): 80 | pass 81 | 82 | def get_root_tiers(self): 83 | return [poioapi.io.graf.Tier("utterance")] 84 | 85 | def get_child_tiers_for_tier(self, tier): 86 | if tier.name == "utterance": 87 | return [poioapi.io.graf.Tier("word")] 88 | if tier.name == "word": 89 | return [poioapi.io.graf.Tier("graid"), poioapi.io.graf.Tier("wfw")] 90 | 91 | return None 92 | 93 | def get_annotations_for_tier(self, tier, annotation_parent=None): 94 | if tier.name == "utterance": 95 | return [poioapi.io.graf.Annotation(i, v) for i, v in enumerate(self.utterance_tier)] 96 | 97 | if tier.name == "word": 98 | return [poioapi.io.graf.Annotation(2 + 4 * annotation_parent.id + i, v) for i, v 99 | in enumerate(self.word_tier[annotation_parent.id])] 100 | 101 | if tier.name == "graid": 102 | return [poioapi.io.graf.Annotation(annotation_parent.id + 10, self.graid_tier[annotation_parent.id - 2])] 103 | 104 | if tier.name == "wfw": 105 | return [poioapi.io.graf.Annotation(annotation_parent.id + 12, self.wfw_tier[annotation_parent.id - 2])] 106 | 107 | return [] 108 | 109 | def tier_has_regions(self, tier): 110 | return False 111 | 112 | def region_for_annotation(self, annotation): 113 | pass 114 | 115 | def get_primary_data(self): 116 | pass 117 | 118 | class TestGrAFConverter: 119 | def setup(self): 120 | self.parser = SimpleParser() 121 | self.converter = poioapi.io.graf.GrAFConverter(self.parser) 122 | self.converter.parse() 123 | 124 | self.graph = self.converter.graf 125 | 126 | def test_get_root_tiers(self): 127 | assert len(self.parser.get_root_tiers()) == 1 128 | 129 | def test_get_child_tiers_for_tier(self): 130 | root_tiers = self.parser.get_root_tiers() 131 | 132 | tier = root_tiers[0] 133 | 134 | child_tier = self.parser.get_child_tiers_for_tier(tier) 135 | 136 | assert len(child_tier) == 1 137 | 138 | def test_get_annotations_for_tier(self): 139 | root_tiers = self.parser.get_root_tiers() 140 | 141 | tier = root_tiers[0] 142 | 143 | child_tier_annotations = self.parser.get_annotations_for_tier(tier) 144 | 145 | assert len(child_tier_annotations) == 2 146 | 147 | def test_get_nodes_from_graf(self): 148 | nodes = self.graph.nodes 149 | 150 | assert len(nodes) == 26 151 | 152 | def test_get_annotation_from_node(self): 153 | node = self.graph.nodes['word..n2'] 154 | annotation = node.annotations._elements[0] 155 | 156 | assert annotation.id == 2 157 | 158 | def test_get_edges_from_graf(self): 159 | edges = self.graph.edges 160 | 161 | assert len(edges) == 18 162 | 163 | def test_get_edge_nodes(self): 164 | edge = self.graph.edges['e2'] 165 | 166 | assert edge.from_node == self.graph.nodes['utterance..n0'] 167 | assert edge.to_node == self.graph.nodes['word..n2'] 168 | 169 | def test_get_annotations_spaces_from_graf(self): 170 | annotation_spaces = self.graph.annotation_spaces 171 | 172 | assert len(annotation_spaces) == 4 173 | assert len(annotation_spaces['utterance']) == 2 174 | assert len(annotation_spaces['word']) == 8 175 | assert len(annotation_spaces['graid']) == 8 176 | 177 | def test_append_tier_hierarchies(self): 178 | filename = os.path.join(os.path.dirname(__file__), "..", "sample_files", 179 | "elan_graf", "example.eaf") 180 | 181 | elan = poioapi.io.elan.Parser(filename) 182 | 183 | converter = poioapi.io.graf.GrAFConverter(elan) 184 | converter.parse() 185 | 186 | expected_tier_hierarchies = ['utterance..W-Spch', 187 | ['words..W-Words', 188 | ['part_of_speech..W-POS']], 189 | ['phonetic_transcription..W-IPA']] 190 | 191 | assert expected_tier_hierarchies in converter.tier_hierarchies 192 | -------------------------------------------------------------------------------- /src/poioapi/io/obt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import re 11 | import codecs 12 | import collections 13 | 14 | import poioapi.io.graf 15 | 16 | re_last_quote = re.compile("[^\"]*$") 17 | 18 | class Parser(poioapi.io.graf.BaseParser): 19 | """ 20 | Class that will handle parse of OBT files. OBT is The Oslo-Bergen-Tagger. 21 | http://www.tekstlab.uio.no/obt-ny/english/index.html 22 | 23 | Code on Github: 24 | https://github.com/noklesta/The-Oslo-Bergen-Tagger 25 | 26 | """ 27 | 28 | def __init__(self, input_stream): 29 | """Class's constructor. 30 | 31 | Parameters 32 | ---------- 33 | stream : str or IOBase 34 | Path of the OBT output file or an IO.stream. 35 | 36 | """ 37 | self._input_stream = None 38 | 39 | self.input_stream = input_stream 40 | self.parse() 41 | 42 | def input_stream(): 43 | doc = "The input_stream property." 44 | def fget(self): 45 | return self._input_stream 46 | def fset(self, value): 47 | if not hasattr(value, 'read'): 48 | self._input_stream = codecs.open(value, "r", "utf-8") 49 | else: 50 | self._input_stream = value 51 | def fdel(self): 52 | del self._input_stream 53 | return locals() 54 | input_stream = property(**input_stream()) 55 | 56 | def parse(self): 57 | """ 58 | This method is called by the constructor. It will parse the input file 59 | and collect the data in intermediate data structures for later 60 | processing. 61 | 62 | """ 63 | current_phrase_id = 0 64 | current_phrase_words = [] 65 | current_id = 1 66 | current_word_id = None 67 | 68 | self._annotations_for_parent = collections.defaultdict(list) 69 | 70 | for line in self.input_stream: 71 | line = line.strip() 72 | if line.startswith("") and line.endswith(""): 73 | current_word = line[6:-7] 74 | current_word_id = current_id 75 | current_id += 1 76 | # add annotation 77 | self._annotations_for_parent[("a{0}".format(current_phrase_id), 78 | "word")].append((poioapi.io.graf.Annotation("a{0}".format( 79 | current_word_id), 80 | current_word))) 81 | current_phrase_words.append(current_word) 82 | 83 | elif not line.startswith('"<'): 84 | last_quote_match = re_last_quote.search(line) 85 | variant = line[1:last_quote_match.start(0)-1] 86 | variant_tags = last_quote_match.group(0).split() 87 | variant_tags = [t for t in variant_tags if t != "<<<" and \ 88 | t != ">>>"] 89 | 90 | current_variant_id = current_id 91 | current_id += 1 92 | self._annotations_for_parent[("a{0}".format(current_word_id), 93 | "variant")].append( 94 | (poioapi.io.graf.Annotation("a{0}".format( 95 | current_variant_id), 96 | variant))) 97 | 98 | for tag in variant_tags: 99 | self._annotations_for_parent[("a{0}".format( 100 | current_variant_id), 101 | "tag")].append( 102 | (poioapi.io.graf.Annotation("a{0}".format( 103 | current_id), 104 | tag))) 105 | current_id += 1 106 | 107 | # create phrase 108 | if "" in variant_tags: 109 | current_phrase = " ".join(current_phrase_words) 110 | self._annotations_for_parent[(None, "phrase")].append( 111 | (poioapi.io.graf.Annotation("a{0}".format( 112 | current_phrase_id), 113 | current_phrase))) 114 | current_phrase_id = current_id 115 | current_id += 1 116 | current_phrase_words = [] 117 | 118 | # Text might not end with a 119 | if current_phrase_words != []: 120 | current_phrase = " ".join(current_phrase_words) 121 | self._annotations_for_parent[(None, "phrase")].append( 122 | (poioapi.io.graf.Annotation("a{0}".format( 123 | current_phrase_id), 124 | current_phrase))) 125 | 126 | def get_root_tiers(self): 127 | """This method retrieves all the root tiers. 128 | 129 | Returns 130 | ------- 131 | list : array-like 132 | List of tiers type. 133 | 134 | """ 135 | 136 | return [poioapi.io.graf.Tier("phrase")] 137 | 138 | def get_child_tiers_for_tier(self, tier): 139 | """This method retrieves all the child tiers 140 | of a specific tier. 141 | 142 | Parameters 143 | ---------- 144 | tier : object 145 | Tier to find the children from. 146 | 147 | Returns 148 | ------- 149 | child_tiers : array-like 150 | List of tiers type. 151 | 152 | """ 153 | 154 | if tier.name == "phrase": 155 | return [poioapi.io.graf.Tier("word")] 156 | elif tier.name == "word": 157 | return [poioapi.io.graf.Tier("variant")] 158 | elif tier.name == "variant": 159 | return [poioapi.io.graf.Tier("tag")] 160 | 161 | def get_annotations_for_tier(self, tier, annotation_parent=None): 162 | parent_id = None 163 | if annotation_parent: 164 | parent_id = annotation_parent.id 165 | return self._annotations_for_parent[(parent_id, tier.name)] 166 | 167 | def tier_has_regions(self, tier): 168 | return False 169 | 170 | def region_for_annotation(self, annotation): 171 | return None 172 | 173 | def get_primary_data(self): 174 | """This method returns the primary data of the OBT file. 175 | 176 | Returns 177 | ------- 178 | primary_data : object 179 | PrimaryData object. 180 | 181 | """ 182 | primary_data = poioapi.io.graf.PrimaryData() 183 | primary_data.type = poioapi.io.graf.NONE 184 | primary_data.filename = "unknown" 185 | 186 | return primary_data 187 | -------------------------------------------------------------------------------- /src/poioapi/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2013 Poio Project 6 | # Author: António Lopes 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """This module contains the tests to the class 11 | DataStrutctureType in data.py module. 12 | 13 | This test serves to ensure the viability of the methods of 14 | the class DataStructureType in data.py. 15 | """ 16 | 17 | from poioapi import data 18 | import os 19 | import filecmp 20 | import nose.tools 21 | 22 | class TestDataStructureType: 23 | """ 24 | This class contain the test methods to the 25 | class data.py. 26 | 27 | """ 28 | 29 | def setup(self): 30 | self.data_structure_type = data.DataStructureType() 31 | self.data_structure_type_graid = data.DataStructureTypeGraid() 32 | 33 | # def test_get_siblings_of_type(self): 34 | # """Raise an assertion if there's no siblings to return. 35 | 36 | # Return all the siblings of a given type in the hierarchy 37 | # including the given type itself. 38 | 39 | # Raises 40 | # ------ 41 | # AssertionError 42 | # If the results there aren't the expected. 43 | 44 | # """ 45 | 46 | # # If the ann_type value equal like this 47 | # ann_type = 'utterance' 48 | 49 | # # The result expected should be 50 | # expected_result = ['utterance'] 51 | # print(self.data_structure_type.get_siblings_of_type(ann_type)) 52 | 53 | # assert(self.data_structure_type.get_siblings_of_type( 54 | # ann_type) == expected_result) 55 | 56 | def test_get_parents_of_type(self): 57 | """Raise an assertion if there's no parents to return. 58 | 59 | Returns all the elements that are above a given type in the type 60 | hierarchy. 61 | 62 | Raises 63 | ------ 64 | AssertionError 65 | If the results there aren't the expected. 66 | 67 | """ 68 | 69 | ann_type = 'utterance' 70 | expected_result = [] 71 | assert (self.data_structure_type.get_parents_of_type( 72 | ann_type) == expected_result) 73 | 74 | ann_type = 'word' 75 | expected_result = ['utterance', 'translation'] 76 | assert (self.data_structure_type.get_parents_of_type( 77 | ann_type) == expected_result) 78 | 79 | 80 | def test__get_parents_of_type_helper(self): 81 | """Raise an assertion if there's no elements to return. 82 | 83 | Helper function for get_parents_of_type. 84 | 85 | Raises 86 | ------ 87 | AssertionError 88 | If the results there aren't the expected. 89 | 90 | """ 91 | 92 | # If the ann_type and hierarchy values equal like this 93 | ann_type = 'clause unit' 94 | hierarchy = \ 95 | ['utterance', 96 | ['clause unit', 97 | ['word', 'wfw', 'graid1'], 98 | 'graid2'], 99 | 'translation', 'comment'] 100 | 101 | # The result expected should be a {tuple} 102 | expected_result = (True, ['utterance', 'translation', 'comment']) 103 | 104 | assert (self.data_structure_type._get_parents_of_type_helper( 105 | ann_type, hierarchy) == expected_result) 106 | 107 | def test_get_children_of_type(self): 108 | """Assert that `get_children_of_type()` works fine. 109 | 110 | `get_children_of_type()` returns all the elements that are beneath a 111 | given type in the type hierarchy. 112 | 113 | Raises 114 | ------ 115 | AssertionError 116 | If the results there aren't the expected. 117 | 118 | """ 119 | 120 | ann_type = 'utterance' 121 | expected_result = ['word', 'translation'] 122 | assert (self.data_structure_type.get_children_of_type( 123 | ann_type) == expected_result) 124 | 125 | data_structure_type = data.DataStructureTypeGraid() 126 | ann_type = 'utterance' 127 | expected_result = ['clause_unit', 'graid2', 'translation', 'comment'] 128 | assert (data_structure_type.get_children_of_type( 129 | ann_type) == expected_result) 130 | 131 | 132 | def test_empty_element(self): 133 | """Raise an assertion if there's no elements to return. 134 | 135 | Return the appended list of a certain data hierarchy. 136 | 137 | Raises 138 | ------ 139 | AssertionError 140 | If the results there aren't the expected. 141 | 142 | """ 143 | 144 | # The expected result to the define data hierarchy 145 | expected_result = [{'id': None, 'annotation': ''}, 146 | [[{'id': None, 'annotation': ''}]], 147 | {'id': None, 'annotation': ''}] 148 | 149 | assert (self.data_structure_type.empty_element() == expected_result) 150 | 151 | def test__append_list(self): 152 | """Raise an assertion if the elements list is invalid. 153 | 154 | Append element values and it's ids to the data structure elements. 155 | 156 | Raises 157 | ------ 158 | AssertionError 159 | If the results there aren't the expected. 160 | 161 | """ 162 | 163 | # If the element value equal like this 164 | element = ['word', 'wfw', 'graid1'] 165 | 166 | # The result expected should be 167 | expected_result = [{'id': None, 'annotation': ''}, 168 | {'id': None, 'annotation': ''}, 169 | {'id': None, 'annotation': ''}] 170 | 171 | assert (self.data_structure_type._append_list( 172 | element) == expected_result) 173 | 174 | def test_test_flatten_hierarchy_elements(self): 175 | """Raise an assertion if the elements aren't correct. 176 | 177 | Flat the elements appended to a new list of elements. 178 | 179 | Raises 180 | ------ 181 | AssertionError 182 | If the results there aren't the expected. 183 | 184 | """ 185 | 186 | # If the elements value equal like this 187 | elements = \ 188 | ['utterance', 189 | ['clause unit', 190 | ['word', 'wfw', 'graid1'], 191 | 'graid2'], 192 | 'translation', 'comment'] 193 | 194 | # The result expected should be 195 | expected_result = ['utterance', 196 | 'clause unit', 197 | 'word', 'wfw', 'graid1', 198 | 'graid2', 199 | 'translation', 200 | 'comment'] 201 | 202 | assert (self.data_structure_type._flatten_hierarchy_elements( 203 | elements) == expected_result) -------------------------------------------------------------------------------- /doc/analysis.rst: -------------------------------------------------------------------------------- 1 | Linguistic analysis and pipelines based on GrAF graphs 2 | ====================================================== 3 | 4 | We think that GrAF graphs can play an important role in the implementation 5 | of scientific workflows in linguistics. Based on the GrAF objects that 6 | Poio API generates you might pipe the data to scientific Python libraries 7 | like `networkx `_, `numpy `_ 8 | or `scipy `_. The American National Corpus implemented 9 | connectors for GrAF and two linguistic frameworks. The conversion of custom 10 | file formats to GrAF through Poio API can thus act as an entry point to those 11 | pipelines and support to merge data and annotation from a wide range of 12 | heteregenous data sources for further analysis. 13 | 14 | 15 | Search in annotation graphs: filters and filter chains 16 | ------------------------------------------------------ 17 | 18 | The **filter** class :py:class:`poioapi.annotationgraph.AnnotationGraphFilter` 19 | can be used to search in annotation graphs in Poio API. The filter class can 20 | only be used together with the annotation graph class 21 | :py:class:`poioapi.annotationgraph.AnnotationGraph`. The idea is that 22 | each annotation graph can contain a set of filters, that each reduce the 23 | full annotation graph to a subset. This list of filters is what we call a 24 | **filter chain**. Each filter consists of search terms for each of the 25 | tiers that were loaded from an input file, as described in section 26 | :ref:`data_structure_types`. The search terms can be simple strings or 27 | regular expressions. 28 | 29 | To be able to apply a filter to an annotation graph you have to load some 30 | data first. In this example we will use the `example file from the Elan 31 | homepage `_. First, we 32 | create a new annotation graph and load the file: 33 | 34 | .. code-block:: python 35 | 36 | import poioapi.annotationgraph 37 | 38 | ag = poioapi.annotationgraph.AnnotationGraph() 39 | ag.from_elan("elan-example3.eaf") 40 | 41 | In the next step we set the default tier hierarchy for the annotation graph. 42 | As the example file contains four root tiers with subtiers we have to choose 43 | one of the hierarchies carefully. In our case we choose the hierarchy with 44 | the root tier `utterance..W-Spch` that we find at index `1` of the 45 | property `ag.tier_hierarchies` after we loaded the file. We choose this 46 | tier hierchary to be used for all subsequent filter operations: 47 | 48 | .. code-block:: python 49 | 50 | ag.structure_type_handler = \ 51 | poioapi.data.DataStructureType(ag.tier_hierarchies[1]) 52 | 53 | In our case the hierarchy `ag.tier_hierarchies[1]` contains the following 54 | tiers: 55 | 56 | .. code-block:: python 57 | 58 | ['utterance..W-Spch', 59 | ['words..W-Words', 60 | ['part_of_speech..W-POS']], 61 | ['phonetic_transcription..W-IPA']] 62 | 63 | Now we are ready to create a filter for the data. We will filter the data 64 | with serch terms on two of the subtiers of our tier hierarchy: we will search 65 | for ``follow`` on the `words` tier and for the regular expression ``\bpro\b`` 66 | on the `POS` tier. We can look up the full names of the tiers in the above 67 | tier hierarchy. The following code creates a filter object and adds the 68 | two search terms for the two tiers: 69 | 70 | .. code-block:: python 71 | 72 | af = poioapi.annotationgraph.AnnotationGraphFilter(ag) 73 | af.set_filter_for_tier("words..W-Words", "follow") 74 | af.set_filter_for_tier("part_of_speech..W-POS", r"\bpro\b") 75 | 76 | The final step is to append the filter to the filter chain of the annotation 77 | graph: 78 | 79 | .. code-block:: python 80 | 81 | ag.append_filter(af) 82 | 83 | The append operation will already start the process of graph filtering. The 84 | result is stored in the property `filtered_node_ids` of the annotation 85 | graph object, which is a list of root nodes where child nodes matched 86 | the search term: 87 | 88 | .. code-block:: python 89 | 90 | print(ag.filtered_node_ids) 91 | [['utterance..W-Spch..na10', 92 | 'utterance..W-Spch..na12', 93 | 'utterance..W-Spch..na19']] 94 | 95 | You can get a visible result set by writing a filtered HTML representation 96 | of the annotation graph: 97 | 98 | .. code-block:: python 99 | 100 | import codecs 101 | html = ag.as_html_table(True) 102 | f = codecs.open("filtered.html", "w", "utf-8") 103 | f.write(html) 104 | f.close() 105 | 106 | You can add more filters to the annotation graph by creating more filter 107 | objects and passing them to `append_filter()`. If you want to remove a filter 108 | you can call `pop_filter()`, which will remove the filter that was last added 109 | to the annotation graph object: 110 | 111 | .. code-block:: python 112 | 113 | ag.pop_filter() 114 | 115 | A convenient way to create filter objects is by passing a dictionary with 116 | tier names and search terms to the method `create_filter_for_dict()` of the 117 | annotation graph object. The following code will create the same filter as in 118 | the example above: 119 | 120 | .. code-block:: python 121 | 122 | search_terms = { 123 | "words..W-Words": "follow", 124 | "part_of_speech..W-POS": r"\bpro\b" 125 | } 126 | af = ag.create_filter_for_dict(search_terms) 127 | 128 | You can then append the filter to the filter chain. A complete script that 129 | demonstrates filters and filter chains is available on Github: 130 | 131 | https://github.com/cidles/poio-api/blob/master/examples/filter.py 132 | 133 | 134 | Real world examples 135 | ------------------- 136 | 137 | Counting word orders 138 | .................... 139 | 140 | The following example is based on the parser explained in section 141 | :ref:`excel_parser`. The whole workflow to count word order in GrAF is 142 | implemented as `IPython notebook `_, which 143 | you can view and download here: 144 | 145 | http://nbviewer.ipython.org/urls/raw.github.com/pbouda/notebooks/master/Diana%20Hinuq%20Word%20Order.ipynb 146 | 147 | 148 | D3.js for visualization 149 | ....................... 150 | 151 | The graf-python documentation contains a nice example how to visualize GrAF 152 | data with the help of the `networkx library `_ 153 | and the Javascript visualization library `D3.js `_: 154 | 155 | https://graf-python.readthedocs.org/en/latest/Translation%20Graph%20from%20GrAF.html 156 | 157 | To just see the example visualization click here: 158 | 159 | http://bl.ocks.org/anonymous/4250342 160 | 161 | 162 | GrAF connectors 163 | --------------- 164 | 165 | The American National Corpus implemented GrAF connectors for the `Unstructured 166 | Information Management applications (Apache UIMA) `_ 167 | fraemwork and the `general architecture for text engineering (GATE) 168 | `_ software. You can download the ANC software here: 169 | 170 | * http://www.anc.org/software/uimautils/ 171 | * http://www.anc.org/software/gate-tools/ 172 | -------------------------------------------------------------------------------- /src/poioapi/mappings/ODIN_TYPECRAFT.json: -------------------------------------------------------------------------------- 1 | { 2 | "tier_mapping": { 3 | "utterance": [ 4 | "phrase" 5 | ], 6 | "word": [ 7 | "word" 8 | ], 9 | "morpheme": [ 10 | "morpheme" 11 | ], 12 | "gloss": [ 13 | "gloss" 14 | ], 15 | "translation": [ 16 | "translation" 17 | ] 18 | }, 19 | "gloss": { 20 | "1": "", 21 | "2": "", 22 | "3": "", 23 | "4": "", 24 | "5": "", 25 | "6": "", 26 | "7": "", 27 | "8": "", 28 | "9": "", 29 | "10": "", 30 | "11": "", 31 | "12": "", 32 | "14": "", 33 | "16": "", 34 | "17": "", 35 | "18": "", 36 | "50": "", 37 | "82": "", 38 | "90": "", 39 | "700": "", 40 | "1776": "", 41 | "1956": "", 42 | "1958": "", 43 | "1990": "", 44 | "1SG, 1S": "1SG", 45 | "3SG, 3S": "3SG", 46 | "A": "", 47 | "AKAN": "", 48 | "COMPL, COMP": "COMPL", 49 | "CONS": "CONS", 50 | "DCLM": "", 51 | "DE, DEF": "DEF", 52 | "FANTE": "", 53 | "FOC": "FOC", 54 | "HAB": "HAB", 55 | "I": "", 56 | "ITIVE": "ITER", 57 | "K": "", 58 | "LOC": "LOC", 59 | "NEG": "NEG", 60 | "PAST, PST": "PAST", 61 | "PERF": "PERF", 62 | "PROG, PRG": "PROG", 63 | "REL": "REL", 64 | "SUBJ, SUB, SB, SBJ": "SBJ", 65 | "TWI": "", 66 | "VENT": "", 67 | "WOMAN": "", 68 | "(ADV)": ["pos","ADV"], 69 | "(DE)": "", 70 | "(PFV": "PFV", 71 | "(POSS, POSS": "POSS", 72 | "(Q": "Q", 73 | "1-CL": "", 74 | "2SG, 2S": "2SG", 75 | "30-MW": "", 76 | "ACC": "ACC", 77 | "ASP": "ASP", 78 | "ASP/SEP": "", 79 | "ATT": "ATT", 80 | "AUX, AUX3": ["pos","AUX"], 81 | "BA": "", 82 | "BE": "", 83 | "BEI": "", 84 | "C": "", 85 | "CAI": "", 86 | "CAUS, CAUSE, CAUSATIVE": "CAUS", 87 | "CL": ["pos","CL"], 88 | "CL)": "", 89 | "CLASS": "CL", 90 | "CLF": "CLF", 91 | "CLS": "", 92 | "CONJ, CNJ": ["pos","CONJ"], 93 | "COP": ["pos","COP"], 94 | "CURR": "", 95 | "D, DET": ["pos","DET"], 96 | "DI": "", 97 | "DISP": "", 98 | "DOU": "", 99 | "DUR": "DUR", 100 | "EXP, EXPL": ["pos","EXPL"], 101 | "FIN": "", 102 | "GE": "", 103 | "GEI": "", 104 | "GEN": "GEN", 105 | "GUO": "", 106 | "HAI": "", 107 | "HO": "", 108 | "HONG": "", 109 | "INDEF": "INDEF", 110 | "JIAN": "", 111 | "JIAO": "", 112 | "JIU": "", 113 | "KA": "", 114 | "L": "", 115 | "LE": "", 116 | "LE,": "", 117 | "MA": "", 118 | "MW": "", 119 | "N1": "", 120 | "N2": "", 121 | "N3": "", 122 | "NE,": "", 123 | "NTNU": "", 124 | "NZL": "", 125 | "O-157": "", 126 | "ONLY": "", 127 | "PAR": "", 128 | "PART, PRT": ["pos","PRT"], 129 | "PASS, PASSIVE": "PASS", 130 | "PC": "", 131 | "PERF/PRT": "", 132 | "PFV": "PFV", 133 | "PKU": "", 134 | "PL": "PL", 135 | "PREP, P": ["pos","PREP"], 136 | "PRF": "PRF", 137 | "PTCL": ["pos","PTCP"], 138 | "Q, Q-PRT": "Q", 139 | "RANG": "", 140 | "RV": "", 141 | "RVC": "", 142 | "SEP": "", 143 | "SFP": "", 144 | "SG": "SG", 145 | "SHI": "", 146 | "SUO": "", 147 | "T": "", 148 | "THERE": "", 149 | "TV": "", 150 | "UC": "", 151 | "USA": "", 152 | "V1": ["pos","V1"], 153 | "V2": ["pos","V2"], 154 | "V3": ["pos","V3"], 155 | "W, WH": "WH", 156 | "Y": "", 157 | "Z": "", 158 | "ZAI": "", 159 | "ZHE": "", 160 | "(D)": ["pos","DET"], 161 | "(F": "", 162 | "DEM": ["pos","DEM"], 163 | "F": "", 164 | "FACT": "", 165 | "IMP": "IMP", 166 | "IMPERF": "IPFV", 167 | "T0P, TOP": "TOP", 168 | "1PL": "1PL", 169 | "2PL": "2PL", 170 | "COM": "", 171 | "CONSEC": "CONSEC", 172 | "DAT, DAT&3, DAT/3": "DAT", 173 | "DEP": "", 174 | "EWE": "", 175 | "IDEO": "", 176 | "IMPERS": "", 177 | "LOG": "", 178 | "NAME": "", 179 | "NPRES": "", 180 | "POT": "", 181 | "PRED": "PRED", 182 | "PRES, PRS, PRESENT": "PRES", 183 | "RED": "", 184 | "TP,": "", 185 | "CM": "", 186 | "LIG": "", 187 | "LIKPE": "", 188 | "APPL": "APPL", 189 | "APPL-PAST": "", 190 | "CAUSE-FV": "", 191 | "FV": "FV", 192 | "IV": "IV", 193 | "10OM": "", 194 | "10S": "", 195 | "10SM": "", 196 | "17SB": "", 197 | "1SM": "", 198 | "2OM": "CL2.OM", 199 | "2SM": "CL2.SM", 200 | "3SM": "CL3.SM", 201 | "4OM": "CL4.OM", 202 | "4P": "", 203 | "7OM": "CL7.OM", 204 | "7S": "CL7.SM", 205 | "8SM": "CL8.SM", 206 | "9SB": "CL9.SM", 207 | "AGR": "AGR", 208 | "AP": "", 209 | "ASC": "", 210 | "CL7": "CL7", 211 | "IND": "IND", 212 | "J": "", 213 | "MID": "", 214 | "MOOD": "MOD", 215 | "O, OBJ": "OBJ", 216 | "OBL": "OBL", 217 | "OM": "OM", 218 | "OP": "", 219 | "PA": "", 220 | "REC, RECIP": "RECP", 221 | "REFL": "REFL", 222 | "S": "", 223 | "SB-REC": "", 224 | "SM": "SM", 225 | "SP": "", 226 | "T-4O": "", 227 | "CONT": "", 228 | "REMOTE": "DIST", 229 | "(PI": "", 230 | "3MASC, 3MAS": "3SG.MASC", 231 | "3PL": "3PL", 232 | "ART": ["pos","ART"], 233 | "E": "", 234 | "CI": "", 235 | "FEM": "FEM", 236 | "FPL": "", 237 | "FUT": "FUT", 238 | "I)": "", 239 | "M, MASC": "MASC", 240 | "M)": "", 241 | "M,": "MASC", 242 | "MPL": "", 243 | "NOM": "NOM", 244 | "PI": "", 245 | "PI)": "", 246 | "PRON": "", 247 | "PRÓPRIA": "", 248 | "PRÓPRIOS": "", 249 | "RS": "", 250 | "SE": "", 251 | "SG/PL": "", 252 | "SI": "", 253 | "DU": "DU", 254 | "N": "", 255 | "NEUTER": "NEUT", 256 | "PCP": "", 257 | "[I": "", 258 | "SCR": "", 259 | "Q/COMP": "", 260 | "T-6O": "", 261 | "TP": "", 262 | "SUBJUNCTIVE": "SBJV" 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /src/poioapi/io/memory.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Poio Tools for Linguists 4 | # 5 | # Copyright (C) 2009-2014 Poio Project 6 | # Author: Peter Bouda 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | This modules provides classes to store informations from Parsers to a data 12 | structure in memors. The data that is stored is equivalent to a GrAF graph but 13 | without the overhead of Python objects. The AnnotationGraph object thus can be 14 | used wit memory or GrAF data storage. 15 | 16 | """ 17 | 18 | import redis 19 | 20 | class MemoryConverter: 21 | """This class handles the conversion of different file formats into memory 22 | data types. It uses a sub-class of BaseParser to get the 23 | annotations and the tier hierarchies. 24 | 25 | """ 26 | 27 | def __init__(self, parser, writer=None): 28 | self.parser = parser 29 | self.tier_hierarchies = [] 30 | self.meta_information = None 31 | self.primary_data = None 32 | self.original_file = None 33 | self.annotations_for_parent = dict() 34 | self.region_for_annotation = dict() 35 | 36 | def parse(self): 37 | """This method will be the responsible to transform 38 | the parser into a redis key/value items. This method also 39 | retrieves and stores the tiers hierarchies. 40 | 41 | """ 42 | 43 | self._tiers_parent_list = [] 44 | self.root_tiers = [] 45 | tiers_hierarchy_map = {} 46 | 47 | for tier in self.parser.get_root_tiers(): 48 | self.root_tiers.append(tier.name) 49 | self._convert_tier(tier, None) 50 | 51 | i = 0 52 | for t in self._tiers_parent_list: 53 | if t[1] is None: 54 | i += 1 55 | tiers_hierarchy_map[str(i)] = [t[0]] 56 | else: 57 | self._append_tier_to_hierarchy(tiers_hierarchy_map[str(i)], 58 | t[1], t[0]) 59 | 60 | for i, hierarchy in tiers_hierarchy_map.items(): 61 | self.tier_hierarchies.append(hierarchy) 62 | 63 | if hasattr(self.parser, 'meta_information'): 64 | self.meta_information = self.parser.meta_information 65 | 66 | self.primary_data = self.parser.get_primary_data() 67 | if hasattr(self.parser, 'filepath') and \ 68 | isinstance(self.parser.filepath, str): 69 | self.original_file = os.path.abspath(self.parser.filepath) 70 | 71 | def _convert_tier(self, tier, parent_annotation, 72 | parent_prefix=None): 73 | child_tiers = self.parser.get_child_tiers_for_tier(tier) 74 | 75 | if tier.annotation_space is None: 76 | prefix = tier.name 77 | annotation_name = prefix 78 | else: 79 | annotation_name = tier.annotation_space.replace(' ', '_') 80 | 81 | prefix = "{0}{1}{2}".format(annotation_name, GRAFSEPARATOR, 82 | tier.name) 83 | 84 | has_regions = False 85 | 86 | if self.parser.tier_has_regions(tier): 87 | has_regions = True 88 | 89 | self._add_tier_in_hierarchy_list(prefix, parent_prefix) 90 | 91 | annotations = self.parser.get_annotations_for_tier(tier, 92 | parent_annotation) 93 | 94 | for annotation in annotations: 95 | region = None 96 | 97 | if has_regions: 98 | region = self.parser.region_for_annotation(annotation) 99 | self.region_for_annotation[annotation.id] = region 100 | 101 | #node_id = NodeId(prefix, annotation.id) 102 | parent_annotation_id = None 103 | if parent_annotation is not None: 104 | parent_annotation_id = parent_annotation.id 105 | self.annotations_for_parent[(parent_annotation_id, tier.name)] = \ 106 | annotation 107 | #self._add_node(node_id, annotation, annotation_name, regions, 108 | # parent_node) 109 | #self._add_root_nodes(prefix, node_id) 110 | 111 | if child_tiers: 112 | for t in child_tiers: 113 | self._convert_tier(t, annotation, prefix) 114 | 115 | if annotations == [] and child_tiers: 116 | for t in child_tiers: 117 | self._convert_tier(t, None, prefix) 118 | 119 | def _add_tier_in_hierarchy_list(self, prefix, parent_prefix): 120 | if not (prefix, parent_prefix) in self._tiers_parent_list: 121 | self._tiers_parent_list.append((prefix, parent_prefix)) 122 | 123 | def _append_tier_to_hierarchy(self, tiers_list, parent_tier, tier): 124 | for t in tiers_list: 125 | if isinstance(t, list): 126 | self._append_tier_to_hierarchy(t, parent_tier, tier) 127 | else: 128 | if t == parent_tier: 129 | tiers_list.append([tier]) 130 | 131 | # def _add_node(self, node_id, annotation, annotation_name, regions, 132 | # from_node_id): 133 | # self._add_node_to_graph(node_id, regions, from_node_id) 134 | # self._add_graf_annotation(annotation_name, annotation.id, node_id, 135 | # annotation.value, annotation.features) 136 | 137 | # def _add_root_nodes(self, prefix, node_id): 138 | # if prefix in self.root_tiers: 139 | # self.graf.header.roots.append(node_id.to_str()) 140 | 141 | # def _add_graf_annotation(self, annotation_name, annotation_id, 142 | # annotation_ref, annotation_value, annotation_features=None): 143 | # annotation = graf.Annotation(annotation_name, annotation_features, 144 | # annotation_id) 145 | 146 | # if annotation_value is not None: 147 | # annotation.features['annotation_value'] = annotation_value 148 | 149 | # self.graf.nodes[annotation_ref.to_str()].annotations.add(annotation) 150 | 151 | # if annotation_name in self.graf.annotation_spaces: 152 | # #if annotation not in self.graf.annotation_spaces[annotation_name]: 153 | # self.graf.annotation_spaces[annotation_name].add(annotation) 154 | # else: 155 | # annotation_space = graf.AnnotationSpace(annotation_name) 156 | # annotation_space.add(annotation) 157 | 158 | # self.graf.annotation_spaces.add(annotation_space) 159 | 160 | # def _add_node_to_graph(self, node_id, regions=None, 161 | # from_node_id=None): 162 | 163 | # node = graf.Node(node_id.to_str()) 164 | 165 | # if from_node_id is not None: 166 | # edge_id = node_id.str_edge() 167 | # self.graf.create_edge(self.graf.nodes[from_node_id.to_str()], node, 168 | # edge_id) 169 | 170 | # if regions is not None: 171 | # region_id = node_id.str_region() 172 | # region = graf.Region(region_id, *regions) 173 | # node.add_region(region) 174 | 175 | # self.graf.regions.add(region) 176 | 177 | # self.graf.nodes.add(node) 178 | --------------------------------------------------------------------------------