├── .appveyor.yml
├── .github
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── .gitignore
├── .travis.yml
├── EVALUATION.md
├── LICENSE.txt
├── README.md
├── ci
│   └── deploy.sh
├── docs
│   ├── Makefile
│   ├── README
│   ├── api.rst
│   ├── conf.py
│   ├── index.rst
│   ├── requirements.txt
│   ├── scripts.rst
│   ├── short_defs.pdf
│   ├── toy.xml
│   ├── ucca_db.rst
│   └── uccaapp.rst
├── requirements.distances.txt
├── requirements.txt
├── requirements.visualize.txt
├── scripts
│   ├── __init__.py
│   ├── annotate.py
│   ├── convert_1_0_to_1_2.py
│   ├── convert_2_0_to_1_2.py
│   ├── convert_articles_and_reflexives.py
│   ├── count_parents_children.py
│   ├── count_tokens.py
│   ├── distances
│   │   ├── __init__.py
│   │   └── align.py
│   ├── evaluate_db.py
│   ├── evaluate_standard.py
│   ├── find_constructions.py
│   ├── fix_tokenization.py
│   ├── join_passages.py
│   ├── join_sdp.py
│   ├── load_word_vectors.py
│   ├── match_text.py
│   ├── normalize.py
│   ├── pickle_to_standard.py
│   ├── remove_br_tokens.py
│   ├── replace_tokens_by_dict.py
│   ├── set_external_id_offline.py
│   ├── site_pickle_to_standard.py
│   ├── site_to_standard.py
│   ├── site_to_text.py
│   ├── split_corpus.py
│   ├── standard_to_json.py
│   ├── standard_to_paragraphs.py
│   ├── standard_to_pickle.py
│   ├── standard_to_sentences.py
│   ├── standard_to_site.py
│   ├── standard_to_text.py
│   ├── statistics.py
│   ├── text_to_standard.py
│   ├── unique_roles.py
│   ├── validate.py
│   ├── visualize.py
│   └── visualize_as_text.py
├── setup.cfg
├── setup.py
├── test_files
│   ├── 120_parsed.xml
│   ├── implicit1.xml
│   ├── implicit1_ref.xml
│   ├── implicit2.xml
│   ├── implicit2_ref.xml
│   ├── site1.xml
│   ├── site2.xml
│   ├── site3.xml
│   ├── site4.xml
│   ├── site5.xml
│   ├── standard3.xml
│   ├── standard3_valid.xml
│   └── toy_bad.xml
├── ucca
│   ├── README.md
│   ├── __init__.py
│   ├── __version__.py
│   ├── constructions.py
│   ├── convert.py
│   ├── core.py
│   ├── diffutil.py
│   ├── evaluation.py
│   ├── ioutil.py
│   ├── layer0.py
│   ├── layer1.py
│   ├── normalization.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   ├── test_constructions.py
│   │   ├── test_convert.py
│   │   ├── test_core.py
│   │   ├── test_evaluation.py
│   │   ├── test_ioutil.py
│   │   ├── test_layer0.py
│   │   ├── test_layer1.py
│   │   ├── test_normalization.py
│   │   ├── test_textutil.py
│   │   ├── test_validation.py
│   │   └── test_visualization.py
│   ├── textutil.py
│   ├── validation.py
│   └── visualization.py
├── ucca_db
│   ├── __init__.py
│   ├── api.py
│   ├── download.py
│   └── upload.py
└── uccaapp
    ├── __init__.py
    ├── api.py
    ├── convert_and_evaluate.py
    ├── copy_categories.py
    ├── create_annotation_tasks.py
    ├── create_tokenization_tasks.py
    ├── download_task.py
    ├── evaluate.py
    ├── export_units_by_filter.py
    ├── get_passage_id.py
    ├── set_external_id.py
    ├── set_tasks_to_ongoing.py
    ├── submit_tasks.py
    ├── tokenize_and_upload.py
    ├── transfer_categories.py
    ├── upload_conllu_passages.py
    ├── upload_streussel_passages.py
    └── upload_task.py

--------------------------------------------------------------------------------
/.appveyor.yml:
--------------------------------------------------------------------------------
1 | os: Visual Studio 2015
2 | 
3 | platform: x64
4 | 
5 | environment:
6 |   MSVC_DEFAULT_OPTIONS: ON
7 |   MINICONDA: "C:\\Miniconda36-x64"
8 | 
9 | configuration: Release
10 | 
11 | init:
12 |   - cmd: cmake --version
13 |   - cmd: msbuild /version
14 | 
15 | install:
16 |   - cmd: git submodule update --init --recursive
17 |   - set PATH=%MINICONDA%;%MINICONDA%\Scripts;%PATH%
18 |   - conda config --set always_yes yes --set changeps1 no
19 |   - conda update -q conda
20 |   - conda info -a
21 |   - conda create -q -n test-env python=3.6 cython numpy matplotlib networkx pytest
22 |   - activate test-env
23 |   - pip install .
24 |   - python -m spacy download en_core_web_md
25 | 
26 | build: off
27 | 
28 | test_script:
29 |   - pytest --durations=0 -v ucca/tests
30 | 

--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ master ]
9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ['3.6', '3.7']
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v2
22 |     - name: Set up Python ${{ matrix.python-version }}
23 |       uses: actions/setup-python@v2
24 |       with:
25 |         python-version: ${{ matrix.python-version }}
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         python -m pip install flake8 pytest
30 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31 |         if [ -f requirements.visualize.txt ]; then pip install -r requirements.visualize.txt; fi
32 |         python -m spacy download en_core_web_md
33 |     - name: Lint with flake8
34 |       run: |
35 |         # stop the build if there are Python syntax errors or undefined names
36 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pytest
42 |     - name: Test TUPA
43 |       run: |
44 |         pip install -U --upgrade-strategy=only-if-needed tupa
45 |         python -m tupa test_files/standard3.xml -t test_files/standard3.xml -I 1 --max-words-external=50 --word-dim=10 --lstm-layer-dim=10 --embedding-layer-dim=10
46 | 

--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Upload Python Package
5 | 
6 | on:
7 |   release:
8 |     types: [created]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__*
2 | *.pyc

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: python
3 | python: 3.6
4 | env:
5 |   global:
6 |     - TWINE_USERNAME=danielh
7 |     - secure: QrZ/47sh/8WeeTLU37yfhW94bwO2ocsbMMIRebSS9Y+FssrCi9IbSuTp6NliXlJq17rozGtEf9alu9JetE8hnivACGJm0cz2/j3oYaeCxz8sbTpXeEr8JHiDk6MCfCD9VMrpeo04RBmI76BY1mwdCvxQSJEn/NtkI9jjSaqjLCLcaFWD7mTuYefxrPplROQJPu+jcW1snnubntuux1nRxULC3Ge/IRWb4OYajLJcPXiVsdleSNV9avLE2xIPTFZf4cwHpRxZslKgHeyCLk+JoDlL0qneB4UWB/SZF8CHoYvidPJDzG5NHAEgfxSqbUq3DRvgVAPqR0YoQd/MQbPLBN6v1aY2zbqHJtTS1xidnnYIs3gJWVAurx6WjkNc9QYwdN22EPmYDVquW2tZgvi2kHRoJY+gEYylJRY0jOzqYmZUV9WOZeeb2AzgXnVjQubEm0NSYCC3BYjkiSmwpDWTcr/HvCQ+9iOI1OD56F7B6oowzXBP0Z/IClMd9Pb3vs9cRr6di/Vf+ijjUeHQxyKHiv2R2mGnPuR8d/gR538xmbc/RlEt2tycMD25SBAeFdtlUfB5Si8llTSd6YktZzZhkHiaIPBYAVEbrK3832TM7B7sGAa8R6Y8gctP6ccE/kFpSdnFHuENgRu2VZBDx6q8UmkArRLbrCvzmbn658EySkc=
8 | jobs:
9 |   include:
10 |     - env: TEST=unit
11 |       install:
12 |         - pip install .
13 |         - pip install pytest
14 |         - pip install -r requirements.visualize.txt
15 |         - python -m spacy download en_core_web_md
16 |       script: pytest --durations=0 -v ucca/tests
17 |       deploy:
18 |         provider: script
19 |         script: ci/deploy.sh
20 |         on:
21 |           repo: huji-nlp/ucca
22 |           tags: true
23 |     - env: TEST=tupa
24 |       install:
25 |         - pip install -U --upgrade-strategy=only-if-needed tupa
26 |         - python -m spacy download en_core_web_md
27 |       script: python -m tupa test_files/standard3.xml -t test_files/standard3.xml -I 1 --max-words-external=50 --word-dim=10 --lstm-layer-dim=10 --embedding-layer-dim=10
28 | 

--------------------------------------------------------------------------------
/EVALUATION.md:
--------------------------------------------------------------------------------
1 | The evaluation process is done through the `evaluate` function, located in the [evaluation.py](ucca/evaluation.py) script.
2 | A wrapper script for `evaluation.py` is [evaluate_standard.py](scripts/evaluate_standard.py). For details on the arguments the script receives, run `evaluate_standard.py --help` at the prompt.
3 | The evaluation process compares the gold-standard annotation of a specific passage with the predicted annotation of that same passage.
4 | Both passages are `Passage` objects, i.e., objects containing the connected graph that represents the annotation of the passage.
5 | The evaluation includes recall, precision and F1 scores. These scores are computed by comparing each edge's labels and yield, i.e., the terminals under the edge's child node (if we view the annotation as a tree).
6 | We can also perform unlabeled evaluation, in which only each edge's yield is compared. It is important to know that a remote edge is ignored in the yield comparison, but is taken into account when comparing edge labels.
7 | Also, when there is an implicit node, edges going into it are evaluated by their parent's yield.
8 | 
9 | Now let us look more closely at the `evaluate` function:
10 | 
11 | The `evaluate` function receives the following input parameters:
12 | 1. guessed: Passage object to evaluate
13 | 2. ref: reference (gold standard) Passage object to compare to
14 | 3. converter: optional function to apply to passages before evaluation. One can choose to convert passages from the following formats to the `Passage` class:
15 |     - site XML
16 |     - standard XML
17 |     - conll (CoNLL-X dependency parsing shared task)
18 |     - sdp (SemEval 2015 semantic dependency parsing shared task)
19 | 4. verbose: whether to print the results
20 | 5. constructions: names of construction types to include in the evaluation. By construction we mean that the evaluation can be restricted to specific types of edges, for example just the Process and State edges.
21 |    The default constructions include the following edges:
22 |     - primary edges (`--constructions=primary`)
23 |     - remote edges (`--constructions=remote`)
24 |     - implicit edges (`--constructions=implicit`)
25 |    Other types of edges that can be included are:
26 |     - aspectual verbs (`--constructions=aspectual_verbs`)
27 |     - light verbs (`--constructions=light_verbs`)
28 |     - multi-word expressions (mwe) (`--constructions=mwe`)
29 |     - predicate nouns (`--constructions=pred_nouns`)
30 |     - predicate adjectives (`--constructions=pred_adjs`)
31 |     - expletives (`--constructions=expletives`)
32 | 
33 |    If the evaluation should be broken down by specific labels, a useful flag is `--constructions=categories`, which shows evaluation results per edge label (category).
34 | 6. units: whether to evaluate common units
35 | 7. fscore: whether to compute precision, recall and F1 score
36 | 8. errors: whether to print the mistakes (prints something similar to a confusion matrix). It is worth mentioning the `--as-table` option in the [evaluate_standard.py](scripts/evaluate_standard.py) script, which prints the confusion matrix as a table.
37 | 9. normalize: flatten centers and move common functions to the root before evaluation - this modifies the passages. There is an option to normalize the passages jointly; to normalize them separately, do so before calling `evaluate`.
38 | 10. eval_type: specific evaluation type(s) to limit to. One can choose any of the following evaluation types:
39 |     - labeled - both the labels of the edges and their yields are compared.
40 |     - unlabeled - only the edges' yields are compared.
41 |     - weak_labeled - certain types of labels are considered the same - for example, Process and State edges are considered the same and only their yields are compared, while Process and Participant are not considered the same.
42 | 11. ref_yield_tags: reference passage for fine-grained evaluation. In other words, it enables evaluating edges by label types that are not part of the UCCA labels, such as subject, object and so on. Nevertheless, the recall, precision and F1 scores will still be calculated based on the UCCA parsing.
43 | 
44 | The `evaluate` function returns a `Scores` object, which contains the recall, precision and F1 scores of the generated annotation.
45 | For example, by running [test_evaluation.py](ucca/tests/test_evaluation.py), the line [Score](ucca/tests/test_evaluation.py#L331) generates a `Scores` object. One of its attributes is `evaluators`, which comprises three `EvaluatorResults` objects:
46 |  - 'labeled'
47 |  - 'unlabeled'
48 |  - 'weak_labeled'
49 | 
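For illustration, here is a minimal sketch of running the evaluation programmatically. The file names are hypothetical placeholders, and the exact `Scores` accessors may differ slightly between versions:

```python
from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

# Hypothetical paths; any guessed/reference passage pair over the same text works.
guessed = file2passage("parsed/ucca_passage123.xml")
ref = file2passage("gold/ucca_passage123.xml")

scores = evaluate(guessed, ref, fscore=True, verbose=True)
print(scores.average_f1())  # aggregate labeled F1
```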
50 | Each of those `EvaluatorResults` objects may contain the results for any of the edge types mentioned above. By default, each contains the results for three types of edges:
51 |  - primary
52 |  - remote
53 |  - implicit
54 | 
55 | The results for each such edge type comprise:
56 |  - errors
57 |  - f1
58 |  - num_guessed
59 |  - num_matches
60 |  - num_only_guessed
61 |  - num_only_ref
62 |  - num_ref
63 |  - p (precision)
64 |  - r (recall)
65 | 
66 | For more details on the `evaluate` function, please see the following links:
67 | 
68 | [evaluate](https://ucca.readthedocs.io/en/latest/api/ucca.evaluation.evaluate.html#ucca.evaluation.evaluate)
69 | 
70 | [Scores](https://ucca.readthedocs.io/en/latest/api/ucca.evaluation.Scores.html#ucca.evaluation.Scores)
71 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Universal Conceptual Cognitive Annotation
2 | ============================
3 | UCCA is a linguistic framework for semantic annotation, whose details
4 | are available at [the following paper](http://aclweb.org/anthology/P13-1023):
5 | 
6 |     @inproceedings{abend2013universal,
7 |       author={Abend, Omri and Rappoport, Ari},
8 |       title={{U}niversal {C}onceptual {C}ognitive {A}nnotation ({UCCA})},
9 |       booktitle={Proc. of ACL},
10 |       month={August},
11 |       year={2013},
12 |       pages={228--238},
13 |       url={http://aclweb.org/anthology/P13-1023}
14 |     }
15 | 
16 | This Python 3 package provides an API to the UCCA annotation and tools to
17 | manipulate and process it. Its main features are conversion between different
18 | representations of UCCA annotations, and rich objects for all of the linguistic
19 | relations which appear in the theoretical framework (see `core`, `layer0`, `layer1`
20 | and `convert` modules under the `ucca` package).
21 | 
22 | The `scripts` package contains various utilities for processing passage files.
23 | 
24 | To parse text to UCCA graphs, use [TUPA, the UCCA parser](https://github.com/danielhers/tupa).
25 | 
26 | 
27 | Authors
28 | ------
29 | * Amit Beka
30 | * Daniel Hershcovich: dh@di.ku.dk
31 | 
32 | 
33 | License
34 | -------
35 | This package is licensed under the GPLv3 or later license.
36 | 
37 | [ ~ Dependencies scanned by PyUp.io ~ ]
38 | [![Build Status (Travis CI)](https://travis-ci.org/danielhers/ucca.svg?branch=master)](https://travis-ci.org/danielhers/ucca)
39 | [![Build Status (AppVeyor)](https://ci.appveyor.com/api/projects/status/github/danielhers/ucca?svg=true)](https://ci.appveyor.com/project/danielh/ucca)
40 | [![Build Status (Docs)](https://readthedocs.org/projects/ucca/badge/?version=latest)](http://ucca.readthedocs.io/en/latest/)
41 | [![PyPI version](https://badge.fury.io/py/UCCA.svg)](https://badge.fury.io/py/UCCA)
42 | 

--------------------------------------------------------------------------------
/ci/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -xe
3 | 
4 | pip install collective.checkdocs twine
5 | python setup.py checkdocs || exit 1
6 | python setup.py sdist bdist_wheel
7 | twine upload --skip-existing dist/*
8 | 
9 | 

--------------------------------------------------------------------------------
/docs/README:
--------------------------------------------------------------------------------
1 | The UCCA Corpus
2 | Version 1.1
3 | 29/12/2015
4 | ===============
5 | See updated guidelines at https://github.com/omriabnd/UCCA-Documents
6 | 
7 | This bundle contains 369 passages annotated according to the foundational layer of UCCA.
8 | The passages are given as XML files in a format which is described below. The total number of tokens
9 | in this corpus is 158771. The bundle also contains the annotation guidelines that were given to the annotators,
10 | a metadata file and a toy example XML.
11 | 
12 | The dataset is a part of the UCCA project developed in the NLP lab of the Hebrew University
13 | by Omri Abend and Ari Rappoport. The users of this dataset are kindly requested to cite the
14 | following publication:
15 | 
16 | "UCCA: A Novel Framework for Semantic Representation" / Omri Abend and Ari Rappoport, ACL 2013
17 | 
18 | Example passages can be graphically viewed through our web application (URL: vm-05.cs.huji.ac.il).
19 | Please refer to our website (URL: homepages.inf.ed.ac.uk/oabend/) or email (oabend@inf.ed.ac.uk)
20 | for regular updates on the UCCA project and available resources.
21 | 
22 | 
23 | Files included
24 | --------------
25 | 1. The passage files in XML format. File names are of the form "ucca_passageXXX.xml", where XXX
26 |    is the passage ID. Please see the UCCA resource webpage for a software package for reading and using
27 |    these files.
28 | 2. toy.xml: a toy example for explaining the UCCA XML format.
29 | 3. metadata: a file that contains some metadata for the passages. Specifically, it contains the source
30 |    of the text used (i.e., the Wikipedia article it was taken from), and the index of the annotator
31 |    that did the final proof-reading (it can be 2, 3 or 6).
32 | 4. guidelines.pdf: the annotation guidelines that were given to the annotators are summarized in
33 |    this file, named "UCCA in a nutshell". Concise definitions are available through the UCCA website
34 |    as well.
35 | 5. short_defs.pdf: a brief summary of the categories used by UCCA's foundational layer.
36 | 
37 | 
38 | XML format:
39 | -----------
40 | 
41 | The XML format allows easy extension with further layers. The top level of each XML file is composed of
42 | the layers annotated over the passage. Each layer has a unique ID and a set of nodes that it introduces.
43 | Each node specifies its outbound edges. The ID of a node is formatted as
44 | "<layer ID>.<node ID>".
45 | 
46 | Layer 0 is a special layer which specifies the tokens of the passage and their linear order. Its nodes
47 | are therefore the tokens themselves. Each node may either be of type "Word" or of type "Punctuation".
48 | The attribute "paragraph" specifies the number of the paragraph the terminal belongs to, while
49 | "paragraph_position" specifies the position of the terminal inside that paragraph. The attribute
50 | "text" specifies the written form of the terminal.
51 | 
52 | Layer 1 is the foundational layer of UCCA. Although non-terminal nodes of UCCA are generally untyped
53 | (their type is effectively determined by their inbound and outbound edges), the XML format does separate
54 | the nodes into three coarse-grained types:
55 | (1) FN (regular node)
56 | (2) PNCT (a node whose only descendant is a punctuation terminal)
57 | (3) LKG (a linkage node).
58 | We note that the node type does not provide any additional information, as it can be deterministically
59 | derived from the identity of its edges. It is therefore only used for easier readability.
60 | 
61 | Each node specifies its outbound edges through its "edge" elements. The ID of the node to which the edge is
62 | directed is specified by the attribute "toID".
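For illustration, a schematic fragment of a layer 1 node with one outbound edge (the element and
attribute names follow the description above; the IDs and the category "A" are made-up values):

    <node ID="1.2" type="FN">
        <edge type="A" toID="0.4"/>
    </node>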
The type of the edge may be either of the following:
63 | (1) any of the 13 categories of the foundational layer (abbreviated as A,P,S,D,C,E,N,R,T,H,L,F,G; see paper).
64 | (2) LR (Link Relation) or LA (Link Argument) for edges between a linkage node and its Linker or Parallel
65 |     Scenes, respectively.
66 | (3) Terminal for an edge to a word terminal.
67 | (4) U for an edge to a punctuation terminal.
68 | 
69 | A node in layer 1 may also be a leaf that represents an implicit unit. In this case, the node would have
70 | an attribute "implicit" with the value "true".
71 | 
72 | Layer 2 is left empty as a place holder where future layers (e.g., coreference, linkage type,
73 | information structure) can be represented. UCCA is designed to allow an open-ended set of layers
74 | to be annotated on top of a given passage.
75 | 
76 | 
77 | Toy example:
78 | ------------
79 | 
80 | The file toy.xml contains the annotation of a simple sentence "After Graduation, Mary moved to New York
81 | City". The terminals can be seen under the element <layer layerID="0">.
82 | 
83 | Consider now the nodes of the foundational layer (those under the element <layer layerID="1">).
84 | 
85 | Consider the node whose ID is "1.1". It has 5 children: one is a Linker (and therefore the edge leading
86 | to it bears the type L), two are Parallel Scenes (the edges leading to them bear the type H), and
87 | two are punctuation marks (the edges leading to them bear the type U).
88 | 
89 | Note that edges leading to terminals (i.e., to nodes in layer0) bear the type 'Terminal'.
90 | 
91 | Consider node "1.13". This node is of type LKG, which means it represents a linkage relation.
92 | It has three children: a Linker (i.e., the linkage relation; the edge has the tag 'LR'), and
93 | two linkage arguments (the edges bear the type 'LA').
94 | 
95 | 
96 | Licensing:
97 | ----------
98 | 
99 | The texts are taken from the English Wikipedia (http://en.wikipedia.org).
100 | The specific articles they were taken from are listed in the metadata file.
101 | The Wikipedia texts, as well as the UCCA annotation, are distributed under the
102 | "Attribution-ShareAlike 3.0 Unported" license (http://creativecommons.org/licenses/by-sa/3.0/).
103 | Please follow the link for exact details.
104 | 
105 | 
106 | ACKNOWLEDGEMENTS:
107 | -----------------
108 | 
109 | We would like to thank Tomer Eshet for partnering in developing the UCCA web application,
110 | and Amit Beka for his help with UCCA's development set and software tools. We would also like
111 | to thank our four annotators for their hard and thorough work.
112 | 

--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 | 
3 | API Documentation
4 | =================
5 | 
6 | Getting Started
7 | ---------------
8 | 
9 | To load UCCA passages from XML files, manipulate them and write to files, use the following code template::
10 | 
11 |     from ucca.ioutil import get_passages_with_progress_bar, write_passage
12 |     for passage in get_passages_with_progress_bar(filenames):
13 |         ...
14 |         write_passage(passage)
15 | 
16 | Each passage instantiates the :class:`ucca.core.Passage` class.
17 | 
18 | XML files can be downloaded from the various `UCCA corpora <https://github.com/UniversalConceptualCognitiveAnnotation>`__.
19 | 
20 | .. automodapi:: ucca.constructions
21 | .. automodapi:: ucca.convert
22 | .. automodapi:: ucca.core
23 | .. automodapi:: ucca.diffutil
24 | .. automodapi:: ucca.evaluation
25 | .. automodapi:: ucca.ioutil
26 | .. automodapi:: ucca.layer0
27 | .. automodapi:: ucca.layer1
28 | .. automodapi:: ucca.normalization
29 | .. automodapi:: ucca.textutil
30 | .. automodapi:: ucca.validation
31 | .. automodapi:: ucca.visualization
32 | 

--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. UCCA documentation master file, created by
2 |    sphinx-quickstart on Sun Oct 28 09:01:22 2018.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | 
7 | .. include:: ../README.rst
8 | 
9 | For more information about how to use this library, see the :ref:`api`.
10 | 
11 | .. toctree::
12 |    :maxdepth: 2
13 |    :caption: Contents:
14 | 
15 |    api
16 |    scripts
17 |    ucca_db
18 |    uccaapp
19 | 
20 | Indices and tables
21 | ==================
22 | 
23 | * :ref:`genindex`
24 | * :ref:`modindex`
25 | * :ref:`search`
26 | 

--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-automodapi>=0.12
2 | 

--------------------------------------------------------------------------------
/docs/scripts.rst:
--------------------------------------------------------------------------------
1 | .. _scripts:
2 | 
3 | Scripts Documentation
4 | =====================
5 | 
6 | .. automodapi:: scripts.annotate
7 | .. automodapi:: scripts.convert_1_0_to_1_2
8 | .. automodapi:: scripts.convert_2_0_to_1_2
9 | .. automodapi:: scripts.count_parents_children
10 | .. automodapi:: scripts.evaluate_db
11 | .. automodapi:: scripts.evaluate_standard
12 | .. automodapi:: scripts.find_constructions
13 | .. automodapi:: scripts.fix_tokenization
14 | .. automodapi:: scripts.join_passages
15 | .. automodapi:: scripts.join_sdp
16 | .. automodapi:: scripts.load_word_vectors
17 | .. automodapi:: scripts.normalize
18 | .. automodapi:: scripts.pickle_to_standard
19 | .. automodapi:: scripts.replace_tokens_by_dict
20 | .. automodapi:: scripts.site_pickle_to_standard
21 | .. automodapi:: scripts.site_to_standard
22 | .. automodapi:: scripts.site_to_text
23 | .. automodapi:: scripts.split_corpus
24 | .. automodapi:: scripts.standard_to_pickle
25 | .. automodapi:: scripts.standard_to_sentences
26 | .. automodapi:: scripts.standard_to_site
27 | .. automodapi:: scripts.standard_to_text
28 | .. automodapi:: scripts.statistics
29 | .. automodapi:: scripts.unique_roles
30 | .. automodapi:: scripts.validate
31 | .. automodapi:: scripts.visualize
32 | 

--------------------------------------------------------------------------------
/docs/short_defs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/docs/short_defs.pdf

--------------------------------------------------------------------------------
/docs/toy.xml:
--------------------------------------------------------------------------------
[164 lines of XML whose markup was stripped during text extraction and is not recoverable here; see the "Toy example" section of docs/README for a description of this file's content.]

--------------------------------------------------------------------------------
/docs/ucca_db.rst:
--------------------------------------------------------------------------------
1 | .. _ucca_db:
2 | 
3 | UCCA DB Documentation
4 | =====================
5 | 
6 | .. automodapi:: ucca_db.api
7 | .. automodapi:: ucca_db.download
8 | .. automodapi:: ucca_db.upload
9 | 

--------------------------------------------------------------------------------
/docs/uccaapp.rst:
--------------------------------------------------------------------------------
1 | .. _uccaapp:
2 | 
3 | UCCA-App API Documentation
4 | ==========================
5 | 
6 | .. automodapi:: uccaapp.api
7 | .. automodapi:: uccaapp.convert_and_evaluate
8 | .. automodapi:: uccaapp.copy_categories
9 | .. automodapi:: uccaapp.create_annotation_tasks
10 | .. automodapi:: uccaapp.create_tokenization_tasks
11 | .. automodapi:: uccaapp.download_task
12 | .. automodapi:: uccaapp.upload_conllu_passages
13 | .. automodapi:: uccaapp.upload_streussel_passages
14 | .. automodapi:: uccaapp.upload_task
15 | 
16 | 

--------------------------------------------------------------------------------
/requirements.distances.txt:
--------------------------------------------------------------------------------
1 | distances>=1.0
2 | zss>=1.2
3 | munkres>=1.0.12

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.15.0
2 | spacy==2.3.5
3 | requests>=2.18.4
4 | tqdm>=4.23.3
5 | 

--------------------------------------------------------------------------------
/requirements.visualize.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.3.3
2 | networkx>=2.0

--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/scripts/__init__.py

--------------------------------------------------------------------------------
/scripts/annotate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import argparse
4 | 
5 | from ucca.ioutil import write_passage, get_passages_with_progress_bar
6 | from ucca.textutil import annotate_all, is_annotated
7 | 
8 | desc = """Read UCCA standard format in XML or binary pickle, and write back with POS tags and dependency parse."""
9 | 
10 | 
11 | def main(args):
12 |     for passage in annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"),
13 |                                 replace=True, as_array=args.as_array, verbose=args.verbose):
14 |         assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
15 |         write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     argparser = argparse.ArgumentParser(description=desc)
20 |     argparser.add_argument("filenames", nargs="+", help="passage file names to annotate")
21 |     argparser.add_argument("-o", "--out-dir", default=".", help="directory to write annotated files to")
22 |     argparser.add_argument("-a", "--as-array", action="store_true", help="save annotations as array in passage level")
23 |     argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
24 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/convert_2_0_to_1_2.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from argparse import ArgumentParser
4 | 
5 | from ucca import layer1
6 | from ucca.ioutil import get_passages_with_progress_bar, write_passage
7 | from ucca.normalization import destroy, copy_edge
8 | 
9 | desc = """Convert the English Wiki corpus from version 2.0 to 1.2"""
10 | 
11 | 
12 | def replace_time_and_quantifier(edge):
13 |     if edge.tag in (layer1.EdgeTags.Time, layer1.EdgeTags.Quantifier):
14 |         edge.tag = layer1.EdgeTags.Adverbial if edge.parent.is_scene() else layer1.EdgeTags.Elaborator
15 |         if len(edge.parent.parents) == 1 and edge.parent.incoming[0].tag == edge.tag:
16 |             for e in edge.parent:
17 |                 copy_edge(e, parent=edge.parent.parents[0])
18 |             destroy(edge.parent)
19 |         return True
20 |     return False
21 | 
22 | 
23 | RULES = (replace_time_and_quantifier,)
24 | 
25 | 
26 | def convert_passage(passage, report_writer):
27 |     for rule in RULES:
28 |         for node in passage.layer(layer1.LAYER_ID).all:
29 |             for edge in node:
30 |                 parent = edge.parent
31 |                 parent_str = str(parent)
32 |                 if rule(edge):
33 |                     report_writer.writerow((rule.__name__, passage.ID, edge, parent_str, parent))
34 | 
35 | 
36 | def main(args):
37 |     os.makedirs(args.outdir, exist_ok=True)
38 |     with open(args.outfile, "w", encoding="utf-8", newline="") as f:
39 |         writer = csv.writer(f)
40 |         writer.writerow(("rule", "passage", "edge", "before", "after"))
41 |         for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
42 |             convert_passage(passage, report_writer=writer)
43 |             write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
44 |             f.flush()
45 |     print("Wrote '%s'" % args.outfile)
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     argparser = ArgumentParser(description=desc)
50 |     argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
51 |     argparser.add_argument("-o", "--outdir", default=".", help="output directory")
52 |     argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
53 |     argparser.add_argument("-O", "--outfile", default=os.path.splitext(argparser.prog)[0] + ".csv", help="log file")
54 |     argparser.add_argument("-v", "--verbose", action="store_true", help="print more information")
55 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/convert_articles_and_reflexives.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from argparse import ArgumentParser
4 | 
5 | from ucca import layer0, layer1
6 | from ucca.ioutil import get_passages_with_progress_bar, write_passage
7 | from ucca.normalization import fparent, copy_edge, traverse_up_centers
8 | 
9 | desc = """Change articles to Function, complying with UCCA v2 guidelines"""
10 | 
11 | ARTICLES = {
12 |     "de": ("der", "die", "das", "den", "dem", "des", "ein", "eine", "einen", "einem", "eines"),
13 |     "en": ("a", "an", "the"),
14 | }
15 | 
16 | REFLEXIVES = {
17 |     "en": ("herself", "himself", "itself", "themselves", "yourself", "yourselves", "myself", "ourselves", "oneself"),
18 | }
19 | 
20 | NONE = {
21 |     "de": ("kein", "keine", "keinen", "keines", "keiner", "keinem"),
22 | }
23 | 
24 | 
25 | def change_article_to_function(terminal, parent, lang):
26 |     if terminal.text.lower() in ARTICLES[lang]:
27 |         for edge in parent.incoming:
28 |             if not edge.attrib.get("remote"):
29 |                 # First, remove Functions to avoid duplicates
30 |                 if len(edge.categories) > 1:
31 |                     edge.categories = [category for category in edge.categories
32 |                                        if category.tag != layer1.EdgeTags.Function]
33 |                 # Then replace Elaborators to Functions
34 |                 for category in edge.categories:
35 |                     if category.tag == layer1.EdgeTags.Elaborator:
36 |                         category.tag = layer1.EdgeTags.Function
37 |                         return True
38 | 
39 | 
40 | def insert_reflexive_into_relation(terminal, parent, lang):
41 |     if terminal.text.lower() in REFLEXIVES.get(lang, ()):
42 |         for edge in parent.incoming:
43 |             if not edge.attrib.get("remote"):
44 |                 for category in edge.categories:
45 |                     if category.tag == layer1.EdgeTags.Adverbial:
46 |                         for grandparent in parent.parents:
47 |                             new_parent = grandparent.process or grandparent.state
48 |                             if new_parent is not None:
49 |                                 while any(layer1.EdgeTags.Center in e.tags for e in new_parent):
50 |                                     new_parent = next(e for e in new_parent if layer1.EdgeTags.Center in e.tags).child
51 |                                 parent.destroy()
52 |                                 new_parent.add(layer1.EdgeTags.Terminal, terminal)
53 |                                 return True
54 | 
55 | 
56 | def change_none_to_quantifier(terminal, parent, lang):
57 |     if terminal.text.lower() in NONE.get(lang, ()):
58 |         parent = traverse_up_centers(parent)
59 |         for edge in parent.incoming:
60 |             if not edge.attrib.get("remote"):
61 |                 for category in edge.categories:
62 |                     if category.tag == layer1.EdgeTags.Adverbial:
63 |                         for participant_edge in edge.parent:
64 |                             if layer1.EdgeTags.Participant in participant_edge.tags:
65 |                                 new_parent = participant_edge.child
66 |                                 if new_parent.start_position == terminal.position + 1:
67 |                                     if not new_parent.centers:
68 |                                         edges = new_parent.outgoing
69 |                                         center = new_parent.layer.add_fnode(new_parent, layer1.EdgeTags.Center)
70 |                                         for sub_edge in edges:
71 |                                             copy_edge(sub_edge, center)
72 |                                             new_parent.remove(sub_edge)
73 |                                     category.tag = layer1.EdgeTags.Quantifier
74 |                                     participant_edge.add(layer1.EdgeTags.Adverbial)
75 |                                     copy_edge(edge, new_parent)
76 |                                     edge.parent.remove(edge)
77 |                                     return True
78 | 
79 | 
80 | RULES = (change_article_to_function, insert_reflexive_into_relation, change_none_to_quantifier)
81 | 
82 | 
83 | def convert_passage(passage, lang, report_writer):
84 |     for rule in RULES:
85 |         for terminal in passage.layer(layer0.LAYER_ID).all:
86 |             parent = fparent(terminal)
87 |             if len(parent.children) == 1 and rule(terminal, parent, lang):
88 |                 report_writer.writerow((rule.__name__, passage.ID, terminal.ID, parent, fparent(terminal)))
89 | 
90 | 
91 | def main(args):
92 |     os.makedirs(args.outdir, exist_ok=True)
93 |     with open(args.outfile, "w", encoding="utf-8", newline="") as f:
94 |         writer = csv.writer(f)
95 |         writer.writerow(("rule", "passage", "terminal", "before", "after"))
96 |         for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
97 |             convert_passage(passage, lang=passage.attrib.get("lang", args.lang), report_writer=writer)
98 |             write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
99 |             f.flush()
100 |     print("Wrote '%s'" % args.outfile)
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     argparser = ArgumentParser(description=desc)
105 |     argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
106 |     argparser.add_argument("-l", "--lang", choices=ARTICLES, help="two-letter language code for article list")
107 |     argparser.add_argument("-o", "--outdir", default=".", help="output directory")
108 |     argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
109 |     argparser.add_argument("-O", "--outfile", default=os.path.splitext(argparser.prog)[0] + ".csv", help="log file")
110 |     argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
111 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/count_parents_children.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import argparse
4 | import sys
5 | from collections import Counter, defaultdict
6 | 
7 | from ucca import layer1
8 | from ucca.ioutil import get_passages_with_progress_bar
9 | 
10 | desc = """Parses XML files in UCCA standard format, and creates a histogram for the number of parents per unit."""
11 | 
12 | 
13 | def plot_histogram(counter, label, plot=None):
14 |     import matplotlib.pyplot as plt
15 |     plt.figure()
16 |     nums = list(counter.keys())
17 |     counts = list(counter.values())
18 |     indices = range(len(counts))
19 |     bars = plt.bar(indices, counts, align="center")
20 |     plt.xticks(indices, nums)
21 |     top = 1.06 * max(counts)
22 |     plt.ylim(min(counts), top)
23 |     plt.xlabel("number of %s" % label)
24 |     plt.ylabel("count")
25 |     for bar in bars:
26 |         count = bar.get_height()
27 |         plt.text(bar.get_x() + bar.get_width() / 2., count, "%.1f%%" % (100.0 * count / sum(counts)),
28 |                  ha="center", va="bottom")
29 |     if plot:
30 |         plt.savefig(plot + "histogram_" + label + ".png")
31 |     else:
32 |         plt.show()
33 | 
34 | 
35 | def plot_pie(counter, label, plot=None):
36 |     import matplotlib.pyplot as plt
37 |     plt.figure()
38 |     nums = list(counter.keys())
39 |     counts = list(counter.values())
40 |     plt.pie(counts, labels=nums, autopct="%1.1f%%",
41 |             counterclock=True, wedgeprops={"edgecolor": "white"})
42 |     plt.axis("equal")
43 |     if plot:
44 |         plt.savefig(plot + "pie_" + label + ".png")
45 |     else:
46 |         plt.show()
47 | 
48 | 
49 | def main(args):
50 |     histograms = defaultdict(Counter)
51 |     for passage in get_passages_with_progress_bar(args.filenames):
52 |         for node in passage.layer(layer1.LAYER_ID).all:
53 |             if node.ID != "1.1":  # Exclude the root node
54 |                 histograms["parents"][clip(node.incoming, 3)] += 1
55 |                 histograms["children"][clip(node.outgoing, 7)] += 1
56 | 
57 |     for label, counter in histograms.items():
58 |         handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout
59 |         handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
60 |         if handle is not sys.stdout:
61 |             handle.close()
62 |         try:
63 |             plot_histogram(counter, label, plot=args.plot)
64 |             plot_pie(counter, label, plot=args.plot)
65 |         except Exception:  # plotting is best-effort; skip on any failure
66 |             pass
67 | 
68 | 
69 | def clip(l, m):
70 |     return len(l) if len(l) <= m else ">%d" % m
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     argparser = argparse.ArgumentParser(description=desc)
75 |     argparser.add_argument("filenames", nargs="+", help="file names to analyze")
76 |     argparser.add_argument("-o", "--outfile", default="data/counts_",
77 |                            help="output file prefix for histogram")
78 |     argparser.add_argument("-p", "--plot", default="data/plot_",
79 |                            help="output file prefix for plot image file")
80 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/count_tokens.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import urllib.request
5 | from itertools import product
6 | 
7 | from ucca import layer0
8 | from uccaapp.download_task import TaskDownloader
9 | 
10 | 
11 | ## def main(output = None, comment = False, sentence_level = False, categories = (), tokens = (), tokens_mode = CONSECUTIVE,
12 | ##          case_insensitive = False, tokens_by_file = False, remotes = False, write = False, **kwargs):
13 | ##     if tokens_by_file:
14 | ##         with open(tokens[0]) as f:
15 | ##             token_lists = [line.strip().split() for line in f]
16 | ##     elif tokens != ():
17 | ##         token_lists = [tokens]
18 | ##     else:
19 | ##         token_lists = ()
20 | 
21 | ##     filtered_nodes = []
22 | ##     for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(write=False, **kwargs):
23 | ##         if sentence_level:
24 | ##             cur_passages = convert.split2sentences(passage)
25 | ##             all_nodes = [p.layer(layer1.LAYER_ID).heads[0] for p in cur_passages]
26 | ##         else:
27 | ##             all_nodes = list(passage.layer(layer1.LAYER_ID).all)
28 | ##         for node in all_nodes:
29 | ##             if comment and node.extra.get("remarks"):
filtered_nodes.append(("comment",node,task_id,user_id)) 31 | ## if remotes and len([n for n in node.outgoing if n.attrib.get("remote")]) > 0: 32 | ## filtered_nodes.append(("remotes", node, task_id, user_id)) 33 | ## if token_lists and not node.attrib.get("implicit"): 34 | ## for token_list in token_lists: 35 | ## unit_tokens = [t.text for t in node.get_terminals(punct=True)] 36 | ## if case_insensitive: 37 | ## unit_tokens = [x.lower() for x in unit_tokens] 38 | ## token_list = [x.lower() for x in token_list] 39 | ## if tokens_match(unit_tokens, token_list, tokens_mode): 40 | ## filtered_nodes.append(('TOKENS', node, task_id, user_id)) 41 | ## else: 42 | ## all_tags = [c.tag for edge in node for c in edge.categories] 43 | ## intersection = set(categories).intersection(all_tags) 44 | 45 | def count_tokens(**kwargs): 46 | output = [] 47 | for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(**kwargs): 48 | num_tokens = len(passage.layer(layer0.LAYER_ID).all) 49 | output.append((num_tokens,task_id,user_id)) 50 | return output 51 | 52 | def main(output=None, tokens=(), **kwargs): 53 | kwargs["write"] = False 54 | f = open(output, 'w', encoding="utf-8") if output else sys.stdout 55 | for num_tokens, task_id, user_id in count_tokens(**kwargs): 56 | print(str(num_tokens), task_id, user_id, file=f, sep="\t", flush=True) 57 | if output: 58 | f.close() 59 | 60 | 61 | if __name__ == "__main__": 62 | argument_parser = argparse.ArgumentParser() 63 | TaskDownloader.add_arguments(argument_parser) 64 | argument_parser.add_argument("--output", help="output file name") 65 | main(**vars(argument_parser.parse_args())) 66 | 67 | -------------------------------------------------------------------------------- /scripts/distances/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/scripts/distances/__init__.py -------------------------------------------------------------------------------- /scripts/evaluate_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The evaluation software for UCCA layer 1. 
4 | """ 5 | 6 | from argparse import ArgumentParser 7 | 8 | from ucca import convert, constructions 9 | from ucca.evaluation import evaluate 10 | from ucca_db import api 11 | 12 | 13 | def main(args): 14 | keys = [args.guessed, args.ref] 15 | xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \ 16 | api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys) 17 | guessed, ref = [convert.from_site(x) for x in xmls] 18 | if args.units or args.fscore or args.errors: 19 | evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors, 20 | constructions=args.constructions, verbose=True) 21 | 22 | 23 | if __name__ == '__main__': 24 | argparser = ArgumentParser(description="Evaluate passages on UCCA DB") 25 | argparser.add_argument("--db", "-d", required=True, dest="db_filename", help="the db file name") 26 | argparser.add_argument("--host", "--hst", help="the host name") 27 | group = argparser.add_mutually_exclusive_group() 28 | group.add_argument("-p", "--pid", type=int, help="the passage ID") 29 | group.add_argument("-x", "--from_xids", action="store_true", 30 | help="interpret the ref and the guessed parameters as Xids in the db") 31 | argparser.add_argument("--guessed", "-g", required=True, 32 | help="if a db is defined - the username for the guessed annotation; " 33 | "else - the xml file name for the guessed annotation") 34 | argparser.add_argument("-r", "--ref", required=True, 35 | help="if a db is defined - the username for the reference annotation; " 36 | "else - the xml file name for the reference annotation") 37 | argparser.add_argument("-u", "--units", action="store_true", 38 | help="the units the annotations have in common, and those each has separately") 39 | argparser.add_argument("-f", "--fscore", action="store_true", 40 | help="outputs the traditional P,R,F instead of the scene structure evaluation") 41 | argparser.add_argument("-e", "--errors", action="store_true", 42 | help="prints the error distribution according to its frequency") 43 | constructions.add_argument(argparser) 44 | main(argparser.parse_args()) 45 | -------------------------------------------------------------------------------- /scripts/find_constructions.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from argparse import ArgumentParser 4 | 5 | from ucca.constructions import extract_candidates, add_argument 6 | from ucca.ioutil import get_passages_with_progress_bar, external_write_mode 7 | 8 | 9 | def main(args): 10 | for passage in get_passages_with_progress_bar(args.passages): 11 | c2es = OrderedDict((c, [candidate.edge for candidate in candidates]) for c, candidates in 12 | extract_candidates(passage, constructions=args.constructions, verbose=args.verbose).items() 13 | if candidates) 14 | if any(c2es.values()): 15 | with external_write_mode(): 16 | if not args.verbose: 17 | print("%s:" % passage.ID) 18 | for construction, edges in c2es.items(): 19 | if edges: 20 | print(" %s:" % construction.description) 21 | for edge in edges: 22 | print(" %s [%s %s]" % (edge, edge.tag, edge.child)) 23 | print() 24 | 25 | 26 | if __name__ == "__main__": 27 | argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.") 28 | argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names") 29 | add_argument(argparser, False) 30 | argparser.add_argument("-v", "--verbose", 
action="store_true", help="print tagged text for each passage") 31 | main(argparser.parse_args()) 32 | -------------------------------------------------------------------------------- /scripts/join_passages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from collections import defaultdict 7 | 8 | import ucca.convert 9 | from ucca.ioutil import passage2file, get_passages 10 | 11 | desc = """Parses XML/pickle files in UCCA standard format, and writes a single passage. 12 | """ 13 | 14 | 15 | def main(args): 16 | os.makedirs(args.outdir, exist_ok=True) 17 | passages = list(get_passages(args.filenames)) 18 | if args.join_by_prefix: 19 | subsets = defaultdict(list) 20 | for passage in passages: 21 | subsets[passage.ID[:-3]].append(passage) 22 | else: 23 | subsets = {passages[0].ID: passages} 24 | for passage_id, subset in sorted(subsets.items()): 25 | print("Joining passages " + ", ".join(passage.ID for passage in subset), file=sys.stderr) 26 | joined = ucca.convert.join_passages(passages, passage_id=passage_id, remarks=args.remarks) 27 | outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, "pickle" if args.binary else "xml") 28 | print("Writing joined passage file '%s'..." % outfile, file=sys.stderr) 29 | passage2file(joined, outfile, binary=args.binary) 30 | 31 | 32 | if __name__ == '__main__': 33 | argparser = argparse.ArgumentParser(description=desc) 34 | argparser.add_argument("filenames", nargs="+", help="passage file names to join") 35 | argparser.add_argument("-o", "--outdir", default=".", help="output directory") 36 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 37 | argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs") 38 | argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)") 39 | argparser.add_argument("-j", "--join-by-prefix", action="store_true", 40 | help="join each set of passages whose IDs share all but the last 3 characters") 41 | main(argparser.parse_args()) 42 | -------------------------------------------------------------------------------- /scripts/join_sdp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import os 6 | import sys 7 | 8 | desc = """Combines several SDP parsed files to one. 
9 | """ 10 | 11 | 12 | def main(args): 13 | lines = [args.prefix + args.header + "\n"] 14 | for pattern in args.filenames: 15 | filenames = sorted(glob.glob(pattern)) 16 | if not filenames: 17 | raise IOError("Not found: " + pattern) 18 | for filename in filenames: 19 | base = os.path.basename(os.path.splitext(filename)[0]) 20 | lines.append(args.prefix + base + "\n") 21 | with open(filename, encoding="utf-8") as f: 22 | lines += f.readlines() 23 | f = sys.stdout if args.outfile is None else open(args.outfile, "w", encoding="utf-8") 24 | f.writelines(lines) 25 | if args.outfile is not None: 26 | f.close() 27 | 28 | 29 | if __name__ == '__main__': 30 | argparser = argparse.ArgumentParser(description=desc) 31 | argparser.add_argument("filenames", nargs="+", 32 | help="SDP file names to join") 33 | argparser.add_argument("-o", "--outfile", 34 | help="output filename (standard output if unspecified)") 35 | argparser.add_argument("-H", "--header", default="SDP 2015", 36 | help="first line in the file, not including prefix") 37 | argparser.add_argument("-p", "--prefix", default="#", 38 | help="prefix for comment lines") 39 | main(argparser.parse_args()) 40 | -------------------------------------------------------------------------------- /scripts/load_word_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | 5 | from ucca.textutil import get_word_vectors 6 | 7 | desc = """Load word vectors file to make sure it works.""" 8 | 9 | 10 | def main(args): 11 | for filename in args.filenames: 12 | vectors, dim = get_word_vectors(size=args.rows, dim=args.dim, filename=filename) 13 | print("Loaded %d rows, dim=%d" % (len(vectors), dim)) 14 | 15 | 16 | if __name__ == '__main__': 17 | argparser = argparse.ArgumentParser(description=desc) 18 | argparser.add_argument("filenames", nargs="+", help="word vector files to load") 19 | argparser.add_argument("-r", "--rows", type=int, help="maximum number of word vectors") 20 | argparser.add_argument("-d", "--dim", type=int, help="maximum dimension of word vectors") 21 | main(argparser.parse_args()) 22 | -------------------------------------------------------------------------------- /scripts/match_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | import sys 4 | from glob import glob 5 | from itertools import groupby 6 | from operator import attrgetter 7 | 8 | from tqdm import tqdm 9 | 10 | from ucca import layer0 11 | from ucca.ioutil import get_passages_with_progress_bar 12 | 13 | 14 | def gen_lines(filenames): 15 | for filename in glob(filenames) or [filenames]: 16 | with open(filename, encoding="utf-8") as f: 17 | try: 18 | for line in map(str.strip, f): 19 | if line and not line.startswith("#"): 20 | yield re.sub(r"\[\d+\]", "", line) # Remove numbers inside brackets 21 | except UnicodeDecodeError as e: 22 | raise IOError("Failed reading '%s'" % filename) from e 23 | 24 | 25 | class CandidateMatcher: 26 | def __init__(self, text): 27 | self.text = text 28 | self.char_map = {} 29 | no_space_chars = [] 30 | for i, char in enumerate(text): 31 | if not char.isspace(): 32 | self.char_map[len(no_space_chars)] = i 33 | no_space_chars.append(char) 34 | self.no_space_text = "".join(no_space_chars) 35 | 36 | def __call__(self, no_space_text): 37 | try: 38 | index = self.no_space_text.index(no_space_text) 39 | return self.text[self.char_map[index]:self.char_map[index + len(no_space_text) - 1] + 1] 40 | 
40 |         except ValueError:
41 |             return None
42 | 
43 | 
44 | def match_passage_text(passage, matchers, out):
45 |     passage_tokens = sorted(passage.layer(layer0.LAYER_ID).all, key=attrgetter("position"))
46 |     for paragraph, terminals in groupby(passage_tokens, key=attrgetter("paragraph")):
47 |         tokens = [terminal.text for terminal in terminals]
48 |         no_space_text = "".join(tokens)
49 |         match = next(filter(None, (matcher(no_space_text) for matcher in matchers)), "@@@" + " ".join(tokens))
50 |         print(passage.ID, match, sep="\t", file=out)
51 | 
52 | 
53 | def alternative_spellings(text):
54 |     yield text
55 | 
56 | 
57 | def main(args):
58 |     matchers = [CandidateMatcher(spelling) for line in tqdm(list(gen_lines(args.text)),
59 |                                                             desc="Indexing " + args.text, unit=" lines")
60 |                 for spelling in alternative_spellings(line)]
61 |     out = open(args.out, "w", encoding="utf-8") if args.out else sys.stdout
62 |     for p in get_passages_with_progress_bar(args.filenames, desc="Matching", converters={}):
63 |         match_passage_text(p, matchers, out)
64 |     out.close()
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     argparser = argparse.ArgumentParser(description="Match UCCA passages to original text and print aligned lines")
69 |     argparser.add_argument("text", help="file of text to match to")
70 |     argparser.add_argument("filenames", nargs="+", help="files or directories of UCCA passages to match")
71 |     argparser.add_argument("-o", "--out", default="text.tsv", help="output file")
72 |     argparser.add_argument("-l", "--lang", default="en", help="spaCy language")
73 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/normalize.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | 
4 | from ucca.ioutil import get_passages_with_progress_bar, write_passage
5 | from ucca.normalization import normalize
6 | 
7 | 
8 | def main(args):
9 |     if args.outdir:
10 |         os.makedirs(args.outdir, exist_ok=True)
11 |     for p in get_passages_with_progress_bar(args.filenames, desc="Normalizing", converters={}):
12 |         normalize(p, extra=args.extra)
13 |         write_passage(p, outdir=args.outdir, prefix=args.prefix, binary=args.binary, verbose=False)
14 | 
15 | 
16 | if __name__ == "__main__":
17 |     argparser = argparse.ArgumentParser(description="Normalize UCCA passages")
18 |     argparser.add_argument("filenames", nargs="+", help="files or directories to normalize")
19 |     argparser.add_argument("-o", "--outdir", default=".", help="output directory")
20 |     argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
21 |     argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
22 |     argparser.add_argument("-e", "--extra", action="store_true", help="extra normalization rules")
23 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/pickle_to_standard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import os
4 | import sys
5 | 
6 | from ucca.ioutil import file2passage, passage2file
7 | 
8 | desc = """Parses pickle files in UCCA standard format, and writes them in XML format.
9 | """ 10 | 11 | 12 | def main(args): 13 | for filename in args.filenames: 14 | sys.stderr.write("Reading passage '%s'...\n" % filename) 15 | passage = file2passage(filename) 16 | basename = os.path.splitext(os.path.basename(filename))[0] 17 | outfile = args.outdir + os.path.sep + basename + ".xml" 18 | sys.stderr.write("Writing file '%s'...\n" % outfile) 19 | passage2file(passage, outfile) 20 | 21 | 22 | if __name__ == '__main__': 23 | argparser = argparse.ArgumentParser(description=desc) 24 | argparser.add_argument('filenames', nargs='+', help="pickle file names to convert") 25 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 26 | main(argparser.parse_args()) 27 | -------------------------------------------------------------------------------- /scripts/remove_br_tokens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import ntpath 4 | import argparse 5 | import os 6 | from xml.etree.ElementTree import tostring 7 | 8 | from ucca import convert 9 | from ucca.ioutil import external_write_mode 10 | from ucca.ioutil import get_passages_with_progress_bar 11 | import xml.etree.ElementTree as ET 12 | 13 | desc = """Removes
<br> tokens from a standard XML."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for fn in args.filenames:
        tree = ET.parse(fn)
        root = tree.getroot()
        to_remove = []
        old_to_new_ID = {}

        for node in root.getiterator():
            if node.tag == 'layer' and node.attrib.get('layerID', None) == "0":
                layer0 = node
                break

        last_parag = "1"
        position_in_paragraph = 0
        position = 1
        for node in layer0.getiterator():
            if node.tag == 'node':
                new_ID = '0.' + str(position)
                old_to_new_ID[node.attrib['ID']] = new_ID
                node.attrib['ID'] = new_ID
                for e in node.iter():
                    if e.tag == 'attributes':
                        if e.attrib.get('text', None) in ['', '<br>']:
                            to_remove.append(node)
                        else:
                            position += 1
                            if e.attrib.get('paragraph', "0") != last_parag:
                                position_in_paragraph = 0
                                last_parag = e.attrib.get('paragraph', "0")
                            position_in_paragraph += 1
                            e.attrib['paragraph_position'] = str(position_in_paragraph)

        for node in to_remove:
            layer0.remove(node)

        # fixing layer1
        for node in root.getiterator():
            if node.tag == 'layer' and node.attrib.get('layerID', None) == "1":
                layer1 = node
                break

        for node in layer1.getiterator():
            if node.tag == 'edge':
                if node.attrib.get("toID", None) in old_to_new_ID.keys():
                    node.attrib["toID"] = old_to_new_ID[node.attrib["toID"]]

        P = convert.from_standard(root)
        xml_str = tostring(root).decode()
        site_filename = os.path.join(args.outdir, ntpath.basename(fn))
        f = open(site_filename, 'w')
        f.write(xml_str)
        f.close()


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="XML file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
    main(argparser.parse_args())
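The renumbering above hinges on mapping old terminal IDs to new consecutive ones before patching layer-1 edges. A minimal self-contained sketch of that idea, using a made-up token list rather than a real passage:

# Sketch: drop unwanted tokens and remap old terminal IDs to new consecutive
# IDs, as remove_br_tokens.py does for layer-0 nodes (toy data, not a passage).
tokens = [("0.1", "Hello"), ("0.2", "<br>"), ("0.3", "world")]
old_to_new_id = {}
kept = []
for old_id, text in tokens:
    if text in ("", "<br>"):
        continue  # removed tokens get no new ID; their nodes are dropped
    new_id = "0.%d" % (len(kept) + 1)
    old_to_new_id[old_id] = new_id
    kept.append((new_id, text))
assert old_to_new_id == {"0.1": "0.1", "0.3": "0.2"}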
-------------------------------------------------------------------------------- /scripts/replace_tokens_by_dict.py: --------------------------------------------------------------------------------
import argparse
import os
from glob import glob

desc = """Replaces the tokens according to a dictionary."""


def read_dictionary_from_file(filename):
    f = open(filename, encoding="utf-8")
    d = {}
    for line in f:
        fields = line.strip().split()
        d[fields[0]] = fields[1]
        d[fields[0].strip().encode('ascii', 'xmlcharrefreplace').decode()] = \
            fields[1].strip().encode('ascii', 'xmlcharrefreplace').decode()
    print(d)
    return d


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    replacement_dict = read_dictionary_from_file(args.dict)
    for pattern in args.filenames:
        for filename in sorted(glob(pattern)) or [pattern]:
            basename = os.path.basename(filename)
            with open(os.path.join(args.out_dir, basename), "w", encoding="utf-8") as outfile:
                with open(filename, encoding="utf-8") as infile:
                    xml_string = infile.read()
                for k, v in replacement_dict.items():
                    if args.whole_word:
                        xml_string = xml_string.replace("text=\"" + k + "\"", "text=\"" + v + "\"")
                    else:
                        xml_string = xml_string.replace(k, v)
                print(xml_string, file=outfile, end="")
    print("Done")


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to replace tokens in")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory for changed XMLs")
    argparser.add_argument("-d", "--dict",
                           help="filename to read the dictionary from. The file should have one line per entry,"
                                " in the format of <token> <replacement>")
    argparser.add_argument("-w", "--whole-word", action="store_true", help="replace whole word")
    main(argparser.parse_args())
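To illustrate the --whole-word flag above: without it, replacement is plain substring substitution, which can corrupt longer tokens. A small sketch with made-up XML text:

# Sketch of the two replacement modes in replace_tokens_by_dict.py (toy input).
xml = '<node text="cat"/><node text="catalog"/>'
k, v = "cat", "dog"
substring = xml.replace(k, v)  # also hits "catalog"
whole_word = xml.replace('text="%s"' % k, 'text="%s"' % v)  # exact token only
assert substring == '<node text="dog"/><node text="dogalog"/>'
assert whole_word == '<node text="dog"/><node text="catalog"/>'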
-------------------------------------------------------------------------------- /scripts/set_external_id_offline.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
import argparse
import os
import sys

from ucca.ioutil import get_passages_with_progress_bar, write_passage

desc = """Rename passages by a given mapping of IDs"""


def main(filename, input_filenames, outdir):
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as f:
        pairs = [line.strip().split() for line in f]
    old_to_new_id = {old_id: new_id for new_id, old_id in pairs}
    for passage in get_passages_with_progress_bar(input_filenames, desc="Renaming"):
        passage._ID = old_to_new_id[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)


if __name__ == "__main__":
    argument_parser = argparse.ArgumentParser(description=desc)
    argument_parser.add_argument("filename", help="file with lines of the form <new ID> <old ID>")
    argument_parser.add_argument("input_filenames", help="filename pattern or directory with input passages")
    argument_parser.add_argument("-o", "--outdir", default=".", help="output directory")
    main(**vars(argument_parser.parse_args()))
    sys.exit(0)
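Note the direction of the mapping file above: each line is <new ID> <old ID>, and the lookup is keyed by the old ID. A tiny sketch with invented IDs:

# Sketch of the inverted lookup built in set_external_id_offline.py (made-up IDs).
lines = ["123456 ex_1", "123457 ex_2"]
pairs = [line.strip().split() for line in lines]
old_to_new_id = {old_id: new_id for new_id, old_id in pairs}
assert old_to_new_id == {"ex_1": "123456", "ex_2": "123457"}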
-------------------------------------------------------------------------------- /scripts/site_pickle_to_standard.py: --------------------------------------------------------------------------------
import argparse
import os
import pickle
from glob import glob
from xml.etree.ElementTree import Element

import ucca.convert
from ucca.ioutil import write_passage

desc = """Parses pickle files containing XML in UCCA site format, and converts them to standard XML"""


def pickle_site2passage(filename):
    """Opens a pickle file containing XML in UCCA site format and returns its parsed Passage object"""
    with open(filename, "rb") as h:
        root = elem = pickle.load(h)
    while isinstance(elem, list):
        try:
            elem = next(e for e in elem if isinstance(e, (Element, list)))
        except StopIteration:
            raise ValueError("Cannot parse %s" % root)
    return ucca.convert.from_site(elem)


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    exceptions = []
    for pattern in args.filenames:
        for filename in sorted(glob(pattern)) or [pattern]:
            print("Reading '%s'..." % filename)
            try:
                passage = pickle_site2passage(filename)
                write_passage(passage, outdir=args.out_dir, binary=args.binary, basename=os.path.basename(filename))
            except ValueError as e:
                exceptions.append((filename, e))
    if exceptions:
        for filename, e in exceptions:
            print("'%s': %s" % (filename, e))


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="*", help="pickle file names to convert")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory")
    argparser.add_argument("-b", "--binary", action="store_true", help="output binary pickle")
    main(argparser.parse_args())
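The unwrapping loop in pickle_site2passage deserves a note: the pickled object may nest the site XML Element inside arbitrarily deep lists, so it descends list by list until an Element surfaces. A standalone sketch with synthetic nesting:

# Sketch of the list-unwrapping in pickle_site2passage (synthetic data).
from xml.etree.ElementTree import Element

elem = [["noise", [Element("root")]]]
while isinstance(elem, list):
    elem = next(e for e in elem if isinstance(e, (Element, list)))
assert isinstance(elem, Element) and elem.tag == "root"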
-------------------------------------------------------------------------------- /scripts/site_to_standard.py: --------------------------------------------------------------------------------
import argparse
import os
import sqlite3
from glob import glob
from xml.etree.ElementTree import ElementTree, fromstring

import ucca.convert
from ucca.ioutil import write_passage

desc = """Parses an XML in UCCA site format.

The input is given either as site-formatted XML files, or as a DB file
together with a passage ID and a user name, in which case the annotation
of the specified user for the specified passage is taken from the DB file.
The output is either standard format XML or a pickled Passage object.
"""


def site2passage(filename):
    """Opens a file and returns its parsed Passage object"""
    with open(filename, encoding="utf-8") as f:
        print("Reading '%s'..." % filename)
        return ucca.convert.from_site(ElementTree().parse(f))


def db2passage(handle, pid, user):
    """Gets the annotation of user to pid from the DB handle - returns a passage"""
    handle.execute("SELECT id FROM users WHERE username=?", (user,))
    uid = handle.fetchone()[0]
    handle.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? ORDER BY ts DESC", (pid, uid))
    return ucca.convert.from_site(fromstring(handle.fetchone()[0]))


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    if args.filenames:
        passages = (site2passage(filename) for pattern in args.filenames
                    for filename in sorted(glob(pattern)) or [pattern])
    else:
        passages = (db2passage(sqlite3.connect(args.db).cursor(), pid, args.user) for pid in args.pids)
    for passage in passages:
        write_passage(passage, outdir=args.out_dir, binary=args.binary)


def check_illegal_combinations(args):
    if args.db and not (args.pids and args.user):
        argparser.error("Must specify a username and a passage ID when using DB file option")
    if (args.pids or args.user) and not args.db:
        argparser.error("Cannot use user and passage ID options without DB file")
    return args


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="*", help="XML file names to convert")
    argparser.add_argument("-d", "--db", help="DB file to get input from")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory for standard XML")
    argparser.add_argument("-b", "--binary", action="store_true", help="write output as binary pickle")
    argparser.add_argument("-p", "--pids", nargs="*", type=int, help="PassageIDs to query DB")
    argparser.add_argument("-u", "--user", help="Username for DB query")
    main(check_illegal_combinations(argparser.parse_args()))
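The DB path of site_to_standard.py expects a SQLite file with users and xmls tables as queried in db2passage. A throwaway in-memory sketch of that lookup; the schema here is assumed from the queries above, not taken from a real DB:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE users (id INTEGER, username TEXT)")
cur.execute("CREATE TABLE xmls (paid INTEGER, uid INTEGER, xml TEXT, ts INTEGER)")
cur.execute("INSERT INTO users VALUES (1, 'annotator')")
cur.execute("INSERT INTO xmls VALUES (101, 1, '<root/>', 1)")
cur.execute("SELECT id FROM users WHERE username=?", ("annotator",))
uid = cur.fetchone()[0]
# ORDER BY ts DESC returns the most recent annotation first, as in db2passage
cur.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? ORDER BY ts DESC", (101, uid))
assert cur.fetchone()[0] == "<root/>"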
-------------------------------------------------------------------------------- /scripts/site_to_text.py: --------------------------------------------------------------------------------
#! /usr/bin/python3

import argparse
import pickle
from xml.etree.ElementTree import ElementTree, fromstring

import psycopg2

import ucca.convert

desc = """Parses an XML in UCCA site format.

The input is given either as a site-formatted XML file, or as a DB file
together with a passage ID and a user name, in which case the annotation
of the specified user for the specified passage is taken from the DB file.
The output is either plain text or a pickled Passage object.
"""


def site2passage(filename):
    """Opens a file and returns its parsed Passage object"""
    with open(filename, encoding="utf-8") as f:
        etree = ElementTree().parse(f)
    return ucca.convert.from_site(etree)


def db2passage(handle, pid, user):
    """Gets the annotation of user to pid from the DB handle - returns a passage"""
    handle.execute("SET search_path to oabend")
    handle.execute("SELECT id FROM users WHERE username=%s", (user,))
    uid = handle.fetchone()[0]
    handle.execute("SELECT xml,ts FROM xmls WHERE paid=%s AND uid=%s " +
                   "ORDER BY ts DESC", (pid, uid))
    raw_xml, ts = handle.fetchone()
    # print('extracted passage from ' + str(ts))
    return ucca.convert.from_site(fromstring(raw_xml))


def main(args):
    # Checking for illegal combinations
    if args.db and args.filename:
        argparser.error("Only one source, XML or DB file, can be used")
    if (not args.db) and (not args.filename):
        argparser.error("Must specify one source, XML or DB file")
    if args.db and not (args.pid and args.user):
        argparser.error("Must specify a username and a passage ID when " +
                        "using DB file option")
    if (args.pid or args.user) and not args.db:
        argparser.error("Cannot use user and passage ID options without DB file")

    if args.filename:
        passage = site2passage(args.filename)
    else:
        conn = psycopg2.connect(host=args.host, database=args.db)
        c = conn.cursor()
        passage = db2passage(c, args.pid, args.user)

    if args.binary:
        with open(args.binary, "wb") as binf:
            pickle.dump(passage, binf)
    else:
        output = ucca.convert.to_text(passage, lang=args.lang)
        if args.outfile:
            with open(args.outfile, "w", encoding="utf-8") as outf:
                outf.write(output)
        else:
            print(output)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filename", nargs="?", help="XML file name to convert")
    argparser.add_argument("-o", "--outfile", help="output file for text")
    argparser.add_argument("-b", "--binary", help="output file for binary pickle")
    argparser.add_argument("-d", "--db", help="DB file to get input from")
    argparser.add_argument("--host", help="DB host server to get input from")
    argparser.add_argument("-p", "--pid", type=int, help="PassageID to query DB")
    argparser.add_argument("-u", "--user", help="Username for DB query")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
    main(argparser.parse_args())
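One easy-to-miss difference from the SQLite variant above: psycopg2 uses the %s paramstyle while sqlite3 uses ?, even though both drivers take the parameters as a sequence. A sketch of the same logical query in both styles (illustration only, not executed against a real server):

sqlite_query = "SELECT id FROM users WHERE username=?"      # sqlite3 paramstyle
postgres_query = "SELECT id FROM users WHERE username=%s"   # psycopg2 paramstyle
params = ("annotator",)  # passed the same way to cursor.execute in both cases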
-------------------------------------------------------------------------------- /scripts/split_corpus.py: --------------------------------------------------------------------------------
import argparse
import os
import re
from shutil import copyfile

desc = """Split a directory of files into "train", "dev" and "test" directories.
All files not in either "train" or "dev" will go into "test".
"""
TRAIN_DEFAULT = 300
DEV_DEFAULT = 34


# TEST on all the rest


def copy(src, dest, link=False):
    if link:
        try:
            os.symlink(src, dest)
        except (NotImplementedError, OSError):
            copyfile(src, dest)
    else:
        copyfile(src, dest)


def numeric(s):
    try:
        return int(re.findall("([0-9]+)", s)[-1])
    except (ValueError, IndexError) as e:
        raise ValueError("Cannot find numeric ID in '%s'" % s) from e


def not_split_dir(filename):
    return filename not in ("train", "dev", "test") and not filename.startswith(".")


def split_passages(directory, train, dev, link, quiet=False):
    filenames = sorted(filter(not_split_dir, os.listdir(directory)), key=numeric)
    assert filenames, "No files to split"
    assert train + dev <= len(filenames), "Not enough files to split: %d+%d>%d" % (train, dev, len(filenames))
    for subdirectory in "train", "dev", "test":
        os.makedirs(os.path.join(directory, subdirectory), exist_ok=True)
    print("%d files to split: %d/%d/%d" % (len(filenames), train, dev, len(filenames) - train - dev))
    print_format = "Creating link in %s to: " if link else "Copying to %s: "
    if not quiet:
        print(print_format % "train", end="", flush=True)
    for f in filenames[:train]:
        copy(os.path.join(directory, f), os.path.join(directory, "train", f), link)
        if not quiet:
            print(f, end=" ", flush=True)
    if not quiet:
        print()
        print(print_format % "dev", end="", flush=True)
    for f in filenames[train:train + dev]:
        copy(os.path.join(directory, f), os.path.join(directory, "dev", f), link)
        if not quiet:
            print(f, end=" ", flush=True)
    if not quiet:
        print()
        print(print_format % "test", end="", flush=True)
    for f in filenames[train + dev:]:
        copy(os.path.join(directory, f), os.path.join(directory, "test", f), link)
        if not quiet:
            print(f, end=" ", flush=True)
    if not quiet:
        print()


def main(args):
    split_passages(os.path.abspath(args.directory), args.train, args.dev, link=args.link, quiet=args.quiet)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("directory", default=".", nargs="?", help="directory to split (default: current directory)")
    argparser.add_argument("-t", "--train", type=int, default=TRAIN_DEFAULT,
                           help="size of train split (default: %d)" % TRAIN_DEFAULT)
    argparser.add_argument("-d", "--dev", type=int, default=DEV_DEFAULT,
                           help="size of dev split (default: %d)" % DEV_DEFAULT)
    argparser.add_argument("-l", "--link", action="store_true", help="create symbolic link instead of copying")
    argparser.add_argument("-q", "--quiet", action="store_true", help="less output")
    main(argparser.parse_args())
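split_passages orders files by the last number embedded in each name rather than lexicographically, which keeps passage 10 after passage 2. A standalone sketch of that sort key:

import re

def last_number(s):  # same idea as numeric() above
    return int(re.findall("([0-9]+)", s)[-1])

names = ["passage10.xml", "passage2.xml", "passage1.xml"]
assert sorted(names, key=last_number) == ["passage1.xml", "passage2.xml", "passage10.xml"]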
-------------------------------------------------------------------------------- /scripts/standard_to_json.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os

from ucca import convert
from ucca.ioutil import external_write_mode
from ucca.ioutil import get_passages_with_progress_bar

desc = """Parses XML files in UCCA standard format, and writes them in the new site format."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".json")
        with open(site_filename, "w", encoding="utf-8") as f:
            print("\n".join(convert.to_json(passage)), file=f)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % site_filename)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="XML file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
    main(argparser.parse_args())
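The conversion scripts in this directory share the same loop shape: make the output directory, iterate passages with a progress bar, and write one output file per passage ID. A generic sketch of that pattern; to_format here is a hypothetical stand-in for convert.to_json, convert.to_site and friends:

import os

def convert_all(passages, outdir, ext, to_format):
    # passages: iterable of (passage ID, passage) pairs; one output file each
    os.makedirs(outdir, exist_ok=True)
    for passage_id, passage in passages:
        with open(os.path.join(outdir, passage_id + ext), "w", encoding="utf-8") as f:
            f.write(to_format(passage))

convert_all([("504", None)], "converted", ".json", lambda p: "{}")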
-------------------------------------------------------------------------------- /scripts/standard_to_paragraphs.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
import sys
from itertools import count

from ucca.convert import split2paragraphs
from ucca.ioutil import passage2file, get_passages_with_progress_bar, external_write_mode
from ucca.normalization import normalize

desc = """Parses XML files in UCCA standard format, and writes a passage per paragraph."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for paragraph in split2paragraphs(
                passage, remarks=args.remarks, lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + paragraph.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for paragraph model")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    argparser.add_argument("-e", "--enumerate", action="store_true", help="set output paragraph ID by global order")
    argparser.add_argument("-N", "--no-normalize", dest="normalize", action="store_false",
                           help="do not normalize passages after splitting")
    argparser.add_argument("-v", "--verbose", action="store_true", help="print information about every split paragraph")
    main(argparser.parse_args())
-------------------------------------------------------------------------------- /scripts/standard_to_pickle.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
import sys

import argparse
import os
from tqdm import tqdm

from ucca.ioutil import file2passage, passage2file, external_write_mode

desc = """Parses XML files in UCCA standard format, and writes them in binary Pickle format."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(args.filenames, desc="Converting", unit=" passages"):
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..."
% outfile, file=sys.stderr) 25 | passage2file(passage, outfile, binary=True) 26 | 27 | 28 | if __name__ == '__main__': 29 | argparser = argparse.ArgumentParser(description=desc) 30 | argparser.add_argument('filenames', nargs='+', help="XML file names to convert") 31 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 32 | argparser.add_argument('-v', '--verbose', action="store_true", help="verbose output") 33 | main(argparser.parse_args()) 34 | -------------------------------------------------------------------------------- /scripts/standard_to_sentences.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from itertools import count 7 | from logging import warning 8 | 9 | from ucca.convert import split2sentences, split_passage 10 | from ucca.ioutil import passage2file, get_passages_with_progress_bar, external_write_mode 11 | from ucca.normalization import normalize 12 | from ucca.textutil import extract_terminals 13 | 14 | desc = """Parses XML files in UCCA standard format, and writes a passage per sentence.""" 15 | 16 | NUM_NODES_WARNING = 500 # Warn if a sentence has more than this many nodes 17 | 18 | 19 | class Splitter: 20 | def __init__(self, sentences, enum=False, suffix_format="%03d", suffix_start=0): 21 | self.sentences = sentences 22 | self.sentence_to_index = {} 23 | for i, sentence in enumerate(sentences): 24 | self.sentence_to_index.setdefault(sentence, []).append(i) 25 | self.enumerate = enum 26 | self.suffix_format = suffix_format 27 | self.suffix_start = suffix_start 28 | self.index = 0 29 | self.matched_indices = set() 30 | 31 | @classmethod 32 | def read_file(cls, filename, **kwargs): 33 | if filename is None: 34 | return None 35 | with open(filename, encoding="utf-8") as f: 36 | sentences = [line.strip() for line in f] 37 | return cls(sentences, **kwargs) 38 | 39 | def split(self, passage): 40 | ends = [] 41 | ids = [] 42 | token_lists = [] 43 | for terminal in extract_terminals(passage): 44 | token_lists.append([]) 45 | for terminals in token_lists if self.index is None else [token_lists[0]]: 46 | terminals.append(terminal) 47 | sentence = " ".join(t.text for t in terminals) 48 | if self.index is not None and self.index < len(self.sentences) and self.sentences[ 49 | self.index].startswith(sentence): # Try matching next sentence rather than shortest 50 | index = self.index if self.sentences[self.index] == sentence else None 51 | else: 52 | indices = self.sentence_to_index.get(sentence) 53 | index = self.index = indices.pop(0) if indices else None 54 | if index is not None: 55 | self.matched_indices.add(index) 56 | last_end = terminals[0].position - 1 57 | if len(terminals) > 1 and last_end and last_end not in ends: 58 | ends.append(last_end) 59 | ends.append(terminal.position) 60 | ids.append(str(index)) 61 | token_lists = [] 62 | self.index += 1 63 | break 64 | return split_passage(passage, ends, ids=ids if self.enumerate else None, 65 | suffix_format=self.suffix_format, suffix_start=self.suffix_start) 66 | 67 | 68 | def main(args): 69 | splitter = Splitter.read_file(args.sentences, enum=args.enumerate, 70 | suffix_format=args.suffix_format, suffix_start=args.suffix_start) 71 | os.makedirs(args.outdir, exist_ok=True) 72 | i = 0 73 | for passage in get_passages_with_progress_bar(args.filenames, "Splitting"): 74 | for sentence in splitter.split(passage) if splitter else split2sentences( 75 | passage, remarks=args.remarks, 
                lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if len(sentence.nodes) > NUM_NODES_WARNING:
                warning(f"Sentence {i} in passage {passage.ID} has {len(sentence.nodes)} > {NUM_NODES_WARNING} nodes")
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:", *[s for i, s in enumerate(splitter.sentences)
                                            if i not in splitter.matched_indices], sep="\n")


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-f", "--suffix-format", default="%03d", help="sentence number suffix format")
    argparser.add_argument("-i", "--suffix-start", type=int, default=0, help="start index for number suffix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    argparser.add_argument("-s", "--sentences", help="optional input file with sentence at each line to split by")
    argparser.add_argument("-e", "--enumerate", action="store_true", help="set each output sentence ID by global order")
    argparser.add_argument("-N", "--no-normalize", dest="normalize", action="store_false",
                           help="do not normalize passages after splitting")
    argparser.add_argument("-v", "--verbose", action="store_true", help="print information about every split sentence")
    main(argparser.parse_args())
-------------------------------------------------------------------------------- /scripts/standard_to_site.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
from xml.etree.ElementTree import tostring

from ucca import convert
from ucca.ioutil import external_write_mode
from ucca.ioutil import get_passages_with_progress_bar

desc = """Parses XML files in UCCA standard format, and writes them in the old site format."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".xml")
        with open(site_filename, "w", encoding="utf-8") as f:
            print(tostring(convert.to_site(passage)).decode(), file=f)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % site_filename)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="XML file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
argparser.add_argument("-v", "--verbose", action="store_true", help="verbose output") 30 | main(argparser.parse_args()) 31 | -------------------------------------------------------------------------------- /scripts/standard_to_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import re 6 | from glob import glob 7 | 8 | from tqdm import tqdm 9 | 10 | from ucca.convert import to_text 11 | from ucca.ioutil import file2passage, get_passages_with_progress_bar 12 | 13 | desc = """Parses files in UCCA standard format, and writes as text files or a text file with a line per passage.""" 14 | 15 | 16 | def numeric(x): 17 | try: 18 | return tuple(map(int, re.findall("\d+", x))) 19 | except ValueError: 20 | return x 21 | 22 | 23 | def write_text(passage, f, sentences, lang, prepend_id=False): 24 | for line in to_text(passage, sentences=sentences, lang=lang): 25 | fields = [passage.ID, line] if prepend_id else [line] 26 | print(*fields, file=f, sep="\t") 27 | 28 | 29 | def main(args): 30 | os.makedirs(args.outdir, exist_ok=True) 31 | if args.join: 32 | out_file = os.path.join(args.outdir, args.join) 33 | with open(out_file, "w", encoding="utf-8") as f: 34 | for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting"): 35 | write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id) 36 | print("Wrote '%s'." % out_file) 37 | else: # one file per passage 38 | for pattern in args.filenames: 39 | for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"): 40 | passage = file2passage(filename) 41 | basename = os.path.splitext(os.path.basename(filename))[0] 42 | with open(os.path.join(args.outdir, basename + ".txt"), "w", encoding="utf-8") as f: 43 | write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id) 44 | 45 | 46 | if __name__ == "__main__": 47 | argparser = argparse.ArgumentParser(description=desc) 48 | argparser.add_argument("filenames", nargs="+", help="passage file names to convert") 49 | argparser.add_argument("-o", "--outdir", default=".", help="output directory") 50 | argparser.add_argument("-s", "--sentences", action="store_true", help="split to sentences using spaCy") 51 | argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model") 52 | argparser.add_argument("-j", "--join", help="write just one text file with this name, with one line per passage") 53 | argparser.add_argument("-p", "--prepend-id", action="store_true", help="prepend the passage ID to the output text") 54 | main(argparser.parse_args()) 55 | -------------------------------------------------------------------------------- /scripts/statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import Counter 5 | 6 | import pandas as pd 7 | 8 | from ucca import layer0, layer1 9 | from ucca.ioutil import get_passages_with_progress_bar 10 | 11 | desc = """Prints statistics on UCCA passages""" 12 | 13 | 14 | def main(args): 15 | df = pd.DataFrame(index=args.directories, columns=["sentences", "tokens", "nodes", "discontinuous", "reentrant", 16 | "implicit", "edges", "primary", "remote"]) 17 | df.fillna(0, inplace=True) 18 | for i, directory in enumerate(args.directories): 19 | row = df.loc[directory] 20 | for passage in 
get_passages_with_progress_bar(directory, desc=directory): 21 | l1 = passage.layer(layer1.LAYER_ID) 22 | non_terminals = [n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1] 23 | edges = {e for n in non_terminals for e in n} 24 | remote_counter = Counter(e.attrib.get("remote", False) for e in edges) 25 | row["sentences"] += 1 26 | row["tokens"] += len(passage.layer(layer0.LAYER_ID).all) 27 | row["nodes"] += len(non_terminals) 28 | row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous) 29 | row["reentrant"] += sum(1 for n in non_terminals if any(e.attrib.get("remote") for e in n.incoming)) 30 | row["edges"] += len(edges) 31 | row["primary"] += remote_counter[False] 32 | row["remote"] += remote_counter[True] 33 | row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit")) 34 | 35 | # Change to percentages 36 | df["discontinuous"] *= 100. / df["nodes"] 37 | df["reentrant"] *= 100. / df["nodes"] 38 | df["implicit"] *= 100. / df["nodes"] 39 | df["primary"] *= 100. / df["edges"] 40 | df["remote"] *= 100. / df["edges"] 41 | 42 | # Print 43 | if args.outfile: 44 | df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n") 45 | print("Saved to " + args.outfile) 46 | else: 47 | with pd.option_context("display.max_rows", None, "display.max_columns", None): 48 | print(df.T) 49 | 50 | 51 | if __name__ == '__main__': 52 | argparser = argparse.ArgumentParser(description=desc) 53 | argparser.add_argument("directories", nargs="+", help="directories to process") 54 | argparser.add_argument("-o", "--outfile", help="output file for statistics") 55 | main(argparser.parse_args()) 56 | -------------------------------------------------------------------------------- /scripts/text_to_standard.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import string 3 | from glob import glob 4 | 5 | from tqdm import tqdm 6 | 7 | from ucca import core, layer0, layer1 8 | from ucca.ioutil import write_passage 9 | 10 | PUNCTUATION = set(string.punctuation) 11 | 12 | 13 | def gen_lines(patterns): 14 | for pattern in patterns: 15 | for filename in glob(pattern) or [pattern]: 16 | with open(filename, encoding="utf-8") as f: 17 | for line in f: 18 | line = line.strip() 19 | if line: 20 | yield line 21 | 22 | 23 | def main(args): 24 | for i, line in enumerate(tqdm(gen_lines(args.filenames), unit=" lines", desc="Creating passages"), start=1): 25 | p = core.Passage(args.format % i) 26 | l0 = layer0.Layer0(p) 27 | layer1.Layer1(p) 28 | for tok in line.split(): 29 | l0.add_terminal(text=tok, punct=PUNCTUATION.issuperset(tok)) 30 | write_passage(p, outdir=args.out_dir, binary=args.binary, verbose=False) 31 | 32 | 33 | if __name__ == "__main__": 34 | argparser = argparse.ArgumentParser(description="Create unannotated passage files from tokenized and split text") 35 | argparser.add_argument("filenames", nargs="+", help="Input filenames containing tokenized and sentence-split text") 36 | argparser.add_argument("-o", "--out-dir", help="Directory to write output files to") 37 | argparser.add_argument("-f", "--format", default="1%04d0", help="String format for passage IDs") 38 | argparser.add_argument("-b", "--binary", action="store_true", help="Write Pickle files instead of XML") 39 | main(argparser.parse_args()) 40 | -------------------------------------------------------------------------------- /scripts/unique_roles.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import Counter 5 | 6 | from ucca import layer1 7 | from ucca.ioutil import get_passages_with_progress_bar 8 | 9 | desc = """Finds edge tags that are empirically always unique: occur at most once in edges per node 10 | """ 11 | 12 | 13 | def main(args): 14 | out = args.direction == "out" 15 | roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items() 16 | if isinstance(tag, str) and not name.startswith('__')) 17 | for passage in get_passages_with_progress_bar([args.directory]): 18 | for node in passage.layer(layer1.LAYER_ID).all: 19 | counts = Counter(edge.tag for edge in (node if out else node.incoming)) 20 | roles.difference_update(tag for tag, count in counts.items() if count > 1) 21 | 22 | lines = "\n".join(sorted(roles)) 23 | print(lines) 24 | if args.outfile: 25 | with open(args.outfile, "w", encoding="utf-8") as f: 26 | print(lines, file=f) 27 | 28 | 29 | if __name__ == '__main__': 30 | argparser = argparse.ArgumentParser(description=desc) 31 | argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process") 32 | argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data") 33 | argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)") 34 | main(argparser.parse_args()) 35 | -------------------------------------------------------------------------------- /scripts/validate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from multiprocessing import Pool 3 | 4 | import argparse 5 | 6 | from ucca.ioutil import get_passages_with_progress_bar, external_write_mode 7 | from ucca.normalization import normalize 8 | from ucca.validation import validate 9 | 10 | 11 | class Validator: 12 | def __init__(self, normalization=False, extra=False, linkage=True, multigraph=False, strict=False): 13 | self.normalization = normalization 14 | self.extra = extra 15 | self.linkage = linkage 16 | self.multigraph = multigraph 17 | self.strict = strict 18 | 19 | def validate_passage(self, passage): 20 | if self.normalization: 21 | normalize(passage, extra=self.extra) 22 | errors = list(validate(passage, linkage=self.linkage, multigraph=self.multigraph)) 23 | passage_id = passage.ID 24 | user_id = passage.attrib.get("userID") 25 | if user_id: 26 | passage_id += " " + user_id 27 | task_id = passage.attrib.get("annotationID") 28 | if task_id: 29 | passage_id += " " + task_id 30 | if self.strict: 31 | print_errors(passage_id, errors) 32 | return passage_id, errors 33 | 34 | 35 | def main(args): 36 | validator = Validator(args.normalize, args.extra, linkage=args.linkage, multigraph=args.multigraph, 37 | strict=args.strict) 38 | with Pool(10) as pool: 39 | errors = pool.map(validator.validate_passage, 40 | get_passages_with_progress_bar(args.filenames, desc="Validating", converters={})) 41 | errors = dict((k, v) for k, v in errors if v) 42 | if errors: 43 | if not args.strict: 44 | id_len = max(map(len, errors)) 45 | for passage_id, es in sorted(errors.items()): 46 | print_errors(passage_id, es, id_len) 47 | sys.exit(1) 48 | else: 49 | print("No errors found.") 50 | 51 | 52 | def print_errors(passage_id, errors, id_len=None): 53 | for i, e in enumerate(errors): 54 | with external_write_mode(): 55 | print("%-*s|%s" % (id_len or len(passage_id), "" if i else passage_id, e), flush=True) 56 | 57 | 58 | def check_args(parser, args): 59 | if args.extra and not 
args.normalize: 60 | parser.error("Cannot specify --extra without --normalize") 61 | return args 62 | 63 | 64 | if __name__ == "__main__": 65 | argparser = argparse.ArgumentParser(description="Validate UCCA passages") 66 | argparser.add_argument("filenames", nargs="+", help="files or directories to validate") 67 | argparser.add_argument("-S", "--strict", action="store_true", help="fail as soon as a violation is found") 68 | argparser.add_argument("-n", "--normalize", action="store_true", help="normalize before validation") 69 | argparser.add_argument("-e", "--extra", action="store_true", help="extra normalization rules") 70 | argparser.add_argument("--no-linkage", dest="linkage", action="store_false", help="skip linkage validations") 71 | argparser.add_argument("--multigraph", action="store_true", help="allow multiple edges with the same parent+child") 72 | main(check_args(argparser, argparser.parse_args())) 73 | -------------------------------------------------------------------------------- /scripts/visualize.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | 4 | from ucca import visualization, layer0 5 | from ucca.convert import split2sentences 6 | from ucca.ioutil import get_passages, get_passages_with_progress_bar, external_write_mode 7 | 8 | 9 | def print_text(args, text, suffix): 10 | if args.out_dir: 11 | with open(os.path.join(args.out_dir, suffix), "w") as f: 12 | print(text, file=f) 13 | else: 14 | with external_write_mode(): 15 | print(text) 16 | 17 | 18 | def main(args): 19 | if args.out_dir: 20 | os.makedirs(args.out_dir, exist_ok=True) 21 | if not args.tikz: 22 | import matplotlib 23 | matplotlib.use('Agg') 24 | to_stdout = (args.tikz or args.standoff) and not args.out_dir 25 | t = args.passages 26 | t = get_passages(t) if to_stdout else get_passages_with_progress_bar(t, desc="Visualizing") 27 | if args.sentences: 28 | t = (sentence for passage in t for sentence in split2sentences(passage)) 29 | for passage in t: 30 | if args.tikz: 31 | print_text(args, visualization.tikz(passage), passage.ID + ".tikz.txt") 32 | elif args.standoff: 33 | print_text(args, visualization.standoff(passage), passage.ID + ".ann") 34 | else: 35 | import matplotlib.pyplot as plt 36 | width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27 37 | plt.figure(passage.ID, figsize=(width, width * 10 / 19)) 38 | visualization.draw(passage, node_ids=args.node_ids) 39 | if args.out_dir: 40 | plt.savefig(os.path.join(args.out_dir, passage.ID + "." 
+ args.format))
                plt.close()
            else:
                plt.show()


if __name__ == "__main__":
    argparser = ArgumentParser(description="Visualize the given passages as graphs.")
    argparser.add_argument("passages", nargs="+", help="UCCA passages, given as xml/pickle file names")
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("-t", "--tikz", action="store_true", help="print tikz code rather than showing plots")
    group.add_argument("-s", "--standoff", action="store_true", help="print standoff code rather than showing plots")
    argparser.add_argument("-o", "--out-dir", help="directory to save figures in (otherwise displayed immediately)")
    argparser.add_argument("-i", "--node-ids", action="store_true", help="print node IDs on the graph nodes")
    argparser.add_argument("-f", "--format", choices=("png", "svg"), default="png", help="image format")
    argparser.add_argument("--sentences", action="store_true", help="split to sentences to avoid huge plots")
    main(argparser.parse_args())
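The hard-coded constants in the matplotlib branch above encode a simple sizing heuristic: figure width grows linearly with the number of layer-0 terminals (19 inches at 27 tokens), and height keeps a 19:10 aspect ratio. A sketch of the arithmetic:

num_terminals = 27  # number of layer-0 terminals in the passage
width = num_terminals * 19 / 27
figsize = (width, width * 10 / 19)
assert figsize == (19.0, 10.0)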
-------------------------------------------------------------------------------- /scripts/visualize_as_text.py: --------------------------------------------------------------------------------
from argparse import ArgumentParser

from ucca import convert
from ucca.constructions import add_argument
from ucca.convert import split2sentences
from ucca.ioutil import get_passages_with_progress_bar

# Script by Sriram Chaudhury


descr = {'Q': 'Unknown', 'T': 'Unknown', 'Terminal': 'Terminal_node', 'P': 'Process', 'S': 'State',
         'A': 'Participant', 'D': 'Adverbial', 'C': 'Center', 'E': 'Elaborator', 'N': 'Connector',
         'R': 'Relator', 'H': 'Parallel_Scene', 'L': 'Linker', 'G': 'Ground', 'F': 'Function',
         'U': 'Punctuation'}


def find_path(node, path):
    if len(node.parents) >= 1:
        if (node.tag != 'Word') and (node.tag != 'Punctuation'):
            path.append('-->(' + node.ftag + ':' + descr[node.ftag] + ')-->' + node.parents[0].ID)
        else:
            path.append(node.text + '(' + str(node.ID) + ')' + '--Terminal-->' + node.parents[0].ID)
    for j in node.parents:
        find_path(j, path)
    return path


def find_children(node, path, level):
    remote_found = 0
    for edge in node:
        if edge.attrib.get('remote'):
            t12 = edge
            remote_found = 1

    for ch in node.children:
        if (ch.tag != 'Word') and (ch.tag != 'Punctuation'):
            if remote_found and (ch.ID == t12.child.ID):
                path.append((ch.ftag, ch.ID + '*', level + 1, True))
            else:
                path.append((ch.ftag, ch.ID, level + 1, False))
            find_children(ch, path, level + 1)
        else:
            path.append((ch.text, ch.ID, level + 1, False))
    path.append('End')

    return path


def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        sen_no = 0
        for sen in t:
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))

            root = sen.nodes['1.1']
            first = 1
            tab_len = {}
            tab_len[0] = len('1.1')
            for i in root.children:
                print('\n')
                path = []
                level = 1
                path.append((i.ftag, i.ID, level, False))
                path = find_children(i, path, level)
                end = 0
                if first:
                    pstr = root.ID
                    first = 0
                else:
                    for k in range(0, tab_len[0]):
                        pstr = pstr + ' '
                for j in path:
                    if j == 'End':
                        print(pstr)
                        pstr = ''
                        end = 1
                        continue
                    rel = j[0]
                    nd = j[1]
                    tab = int(j[2])
                    remote = j[3]
                    if end:
                        q_mark = 0
                        for k in range(0, tab_len[tab - 1]):
                            if k == tab_len[q_mark]:
                                pstr = pstr + '.'
                                q_mark += 1
                            else:
                                pstr = pstr + ' '
                        end = 0
                    if rel in descr:
                        rel_desc = rel + ':' + descr[rel]
                    else:
                        rel_desc = rel
                    if remote:
                        pstr = pstr + '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr = pstr + '|-->(' + rel_desc + ')-->' + nd
                    tab_len[tab] = len(pstr)

            print('-----------------------------------\n')
            sen_no += 1


if __name__ == "__main__":
    argparser = ArgumentParser(description="Xml to conll and find the path of the word from UCCA xml file.")
    argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    add_argument(argparser, False)
    main(argparser.parse_args())
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
[metadata]
description-file = README.md
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
#!/usr/bin/env python

import sys

import os
import re
from glob import glob
from setuptools import setup, find_packages

from ucca.__version__ import VERSION

try:
    this_file = __file__
except NameError:
    this_file = sys.argv[0]
os.chdir(os.path.dirname(os.path.abspath(this_file)))

extras_require = {}
install_requires = []
for requirements_file in glob("requirements.*txt"):
    suffix = re.match(r"[^.]*\.(.*)\.?txt", requirements_file).group(1).rstrip(".")
    with open(requirements_file) as f:
        (extras_require.setdefault(suffix, []) if suffix else install_requires).extend(f.read().splitlines())

with open('README.md', encoding='utf-8') as f:
    long_description = f.read()

setup(name="UCCA",
      version=VERSION,
      install_requires=install_requires,
      extras_require=extras_require,
      description="Universal Conceptual Cognitive Annotation",
      long_description=long_description,
      long_description_content_type='text/markdown',
      author="Daniel Hershcovich",
      author_email="danielh@cs.huji.ac.il",
      url="https://github.com/huji-nlp/ucca",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Intended Audience :: Science/Research",
          "Programming Language :: Python :: 3.6",
          "Topic :: Text Processing :: Linguistic",
          "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
      ],
      packages=find_packages(),
      )
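setup.py derives pip extras from the requirements file names listed at the top of the repository: the middle part of requirements.<suffix>.txt becomes an extra, and the bare requirements.txt feeds install_requires. A sketch of that regex, with the expected outputs in the comments:

import re

for name in ["requirements.txt", "requirements.distances.txt", "requirements.visualize.txt"]:
    suffix = re.match(r"[^.]*\.(.*)\.?txt", name).group(1).rstrip(".")
    print(name, "->", repr(suffix))
# requirements.txt -> ''                    (goes to install_requires)
# requirements.distances.txt -> 'distances' (pip install ucca[distances])
# requirements.visualize.txt -> 'visualize' (pip install ucca[visualize])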
-------------------------------------------------------------------------------- /test_files/implicit1.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/implicit1.xml
-------------------------------------------------------------------------------- /test_files/implicit1_ref.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/implicit1_ref.xml
-------------------------------------------------------------------------------- /test_files/implicit2.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/implicit2.xml
-------------------------------------------------------------------------------- /test_files/site1.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/site1.xml
-------------------------------------------------------------------------------- /test_files/site2.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/site2.xml
-------------------------------------------------------------------------------- /test_files/site3.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/site3.xml
-------------------------------------------------------------------------------- /test_files/standard3.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/standard3.xml
-------------------------------------------------------------------------------- /test_files/toy_bad.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/toy_bad.xml
-------------------------------------------------------------------------------- /ucca/README.md: --------------------------------------------------------------------------------
`ucca` package
====================

List of Modules
---------------
1. `constructions`: extracting linguistic constructions from text
1. `convert`: converting between UCCA objects and various formats
1. `core`: basic objects of UCCA relations: `Node`, `Edge`, `Layer` and `Passage`
1. `evaluation`: comparing passages and inspecting the differences
1. `ioutil`: reading and writing `Passage` objects
1. `layer0`: text layer objects: `Layer0` and `Terminal`
1. `layer1`: foundational layer objects: `Layer1`, `FoundationalNode`, `PunctNode` and `Linkage`
1. `normalization`: modifying `Passage`s to standardized conventions
`textutil`: text processing utilities, including NLP pipeline
15 | 1. `validation`: checks for validity of `Passage`s
16 | 1. `visualization`: draw `Passage` as graph
17 | 
18 | In addition, the `tests` package enables unit-testing.
19 | 
20 | Authors
21 | ------
22 | * Amit Beka: amit.beka@gmail.com
23 | * Daniel Hershcovich: danielh@cs.huji.ac.il
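A minimal usage sketch tying these modules together (an editorial illustration, not text from the repository; it uses the bundled test passage `test_files/standard3.xml`, but any passage in the standard XML format would do):

    from ucca import normalization, validation
    from ucca.evaluation import evaluate
    from ucca.ioutil import read_files_and_dirs

    # Read one passage from a standard-format XML file.
    passage = next(iter(read_files_and_dirs(["test_files/standard3.xml"])))
    normalization.normalize(passage)              # apply standardized conventions in place
    errors = list(validation.validate(passage))   # empty if the passage is valid
    score = evaluate(passage, passage)            # a passage compared to itself scores F1 = 1
    score.print()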
-------------------------------------------------------------------------------- /ucca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/ucca/__init__.py -------------------------------------------------------------------------------- /ucca/__version__.py: --------------------------------------------------------------------------------
1 | VERSION = "1.3.11"
2 | # noinspection PyBroadException
3 | try:
4 | from subprocess import check_output, DEVNULL
5 | GIT_VERSION = check_output(["git", "describe", "--tags", "--always"], stderr=DEVNULL).decode().strip().lstrip("v")
6 | except:
7 | GIT_VERSION = VERSION
8 | -------------------------------------------------------------------------------- /ucca/diffutil.py: --------------------------------------------------------------------------------
1 | import sys
2 | 
3 | from ucca.ioutil import passage2file
4 | 
5 | 
6 | def diff_passages(true_passage, pred_passage, write=False):
7 | """
8 | Debug method to print missing or mistaken attributes, nodes and edges
9 | """
10 | lines = list()
11 | if not true_passage._attrib.equals(pred_passage._attrib):
12 | lines.append("Passage attributes mismatch: %s, %s" %
13 | (true_passage._attrib, pred_passage._attrib))
14 | try:
15 | for lid, l1 in true_passage._layers.items():
16 | l2 = pred_passage.layer(lid)  # the layer with the same ID in the predicted passage
17 | if not l1._attrib.equals(l2._attrib):
18 | lines.append("Layer %s attributes mismatch: %s, %s" %
19 | (lid, l1._attrib, l2._attrib))
20 | except KeyError: # no layer with same ID found
21 | lines.append("Missing layer: %s, %s" %
22 | (true_passage._layers, pred_passage._layers))
23 | pred_ids = {node.extra.get("remarks", node.ID): node
24 | for node in pred_passage.missing_nodes(true_passage)}
25 | true_ids = {node.ID: node
26 | for node in true_passage.missing_nodes(pred_passage)}
27 | for pred_id, pred_node in list(pred_ids.items()):
28 | true_node = true_ids.get(pred_id)
29 | if true_node:
30 | pred_ids.pop(pred_id)
31 | true_ids.pop(pred_id)
32 | pred_edges = {edge.tag + "->" + edge.child.ID: edge for edge in
33 | pred_node.missing_edges(true_node)}
34 | true_edges = {edge.tag + "->" + edge.child.ID: edge for edge in
35 | true_node.missing_edges(pred_node)}
36 | intersection = set(pred_edges).intersection(set(true_edges))
37 | pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection}
38 | true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection}
39 | 
40 | node_lines = []
41 | if not pred_node._attrib.equals(true_node._attrib):
42 | node_lines.append(" Attributes mismatch: %s, %s" %
43 | (sorted(true_node._attrib.items()), sorted(pred_node._attrib.items())))
44 | if pred_edges:
45 | node_lines.append(" Mistake edges: %s" % ", ".join(pred_edges))
46 | if true_edges:
47 | node_lines.append(" Missing edges: %s" % ", ".join(true_edges))
48 | if node_lines:
49 | lines.append("For node " + pred_id + ":")
50 | lines.extend(node_lines)
51 | if pred_ids:
52 | lines.append("Mistake nodes: %s" % ", ".join(pred_ids))
53 | if true_ids:
54 | lines.append("Missing nodes: %s" % ", ".join(true_ids))
55 | if write and lines:
56 | outfile = "%s.xml" % true_passage.ID
57 | sys.stderr.write("Writing passage '%s'...\n" % outfile)
58 | passage2file(true_passage, outfile)
59 | outfile = "%s_pred.xml" % pred_passage.ID
60 | sys.stderr.write("Writing passage '%s'...\n" % outfile)
61 | passage2file(pred_passage, outfile)
62 | return "\n" + "\n".join(lines)
63 | -------------------------------------------------------------------------------- /ucca/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/ucca/tests/__init__.py -------------------------------------------------------------------------------- /ucca/tests/test_constructions.py: --------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | 
3 | import pytest
4 | 
5 | from ucca import textutil
6 | from ucca.constructions import CATEGORIES_NAME, DEFAULT, CONSTRUCTIONS, extract_candidates
7 | from .conftest import PASSAGES, loaded, loaded_valid, multi_sent, crossing, discontiguous, l1_passage, empty
8 | 
9 | """Tests the constructions module functions and classes."""
10 | 
11 | 
12 | def assert_spacy_not_loaded(*args, **kwargs):
13 | del args, kwargs
14 | assert False, "Should not load spaCy when passage is pre-annotated"
15 | 
16 | 
17 | def extract_and_check(p, constructions=None, expected=None):
18 | d = OrderedDict((construction, [candidate.edge for candidate in candidates]) for construction, candidates in
19 | extract_candidates(p, constructions=constructions).items() if candidates)
20 | if expected is not None:
21 | hist = {c.name: len(e) for c, e in d.items()}
22 | assert hist == expected, " != ".join(",".join(sorted(h)) for h in (hist, expected))
23 | 
24 | 
25 | @pytest.mark.parametrize("create, expected", (
26 | (loaded, {'P': 1, 'remote': 1, 'E': 3, 'primary': 15, 'U': 2, 'F': 1, 'C': 3, 'A': 1, 'D': 1, 'L': 2, 'mwe': 2,
27 | 'H': 5, 'implicit': 1, 'main_rel': 1}),
28 | (loaded_valid, {'P': 1, 'remote': 1, 'E': 3, 'primary': 15, 'U': 2, 'F': 1, 'C': 3, 'A': 1, 'D': 1, 'L': 2,
29 | 'mwe': 2, 'H': 5, 'implicit': 1, 'main_rel': 1}),
30 | (multi_sent, {'U': 4, 'P': 3, 'mwe': 2, 'H': 3, 'primary': 6, 'main_rel': 2}),
31 | (crossing, {'U': 3, 'P': 2, 'remote': 1, 'mwe': 1, 'H': 2, 'primary': 3, 'main_rel': 2}),
32 | (discontiguous, {'G': 1, 'U': 2, 'E': 2, 'primary': 13, 'P': 3, 'F': 1, 'C': 1, 'A': 3, 'D': 2,
33 | 'mwe': 6, 'H': 3, 'implicit': 3, 'main_rel': 2}),
34 | (l1_passage, {'P': 2, 'mwe': 4, 'H': 3, 'primary': 11, 'U': 2, 'A': 5, 'D': 1, 'L': 2, 'remote': 2, 'S': 1,
35 | 'implicit': 1, 'main_rel': 3}),
36 | 
37 | (empty, {}),
38 | ))
39 | def test_extract_all(create, expected):
40 | extract_and_check(create(), constructions=CONSTRUCTIONS, expected=expected)
41 | 
42 | 
43 | @pytest.mark.parametrize("create", PASSAGES)
44 | @pytest.mark.parametrize("constructions", (DEFAULT, [CATEGORIES_NAME]), ids=("default", CATEGORIES_NAME))
45 | def test_extract(create, constructions, monkeypatch):
46 | monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
47 | extract_and_check(create(), constructions=constructions)
48 | -------------------------------------------------------------------------------- /ucca/tests/test_ioutil.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import random
4 | from glob import glob
5 | 
6 | from ucca import layer0, layer1, convert, ioutil, diffutil 
7 | from .conftest import loaded, multi_sent, discontiguous, l1_passage 8 | 9 | """Tests the ioutil module functions and classes.""" 10 | 11 | 12 | def test_split2sentences(): 13 | """Tests splitting a passage by sentence ends. 14 | """ 15 | p = multi_sent() 16 | split = convert.split2sentences(p) 17 | assert len(split) == 3 18 | terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split] 19 | assert terms[0] == ["1", "2", "3", "."] 20 | assert terms[1] == ["5", "6", "."] 21 | assert terms[2] == ["8", ".", "10", "."] 22 | assert all(t.paragraph == 1 for s in split for t in s.layer(layer0.LAYER_ID).all) 23 | top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split] 24 | for t in top_scenes: 25 | assert len(t) == 1 26 | assert t[0].incoming[0].tag == layer1.EdgeTags.ParallelScene 27 | 28 | 29 | def test_split2paragraphs(): 30 | """Tests splitting a passage by paragraph ends. 31 | """ 32 | p = multi_sent() 33 | split = convert.split2paragraphs(p) 34 | assert len(split) == 2 35 | terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split] 36 | assert terms[0] == ["1", "2", "3", ".", "5", "6", "."] 37 | assert terms[1] == ["8", ".", "10", "."] 38 | assert all(t.paragraph == 1 for s in split for t in s.layer(layer0.LAYER_ID).all) 39 | top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split] 40 | assert len(top_scenes[0]) == 2 41 | assert len(top_scenes[1]) == 1 42 | for t in top_scenes: 43 | for n in t: 44 | assert n.incoming[0].tag == layer1.EdgeTags.ParallelScene 45 | 46 | 47 | @pytest.mark.parametrize("create", (loaded, multi_sent, discontiguous, l1_passage)) 48 | def test_split_join_sentences(create): 49 | p = create() 50 | split = convert.split2sentences(p, remarks=True) 51 | copy = convert.join_passages(split) 52 | diffutil.diff_passages(p, copy) 53 | assert p.equals(copy) 54 | 55 | 56 | @pytest.mark.parametrize("create", (loaded, multi_sent, discontiguous, l1_passage)) 57 | def test_split_join_paragraphs(create): 58 | p = create() 59 | split = convert.split2paragraphs(p, remarks=True) 60 | copy = convert.join_passages(split) 61 | diffutil.diff_passages(p, copy) 62 | assert p.equals(copy) 63 | 64 | 65 | def _test_passages(passages): 66 | for passage in passages: 67 | assert passage.layer(layer0.LAYER_ID).all, "No terminals in passage " + passage.ID 68 | assert len(passage.layer(layer1.LAYER_ID).all), "No non-terminals but the root in passage " + passage.ID 69 | 70 | 71 | def test_load_passage(): 72 | _test_passages(ioutil.read_files_and_dirs(glob(os.path.join("test_files", "standard3.xml")))) 73 | 74 | 75 | def test_load_multiple_passages(): 76 | """Test lazy-loading passages""" 77 | files = 3 * ["test_files/standard3.xml"] 78 | passages = ioutil.read_files_and_dirs(files) 79 | assert len(files) == len(list(passages)), "Should load one passage per file" 80 | assert len(files) == len(passages) 81 | _test_passages(passages) 82 | 83 | 84 | def test_shuffle_passages(): 85 | """Test lazy-loading passages and shuffling them""" 86 | files = 3 * ["test_files/standard3.xml"] 87 | passages = ioutil.read_files_and_dirs(files) 88 | random.shuffle(passages) 89 | assert len(files) == len(passages) 90 | _test_passages(passages) 91 | -------------------------------------------------------------------------------- /ucca/tests/test_layer0.py: -------------------------------------------------------------------------------- 1 | from ucca import core, layer0 2 | 3 | """Tests module layer0 functionality.""" 4 | 5 | 6 | def test_terminals(): 7 | """Tests 
:class:`layer0`.Terminal new and inherited functionality.""" 8 | p = core.Passage("1") 9 | layer0.Layer0(p) 10 | terms = [ 11 | layer0.Terminal(ID="0.1", root=p, 12 | tag=layer0.NodeTags.Word, 13 | attrib={"text": "1", 14 | "paragraph": 1, 15 | "paragraph_position": 1}), 16 | layer0.Terminal(ID="0.2", root=p, 17 | tag=layer0.NodeTags.Word, 18 | attrib={"text": "2", 19 | "paragraph": 2, 20 | "paragraph_position": 1}), 21 | layer0.Terminal(ID="0.3", root=p, 22 | tag=layer0.NodeTags.Punct, 23 | attrib={"text": ".", 24 | "paragraph": 2, 25 | "paragraph_position": 2}) 26 | ] 27 | 28 | p_copy = core.Passage("2") 29 | layer0.Layer0(p_copy) 30 | equal_term = layer0.Terminal(ID="0.1", root=p_copy, 31 | tag=layer0.NodeTags.Word, 32 | attrib={"text": "1", 33 | "paragraph": 1, 34 | "paragraph_position": 1}) 35 | unequal_term = layer0.Terminal(ID="0.2", root=p_copy, 36 | tag=layer0.NodeTags.Word, 37 | attrib={"text": "two", 38 | "paragraph": 2, 39 | "paragraph_position": 1}) 40 | 41 | assert [t.punct for t in terms] == [False, False, True] 42 | assert [t.text for t in terms] == ["1", "2", "."] 43 | assert [t.position for t in terms] == [1, 2, 3] 44 | assert [t.paragraph for t in terms] == [1, 2, 2] 45 | assert [t.para_pos for t in terms] == [1, 1, 2] 46 | assert not (terms[0] == terms[1]) 47 | assert not (terms[0] == terms[2]) 48 | assert not (terms[1] == terms[2]) 49 | assert terms[0] == terms[0] 50 | assert terms[0].equals(equal_term) 51 | assert not (terms[1].equals(unequal_term)) 52 | assert p.copy(layer0.LAYER_ID).equals(p) 53 | assert p_copy.copy(layer0.LAYER_ID).equals(p_copy) 54 | 55 | 56 | def test_layer0(): 57 | p = core.Passage("1") 58 | l0 = layer0.Layer0(p) 59 | t1 = l0.add_terminal(text="1", punct=False) 60 | l0.add_terminal(text="2", punct=True, paragraph=2) 61 | t3 = l0.add_terminal(text="3", punct=False, paragraph=2) 62 | assert [x[0] for x in l0.pairs] == [1, 2, 3] 63 | assert [t.para_pos for t in l0.all] == [1, 1, 2] 64 | assert l0.words == (t1, t3) 65 | assert p.copy(layer0.LAYER_ID).equals(p) 66 | -------------------------------------------------------------------------------- /ucca/tests/test_layer1.py: -------------------------------------------------------------------------------- 1 | from ucca import layer1 2 | from .conftest import l1_passage, discontiguous 3 | 4 | """Tests layer1 module functionality and correctness.""" 5 | 6 | 7 | def test_creation(): 8 | p = l1_passage() 9 | head = p.layer("1").heads[0] 10 | assert [x.tag for x in head] == ["L", "H", "H", "L", "H", "U"] 11 | assert [x.child.position for x in head.children[0]] == [1] 12 | assert [x.tag for x in head.children[1]] == ["P", "A", "U", "A"] 13 | assert [x.child.position for x in head.children[1].children[0]] == [2, 3, 4, 5] 14 | assert [x.child.position for x in head.children[1].children[1]] == [6, 7, 8, 9] 15 | assert [x.child.position for x in head.children[1].children[2]] == [10] 16 | assert (head.children[1][3].attrib.get("remote")) 17 | 18 | 19 | def test_fnodes(): 20 | p = l1_passage() 21 | l0 = p.layer("0") 22 | l1 = p.layer("1") 23 | 24 | terms = l0.all 25 | head, lkg1, lkg2 = l1.heads 26 | link1, ps1, ps2, link2, ps3, punct2 = head.children 27 | p1, a1, punct1 = [x.child for x in ps1 if not x.attrib.get("remote")] 28 | a2, d2 = [x.child for x in ps2 if not x.attrib.get("remote")] 29 | p3, a3, a4 = ps3.children 30 | 31 | assert lkg1.relation == link1 32 | assert lkg1.arguments == [ps1] 33 | assert ps2.process == p1 34 | assert ps1.participants == [a1, d2] 35 | assert ps3.participants == [a3, a4] 36 | 37 | 
assert ps1.get_terminals() == terms[1:10]
38 | assert ps1.get_terminals(punct=False, remotes=True) == terms[1:9] + terms[14:15]
39 | assert ps1.end_position == 10
40 | assert ps2.start_position == 11
41 | assert ps3.start_position == 17
42 | assert a4.start_position == -1
43 | 
44 | assert ps1.fparent == head
45 | assert d2.fparent == ps2
46 | 
47 | 
48 | def test_layer1():
49 | p = l1_passage()
50 | l1 = p.layer("1")
51 | 
52 | head, lkg1, lkg2 = l1.heads
53 | link1, ps1, ps2, link2, ps3, punct2 = head.children
54 | p1, a1, punct1 = [x.child for x in ps1 if not x.attrib.get("remote")]
55 | 
56 | assert l1.top_scenes == [ps1, ps2, ps3]
57 | assert l1.top_linkages == [lkg1, lkg2]
58 | 
59 | # Changing the process tag of scene #1 to A and back, validate that
60 | # top scenes are updated accordingly
61 | p_edge = [e for e in ps1 if e.tag == layer1.EdgeTags.Process][0]
62 | p_edge.tag = layer1.EdgeTags.Participant
63 | assert l1.top_linkages == [lkg2]
64 | p_edge.tag = layer1.EdgeTags.Process
65 | assert l1.top_scenes == [ps1, ps2, ps3]
66 | assert l1.top_linkages == [lkg1, lkg2]
67 | 
68 | 
69 | def test_str():
70 | p = l1_passage()
71 | assert [str(x) for x in p.layer("1").heads] == \
72 | ["[L 1] [H [P 2 3 4 5] [A 6 7 8 9] [U 10] "
73 | "... [A* 15] ] [H [P* 2 3 4 5] [A 11 12 "
74 | "13 14] [D 15] ] [L 16] [H [A IMPLICIT] [S "
75 | "17 18] [A 19] ] [U 20] ",
76 | "1.2-->1.3", "1.10-->1.7,1.11"]
77 | 
78 | 
79 | def test_destroy():
80 | p = l1_passage()
81 | l1 = p.layer("1")
82 | 
83 | head, lkg1, lkg2 = l1.heads
84 | link1, ps1, ps2, link2, ps3, punct2 = head.children
85 | p1, a1, punct1 = [x.child for x in ps1 if not x.attrib.get("remote")]
86 | 
87 | ps1.destroy()
88 | assert head.children == [link1, ps2, link2, ps3, punct2]
89 | assert p1.parents == [ps2]
90 | assert not a1.parents
91 | assert not punct1.parents
92 | 
93 | 
94 | def test_discontiguous():
95 | """Tests FNode.discontiguous and FNode.get_sequences"""
96 | p = discontiguous()
97 | l1 = p.layer("1")
98 | head = l1.heads[0]
99 | ps1, ps2, ps3 = head.children
100 | d1, a1, p1, f1 = ps1.children
101 | e1, c1, e2 = d1.children
102 | d2, g2, p2, a2 = ps2.children
103 | t14, p3, a3 = ps3.children
104 | 
105 | # Checking discontiguous property
106 | assert not ps1.discontiguous
107 | assert not d1.discontiguous
108 | assert not e1.discontiguous
109 | assert not e2.discontiguous
110 | assert c1.discontiguous
111 | assert a1.discontiguous
112 | assert p1.discontiguous
113 | assert not f1.discontiguous
114 | assert ps2.discontiguous
115 | assert not p2.discontiguous
116 | assert not a2.discontiguous
117 | assert not ps3.discontiguous
118 | assert not a3.discontiguous
119 | 
120 | # Checking get_sequences -- should return only non-remote, non-implicit
121 | # stretches of terminals
122 | assert ps1.get_sequences() == [(1, 10)]
123 | assert d1.get_sequences() == [(1, 4)]
124 | assert e1.get_sequences() == [(1, 1)]
125 | assert e2.get_sequences() == [(3, 3)]
126 | assert c1.get_sequences() == [(2, 2), (4, 4)]
127 | assert a1.get_sequences() == [(5, 5), (8, 8)]
128 | assert p1.get_sequences() == [(6, 7), (10, 10)]
129 | assert f1.get_sequences() == [(9, 9)]
130 | assert ps2.get_sequences() == [(11, 14), (18, 20)]
131 | assert p2.get_sequences() == [(11, 14)]
132 | assert a2.get_sequences() == [(18, 20)]
133 | assert not d2.get_sequences()
134 | assert not g2.get_sequences()
135 | assert ps3.get_sequences() == [(15, 17)]
136 | assert a3.get_sequences() == [(16, 17)]
137 | assert not p3.get_sequences()
138 | 
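The fixtures above come from `conftest`; as a hedged editorial sketch (not part of the test suite; the passage ID and token text are made up), the same layer-1 API can build a tiny passage from scratch:

    from ucca import core, layer0, layer1

    p = core.Passage("1")                                       # hypothetical passage ID
    l0 = layer0.Layer0(p)
    l1 = layer1.Layer1(p)
    terminal = l0.add_terminal(text="rained", punct=False)      # single made-up token
    scene = l1.add_fnode(None, layer1.EdgeTags.ParallelScene)   # H node under the layer head
    process = l1.add_fnode(scene, layer1.EdgeTags.Process)      # P node inside the scene
    process.add(layer1.EdgeTags.Terminal, terminal)             # attach the terminal
    assert l1.top_scenes == [scene]                             # a unit with a process is a scene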
-------------------------------------------------------------------------------- /ucca/tests/test_textutil.py: --------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from ucca import layer0, convert, textutil
4 | from .conftest import crossing, multi_sent, multi_sent_with_quotes, l1_passage, discontiguous, empty, PASSAGES
5 | 
6 | """Tests the textutil module functions and classes."""
7 | 
8 | 
9 | @pytest.mark.parametrize("create, breaks", (
10 | (multi_sent, [4, 7, 11]),
11 | (crossing, [3, 7]),
12 | (discontiguous, [20]),
13 | (l1_passage, [20]),
14 | (empty, []),
15 | (multi_sent_with_quotes, [6, 9, 13]),
16 | ))
17 | def test_break2sentences(create, breaks):
18 | """Tests identifying sentence ends correctly."""
19 | assert textutil.break2sentences(create()) == breaks
20 | 
21 | 
22 | def test_word_vectors():
23 | vectors, dim = textutil.get_word_vectors()
24 | for word, vector in vectors.items():
25 | assert len(vector) == dim, "Vector dimension for %s is %d != %d" % (word, len(vector), dim)
26 | 
27 | 
28 | @pytest.mark.parametrize("create", PASSAGES)
29 | @pytest.mark.parametrize("as_array", (True, False), ids=("array", "extra"))
30 | def test_annotate_passage(create, as_array):
31 | passage = create()
32 | textutil.annotate(passage, as_array=as_array)
33 | for p in passage, convert.from_standard(convert.to_standard(passage)):
34 | assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
35 | for terminal in p.layer(layer0.LAYER_ID).all:
36 | if as_array:
37 | assert terminal.tok is not None, "Terminal %s has no annotation" % terminal
38 | assert len(terminal.tok) == len(textutil.Attr)
39 | else:
40 | for attr in textutil.Attr:
41 | assert attr.key in terminal.extra, "Terminal %s has no %s" % (terminal, attr.name)
42 | 
43 | 
44 | @pytest.mark.parametrize("as_array", (True, False), ids=("array", "extra"))
45 | @pytest.mark.parametrize("convert_and_back", (True, False), ids=("convert", "direct"))
46 | def test_annotate_all(as_array, convert_and_back):
47 | passages = [create() for create in PASSAGES]
48 | list(textutil.annotate_all(passages))
49 | for passage, compare in textutil.annotate_all(((p, p) for p in passages), as_array=as_array, as_tuples=True):
50 | assert passage is compare
51 | p = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back]
52 | assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
53 | for terminal in p.layer(layer0.LAYER_ID).all:
54 | if as_array:
55 | assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
56 | assert len(terminal.tok) == len(textutil.Attr)
57 | else:
58 | for attr in textutil.Attr:
59 | assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
60 | terminal, passage.ID, attr.name)
61 | 
62 | 
63 | def assert_spacy_not_loaded(*args, **kwargs):
64 | del args, kwargs
65 | assert False, "Should not load spaCy when passage is pre-annotated"
66 | 
67 | 
68 | @pytest.mark.parametrize("create", PASSAGES)
69 | @pytest.mark.parametrize("as_array", (True, False), ids=("array", "extra"))
70 | @pytest.mark.parametrize("convert_and_back", (True, False), ids=("convert", "direct"))
71 | @pytest.mark.parametrize("partial", (True, False), ids=("partial", "full"))
72 | def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
73 | if not partial:
74 | monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
75 
| passage = create() 76 | l0 = passage.layer(layer0.LAYER_ID) 77 | attr_values = list(range(10, 10 + len(textutil.Attr))) 78 | if partial: 79 | attr_values[textutil.Attr.ENT_TYPE.value] = "" 80 | if as_array: 81 | l0.extra["doc"] = [len(p) * [attr_values] for p in textutil.break2paragraphs(passage, return_terminals=True)] 82 | else: 83 | for terminal in l0.all: 84 | for attr, value in zip(textutil.Attr, attr_values): 85 | if value: 86 | terminal.extra[attr.key] = value 87 | passage = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back] 88 | if not partial: 89 | assert textutil.is_annotated(passage, as_array=as_array, as_extra=not as_array), \ 90 | "Passage %s is not pre-annotated" % passage.ID 91 | textutil.annotate(passage, as_array=as_array, as_extra=not as_array) 92 | assert textutil.is_annotated(passage, as_array=as_array, as_extra=not as_array), \ 93 | "Passage %s is not annotated" % passage.ID 94 | for terminal in l0.all: 95 | for i, (attr, value) in enumerate(zip(textutil.Attr, attr_values)): 96 | if value: 97 | assert (terminal.tok[i] if as_array else terminal.extra.get(attr.key)) == value, \ 98 | "Terminal %s has wrong %s" % (terminal, attr.name) 99 | -------------------------------------------------------------------------------- /ucca/tests/test_visualization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ucca.visualization import draw, tikz 4 | from .conftest import PASSAGES 5 | 6 | """Tests the visualization module functions and classes.""" 7 | 8 | 9 | @pytest.mark.parametrize("create", PASSAGES) 10 | def test_draw(create): 11 | import matplotlib 12 | matplotlib.use('Agg') 13 | draw(create()) 14 | 15 | 16 | @pytest.mark.parametrize("create", PASSAGES) 17 | def test_tikz(create): 18 | tikz(create()) 19 | -------------------------------------------------------------------------------- /ucca_db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/ucca_db/__init__.py -------------------------------------------------------------------------------- /ucca_db/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from xml.etree.ElementTree import tostring 4 | 5 | from tqdm import tqdm 6 | 7 | from ucca import convert 8 | from ucca.ioutil import write_passage, external_write_mode 9 | from ucca_db.api import get_by_xids, get_most_recent_passage_by_uid 10 | 11 | desc = "Download passages from old UCCA annotation app" 12 | 13 | 14 | def get_by_method(method, id_field, passage_id=None, **kwargs): 15 | if method == "xid": 16 | return get_by_xids(xids=id_field, **kwargs)[0] 17 | elif method == "uid": 18 | return get_most_recent_passage_by_uid(id_field, passage_id, **kwargs) 19 | raise ValueError("Unknown method: '%s'" % method) 20 | 21 | 22 | def main(args): 23 | os.makedirs(args.outdir, exist_ok=True) 24 | with open(args.filename, encoding="utf-8") as f: 25 | t = list(map(str.split, f)) 26 | if not args.verbose: 27 | t = tqdm(t, desc="Downloading", unit=" passages") 28 | for passage_id, id_field in t: 29 | if not args.verbose: 30 | t.set_postfix({"passage_id": passage_id, args.method: id_field}) 31 | if args.verbose: 32 | with external_write_mode(): 33 | print("Getting passage " + passage_id + " with " + args.method + "=" + id_field, end="\t") 34 | xml_root = 
get_by_method(id_field=id_field.split(","), passage_id=passage_id, **vars(args))
35 | if xml_root is None:
36 | continue
37 | if args.write_site:
38 | site_filename = passage_id + "_site_download.xml"
39 | with open(site_filename, "w", encoding="utf-8") as fsite:
40 | print(tostring(xml_root).decode(), file=fsite)
41 | if args.verbose:
42 | with external_write_mode():
43 | print("Wrote '%s'" % site_filename)
44 | if args.write:
45 | write_passage(convert.from_site(xml_root), outdir=args.outdir, verbose=args.verbose)
46 | 
47 | 
48 | if __name__ == "__main__":
49 | argparser = ArgumentParser(description=desc)
50 | argparser.add_argument("filename", help="specification filename with (passage ID, xid OR uid) per passage")
51 | argparser.add_argument("-m", "--method", default="uid", choices=("xid", "uid"), help="by xid or latest by paid,uid")
52 | argparser.add_argument("-d", "--db-name", default="work", help="database name")
53 | argparser.add_argument("-H", "--host-name", default="pgserver", help="host name")
54 | argparser.add_argument("-o", "--outdir", default=".", help="directory to write downloaded passages to")
55 | argparser.add_argument("-s", "--write-site", action="store_true", help="write site format, too, for debugging")
56 | argparser.add_argument("-n", "--no-write", dest="write", action="store_false", help="do not really write any files")
57 | argparser.add_argument("-x", "--write-xids", help="file to write xids to (for 'uid' method)")
58 | argparser.add_argument("-S", "--strict", action="store_true", help="fail if no result is found")
59 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
60 | main(argparser.parse_args())
61 | 
-------------------------------------------------------------------------------- /ucca_db/upload.py: --------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from xml.etree.ElementTree import tostring
3 | 
4 | from ucca import convert
5 | from ucca.ioutil import get_passages_with_progress_bar
6 | from ucca_db.api import CONNECTION, write_to_db
7 | 
8 | desc = "Upload passages to old UCCA annotation app"
9 | 
10 | 
11 | def upload_passage(xml_root, site_filename=None, verbose=False, **kwargs):
12 | decoded = tostring(xml_root).decode()
13 | if site_filename:
14 | with open(site_filename, "w", encoding="utf-8") as f:
15 | print(decoded, file=f)
16 | if verbose:
17 | print("Wrote '%s'" % site_filename)
18 | return write_to_db(xml=decoded, **kwargs)
19 | 
20 | 
21 | def main(args):
22 | filenames = list(args.passages)
23 | if args.filenames:
24 | with open(args.filenames, encoding="utf-8") as f:
25 | filenames += list(filter(None, map(str.strip, f)))
26 | with open(args.out, "w", encoding="utf-8") as f:
27 | for passage in get_passages_with_progress_bar(filenames):
28 | out = upload_passage(convert.to_site(passage), verbose=args.verbose,
29 | site_filename=passage.ID + "_site_upload.xml" if args.write_site else None,
30 | db_name=args.db_name, host_name=args.host_name,
31 | new_pid=passage.ID, new_prid=args.project_id, username=args.username)
32 | print(passage.ID, out, file=f)
33 | if args.verbose:
34 | print("Uploaded passage %s with xid=%s" % (passage.ID, out))
35 | if CONNECTION is not None:
36 | CONNECTION.commit()
37 | print("Wrote '%s'" % args.out)
38 | 
39 | 
40 | if __name__ == "__main__":
41 | argparser = ArgumentParser(description=desc)
42 | argparser.add_argument("passages", nargs="*", help="the corpus, given as xml/pickle file names")
43 | 
argparser.add_argument("-f", "--filenames", help="read input passage filenames from a file rather than the command line")
44 | argparser.add_argument("-d", "--db-name", default="work", help="database name")
45 | argparser.add_argument("-H", "--host-name", default="pgserver", help="host name")
46 | argparser.add_argument("-p", "--project-id", default="63", help="project ID")
47 | argparser.add_argument("-u", "--username", default="danielh", help="username")
48 | argparser.add_argument("-o", "--out", default="xids.txt", help="file to write created XML IDs to")
49 | argparser.add_argument("--write-site", action="store_true", help="write site format for debugging before upload")
50 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
51 | main(argparser.parse_args())
52 | 
-------------------------------------------------------------------------------- /uccaapp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/uccaapp/__init__.py -------------------------------------------------------------------------------- /uccaapp/convert_and_evaluate.py: --------------------------------------------------------------------------------
1 | import sys
2 | 
3 | import argparse
4 | from glob import glob
5 | from requests.exceptions import HTTPError
6 | 
7 | from ucca.evaluation import evaluate, Scores
8 | from ucca.ioutil import read_files_and_dirs
9 | from uccaapp.download_task import TaskDownloader
10 | from uccaapp.upload_task import TaskUploader
11 | 
12 | try:
13 | from simplejson.scanner import JSONDecodeError
14 | except ImportError:
15 | from json.decoder import JSONDecodeError
16 | 
17 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task,
18 | then download the task from UCCA-App and convert it to a passage in standard format again,
19 | then evaluate the result against the original"""
20 | 
21 | 
22 | def main(filenames, write, **kwargs):
23 | uploader = TaskUploader(**kwargs)
24 | downloader = TaskDownloader(**kwargs)
25 | scores = []
26 | try:
27 | for pattern in filenames:
28 | filenames = sorted(glob(pattern))
29 | if not filenames:
30 | raise IOError("Not found: " + pattern)
31 | for ref in read_files_and_dirs(filenames):
32 | print("Converting passage " + ref.ID + "... ", end="")
33 | task = uploader.upload_task(ref)
34 | guessed, *_ = downloader.download_task(task["id"], write=write, **kwargs)
35 | score = evaluate(guessed, ref, **kwargs)
36 | print("F1=%.3f" % score.average_f1())
37 | scores.append(score)
38 | except HTTPError as e:
39 | try:
40 | raise ValueError(e.response.json()) from e
41 | except JSONDecodeError:
42 | raise ValueError(e.response.text) from e
43 | print()
44 | if len(scores) > 1:
45 | print("Aggregated scores:")
46 | Scores.aggregate(scores).print()
47 | 
48 | 
49 | if __name__ == "__main__":
50 | argument_parser = argparse.ArgumentParser(description=desc)
51 | TaskUploader.add_arguments(argument_parser)
52 | argument_parser.add_argument("--write", action="store_true", help="Write converted passage to file")
53 | TaskDownloader.add_write_arguments(argument_parser)
54 | main(**vars(argument_parser.parse_args()))
55 | sys.exit(0)
56 | 
-------------------------------------------------------------------------------- /uccaapp/copy_categories.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | 
6 | from uccaapp.api import ServerAccessor
7 | 
8 | desc = """Download categories from one UCCA-App server and upload to another UCCA-App server"""
9 | 
10 | 
11 | def add_arguments(argparser):
12 | argparser.add_argument("category_ids", nargs="+", type=int, help="IDs of categories to export and import")
13 | argparser.add_argument("--server-address-orig", required=True, help="UCCA-App origin server")
14 | argparser.add_argument("--email-orig", help="UCCA-App origin email")
15 | argparser.add_argument("--password-orig", help="UCCA-App origin password")
16 | argparser.add_argument("--server-address-target", required=True, help="UCCA-App target server")
17 | argparser.add_argument("--email-target", help="UCCA-App target email")
18 | argparser.add_argument("--password-target", help="UCCA-App target password")
19 | argparser.add_argument("-v", "--verbose", action="store_true", help="detailed output")
20 | 
21 | 
22 | def main(args):
23 | server_accessor_origin = ServerAccessor(server_address=args.server_address_orig,
24 | email=args.email_orig, password=args.password_orig,
25 | verbose=args.verbose)
26 | server_accessor_target = ServerAccessor(server_address=args.server_address_target,
27 | email=args.email_target, password=args.password_target,
28 | verbose=args.verbose)
29 | for category_id in args.category_ids:
30 | category_out = server_accessor_origin.get_category(category_id)
31 | server_accessor_target.create_category(**category_out)
32 | 
33 | 
34 | if __name__ == "__main__":
35 | argument_parser = argparse.ArgumentParser(description=desc)
36 | add_arguments(argument_parser)
37 | main(argument_parser.parse_args())
38 | sys.exit(0)
39 | 
-------------------------------------------------------------------------------- /uccaapp/create_annotation_tasks.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import sys
4 | 
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Create new annotation/review tasks for a specific user, given parent tokenization tasks (for creating
10 | annotation tasks) or parent annotation tasks (for creating review tasks) """
11 | 
12 | 
13 | class AnnotationTaskCreator(ServerAccessor):
14 | def __init__(self, project_id=None, **kwargs):
15 | """
16 | :param project_id: Specify project for created tasks, otherwise same as parent tasks
17 | """
18 | super().__init__(**kwargs)
19 | if project_id is not None:
20 | self.set_project(project_id)
21 | 
22 | def create_tasks(self, filename, log=None, **kwargs):
23 | log_h = open(log, "w", encoding="utf-8") if log else None
24 | lines = list(self.read_lines(filename))
25 | for user_id, task_id in tqdm(lines, unit="task", desc="Creating tasks"):
26 | task = self.create_task(**self.build_task(user_id, task_id, **kwargs))
27 | if log:
28 | print(task["id"], file=log_h, sep="\t", flush=True)
29 | print("Uploaded %d tasks successfully." % len(lines), file=sys.stderr)
30 | if log:
31 | log_h.close()
32 | 
33 | def build_task(self, user_id, task_id, review=False, manager_comment=None, strict=False, **kwargs):
34 | del kwargs
35 | user = self.get_user(user_id)
36 | task = self.get_task(task_id)
37 | assert task["type"] in (["ANNOTATION", "REVIEW"] if review else ["TOKENIZATION"]), \
38 | "Wrong input task given: %s for task ID %s" % (task["type"], task_id)
39 | if strict:
40 | assert task["status"] == "SUBMITTED", "Parent task is not submitted: %s" % task_id
41 | return dict(type="REVIEW" if review else "ANNOTATION", project=self.project or task["project"], user=user,
42 | passage=task["passage"], manager_comment=manager_comment or task.get("manager_comment", ""),
43 | user_comment=task.get("user_comment", ""), parent=task, is_demo=False, is_active=True)
44 | 
45 | @staticmethod
46 | def read_lines(filename):
47 | with open(filename, encoding="utf-8") as f:
48 | for line in f:
49 | fields = line.strip().split()
50 | try:
51 | user_id, task_id = fields
52 | except ValueError:
53 | print("Error in line: " + line.strip(), file=sys.stderr)
54 | continue
55 | yield user_id, task_id
56 | 
57 | @staticmethod
58 | def add_arguments(argparser):
59 | argparser.add_argument("filename", help="a file where each line is a <user ID> <task ID>, "
60 | "where the input task may be an annotation/review task "
61 | "(if given --review) or a tokenization task")
62 | ServerAccessor.add_arguments(argparser)
63 | argparser.add_argument("-r", "--review", action="store_true", help="Create annotation/review task")
64 | argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
65 | argparser.add_argument("--manager-comment", help="Manager comment to set for all tasks")
66 | ServerAccessor.add_project_id_argument(argparser)
67 | argparser.add_argument("-s", "--strict", action="store_true", help="Require parent task to be submitted")
68 | 
69 | 
70 | def main(**kwargs):
71 | AnnotationTaskCreator(**kwargs).create_tasks(**kwargs)
72 | 
73 | 
74 | if __name__ == "__main__":
75 | argument_parser = argparse.ArgumentParser(description=desc)
76 | AnnotationTaskCreator.add_arguments(argument_parser)
77 | main(**vars(argument_parser.parse_args()))
78 | sys.exit(0)
79 | 
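A hypothetical input file for create_annotation_tasks.py (the IDs are made up; each line pairs the assignee's user ID with the parent task ID, matching what read_lines() unpacks):

    42 1001
    42 1002
    57 1003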
-------------------------------------------------------------------------------- /uccaapp/create_tokenization_tasks.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | 
6 | from uccaapp.create_annotation_tasks import ServerAccessor, AnnotationTaskCreator
7 | 
8 | desc = """Upload a list of tokenization tasks to a project"""
9 | 
10 | 
11 | class TokenizationTaskCreator(AnnotationTaskCreator):
12 | def __init__(self, project_id, **kwargs):
13 | super().__init__(**kwargs)
14 | self.set_project(project_id)
15 | 
16 | def build_task(self, user_id, passage_id, **kwargs):
17 | del kwargs
18 | user = self.get_user(user_id)
19 | passage = self.get_passage(passage_id)
20 | return dict(type="TOKENIZATION", project=self.project, user=user, passage=passage,
21 | manager_comment="passage #%s" % passage["id"], user_comment="", parent=None, is_demo=False,
22 | is_active=True)
23 | 
24 | @staticmethod
25 | def add_arguments(argparser):
26 | argparser.add_argument("filename", help="a file where each line is a <user ID> <passage ID>")
27 | argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
28 | ServerAccessor.add_project_id_argument(argparser)
29 | #ServerAccessor.add_user_id_argument(argparser)
30 | ServerAccessor.add_arguments(argparser)
31 | 
32 | 
33 | def main(**kwargs):
34 | TokenizationTaskCreator(**kwargs).create_tasks(**kwargs)
35 | 
36 | 
37 | if __name__ == "__main__":
38 | argument_parser = argparse.ArgumentParser(description=desc)
39 | TokenizationTaskCreator.add_arguments(argument_parser)
40 | main(**vars(argument_parser.parse_args()))
41 | sys.exit(0)
42 | 
-------------------------------------------------------------------------------- /uccaapp/download_task.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import json
4 | import sys
5 | 
6 | from tqdm import tqdm
7 | 
8 | from ucca import normalization, validation
9 | from ucca.convert import from_json
10 | from ucca.ioutil import write_passage
11 | from uccaapp.api import ServerAccessor
12 | 
13 | desc = """Download task from UCCA-App and convert to a passage in standard format"""
14 | 
15 | 
16 | class TaskDownloader(ServerAccessor):
17 | def __init__(self, **kwargs):
18 | super().__init__(**kwargs)
19 | 
20 | def download_tasks(self, task_ids, by_filename=False, validate=None, log=None, **kwargs):
21 | if by_filename:
22 | task_ids_from_file = []
23 | for filename in task_ids:
24 | with open(filename, 'r') as f:
25 | task_ids_from_file += list(filter(None, map(str.strip, f)))
26 | task_ids = task_ids_from_file
27 | validate_h = open(validate, "w", encoding="utf-8") if validate else None
28 | log_h = open(log, "w", encoding="utf-8") if log else None
29 | for task_id in tqdm(task_ids, unit=" tasks", desc="Downloading"):
30 | yield self.download_task(task_id, validate=validate_h, log=log_h, **kwargs)
31 | if validate:
32 | validate_h.close()
33 | if log:
34 | log_h.close()
35 | 
36 | def download_task(self, task_id, normalize=False, write=True, validate=None, binary=None, log=None, out_dir=None,
37 | prefix=None, by_external_id=False, verbose=False, write_valid_only=False, strict=False, **kwargs):
38 | del kwargs
39 | task = self.get_user_task(task_id)
40 | user_id = task["user"]["id"]
41 | passage = None
42 | try:
43 | passage = next(iter(from_json(task, by_external_id=by_external_id)))
44 | except ValueError as e:
45 | if strict:
46 | raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
47 | print("", task_id, user_id, "Failed reading json", file=validate or sys.stderr, sep="\t", flush=True)
48 | if normalize and passage is not None:
49 | try:
50 | normalization.normalize(passage)
51 | except AssertionError as e:
52 | if strict:
53 | raise ValueError("Failed normalizing task %s:\n%s" % (task_id, json.dumps(task))) from e
54 | print(passage.ID, task_id, user_id, "Failed normalizing task: %s" % e, file=validate or sys.stderr,
55 | sep="\t", flush=True)
56 | if log:
57 | print(passage.ID, task_id, user_id, task["user_comment"], task["created_at"], task["updated_at"],
58 | file=log, sep="\t", flush=True)
59 | ret = passage, task_id, 
user_id 60 | if validate or write_valid_only: 61 | for error in validation.validate(passage, linkage=False): 62 | if validate: 63 | print(passage.ID, task_id, user_id, error, file=validate, sep="\t", flush=True) 64 | if write_valid_only: 65 | return ret 66 | if write: 67 | write_passage(passage, binary=binary, outdir=out_dir, prefix=prefix, verbose=verbose) 68 | return ret 69 | 70 | @staticmethod 71 | def add_arguments(argparser): 72 | argparser.add_argument("task_ids", nargs="+", help="IDs of tasks to download and convert") 73 | argparser.add_argument("-f", "--by-filename", action="store_true", help="treat task_ids as a filename, " 74 | "otherwise it is a list of IDs") 75 | TaskDownloader.add_write_arguments(argparser) 76 | argparser.add_argument("-V", "--validate", help="run validation on downloaded passages and save errors to file") 77 | argparser.add_argument("-N", "--normalize", action="store_true", help="normalize downloaded passages") 78 | argparser.add_argument("--strict", action="store_true", help="fail on reading or normalization error") 79 | argparser.add_argument("-l", "--log", help="filename to write log of downloaded passages to") 80 | ServerAccessor.add_arguments(argparser) 81 | 82 | @staticmethod 83 | def add_write_arguments(argparser): 84 | argparser.add_argument("-o", "--out-dir", default=".", help="output directory") 85 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 86 | argparser.add_argument("-x", "--by-external-id", action="store_true", help="save filename by external ID") 87 | argparser.add_argument("-b", "--binary", action="store_true", help="write in binary format (.pickle)") 88 | argparser.add_argument("-n", "--no-write", action="store_false", dest="write", help="do not write files") 89 | argparser.add_argument("--write-valid-only", action="store_true", help="only write passages that passed " 90 | "validation") 91 | 92 | 93 | def main(**kwargs): 94 | list(TaskDownloader(**kwargs).download_tasks(**kwargs)) 95 | 96 | 97 | if __name__ == "__main__": 98 | argument_parser = argparse.ArgumentParser(description=desc) 99 | TaskDownloader.add_arguments(argument_parser) 100 | main(**vars(argument_parser.parse_args())) 101 | sys.exit(0) 102 | -------------------------------------------------------------------------------- /uccaapp/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | from tqdm import tqdm 5 | 6 | from ucca.evaluation import evaluate, Scores, LABELED, UNLABELED 7 | from uccaapp.download_task import TaskDownloader 8 | 9 | desc = """Download tasks from UCCA-App and evaluate them""" 10 | 11 | 12 | def main(task_ids, by_filename=False, validate=None, log=None, **kwargs): 13 | kwargs["write"] = False 14 | if by_filename: 15 | task_ids_from_file = [] 16 | for filename in task_ids: 17 | with open(filename, 'r') as f: 18 | task_ids_from_file += zip(*list(map(str.split, filter(None, map(str.strip, f))))) 19 | task_ids = task_ids_from_file 20 | else: 21 | task_ids = [[task_id] for task_id in task_ids] 22 | assert len(task_ids) == 2, "Got %d lists of task IDs instead of two" % len(task_ids) 23 | downloader = TaskDownloader(**kwargs) 24 | scores = [] 25 | validate_h = open(validate, "w", encoding="utf-8") if validate else None 26 | log_h = open(log, "w", encoding="utf-8") if log else None 27 | if log: 28 | fields = ["guessed", "ref"] + Scores.field_titles(eval_type=LABELED) + Scores.field_titles(eval_type=UNLABELED) 29 | print(*fields, file=log_h, 
sep="\t", flush=True)
30 | for task_id_pair in tqdm(list(zip(*task_ids)), unit=" tasks", desc="Evaluating"):
31 | passage_pair = []
32 | for task_id in task_id_pair:
33 | passage, *_ = downloader.download_task(task_id, validate=validate_h, **kwargs)
34 | passage_pair.append(passage)
35 | score = evaluate(*passage_pair, **kwargs)
36 | if log:
37 | fields = list(task_id_pair) + score.fields(eval_type=LABELED) + score.fields(eval_type=UNLABELED)
38 | print(*fields, file=log_h, sep="\t", flush=True)
39 | scores.append(score)
40 | if validate:
41 | validate_h.close()
42 | if log:
43 | log_h.close()
44 | print()
45 | if len(scores) > 1:
46 | print("Aggregated scores:")
47 | Scores.aggregate(scores).print()
48 | 
49 | 
50 | def check_args(p, args):
51 | if len(args.task_ids) not in (1, 2):
52 | p.error("Must supply exactly two task IDs or files with IDs, but got %d arguments" % len(args.task_ids))
53 | return args
54 | 
55 | 
56 | if __name__ == "__main__":
57 | argument_parser = argparse.ArgumentParser(description=desc)
58 | TaskDownloader.add_arguments(argument_parser)
59 | main(**vars(check_args(argument_parser, argument_parser.parse_args())))
60 | sys.exit(0)
61 | 
-------------------------------------------------------------------------------- /uccaapp/get_passage_id.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Get passage ID for tasks"""
10 | 
11 | 
12 | class PassageIdGetter(ServerAccessor):
13 | def __init__(self, **kwargs):
14 | super().__init__(**kwargs)
15 | 
16 | def get_passage_ids(self, filename, **kwargs):
17 | del kwargs
18 | with open(filename, encoding="utf-8") as f:
19 | task_ids = list(map(str.strip, f))
20 | for task_id in tqdm(task_ids, unit=" tasks", desc="Getting passage IDs"):
21 | task = self.get_task(task_id)
22 | passage_id = task["passage"]["id"]
23 | yield passage_id
24 | 
25 | @staticmethod
26 | def add_arguments(argparser):
27 | argparser.add_argument("filename", help="file with lines of the form <task ID>")
28 | ServerAccessor.add_arguments(argparser)
29 | 
30 | 
31 | def main(**kwargs):
32 | print(*PassageIdGetter(**kwargs).get_passage_ids(**kwargs), sep="\n")
33 | 
34 | 
35 | if __name__ == "__main__":
36 | argument_parser = argparse.ArgumentParser(description=desc)
37 | PassageIdGetter.add_arguments(argument_parser)
38 | main(**vars(argument_parser.parse_args()))
39 | sys.exit(0)
40 | 
-------------------------------------------------------------------------------- /uccaapp/set_external_id.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import sys
4 | 
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Set the external ID for passages"""
10 | 
11 | 
12 | class ExternalIdSetter(ServerAccessor):
13 | def __init__(self, **kwargs):
14 | super().__init__(**kwargs)
15 | 
16 | def set_external_ids(self, filename, by_task_id=False, **kwargs):
17 | del kwargs
18 | with open(filename, encoding="utf-8") as f:
19 | passage_id_to_external_id = list(map(str.split, map(str.strip, f)))
20 | for external_id, passage_id in tqdm(passage_id_to_external_id, unit=" passages", desc="Setting external IDs"):
21 | if by_task_id:
22 | task = self.get_task(passage_id)
23 | passage_id = task["passage"]["id"]
24 | passage = self.get_passage(passage_id)
25 | if passage["external_id"] == external_id:
26 | continue
27 | passage["external_id"] = external_id
28 | passage_out = self.update_passage(**passage)
29 | assert passage_out["external_id"] == external_id, "External ID failed to update for passage %s" % passage_id
30 | yield passage_out
31 | 
32 | @staticmethod
33 | def add_arguments(argparser):
34 | argparser.add_argument("filename", help="file with lines of the form <external ID> <passage ID>")
35 | argparser.add_argument("--by-task-id", action="store_true", help="expect task ID instead of passage ID")
36 | ServerAccessor.add_arguments(argparser)
37 | 
38 | 
39 | def main(**kwargs):
40 | list(ExternalIdSetter(**kwargs).set_external_ids(**kwargs))
41 | 
42 | 
43 | if __name__ == "__main__":
44 | argument_parser = argparse.ArgumentParser(description=desc)
45 | ExternalIdSetter.add_arguments(argument_parser)
46 | main(**vars(argument_parser.parse_args()))
47 | sys.exit(0)
48 | 
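A hypothetical input file for set_external_id.py (made-up values; each line is an external ID followed by a passage ID, the order in which set_external_ids() unpacks the fields):

    wiki_005 1234
    wiki_006 1235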
-------------------------------------------------------------------------------- /uccaapp/set_tasks_to_ongoing.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Sets the status of submitted tasks to ONGOING or SUBMITTED"""
10 | 
11 | ONGOING_STATUS = "ONGOING"
12 | SUBMITTED_STATUS = "SUBMITTED"
13 | 
14 | 
15 | class TaskStatusSetter(ServerAccessor):
16 | def __init__(self, **kwargs):
17 | super().__init__(**kwargs)
18 | 
19 | def set_task_status(self, status, filename, **kwargs):
20 | del kwargs
21 | with open(filename) as f:
22 | task_ids = list(f.readlines())
23 | for task_id in task_ids:
24 | task = self.get_task(int(task_id))
25 | task["status"] = status
26 | task_out = self.update_task(**task)
27 | assert task_out["status"] == status
28 | yield task_out
29 | 
30 | @staticmethod
31 | def add_arguments(argparser):
32 | argparser.add_argument("filename", help="file with lines, each with a different task ID")
33 | argparser.add_argument("-s", "--status", help="should be ONGOING or SUBMITTED",
34 | choices=[ONGOING_STATUS, SUBMITTED_STATUS])
35 | ServerAccessor.add_arguments(argparser)
36 | 
37 | 
38 | def main(**kwargs):
39 | list(TaskStatusSetter(**kwargs).set_task_status(**kwargs))
40 | 
41 | 
42 | if __name__ == "__main__":
43 | argument_parser = argparse.ArgumentParser(description=desc)
44 | TaskStatusSetter.add_arguments(argument_parser)
45 | main(**vars(argument_parser.parse_args()))
46 | sys.exit(0)
47 | 
-------------------------------------------------------------------------------- /uccaapp/submit_tasks.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import json
4 | import sys
5 | 
6 | import requests
7 | 
8 | from ucca import convert
9 | from ucca import normalization, validation
10 | from uccaapp.api import ServerAccessor
11 | 
12 | desc = """Validate annotation/review tasks and submit those that pass validation"""
13 | 
14 | SUBMITTED_STATUS = "SUBMITTED"
15 | 
16 | class TaskSubmitter(ServerAccessor):
17 | 
18 | def __init__(self, **kwargs):
19 | super().__init__(**kwargs)
20 | 
21 | def submit_tasks(self, filename, log_file, **kwargs):
22 | del kwargs
23 | log_file = open(log_file, "w", encoding="utf-8")
24 | with open(filename) as f:
25 | task_ids = list(f.readlines())
26 | for task_id in task_ids:
27 | try:
28 | task_id = task_id.strip()
29 | task = self.get_user_task(int(task_id))
30 | if task['type'] not in ['ANNOTATION', 'REVIEW']:
31 | print(task_id, "NOT AN ANNOTATION/REVIEW TASK", file=log_file, 
sep="\t", flush=True) 32 | continue 33 | try: 34 | passage = next(iter(convert.from_json(task))) 35 | except ValueError as e: 36 | raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e 37 | # validate the task 38 | normalization.normalize(passage) 39 | validation_errors = list(validation.validate(passage, linkage=False)) 40 | if len(validation_errors) == 0: 41 | self.submit_task(**task) 42 | print(task_id, "SUBMITTED", file=log_file, sep="\t", flush=True) 43 | else: 44 | for error in validation_errors: 45 | print(task_id, error, file=log_file, sep="\t", flush=True) 46 | except requests.exceptions.HTTPError as e: 47 | print(task_id, "HTTP Request Error: "+str(e), file=log_file, sep="\t", flush=True) 48 | 49 | 50 | @staticmethod 51 | def add_arguments(argparser): 52 | argparser.add_argument("filename", help="file with lines, each with a different task ID") 53 | argparser.add_argument("-l","--log_file", help="output log file") 54 | 55 | ServerAccessor.add_arguments(argparser) 56 | 57 | 58 | def main(**kwargs): 59 | TaskSubmitter(**kwargs).submit_tasks(**kwargs) 60 | 61 | 62 | if __name__ == "__main__": 63 | argument_parser = argparse.ArgumentParser(description=desc) 64 | TaskSubmitter.add_arguments(argument_parser) 65 | main(**vars(argument_parser.parse_args())) 66 | sys.exit(0) 67 | 68 | 69 | -------------------------------------------------------------------------------- /uccaapp/tokenize_and_upload.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from argparse import ArgumentParser 4 | 5 | from ucca.convert import from_text, to_json 6 | from uccaapp.api import ServerAccessor 7 | 8 | desc = """ 9 | Read input file as one line per paragraph, where paragraphs are separated by multiple newlines and an optional 10 | . 11 | Tokenize and upload as submitted tokenization tasks, then create annotation tasks from them. 
12 | 
13 | Tokenization in Russian requires:
14 | pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
15 | """
16 | 
17 | 
18 | class TokenizerUploader(ServerAccessor):
19 | def __init__(self, user_id, source_id, project_id, lang=None, **kwargs):
20 | super().__init__(**kwargs)
21 | self.set_source(source_id)
22 | self.set_project(project_id)
23 | self.set_user(user_id)
24 | 
25 | def tokenize_and_upload(self, filename, log=None, lang=None, **kwargs):
26 | del kwargs
27 | log_h = open(log, "w", encoding="utf-8") if log else None
28 | prefix = os.path.splitext(os.path.basename(filename))[0].replace(" ", "_")
29 | with open(filename, encoding="utf-8") as f:
30 | for passage, text in from_text(f, passage_id=prefix, lang=lang, return_text=True):
31 | passage_out = self.create_passage(text=text, type="PUBLIC", source=self.source)
32 | task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
33 | user=self.user, passage=passage_out, manager_comment=passage.ID,
34 | user_comment="", parent=None, is_demo=False, is_active=True)
35 | tok_task_out = self.create_task(**task_in)
36 | tok_user_task_in = dict(tok_task_out)
37 | tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
38 | self.submit_task(**tok_user_task_in)
39 | task_in.update(parent=tok_task_out, type="ANNOTATION")
40 | ann_user_task_out = self.create_task(**task_in)
41 | print("Uploaded passage " + filename + " successfully.", file=sys.stderr)
42 | if log:
43 | print(passage.ID, passage_out["id"], tok_task_out["id"], ann_user_task_out["id"],
44 | file=log_h, sep="\t", flush=True)
45 | if log:
46 | log_h.close()
47 | 
48 | @staticmethod
49 | def add_arguments(argparser):
50 | argparser.add_argument("filename", help="text file with one line per paragraph, where paragraphs are separated "
51 | "by multiple newlines and an optional ")
52 | argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
53 | argparser.add_argument("--lang", choices=["ru", "en", "fr", "de"], default="ru",
54 | help="language two-letter code, for tokenizer")
55 | ServerAccessor.add_project_id_argument(argparser)
56 | ServerAccessor.add_source_id_argument(argparser)
57 | ServerAccessor.add_user_id_argument(argparser)
58 | ServerAccessor.add_arguments(argparser)
59 | 
60 | 
61 | def main(**kwargs):
62 | TokenizerUploader(**kwargs).tokenize_and_upload(**kwargs)
63 | 
64 | 
65 | if __name__ == "__main__":
66 | argument_parser = ArgumentParser(description=desc)
67 | TokenizerUploader.add_arguments(argument_parser)
68 | main(**vars(argument_parser.parse_args()))
69 | 
-------------------------------------------------------------------------------- /uccaapp/transfer_categories.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from ucca.convert import from_json
4 | from uccaapp.api import ServerAccessor
5 | 
6 | desc = """Download categories from one UCCA-App server and upload to another UCCA-App server"""
7 | 
8 | 
9 | 
10 | def add_arguments(argparser):
11 | argparser.add_argument("category_ids", nargs="+", type=int, help="IDs of categories to export and import")
12 | argparser.add_argument("--server-address-orig", help="UCCA-App origin server")
13 | argparser.add_argument("--email-orig", help="UCCA-App origin email")
14 | argparser.add_argument("--password-orig", help="UCCA-App origin password")
15 | argparser.add_argument("--server-address-target", help="UCCA-App target server")
16 | argparser.add_argument("--email-target", 
help="UCCA-App target email") 17 | argparser.add_argument("--password-target", help="UCCA-App target password") 18 | 19 | 20 | def main(args): 21 | server_accessor_origin = ServerAccessor(server_address=args.server_address_orig, 22 | email=args.email_orig, password=args.password_orig,auth_token=None,verbose=True) 23 | server_accessor_target = ServerAccessor(server_address=args.server_address_target, 24 | email=args.email_target, password=args.password_target,auth_token=None,verbose=True) 25 | for category_id in args.category_ids: 26 | #try: 27 | category_out = server_accessor_origin.get_category(category_id) 28 | server_accessor_target.create_category(**category_out) 29 | #except: 30 | # sys.stderr.write('failed writing category with ID='+str(category_id)) 31 | # continue 32 | 33 | 34 | 35 | 36 | 37 | if __name__ == "__main__": 38 | argument_parser = argparse.ArgumentParser(description=desc) 39 | add_arguments(argument_parser) 40 | main(argument_parser.parse_args()) 41 | -------------------------------------------------------------------------------- /uccaapp/upload_conllu_passages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | import argparse 5 | import re 6 | from glob import glob 7 | 8 | from ucca.convert import to_json, from_text 9 | from uccaapp.api import ServerAccessor 10 | 11 | try: 12 | from simplejson.scanner import JSONDecodeError 13 | except ImportError: 14 | from json.decoder import JSONDecodeError 15 | 16 | desc = """Upload passages from CoNLL-U files including complete tokenization, and create annotation task for each""" 17 | 18 | 19 | class ConlluPassageUploader(ServerAccessor): 20 | def __init__(self, user_id, annotation_user_id, source_id, project_id, **kwargs): 21 | super().__init__(**kwargs) 22 | self.set_source(source_id) 23 | self.set_project(project_id) 24 | self.set_user(user_id) 25 | self.annotation_user = dict(id=annotation_user_id) if annotation_user_id else self.user 26 | 27 | def upload_passages(self, filenames, **kwargs): 28 | del kwargs 29 | for pattern in filenames: 30 | filenames = sorted(glob(pattern)) 31 | if not filenames: 32 | raise IOError("Not found: " + pattern) 33 | for filename in sorted(filenames): 34 | with open(filename, encoding="utf-8") as f: 35 | external_id = None 36 | tokens = [] 37 | try: 38 | for line in f: 39 | line = line.strip() 40 | m = re.match(r"^# sent_id = (.*)", line) 41 | if m: 42 | external_id = m.group(1) 43 | elif line: 44 | tokens.append(line.split("\t")[1]) 45 | else: 46 | self.upload_passage(external_id, tokens) 47 | external_id = None 48 | tokens = [] 49 | if tokens: 50 | self.upload_passage(external_id, tokens) 51 | except (IndexError, AssertionError) as e: 52 | raise ValueError(filename) from e 53 | 54 | def upload_passage(self, external_id, tokens): 55 | assert external_id, "Missing external ID for passage %s" % tokens 56 | assert tokens, "Empty passage %s" % external_id 57 | passage_out = self.create_passage(text=" ".join(tokens), external_id=external_id, type="PUBLIC", 58 | source=self.source) 59 | task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user, 60 | passage=passage_out, manager_comment="External ID: "+external_id, 61 | user_comment="", parent=None, is_demo=False, is_active=True) 62 | tok_task_out = self.create_task(**task_in) 63 | tok_user_task_in = dict(tok_task_out) 64 | passage = list(from_text(tokens, tokenized=True))[0] 65 | tok_user_task_in.update(to_json(passage, 
66 |         self.submit_task(**tok_user_task_in)
67 |         task_in = dict(type="ANNOTATION", status="NOT_STARTED", project=self.project, user=self.annotation_user,
68 |                        passage=tok_task_out["passage"], manager_comment="External ID: " + external_id,
69 |                        user_comment=external_id, parent=tok_task_out, is_demo=False, is_active=True)
70 |         self.create_task(**task_in)
71 |         print("Uploaded passage " + external_id + " successfully")
72 | 
73 |     @staticmethod
74 |     def add_arguments(argparser):
75 |         argparser.add_argument("filenames", nargs="+", help="filename patterns of CoNLL-U files")
76 |         ServerAccessor.add_project_id_argument(argparser)
77 |         ServerAccessor.add_source_id_argument(argparser)
78 |         ServerAccessor.add_user_id_argument(argparser)
79 |         argparser.add_argument("--annotation-user-id", type=int, help="user ID for annotation tasks, if different")
80 |         ServerAccessor.add_arguments(argparser)
81 | 
82 | 
83 | def main(**kwargs):
84 |     ConlluPassageUploader(**kwargs).upload_passages(**kwargs)
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     argument_parser = argparse.ArgumentParser(description=desc)
89 |     ConlluPassageUploader.add_arguments(argument_parser)
90 |     main(**vars(argument_parser.parse_args()))
91 |     sys.exit(0)
92 | 
93 | 
--------------------------------------------------------------------------------
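For reference, a schematic CoNLL-U fragment of the input that upload_passages above consumes. Only three features are read: the "# sent_id = ..." comment, the second tab-separated column (FORM) of each token line, and the blank line that ends a sentence and triggers upload_passage; all other column values below are placeholders.

    # sent_id = example-001
    1	Hello	hello	INTJ	_	_	0	root	_	_
    2	world	world	NOUN	_	_	1	vocative	_	_

    # sent_id = example-002
    1	Bye	bye	INTJ	_	_	0	root	_	_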
/uccaapp/upload_streussel_passages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | 
6 | from ucca.convert import from_text, to_json
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Upload passages from streussel format files listed in an index file"""
10 | 
11 | 
12 | class StreusselPassageUploader(ServerAccessor):
13 |     def __init__(self, user_id, source_id, project_id, **kwargs):
14 |         super().__init__(**kwargs)
15 |         self.set_source(source_id)
16 |         self.set_project(project_id)
17 |         self.set_user(user_id)
18 | 
19 |     def upload_streussel_passage_file(self, filenames, log=None, **kwargs):
20 |         del kwargs
21 |         log_h = open(log, "w", encoding="utf-8") if log else None
22 |         with open(filenames, encoding="utf-8") as f_all:  # 'filenames' is a single index file listing one passage file per line
23 |             for filename in f_all:
24 |                 passage_text = ""
25 |                 external_id = "None given"
26 |                 filename = filename.strip()
27 |                 with open(filename, encoding="utf-8") as f:
28 |                     for line in f:
29 |                         line = line.strip()
30 |                         if not line:
31 |                             continue
32 |                         elif line.startswith("#"):
33 |                             fields = line.split()
34 |                             if len(fields) != 4 or fields[1] != "sent_id":
35 |                                 print("FORMAT ERROR in " + filename, file=sys.stderr)
36 |                             else:
37 |                                 external_id = fields[3].split("-")[1]
38 |                         else:
39 |                             passage_text = passage_text + " " + line
40 |                 passage_out = self.create_passage(text=passage_text.strip(), external_id=external_id, type="PUBLIC",
41 |                                                   source=self.source)
42 |                 task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
43 |                                user=self.user, passage=passage_out, manager_comment="External ID: " + external_id,
44 |                                user_comment="", parent=None, is_demo=False, is_active=True)
45 |                 tok_task_out = self.create_task(**task_in)
46 |                 tok_user_task_in = dict(tok_task_out)
47 | 
48 |                 passage = list(from_text(passage_text.split(), tokenized=True))[0]
49 |                 tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
50 | 
51 |                 self.submit_task(**tok_user_task_in)
52 |                 print("Uploaded passage " + filename + " successfully.", file=sys.stderr)
53 |                 if log:
54 |                     print(filename.split(".")[-2], passage_out["id"], tok_task_out["id"], file=log_h, sep="\t")
55 |         if log:
56 |             log_h.close()
57 | 
58 |     @staticmethod
59 |     def add_arguments(argparser):
60 |         argparser.add_argument("filenames", help="index file listing the passage files to convert and upload, one per line")
61 |         argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
62 |         ServerAccessor.add_project_id_argument(argparser)
63 |         ServerAccessor.add_source_id_argument(argparser)
64 |         ServerAccessor.add_user_id_argument(argparser)
65 |         ServerAccessor.add_arguments(argparser)
66 | 
67 | 
68 | def main(**kwargs):
69 |     StreusselPassageUploader(**kwargs).upload_streussel_passage_file(**kwargs)
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     argument_parser = argparse.ArgumentParser(description=desc)
74 |     StreusselPassageUploader.add_arguments(argument_parser)
75 |     main(**vars(argument_parser.parse_args()))
76 |     sys.exit(0)
77 | 
--------------------------------------------------------------------------------
/uccaapp/upload_task.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import logging
4 | import sys
5 | 
6 | from requests.exceptions import HTTPError
7 | import json
8 | 
9 | from ucca.convert import to_json, to_text
10 | from ucca.ioutil import get_passages_with_progress_bar
11 | from uccaapp.api import ServerAccessor
12 | 
13 | try:
14 |     from simplejson.scanner import JSONDecodeError
15 | except ImportError:
16 |     from json.decoder import JSONDecodeError
17 | 
18 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task"""
19 | 
20 | # https://github.com/omriabnd/UCCA-App/blob/master/UCCAApp_REST_API_Reference.pdf
21 | # ucca-demo.cs.huji.ac.il or ucca.staging.cs.huji.ac.il
22 | # upload the parse as a (completed) task:
23 | # 0. decide which project and user you want to assign it to
24 | # 1. POST passage (easy format)
25 | # 2. POST task x (of type tokenization)
26 | # 3. PUT task x (submit)
27 | # 4. POST task y (of type annotation with parent x; this is the more complicated format)
28 | # 5. PUT task y (submit)
29 | 
30 | 
31 | class TaskUploader(ServerAccessor):
32 |     def __init__(self, user_id, source_id, project_id, **kwargs):
33 |         super().__init__(**kwargs)
34 |         self.set_source(source_id)
35 |         self.set_project(project_id)
36 |         self.set_user(user_id)
37 | 
38 |     def upload_tasks(self, filenames, log=None, submit=True, existing_ids=None, upload=True, **kwargs):
39 |         del kwargs
40 |         log_h = open(log, "w", encoding="utf-8") if log else None
41 |         if existing_ids:
42 |             with open(existing_ids, "r", encoding="utf-8") as ids_h:
43 |                 ids = {old_passage_id: (passage_id, tok_id, ann_id)
44 |                        for (old_passage_id, passage_id, tok_id, ann_id)
45 |                        in map(str.split, ids_h)}
46 |         else:
47 |             ids = None
48 |         try:
49 |             for passage in get_passages_with_progress_bar(filenames, desc="Uploading"):
50 |                 logging.debug("Uploading passage %s" % passage.ID)
51 |                 task = self.upload_task(passage, log=log_h, submit=submit, ids=ids, upload=upload)  # pass upload through so --no-upload takes effect
52 |                 logging.debug("Submitted task %s" % task.get("id"))  # task dicts have no "id" when upload=False
53 |                 yield task
54 |         except HTTPError as e:
55 |             try:
56 |                 raise ValueError((e.response.json() if e.response else json.loads(e.args[0]))["detail"]) from e
57 |             except JSONDecodeError:
58 |                 raise ValueError(e.response.text) from e
59 |         finally:
60 |             if log:
61 |                 log_h.close()
62 | 
63 |     def upload_task(self, passage, log=None, submit=True, ids=None, upload=True):
64 |         if ids:
65 |             passage_id, tok_id, ann_id = ids[passage.ID]
66 |             passage_out = self.get_passage(passage_id)
67 |             tok_user_task_out = tok_task_out = self.get_user_task(tok_id)
68 |             ann_user_task_in = self.get_user_task(ann_id)
69 |         else:
70 |             passage_out = self.create_passage(text=to_text(passage, sentences=False)[0], type="PUBLIC",
71 |                                               source=self.source, external_id=passage.ID) if upload else passage
72 |             task_in = dict(type="TOKENIZATION", status="ONGOING", project=self.project, user=self.user,
73 |                            passage=passage_out, manager_comment=passage.ID, user_comment=passage.ID, parent=None,
74 |                            is_demo=False, is_active=True)
75 |             tok_task_out = self.create_task(**task_in) if upload else task_in
76 |             tok_user_task_in = dict(tok_task_out)
77 |             tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
78 |             tok_user_task_out = self.submit_task(**tok_user_task_in) if upload else tok_user_task_in
79 |             task_in.update(parent=tok_task_out, type="ANNOTATION")
80 |             ann_user_task_in = self.create_task(**task_in) if upload else task_in
81 |         ann_user_task_in.update(
82 |             to_json(passage, return_dict=True, tok_task=tok_user_task_out, all_categories=self.layer["categories"]))
83 |         ann_user_task_out = self.submit_task(**ann_user_task_in, submit=submit) if upload else ann_user_task_in
84 |         if log:
85 |             print(passage.ID, passage_out["id"], tok_task_out["id"], ann_user_task_out["id"],
86 |                   file=log, sep="\t", flush=True)
87 |         return ann_user_task_out
88 | 
89 |     @staticmethod
90 |     def add_arguments(argparser):
91 |         argparser.add_argument("filenames", nargs="+", help="passage file names to convert and upload")
92 |         argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
93 |         argparser.add_argument("--no-submit", action="store_false", dest="submit", help="do not submit annotation task")
94 |         argparser.add_argument("--existing-ids", help="use existing task IDs from file (output of --log); no creation")
95 |         argparser.add_argument("-n", "--no-upload", action="store_false", dest="upload", help="do not upload anything")
96 |         ServerAccessor.add_project_id_argument(argparser)
97 |         ServerAccessor.add_source_id_argument(argparser)
98 |         ServerAccessor.add_user_id_argument(argparser)
99 |         ServerAccessor.add_arguments(argparser)
100 | 
101 | 
102 | def main(**kwargs):
103 |     list(TaskUploader(**kwargs).upload_tasks(**kwargs))  # drain the generator so all passages are uploaded
104 | 
105 | 
106 | if __name__ == "__main__":
107 |     argument_parser = argparse.ArgumentParser(description=desc)
108 |     TaskUploader.add_arguments(argument_parser)
109 |     main(**vars(argument_parser.parse_args()))
110 |     sys.exit(0)
111 | 
--------------------------------------------------------------------------------
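Finally, a minimal sketch of driving upload_task.py from Python rather than the command line, following the five-step workflow documented in the comment block at the top of that file. The server address, credentials, and IDs are placeholders; test_files/standard3.xml is a sample passage shipped with the repository.

    from uccaapp.upload_task import TaskUploader

    # Placeholder credentials and IDs, as in the sketches above.
    uploader = TaskUploader(user_id=1, source_id=2, project_id=3,
                            server_address="https://ucca-app.example.com",
                            email="me@example.com", password="secret",
                            auth_token=None, verbose=False)
    # upload_tasks is a generator: it yields one annotation task per passage,
    # and the tab-separated ID log it writes can be reused via --existing-ids.
    for task in uploader.upload_tasks(["test_files/standard3.xml"], log="ids.tsv", submit=True):
        print(task["id"])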