├── .appveyor.yml
├── .github
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── .gitignore
├── .travis.yml
├── EVALUATION.md
├── LICENSE.txt
├── README.md
├── ci
│   └── deploy.sh
├── docs
│   ├── Makefile
│   ├── README
│   ├── api.rst
│   ├── conf.py
│   ├── index.rst
│   ├── requirements.txt
│   ├── scripts.rst
│   ├── short_defs.pdf
│   ├── toy.xml
│   ├── ucca_db.rst
│   └── uccaapp.rst
├── requirements.distances.txt
├── requirements.txt
├── requirements.visualize.txt
├── scripts
│   ├── __init__.py
│   ├── annotate.py
│   ├── convert_1_0_to_1_2.py
│   ├── convert_2_0_to_1_2.py
│   ├── convert_articles_and_reflexives.py
│   ├── count_parents_children.py
│   ├── count_tokens.py
│   ├── distances
│   │   ├── __init__.py
│   │   └── align.py
│   ├── evaluate_db.py
│   ├── evaluate_standard.py
│   ├── find_constructions.py
│   ├── fix_tokenization.py
│   ├── join_passages.py
│   ├── join_sdp.py
│   ├── load_word_vectors.py
│   ├── match_text.py
│   ├── normalize.py
│   ├── pickle_to_standard.py
│   ├── remove_br_tokens.py
│   ├── replace_tokens_by_dict.py
│   ├── set_external_id_offline.py
│   ├── site_pickle_to_standard.py
│   ├── site_to_standard.py
│   ├── site_to_text.py
│   ├── split_corpus.py
│   ├── standard_to_json.py
│   ├── standard_to_paragraphs.py
│   ├── standard_to_pickle.py
│   ├── standard_to_sentences.py
│   ├── standard_to_site.py
│   ├── standard_to_text.py
│   ├── statistics.py
│   ├── text_to_standard.py
│   ├── unique_roles.py
│   ├── validate.py
│   ├── visualize.py
│   └── visualize_as_text.py
├── setup.cfg
├── setup.py
├── test_files
│   ├── 120_parsed.xml
│   ├── implicit1.xml
│   ├── implicit1_ref.xml
│   ├── implicit2.xml
│   ├── implicit2_ref.xml
│   ├── site1.xml
│   ├── site2.xml
│   ├── site3.xml
│   ├── site4.xml
│   ├── site5.xml
│   ├── standard3.xml
│   ├── standard3_valid.xml
│   └── toy_bad.xml
├── ucca
│   ├── README.md
│   ├── __init__.py
│   ├── __version__.py
│   ├── constructions.py
│   ├── convert.py
│   ├── core.py
│   ├── diffutil.py
│   ├── evaluation.py
│   ├── ioutil.py
│   ├── layer0.py
│   ├── layer1.py
│   ├── normalization.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   ├── test_constructions.py
│   │   ├── test_convert.py
│   │   ├── test_core.py
│   │   ├── test_evaluation.py
│   │   ├── test_ioutil.py
│   │   ├── test_layer0.py
│   │   ├── test_layer1.py
│   │   ├── test_normalization.py
│   │   ├── test_textutil.py
│   │   ├── test_validation.py
│   │   └── test_visualization.py
│   ├── textutil.py
│   ├── validation.py
│   └── visualization.py
├── ucca_db
│   ├── __init__.py
│   ├── api.py
│   ├── download.py
│   └── upload.py
└── uccaapp
    ├── __init__.py
    ├── api.py
    ├── convert_and_evaluate.py
    ├── copy_categories.py
    ├── create_annotation_tasks.py
    ├── create_tokenization_tasks.py
    ├── download_task.py
    ├── evaluate.py
    ├── export_units_by_filter.py
    ├── get_passage_id.py
    ├── set_external_id.py
    ├── set_tasks_to_ongoing.py
    ├── submit_tasks.py
    ├── tokenize_and_upload.py
    ├── transfer_categories.py
    ├── upload_conllu_passages.py
    ├── upload_streussel_passages.py
    └── upload_task.py

--------------------------------------------------------------------------------
/.appveyor.yml:
--------------------------------------------------------------------------------
1 | os: Visual Studio 2015
2 | 
3 | platform: x64
4 | 
5 | environment:
6 |   MSVC_DEFAULT_OPTIONS: ON
7 |   MINICONDA: "C:\\Miniconda36-x64"
8 | 
9 | configuration: Release
10 | 
11 | init:
12 |   - cmd: cmake --version
13 |   - cmd: msbuild /version
14 | 
15 | install:
16 |   - cmd: git submodule update --init --recursive
17 |   - set PATH=%MINICONDA%;%MINICONDA%\Scripts;%PATH%
18 |   - conda config --set always_yes yes --set changeps1 no
19 |   - conda update -q conda
20 |   - conda info -a
21 |   - conda create -q -n test-env python=3.6 cython numpy matplotlib networkx pytest
22 |   - activate test-env
23 |   - pip install .
24 |   - python -m spacy download en_core_web_md
25 | 
26 | build: off
27 | 
28 | test_script:
29 |   - pytest --durations=0 -v ucca/tests
30 | 

--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ master ]
9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ['3.6', '3.7']
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v2
22 |     - name: Set up Python ${{ matrix.python-version }}
23 |       uses: actions/setup-python@v2
24 |       with:
25 |         python-version: ${{ matrix.python-version }}
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         python -m pip install flake8 pytest
30 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31 |         if [ -f requirements.visualize.txt ]; then pip install -r requirements.visualize.txt; fi
32 |         python -m spacy download en_core_web_md
33 |     - name: Lint with flake8
34 |       run: |
35 |         # stop the build if there are Python syntax errors or undefined names
36 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pytest
42 |     - name: Test TUPA
43 |       run: |
44 |         pip install -U --upgrade-strategy=only-if-needed tupa
45 |         python -m tupa test_files/standard3.xml -t test_files/standard3.xml -I 1 --max-words-external=50 --word-dim=10 --lstm-layer-dim=10 --embedding-layer-dim=10
46 | 

--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Upload Python Package
5 | 
6 | on:
7 |   release:
8 |     types: [created]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__*
2 | *.pyc

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: python
3 | python: 3.6
4 | env:
5 |   global:
6 |     - TWINE_USERNAME=danielh
7 |     - secure: QrZ/47sh/8WeeTLU37yfhW94bwO2ocsbMMIRebSS9Y+FssrCi9IbSuTp6NliXlJq17rozGtEf9alu9JetE8hnivACGJm0cz2/j3oYaeCxz8sbTpXeEr8JHiDk6MCfCD9VMrpeo04RBmI76BY1mwdCvxQSJEn/NtkI9jjSaqjLCLcaFWD7mTuYefxrPplROQJPu+jcW1snnubntuux1nRxULC3Ge/IRWb4OYajLJcPXiVsdleSNV9avLE2xIPTFZf4cwHpRxZslKgHeyCLk+JoDlL0qneB4UWB/SZF8CHoYvidPJDzG5NHAEgfxSqbUq3DRvgVAPqR0YoQd/MQbPLBN6v1aY2zbqHJtTS1xidnnYIs3gJWVAurx6WjkNc9QYwdN22EPmYDVquW2tZgvi2kHRoJY+gEYylJRY0jOzqYmZUV9WOZeeb2AzgXnVjQubEm0NSYCC3BYjkiSmwpDWTcr/HvCQ+9iOI1OD56F7B6oowzXBP0Z/IClMd9Pb3vs9cRr6di/Vf+ijjUeHQxyKHiv2R2mGnPuR8d/gR538xmbc/RlEt2tycMD25SBAeFdtlUfB5Si8llTSd6YktZzZhkHiaIPBYAVEbrK3832TM7B7sGAa8R6Y8gctP6ccE/kFpSdnFHuENgRu2VZBDx6q8UmkArRLbrCvzmbn658EySkc=
8 | jobs:
9 |   include:
10 |     - env: TEST=unit
11 |       install:
12 |         - pip install .
13 |         - pip install pytest
14 |         - pip install -r requirements.visualize.txt
15 |         - python -m spacy download en_core_web_md
16 |       script: pytest --durations=0 -v ucca/tests
17 |       deploy:
18 |         provider: script
19 |         script: ci/deploy.sh
20 |         on:
21 |           repo: huji-nlp/ucca
22 |           tags: true
23 |     - env: TEST=tupa
24 |       install:
25 |         - pip install -U --upgrade-strategy=only-if-needed tupa
26 |         - python -m spacy download en_core_web_md
27 |       script: python -m tupa test_files/standard3.xml -t test_files/standard3.xml -I 1 --max-words-external=50 --word-dim=10 --lstm-layer-dim=10 --embedding-layer-dim=10
28 | 

--------------------------------------------------------------------------------
/EVALUATION.md:
--------------------------------------------------------------------------------
1 | The evaluation process is done through the `evaluate` function, located in the [evaluation.py](ucca/evaluation.py) script.
2 | A wrapper script for `evaluation.py` is [evaluate_standard.py](scripts/evaluate_standard.py). For details on the arguments the script receives, run `evaluate_standard.py --help` at the prompt.
3 | The evaluation process compares the gold-standard annotation of a specific passage with the predicted annotation of that same passage.
4 | Both passages are `Passage` objects, i.e., objects containing the connected graph that represents the annotation of the passage.
5 | The evaluation includes recall, precision and F1 scores. These scores are computed by comparing each edge's labels and yield, i.e., the terminals under the edge's child node (if we view the annotation as a tree).
6 | We can also perform unlabeled evaluation, in which only each edge's yield is compared. It is important to know that a remote edge is ignored in the yield comparison, but is taken into account when comparing edge labels.
7 | Also, when there is an implicit node, edges going into it are evaluated by their parent's yield.
8 | 
9 | Now let us look more closely at the `evaluate` function:
10 | 
11 | The `evaluate` function receives the following input parameters:
12 | 1. guessed: Passage object to evaluate
13 | 2. ref: reference (gold standard) Passage object to compare to
14 | 3. converter: optional function to apply to passages before evaluation. One can choose to convert passages from the following formats to the `Passage` class:
15 |     - site XML
16 |     - standard XML
17 |     - conll (CoNLL-X dependency parsing shared task)
18 |     - sdp (SemEval 2015 semantic dependency parsing shared task)
19 | 4. verbose: whether to print the results
20 | 5. constructions: names of construction types to include in the evaluation. By construction we mean that the evaluation can be restricted to specific types of edges, for example just the Process and State edges.
21 |    The default constructions include the following edges:
22 |     - primary edges (`--constructions=primary`)
23 |     - remote edges (`--constructions=remote`)
24 |     - implicit edges (`--constructions=implicit`)
25 |    Other types of edges that can be included are:
26 |     - aspectual verbs (`--constructions=aspectual_verbs`)
27 |     - light verbs (`--constructions=light_verbs`)
28 |     - multi-word expressions (mwe) (`--constructions=mwe`)
29 |     - predicate nouns (`--constructions=pred_nouns`)
30 |     - predicate adjectives (`--constructions=pred_adjs`)
31 |     - expletives (`--constructions=expletives`)
32 | 
33 |    If the evaluation should be broken down by specific labels, a useful flag is `--constructions=categories`, which shows evaluation results per edge label (category).
34 | 6. units: whether to evaluate common units
35 | 7. fscore: whether to compute precision, recall and F1 score
36 | 8. errors: whether to print the mistakes (prints something similar to a confusion matrix). It is worth mentioning the `--as-table` option in the [evaluate_standard.py](scripts/evaluate_standard.py) script, which prints the confusion matrix as a table.
37 | 9. normalize: flatten centers and move common functions to the root before evaluation - this modifies the passages. There is an option to normalize the passages jointly; to normalize them separately, do so before calling `evaluate`.
38 | 10. eval_type: specific evaluation type(s) to limit to. One can choose any of the following evaluation types:
39 |     - labeled - both the labels of the edges and their yields are compared.
40 |     - unlabeled - only the edges' yields are compared.
41 |     - weak_labeled - certain types of labels are considered the same - for example, Process and State edges are considered the same and only their yields are compared, while Process and Participant are not considered the same.
42 | 11. ref_yield_tags: reference passage for fine-grained evaluation. In other words, it enables evaluating edges by label types that are not part of the UCCA labels, such as subject, object and so on. Nevertheless, the recall, precision and F1 scores will still be calculated based on the UCCA parsing.
43 | 
44 | The `evaluate` function returns a `Scores` object, which contains the recall, precision and F1 scores of the generated annotation.
45 | For example, by running [test_evaluation.py](ucca/tests/test_evaluation.py), the line [Score](ucca/tests/test_evaluation.py#L331) generates a `Scores` object. One of its attributes is `evaluators`, which comprises three `EvaluatorResults` objects:
46 |  - 'labeled'
47 |  - 'unlabeled'
48 |  - 'weak_labeled'
49 | 
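For illustration, here is a minimal sketch of running the evaluation programmatically. The file names are hypothetical placeholders, and the exact `Scores` accessors may differ slightly between versions:

```python
from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

# Hypothetical paths; any guessed/reference passage pair over the same text works.
guessed = file2passage("parsed/ucca_passage123.xml")
ref = file2passage("gold/ucca_passage123.xml")

scores = evaluate(guessed, ref, fscore=True, verbose=True)
print(scores.average_f1())  # aggregate labeled F1
```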
50 | Each of those `EvaluatorResults` objects may contain the results for any of the edge types mentioned above. By default, each contains the results for three types of edges:
51 |  - primary
52 |  - remote
53 |  - implicit
54 | 
55 | The results for each such edge type comprise:
56 |  - errors
57 |  - f1
58 |  - num_guessed
59 |  - num_matches
60 |  - num_only_guessed
61 |  - num_only_ref
62 |  - num_ref
63 |  - p (precision)
64 |  - r (recall)
65 | 
66 | For more details on the `evaluate` function, please see the following links:
67 | 
68 | [evaluate](https://ucca.readthedocs.io/en/latest/api/ucca.evaluation.evaluate.html#ucca.evaluation.evaluate)
69 | 
70 | [Scores](https://ucca.readthedocs.io/en/latest/api/ucca.evaluation.Scores.html#ucca.evaluation.Scores)
71 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Universal Conceptual Cognitive Annotation
2 | ============================
3 | UCCA is a linguistic framework for semantic annotation, whose details
4 | are available at [the following paper](http://aclweb.org/anthology/P13-1023):
5 | 
6 |     @inproceedings{abend2013universal,
7 |       author={Abend, Omri and Rappoport, Ari},
8 |       title={{U}niversal {C}onceptual {C}ognitive {A}nnotation ({UCCA})},
9 |       booktitle={Proc. of ACL},
10 |       month={August},
11 |       year={2013},
12 |       pages={228--238},
13 |       url={http://aclweb.org/anthology/P13-1023}
14 |     }
15 | 
16 | This Python 3 package provides an API to the UCCA annotation and tools to
17 | manipulate and process it. Its main features are conversion between different
18 | representations of UCCA annotations, and rich objects for all of the linguistic
19 | relations which appear in the theoretical framework (see `core`, `layer0`, `layer1`
20 | and `convert` modules under the `ucca` package).
21 | 
22 | The `scripts` package contains various utilities for processing passage files.
23 | 
24 | To parse text to UCCA graphs, use [TUPA, the UCCA parser](https://github.com/danielhers/tupa).
25 | 
26 | 
27 | Authors
28 | ------
29 | * Amit Beka
30 | * Daniel Hershcovich: dh@di.ku.dk
31 | 
32 | 
33 | License
34 | -------
35 | This package is licensed under the GPLv3 or later license.
36 | 
37 | [ ~ Dependencies scanned by PyUp.io ~ ]
38 | [![Build Status (Travis CI)](https://travis-ci.org/danielhers/ucca.svg?branch=master)](https://travis-ci.org/danielhers/ucca)
39 | [![Build Status (AppVeyor)](https://ci.appveyor.com/api/projects/status/github/danielhers/ucca?svg=true)](https://ci.appveyor.com/project/danielh/ucca)
40 | [![Build Status (Docs)](https://readthedocs.org/projects/ucca/badge/?version=latest)](http://ucca.readthedocs.io/en/latest/)
41 | [![PyPI version](https://badge.fury.io/py/UCCA.svg)](https://badge.fury.io/py/UCCA)
42 | 

--------------------------------------------------------------------------------
/ci/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -xe
3 | 
4 | pip install collective.checkdocs twine
5 | python setup.py checkdocs || exit 1
6 | python setup.py sdist bdist_wheel
7 | twine upload --skip-existing dist/*
8 | 
9 | 

--------------------------------------------------------------------------------
/docs/README:
--------------------------------------------------------------------------------
1 | The UCCA Corpus
2 | Version 1.1
3 | 29/12/2015
4 | ===============
5 | See updated guidelines at https://github.com/omriabnd/UCCA-Documents
6 | 
7 | This bundle contains 369 passages annotated according to the foundational layer of UCCA.
8 | The passages are given as XML files in a format which is described below. The total number of tokens
9 | in this corpus is 158771. The bundle also contains the annotation guidelines that were given to the annotators,
10 | a metadata file and a toy example XML.
11 | 
12 | The dataset is a part of the UCCA project developed in the NLP lab of the Hebrew University
13 | by Omri Abend and Ari Rappoport. The users of this dataset are kindly requested to cite the
14 | following publication:
15 | 
16 | "UCCA: A Novel Framework for Semantic Representation" / Omri Abend and Ari Rappoport, ACL 2013
17 | 
18 | Example passages can be graphically viewed through our web application (URL: vm-05.cs.huji.ac.il).
19 | Please refer to our website (URL: homepages.inf.ed.ac.uk/oabend/) or email (oabend@inf.ed.ac.uk)
20 | for regular updates on the UCCA project and available resources.
21 | 
22 | 
23 | Files included
24 | --------------
25 | 1. The passage files in XML format. File names are of the form "ucca_passageXXX.xml", where XXX
26 |    is the passage ID. Please see the UCCA resource webpage for a software package for reading and using
27 |    these files.
28 | 2. toy.xml: a toy example for explaining the UCCA XML format.
29 | 3. metadata: a file that contains some metadata for the passages. Specifically, it contains the source
30 |    of the text used (i.e., the Wikipedia article it was taken from), and the index of the annotator
31 |    that did the final proof-reading (it can be 2, 3 or 6).
32 | 4. guidelines.pdf: the annotation guidelines that were given to the annotators are summarized in
33 |    this file, named "UCCA in a nutshell". Concise definitions are available through the UCCA website
34 |    as well.
35 | 5. short_defs.pdf: a brief summary of the categories used by UCCA's foundational layer.
36 | 
37 | 
38 | XML format:
39 | -----------
40 | 
41 | The XML format allows easy extension with further layers. The top level of each XML file is composed of
42 | the layers annotated over the passage. Each layer has a unique ID and a set of nodes that it introduces.
43 | Each node specifies its outbound edges. The ID of a node is formatted as
44 | "<layer ID>.<node ID>".
45 | 
46 | Layer 0 is a special layer which specifies the tokens of the passage and their linear order. Its nodes
47 | are therefore the tokens themselves. Each node may either be of type "Word" or of type "Punctuation".
48 | The attribute "paragraph" specifies the number of the paragraph the terminal belongs to, while
49 | "paragraph_position" specifies the position of the terminal inside that paragraph. The attribute
50 | "text" specifies the written form of the terminal.
51 | 
52 | Layer 1 is the foundational layer of UCCA. Although non-terminal nodes of UCCA are generally untyped
53 | (their type is effectively determined by their inbound and outbound edges), the XML format does separate
54 | the nodes into three coarse-grained types:
55 | (1) FN (regular node)
56 | (2) PNCT (a node whose only descendant is a punctuation terminal)
57 | (3) LKG (a linkage node).
58 | We note that the node type does not provide any additional information, as it can be deterministically
59 | derived from the identity of its edges. It is therefore only used for easier readability.
60 | 
61 | Each node specifies its outbound edges through its "edge" elements. The ID of the node to which the edge is
62 | directed is specified by the attribute "toID".
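For illustration, a schematic fragment of a layer 1 node with one outbound edge (the element and
attribute names follow the description above; the IDs and the category "A" are made-up values):

    <node ID="1.2" type="FN">
        <edge type="A" toID="0.4"/>
    </node>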
The type of the edge may be either of the following:
63 | (1) any of the 13 categories of the foundational layer (abbreviated as A,P,S,D,C,E,N,R,T,H,L,F,G; see paper).
64 | (2) LR (Link Relation) or LA (Link Argument) for edges between a linkage node and its Linker or Parallel
65 |     Scenes, respectively.
66 | (3) Terminal for an edge to a word terminal.
67 | (4) U for an edge to a punctuation terminal.
68 | 
69 | A node in layer 1 may also be a leaf that represents an implicit unit. In this case, the node would have
70 | an attribute "implicit" with the value "true".
71 | 
72 | Layer 2 is left empty as a place holder where future layers (e.g., coreference, linkage type,
73 | information structure) can be represented. UCCA is designed to allow an open-ended set of layers
74 | to be annotated on top of a given passage.
75 | 
76 | 
77 | Toy example:
78 | ------------
79 | 
80 | The file toy.xml contains the annotation of a simple sentence "After Graduation, Mary moved to New York
81 | City". The terminals can be seen under the element <layer layerID="0">.
82 | 
83 | Consider now the nodes of the foundational layer (those under the element <layer layerID="1">).
84 | 
85 | Consider the node whose ID is "1.1". It has 5 children: one is a Linker (and therefore the edge leading
86 | to it bears the type L), two are Parallel Scenes (the edges leading to them bear the type H), and
87 | two are punctuation marks (the edges leading to them bear the type U).
88 | 
89 | Note that edges leading to terminals (i.e., to nodes in layer0) bear the type 'Terminal'.
90 | 
91 | Consider node "1.13". This node is of type LKG, which means it represents a linkage relation.
92 | It has three children: a Linker (i.e., the linkage relation; the edge has the tag 'LR'), and
93 | two linkage arguments (the edges bear the type 'LA').
94 | 
95 | 
96 | Licensing:
97 | ----------
98 | 
99 | The texts are taken from the English Wikipedia (http://en.wikipedia.org).
100 | The specific articles they were taken from are listed in the metadata file.
101 | The Wikipedia texts, as well as the UCCA annotation, are distributed under the
102 | "Attribution-ShareAlike 3.0 Unported" license (http://creativecommons.org/licenses/by-sa/3.0/).
103 | Please follow the link for exact details.
104 | 
105 | 
106 | ACKNOWLEDGEMENTS:
107 | -----------------
108 | 
109 | We would like to thank Tomer Eshet for partnering in developing the UCCA web application,
110 | and Amit Beka for his help with UCCA's development set and software tools. We would also like
111 | to thank our four annotators for their hard and thorough work.
112 | 

--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 | 
3 | API Documentation
4 | =================
5 | 
6 | Getting Started
7 | ---------------
8 | 
9 | To load UCCA passages from XML files, manipulate them and write to files, use the following code template::
10 | 
11 |     from ucca.ioutil import get_passages_with_progress_bar, write_passage
12 |     for passage in get_passages_with_progress_bar(filenames):
13 |         ...
14 |         write_passage(passage)
15 | 
16 | Each passage instantiates the :class:`ucca.core.Passage` class.
17 | 
18 | XML files can be downloaded from the various `UCCA corpora <https://github.com/UniversalConceptualCognitiveAnnotation>`__.
19 | 
20 | .. automodapi:: ucca.constructions
21 | .. automodapi:: ucca.convert
22 | .. automodapi:: ucca.core
23 | .. automodapi:: ucca.diffutil
24 | .. automodapi:: ucca.evaluation
25 | .. automodapi:: ucca.ioutil
26 | .. automodapi:: ucca.layer0
27 | .. automodapi:: ucca.layer1
28 | .. automodapi:: ucca.normalization
29 | .. automodapi:: ucca.textutil
30 | .. automodapi:: ucca.validation
31 | .. automodapi:: ucca.visualization
32 | 

--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. UCCA documentation master file, created by
2 |    sphinx-quickstart on Sun Oct 28 09:01:22 2018.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | 
7 | .. include:: ../README.rst
8 | 
9 | For more information about how to use this library, see the :ref:`api`.
10 | 
11 | .. toctree::
12 |    :maxdepth: 2
13 |    :caption: Contents:
14 | 
15 |    api
16 |    scripts
17 |    ucca_db
18 |    uccaapp
19 | 
20 | Indices and tables
21 | ==================
22 | 
23 | * :ref:`genindex`
24 | * :ref:`modindex`
25 | * :ref:`search`
26 | 

--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-automodapi>=0.12
2 | 

--------------------------------------------------------------------------------
/docs/scripts.rst:
--------------------------------------------------------------------------------
1 | .. _scripts:
2 | 
3 | Scripts Documentation
4 | =====================
5 | 
6 | .. automodapi:: scripts.annotate
7 | .. automodapi:: scripts.convert_1_0_to_1_2
8 | .. automodapi:: scripts.convert_2_0_to_1_2
9 | .. automodapi:: scripts.count_parents_children
10 | .. automodapi:: scripts.evaluate_db
11 | .. automodapi:: scripts.evaluate_standard
12 | .. automodapi:: scripts.find_constructions
13 | .. automodapi:: scripts.fix_tokenization
14 | .. automodapi:: scripts.join_passages
15 | .. automodapi:: scripts.join_sdp
16 | .. automodapi:: scripts.load_word_vectors
17 | .. automodapi:: scripts.normalize
18 | .. automodapi:: scripts.pickle_to_standard
19 | .. automodapi:: scripts.replace_tokens_by_dict
20 | .. automodapi:: scripts.site_pickle_to_standard
21 | .. automodapi:: scripts.site_to_standard
22 | .. automodapi:: scripts.site_to_text
23 | .. automodapi:: scripts.split_corpus
24 | .. automodapi:: scripts.standard_to_pickle
25 | .. automodapi:: scripts.standard_to_sentences
26 | .. automodapi:: scripts.standard_to_site
27 | .. automodapi:: scripts.standard_to_text
28 | .. automodapi:: scripts.statistics
29 | .. automodapi:: scripts.unique_roles
30 | .. automodapi:: scripts.validate
31 | .. automodapi:: scripts.visualize
32 | 

--------------------------------------------------------------------------------
/docs/short_defs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/docs/short_defs.pdf

--------------------------------------------------------------------------------
/docs/toy.xml:
--------------------------------------------------------------------------------
[164 lines of XML whose markup was stripped during text extraction and is not recoverable here; see the "Toy example" section of docs/README for a description of this file's content.]

--------------------------------------------------------------------------------
/docs/ucca_db.rst:
--------------------------------------------------------------------------------
1 | .. _ucca_db:
2 | 
3 | UCCA DB Documentation
4 | =====================
5 | 
6 | .. automodapi:: ucca_db.api
7 | .. automodapi:: ucca_db.download
8 | .. automodapi:: ucca_db.upload
9 | 

--------------------------------------------------------------------------------
/docs/uccaapp.rst:
--------------------------------------------------------------------------------
1 | .. _uccaapp:
2 | 
3 | UCCA-App API Documentation
4 | ==========================
5 | 
6 | .. automodapi:: uccaapp.api
7 | .. automodapi:: uccaapp.convert_and_evaluate
8 | .. automodapi:: uccaapp.copy_categories
9 | .. automodapi:: uccaapp.create_annotation_tasks
10 | .. automodapi:: uccaapp.create_tokenization_tasks
11 | .. automodapi:: uccaapp.download_task
12 | .. automodapi:: uccaapp.upload_conllu_passages
13 | .. automodapi:: uccaapp.upload_streussel_passages
14 | .. automodapi:: uccaapp.upload_task
15 | 
16 | 

--------------------------------------------------------------------------------
/requirements.distances.txt:
--------------------------------------------------------------------------------
1 | distances>=1.0
2 | zss>=1.2
3 | munkres>=1.0.12

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.15.0
2 | spacy==2.3.5
3 | requests>=2.18.4
4 | tqdm>=4.23.3
5 | 

--------------------------------------------------------------------------------
/requirements.visualize.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.3.3
2 | networkx>=2.0

--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/scripts/__init__.py

--------------------------------------------------------------------------------
/scripts/annotate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import argparse
4 | 
5 | from ucca.ioutil import write_passage, get_passages_with_progress_bar
6 | from ucca.textutil import annotate_all, is_annotated
7 | 
8 | desc = """Read UCCA standard format in XML or binary pickle, and write back with POS tags and dependency parse."""
9 | 
10 | 
11 | def main(args):
12 |     for passage in annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"),
13 |                                 replace=True, as_array=args.as_array, verbose=args.verbose):
14 |         assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
15 |         write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     argparser = argparse.ArgumentParser(description=desc)
20 |     argparser.add_argument("filenames", nargs="+", help="passage file names to annotate")
21 |     argparser.add_argument("-o", "--out-dir", default=".", help="directory to write annotated files to")
22 |     argparser.add_argument("-a", "--as-array", action="store_true", help="save annotations as array in passage level")
23 |     argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
24 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/convert_2_0_to_1_2.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from argparse import ArgumentParser
4 | 
5 | from ucca import layer1
6 | from ucca.ioutil import get_passages_with_progress_bar, write_passage
7 | from ucca.normalization import destroy, copy_edge
8 | 
9 | desc = """Convert the English Wiki corpus from version 2.0 to 1.2"""
10 | 
11 | 
12 | def replace_time_and_quantifier(edge):
13 |     if edge.tag in (layer1.EdgeTags.Time, layer1.EdgeTags.Quantifier):
14 |         edge.tag = layer1.EdgeTags.Adverbial if edge.parent.is_scene() else layer1.EdgeTags.Elaborator
15 |         if len(edge.parent.parents) == 1 and edge.parent.incoming[0].tag == edge.tag:
16 |             for e in edge.parent:
17 |                 copy_edge(e, parent=edge.parent.parents[0])
18 |             destroy(edge.parent)
19 |         return True
20 |     return False
21 | 
22 | 
23 | RULES = (replace_time_and_quantifier,)
24 | 
25 | 
26 | def convert_passage(passage, report_writer):
27 |     for rule in RULES:
28 |         for node in passage.layer(layer1.LAYER_ID).all:
29 |             for edge in node:
30 |                 parent = edge.parent
31 |                 parent_str = str(parent)
32 |                 if rule(edge):
33 |                     report_writer.writerow((rule.__name__, passage.ID, edge, parent_str, parent))
34 | 
35 | 
36 | def main(args):
37 |     os.makedirs(args.outdir, exist_ok=True)
38 |     with open(args.outfile, "w", encoding="utf-8", newline="") as f:
39 |         writer = csv.writer(f)
40 |         writer.writerow(("rule", "passage", "edge", "before", "after"))
41 |         for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
42 |             convert_passage(passage, report_writer=writer)
43 |             write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
44 |             f.flush()
45 |     print("Wrote '%s'" % args.outfile)
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     argparser = ArgumentParser(description=desc)
50 |     argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
51 |     argparser.add_argument("-o", "--outdir", default=".", help="output directory")
52 |     argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
53 |     argparser.add_argument("-O", "--outfile", default=os.path.splitext(argparser.prog)[0] + ".csv", help="log file")
54 |     argparser.add_argument("-v", "--verbose", action="store_true", help="print more information")
55 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/convert_articles_and_reflexives.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from argparse import ArgumentParser
4 | 
5 | from ucca import layer0, layer1
6 | from ucca.ioutil import get_passages_with_progress_bar, write_passage
7 | from ucca.normalization import fparent, copy_edge, traverse_up_centers
8 | 
9 | desc = """Change articles to Function, complying with UCCA v2 guidelines"""
10 | 
11 | ARTICLES = {
12 |     "de": ("der", "die", "das", "den", "dem", "des", "ein", "eine", "einen", "einem", "eines"),
13 |     "en": ("a", "an", "the"),
14 | }
15 | 
16 | REFLEXIVES = {
17 |     "en": ("herself", "himself", "itself", "themselves", "yourself", "yourselves", "myself", "ourselves", "oneself"),
18 | }
19 | 
20 | NONE = {
21 |     "de": ("kein", "keine", "keinen", "keines", "keiner", "keinem"),
22 | }
23 | 
24 | 
25 | def change_article_to_function(terminal, parent, lang):
26 |     if terminal.text.lower() in ARTICLES[lang]:
27 |         for edge in parent.incoming:
28 |             if not edge.attrib.get("remote"):
29 |                 # First, remove Functions to avoid duplicates
30 |                 if len(edge.categories) > 1:
31 |                     edge.categories = [category for category in edge.categories
32 |                                        if category.tag != layer1.EdgeTags.Function]
33 |                 # Then replace Elaborators to Functions
34 |                 for category in edge.categories:
35 |                     if category.tag == layer1.EdgeTags.Elaborator:
36 |                         category.tag = layer1.EdgeTags.Function
37 |                         return True
38 | 
39 | 
40 | def insert_reflexive_into_relation(terminal, parent, lang):
41 |     if terminal.text.lower() in REFLEXIVES.get(lang, ()):
42 |         for edge in parent.incoming:
43 |             if not edge.attrib.get("remote"):
44 |                 for category in edge.categories:
45 |                     if category.tag == layer1.EdgeTags.Adverbial:
46 |                         for grandparent in parent.parents:
47 |                             new_parent = grandparent.process or grandparent.state
48 |                             if new_parent is not None:
49 |                                 while any(layer1.EdgeTags.Center in e.tags for e in new_parent):
50 |                                     new_parent = next(e for e in new_parent if layer1.EdgeTags.Center in e.tags).child
51 |                                 parent.destroy()
52 |                                 new_parent.add(layer1.EdgeTags.Terminal, terminal)
53 |                                 return True
54 | 
55 | 
56 | def change_none_to_quantifier(terminal, parent, lang):
57 |     if terminal.text.lower() in NONE.get(lang, ()):
58 |         parent = traverse_up_centers(parent)
59 |         for edge in parent.incoming:
60 |             if not edge.attrib.get("remote"):
61 |                 for category in edge.categories:
62 |                     if category.tag == layer1.EdgeTags.Adverbial:
63 |                         for participant_edge in edge.parent:
64 |                             if layer1.EdgeTags.Participant in participant_edge.tags:
65 |                                 new_parent = participant_edge.child
66 |                                 if new_parent.start_position == terminal.position + 1:
67 |                                     if not new_parent.centers:
68 |                                         edges = new_parent.outgoing
69 |                                         center = new_parent.layer.add_fnode(new_parent, layer1.EdgeTags.Center)
70 |                                         for sub_edge in edges:
71 |                                             copy_edge(sub_edge, center)
72 |                                             new_parent.remove(sub_edge)
73 |                                     category.tag = layer1.EdgeTags.Quantifier
74 |                                     participant_edge.add(layer1.EdgeTags.Adverbial)
75 |                                     copy_edge(edge, new_parent)
76 |                                     edge.parent.remove(edge)
77 |                                     return True
78 | 
79 | 
80 | RULES = (change_article_to_function, insert_reflexive_into_relation, change_none_to_quantifier)
81 | 
82 | 
83 | def convert_passage(passage, lang, report_writer):
84 |     for rule in RULES:
85 |         for terminal in passage.layer(layer0.LAYER_ID).all:
86 |             parent = fparent(terminal)
87 |             if len(parent.children) == 1 and rule(terminal, parent, lang):
88 |                 report_writer.writerow((rule.__name__, passage.ID, terminal.ID, parent, fparent(terminal)))
89 | 
90 | 
91 | def main(args):
92 |     os.makedirs(args.outdir, exist_ok=True)
93 |     with open(args.outfile, "w", encoding="utf-8", newline="") as f:
94 |         writer = csv.writer(f)
95 |         writer.writerow(("rule", "passage", "terminal", "before", "after"))
96 |         for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
97 |             convert_passage(passage, lang=passage.attrib.get("lang", args.lang), report_writer=writer)
98 |             write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
99 |             f.flush()
100 |     print("Wrote '%s'" % args.outfile)
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     argparser = ArgumentParser(description=desc)
105 |     argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
106 |     argparser.add_argument("-l", "--lang", choices=ARTICLES, help="two-letter language code for article list")
107 |     argparser.add_argument("-o", "--outdir", default=".", help="output directory")
108 |     argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
109 |     argparser.add_argument("-O", "--outfile", default=os.path.splitext(argparser.prog)[0] + ".csv", help="log file")
110 |     argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
111 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/count_parents_children.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import argparse
4 | import sys
5 | from collections import Counter, defaultdict
6 | 
7 | from ucca import layer1
8 | from ucca.ioutil import get_passages_with_progress_bar
9 | 
10 | desc = """Parses XML files in UCCA standard format, and creates a histogram for the number of parents per unit."""
11 | 
12 | 
13 | def plot_histogram(counter, label, plot=None):
14 |     import matplotlib.pyplot as plt
15 |     plt.figure()
16 |     nums = list(counter.keys())
17 |     counts = list(counter.values())
18 |     indices = range(len(counts))
19 |     bars = plt.bar(indices, counts, align="center")
20 |     plt.xticks(indices, nums)
21 |     top = 1.06 * max(counts)
22 |     plt.ylim(min(counts), top)
23 |     plt.xlabel("number of %s" % label)
24 |     plt.ylabel("count")
25 |     for bar in bars:
26 |         count = bar.get_height()
27 |         plt.text(bar.get_x() + bar.get_width() / 2., count, "%.1f%%" % (100.0 * count / sum(counts)),
28 |                  ha="center", va="bottom")
29 |     if plot:
30 |         plt.savefig(plot + "histogram_" + label + ".png")
31 |     else:
32 |         plt.show()
33 | 
34 | 
35 | def plot_pie(counter, label, plot=None):
36 |     import matplotlib.pyplot as plt
37 |     plt.figure()
38 |     nums = list(counter.keys())
39 |     counts = list(counter.values())
40 |     plt.pie(counts, labels=nums, autopct="%1.1f%%",
41 |             counterclock=True, wedgeprops={"edgecolor": "white"})
42 |     plt.axis("equal")
43 |     if plot:
44 |         plt.savefig(plot + "pie_" + label + ".png")
45 |     else:
46 |         plt.show()
47 | 
48 | 
49 | def main(args):
50 |     histograms = defaultdict(Counter)
51 |     for passage in get_passages_with_progress_bar(args.filenames):
52 |         for node in passage.layer(layer1.LAYER_ID).all:
53 |             if node.ID != "1.1":  # Exclude the root node
54 |                 histograms["parents"][clip(node.incoming, 3)] += 1
55 |                 histograms["children"][clip(node.outgoing, 7)] += 1
56 | 
57 |     for label, counter in histograms.items():
58 |         handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout
59 |         handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
60 |         if handle is not sys.stdout:
61 |             handle.close()
62 |         try:
63 |             plot_histogram(counter, label, plot=args.plot)
64 |             plot_pie(counter, label, plot=args.plot)
65 |         except Exception:  # plotting is best-effort; skip on any failure
66 |             pass
67 | 
68 | 
69 | def clip(l, m):
70 |     return len(l) if len(l) <= m else ">%d" % m
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     argparser = argparse.ArgumentParser(description=desc)
75 |     argparser.add_argument("filenames", nargs="+", help="file names to analyze")
76 |     argparser.add_argument("-o", "--outfile", default="data/counts_",
77 |                            help="output file prefix for histogram")
78 |     argparser.add_argument("-p", "--plot", default="data/plot_",
79 |                            help="output file prefix for plot image file")
80 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/count_tokens.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import urllib.request
5 | from itertools import product
6 | 
7 | from ucca import layer0
8 | from uccaapp.download_task import TaskDownloader
9 | 
10 | 
11 | ## def main(output = None, comment = False, sentence_level = False, categories = (), tokens = (), tokens_mode = CONSECUTIVE,
12 | ##          case_insensitive = False, tokens_by_file = False, remotes = False, write = False, **kwargs):
13 | ##     if tokens_by_file:
14 | ##         with open(tokens[0]) as f:
15 | ##             token_lists = [line.strip().split() for line in f]
16 | ##     elif tokens != ():
17 | ##         token_lists = [tokens]
18 | ##     else:
19 | ##         token_lists = ()
20 | 
21 | ##     filtered_nodes = []
22 | ##     for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(write=False, **kwargs):
23 | ##         if sentence_level:
24 | ##             cur_passages = convert.split2sentences(passage)
25 | ##             all_nodes = [p.layer(layer1.LAYER_ID).heads[0] for p in cur_passages]
26 | ##         else:
27 | ##             all_nodes = list(passage.layer(layer1.LAYER_ID).all)
28 | ##         for node in all_nodes:
29 | ##             if comment and node.extra.get("remarks"):
filtered_nodes.append(("comment",node,task_id,user_id)) 31 | ## if remotes and len([n for n in node.outgoing if n.attrib.get("remote")]) > 0: 32 | ## filtered_nodes.append(("remotes", node, task_id, user_id)) 33 | ## if token_lists and not node.attrib.get("implicit"): 34 | ## for token_list in token_lists: 35 | ## unit_tokens = [t.text for t in node.get_terminals(punct=True)] 36 | ## if case_insensitive: 37 | ## unit_tokens = [x.lower() for x in unit_tokens] 38 | ## token_list = [x.lower() for x in token_list] 39 | ## if tokens_match(unit_tokens, token_list, tokens_mode): 40 | ## filtered_nodes.append(('TOKENS', node, task_id, user_id)) 41 | ## else: 42 | ## all_tags = [c.tag for edge in node for c in edge.categories] 43 | ## intersection = set(categories).intersection(all_tags) 44 | 45 | def count_tokens(**kwargs): 46 | output = [] 47 | for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(**kwargs): 48 | num_tokens = len(passage.layer(layer0.LAYER_ID).all) 49 | output.append((num_tokens,task_id,user_id)) 50 | return output 51 | 52 | def main(output=None, tokens=(), **kwargs): 53 | kwargs["write"] = False 54 | f = open(output, 'w', encoding="utf-8") if output else sys.stdout 55 | for num_tokens, task_id, user_id in count_tokens(**kwargs): 56 | print(str(num_tokens), task_id, user_id, file=f, sep="\t", flush=True) 57 | if output: 58 | f.close() 59 | 60 | 61 | if __name__ == "__main__": 62 | argument_parser = argparse.ArgumentParser() 63 | TaskDownloader.add_arguments(argument_parser) 64 | argument_parser.add_argument("--output", help="output file name") 65 | main(**vars(argument_parser.parse_args())) 66 | 67 | -------------------------------------------------------------------------------- /scripts/distances/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/scripts/distances/__init__.py -------------------------------------------------------------------------------- /scripts/evaluate_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The evaluation software for UCCA layer 1. 
4 | """ 5 | 6 | from argparse import ArgumentParser 7 | 8 | from ucca import convert, constructions 9 | from ucca.evaluation import evaluate 10 | from ucca_db import api 11 | 12 | 13 | def main(args): 14 | keys = [args.guessed, args.ref] 15 | xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \ 16 | api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys) 17 | guessed, ref = [convert.from_site(x) for x in xmls] 18 | if args.units or args.fscore or args.errors: 19 | evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors, 20 | constructions=args.constructions, verbose=True) 21 | 22 | 23 | if __name__ == '__main__': 24 | argparser = ArgumentParser(description="Evaluate passages on UCCA DB") 25 | argparser.add_argument("--db", "-d", required=True, dest="db_filename", help="the db file name") 26 | argparser.add_argument("--host", "--hst", help="the host name") 27 | group = argparser.add_mutually_exclusive_group() 28 | group.add_argument("-p", "--pid", type=int, help="the passage ID") 29 | group.add_argument("-x", "--from_xids", action="store_true", 30 | help="interpret the ref and the guessed parameters as Xids in the db") 31 | argparser.add_argument("--guessed", "-g", required=True, 32 | help="if a db is defined - the username for the guessed annotation; " 33 | "else - the xml file name for the guessed annotation") 34 | argparser.add_argument("-r", "--ref", required=True, 35 | help="if a db is defined - the username for the reference annotation; " 36 | "else - the xml file name for the reference annotation") 37 | argparser.add_argument("-u", "--units", action="store_true", 38 | help="the units the annotations have in common, and those each has separately") 39 | argparser.add_argument("-f", "--fscore", action="store_true", 40 | help="outputs the traditional P,R,F instead of the scene structure evaluation") 41 | argparser.add_argument("-e", "--errors", action="store_true", 42 | help="prints the error distribution according to its frequency") 43 | constructions.add_argument(argparser) 44 | main(argparser.parse_args()) 45 | -------------------------------------------------------------------------------- /scripts/find_constructions.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from argparse import ArgumentParser 4 | 5 | from ucca.constructions import extract_candidates, add_argument 6 | from ucca.ioutil import get_passages_with_progress_bar, external_write_mode 7 | 8 | 9 | def main(args): 10 | for passage in get_passages_with_progress_bar(args.passages): 11 | c2es = OrderedDict((c, [candidate.edge for candidate in candidates]) for c, candidates in 12 | extract_candidates(passage, constructions=args.constructions, verbose=args.verbose).items() 13 | if candidates) 14 | if any(c2es.values()): 15 | with external_write_mode(): 16 | if not args.verbose: 17 | print("%s:" % passage.ID) 18 | for construction, edges in c2es.items(): 19 | if edges: 20 | print(" %s:" % construction.description) 21 | for edge in edges: 22 | print(" %s [%s %s]" % (edge, edge.tag, edge.child)) 23 | print() 24 | 25 | 26 | if __name__ == "__main__": 27 | argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.") 28 | argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names") 29 | add_argument(argparser, False) 30 | argparser.add_argument("-v", "--verbose", 
action="store_true", help="print tagged text for each passage") 31 | main(argparser.parse_args()) 32 | -------------------------------------------------------------------------------- /scripts/join_passages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from collections import defaultdict 7 | 8 | import ucca.convert 9 | from ucca.ioutil import passage2file, get_passages 10 | 11 | desc = """Parses XML/pickle files in UCCA standard format, and writes a single passage. 12 | """ 13 | 14 | 15 | def main(args): 16 | os.makedirs(args.outdir, exist_ok=True) 17 | passages = list(get_passages(args.filenames)) 18 | if args.join_by_prefix: 19 | subsets = defaultdict(list) 20 | for passage in passages: 21 | subsets[passage.ID[:-3]].append(passage) 22 | else: 23 | subsets = {passages[0].ID: passages} 24 | for passage_id, subset in sorted(subsets.items()): 25 | print("Joining passages " + ", ".join(passage.ID for passage in subset), file=sys.stderr) 26 | joined = ucca.convert.join_passages(passages, passage_id=passage_id, remarks=args.remarks) 27 | outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, "pickle" if args.binary else "xml") 28 | print("Writing joined passage file '%s'..." % outfile, file=sys.stderr) 29 | passage2file(joined, outfile, binary=args.binary) 30 | 31 | 32 | if __name__ == '__main__': 33 | argparser = argparse.ArgumentParser(description=desc) 34 | argparser.add_argument("filenames", nargs="+", help="passage file names to join") 35 | argparser.add_argument("-o", "--outdir", default=".", help="output directory") 36 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 37 | argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs") 38 | argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)") 39 | argparser.add_argument("-j", "--join-by-prefix", action="store_true", 40 | help="join each set of passages whose IDs share all but the last 3 characters") 41 | main(argparser.parse_args()) 42 | -------------------------------------------------------------------------------- /scripts/join_sdp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import os 6 | import sys 7 | 8 | desc = """Combines several SDP parsed files to one. 
9 | """ 10 | 11 | 12 | def main(args): 13 | lines = [args.prefix + args.header + "\n"] 14 | for pattern in args.filenames: 15 | filenames = sorted(glob.glob(pattern)) 16 | if not filenames: 17 | raise IOError("Not found: " + pattern) 18 | for filename in filenames: 19 | base = os.path.basename(os.path.splitext(filename)[0]) 20 | lines.append(args.prefix + base + "\n") 21 | with open(filename, encoding="utf-8") as f: 22 | lines += f.readlines() 23 | f = sys.stdout if args.outfile is None else open(args.outfile, "w", encoding="utf-8") 24 | f.writelines(lines) 25 | if args.outfile is not None: 26 | f.close() 27 | 28 | 29 | if __name__ == '__main__': 30 | argparser = argparse.ArgumentParser(description=desc) 31 | argparser.add_argument("filenames", nargs="+", 32 | help="SDP file names to join") 33 | argparser.add_argument("-o", "--outfile", 34 | help="output filename (standard output if unspecified)") 35 | argparser.add_argument("-H", "--header", default="SDP 2015", 36 | help="first line in the file, not including prefix") 37 | argparser.add_argument("-p", "--prefix", default="#", 38 | help="prefix for comment lines") 39 | main(argparser.parse_args()) 40 | -------------------------------------------------------------------------------- /scripts/load_word_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | 5 | from ucca.textutil import get_word_vectors 6 | 7 | desc = """Load word vectors file to make sure it works.""" 8 | 9 | 10 | def main(args): 11 | for filename in args.filenames: 12 | vectors, dim = get_word_vectors(size=args.rows, dim=args.dim, filename=filename) 13 | print("Loaded %d rows, dim=%d" % (len(vectors), dim)) 14 | 15 | 16 | if __name__ == '__main__': 17 | argparser = argparse.ArgumentParser(description=desc) 18 | argparser.add_argument("filenames", nargs="+", help="word vector files to load") 19 | argparser.add_argument("-r", "--rows", type=int, help="maximum number of word vectors") 20 | argparser.add_argument("-d", "--dim", type=int, help="maximum dimension of word vectors") 21 | main(argparser.parse_args()) 22 | -------------------------------------------------------------------------------- /scripts/match_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | import sys 4 | from glob import glob 5 | from itertools import groupby 6 | from operator import attrgetter 7 | 8 | from tqdm import tqdm 9 | 10 | from ucca import layer0 11 | from ucca.ioutil import get_passages_with_progress_bar 12 | 13 | 14 | def gen_lines(filenames): 15 | for filename in glob(filenames) or [filenames]: 16 | with open(filename, encoding="utf-8") as f: 17 | try: 18 | for line in map(str.strip, f): 19 | if line and not line.startswith("#"): 20 | yield re.sub(r"\[\d+\]", "", line) # Remove numbers inside brackets 21 | except UnicodeDecodeError as e: 22 | raise IOError("Failed reading '%s'" % filename) from e 23 | 24 | 25 | class CandidateMatcher: 26 | def __init__(self, text): 27 | self.text = text 28 | self.char_map = {} 29 | no_space_chars = [] 30 | for i, char in enumerate(text): 31 | if not char.isspace(): 32 | self.char_map[len(no_space_chars)] = i 33 | no_space_chars.append(char) 34 | self.no_space_text = "".join(no_space_chars) 35 | 36 | def __call__(self, no_space_text): 37 | try: 38 | index = self.no_space_text.index(no_space_text) 39 | return self.text[self.char_map[index]:self.char_map[index + len(no_space_text) - 1] + 1] 40 | 
40 |         except ValueError:
41 |             return None
42 | 
43 | 
44 | def match_passage_text(passage, matchers, out):
45 |     passage_tokens = sorted(passage.layer(layer0.LAYER_ID).all, key=attrgetter("position"))
46 |     for paragraph, terminals in groupby(passage_tokens, key=attrgetter("paragraph")):
47 |         tokens = [terminal.text for terminal in terminals]
48 |         no_space_text = "".join(tokens)
49 |         match = next(filter(None, (matcher(no_space_text) for matcher in matchers)), "@@@" + " ".join(tokens))
50 |         print(passage.ID, match, sep="\t", file=out)
51 | 
52 | 
53 | def alternative_spellings(text):
54 |     yield text
55 | 
56 | 
57 | def main(args):
58 |     matchers = [CandidateMatcher(spelling) for line in tqdm(list(gen_lines(args.text)),
59 |                                                             desc="Indexing " + args.text, unit=" lines")
60 |                 for spelling in alternative_spellings(line)]
61 |     out = open(args.out, "w", encoding="utf-8") if args.out else sys.stdout
62 |     for p in get_passages_with_progress_bar(args.filenames, desc="Matching", converters={}):
63 |         match_passage_text(p, matchers, out)
64 |     out.close()
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     argparser = argparse.ArgumentParser(description="Match UCCA passages to original text and print aligned lines")
69 |     argparser.add_argument("text", help="file of text to match to")
70 |     argparser.add_argument("filenames", nargs="+", help="files or directories of UCCA passages to match")
71 |     argparser.add_argument("-o", "--out", default="text.tsv", help="output file")
72 |     argparser.add_argument("-l", "--lang", default="en", help="spaCy language")
73 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/normalize.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | 
4 | from ucca.ioutil import get_passages_with_progress_bar, write_passage
5 | from ucca.normalization import normalize
6 | 
7 | 
8 | def main(args):
9 |     if args.outdir:
10 |         os.makedirs(args.outdir, exist_ok=True)
11 |     for p in get_passages_with_progress_bar(args.filenames, desc="Normalizing", converters={}):
12 |         normalize(p, extra=args.extra)
13 |         write_passage(p, outdir=args.outdir, prefix=args.prefix, binary=args.binary, verbose=False)
14 | 
15 | 
16 | if __name__ == "__main__":
17 |     argparser = argparse.ArgumentParser(description="Normalize UCCA passages")
18 |     argparser.add_argument("filenames", nargs="+", help="files or directories to normalize")
19 |     argparser.add_argument("-o", "--outdir", default=".", help="output directory")
20 |     argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
21 |     argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
22 |     argparser.add_argument("-e", "--extra", action="store_true", help="extra normalization rules")
23 |     main(argparser.parse_args())

--------------------------------------------------------------------------------
/scripts/pickle_to_standard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import os
4 | import sys
5 | 
6 | from ucca.ioutil import file2passage, passage2file
7 | 
8 | desc = """Parses pickle files in UCCA standard format, and writes them in XML format.
9 | """ 10 | 11 | 12 | def main(args): 13 | for filename in args.filenames: 14 | sys.stderr.write("Reading passage '%s'...\n" % filename) 15 | passage = file2passage(filename) 16 | basename = os.path.splitext(os.path.basename(filename))[0] 17 | outfile = args.outdir + os.path.sep + basename + ".xml" 18 | sys.stderr.write("Writing file '%s'...\n" % outfile) 19 | passage2file(passage, outfile) 20 | 21 | 22 | if __name__ == '__main__': 23 | argparser = argparse.ArgumentParser(description=desc) 24 | argparser.add_argument('filenames', nargs='+', help="pickle file names to convert") 25 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 26 | main(argparser.parse_args()) 27 | -------------------------------------------------------------------------------- /scripts/remove_br_tokens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import ntpath 4 | import argparse 5 | import os 6 | from xml.etree.ElementTree import tostring 7 | 8 | from ucca import convert 9 | from ucca.ioutil import external_write_mode 10 | from ucca.ioutil import get_passages_with_progress_bar 11 | import xml.etree.ElementTree as ET 12 | 13 | desc = """Removes
<br> tokens from a standard XML."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for fn in args.filenames:
        tree = ET.parse(fn)
        root = tree.getroot()
        to_remove = []
        old_to_new_ID = {}

        for node in root.getiterator():
            if node.tag == 'layer' and node.attrib.get('layerID', None) == "0":
                layer0 = node
                break

        last_parag = "1"
        position_in_paragraph = 0
        position = 1
        for node in layer0.getiterator():
            if node.tag == 'node':
                new_ID = '0.' + str(position)
                old_to_new_ID[node.attrib['ID']] = new_ID
                node.attrib['ID'] = new_ID
                for e in node.iter():
                    if e.tag == 'attributes':
                        if e.attrib.get('text', None) in ['', '<br>']:
                            to_remove.append(node)
                        else:
                            position += 1
                            if e.attrib.get('paragraph', "0") != last_parag:
                                position_in_paragraph = 0
                                last_parag = e.attrib.get('paragraph', "0")
                            position_in_paragraph += 1
                            e.attrib['paragraph_position'] = str(position_in_paragraph)

        for node in to_remove:
            layer0.remove(node)

        # fixing layer1
        for node in root.getiterator():
            if node.tag == 'layer' and node.attrib.get('layerID', None) == "1":
                layer1 = node
                break

        for node in layer1.getiterator():
            if node.tag == 'edge':
                if node.attrib.get("toID", None) in old_to_new_ID.keys():
                    node.attrib["toID"] = old_to_new_ID[node.attrib["toID"]]

        P = convert.from_standard(root)
        xml_str = tostring(root).decode()
        site_filename = os.path.join(args.outdir, ntpath.basename(fn))
        f = open(site_filename, 'w')
        f.write(xml_str)
        f.close()


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="XML file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
    main(argparser.parse_args())
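The renumbering above hinges on mapping old terminal IDs to new consecutive ones before patching layer-1 edges. A minimal self-contained sketch of that idea, using a made-up token list rather than a real passage:

# Sketch: drop unwanted tokens and remap old terminal IDs to new consecutive
# IDs, as remove_br_tokens.py does for layer-0 nodes (toy data, not a passage).
tokens = [("0.1", "Hello"), ("0.2", "<br>"), ("0.3", "world")]
old_to_new_id = {}
kept = []
for old_id, text in tokens:
    if text in ("", "<br>"):
        continue  # removed tokens get no new ID; their nodes are dropped
    new_id = "0.%d" % (len(kept) + 1)
    old_to_new_id[old_id] = new_id
    kept.append((new_id, text))
assert old_to_new_id == {"0.1": "0.1", "0.3": "0.2"}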
-------------------------------------------------------------------------------- /scripts/replace_tokens_by_dict.py: --------------------------------------------------------------------------------
import argparse
import os
from glob import glob

desc = """Replaces the tokens according to a dictionary."""


def read_dictionary_from_file(filename):
    f = open(filename, encoding="utf-8")
    d = {}
    for line in f:
        fields = line.strip().split()
        d[fields[0]] = fields[1]
        d[fields[0].strip().encode('ascii', 'xmlcharrefreplace').decode()] = \
            fields[1].strip().encode('ascii', 'xmlcharrefreplace').decode()
    print(d)
    return d


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    replacement_dict = read_dictionary_from_file(args.dict)
    for pattern in args.filenames:
        for filename in sorted(glob(pattern)) or [pattern]:
            basename = os.path.basename(filename)
            with open(os.path.join(args.out_dir, basename), "w", encoding="utf-8") as outfile:
                with open(filename, encoding="utf-8") as infile:
                    xml_string = infile.read()
                for k, v in replacement_dict.items():
                    if args.whole_word:
                        xml_string = xml_string.replace("text=\"" + k + "\"", "text=\"" + v + "\"")
                    else:
                        xml_string = xml_string.replace(k, v)
                print(xml_string, file=outfile, end="")
    print("Done")


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to replace tokens in")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory for changed XMLs")
    argparser.add_argument("-d", "--dict",
                           help="filename to read the dictionary from. The file should have one line per entry,"
                                " in the format of <token> <replacement>")
    argparser.add_argument("-w", "--whole-word", action="store_true", help="replace whole word")
    main(argparser.parse_args())
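To illustrate the --whole-word flag above: without it, replacement is plain substring substitution, which can corrupt longer tokens. A small sketch with made-up XML text:

# Sketch of the two replacement modes in replace_tokens_by_dict.py (toy input).
xml = '<node text="cat"/><node text="catalog"/>'
k, v = "cat", "dog"
substring = xml.replace(k, v)  # also hits "catalog"
whole_word = xml.replace('text="%s"' % k, 'text="%s"' % v)  # exact token only
assert substring == '<node text="dog"/><node text="dogalog"/>'
assert whole_word == '<node text="dog"/><node text="catalog"/>'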
-------------------------------------------------------------------------------- /scripts/set_external_id_offline.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
import argparse
import os
import sys

from ucca.ioutil import get_passages_with_progress_bar, write_passage

desc = """Rename passages by a given mapping of IDs"""


def main(filename, input_filenames, outdir):
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as f:
        pairs = [line.strip().split() for line in f]
    old_to_new_id = {old_id: new_id for new_id, old_id in pairs}
    for passage in get_passages_with_progress_bar(input_filenames, desc="Renaming"):
        passage._ID = old_to_new_id[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)


if __name__ == "__main__":
    argument_parser = argparse.ArgumentParser(description=desc)
    argument_parser.add_argument("filename", help="file with lines of the form <new ID> <old ID>")
    argument_parser.add_argument("input_filenames", help="filename pattern or directory with input passages")
    argument_parser.add_argument("-o", "--outdir", default=".", help="output directory")
    main(**vars(argument_parser.parse_args()))
    sys.exit(0)
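Note the direction of the mapping file above: each line is <new ID> <old ID>, and the lookup is keyed by the old ID. A tiny sketch with invented IDs:

# Sketch of the inverted lookup built in set_external_id_offline.py (made-up IDs).
lines = ["123456 ex_1", "123457 ex_2"]
pairs = [line.strip().split() for line in lines]
old_to_new_id = {old_id: new_id for new_id, old_id in pairs}
assert old_to_new_id == {"ex_1": "123456", "ex_2": "123457"}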
-------------------------------------------------------------------------------- /scripts/site_pickle_to_standard.py: --------------------------------------------------------------------------------
import argparse
import os
import pickle
from glob import glob
from xml.etree.ElementTree import Element

import ucca.convert
from ucca.ioutil import write_passage

desc = """Parses pickle files containing XML in UCCA site format, and converts them to standard XML"""


def pickle_site2passage(filename):
    """Opens a pickle file containing XML in UCCA site format and returns its parsed Passage object"""
    with open(filename, "rb") as h:
        root = elem = pickle.load(h)
    while isinstance(elem, list):
        try:
            elem = next(e for e in elem if isinstance(e, (Element, list)))
        except StopIteration:
            raise ValueError("Cannot parse %s" % root)
    return ucca.convert.from_site(elem)


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    exceptions = []
    for pattern in args.filenames:
        for filename in sorted(glob(pattern)) or [pattern]:
            print("Reading '%s'..." % filename)
            try:
                passage = pickle_site2passage(filename)
                write_passage(passage, outdir=args.out_dir, binary=args.binary, basename=os.path.basename(filename))
            except ValueError as e:
                exceptions.append((filename, e))
    if exceptions:
        for filename, e in exceptions:
            print("'%s': %s" % (filename, e))


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="*", help="pickle file names to convert")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory")
    argparser.add_argument("-b", "--binary", action="store_true", help="output binary pickle")
    main(argparser.parse_args())
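The unwrapping loop in pickle_site2passage deserves a note: the pickled object may nest the site XML Element inside arbitrarily deep lists, so it descends list by list until an Element surfaces. A standalone sketch with synthetic nesting:

# Sketch of the list-unwrapping in pickle_site2passage (synthetic data).
from xml.etree.ElementTree import Element

elem = [["noise", [Element("root")]]]
while isinstance(elem, list):
    elem = next(e for e in elem if isinstance(e, (Element, list)))
assert isinstance(elem, Element) and elem.tag == "root"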
-------------------------------------------------------------------------------- /scripts/site_to_standard.py: --------------------------------------------------------------------------------
import argparse
import os
import sqlite3
from glob import glob
from xml.etree.ElementTree import ElementTree, fromstring

import ucca.convert
from ucca.ioutil import write_passage

desc = """Parses an XML in UCCA site format.

The input is given either as site-formatted XML files, or as a DB file
together with a passage ID and a user name, in which case the annotation
of the specified user for the specified passage is taken from the DB file.
The output is either standard format XML or a pickled Passage object.
"""


def site2passage(filename):
    """Opens a file and returns its parsed Passage object"""
    with open(filename, encoding="utf-8") as f:
        print("Reading '%s'..." % filename)
        return ucca.convert.from_site(ElementTree().parse(f))


def db2passage(handle, pid, user):
    """Gets the annotation of user to pid from the DB handle - returns a passage"""
    handle.execute("SELECT id FROM users WHERE username=?", (user,))
    uid = handle.fetchone()[0]
    handle.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? ORDER BY ts DESC", (pid, uid))
    return ucca.convert.from_site(fromstring(handle.fetchone()[0]))


def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    if args.filenames:
        passages = (site2passage(filename) for pattern in args.filenames
                    for filename in sorted(glob(pattern)) or [pattern])
    else:
        passages = (db2passage(sqlite3.connect(args.db).cursor(), pid, args.user) for pid in args.pids)
    for passage in passages:
        write_passage(passage, outdir=args.out_dir, binary=args.binary)


def check_illegal_combinations(args):
    if args.db and not (args.pids and args.user):
        argparser.error("Must specify a username and a passage ID when using DB file option")
    if (args.pids or args.user) and not args.db:
        argparser.error("Cannot use user and passage ID options without DB file")
    return args


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="*", help="XML file names to convert")
    argparser.add_argument("-d", "--db", help="DB file to get input from")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory for standard XML")
    argparser.add_argument("-b", "--binary", action="store_true", help="write output as binary pickle")
    argparser.add_argument("-p", "--pids", nargs="*", type=int, help="PassageIDs to query DB")
    argparser.add_argument("-u", "--user", help="Username for DB query")
    main(check_illegal_combinations(argparser.parse_args()))
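The DB path of site_to_standard.py expects a SQLite file with users and xmls tables as queried in db2passage. A throwaway in-memory sketch of that lookup; the schema here is assumed from the queries above, not taken from a real DB:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE users (id INTEGER, username TEXT)")
cur.execute("CREATE TABLE xmls (paid INTEGER, uid INTEGER, xml TEXT, ts INTEGER)")
cur.execute("INSERT INTO users VALUES (1, 'annotator')")
cur.execute("INSERT INTO xmls VALUES (101, 1, '<root/>', 1)")
cur.execute("SELECT id FROM users WHERE username=?", ("annotator",))
uid = cur.fetchone()[0]
# ORDER BY ts DESC returns the most recent annotation first, as in db2passage
cur.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? ORDER BY ts DESC", (101, uid))
assert cur.fetchone()[0] == "<root/>"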
-------------------------------------------------------------------------------- /scripts/site_to_text.py: --------------------------------------------------------------------------------
#! /usr/bin/python3

import argparse
import pickle
from xml.etree.ElementTree import ElementTree, fromstring

import psycopg2

import ucca.convert

desc = """Parses an XML in UCCA site format.

The input is given either as a site-formatted XML file, or as a DB file
together with a passage ID and a user name, in which case the annotation
of the specified user for the specified passage is taken from the DB file.
The output is either plain text or a pickled Passage object.
"""


def site2passage(filename):
    """Opens a file and returns its parsed Passage object"""
    with open(filename, encoding="utf-8") as f:
        etree = ElementTree().parse(f)
    return ucca.convert.from_site(etree)


def db2passage(handle, pid, user):
    """Gets the annotation of user to pid from the DB handle - returns a passage"""
    handle.execute("SET search_path to oabend")
    handle.execute("SELECT id FROM users WHERE username=%s", (user,))
    uid = handle.fetchone()[0]
    handle.execute("SELECT xml,ts FROM xmls WHERE paid=%s AND uid=%s " +
                   "ORDER BY ts DESC", (pid, uid))
    raw_xml, ts = handle.fetchone()
    # print('extracted passage from ' + str(ts))
    return ucca.convert.from_site(fromstring(raw_xml))


def main(args):
    # Checking for illegal combinations
    if args.db and args.filename:
        argparser.error("Only one source, XML or DB file, can be used")
    if (not args.db) and (not args.filename):
        argparser.error("Must specify one source, XML or DB file")
    if args.db and not (args.pid and args.user):
        argparser.error("Must specify a username and a passage ID when " +
                        "using DB file option")
    if (args.pid or args.user) and not args.db:
        argparser.error("Cannot use user and passage ID options without DB file")

    if args.filename:
        passage = site2passage(args.filename)
    else:
        conn = psycopg2.connect(host=args.host, database=args.db)
        c = conn.cursor()
        passage = db2passage(c, args.pid, args.user)

    if args.binary:
        with open(args.binary, "wb") as binf:
            pickle.dump(passage, binf)
    else:
        output = ucca.convert.to_text(passage, lang=args.lang)
        if args.outfile:
            with open(args.outfile, "w", encoding="utf-8") as outf:
                outf.write(output)
        else:
            print(output)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filename", nargs="?", help="XML file name to convert")
    argparser.add_argument("-o", "--outfile", help="output file for text")
    argparser.add_argument("-b", "--binary", help="output file for binary pickle")
    argparser.add_argument("-d", "--db", help="DB file to get input from")
    argparser.add_argument("--host", help="DB host server to get input from")
    argparser.add_argument("-p", "--pid", type=int, help="PassageID to query DB")
    argparser.add_argument("-u", "--user", help="Username for DB query")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
    main(argparser.parse_args())
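One easy-to-miss difference from the SQLite variant above: psycopg2 uses the %s paramstyle while sqlite3 uses ?, even though both drivers take the parameters as a sequence. A sketch of the same logical query in both styles (illustration only, not executed against a real server):

sqlite_query = "SELECT id FROM users WHERE username=?"      # sqlite3 paramstyle
postgres_query = "SELECT id FROM users WHERE username=%s"   # psycopg2 paramstyle
params = ("annotator",)  # passed the same way to cursor.execute in both cases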
-------------------------------------------------------------------------------- /scripts/split_corpus.py: --------------------------------------------------------------------------------
import argparse
import os
import re
from shutil import copyfile

desc = """Split a directory of files into "train", "dev" and "test" directories.
All files not in either "train" or "dev" will go into "test".
"""
TRAIN_DEFAULT = 300
DEV_DEFAULT = 34


# TEST on all the rest


def copy(src, dest, link=False):
    if link:
        try:
            os.symlink(src, dest)
        except (NotImplementedError, OSError):
            copyfile(src, dest)
    else:
        copyfile(src, dest)


def numeric(s):
    try:
        return int(re.findall("([0-9]+)", s)[-1])
    except (ValueError, IndexError) as e:
        raise ValueError("Cannot find numeric ID in '%s'" % s) from e


def not_split_dir(filename):
    return filename not in ("train", "dev", "test") and not filename.startswith(".")


def split_passages(directory, train, dev, link, quiet=False):
    filenames = sorted(filter(not_split_dir, os.listdir(directory)), key=numeric)
    assert filenames, "No files to split"
    assert train + dev <= len(filenames), "Not enough files to split: %d+%d>%d" % (train, dev, len(filenames))
    for subdirectory in "train", "dev", "test":
        os.makedirs(os.path.join(directory, subdirectory), exist_ok=True)
    print("%d files to split: %d/%d/%d" % (len(filenames), train, dev, len(filenames) - train - dev))
    print_format = "Creating link in %s to: " if link else "Copying to %s: "
    if not quiet:
        print(print_format % "train", end="", flush=True)
    for f in filenames[:train]:
        copy(os.path.join(directory, f), os.path.join(directory, "train", f), link)
        if not quiet:
            print(f, end=" ", flush=True)
    if not quiet:
        print()
        print(print_format % "dev", end="", flush=True)
    for f in filenames[train:train + dev]:
        copy(os.path.join(directory, f), os.path.join(directory, "dev", f), link)
        if not quiet:
            print(f, end=" ", flush=True)
    if not quiet:
        print()
        print(print_format % "test", end="", flush=True)
    for f in filenames[train + dev:]:
        copy(os.path.join(directory, f), os.path.join(directory, "test", f), link)
        if not quiet:
            print(f, end=" ", flush=True)
    if not quiet:
        print()


def main(args):
    split_passages(os.path.abspath(args.directory), args.train, args.dev, link=args.link, quiet=args.quiet)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("directory", default=".", nargs="?", help="directory to split (default: current directory)")
    argparser.add_argument("-t", "--train", type=int, default=TRAIN_DEFAULT,
                           help="size of train split (default: %d)" % TRAIN_DEFAULT)
    argparser.add_argument("-d", "--dev", type=int, default=DEV_DEFAULT,
                           help="size of dev split (default: %d)" % DEV_DEFAULT)
    argparser.add_argument("-l", "--link", action="store_true", help="create symbolic link instead of copying")
    argparser.add_argument("-q", "--quiet", action="store_true", help="less output")
    main(argparser.parse_args())
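split_passages orders files by the last number embedded in each name rather than lexicographically, which keeps passage 10 after passage 2. A standalone sketch of that sort key:

import re

def last_number(s):  # same idea as numeric() above
    return int(re.findall("([0-9]+)", s)[-1])

names = ["passage10.xml", "passage2.xml", "passage1.xml"]
assert sorted(names, key=last_number) == ["passage1.xml", "passage2.xml", "passage10.xml"]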
-------------------------------------------------------------------------------- /scripts/standard_to_json.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os

from ucca import convert
from ucca.ioutil import external_write_mode
from ucca.ioutil import get_passages_with_progress_bar

desc = """Parses XML files in UCCA standard format, and writes them in the new site format."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".json")
        with open(site_filename, "w", encoding="utf-8") as f:
            print("\n".join(convert.to_json(passage)), file=f)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % site_filename)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="XML file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
    main(argparser.parse_args())
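The conversion scripts in this directory share the same loop shape: make the output directory, iterate passages with a progress bar, and write one output file per passage ID. A generic sketch of that pattern; to_format here is a hypothetical stand-in for convert.to_json, convert.to_site and friends:

import os

def convert_all(passages, outdir, ext, to_format):
    # passages: iterable of (passage ID, passage) pairs; one output file each
    os.makedirs(outdir, exist_ok=True)
    for passage_id, passage in passages:
        with open(os.path.join(outdir, passage_id + ext), "w", encoding="utf-8") as f:
            f.write(to_format(passage))

convert_all([("504", None)], "converted", ".json", lambda p: "{}")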
-------------------------------------------------------------------------------- /scripts/standard_to_paragraphs.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
import sys
from itertools import count

from ucca.convert import split2paragraphs
from ucca.ioutil import passage2file, get_passages_with_progress_bar, external_write_mode
from ucca.normalization import normalize

desc = """Parses XML files in UCCA standard format, and writes a passage per paragraph."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for paragraph in split2paragraphs(
                passage, remarks=args.remarks, lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + paragraph.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for paragraph model")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    argparser.add_argument("-e", "--enumerate", action="store_true", help="set output paragraph ID by global order")
    argparser.add_argument("-N", "--no-normalize", dest="normalize", action="store_false",
                           help="do not normalize passages after splitting")
    argparser.add_argument("-v", "--verbose", action="store_true", help="print information about every split paragraph")
    main(argparser.parse_args())
-------------------------------------------------------------------------------- /scripts/standard_to_pickle.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
import sys

import argparse
import os
from tqdm import tqdm

from ucca.ioutil import file2passage, passage2file, external_write_mode

desc = """Parses XML files in UCCA standard format, and writes them in binary Pickle format."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(args.filenames, desc="Converting", unit=" passages"):
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..."
% outfile, file=sys.stderr) 25 | passage2file(passage, outfile, binary=True) 26 | 27 | 28 | if __name__ == '__main__': 29 | argparser = argparse.ArgumentParser(description=desc) 30 | argparser.add_argument('filenames', nargs='+', help="XML file names to convert") 31 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 32 | argparser.add_argument('-v', '--verbose', action="store_true", help="verbose output") 33 | main(argparser.parse_args()) 34 | -------------------------------------------------------------------------------- /scripts/standard_to_sentences.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from itertools import count 7 | from logging import warning 8 | 9 | from ucca.convert import split2sentences, split_passage 10 | from ucca.ioutil import passage2file, get_passages_with_progress_bar, external_write_mode 11 | from ucca.normalization import normalize 12 | from ucca.textutil import extract_terminals 13 | 14 | desc = """Parses XML files in UCCA standard format, and writes a passage per sentence.""" 15 | 16 | NUM_NODES_WARNING = 500 # Warn if a sentence has more than this many nodes 17 | 18 | 19 | class Splitter: 20 | def __init__(self, sentences, enum=False, suffix_format="%03d", suffix_start=0): 21 | self.sentences = sentences 22 | self.sentence_to_index = {} 23 | for i, sentence in enumerate(sentences): 24 | self.sentence_to_index.setdefault(sentence, []).append(i) 25 | self.enumerate = enum 26 | self.suffix_format = suffix_format 27 | self.suffix_start = suffix_start 28 | self.index = 0 29 | self.matched_indices = set() 30 | 31 | @classmethod 32 | def read_file(cls, filename, **kwargs): 33 | if filename is None: 34 | return None 35 | with open(filename, encoding="utf-8") as f: 36 | sentences = [line.strip() for line in f] 37 | return cls(sentences, **kwargs) 38 | 39 | def split(self, passage): 40 | ends = [] 41 | ids = [] 42 | token_lists = [] 43 | for terminal in extract_terminals(passage): 44 | token_lists.append([]) 45 | for terminals in token_lists if self.index is None else [token_lists[0]]: 46 | terminals.append(terminal) 47 | sentence = " ".join(t.text for t in terminals) 48 | if self.index is not None and self.index < len(self.sentences) and self.sentences[ 49 | self.index].startswith(sentence): # Try matching next sentence rather than shortest 50 | index = self.index if self.sentences[self.index] == sentence else None 51 | else: 52 | indices = self.sentence_to_index.get(sentence) 53 | index = self.index = indices.pop(0) if indices else None 54 | if index is not None: 55 | self.matched_indices.add(index) 56 | last_end = terminals[0].position - 1 57 | if len(terminals) > 1 and last_end and last_end not in ends: 58 | ends.append(last_end) 59 | ends.append(terminal.position) 60 | ids.append(str(index)) 61 | token_lists = [] 62 | self.index += 1 63 | break 64 | return split_passage(passage, ends, ids=ids if self.enumerate else None, 65 | suffix_format=self.suffix_format, suffix_start=self.suffix_start) 66 | 67 | 68 | def main(args): 69 | splitter = Splitter.read_file(args.sentences, enum=args.enumerate, 70 | suffix_format=args.suffix_format, suffix_start=args.suffix_start) 71 | os.makedirs(args.outdir, exist_ok=True) 72 | i = 0 73 | for passage in get_passages_with_progress_bar(args.filenames, "Splitting"): 74 | for sentence in splitter.split(passage) if splitter else split2sentences( 75 | passage, remarks=args.remarks, 
                lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if len(sentence.nodes) > NUM_NODES_WARNING:
                warning(f"Sentence {i} in passage {passage.ID} has {len(sentence.nodes)} > {NUM_NODES_WARNING} nodes")
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:", *[s for i, s in enumerate(splitter.sentences)
                                            if i not in splitter.matched_indices], sep="\n")


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-f", "--suffix-format", default="%03d", help="sentence number suffix format")
    argparser.add_argument("-i", "--suffix-start", type=int, default=0, help="start index for number suffix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    argparser.add_argument("-s", "--sentences", help="optional input file with sentence at each line to split by")
    argparser.add_argument("-e", "--enumerate", action="store_true", help="set each output sentence ID by global order")
    argparser.add_argument("-N", "--no-normalize", dest="normalize", action="store_false",
                           help="do not normalize passages after splitting")
    argparser.add_argument("-v", "--verbose", action="store_true", help="print information about every split sentence")
    main(argparser.parse_args())
-------------------------------------------------------------------------------- /scripts/standard_to_site.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
from xml.etree.ElementTree import tostring

from ucca import convert
from ucca.ioutil import external_write_mode
from ucca.ioutil import get_passages_with_progress_bar

desc = """Parses XML files in UCCA standard format, and writes them in the old site format."""


def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".xml")
        with open(site_filename, "w", encoding="utf-8") as f:
            print(tostring(convert.to_site(passage)).decode(), file=f)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % site_filename)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="XML file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
argparser.add_argument("-v", "--verbose", action="store_true", help="verbose output") 30 | main(argparser.parse_args()) 31 | -------------------------------------------------------------------------------- /scripts/standard_to_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import re 6 | from glob import glob 7 | 8 | from tqdm import tqdm 9 | 10 | from ucca.convert import to_text 11 | from ucca.ioutil import file2passage, get_passages_with_progress_bar 12 | 13 | desc = """Parses files in UCCA standard format, and writes as text files or a text file with a line per passage.""" 14 | 15 | 16 | def numeric(x): 17 | try: 18 | return tuple(map(int, re.findall("\d+", x))) 19 | except ValueError: 20 | return x 21 | 22 | 23 | def write_text(passage, f, sentences, lang, prepend_id=False): 24 | for line in to_text(passage, sentences=sentences, lang=lang): 25 | fields = [passage.ID, line] if prepend_id else [line] 26 | print(*fields, file=f, sep="\t") 27 | 28 | 29 | def main(args): 30 | os.makedirs(args.outdir, exist_ok=True) 31 | if args.join: 32 | out_file = os.path.join(args.outdir, args.join) 33 | with open(out_file, "w", encoding="utf-8") as f: 34 | for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting"): 35 | write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id) 36 | print("Wrote '%s'." % out_file) 37 | else: # one file per passage 38 | for pattern in args.filenames: 39 | for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"): 40 | passage = file2passage(filename) 41 | basename = os.path.splitext(os.path.basename(filename))[0] 42 | with open(os.path.join(args.outdir, basename + ".txt"), "w", encoding="utf-8") as f: 43 | write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id) 44 | 45 | 46 | if __name__ == "__main__": 47 | argparser = argparse.ArgumentParser(description=desc) 48 | argparser.add_argument("filenames", nargs="+", help="passage file names to convert") 49 | argparser.add_argument("-o", "--outdir", default=".", help="output directory") 50 | argparser.add_argument("-s", "--sentences", action="store_true", help="split to sentences using spaCy") 51 | argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model") 52 | argparser.add_argument("-j", "--join", help="write just one text file with this name, with one line per passage") 53 | argparser.add_argument("-p", "--prepend-id", action="store_true", help="prepend the passage ID to the output text") 54 | main(argparser.parse_args()) 55 | -------------------------------------------------------------------------------- /scripts/statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import Counter 5 | 6 | import pandas as pd 7 | 8 | from ucca import layer0, layer1 9 | from ucca.ioutil import get_passages_with_progress_bar 10 | 11 | desc = """Prints statistics on UCCA passages""" 12 | 13 | 14 | def main(args): 15 | df = pd.DataFrame(index=args.directories, columns=["sentences", "tokens", "nodes", "discontinuous", "reentrant", 16 | "implicit", "edges", "primary", "remote"]) 17 | df.fillna(0, inplace=True) 18 | for i, directory in enumerate(args.directories): 19 | row = df.loc[directory] 20 | for passage in 
get_passages_with_progress_bar(directory, desc=directory): 21 | l1 = passage.layer(layer1.LAYER_ID) 22 | non_terminals = [n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1] 23 | edges = {e for n in non_terminals for e in n} 24 | remote_counter = Counter(e.attrib.get("remote", False) for e in edges) 25 | row["sentences"] += 1 26 | row["tokens"] += len(passage.layer(layer0.LAYER_ID).all) 27 | row["nodes"] += len(non_terminals) 28 | row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous) 29 | row["reentrant"] += sum(1 for n in non_terminals if any(e.attrib.get("remote") for e in n.incoming)) 30 | row["edges"] += len(edges) 31 | row["primary"] += remote_counter[False] 32 | row["remote"] += remote_counter[True] 33 | row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit")) 34 | 35 | # Change to percentages 36 | df["discontinuous"] *= 100. / df["nodes"] 37 | df["reentrant"] *= 100. / df["nodes"] 38 | df["implicit"] *= 100. / df["nodes"] 39 | df["primary"] *= 100. / df["edges"] 40 | df["remote"] *= 100. / df["edges"] 41 | 42 | # Print 43 | if args.outfile: 44 | df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n") 45 | print("Saved to " + args.outfile) 46 | else: 47 | with pd.option_context("display.max_rows", None, "display.max_columns", None): 48 | print(df.T) 49 | 50 | 51 | if __name__ == '__main__': 52 | argparser = argparse.ArgumentParser(description=desc) 53 | argparser.add_argument("directories", nargs="+", help="directories to process") 54 | argparser.add_argument("-o", "--outfile", help="output file for statistics") 55 | main(argparser.parse_args()) 56 | -------------------------------------------------------------------------------- /scripts/text_to_standard.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import string 3 | from glob import glob 4 | 5 | from tqdm import tqdm 6 | 7 | from ucca import core, layer0, layer1 8 | from ucca.ioutil import write_passage 9 | 10 | PUNCTUATION = set(string.punctuation) 11 | 12 | 13 | def gen_lines(patterns): 14 | for pattern in patterns: 15 | for filename in glob(pattern) or [pattern]: 16 | with open(filename, encoding="utf-8") as f: 17 | for line in f: 18 | line = line.strip() 19 | if line: 20 | yield line 21 | 22 | 23 | def main(args): 24 | for i, line in enumerate(tqdm(gen_lines(args.filenames), unit=" lines", desc="Creating passages"), start=1): 25 | p = core.Passage(args.format % i) 26 | l0 = layer0.Layer0(p) 27 | layer1.Layer1(p) 28 | for tok in line.split(): 29 | l0.add_terminal(text=tok, punct=PUNCTUATION.issuperset(tok)) 30 | write_passage(p, outdir=args.out_dir, binary=args.binary, verbose=False) 31 | 32 | 33 | if __name__ == "__main__": 34 | argparser = argparse.ArgumentParser(description="Create unannotated passage files from tokenized and split text") 35 | argparser.add_argument("filenames", nargs="+", help="Input filenames containing tokenized and sentence-split text") 36 | argparser.add_argument("-o", "--out-dir", help="Directory to write output files to") 37 | argparser.add_argument("-f", "--format", default="1%04d0", help="String format for passage IDs") 38 | argparser.add_argument("-b", "--binary", action="store_true", help="Write Pickle files instead of XML") 39 | main(argparser.parse_args()) 40 | -------------------------------------------------------------------------------- /scripts/unique_roles.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import Counter 5 | 6 | from ucca import layer1 7 | from ucca.ioutil import get_passages_with_progress_bar 8 | 9 | desc = """Finds edge tags that are empirically always unique: occur at most once in edges per node 10 | """ 11 | 12 | 13 | def main(args): 14 | out = args.direction == "out" 15 | roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items() 16 | if isinstance(tag, str) and not name.startswith('__')) 17 | for passage in get_passages_with_progress_bar([args.directory]): 18 | for node in passage.layer(layer1.LAYER_ID).all: 19 | counts = Counter(edge.tag for edge in (node if out else node.incoming)) 20 | roles.difference_update(tag for tag, count in counts.items() if count > 1) 21 | 22 | lines = "\n".join(sorted(roles)) 23 | print(lines) 24 | if args.outfile: 25 | with open(args.outfile, "w", encoding="utf-8") as f: 26 | print(lines, file=f) 27 | 28 | 29 | if __name__ == '__main__': 30 | argparser = argparse.ArgumentParser(description=desc) 31 | argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process") 32 | argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data") 33 | argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)") 34 | main(argparser.parse_args()) 35 | -------------------------------------------------------------------------------- /scripts/validate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from multiprocessing import Pool 3 | 4 | import argparse 5 | 6 | from ucca.ioutil import get_passages_with_progress_bar, external_write_mode 7 | from ucca.normalization import normalize 8 | from ucca.validation import validate 9 | 10 | 11 | class Validator: 12 | def __init__(self, normalization=False, extra=False, linkage=True, multigraph=False, strict=False): 13 | self.normalization = normalization 14 | self.extra = extra 15 | self.linkage = linkage 16 | self.multigraph = multigraph 17 | self.strict = strict 18 | 19 | def validate_passage(self, passage): 20 | if self.normalization: 21 | normalize(passage, extra=self.extra) 22 | errors = list(validate(passage, linkage=self.linkage, multigraph=self.multigraph)) 23 | passage_id = passage.ID 24 | user_id = passage.attrib.get("userID") 25 | if user_id: 26 | passage_id += " " + user_id 27 | task_id = passage.attrib.get("annotationID") 28 | if task_id: 29 | passage_id += " " + task_id 30 | if self.strict: 31 | print_errors(passage_id, errors) 32 | return passage_id, errors 33 | 34 | 35 | def main(args): 36 | validator = Validator(args.normalize, args.extra, linkage=args.linkage, multigraph=args.multigraph, 37 | strict=args.strict) 38 | with Pool(10) as pool: 39 | errors = pool.map(validator.validate_passage, 40 | get_passages_with_progress_bar(args.filenames, desc="Validating", converters={})) 41 | errors = dict((k, v) for k, v in errors if v) 42 | if errors: 43 | if not args.strict: 44 | id_len = max(map(len, errors)) 45 | for passage_id, es in sorted(errors.items()): 46 | print_errors(passage_id, es, id_len) 47 | sys.exit(1) 48 | else: 49 | print("No errors found.") 50 | 51 | 52 | def print_errors(passage_id, errors, id_len=None): 53 | for i, e in enumerate(errors): 54 | with external_write_mode(): 55 | print("%-*s|%s" % (id_len or len(passage_id), "" if i else passage_id, e), flush=True) 56 | 57 | 58 | def check_args(parser, args): 59 | if args.extra and not 
args.normalize: 60 | parser.error("Cannot specify --extra without --normalize") 61 | return args 62 | 63 | 64 | if __name__ == "__main__": 65 | argparser = argparse.ArgumentParser(description="Validate UCCA passages") 66 | argparser.add_argument("filenames", nargs="+", help="files or directories to validate") 67 | argparser.add_argument("-S", "--strict", action="store_true", help="fail as soon as a violation is found") 68 | argparser.add_argument("-n", "--normalize", action="store_true", help="normalize before validation") 69 | argparser.add_argument("-e", "--extra", action="store_true", help="extra normalization rules") 70 | argparser.add_argument("--no-linkage", dest="linkage", action="store_false", help="skip linkage validations") 71 | argparser.add_argument("--multigraph", action="store_true", help="allow multiple edges with the same parent+child") 72 | main(check_args(argparser, argparser.parse_args())) 73 | -------------------------------------------------------------------------------- /scripts/visualize.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | 4 | from ucca import visualization, layer0 5 | from ucca.convert import split2sentences 6 | from ucca.ioutil import get_passages, get_passages_with_progress_bar, external_write_mode 7 | 8 | 9 | def print_text(args, text, suffix): 10 | if args.out_dir: 11 | with open(os.path.join(args.out_dir, suffix), "w") as f: 12 | print(text, file=f) 13 | else: 14 | with external_write_mode(): 15 | print(text) 16 | 17 | 18 | def main(args): 19 | if args.out_dir: 20 | os.makedirs(args.out_dir, exist_ok=True) 21 | if not args.tikz: 22 | import matplotlib 23 | matplotlib.use('Agg') 24 | to_stdout = (args.tikz or args.standoff) and not args.out_dir 25 | t = args.passages 26 | t = get_passages(t) if to_stdout else get_passages_with_progress_bar(t, desc="Visualizing") 27 | if args.sentences: 28 | t = (sentence for passage in t for sentence in split2sentences(passage)) 29 | for passage in t: 30 | if args.tikz: 31 | print_text(args, visualization.tikz(passage), passage.ID + ".tikz.txt") 32 | elif args.standoff: 33 | print_text(args, visualization.standoff(passage), passage.ID + ".ann") 34 | else: 35 | import matplotlib.pyplot as plt 36 | width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27 37 | plt.figure(passage.ID, figsize=(width, width * 10 / 19)) 38 | visualization.draw(passage, node_ids=args.node_ids) 39 | if args.out_dir: 40 | plt.savefig(os.path.join(args.out_dir, passage.ID + "." 
+ args.format))
                plt.close()
            else:
                plt.show()


if __name__ == "__main__":
    argparser = ArgumentParser(description="Visualize the given passages as graphs.")
    argparser.add_argument("passages", nargs="+", help="UCCA passages, given as xml/pickle file names")
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("-t", "--tikz", action="store_true", help="print tikz code rather than showing plots")
    group.add_argument("-s", "--standoff", action="store_true", help="print standoff code rather than showing plots")
    argparser.add_argument("-o", "--out-dir", help="directory to save figures in (otherwise displayed immediately)")
    argparser.add_argument("-i", "--node-ids", action="store_true", help="print node IDs on the graph nodes")
    argparser.add_argument("-f", "--format", choices=("png", "svg"), default="png", help="image format")
    argparser.add_argument("--sentences", action="store_true", help="split to sentences to avoid huge plots")
    main(argparser.parse_args())
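The hard-coded constants in the matplotlib branch above encode a simple sizing heuristic: figure width grows linearly with the number of layer-0 terminals (19 inches at 27 tokens), and height keeps a 19:10 aspect ratio. A sketch of the arithmetic:

num_terminals = 27  # number of layer-0 terminals in the passage
width = num_terminals * 19 / 27
figsize = (width, width * 10 / 19)
assert figsize == (19.0, 10.0)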
-------------------------------------------------------------------------------- /scripts/visualize_as_text.py: --------------------------------------------------------------------------------
from argparse import ArgumentParser

from ucca import convert
from ucca.constructions import add_argument
from ucca.convert import split2sentences
from ucca.ioutil import get_passages_with_progress_bar

# Script by Sriram Chaudhury


descr = {'Q': 'Unknown', 'T': 'Unknown', 'Terminal': 'Terminal_node', 'P': 'Process', 'S': 'State',
         'A': 'Participant', 'D': 'Adverbial', 'C': 'Center', 'E': 'Elaborator', 'N': 'Connector',
         'R': 'Relator', 'H': 'Parallel_Scene', 'L': 'Linker', 'G': 'Ground', 'F': 'Function',
         'U': 'Punctuation'}


def find_path(node, path):
    if len(node.parents) >= 1:
        if (node.tag != 'Word') and (node.tag != 'Punctuation'):
            path.append('-->(' + node.ftag + ':' + descr[node.ftag] + ')-->' + node.parents[0].ID)
        else:
            path.append(node.text + '(' + str(node.ID) + ')' + '--Terminal-->' + node.parents[0].ID)
    for j in node.parents:
        find_path(j, path)
    return path


def find_children(node, path, level):
    remote_found = 0
    for edge in node:
        if edge.attrib.get('remote'):
            t12 = edge
            remote_found = 1

    for ch in node.children:
        if (ch.tag != 'Word') and (ch.tag != 'Punctuation'):
            if remote_found and (ch.ID == t12.child.ID):
                path.append((ch.ftag, ch.ID + '*', level + 1, True))
            else:
                path.append((ch.ftag, ch.ID, level + 1, False))
            find_children(ch, path, level + 1)
        else:
            path.append((ch.text, ch.ID, level + 1, False))
    path.append('End')

    return path


def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        sen_no = 0
        for sen in t:
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))

            root = sen.nodes['1.1']
            first = 1
            tab_len = {}
            tab_len[0] = len('1.1')
            for i in root.children:
                print('\n')
                path = []
                level = 1
                path.append((i.ftag, i.ID, level, False))
                path = find_children(i, path, level)
                end = 0
                if first:
                    pstr = root.ID
                    first = 0
                else:
                    for k in range(0, tab_len[0]):
                        pstr = pstr + ' '
                for j in path:
                    if j == 'End':
                        print(pstr)
                        pstr = ''
                        end = 1
                        continue
                    rel = j[0]
                    nd = j[1]
                    tab = int(j[2])
                    remote = j[3]
                    if end:
                        q_mark = 0
                        for k in range(0, tab_len[tab - 1]):
                            if k == tab_len[q_mark]:
                                pstr = pstr + '.'
                                q_mark += 1
                            else:
                                pstr = pstr + ' '
                        end = 0
                    if rel in descr:
                        rel_desc = rel + ':' + descr[rel]
                    else:
                        rel_desc = rel
                    if remote:
                        pstr = pstr + '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr = pstr + '|-->(' + rel_desc + ')-->' + nd
                    tab_len[tab] = len(pstr)

            print('-----------------------------------\n')
            sen_no += 1


if __name__ == "__main__":
    argparser = ArgumentParser(description="Xml to conll and find the path of the word from UCCA xml file.")
    argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    add_argument(argparser, False)
    main(argparser.parse_args())
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
[metadata]
description-file = README.md
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
#!/usr/bin/env python

import sys

import os
import re
from glob import glob
from setuptools import setup, find_packages

from ucca.__version__ import VERSION

try:
    this_file = __file__
except NameError:
    this_file = sys.argv[0]
os.chdir(os.path.dirname(os.path.abspath(this_file)))

extras_require = {}
install_requires = []
for requirements_file in glob("requirements.*txt"):
    suffix = re.match(r"[^.]*\.(.*)\.?txt", requirements_file).group(1).rstrip(".")
    with open(requirements_file) as f:
        (extras_require.setdefault(suffix, []) if suffix else install_requires).extend(f.read().splitlines())

with open('README.md', encoding='utf-8') as f:
    long_description = f.read()

setup(name="UCCA",
      version=VERSION,
      install_requires=install_requires,
      extras_require=extras_require,
      description="Universal Conceptual Cognitive Annotation",
      long_description=long_description,
      long_description_content_type='text/markdown',
      author="Daniel Hershcovich",
      author_email="danielh@cs.huji.ac.il",
      url="https://github.com/huji-nlp/ucca",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Intended Audience :: Science/Research",
          "Programming Language :: Python :: 3.6",
          "Topic :: Text Processing :: Linguistic",
          "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
      ],
      packages=find_packages(),
      )
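setup.py derives pip extras from the requirements file names listed at the top of the repository: the middle part of requirements.<suffix>.txt becomes an extra, and the bare requirements.txt feeds install_requires. A sketch of that regex, with the expected outputs in the comments:

import re

for name in ["requirements.txt", "requirements.distances.txt", "requirements.visualize.txt"]:
    suffix = re.match(r"[^.]*\.(.*)\.?txt", name).group(1).rstrip(".")
    print(name, "->", repr(suffix))
# requirements.txt -> ''                    (goes to install_requires)
# requirements.distances.txt -> 'distances' (pip install ucca[distances])
# requirements.visualize.txt -> 'visualize' (pip install ucca[visualize])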
-------------------------------------------------------------------------------- /test_files/implicit1.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/implicit1.xml
-------------------------------------------------------------------------------- /test_files/implicit1_ref.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/implicit1_ref.xml
-------------------------------------------------------------------------------- /test_files/implicit2.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/implicit2.xml
-------------------------------------------------------------------------------- /test_files/site1.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/site1.xml
-------------------------------------------------------------------------------- /test_files/site2.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/site2.xml
-------------------------------------------------------------------------------- /test_files/site3.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/site3.xml
-------------------------------------------------------------------------------- /test_files/standard3.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/standard3.xml
-------------------------------------------------------------------------------- /test_files/toy_bad.xml: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/test_files/toy_bad.xml
-------------------------------------------------------------------------------- /ucca/README.md: --------------------------------------------------------------------------------
`ucca` package
====================

List of Modules
---------------
1. `constructions`: extracting linguistic constructions from text
1. `convert`: converting between UCCA objects and various formats
1. `core`: basic objects of UCCA relations: `Node`, `Edge`, `Layer` and `Passage`
1. `evaluation`: comparing passages and inspecting the differences
1. `ioutil`: reading and writing `Passage` objects
1. `layer0`: text layer objects: `Layer0` and `Terminal`
1. `layer1`: foundational layer objects: `Layer1`, `FoundationalNode`, `PunctNode` and `Linkage`
1. `normalization`: modifying `Passage`s to standardized conventions
`textutil`: text processing utilities, including NLP pipeline
15 | 1. `validation`: checks for validity of `Passage`s
16 | 1. `visualization`: draw `Passage` as graph
17 | 
18 | In addition, the `tests` package enables unit-testing.
19 | 
20 | Authors
21 | ------
22 | * Amit Beka: amit.beka@gmail.com
23 | * Daniel Hershcovich: danielh@cs.huji.ac.il
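A minimal usage sketch tying these modules together (an editorial illustration, not text from the repository; it uses the bundled test passage `test_files/standard3.xml`, but any passage in the standard XML format would do):

    from ucca import normalization, validation
    from ucca.evaluation import evaluate
    from ucca.ioutil import read_files_and_dirs

    # Read one passage from a standard-format XML file.
    passage = next(iter(read_files_and_dirs(["test_files/standard3.xml"])))
    normalization.normalize(passage)              # apply standardized conventions in place
    errors = list(validation.validate(passage))   # empty if the passage is valid
    score = evaluate(passage, passage)            # a passage compared to itself scores F1 = 1
    score.print()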
-------------------------------------------------------------------------------- /ucca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/ucca/__init__.py -------------------------------------------------------------------------------- /ucca/__version__.py: --------------------------------------------------------------------------------
1 | VERSION = "1.3.11"
2 | # noinspection PyBroadException
3 | try:
4 | from subprocess import check_output, DEVNULL
5 | GIT_VERSION = check_output(["git", "describe", "--tags", "--always"], stderr=DEVNULL).decode().strip().lstrip("v")
6 | except:
7 | GIT_VERSION = VERSION
8 | -------------------------------------------------------------------------------- /ucca/diffutil.py: --------------------------------------------------------------------------------
1 | import sys
2 | 
3 | from ucca.ioutil import passage2file
4 | 
5 | 
6 | def diff_passages(true_passage, pred_passage, write=False):
7 | """
8 | Debug method to print missing or mistaken attributes, nodes and edges
9 | """
10 | lines = list()
11 | if not true_passage._attrib.equals(pred_passage._attrib):
12 | lines.append("Passage attributes mismatch: %s, %s" %
13 | (true_passage._attrib, pred_passage._attrib))
14 | try:
15 | for lid, l1 in true_passage._layers.items():
16 | l2 = pred_passage.layer(lid)  # the layer with the same ID in the predicted passage
17 | if not l1._attrib.equals(l2._attrib):
18 | lines.append("Layer %s attributes mismatch: %s, %s" %
19 | (lid, l1._attrib, l2._attrib))
20 | except KeyError: # no layer with same ID found
21 | lines.append("Missing layer: %s, %s" %
22 | (true_passage._layers, pred_passage._layers))
23 | pred_ids = {node.extra.get("remarks", node.ID): node
24 | for node in pred_passage.missing_nodes(true_passage)}
25 | true_ids = {node.ID: node
26 | for node in true_passage.missing_nodes(pred_passage)}
27 | for pred_id, pred_node in list(pred_ids.items()):
28 | true_node = true_ids.get(pred_id)
29 | if true_node:
30 | pred_ids.pop(pred_id)
31 | true_ids.pop(pred_id)
32 | pred_edges = {edge.tag + "->" + edge.child.ID: edge for edge in
33 | pred_node.missing_edges(true_node)}
34 | true_edges = {edge.tag + "->" + edge.child.ID: edge for edge in
35 | true_node.missing_edges(pred_node)}
36 | intersection = set(pred_edges).intersection(set(true_edges))
37 | pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection}
38 | true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection}
39 | 
40 | node_lines = []
41 | if not pred_node._attrib.equals(true_node._attrib):
42 | node_lines.append(" Attributes mismatch: %s, %s" %
43 | (sorted(true_node._attrib.items()), sorted(pred_node._attrib.items())))
44 | if pred_edges:
45 | node_lines.append(" Mistake edges: %s" % ", ".join(pred_edges))
46 | if true_edges:
47 | node_lines.append(" Missing edges: %s" % ", ".join(true_edges))
48 | if node_lines:
49 | lines.append("For node " + pred_id + ":")
50 | lines.extend(node_lines)
51 | if pred_ids:
52 | lines.append("Mistake nodes: %s" % ", ".join(pred_ids))
53 | if true_ids:
54 | lines.append("Missing nodes: %s" % ", ".join(true_ids))
55 | if write and lines:
56 | outfile = "%s.xml" % true_passage.ID
57 | sys.stderr.write("Writing passage '%s'...\n" % outfile)
58 | passage2file(true_passage, outfile)
59 | outfile = "%s_pred.xml" % pred_passage.ID
60 | sys.stderr.write("Writing passage '%s'...\n" % outfile)
61 | passage2file(pred_passage, outfile)
62 | return "\n" + "\n".join(lines)
63 | -------------------------------------------------------------------------------- /ucca/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/ucca/tests/__init__.py -------------------------------------------------------------------------------- /ucca/tests/test_constructions.py: --------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | 
3 | import pytest
4 | 
5 | from ucca import textutil
6 | from ucca.constructions import CATEGORIES_NAME, DEFAULT, CONSTRUCTIONS, extract_candidates
7 | from .conftest import PASSAGES, loaded, loaded_valid, multi_sent, crossing, discontiguous, l1_passage, empty
8 | 
9 | """Tests the constructions module functions and classes."""
10 | 
11 | 
12 | def assert_spacy_not_loaded(*args, **kwargs):
13 | del args, kwargs
14 | assert False, "Should not load spaCy when passage is pre-annotated"
15 | 
16 | 
17 | def extract_and_check(p, constructions=None, expected=None):
18 | d = OrderedDict((construction, [candidate.edge for candidate in candidates]) for construction, candidates in
19 | extract_candidates(p, constructions=constructions).items() if candidates)
20 | if expected is not None:
21 | hist = {c.name: len(e) for c, e in d.items()}
22 | assert hist == expected, " != ".join(",".join(sorted(h)) for h in (hist, expected))
23 | 
24 | 
25 | @pytest.mark.parametrize("create, expected", (
26 | (loaded, {'P': 1, 'remote': 1, 'E': 3, 'primary': 15, 'U': 2, 'F': 1, 'C': 3, 'A': 1, 'D': 1, 'L': 2, 'mwe': 2,
27 | 'H': 5, 'implicit': 1, 'main_rel': 1}),
28 | (loaded_valid, {'P': 1, 'remote': 1, 'E': 3, 'primary': 15, 'U': 2, 'F': 1, 'C': 3, 'A': 1, 'D': 1, 'L': 2,
29 | 'mwe': 2, 'H': 5, 'implicit': 1, 'main_rel': 1}),
30 | (multi_sent, {'U': 4, 'P': 3, 'mwe': 2, 'H': 3, 'primary': 6, 'main_rel': 2}),
31 | (crossing, {'U': 3, 'P': 2, 'remote': 1, 'mwe': 1, 'H': 2, 'primary': 3, 'main_rel': 2}),
32 | (discontiguous, {'G': 1, 'U': 2, 'E': 2, 'primary': 13, 'P': 3, 'F': 1, 'C': 1, 'A': 3, 'D': 2,
33 | 'mwe': 6, 'H': 3, 'implicit': 3, 'main_rel': 2}),
34 | (l1_passage, {'P': 2, 'mwe': 4, 'H': 3, 'primary': 11, 'U': 2, 'A': 5, 'D': 1, 'L': 2, 'remote': 2, 'S': 1,
35 | 'implicit': 1, 'main_rel': 3}),
36 | 
37 | (empty, {}),
38 | ))
39 | def test_extract_all(create, expected):
40 | extract_and_check(create(), constructions=CONSTRUCTIONS, expected=expected)
41 | 
42 | 
43 | @pytest.mark.parametrize("create", PASSAGES)
44 | @pytest.mark.parametrize("constructions", (DEFAULT, [CATEGORIES_NAME]), ids=("default", CATEGORIES_NAME))
45 | def test_extract(create, constructions, monkeypatch):
46 | monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
47 | extract_and_check(create(), constructions=constructions)
48 | -------------------------------------------------------------------------------- /ucca/tests/test_ioutil.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import random
4 | from glob import glob
5 | 
6 | from ucca import layer0, layer1, convert, ioutil, diffutil 
7 | from .conftest import loaded, multi_sent, discontiguous, l1_passage 8 | 9 | """Tests the ioutil module functions and classes.""" 10 | 11 | 12 | def test_split2sentences(): 13 | """Tests splitting a passage by sentence ends. 14 | """ 15 | p = multi_sent() 16 | split = convert.split2sentences(p) 17 | assert len(split) == 3 18 | terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split] 19 | assert terms[0] == ["1", "2", "3", "."] 20 | assert terms[1] == ["5", "6", "."] 21 | assert terms[2] == ["8", ".", "10", "."] 22 | assert all(t.paragraph == 1 for s in split for t in s.layer(layer0.LAYER_ID).all) 23 | top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split] 24 | for t in top_scenes: 25 | assert len(t) == 1 26 | assert t[0].incoming[0].tag == layer1.EdgeTags.ParallelScene 27 | 28 | 29 | def test_split2paragraphs(): 30 | """Tests splitting a passage by paragraph ends. 31 | """ 32 | p = multi_sent() 33 | split = convert.split2paragraphs(p) 34 | assert len(split) == 2 35 | terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split] 36 | assert terms[0] == ["1", "2", "3", ".", "5", "6", "."] 37 | assert terms[1] == ["8", ".", "10", "."] 38 | assert all(t.paragraph == 1 for s in split for t in s.layer(layer0.LAYER_ID).all) 39 | top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split] 40 | assert len(top_scenes[0]) == 2 41 | assert len(top_scenes[1]) == 1 42 | for t in top_scenes: 43 | for n in t: 44 | assert n.incoming[0].tag == layer1.EdgeTags.ParallelScene 45 | 46 | 47 | @pytest.mark.parametrize("create", (loaded, multi_sent, discontiguous, l1_passage)) 48 | def test_split_join_sentences(create): 49 | p = create() 50 | split = convert.split2sentences(p, remarks=True) 51 | copy = convert.join_passages(split) 52 | diffutil.diff_passages(p, copy) 53 | assert p.equals(copy) 54 | 55 | 56 | @pytest.mark.parametrize("create", (loaded, multi_sent, discontiguous, l1_passage)) 57 | def test_split_join_paragraphs(create): 58 | p = create() 59 | split = convert.split2paragraphs(p, remarks=True) 60 | copy = convert.join_passages(split) 61 | diffutil.diff_passages(p, copy) 62 | assert p.equals(copy) 63 | 64 | 65 | def _test_passages(passages): 66 | for passage in passages: 67 | assert passage.layer(layer0.LAYER_ID).all, "No terminals in passage " + passage.ID 68 | assert len(passage.layer(layer1.LAYER_ID).all), "No non-terminals but the root in passage " + passage.ID 69 | 70 | 71 | def test_load_passage(): 72 | _test_passages(ioutil.read_files_and_dirs(glob(os.path.join("test_files", "standard3.xml")))) 73 | 74 | 75 | def test_load_multiple_passages(): 76 | """Test lazy-loading passages""" 77 | files = 3 * ["test_files/standard3.xml"] 78 | passages = ioutil.read_files_and_dirs(files) 79 | assert len(files) == len(list(passages)), "Should load one passage per file" 80 | assert len(files) == len(passages) 81 | _test_passages(passages) 82 | 83 | 84 | def test_shuffle_passages(): 85 | """Test lazy-loading passages and shuffling them""" 86 | files = 3 * ["test_files/standard3.xml"] 87 | passages = ioutil.read_files_and_dirs(files) 88 | random.shuffle(passages) 89 | assert len(files) == len(passages) 90 | _test_passages(passages) 91 | -------------------------------------------------------------------------------- /ucca/tests/test_layer0.py: -------------------------------------------------------------------------------- 1 | from ucca import core, layer0 2 | 3 | """Tests module layer0 functionality.""" 4 | 5 | 6 | def test_terminals(): 7 | """Tests 
:class:`layer0`.Terminal new and inherited functionality.""" 8 | p = core.Passage("1") 9 | layer0.Layer0(p) 10 | terms = [ 11 | layer0.Terminal(ID="0.1", root=p, 12 | tag=layer0.NodeTags.Word, 13 | attrib={"text": "1", 14 | "paragraph": 1, 15 | "paragraph_position": 1}), 16 | layer0.Terminal(ID="0.2", root=p, 17 | tag=layer0.NodeTags.Word, 18 | attrib={"text": "2", 19 | "paragraph": 2, 20 | "paragraph_position": 1}), 21 | layer0.Terminal(ID="0.3", root=p, 22 | tag=layer0.NodeTags.Punct, 23 | attrib={"text": ".", 24 | "paragraph": 2, 25 | "paragraph_position": 2}) 26 | ] 27 | 28 | p_copy = core.Passage("2") 29 | layer0.Layer0(p_copy) 30 | equal_term = layer0.Terminal(ID="0.1", root=p_copy, 31 | tag=layer0.NodeTags.Word, 32 | attrib={"text": "1", 33 | "paragraph": 1, 34 | "paragraph_position": 1}) 35 | unequal_term = layer0.Terminal(ID="0.2", root=p_copy, 36 | tag=layer0.NodeTags.Word, 37 | attrib={"text": "two", 38 | "paragraph": 2, 39 | "paragraph_position": 1}) 40 | 41 | assert [t.punct for t in terms] == [False, False, True] 42 | assert [t.text for t in terms] == ["1", "2", "."] 43 | assert [t.position for t in terms] == [1, 2, 3] 44 | assert [t.paragraph for t in terms] == [1, 2, 2] 45 | assert [t.para_pos for t in terms] == [1, 1, 2] 46 | assert not (terms[0] == terms[1]) 47 | assert not (terms[0] == terms[2]) 48 | assert not (terms[1] == terms[2]) 49 | assert terms[0] == terms[0] 50 | assert terms[0].equals(equal_term) 51 | assert not (terms[1].equals(unequal_term)) 52 | assert p.copy(layer0.LAYER_ID).equals(p) 53 | assert p_copy.copy(layer0.LAYER_ID).equals(p_copy) 54 | 55 | 56 | def test_layer0(): 57 | p = core.Passage("1") 58 | l0 = layer0.Layer0(p) 59 | t1 = l0.add_terminal(text="1", punct=False) 60 | l0.add_terminal(text="2", punct=True, paragraph=2) 61 | t3 = l0.add_terminal(text="3", punct=False, paragraph=2) 62 | assert [x[0] for x in l0.pairs] == [1, 2, 3] 63 | assert [t.para_pos for t in l0.all] == [1, 1, 2] 64 | assert l0.words == (t1, t3) 65 | assert p.copy(layer0.LAYER_ID).equals(p) 66 | -------------------------------------------------------------------------------- /ucca/tests/test_layer1.py: -------------------------------------------------------------------------------- 1 | from ucca import layer1 2 | from .conftest import l1_passage, discontiguous 3 | 4 | """Tests layer1 module functionality and correctness.""" 5 | 6 | 7 | def test_creation(): 8 | p = l1_passage() 9 | head = p.layer("1").heads[0] 10 | assert [x.tag for x in head] == ["L", "H", "H", "L", "H", "U"] 11 | assert [x.child.position for x in head.children[0]] == [1] 12 | assert [x.tag for x in head.children[1]] == ["P", "A", "U", "A"] 13 | assert [x.child.position for x in head.children[1].children[0]] == [2, 3, 4, 5] 14 | assert [x.child.position for x in head.children[1].children[1]] == [6, 7, 8, 9] 15 | assert [x.child.position for x in head.children[1].children[2]] == [10] 16 | assert (head.children[1][3].attrib.get("remote")) 17 | 18 | 19 | def test_fnodes(): 20 | p = l1_passage() 21 | l0 = p.layer("0") 22 | l1 = p.layer("1") 23 | 24 | terms = l0.all 25 | head, lkg1, lkg2 = l1.heads 26 | link1, ps1, ps2, link2, ps3, punct2 = head.children 27 | p1, a1, punct1 = [x.child for x in ps1 if not x.attrib.get("remote")] 28 | a2, d2 = [x.child for x in ps2 if not x.attrib.get("remote")] 29 | p3, a3, a4 = ps3.children 30 | 31 | assert lkg1.relation == link1 32 | assert lkg1.arguments == [ps1] 33 | assert ps2.process == p1 34 | assert ps1.participants == [a1, d2] 35 | assert ps3.participants == [a3, a4] 36 | 37 | 
assert ps1.get_terminals() == terms[1:10]
38 | assert ps1.get_terminals(punct=False, remotes=True) == terms[1:9] + terms[14:15]
39 | assert ps1.end_position == 10
40 | assert ps2.start_position == 11
41 | assert ps3.start_position == 17
42 | assert a4.start_position == -1
43 | 
44 | assert ps1.fparent == head
45 | assert d2.fparent == ps2
46 | 
47 | 
48 | def test_layer1():
49 | p = l1_passage()
50 | l1 = p.layer("1")
51 | 
52 | head, lkg1, lkg2 = l1.heads
53 | link1, ps1, ps2, link2, ps3, punct2 = head.children
54 | p1, a1, punct1 = [x.child for x in ps1 if not x.attrib.get("remote")]
55 | 
56 | assert l1.top_scenes == [ps1, ps2, ps3]
57 | assert l1.top_linkages == [lkg1, lkg2]
58 | 
59 | # Changing the process tag of scene #1 to A and back, validate that
60 | # top scenes are updated accordingly
61 | p_edge = [e for e in ps1 if e.tag == layer1.EdgeTags.Process][0]
62 | p_edge.tag = layer1.EdgeTags.Participant
63 | assert l1.top_linkages == [lkg2]
64 | p_edge.tag = layer1.EdgeTags.Process
65 | assert l1.top_scenes == [ps1, ps2, ps3]
66 | assert l1.top_linkages == [lkg1, lkg2]
67 | 
68 | 
69 | def test_str():
70 | p = l1_passage()
71 | assert [str(x) for x in p.layer("1").heads] == \
72 | ["[L 1] [H [P 2 3 4 5] [A 6 7 8 9] [U 10] "
73 | "... [A* 15] ] [H [P* 2 3 4 5] [A 11 12 "
74 | "13 14] [D 15] ] [L 16] [H [A IMPLICIT] [S "
75 | "17 18] [A 19] ] [U 20] ",
76 | "1.2-->1.3", "1.10-->1.7,1.11"]
77 | 
78 | 
79 | def test_destroy():
80 | p = l1_passage()
81 | l1 = p.layer("1")
82 | 
83 | head, lkg1, lkg2 = l1.heads
84 | link1, ps1, ps2, link2, ps3, punct2 = head.children
85 | p1, a1, punct1 = [x.child for x in ps1 if not x.attrib.get("remote")]
86 | 
87 | ps1.destroy()
88 | assert head.children == [link1, ps2, link2, ps3, punct2]
89 | assert p1.parents == [ps2]
90 | assert not a1.parents
91 | assert not punct1.parents
92 | 
93 | 
94 | def test_discontiguous():
95 | """Tests FNode.discontiguous and FNode.get_sequences"""
96 | p = discontiguous()
97 | l1 = p.layer("1")
98 | head = l1.heads[0]
99 | ps1, ps2, ps3 = head.children
100 | d1, a1, p1, f1 = ps1.children
101 | e1, c1, e2 = d1.children
102 | d2, g2, p2, a2 = ps2.children
103 | t14, p3, a3 = ps3.children
104 | 
105 | # Checking discontiguous property
106 | assert not ps1.discontiguous
107 | assert not d1.discontiguous
108 | assert not e1.discontiguous
109 | assert not e2.discontiguous
110 | assert c1.discontiguous
111 | assert a1.discontiguous
112 | assert p1.discontiguous
113 | assert not f1.discontiguous
114 | assert ps2.discontiguous
115 | assert not p2.discontiguous
116 | assert not a2.discontiguous
117 | assert not ps3.discontiguous
118 | assert not a3.discontiguous
119 | 
120 | # Checking get_sequences -- should return only non-remote, non-implicit
121 | # stretches of terminals
122 | assert ps1.get_sequences() == [(1, 10)]
123 | assert d1.get_sequences() == [(1, 4)]
124 | assert e1.get_sequences() == [(1, 1)]
125 | assert e2.get_sequences() == [(3, 3)]
126 | assert c1.get_sequences() == [(2, 2), (4, 4)]
127 | assert a1.get_sequences() == [(5, 5), (8, 8)]
128 | assert p1.get_sequences() == [(6, 7), (10, 10)]
129 | assert f1.get_sequences() == [(9, 9)]
130 | assert ps2.get_sequences() == [(11, 14), (18, 20)]
131 | assert p2.get_sequences() == [(11, 14)]
132 | assert a2.get_sequences() == [(18, 20)]
133 | assert not d2.get_sequences()
134 | assert not g2.get_sequences()
135 | assert ps3.get_sequences() == [(15, 17)]
136 | assert a3.get_sequences() == [(16, 17)]
137 | assert not p3.get_sequences()
138 | 
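The fixtures above come from `conftest`; as a hedged editorial sketch (not part of the test suite; the passage ID and token text are made up), the same layer-1 API can build a tiny passage from scratch:

    from ucca import core, layer0, layer1

    p = core.Passage("1")                                       # hypothetical passage ID
    l0 = layer0.Layer0(p)
    l1 = layer1.Layer1(p)
    terminal = l0.add_terminal(text="rained", punct=False)      # single made-up token
    scene = l1.add_fnode(None, layer1.EdgeTags.ParallelScene)   # H node under the layer head
    process = l1.add_fnode(scene, layer1.EdgeTags.Process)      # P node inside the scene
    process.add(layer1.EdgeTags.Terminal, terminal)             # attach the terminal
    assert l1.top_scenes == [scene]                             # a unit with a process is a scene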
-------------------------------------------------------------------------------- /ucca/tests/test_textutil.py: --------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from ucca import layer0, convert, textutil
4 | from .conftest import crossing, multi_sent, multi_sent_with_quotes, l1_passage, discontiguous, empty, PASSAGES
5 | 
6 | """Tests the textutil module functions and classes."""
7 | 
8 | 
9 | @pytest.mark.parametrize("create, breaks", (
10 | (multi_sent, [4, 7, 11]),
11 | (crossing, [3, 7]),
12 | (discontiguous, [20]),
13 | (l1_passage, [20]),
14 | (empty, []),
15 | (multi_sent_with_quotes, [6, 9, 13]),
16 | ))
17 | def test_break2sentences(create, breaks):
18 | """Tests identifying sentence ends correctly."""
19 | assert textutil.break2sentences(create()) == breaks
20 | 
21 | 
22 | def test_word_vectors():
23 | vectors, dim = textutil.get_word_vectors()
24 | for word, vector in vectors.items():
25 | assert len(vector) == dim, "Vector dimension for %s is %d != %d" % (word, len(vector), dim)
26 | 
27 | 
28 | @pytest.mark.parametrize("create", PASSAGES)
29 | @pytest.mark.parametrize("as_array", (True, False), ids=("array", "extra"))
30 | def test_annotate_passage(create, as_array):
31 | passage = create()
32 | textutil.annotate(passage, as_array=as_array)
33 | for p in passage, convert.from_standard(convert.to_standard(passage)):
34 | assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
35 | for terminal in p.layer(layer0.LAYER_ID).all:
36 | if as_array:
37 | assert terminal.tok is not None, "Terminal %s has no annotation" % terminal
38 | assert len(terminal.tok) == len(textutil.Attr)
39 | else:
40 | for attr in textutil.Attr:
41 | assert attr.key in terminal.extra, "Terminal %s has no %s" % (terminal, attr.name)
42 | 
43 | 
44 | @pytest.mark.parametrize("as_array", (True, False), ids=("array", "extra"))
45 | @pytest.mark.parametrize("convert_and_back", (True, False), ids=("convert", "direct"))
46 | def test_annotate_all(as_array, convert_and_back):
47 | passages = [create() for create in PASSAGES]
48 | list(textutil.annotate_all(passages))
49 | for passage, compare in textutil.annotate_all(((p, p) for p in passages), as_array=as_array, as_tuples=True):
50 | assert passage is compare
51 | p = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back]
52 | assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
53 | for terminal in p.layer(layer0.LAYER_ID).all:
54 | if as_array:
55 | assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
56 | assert len(terminal.tok) == len(textutil.Attr)
57 | else:
58 | for attr in textutil.Attr:
59 | assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
60 | terminal, passage.ID, attr.name)
61 | 
62 | 
63 | def assert_spacy_not_loaded(*args, **kwargs):
64 | del args, kwargs
65 | assert False, "Should not load spaCy when passage is pre-annotated"
66 | 
67 | 
68 | @pytest.mark.parametrize("create", PASSAGES)
69 | @pytest.mark.parametrize("as_array", (True, False), ids=("array", "extra"))
70 | @pytest.mark.parametrize("convert_and_back", (True, False), ids=("convert", "direct"))
71 | @pytest.mark.parametrize("partial", (True, False), ids=("partial", "full"))
72 | def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
73 | if not partial:
74 | monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
75 
| passage = create() 76 | l0 = passage.layer(layer0.LAYER_ID) 77 | attr_values = list(range(10, 10 + len(textutil.Attr))) 78 | if partial: 79 | attr_values[textutil.Attr.ENT_TYPE.value] = "" 80 | if as_array: 81 | l0.extra["doc"] = [len(p) * [attr_values] for p in textutil.break2paragraphs(passage, return_terminals=True)] 82 | else: 83 | for terminal in l0.all: 84 | for attr, value in zip(textutil.Attr, attr_values): 85 | if value: 86 | terminal.extra[attr.key] = value 87 | passage = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back] 88 | if not partial: 89 | assert textutil.is_annotated(passage, as_array=as_array, as_extra=not as_array), \ 90 | "Passage %s is not pre-annotated" % passage.ID 91 | textutil.annotate(passage, as_array=as_array, as_extra=not as_array) 92 | assert textutil.is_annotated(passage, as_array=as_array, as_extra=not as_array), \ 93 | "Passage %s is not annotated" % passage.ID 94 | for terminal in l0.all: 95 | for i, (attr, value) in enumerate(zip(textutil.Attr, attr_values)): 96 | if value: 97 | assert (terminal.tok[i] if as_array else terminal.extra.get(attr.key)) == value, \ 98 | "Terminal %s has wrong %s" % (terminal, attr.name) 99 | -------------------------------------------------------------------------------- /ucca/tests/test_visualization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ucca.visualization import draw, tikz 4 | from .conftest import PASSAGES 5 | 6 | """Tests the visualization module functions and classes.""" 7 | 8 | 9 | @pytest.mark.parametrize("create", PASSAGES) 10 | def test_draw(create): 11 | import matplotlib 12 | matplotlib.use('Agg') 13 | draw(create()) 14 | 15 | 16 | @pytest.mark.parametrize("create", PASSAGES) 17 | def test_tikz(create): 18 | tikz(create()) 19 | -------------------------------------------------------------------------------- /ucca_db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/ucca_db/__init__.py -------------------------------------------------------------------------------- /ucca_db/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from xml.etree.ElementTree import tostring 4 | 5 | from tqdm import tqdm 6 | 7 | from ucca import convert 8 | from ucca.ioutil import write_passage, external_write_mode 9 | from ucca_db.api import get_by_xids, get_most_recent_passage_by_uid 10 | 11 | desc = "Download passages from old UCCA annotation app" 12 | 13 | 14 | def get_by_method(method, id_field, passage_id=None, **kwargs): 15 | if method == "xid": 16 | return get_by_xids(xids=id_field, **kwargs)[0] 17 | elif method == "uid": 18 | return get_most_recent_passage_by_uid(id_field, passage_id, **kwargs) 19 | raise ValueError("Unknown method: '%s'" % method) 20 | 21 | 22 | def main(args): 23 | os.makedirs(args.outdir, exist_ok=True) 24 | with open(args.filename, encoding="utf-8") as f: 25 | t = list(map(str.split, f)) 26 | if not args.verbose: 27 | t = tqdm(t, desc="Downloading", unit=" passages") 28 | for passage_id, id_field in t: 29 | if not args.verbose: 30 | t.set_postfix({"passage_id": passage_id, args.method: id_field}) 31 | if args.verbose: 32 | with external_write_mode(): 33 | print("Getting passage " + passage_id + " with " + args.method + "=" + id_field, end="\t") 34 | xml_root = 
get_by_method(id_field=id_field.split(","), passage_id=passage_id, **vars(args))
35 | if xml_root is None:
36 | continue
37 | if args.write_site:
38 | site_filename = passage_id + "_site_download.xml"
39 | with open(site_filename, "w", encoding="utf-8") as fsite:
40 | print(tostring(xml_root).decode(), file=fsite)
41 | if args.verbose:
42 | with external_write_mode():
43 | print("Wrote '%s'" % site_filename)
44 | if args.write:
45 | write_passage(convert.from_site(xml_root), outdir=args.outdir, verbose=args.verbose)
46 | 
47 | 
48 | if __name__ == "__main__":
49 | argparser = ArgumentParser(description=desc)
50 | argparser.add_argument("filename", help="specification filename with (passage ID, xid OR uid) per passage")
51 | argparser.add_argument("-m", "--method", default="uid", choices=("xid", "uid"), help="by xid or latest by paid,uid")
52 | argparser.add_argument("-d", "--db-name", default="work", help="database name")
53 | argparser.add_argument("-H", "--host-name", default="pgserver", help="host name")
54 | argparser.add_argument("-o", "--outdir", default=".", help="directory to write downloaded passages to")
55 | argparser.add_argument("-s", "--write-site", action="store_true", help="write site format, too, for debugging")
56 | argparser.add_argument("-n", "--no-write", dest="write", action="store_false", help="do not really write any files")
57 | argparser.add_argument("-x", "--write-xids", help="file to write xids to (for 'uid' method)")
58 | argparser.add_argument("-S", "--strict", action="store_true", help="fail if no result is found")
59 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
60 | main(argparser.parse_args())
61 | 
-------------------------------------------------------------------------------- /ucca_db/upload.py: --------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from xml.etree.ElementTree import tostring
3 | 
4 | from ucca import convert
5 | from ucca.ioutil import get_passages_with_progress_bar
6 | from ucca_db.api import CONNECTION, write_to_db
7 | 
8 | desc = "Upload passages to old UCCA annotation app"
9 | 
10 | 
11 | def upload_passage(xml_root, site_filename=None, verbose=False, **kwargs):
12 | decoded = tostring(xml_root).decode()
13 | if site_filename:
14 | with open(site_filename, "w", encoding="utf-8") as f:
15 | print(decoded, file=f)
16 | if verbose:
17 | print("Wrote '%s'" % site_filename)
18 | return write_to_db(xml=decoded, **kwargs)
19 | 
20 | 
21 | def main(args):
22 | filenames = list(args.passages)
23 | if args.filenames:
24 | with open(args.filenames, encoding="utf-8") as f:
25 | filenames += list(filter(None, map(str.strip, f)))
26 | with open(args.out, "w", encoding="utf-8") as f:
27 | for passage in get_passages_with_progress_bar(filenames):
28 | out = upload_passage(convert.to_site(passage), verbose=args.verbose,
29 | site_filename=passage.ID + "_site_upload.xml" if args.write_site else None,
30 | db_name=args.db_name, host_name=args.host_name,
31 | new_pid=passage.ID, new_prid=args.project_id, username=args.username)
32 | print(passage.ID, out, file=f)
33 | if args.verbose:
34 | print("Uploaded passage %s with xid=%s" % (passage.ID, out))
35 | if CONNECTION is not None:
36 | CONNECTION.commit()
37 | print("Wrote '%s'" % args.out)
38 | 
39 | 
40 | if __name__ == "__main__":
41 | argparser = ArgumentParser(description=desc)
42 | argparser.add_argument("passages", nargs="*", help="the corpus, given as xml/pickle file names")
43 | 
argparser.add_argument("-f", "--filenames", help="read input passage filenames from a file rather than the command line")
44 | argparser.add_argument("-d", "--db-name", default="work", help="database name")
45 | argparser.add_argument("-H", "--host-name", default="pgserver", help="host name")
46 | argparser.add_argument("-p", "--project-id", default="63", help="project ID")
47 | argparser.add_argument("-u", "--username", default="danielh", help="username")
48 | argparser.add_argument("-o", "--out", default="xids.txt", help="file to write created XML IDs to")
49 | argparser.add_argument("--write-site", action="store_true", help="write site format for debugging before upload")
50 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
51 | main(argparser.parse_args())
52 | 
-------------------------------------------------------------------------------- /uccaapp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huji-nlp/ucca/31e381f2b2693e82c8402b09a6d28f97d595a1eb/uccaapp/__init__.py -------------------------------------------------------------------------------- /uccaapp/convert_and_evaluate.py: --------------------------------------------------------------------------------
1 | import sys
2 | 
3 | import argparse
4 | from glob import glob
5 | from requests.exceptions import HTTPError
6 | 
7 | from ucca.evaluation import evaluate, Scores
8 | from ucca.ioutil import read_files_and_dirs
9 | from uccaapp.download_task import TaskDownloader
10 | from uccaapp.upload_task import TaskUploader
11 | 
12 | try:
13 | from simplejson.scanner import JSONDecodeError
14 | except ImportError:
15 | from json.decoder import JSONDecodeError
16 | 
17 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task,
18 | then download the task from UCCA-App and convert it to a passage in standard format again,
19 | then evaluate the result against the original"""
20 | 
21 | 
22 | def main(filenames, write, **kwargs):
23 | uploader = TaskUploader(**kwargs)
24 | downloader = TaskDownloader(**kwargs)
25 | scores = []
26 | try:
27 | for pattern in filenames:
28 | filenames = sorted(glob(pattern))
29 | if not filenames:
30 | raise IOError("Not found: " + pattern)
31 | for ref in read_files_and_dirs(filenames):
32 | print("Converting passage " + ref.ID + "... ", end="")
33 | task = uploader.upload_task(ref)
34 | guessed, *_ = downloader.download_task(task["id"], write=write, **kwargs)
35 | score = evaluate(guessed, ref, **kwargs)
36 | print("F1=%.3f" % score.average_f1())
37 | scores.append(score)
38 | except HTTPError as e:
39 | try:
40 | raise ValueError(e.response.json()) from e
41 | except JSONDecodeError:
42 | raise ValueError(e.response.text) from e
43 | print()
44 | if len(scores) > 1:
45 | print("Aggregated scores:")
46 | Scores.aggregate(scores).print()
47 | 
48 | 
49 | if __name__ == "__main__":
50 | argument_parser = argparse.ArgumentParser(description=desc)
51 | TaskUploader.add_arguments(argument_parser)
52 | argument_parser.add_argument("--write", action="store_true", help="Write converted passage to file")
53 | TaskDownloader.add_write_arguments(argument_parser)
54 | main(**vars(argument_parser.parse_args()))
55 | sys.exit(0)
56 | 
-------------------------------------------------------------------------------- /uccaapp/copy_categories.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | 
6 | from uccaapp.api import ServerAccessor
7 | 
8 | desc = """Download categories from one UCCA-App server and upload to another UCCA-App server"""
9 | 
10 | 
11 | def add_arguments(argparser):
12 | argparser.add_argument("category_ids", nargs="+", type=int, help="IDs of categories to export and import")
13 | argparser.add_argument("--server-address-orig", required=True, help="UCCA-App origin server")
14 | argparser.add_argument("--email-orig", help="UCCA-App origin email")
15 | argparser.add_argument("--password-orig", help="UCCA-App origin password")
16 | argparser.add_argument("--server-address-target", required=True, help="UCCA-App target server")
17 | argparser.add_argument("--email-target", help="UCCA-App target email")
18 | argparser.add_argument("--password-target", help="UCCA-App target password")
19 | argparser.add_argument("-v", "--verbose", action="store_true", help="detailed output")
20 | 
21 | 
22 | def main(args):
23 | server_accessor_origin = ServerAccessor(server_address=args.server_address_orig,
24 | email=args.email_orig, password=args.password_orig,
25 | verbose=args.verbose)
26 | server_accessor_target = ServerAccessor(server_address=args.server_address_target,
27 | email=args.email_target, password=args.password_target,
28 | verbose=args.verbose)
29 | for category_id in args.category_ids:
30 | category_out = server_accessor_origin.get_category(category_id)
31 | server_accessor_target.create_category(**category_out)
32 | 
33 | 
34 | if __name__ == "__main__":
35 | argument_parser = argparse.ArgumentParser(description=desc)
36 | add_arguments(argument_parser)
37 | main(argument_parser.parse_args())
38 | sys.exit(0)
39 | 
-------------------------------------------------------------------------------- /uccaapp/create_annotation_tasks.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import sys
4 | 
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Create new annotation/review tasks for a specific user, given parent tokenization tasks (for creating
10 | annotation tasks) or parent annotation tasks (for creating review tasks) """
11 | 
12 | 
13 | class AnnotationTaskCreator(ServerAccessor):
14 | def __init__(self, project_id=None, **kwargs):
15 | """
16 | :param project_id: Specify project for created tasks, otherwise same as parent tasks
17 | """
18 | super().__init__(**kwargs)
19 | if project_id is not None:
20 | self.set_project(project_id)
21 | 
22 | def create_tasks(self, filename, log=None, **kwargs):
23 | log_h = open(log, "w", encoding="utf-8") if log else None
24 | lines = list(self.read_lines(filename))
25 | for user_id, task_id in tqdm(lines, unit="task", desc="Creating tasks"):
26 | task = self.create_task(**self.build_task(user_id, task_id, **kwargs))
27 | if log:
28 | print(task["id"], file=log_h, sep="\t", flush=True)
29 | print("Uploaded %d tasks successfully." % len(lines), file=sys.stderr)
30 | if log:
31 | log_h.close()
32 | 
33 | def build_task(self, user_id, task_id, review=False, manager_comment=None, strict=False, **kwargs):
34 | del kwargs
35 | user = self.get_user(user_id)
36 | task = self.get_task(task_id)
37 | assert task["type"] in (["ANNOTATION", "REVIEW"] if review else ["TOKENIZATION"]), \
38 | "Wrong input task given: %s for task ID %s" % (task["type"], task_id)
39 | if strict:
40 | assert task["status"] == "SUBMITTED", "Parent task is not submitted: %s" % task_id
41 | return dict(type="REVIEW" if review else "ANNOTATION", project=self.project or task["project"], user=user,
42 | passage=task["passage"], manager_comment=manager_comment or task.get("manager_comment", ""),
43 | user_comment=task.get("user_comment", ""), parent=task, is_demo=False, is_active=True)
44 | 
45 | @staticmethod
46 | def read_lines(filename):
47 | with open(filename, encoding="utf-8") as f:
48 | for line in f:
49 | fields = line.strip().split()
50 | try:
51 | user_id, task_id = fields
52 | except ValueError:
53 | print("Error in line: " + line.strip(), file=sys.stderr)
54 | continue
55 | yield user_id, task_id
56 | 
57 | @staticmethod
58 | def add_arguments(argparser):
59 | argparser.add_argument("filename", help="a file where each line is a <user ID> <task ID>, "
60 | "where the input task may be an annotation/review task "
61 | "(if given --review) or a tokenization task")
62 | ServerAccessor.add_arguments(argparser)
63 | argparser.add_argument("-r", "--review", action="store_true", help="Create annotation/review task")
64 | argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
65 | argparser.add_argument("--manager-comment", help="Manager comment to set for all tasks")
66 | ServerAccessor.add_project_id_argument(argparser)
67 | argparser.add_argument("-s", "--strict", action="store_true", help="Require parent task to be submitted")
68 | 
69 | 
70 | def main(**kwargs):
71 | AnnotationTaskCreator(**kwargs).create_tasks(**kwargs)
72 | 
73 | 
74 | if __name__ == "__main__":
75 | argument_parser = argparse.ArgumentParser(description=desc)
76 | AnnotationTaskCreator.add_arguments(argument_parser)
77 | main(**vars(argument_parser.parse_args()))
78 | sys.exit(0)
79 | 
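A hypothetical input file for create_annotation_tasks.py (the IDs are made up; each line pairs the assignee's user ID with the parent task ID, matching what read_lines() unpacks):

    42 1001
    42 1002
    57 1003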
-------------------------------------------------------------------------------- /uccaapp/create_tokenization_tasks.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | 
6 | from uccaapp.create_annotation_tasks import ServerAccessor, AnnotationTaskCreator
7 | 
8 | desc = """Upload a list of tokenization tasks to a project"""
9 | 
10 | 
11 | class TokenizationTaskCreator(AnnotationTaskCreator):
12 | def __init__(self, project_id, **kwargs):
13 | super().__init__(**kwargs)
14 | self.set_project(project_id)
15 | 
16 | def build_task(self, user_id, passage_id, **kwargs):
17 | del kwargs
18 | user = self.get_user(user_id)
19 | passage = self.get_passage(passage_id)
20 | return dict(type="TOKENIZATION", project=self.project, user=user, passage=passage,
21 | manager_comment="passage #%s" % passage["id"], user_comment="", parent=None, is_demo=False,
22 | is_active=True)
23 | 
24 | @staticmethod
25 | def add_arguments(argparser):
26 | argparser.add_argument("filename", help="a file where each line is a <user ID> <passage ID>")
27 | argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
28 | ServerAccessor.add_project_id_argument(argparser)
29 | #ServerAccessor.add_user_id_argument(argparser)
30 | ServerAccessor.add_arguments(argparser)
31 | 
32 | 
33 | def main(**kwargs):
34 | TokenizationTaskCreator(**kwargs).create_tasks(**kwargs)
35 | 
36 | 
37 | if __name__ == "__main__":
38 | argument_parser = argparse.ArgumentParser(description=desc)
39 | TokenizationTaskCreator.add_arguments(argument_parser)
40 | main(**vars(argument_parser.parse_args()))
41 | sys.exit(0)
42 | 
-------------------------------------------------------------------------------- /uccaapp/download_task.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import json
4 | import sys
5 | 
6 | from tqdm import tqdm
7 | 
8 | from ucca import normalization, validation
9 | from ucca.convert import from_json
10 | from ucca.ioutil import write_passage
11 | from uccaapp.api import ServerAccessor
12 | 
13 | desc = """Download task from UCCA-App and convert to a passage in standard format"""
14 | 
15 | 
16 | class TaskDownloader(ServerAccessor):
17 | def __init__(self, **kwargs):
18 | super().__init__(**kwargs)
19 | 
20 | def download_tasks(self, task_ids, by_filename=False, validate=None, log=None, **kwargs):
21 | if by_filename:
22 | task_ids_from_file = []
23 | for filename in task_ids:
24 | with open(filename, 'r') as f:
25 | task_ids_from_file += list(filter(None, map(str.strip, f)))
26 | task_ids = task_ids_from_file
27 | validate_h = open(validate, "w", encoding="utf-8") if validate else None
28 | log_h = open(log, "w", encoding="utf-8") if log else None
29 | for task_id in tqdm(task_ids, unit=" tasks", desc="Downloading"):
30 | yield self.download_task(task_id, validate=validate_h, log=log_h, **kwargs)
31 | if validate:
32 | validate_h.close()
33 | if log:
34 | log_h.close()
35 | 
36 | def download_task(self, task_id, normalize=False, write=True, validate=None, binary=None, log=None, out_dir=None,
37 | prefix=None, by_external_id=False, verbose=False, write_valid_only=False, strict=False, **kwargs):
38 | del kwargs
39 | task = self.get_user_task(task_id)
40 | user_id = task["user"]["id"]
41 | passage = None
42 | try:
43 | passage = next(iter(from_json(task, by_external_id=by_external_id)))
44 | except ValueError as e:
45 | if strict:
46 | raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
47 | print("", task_id, user_id, "Failed reading json", file=validate or sys.stderr, sep="\t", flush=True)
48 | if normalize and passage is not None:
49 | try:
50 | normalization.normalize(passage)
51 | except AssertionError as e:
52 | if strict:
53 | raise ValueError("Failed normalizing task %s:\n%s" % (task_id, json.dumps(task))) from e
54 | print(passage.ID, task_id, user_id, "Failed normalizing task: %s" % e, file=validate or sys.stderr,
55 | sep="\t", flush=True)
56 | if log:
57 | print(passage.ID, task_id, user_id, task["user_comment"], task["created_at"], task["updated_at"],
58 | file=log, sep="\t", flush=True)
59 | ret = passage, task_id, 
user_id 60 | if validate or write_valid_only: 61 | for error in validation.validate(passage, linkage=False): 62 | if validate: 63 | print(passage.ID, task_id, user_id, error, file=validate, sep="\t", flush=True) 64 | if write_valid_only: 65 | return ret 66 | if write: 67 | write_passage(passage, binary=binary, outdir=out_dir, prefix=prefix, verbose=verbose) 68 | return ret 69 | 70 | @staticmethod 71 | def add_arguments(argparser): 72 | argparser.add_argument("task_ids", nargs="+", help="IDs of tasks to download and convert") 73 | argparser.add_argument("-f", "--by-filename", action="store_true", help="treat task_ids as a filename, " 74 | "otherwise it is a list of IDs") 75 | TaskDownloader.add_write_arguments(argparser) 76 | argparser.add_argument("-V", "--validate", help="run validation on downloaded passages and save errors to file") 77 | argparser.add_argument("-N", "--normalize", action="store_true", help="normalize downloaded passages") 78 | argparser.add_argument("--strict", action="store_true", help="fail on reading or normalization error") 79 | argparser.add_argument("-l", "--log", help="filename to write log of downloaded passages to") 80 | ServerAccessor.add_arguments(argparser) 81 | 82 | @staticmethod 83 | def add_write_arguments(argparser): 84 | argparser.add_argument("-o", "--out-dir", default=".", help="output directory") 85 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 86 | argparser.add_argument("-x", "--by-external-id", action="store_true", help="save filename by external ID") 87 | argparser.add_argument("-b", "--binary", action="store_true", help="write in binary format (.pickle)") 88 | argparser.add_argument("-n", "--no-write", action="store_false", dest="write", help="do not write files") 89 | argparser.add_argument("--write-valid-only", action="store_true", help="only write passages that passed " 90 | "validation") 91 | 92 | 93 | def main(**kwargs): 94 | list(TaskDownloader(**kwargs).download_tasks(**kwargs)) 95 | 96 | 97 | if __name__ == "__main__": 98 | argument_parser = argparse.ArgumentParser(description=desc) 99 | TaskDownloader.add_arguments(argument_parser) 100 | main(**vars(argument_parser.parse_args())) 101 | sys.exit(0) 102 | -------------------------------------------------------------------------------- /uccaapp/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | from tqdm import tqdm 5 | 6 | from ucca.evaluation import evaluate, Scores, LABELED, UNLABELED 7 | from uccaapp.download_task import TaskDownloader 8 | 9 | desc = """Download tasks from UCCA-App and evaluate them""" 10 | 11 | 12 | def main(task_ids, by_filename=False, validate=None, log=None, **kwargs): 13 | kwargs["write"] = False 14 | if by_filename: 15 | task_ids_from_file = [] 16 | for filename in task_ids: 17 | with open(filename, 'r') as f: 18 | task_ids_from_file += zip(*list(map(str.split, filter(None, map(str.strip, f))))) 19 | task_ids = task_ids_from_file 20 | else: 21 | task_ids = [[task_id] for task_id in task_ids] 22 | assert len(task_ids) == 2, "Got %d lists of task IDs instead of two" % len(task_ids) 23 | downloader = TaskDownloader(**kwargs) 24 | scores = [] 25 | validate_h = open(validate, "w", encoding="utf-8") if validate else None 26 | log_h = open(log, "w", encoding="utf-8") if log else None 27 | if log: 28 | fields = ["guessed", "ref"] + Scores.field_titles(eval_type=LABELED) + Scores.field_titles(eval_type=UNLABELED) 29 | print(*fields, file=log_h, 
sep="\t", flush=True)
30 | for task_id_pair in tqdm(list(zip(*task_ids)), unit=" tasks", desc="Evaluating"):
31 | passage_pair = []
32 | for task_id in task_id_pair:
33 | passage, *_ = downloader.download_task(task_id, validate=validate_h, **kwargs)
34 | passage_pair.append(passage)
35 | score = evaluate(*passage_pair, **kwargs)
36 | if log:
37 | fields = list(task_id_pair) + score.fields(eval_type=LABELED) + score.fields(eval_type=UNLABELED)
38 | print(*fields, file=log_h, sep="\t", flush=True)
39 | scores.append(score)
40 | if validate:
41 | validate_h.close()
42 | if log:
43 | log_h.close()
44 | print()
45 | if len(scores) > 1:
46 | print("Aggregated scores:")
47 | Scores.aggregate(scores).print()
48 | 
49 | 
50 | def check_args(p, args):
51 | if len(args.task_ids) not in (1, 2):
52 | p.error("Must supply exactly two task IDs or files with IDs, but got %d arguments" % len(args.task_ids))
53 | return args
54 | 
55 | 
56 | if __name__ == "__main__":
57 | argument_parser = argparse.ArgumentParser(description=desc)
58 | TaskDownloader.add_arguments(argument_parser)
59 | main(**vars(check_args(argument_parser, argument_parser.parse_args())))
60 | sys.exit(0)
61 | 
-------------------------------------------------------------------------------- /uccaapp/get_passage_id.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Get passage ID for tasks"""
10 | 
11 | 
12 | class PassageIdGetter(ServerAccessor):
13 | def __init__(self, **kwargs):
14 | super().__init__(**kwargs)
15 | 
16 | def get_passage_ids(self, filename, **kwargs):
17 | del kwargs
18 | with open(filename, encoding="utf-8") as f:
19 | task_ids = list(map(str.strip, f))
20 | for task_id in tqdm(task_ids, unit=" tasks", desc="Getting passage IDs"):
21 | task = self.get_task(task_id)
22 | passage_id = task["passage"]["id"]
23 | yield passage_id
24 | 
25 | @staticmethod
26 | def add_arguments(argparser):
27 | argparser.add_argument("filename", help="file with lines of the form <task ID>")
28 | ServerAccessor.add_arguments(argparser)
29 | 
30 | 
31 | def main(**kwargs):
32 | print(*PassageIdGetter(**kwargs).get_passage_ids(**kwargs), sep="\n")
33 | 
34 | 
35 | if __name__ == "__main__":
36 | argument_parser = argparse.ArgumentParser(description=desc)
37 | PassageIdGetter.add_arguments(argument_parser)
38 | main(**vars(argument_parser.parse_args()))
39 | sys.exit(0)
40 | 
-------------------------------------------------------------------------------- /uccaapp/set_external_id.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import sys
4 | 
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Set the external ID for passages"""
10 | 
11 | 
12 | class ExternalIdSetter(ServerAccessor):
13 | def __init__(self, **kwargs):
14 | super().__init__(**kwargs)
15 | 
16 | def set_external_ids(self, filename, by_task_id=False, **kwargs):
17 | del kwargs
18 | with open(filename, encoding="utf-8") as f:
19 | passage_id_to_external_id = list(map(str.split, map(str.strip, f)))
20 | for external_id, passage_id in tqdm(passage_id_to_external_id, unit=" passages", desc="Setting external IDs"):
21 | if by_task_id:
22 | task = self.get_task(passage_id)
23 | passage_id = task["passage"]["id"]
24 | passage = self.get_passage(passage_id)
25 | if passage["external_id"] == external_id:
26 | continue
27 | passage["external_id"] = external_id
28 | passage_out = self.update_passage(**passage)
29 | assert passage_out["external_id"] == external_id, "External ID failed to update for passage %s" % passage_id
30 | yield passage_out
31 | 
32 | @staticmethod
33 | def add_arguments(argparser):
34 | argparser.add_argument("filename", help="file with lines of the form <external ID> <passage ID>")
35 | argparser.add_argument("--by-task-id", action="store_true", help="expect task ID instead of passage ID")
36 | ServerAccessor.add_arguments(argparser)
37 | 
38 | 
39 | def main(**kwargs):
40 | list(ExternalIdSetter(**kwargs).set_external_ids(**kwargs))
41 | 
42 | 
43 | if __name__ == "__main__":
44 | argument_parser = argparse.ArgumentParser(description=desc)
45 | ExternalIdSetter.add_arguments(argument_parser)
46 | main(**vars(argument_parser.parse_args()))
47 | sys.exit(0)
48 | 
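A hypothetical input file for set_external_id.py (made-up values; each line is an external ID followed by a passage ID, the order in which set_external_ids() unpacks the fields):

    wiki_005 1234
    wiki_006 1235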
-------------------------------------------------------------------------------- /uccaapp/set_tasks_to_ongoing.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | from tqdm import tqdm
6 | 
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Sets the status of submitted tasks to ONGOING or SUBMITTED"""
10 | 
11 | ONGOING_STATUS = "ONGOING"
12 | SUBMITTED_STATUS = "SUBMITTED"
13 | 
14 | 
15 | class TaskStatusSetter(ServerAccessor):
16 | def __init__(self, **kwargs):
17 | super().__init__(**kwargs)
18 | 
19 | def set_task_status(self, status, filename, **kwargs):
20 | del kwargs
21 | with open(filename) as f:
22 | task_ids = list(f.readlines())
23 | for task_id in task_ids:
24 | task = self.get_task(int(task_id))
25 | task["status"] = status
26 | task_out = self.update_task(**task)
27 | assert task_out["status"] == status
28 | yield task_out
29 | 
30 | @staticmethod
31 | def add_arguments(argparser):
32 | argparser.add_argument("filename", help="file with lines, each with a different task ID")
33 | argparser.add_argument("-s", "--status", help="should be ONGOING or SUBMITTED",
34 | choices=[ONGOING_STATUS, SUBMITTED_STATUS])
35 | ServerAccessor.add_arguments(argparser)
36 | 
37 | 
38 | def main(**kwargs):
39 | list(TaskStatusSetter(**kwargs).set_task_status(**kwargs))
40 | 
41 | 
42 | if __name__ == "__main__":
43 | argument_parser = argparse.ArgumentParser(description=desc)
44 | TaskStatusSetter.add_arguments(argument_parser)
45 | main(**vars(argument_parser.parse_args()))
46 | sys.exit(0)
47 | 
-------------------------------------------------------------------------------- /uccaapp/submit_tasks.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import json
4 | import sys
5 | 
6 | import requests
7 | 
8 | from ucca import convert
9 | from ucca import normalization, validation
10 | from uccaapp.api import ServerAccessor
11 | 
12 | desc = """Validate annotation/review tasks and submit those that pass validation"""
13 | 
14 | SUBMITTED_STATUS = "SUBMITTED"
15 | 
16 | class TaskSubmitter(ServerAccessor):
17 | 
18 | def __init__(self, **kwargs):
19 | super().__init__(**kwargs)
20 | 
21 | def submit_tasks(self, filename, log_file, **kwargs):
22 | del kwargs
23 | log_file = open(log_file, "w", encoding="utf-8")
24 | with open(filename) as f:
25 | task_ids = list(f.readlines())
26 | for task_id in task_ids:
27 | try:
28 | task_id = task_id.strip()
29 | task = self.get_user_task(int(task_id))
30 | if task['type'] not in ['ANNOTATION', 'REVIEW']:
31 | print(task_id, "NOT AN ANNOTATION/REVIEW TASK", file=log_file, 
sep="\t", flush=True) 32 | continue 33 | try: 34 | passage = next(iter(convert.from_json(task))) 35 | except ValueError as e: 36 | raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e 37 | # validate the task 38 | normalization.normalize(passage) 39 | validation_errors = list(validation.validate(passage, linkage=False)) 40 | if len(validation_errors) == 0: 41 | self.submit_task(**task) 42 | print(task_id, "SUBMITTED", file=log_file, sep="\t", flush=True) 43 | else: 44 | for error in validation_errors: 45 | print(task_id, error, file=log_file, sep="\t", flush=True) 46 | except requests.exceptions.HTTPError as e: 47 | print(task_id, "HTTP Request Error: "+str(e), file=log_file, sep="\t", flush=True) 48 | 49 | 50 | @staticmethod 51 | def add_arguments(argparser): 52 | argparser.add_argument("filename", help="file with lines, each with a different task ID") 53 | argparser.add_argument("-l","--log_file", help="output log file") 54 | 55 | ServerAccessor.add_arguments(argparser) 56 | 57 | 58 | def main(**kwargs): 59 | TaskSubmitter(**kwargs).submit_tasks(**kwargs) 60 | 61 | 62 | if __name__ == "__main__": 63 | argument_parser = argparse.ArgumentParser(description=desc) 64 | TaskSubmitter.add_arguments(argument_parser) 65 | main(**vars(argument_parser.parse_args())) 66 | sys.exit(0) 67 | 68 | 69 | -------------------------------------------------------------------------------- /uccaapp/tokenize_and_upload.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from argparse import ArgumentParser 4 | 5 | from ucca.convert import from_text, to_json 6 | from uccaapp.api import ServerAccessor 7 | 8 | desc = """ 9 | Read input file as one line per paragraph, where paragraphs are separated by multiple newlines and an optional 10 | . 11 | Tokenize and upload as submitted tokenization tasks, then create annotation tasks from them. 
12 | 
13 | Tokenization in Russian requires:
14 | pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
15 | """
16 | 
17 | 
18 | class TokenizerUploader(ServerAccessor):
19 | def __init__(self, user_id, source_id, project_id, lang=None, **kwargs):
20 | super().__init__(**kwargs)
21 | self.set_source(source_id)
22 | self.set_project(project_id)
23 | self.set_user(user_id)
24 | 
25 | def tokenize_and_upload(self, filename, log=None, lang=None, **kwargs):
26 | del kwargs
27 | log_h = open(log, "w", encoding="utf-8") if log else None
28 | prefix = os.path.splitext(os.path.basename(filename))[0].replace(" ", "_")
29 | with open(filename, encoding="utf-8") as f:
30 | for passage, text in from_text(f, passage_id=prefix, lang=lang, return_text=True):
31 | passage_out = self.create_passage(text=text, type="PUBLIC", source=self.source)
32 | task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
33 | user=self.user, passage=passage_out, manager_comment=passage.ID,
34 | user_comment="", parent=None, is_demo=False, is_active=True)
35 | tok_task_out = self.create_task(**task_in)
36 | tok_user_task_in = dict(tok_task_out)
37 | tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
38 | self.submit_task(**tok_user_task_in)
39 | task_in.update(parent=tok_task_out, type="ANNOTATION")
40 | ann_user_task_out = self.create_task(**task_in)
41 | print("Uploaded passage " + filename + " successfully.", file=sys.stderr)
42 | if log:
43 | print(passage.ID, passage_out["id"], tok_task_out["id"], ann_user_task_out["id"],
44 | file=log_h, sep="\t", flush=True)
45 | if log:
46 | log_h.close()
47 | 
48 | @staticmethod
49 | def add_arguments(argparser):
50 | argparser.add_argument("filename", help="text file with one line per paragraph, where paragraphs are separated "
51 | "by multiple newlines and an optional ")
52 | argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
53 | argparser.add_argument("--lang", choices=["ru", "en", "fr", "de"], default="ru",
54 | help="language two-letter code, for tokenizer")
55 | ServerAccessor.add_project_id_argument(argparser)
56 | ServerAccessor.add_source_id_argument(argparser)
57 | ServerAccessor.add_user_id_argument(argparser)
58 | ServerAccessor.add_arguments(argparser)
59 | 
60 | 
61 | def main(**kwargs):
62 | TokenizerUploader(**kwargs).tokenize_and_upload(**kwargs)
63 | 
64 | 
65 | if __name__ == "__main__":
66 | argument_parser = ArgumentParser(description=desc)
67 | TokenizerUploader.add_arguments(argument_parser)
68 | main(**vars(argument_parser.parse_args()))
69 | 
-------------------------------------------------------------------------------- /uccaapp/transfer_categories.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from ucca.convert import from_json
4 | from uccaapp.api import ServerAccessor
5 | 
6 | desc = """Download categories from one UCCA-App server and upload to another UCCA-App server"""
7 | 
8 | 
9 | 
10 | def add_arguments(argparser):
11 | argparser.add_argument("category_ids", nargs="+", type=int, help="IDs of categories to export and import")
12 | argparser.add_argument("--server-address-orig", help="UCCA-App origin server")
13 | argparser.add_argument("--email-orig", help="UCCA-App origin email")
14 | argparser.add_argument("--password-orig", help="UCCA-App origin password")
15 | argparser.add_argument("--server-address-target", help="UCCA-App target server")
16 | argparser.add_argument("--email-target", 
help="UCCA-App target email") 17 | argparser.add_argument("--password-target", help="UCCA-App target password") 18 | 19 | 20 | def main(args): 21 | server_accessor_origin = ServerAccessor(server_address=args.server_address_orig, 22 | email=args.email_orig, password=args.password_orig,auth_token=None,verbose=True) 23 | server_accessor_target = ServerAccessor(server_address=args.server_address_target, 24 | email=args.email_target, password=args.password_target,auth_token=None,verbose=True) 25 | for category_id in args.category_ids: 26 | #try: 27 | category_out = server_accessor_origin.get_category(category_id) 28 | server_accessor_target.create_category(**category_out) 29 | #except: 30 | # sys.stderr.write('failed writing category with ID='+str(category_id)) 31 | # continue 32 | 33 | 34 | 35 | 36 | 37 | if __name__ == "__main__": 38 | argument_parser = argparse.ArgumentParser(description=desc) 39 | add_arguments(argument_parser) 40 | main(argument_parser.parse_args()) 41 | -------------------------------------------------------------------------------- /uccaapp/upload_conllu_passages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | import argparse 5 | import re 6 | from glob import glob 7 | 8 | from ucca.convert import to_json, from_text 9 | from uccaapp.api import ServerAccessor 10 | 11 | try: 12 | from simplejson.scanner import JSONDecodeError 13 | except ImportError: 14 | from json.decoder import JSONDecodeError 15 | 16 | desc = """Upload passages from CoNLL-U files including complete tokenization, and create annotation task for each""" 17 | 18 | 19 | class ConlluPassageUploader(ServerAccessor): 20 | def __init__(self, user_id, annotation_user_id, source_id, project_id, **kwargs): 21 | super().__init__(**kwargs) 22 | self.set_source(source_id) 23 | self.set_project(project_id) 24 | self.set_user(user_id) 25 | self.annotation_user = dict(id=annotation_user_id) if annotation_user_id else self.user 26 | 27 | def upload_passages(self, filenames, **kwargs): 28 | del kwargs 29 | for pattern in filenames: 30 | filenames = sorted(glob(pattern)) 31 | if not filenames: 32 | raise IOError("Not found: " + pattern) 33 | for filename in sorted(filenames): 34 | with open(filename, encoding="utf-8") as f: 35 | external_id = None 36 | tokens = [] 37 | try: 38 | for line in f: 39 | line = line.strip() 40 | m = re.match(r"^# sent_id = (.*)", line) 41 | if m: 42 | external_id = m.group(1) 43 | elif line: 44 | tokens.append(line.split("\t")[1]) 45 | else: 46 | self.upload_passage(external_id, tokens) 47 | external_id = None 48 | tokens = [] 49 | if tokens: 50 | self.upload_passage(external_id, tokens) 51 | except (IndexError, AssertionError) as e: 52 | raise ValueError(filename) from e 53 | 54 | def upload_passage(self, external_id, tokens): 55 | assert external_id, "Missing external ID for passage %s" % tokens 56 | assert tokens, "Empty passage %s" % external_id 57 | passage_out = self.create_passage(text=" ".join(tokens), external_id=external_id, type="PUBLIC", 58 | source=self.source) 59 | task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user, 60 | passage=passage_out, manager_comment="External ID: "+external_id, 61 | user_comment="", parent=None, is_demo=False, is_active=True) 62 | tok_task_out = self.create_task(**task_in) 63 | tok_user_task_in = dict(tok_task_out) 64 | passage = list(from_text(tokens, tokenized=True))[0] 65 | tok_user_task_in.update(to_json(passage, 
66 |         self.submit_task(**tok_user_task_in)
67 |         task_in = dict(type="ANNOTATION", status="NOT_STARTED", project=self.project, user=self.annotation_user,
68 |                        passage=tok_task_out["passage"], manager_comment="External ID: " + external_id,
69 |                        user_comment=external_id, parent=tok_task_out, is_demo=False, is_active=True)
70 |         self.create_task(**task_in)
71 |         print("Uploaded passage " + external_id + " successfully")
72 | 
73 |     @staticmethod
74 |     def add_arguments(argparser):
75 |         argparser.add_argument("filenames", nargs="+", help="filename patterns of CoNLL-U files")
76 |         ServerAccessor.add_project_id_argument(argparser)
77 |         ServerAccessor.add_source_id_argument(argparser)
78 |         ServerAccessor.add_user_id_argument(argparser)
79 |         argparser.add_argument("--annotation-user-id", type=int, help="user ID for annotation tasks, if different")
80 |         ServerAccessor.add_arguments(argparser)
81 | 
82 | 
83 | def main(**kwargs):
84 |     ConlluPassageUploader(**kwargs).upload_passages(**kwargs)
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     argument_parser = argparse.ArgumentParser(description=desc)
89 |     ConlluPassageUploader.add_arguments(argument_parser)
90 |     main(**vars(argument_parser.parse_args()))
91 |     sys.exit(0)
92 | 
93 | 
--------------------------------------------------------------------------------
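For reference, a schematic CoNLL-U fragment of the input that upload_passages above consumes. Only three features are read: the "# sent_id = ..." comment, the second tab-separated column (FORM) of each token line, and the blank line that ends a sentence and triggers upload_passage; all other column values below are placeholders.

    # sent_id = example-001
    1	Hello	hello	INTJ	_	_	0	root	_	_
    2	world	world	NOUN	_	_	1	vocative	_	_

    # sent_id = example-002
    1	Bye	bye	INTJ	_	_	0	root	_	_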
/uccaapp/upload_streussel_passages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | 
4 | import argparse
5 | 
6 | from ucca.convert import from_text, to_json
7 | from uccaapp.api import ServerAccessor
8 | 
9 | desc = """Upload passages from streussel format files listed in an index file"""
10 | 
11 | 
12 | class StreusselPassageUploader(ServerAccessor):
13 |     def __init__(self, user_id, source_id, project_id, **kwargs):
14 |         super().__init__(**kwargs)
15 |         self.set_source(source_id)
16 |         self.set_project(project_id)
17 |         self.set_user(user_id)
18 | 
19 |     def upload_streussel_passage_file(self, filenames, log=None, **kwargs):
20 |         del kwargs
21 |         log_h = open(log, "w", encoding="utf-8") if log else None
22 |         with open(filenames, encoding="utf-8") as f_all:  # 'filenames' is a single index file listing one passage file per line
23 |             for filename in f_all:
24 |                 passage_text = ""
25 |                 external_id = "None given"
26 |                 filename = filename.strip()
27 |                 with open(filename, encoding="utf-8") as f:
28 |                     for line in f:
29 |                         line = line.strip()
30 |                         if not line:
31 |                             continue
32 |                         elif line.startswith("#"):
33 |                             fields = line.split()
34 |                             if len(fields) != 4 or fields[1] != "sent_id":
35 |                                 print("FORMAT ERROR in " + filename, file=sys.stderr)
36 |                             else:
37 |                                 external_id = fields[3].split("-")[1]
38 |                         else:
39 |                             passage_text = passage_text + " " + line
40 |                 passage_out = self.create_passage(text=passage_text.strip(), external_id=external_id, type="PUBLIC",
41 |                                                   source=self.source)
42 |                 task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
43 |                                user=self.user, passage=passage_out, manager_comment="External ID: " + external_id,
44 |                                user_comment="", parent=None, is_demo=False, is_active=True)
45 |                 tok_task_out = self.create_task(**task_in)
46 |                 tok_user_task_in = dict(tok_task_out)
47 | 
48 |                 passage = list(from_text(passage_text.split(), tokenized=True))[0]
49 |                 tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
50 | 
51 |                 self.submit_task(**tok_user_task_in)
52 |                 print("Uploaded passage " + filename + " successfully.", file=sys.stderr)
53 |                 if log:
54 |                     print(filename.split(".")[-2], passage_out["id"], tok_task_out["id"], file=log_h, sep="\t")
55 |         if log:
56 |             log_h.close()
57 | 
58 |     @staticmethod
59 |     def add_arguments(argparser):
60 |         argparser.add_argument("filenames", help="index file listing the passage files to convert and upload, one per line")
61 |         argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
62 |         ServerAccessor.add_project_id_argument(argparser)
63 |         ServerAccessor.add_source_id_argument(argparser)
64 |         ServerAccessor.add_user_id_argument(argparser)
65 |         ServerAccessor.add_arguments(argparser)
66 | 
67 | 
68 | def main(**kwargs):
69 |     StreusselPassageUploader(**kwargs).upload_streussel_passage_file(**kwargs)
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     argument_parser = argparse.ArgumentParser(description=desc)
74 |     StreusselPassageUploader.add_arguments(argument_parser)
75 |     main(**vars(argument_parser.parse_args()))
76 |     sys.exit(0)
77 | 
--------------------------------------------------------------------------------
/uccaapp/upload_task.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import logging
4 | import sys
5 | 
6 | from requests.exceptions import HTTPError
7 | import json
8 | 
9 | from ucca.convert import to_json, to_text
10 | from ucca.ioutil import get_passages_with_progress_bar
11 | from uccaapp.api import ServerAccessor
12 | 
13 | try:
14 |     from simplejson.scanner import JSONDecodeError
15 | except ImportError:
16 |     from json.decoder import JSONDecodeError
17 | 
18 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task"""
19 | 
20 | # https://github.com/omriabnd/UCCA-App/blob/master/UCCAApp_REST_API_Reference.pdf
21 | # ucca-demo.cs.huji.ac.il or ucca.staging.cs.huji.ac.il
22 | # upload the parse as a (completed) task:
23 | # 0. decide which project and user you want to assign it to
24 | # 1. POST passage (easy format)
25 | # 2. POST task x (of type tokenization)
26 | # 3. PUT task x (submit)
27 | # 4. POST task y (of type annotation with parent x; this is the more complicated format)
28 | # 5. PUT task y (submit)
29 | 
30 | 
31 | class TaskUploader(ServerAccessor):
32 |     def __init__(self, user_id, source_id, project_id, **kwargs):
33 |         super().__init__(**kwargs)
34 |         self.set_source(source_id)
35 |         self.set_project(project_id)
36 |         self.set_user(user_id)
37 | 
38 |     def upload_tasks(self, filenames, log=None, submit=True, existing_ids=None, upload=True, **kwargs):
39 |         del kwargs
40 |         log_h = open(log, "w", encoding="utf-8") if log else None
41 |         if existing_ids:
42 |             with open(existing_ids, "r", encoding="utf-8") as ids_h:
43 |                 ids = {old_passage_id: (passage_id, tok_id, ann_id)
44 |                        for (old_passage_id, passage_id, tok_id, ann_id)
45 |                        in map(str.split, ids_h)}
46 |         else:
47 |             ids = None
48 |         try:
49 |             for passage in get_passages_with_progress_bar(filenames, desc="Uploading"):
50 |                 logging.debug("Uploading passage %s" % passage.ID)
51 |                 task = self.upload_task(passage, log=log_h, submit=submit, ids=ids, upload=upload)  # pass upload through so --no-upload takes effect
52 |                 logging.debug("Submitted task %s" % task.get("id"))  # task dicts have no "id" when upload=False
53 |                 yield task
54 |         except HTTPError as e:
55 |             try:
56 |                 raise ValueError((e.response.json() if e.response else json.loads(e.args[0]))["detail"]) from e
57 |             except JSONDecodeError:
58 |                 raise ValueError(e.response.text) from e
59 |         finally:
60 |             if log:
61 |                 log_h.close()
62 | 
63 |     def upload_task(self, passage, log=None, submit=True, ids=None, upload=True):
64 |         if ids:
65 |             passage_id, tok_id, ann_id = ids[passage.ID]
66 |             passage_out = self.get_passage(passage_id)
67 |             tok_user_task_out = tok_task_out = self.get_user_task(tok_id)
68 |             ann_user_task_in = self.get_user_task(ann_id)
69 |         else:
70 |             passage_out = self.create_passage(text=to_text(passage, sentences=False)[0], type="PUBLIC",
71 |                                               source=self.source, external_id=passage.ID) if upload else passage
72 |             task_in = dict(type="TOKENIZATION", status="ONGOING", project=self.project, user=self.user,
73 |                            passage=passage_out, manager_comment=passage.ID, user_comment=passage.ID, parent=None,
74 |                            is_demo=False, is_active=True)
75 |             tok_task_out = self.create_task(**task_in) if upload else task_in
76 |             tok_user_task_in = dict(tok_task_out)
77 |             tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
78 |             tok_user_task_out = self.submit_task(**tok_user_task_in) if upload else tok_user_task_in
79 |             task_in.update(parent=tok_task_out, type="ANNOTATION")
80 |             ann_user_task_in = self.create_task(**task_in) if upload else task_in
81 |         ann_user_task_in.update(
82 |             to_json(passage, return_dict=True, tok_task=tok_user_task_out, all_categories=self.layer["categories"]))
83 |         ann_user_task_out = self.submit_task(**ann_user_task_in, submit=submit) if upload else ann_user_task_in
84 |         if log:
85 |             print(passage.ID, passage_out["id"], tok_task_out["id"], ann_user_task_out["id"],
86 |                   file=log, sep="\t", flush=True)
87 |         return ann_user_task_out
88 | 
89 |     @staticmethod
90 |     def add_arguments(argparser):
91 |         argparser.add_argument("filenames", nargs="+", help="passage file names to convert and upload")
92 |         argparser.add_argument("-l", "--log", help="filename to write log of uploaded passages to")
93 |         argparser.add_argument("--no-submit", action="store_false", dest="submit", help="do not submit annotation task")
94 |         argparser.add_argument("--existing-ids", help="use existing task IDs from file (output of --log); no creation")
95 |         argparser.add_argument("-n", "--no-upload", action="store_false", dest="upload", help="do not upload anything")
96 |         ServerAccessor.add_project_id_argument(argparser)
97 |         ServerAccessor.add_source_id_argument(argparser)
98 |         ServerAccessor.add_user_id_argument(argparser)
99 |         ServerAccessor.add_arguments(argparser)
100 | 
101 | 
102 | def main(**kwargs):
103 |     list(TaskUploader(**kwargs).upload_tasks(**kwargs))  # drain the generator so all passages are uploaded
104 | 
105 | 
106 | if __name__ == "__main__":
107 |     argument_parser = argparse.ArgumentParser(description=desc)
108 |     TaskUploader.add_arguments(argument_parser)
109 |     main(**vars(argument_parser.parse_args()))
110 |     sys.exit(0)
111 | 
--------------------------------------------------------------------------------
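Finally, a minimal sketch of driving upload_task.py from Python rather than the command line, following the five-step workflow documented in the comment block at the top of that file. The server address, credentials, and IDs are placeholders; test_files/standard3.xml is a sample passage shipped with the repository.

    from uccaapp.upload_task import TaskUploader

    # Placeholder credentials and IDs, as in the sketches above.
    uploader = TaskUploader(user_id=1, source_id=2, project_id=3,
                            server_address="https://ucca-app.example.com",
                            email="me@example.com", password="secret",
                            auth_token=None, verbose=False)
    # upload_tasks is a generator: it yields one annotation task per passage,
    # and the tab-separated ID log it writes can be reused via --existing-ids.
    for task in uploader.upload_tasks(["test_files/standard3.xml"], log="ids.tsv", submit=True):
        print(task["id"])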