├── dep_tregex ├── .gitignore ├── __init__.py ├── tree_state.py ├── conll.py ├── tree_action.py ├── tree_pattern.py ├── tree.py ├── tree_to_html.py ├── __main__.py └── tree_script.py ├── docs ├── Makefile ├── index.rst ├── conf.py ├── tree_ops.rst ├── tregex_tutorial.rst ├── quickstart.rst └── tregex_reference.rst ├── AUTHORS ├── script.txt ├── README.md ├── .gitignore └── LICENSE /dep_tregex/.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | sphinx-build . html 3 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | The following authors have created the source code of "dep_tregex" published and 2 | distributed by YANDEX LLC as the owner: 3 | 4 | Sergey Gubanov 5 | -------------------------------------------------------------------------------- /script.txt: -------------------------------------------------------------------------------- 1 | { 2 | z $++ y form /..../ and can_head z 3 | :: 4 | set form Z 'ZZZ'; 5 | } 6 | 7 | { 8 | w $-- y form 'few' and can_head w 9 | :: 10 | set form w 'WWW'; 11 | } 12 | -------------------------------------------------------------------------------- /dep_tregex/__init__.py: -------------------------------------------------------------------------------- 1 | from dep_tregex.conll import * 2 | from dep_tregex.tree import * 3 | from dep_tregex.tree_action import * 4 | from dep_tregex.tree_pattern import * 5 | from dep_tregex.tree_script import * 6 | from dep_tregex.tree_state import * 7 | from dep_tregex.tree_to_html import * 8 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # dep_tregex 2 | 3 | Python 2 module that implements Stanford Tregex-inspired language for rule-based dependency tree manipulation. 4 | 5 | https://yandex.github.io/dep_tregex/ 6 | 7 | Bonus: tree visualization into SVG! 8 | 9 | ![What if Google morphed into GoogleOS?](https://yandex.github.io/dep_tregex/tree.svg) 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | dep_tregex 3 | ========== 4 | 5 | ``dep_tregex`` is a Python 2 module that implements 6 | `Stanford Tregex `_-inspired 7 | language for rule-based dependency tree manipulation. 8 | 9 | We also have visualization and basic unix-like utilities that work on 10 | dependency trees in `CoNLL format `_: ``head``, 11 | ``wc``, ``shuf``, etc. 12 | 13 | .. toctree:: 14 | :maxdepth: 1 15 | 16 | Getting started <quickstart> 17 | Basic tree operations <tree_ops> 18 | Tregex tutorial <tregex_tutorial> 19 | Tregex reference <tregex_reference> 20 | 21 | If you find a bug or if you want to suggest a feature, please write to 22 | ``esgv@yandex-team.ru``. 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | (C) YANDEX LLC, 2016 2 | 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version in accordance with the following 7 | disclaimer about purposes of usage. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | This program is available for use in non-commercial purposes only. 18 | For use it in commercial purposes you shall receive the written permission 19 | from YANDEX LLC as the owner of this program. 
20 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import time 6 | 7 | ## ---------------------------------------------------------------------------- 8 | # Sphinx configuration 9 | 10 | # Enabled Sphinx extensions 11 | extensions = [ 12 | 'sphinx.ext.autodoc', 13 | 'sphinx.ext.intersphinx', 14 | 'sphinx.ext.extlinks', 15 | 'sphinx.ext.mathjax', 16 | ] 17 | 18 | # Default reStructuredText role. 19 | default_role = None 20 | 21 | # Print warnings for every missing reference 22 | nitpicky = True 23 | 24 | # Class or enumeration members are usually written in the same order it would 25 | # be good to document them 26 | autodoc_member_order = "bysource" 27 | 28 | # Master document is index.rst 29 | master_doc = "index" 30 | 31 | ## ---------------------------------------------------------------------------- 32 | # Options for HTML output 33 | 34 | html_theme_options = {"nosidebar": True} 35 | html_use_smartypants = True 36 | html_show_sourcelink = True 37 | 38 | # Customize footer 39 | # 40 | html_last_updated_fmt = "%b %d, %Y" 41 | html_use_index = True 42 | html_use_modindex = True 43 | html_show_sphinx = True 44 | html_show_copyright = True 45 | 46 | ## ---------------------------------------------------------------------------- 47 | # Intersphinx configuration 48 | 49 | intersphinx_mapping = {} 50 | 51 | # Set up intersphinx for official Python 2 documentation 52 | intersphinx_mapping['python'] = ("http://docs.python.org/2", None) 53 | 54 | ## ---------------------------------------------------------------------------- 55 | # Project-specific configuration 56 | 57 | project = "dep_tregex" 58 | copyright = "Yandex" 59 | -------------------------------------------------------------------------------- /docs/tree_ops.rst: 
-------------------------------------------------------------------------------- 1 | ======================== 2 | Unix-like tree utilities 3 | ======================== 4 | 5 | All utilities read trees from stdin and write trees to stdout. 6 | 7 | ``words`` 8 | ========= 9 | 10 | Extract and print words from trees. 11 | 12 | .. code-block:: none 13 | 14 | python -m'dep_tregex' words 22 | 23 | 24 | ``wc`` 25 | ====== 26 | 27 | Count number of trees. 28 | 29 | .. code-block:: none 30 | 31 | python -m'dep_tregex' wc that matches PATTERN" 81 | 82 | 83 | 84 | +-----------+ w1 -->. PATTERN 85 | | | 86 | | v "Has a *child* to the *right* 87 | w1 ... that matches PATTERN" 88 | 89 | 90 | 91 | +-----------+ w1 .--> PATTERN 92 | | | 93 | | v "Has a *head* to the *left* 94 | ... w1 that matches PATTERN" 95 | 96 | 97 | 98 | +-----------+ w1 .<-- PATTERN 99 | | | "Has a *child* to the *left* 100 | v | that matches PATTERN" 101 | ... w1 102 | 103 | 104 | If the arrow is short, like ``.<-`` vs ``.<--``, this means that the node and 105 | its head/child should be adjacent to each other, that there can be no nodes in 106 | between. 107 | 108 | You can almost imagine the tree from the pattern: 109 | 110 | .. code-block:: none 111 | 112 | (Visually) (Our metaphor) 113 | 114 | +----------------+ w1 <--. w2 and -->. w3 115 | | | 116 | | +-----+ | "Has a head to the right (w2) *and also* has 117 | v | v | a child to the right (w3)" 118 | w1 ... w3 ... w2 119 | 120 | +----------------+ w1 <--. w2 -->. w3 121 | | | +---+ 122 | | | | | "Has a head to the right (w2) *that* has a 123 | v | | v child to the right (w3)" 124 | w1 ... w2 ... w3 125 | 126 | +-------------------+ w1 <--. w2 .<-- w3 127 | | | 128 | | +---+ | "Has a head to the right (w2) that has a 129 | v v | | child to the *left* (w3)" 130 | w1 w3 ... 
w2 131 | 132 | 133 | Strings & regular expressions 134 | ----------------------------- 135 | 136 | Conditions like ``form`` or ``postag`` can either do an exact match 137 | or a regular expression match. 138 | 139 | .. code-block:: none 140 | 141 | n1 form 'cat' 142 | n1 form "dog" 143 | n1 form /cat|dog|catdog/ 144 | 145 | By default, regular expressions match the whole attribute (``/cat/`` won't 146 | match "lolcat"), and are also case-sensitive. If you want substring match or 147 | case insensitivity, use regex flags: 148 | 149 | .. code-block:: none 150 | 151 | n1 form /cat/ # case-sensitive, whole-string "cat" 152 | n1 form /cat/i # case-insensitive, whole-string "cat", "Cat", "CAT", ... 153 | n1 form /cat/g # case-sensitive, substring "cats", "lolcat", ... 154 | n1 form /cat/gi # case-insensitive, substring "CAT", "Lolcat", ... 155 | 156 | Backreferences 157 | -------------- 158 | 159 | Suppose you want to match nodes on the left of their head which have a sibling 160 | on the same side. 161 | 162 | .. code-block:: none 163 | 164 | +----------+ 165 | | | 166 | | +---+ | a <--. c .<-- b 167 | v v | | 168 | >>a<< b c 169 | 170 | 171 | This won't work the way you'd expect: most likely, the pattern will match with 172 | ``a`` and ``b`` assigned to the same node! 173 | 174 | You need another condition that ``a`` and ``b`` should not be the same node; 175 | backreferences come to the rescue. 176 | 177 | .. code-block:: none 178 | 179 | a <--. (c .<-- (b not == a)) 180 | # ^^^^--------- backreference match! 181 | 182 | .. warning:: 183 | 184 | There are severe restrictions on using backreferences. Please see the 185 | :ref:`description of node conditions `. 186 | 187 | Scripts 188 | ------- 189 | 190 | Now that you've mastered tree patterns, let's move on to the tree scripts. 191 | 192 | Tree scripts modify the tree. Each script consists of a pattern, that assigns 193 | backreferences, and of one or more actions. 194 | 195 | .. code-block:: none 196 | 197 | # 1.
Delete all "cat"s. 198 | 199 | { 200 | x form /cat/i 201 | :: 202 | delete node x; 203 | } 204 | 205 | # 2. Move all "dog"s to the beginning. 206 | 207 | { 208 | x form /dog/i $-- (start not $- w) 209 | :: 210 | move node x before node start; 211 | } 212 | 213 | Pretty straightforward. Scripts are executed sequentially; each script is 214 | applied once to each "original" node of the tree: the script is not applied to 215 | the nodes created by it. 216 | 217 | ``move`` and ``copy`` actions 218 | ----------------------------- 219 | 220 | Probably the most important actions are ``move`` and ``copy``. 221 | 222 | .. code-block:: none 223 | 224 | (copy|move) (node|group) X (before|after) (node|group) Y 225 | 226 | e.g: 227 | 228 | copy node X before group Y 229 | move group X after group Y 230 | copy group X before node Y 231 | ... 232 | 233 | Let's discuss one of them, e.g. ``move group X after node Y``. 234 | 235 | First of all, ``group X`` means the action affects not only the node ``X`` but 236 | also its "group": children, children of children, etc. 237 | ``move group X after node Y`` does the following: 238 | 239 | - Gather X and, recursively, all children of X (its "group"); 240 | - Move gathered nodes right after the node Y, preserving initial order and 241 | heads. 242 | 243 | .. code-block:: none 244 | 245 | +========+ (arc X => Y emphasized for clarity) 246 | | +--+ | +--+ 247 | v | v | | v 248 | X x1 Y y1 249 | ^^^^^^ ^--------- position right after Y 250 | | 251 | +--------------- X & children 252 | 253 | 254 | move group X after node Y: 255 | 256 | 257 | +---------------+ 258 | | | 259 | | +==+ +--+ | 260 | | | v | v v 261 | Y X x1 y1 262 | ^^^^^^ 263 | 264 | This also works for non-projective trees. 265 | 266 | ..
code-block:: none 267 | 268 | +================+ 269 | | +---------+ | 270 | | +--|----+ | | 271 | v | v v | | 272 | X y1 x1 Y 273 | ^^-------^^---------- X & children 274 | 275 | 276 | move group X after node Y: 277 | 278 | 279 | +---------+ 280 | | | +==+ +--+ 281 | v | | v | v 282 | y1 Y X x1 283 | 284 | If you want to move (or copy) just the selected word, leaving its children where 285 | they are, use ``node X`` instead of ``group X``. 286 | 287 | 288 | .. code-block:: none 289 | 290 | +================+ 291 | | +---------+ | 292 | | +--|----+ | | 293 | v | v v | | 294 | X y1 x1 Y 295 | ^^ ^^--------- X's children 296 | +------------------ X 297 | 298 | 299 | move node X after node Y: 300 | 301 | +-----------+ 302 | +----|----+ | 303 | | | | +==+ | 304 | v v | | v | 305 | y1 x1 Y X 306 | 307 | ``move ... after group Y`` moves after the last (rightmost) node of the group 308 | of ``Y``. ``move ... before group Y`` moves to the position before the first 309 | (leftmost) node of the group of ``Y``. 310 | 311 | Grouping 312 | ======== 313 | 314 | ``group X Y`` action creates a "virtual arc" from ``X`` to ``Y`` and from ``Y`` 315 | to ``X``. These arcs are not present in a tree, don't affect its connectivity 316 | and acyclicity, don't participate in neighborhood conditions like ``X <--. Y``, 317 | **but** they are traversed for the purpose of determining the *group* of a node. 318 | 319 | ..
code-block:: none 320 | 321 | group X y2 322 | 323 | +--------+ 324 | | +--+| +--+ 325 | v v || | v 326 | X y2 Y y1 327 | ^^^^^^ ^--------- position right after Y 328 | | 329 | +--------------- X & its group 330 | 331 | 332 | move group X after node Y: 333 | 334 | 335 | +---------------+ 336 | |+--------+ | 337 | ||+--+ | | 338 | ||| v v v 339 | Y X y2 y1 340 | ^^^^^^ 341 | 342 | Formally, the group of a node ``X`` is the union of ``X``, all of the groups of 343 | the children of ``X`` and all of the groups of the nodes, grouped with ``X`` 344 | via ``group X Y`` or ``group Y X`` operations. 345 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Getting started 3 | =============== 4 | 5 | 1. Clone the repo ``dep_tregex`` 6 | 7 | .. code-block:: none 8 | 9 | $ git clone https://github.com/yandex/dep_tregex.git 10 | 11 | 2. Test the module. 12 | 13 | .. code-block:: none 14 | 15 | $ cd dep_tregex/ 16 | $ python -m'dep_tregex' 17 | 18 | .. note:: 19 | 20 | If you use python2.6 (or earlier version), you'll have to specify the main module manually: 21 | 22 | .. code-block:: none 23 | 24 | $ python -m'dep_tregex.__main__' 25 | 26 | .. code-block:: none 27 | 28 | usage: python -mdep_tregex [-h] 29 | {words,wc,nth,head,tail,shuf,grep,sed,html,gdb} ... 30 | 31 | positional arguments: 32 | {words,wc,nth,head,tail,shuf,grep,sed,html,gdb} 33 | words extract words from tree 34 | wc count trees 35 | nth print only Nth tree 36 | head print only first N trees 37 | tail print only last N trees 38 | shuf shuffle trees 39 | grep filter trees by pattern 40 | sed apply tree scripts to trees 41 | html view trees in browser 42 | gdb view step-by-step invocation 43 | 44 | optional arguments: 45 | -h, --help show this help message and exit 46 | 47 | 3. Test a cooler feature. 48 | 49 | .. 
code-block:: none 50 | 51 | $ python -m'dep_tregex' html 58 | svg { display: block; } 59 | 60 | 89 | 90 | 152 | 153 | 154 | What 155 | 156 | 157 | 158 | 159 | 160 | root 161 | 162 | 163 | 164 | Morphed 165 | 166 | 167 | 168 | 169 | 170 | advcl 171 | 172 | 173 | 174 | if 175 | 176 | 177 | 178 | 179 | 180 | mark 181 | 182 | 183 | 184 | Google 185 | 186 | 187 | 188 | 189 | 190 | nsubj 191 | 192 | 193 | 194 | GoogleOS 195 | 196 | 197 | 198 | 199 | 200 | nmod 201 | 202 | 203 | 204 | ? 205 | 206 | 207 | 208 | 209 | 210 | punct 211 | 212 | 213 | 214 | Into 215 | 216 | 217 | 218 | 219 | 220 | case 221 | 222 | 223 | 224 | 4. You're all set (just don't leave the ``dep_tregex/`` folder, or just add it to your PYTHONPATH). 225 | -------------------------------------------------------------------------------- /docs/tregex_reference.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Tregex reference 3 | ================ 4 | 5 | .. contents:: 6 | :local: 7 | :depth: 1 8 | 9 | Syntax 10 | ------ 11 | 12 | .. productionlist:: 13 | S: `script`* 14 | script: '{' `pattern` '::' (`action` ';')* '}' 15 | pattern: ID [`condition`] 16 | : '(' `pattern` ')' 17 | condition: '-->.' `pattern` 18 | : '.<--' `pattern` 19 | : '.-->' `pattern` 20 | : '<--.' `pattern` 21 | : '->.' `pattern` 22 | : '.<-' `pattern` 23 | : '.->' `pattern` 24 | : '<-.' 
`pattern` 25 | : '>' `pattern` 26 | : '>>' `pattern` 27 | : '<' `pattern` 28 | : '<<' `pattern` 29 | : '$++' `pattern` 30 | : '$--' `pattern` 31 | : '$+' `pattern` 32 | : '$-' `pattern` 33 | : `attr` `string_cond` 34 | : 'is_top' 35 | : 'is_leaf' 36 | : 'can_head' ID 37 | : 'can_be_headed_by' ID 38 | : '==' ID 39 | : '(' `condition` ')' 40 | : 'not' `condition` 41 | : `condition` 'and' `condition` 42 | : `condition` 'or' `condition` 43 | string_cond: "STRING" 44 | : 'STRING' 45 | : /REGEX/i 46 | : /REGEX/g 47 | : /REGEX/ig 48 | : /REGEX/gi 49 | action: ('copy' | 'move') ('node' | 'group') ID ('before' | 'after') ('node' | 'group') ID 50 | : 'delete' ('node' | 'group') ID 51 | : 'set' `attr` ID STR 52 | : ('set_head' | 'try_set_head') ID ('headed_by' | 'heads') ID 53 | : 'group' ID ID 54 | attr: 'form' 55 | : 'lemma' 56 | : 'cpostag' 57 | : 'postag' 58 | : 'feats' 59 | : 'deprel' 60 | 61 | Script application 62 | ------------------ 63 | 64 | Scripts consist of a sequence of patterns, each pattern paired with a list of 65 | actions. 66 | 67 | .. code-block:: none 68 | 69 | 70 | # 1. Delete all "cat"s. 71 | 72 | { 73 | x form /cat/i 74 | :: 75 | delete node x; 76 | } 77 | 78 | # 2. Copy all "dog"s to the beginning. 79 | 80 | { 81 | x form /dog/i $-- (start not $- w) 82 | :: 83 | copy node x before node start; 84 | } 85 | 86 | Patterns and actions are separated by ``::``. 87 | 88 | Steps of the script are applied sequentially: first ``#1`` several times, 89 | then ``#2`` several times, etc. 90 | 91 | On each step, a script is applied to every possible node of the tree *once*, 92 | and not applied to the nodes created by the script itself. 93 | 94 | An example: 95 | 96 | .. 
code-block:: none 97 | 98 | +---------+ 99 | | +--+ | +--+ 100 | | v | v | v 101 | ROOT cat and dog 102 | 103 | # 1: pattern 104 | x form /cat/i 105 | 106 | +---------+ 107 | | +--+ | +--+ 108 | | v | v | v 109 | ROOT cat and dog 110 | {x} 111 | 112 | #1: actions 113 | delete node x 114 | 115 | +---------+ 116 | | | +--+ 117 | | v | v 118 | ROOT and dog 119 | 120 | # 1: doesn't match 121 | # 2: pattern 122 | x form /dog/i $-- (start not $- w) 123 | 124 | +---------+ 125 | | | +--+ 126 | | v | v 127 | ROOT and dog 128 | {start}{x} 129 | 130 | #2: actions 131 | copy node x before node start 132 | 133 | +---------+ 134 | | +--+ | +--+ 135 | | v | v | v 136 | ROOT dog and dog 137 | (new) (old) 138 | 139 | # 2: doesn't match 140 | # - Node "dog" (new) was created by script #2, and scripts are not applied 141 | # to nodes created by themselves. 142 | # - Node "dog" (old) was already matched by script #2. 143 | 144 | # Done. 145 | 146 | .. _ref-node-conditions: 147 | 148 | Node conditions 149 | --------------- 150 | 151 | ======================= = 152 | ``ATTR STR_COND`` Attribute matches :ref:`string condition <ref-string-conditions>`. Available attributes: ``form``, ``lemma``, ``cpostag``, ``postag``, ``feats``, ``deprel``. 153 | ``is_top`` Node's parent is the root 154 | ``is_leaf`` Node has no children 155 | ``can_head ID`` Whether the tree stays valid (connected & acyclic) if we attach a given :ref:`backreference <ref-backreferences>` to the node. 156 | ``can_be_headed_by ID`` ``X can_be_headed_by Y`` matches whenever ``Y can_head X`` does. 157 | ``== ID`` Node matches a :ref:`backreference <ref-backreferences>` 158 | ======================= = 159 | 160 | .. _ref-backreferences: 161 | 162 | Backreferences 163 | -------------- 164 | 165 | Backreference matches can only be made in subconditions of the pattern where 166 | the reference was set. Like this: 167 | 168 | .. code-block:: none 169 | 170 | vvvv------ backreference match 171 | a <--.
(c .<-- (b not == a)) 172 | ^ ^^^^^^^^^^^^^^^^^^^----- subcondition of 'a' 173 | +------------------------------- reference setup of 'a' 174 | 175 | This is wrong: 176 | 177 | .. code-block:: none 178 | 179 | vvvv--- BAD backreference match 180 | c .<-- (a) and .<-- (b not == a) 181 | ^^------------------------ 'a' has no subconditions 182 | | 183 | +------------------------- reference setup of 'a' 184 | 185 | .. warning:: 186 | 187 | If the backreference match is not in a subcondition, *the system might not 188 | raise an error*. Be careful. 189 | 190 | .. _ref-string-conditions: 191 | 192 | String conditions 193 | ----------------- 194 | 195 | Node conditions like ``form`` or ``deprel`` can be used either to match the form 196 | (or dependency relation) exactly, or with a regular expression. 197 | 198 | .. code-block:: none 199 | 200 | n1 form 'cat' 201 | n1 form "dog" 202 | n1 form /dog|cat/ 203 | 204 | - Strings can be enclosed either in single ``'`` or double ``"`` quotes. 205 | - Regular expressions use extended 206 | `PCRE syntax `_. 207 | - Regular expressions are matched *to the whole string*. If you want 208 | a substring match, e.g. to match a word with a "ni" inside, write ``/ni/g``. 209 | - Regular expressions are case-sensitive. Use ``/.../i`` for case-insensitive 210 | matching. 211 | - Strings support *no escaping*. E.g. you can't write a single-quoted string 212 | with a single quote inside. 213 | - In a similar fashion, regular expressions support no escaping of ``/``: you 214 | can't make a regular expression with ``/`` inside. 215 | - Conditions on FEATS field work like this: 216 | 217 | 1. Feats are printed as a string. 218 | 219 | .. code-block:: none 220 | 221 | Noun|Pnon|Nom|A3sg 222 | 223 | 2. A `string condition `_ is applied. 224 | 225 | .. 
code-block:: none 226 | 227 | w1 feats /Noun/g 228 | 229 | Neighborhood conditions 230 | ----------------------- 231 | 232 | ============================== = 233 | ``-->.`` Has a child to the right 234 | ``.<--`` Has a child to the left 235 | ``.-->`` Has a head to the left 236 | ``<--.`` Has a head to the right 237 | ``->.`` Has a child immediately to the right 238 | ``.<-`` Has a child immediately to the left 239 | ``.->`` Has a head immediately to the left 240 | ``<-.`` Has a head immediately to the right 241 | ``>`` Node has a child. 242 | ``<`` Node has a parent. 243 | ``>>`` Node has a descendant. 244 | ``<<`` Node has an ancestor. 245 | ``$++`` Has a neighbor to the right 246 | ``$--`` Has a neighbor to the left 247 | ``$+`` Has a neighbor immediately to the right 248 | ``$-`` Has a neighbor immediately to the left 249 | ============================== = 250 | 251 | Script actions 252 | -------------- 253 | 254 | ============================================================== = 255 | ``(move|copy) (node|group) ID (after|before) (node|group) ID`` Move or copy node (or the whole group) to given position 256 | ``delete (node|group) ID`` Delete a node (or the whole group) 257 | ``set ATTR ID STR`` Set node's attribute. Available attributes: ``form``, ``lemma``, ``cpostag``, ``postag``, ``feats``, ``deprel`` 258 | ``set_head IDa (headed_by|heads) IDb`` Set node's head (``IDb`` becomes the head of ``IDa`` if ``IDa headed_by IDb``, otherwise vice versa). *Fail* if tree becomes cyclic or disconnected 259 | ``try_set_head IDa (headed_by|heads) IDb`` Set node's head. *Do not fail* if tree becomes cyclic or disconnected 260 | ``group IDa IDb`` Consider ``IDa`` in a group of ``IDb`` and vice versa 261 | ============================================================== = 262 | 263 | - ``group X Y`` creates virtual arcs from ``X`` to ``Y`` and from ``Y`` to 264 | ``X``, considered only for determining node's group in ``move``, ``copy``, and 265 | ``delete`` operations.
266 | - The *group* of node ``X`` is ``X``, union of the *groups* of children of 267 | ``X``, and union of the *groups* of nodes ``n`` that were grouped with ``X`` 268 | using ``group X n`` or ``group n X`` operation. 269 | - ``move`` and ``copy`` actions can move either the node or the whole group. 270 | If the whole group is moved, all nodes from the group are gathered and put 271 | together into desired position, one node adjacent to the other, preserving 272 | initial relative order. 273 | - ``move (node|group) Y after group X`` moves ``Y`` after the last node of 274 | ``X`` 's *group*. ``move (node|group) Y before group X`` moves ``Y`` before 275 | the first node of ``X`` 's *group*. 276 | - ``move`` and ``copy`` actions can make the tree non-projective. 277 | - ``set_head`` fails if the new head is a (possibly indirect) child of the node 278 | we're trying to set head on. ``try_set_head`` does nothing in this case. The 279 | use of the former is encouraged in development, the latter -- in production. 280 | 281 | Root node 282 | --------- 283 | 284 | There is a special node in the tree, that binds it together: the ``ROOT`` node. 285 | 286 | .. code-block:: none 287 | 288 | +-----------+ 289 | | +--+ | +--+ 290 | | v | v | v 291 | (ROOT) cat and dog 292 | 293 | It is introduced for the tree to always be connected in case the tree 294 | syntactically encodes more than one sentence. 295 | 296 | .. code-block:: none 297 | 298 | +---------------------+ 299 | |+----+ | 300 | || |+----+ +---+| 301 | || v| v v |v 302 | (ROOT) cat . And dog 303 | \_____/ \______/<--- Sentence 2 304 | ^------------------ Sentence 1 305 | 306 | **The root node is never matched by any pattern**. 307 | 308 | Operator priority 309 | ----------------- 310 | 311 | ============= ======= 312 | 1 (highest) ``not`` 313 | 2 ``and`` 314 | 3 (lowest) ``or`` 315 | ============= ======= 316 | 317 | Also, ``and`` and ``or`` append conditions to the innermost node, e.g. 318 | 319 | ..
code-block:: none 320 | 321 | a <--. b <--. c and .<-- d 322 | 323 | Is equivalent to 324 | 325 | .. code-block:: none 326 | 327 | a <--. (b <--. c and .<-- d) 328 | \____/ \____/ <----- Condition 2 on "b" 329 | ^-------------------- Condition 1 on "b" 330 | 331 | **NOT** to 332 | 333 | .. code-block:: none 334 | 335 | a <--. (b <--. c) and (.<-- d) 336 | \_____________/ \______/ <-- Condition 2 on "a" 337 | ^------------------------- Condition 1 on "a" 338 | -------------------------------------------------------------------------------- /dep_tregex/tree_pattern.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import re 4 | 5 | class TreePattern: 6 | """ 7 | Base class for a tree pattern. 8 | Tree pattern matches a single node in dependency tree. 9 | """ 10 | 11 | def match(self, tree, node, backrefs_map): 12 | """ 13 | Return whether a node matches this pattern. 14 | 15 | - tree: a Tree 16 | - node: node index (1-based, 0 means "root") to match on 17 | - backrefs_map: contains backreferences to nodes in tree 18 | (dict: unicode -> int), which will be available after the 19 | whole-pattern match. Backreferences also can be used to communicate 20 | with sub-patterns (see e.g. SetBackref and EqualsBackref). 21 | 22 | All patterns should comply with the invariant: 23 | 24 | * If pattern not matches, backrefs_map should be left intact. 25 | * If pattern matches, it may write something to backrefs_map. 26 | """ 27 | raise NotImplementedError() 28 | 29 | def compile_regex(pattern, ignore_case, anywhere): 30 | """ 31 | Return Python compiled regex. Match your string against it with 32 | 'r.search(s)'. 33 | 34 | - ignore_case: ignore case of s. 35 | - anywhere: if False, r.search(s) does a whole-string match. 
## ----------------------------------------------------------------------------
#                                  Children

class HasLeftChild(TreePattern):
    """
    Match a node having a child positioned to its left that satisfies the
    condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children(node):
            if child > node:
                continue
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

class HasRightChild(TreePattern):
    """
    Match a node having a child positioned to its right that satisfies the
    condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children(node):
            if child < node:
                continue
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

class HasChild(TreePattern):
    """
    Match a node having any child that satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children(node):
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

class HasSuccessor(TreePattern):
    """
    Match a node having any descendant (child, grandchild, etc.) that
    satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children_recursive(node):
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

class HasAdjacentLeftChild(TreePattern):
    """
    Match a node whose immediate left neighbor is its child and satisfies
    the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children(node):
            if child + 1 != node:
                continue
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

class HasAdjacentRightChild(TreePattern):
    """
    Match a node whose immediate right neighbor is its child and satisfies
    the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children(node):
            if child - 1 != node:
                continue
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

class HasAdjacentChild(TreePattern):
    """
    Match a node having an immediately adjacent (left or right) child that
    satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for child in tree.children(node):
            if (child - node) not in [-1, +1]:
                continue
            if self.condition.match(tree, child, backrefs_map):
                return True
        return False

## ----------------------------------------------------------------------------
#                                  Parents

class HasLeftHead(TreePattern):
    """
    Match a non-root node whose head is to its left and satisfies the
    condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        # Node 0 is the root; it has no head.
        if node == 0:
            return False

        head = tree.heads(node)
        return head < node and self.condition.match(tree, head, backrefs_map)

class HasRightHead(TreePattern):
    """
    Match a non-root node whose head is to its right and satisfies the
    condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        head = tree.heads(node)
        return head > node and self.condition.match(tree, head, backrefs_map)

class HasHead(TreePattern):
    """
    Match a non-root node whose head satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        head = tree.heads(node)
        return self.condition.match(tree, head, backrefs_map)

class HasPredecessor(TreePattern):
    """
    Match a node having any ancestor (head, head of the head, etc., up to
    and including the root node 0) that satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        # Climb the chain of heads. The root (node 0) has no head, and asking
        # the tree for one raises IndexError, so the climb stops as soon as
        # the root has been tested. Starting at the root yields no ancestors
        # at all, i.e. no match, consistently with the other head patterns.
        while node != 0:
            node = tree.heads(node)
            if self.condition.match(tree, node, backrefs_map):
                return True
        return False

class HasAdjacentLeftHead(TreePattern):
    """
    Match a non-root node whose head is its immediate left neighbor and
    satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        head = tree.heads(node)
        adjacent = (head + 1 == node)
        return adjacent and self.condition.match(tree, head, backrefs_map)

class HasAdjacentRightHead(TreePattern):
    """
    Match a non-root node whose head is its immediate right neighbor and
    satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        head = tree.heads(node)
        adjacent = (head - 1 == node)
        return adjacent and self.condition.match(tree, head, backrefs_map)

class HasAdjacentHead(TreePattern):
    """
    Match a non-root node whose head is immediately adjacent to it (on
    either side) and satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        head = tree.heads(node)
        adjacent = (head - node) in [-1, +1]
        return adjacent and self.condition.match(tree, head, backrefs_map)

## ----------------------------------------------------------------------------
#                                 Neighbors

class HasLeftNeighbor(TreePattern):
    """
    Match a node having any node to its left (including the root node 0)
    that satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        for neighbor in range(0, node):
            if self.condition.match(tree, neighbor, backrefs_map):
                return True
        return False

class HasRightNeighbor(TreePattern):
    """
    Match a node having any node to its right that satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        for neighbor in range(node + 1, len(tree) + 1):
            if self.condition.match(tree, neighbor, backrefs_map):
                return True
        return False

class HasAdjacentLeftNeighbor(TreePattern):
    """
    Match a node whose immediate left neighbor (possibly the root node 0)
    satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        neighbor = node - 1
        return self.condition.match(tree, neighbor, backrefs_map)

class HasAdjacentRightNeighbor(TreePattern):
    """
    Match a node whose immediate right neighbor satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        # The last word has nothing to its right.
        if node == len(tree):
            return False

        neighbor = node + 1
        return self.condition.match(tree, neighbor, backrefs_map)

## ----------------------------------------------------------------------------
#                            Misc. tree structure

class CanHead(TreePattern):
    """
    Match a node that is neither the node stored under the backreference
    nor one of its descendants, i.e. a node that could become its head
    without creating a cycle. Never matches if the backreference is unset.
    """
    def __init__(self, backref):
        self.backref = backref

    def match(self, tree, node, backrefs_map):
        if self.backref not in backrefs_map:
            return False

        head = node
        child = backrefs_map[self.backref]
        return head not in [child] + tree.children_recursive(child)

class CanBeHeadedBy(TreePattern):
    """
    Match a node such that the node stored under the backreference is
    neither this node nor one of its descendants, i.e. the stored node could
    become the head of this node without creating a cycle. Never matches if
    the backreference is unset.
    """
    def __init__(self, backref):
        self.backref = backref

    def match(self, tree, node, backrefs_map):
        if self.backref not in backrefs_map:
            return False

        head = backrefs_map[self.backref]
        child = node
        return head not in [child] + tree.children_recursive(child)

class IsRoot(TreePattern):
    """
    Match the root node (node 0) only.
    """
    def match(self, tree, node, backrefs_map):
        return node == 0

class NotRoot(TreePattern):
    """
    Match any non-root node that satisfies the condition.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        return node != 0 and self.condition.match(tree, node, backrefs_map)

class IsTop(TreePattern):
    """
    Match a non-root node whose head is the root node.
    """
    def match(self, tree, node, backrefs_map):
        return node != 0 and tree.heads(node) == 0

class IsLeaf(TreePattern):
    """
    Match a node that has no children.
    """
    def match(self, tree, node, backrefs_map):
        return not tree.children(node)

## ----------------------------------------------------------------------------
#                                 Attributes

class AttrMatches(TreePattern):
    """
    Match a non-root node whose attribute (looked up on the tree by accessor
    name) is accepted by the predicate.
    """
    def __init__(self, attr, pred_fn):
        self.attr = attr
        self.pred_fn = pred_fn

    def match(self, tree, node, backrefs_map):
        # The root node carries no attributes.
        if node == 0:
            return False

        accessor = getattr(tree, self.attr)
        return self.pred_fn(accessor(node))

class FeatsMatch(TreePattern):
    """
    Match a non-root node whose features, joined with '|', are accepted by
    the predicate.
    """
    def __init__(self, pred_fn):
        self.pred_fn = pred_fn

    def match(self, tree, node, backrefs_map):
        if node == 0:
            return False

        joined_feats = u'|'.join(tree.feats(node))
        return self.pred_fn(joined_feats)

## ----------------------------------------------------------------------------
#                                   Logic

class And(TreePattern):
    """
    Match if every sub-condition matches, evaluated in order.
    On failure, the backreference map is left exactly as it was on entry.
    """
    def __init__(self, conditions):
        self.conditions = conditions

    def match(self, tree, node, backrefs_map):
        # Snapshot taken up front so a late failure can undo the backrefs
        # set by the earlier, successful sub-conditions.
        snapshot = backrefs_map.copy()

        if all(c.match(tree, node, backrefs_map) for c in self.conditions):
            return True

        backrefs_map.clear()
        backrefs_map.update(snapshot)
        return False

class Or(TreePattern):
    """
    Match if any sub-condition matches; evaluation stops at the first match.
    """
    def __init__(self, conditions):
        self.conditions = conditions

    def match(self, tree, node, backrefs_map):
        return any(c.match(tree, node, backrefs_map) for c in self.conditions)

class Not(TreePattern):
    """
    Match if the sub-condition does not match.
    """
    def __init__(self, condition):
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        # The sub-condition may write backrefs when it matches, but in that
        # case the negation fails and those writes must stay invisible to
        # the caller. Hence it only ever sees a scratch copy of the map.
        scratch = backrefs_map.copy()
        matched = self.condition.match(tree, node, scratch)
        return not matched

class AlwaysTrue(TreePattern):
    """
    Match every node.
    """
    def match(self, tree, node, backrefs_map):
        return True

## ----------------------------------------------------------------------------
#                                  Backrefs

class SetBackref(TreePattern):
    """
    Store the current node under a backreference name, then evaluate the
    condition. If the condition fails, the previous state of that
    backreference is restored.
    """
    def __init__(self, backref, condition):
        self.backref = backref
        self.condition = condition

    def match(self, tree, node, backrefs_map):
        name = self.backref

        # Remember what was there (None stands for "nothing").
        previous = backrefs_map.get(name)

        # Publish the backref before matching, so the condition can use it.
        backrefs_map[name] = node
        if self.condition.match(tree, node, backrefs_map):
            return True

        # Roll back: drop the key if it was absent, else put the old value.
        if previous is None:
            del backrefs_map[name]
        else:
            backrefs_map[name] = previous
        return False

class EqualsBackref(TreePattern):
    """
    Match the node currently stored under the backreference name.
    """
    def __init__(self, backref):
        self.backref = backref

    def match(self, tree, node, backrefs_map):
        return backrefs_map.get(self.backref) == node

def _check_is_not_a_str_list(l, name):
    """
    Raise ValueError if 'l' is a non-empty list made only of byte strings.

    'bytes' is an alias of 'str' on Python 2, so the Python 2 behavior is
    unchanged; on Python 3, ordinary (unicode) 'str' values are accepted and
    only 'bytes' values are rejected.
    """
    if l and all(isinstance(s, bytes) for s in l):
        raise ValueError((
            "'%s' is a list of 'str', not a list of 'unicode'. "
            "Please don't use non-unicode strings in Python 2.7. "
            "To convert 'str' to 'unicode', use s.decode('utf-8')."
        ) % name)

class Tree:
    # - Constructor - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def __init__(self, forms, lemmas, cpostags, postags, feats, heads, deprels):
        """
        Construct a tree.

        forms: list of 'unicode'.
        lemmas: list of 'unicode'.
        cpostags: list of 'unicode'.
        postags: list of 'unicode'.
        feats: list of featuresets; each featureset is a list of 'unicode'.
        heads: list of int; 1-based index of the head, 0 means "root node".
        deprels: list of 'unicode'.

        Raise ValueError if the lists differ in length, if a head is out of
        range, if a list consists of byte strings, or if the heads do not
        form a tree rooted at node 0.

        Mutators re-run this constructor on an existing object, so nothing
        is stored on 'self' until all checks have passed: a failed mutation
        leaves the tree as it was.
        """

        # Copy.
        forms = list(forms)
        lemmas = list(lemmas)
        cpostags = list(cpostags)
        postags = list(postags)
        feats = list(feats)
        heads = list(heads)
        deprels = list(deprels)

        # Check lengths.
        N = len(forms)
        msg = 'invalid %s: %r. Expected %i elements.'
        named_lists = [
            ('lemmas', lemmas),
            ('cpostags', cpostags),
            ('postags', postags),
            ('feats', feats),
            ('heads', heads),
            ('deprels', deprels),
        ]
        for list_name, values in named_lists:
            if len(values) != N:
                raise ValueError(msg % (list_name, values, N))

        # Check indices.
        if not all(0 <= head <= N for head in heads):
            msg = 'invalid heads in %i-word tree: %r'
            raise ValueError(msg % (N, heads))

        # Check unicodeness.
        _check_is_not_a_str_list(forms, 'Tree.forms')
        _check_is_not_a_str_list(lemmas, 'Tree.lemmas')
        _check_is_not_a_str_list(cpostags, 'Tree.cpostags')
        _check_is_not_a_str_list(postags, 'Tree.postags')
        _check_is_not_a_str_list(feats, 'Tree.feats')
        _check_is_not_a_str_list(heads, 'Tree.heads')
        _check_is_not_a_str_list(deprels, 'Tree.deprels')

        # Compose children index.
        children = [[] for node in range(N + 1)]
        for node, head in enumerate(heads, start=1):
            children[head].append(node)

        # Check tree validity: connectivity and looplessness.
        # Breadth-first walk from the root; every word must be reached.
        queue = [0]
        visited = set()
        i = 0

        while i < len(queue):
            node = queue[i]
            visited.add(node)
            i += 1

            for child in children[node]:
                if child in visited:
                    raise ValueError('loop in a tree; heads %r' % heads)
                queue.append(child)

        if len(queue) != N + 1:
            raise ValueError('disconnected node, heads %r' % heads)

        # Store (only now that everything is known to be valid).
        self._forms = forms
        self._lemmas = lemmas
        self._cpostags = cpostags
        self._postags = postags
        self._feats = feats
        self._heads = heads
        self._deprels = deprels
        self._children = children

    # - Getters - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def __len__(self):
        """
        Return number of words in the tree.
        """
        return len(self._forms)

    def forms(self, i):
        """
        Return FORM for i'th word.
        i is 1-based.
        """
        if i <= 0:
            raise IndexError()
        return self._forms[i - 1]

    def lemmas(self, i):
        """
        Return LEMMA for i'th word.
        i is 1-based.
        """
        if i <= 0:
            raise IndexError()
        return self._lemmas[i - 1]

    def cpostags(self, i):
        """
        Return CPOSTAG for i'th word.
        i is 1-based.
        """
        if i <= 0:
            raise IndexError()
        return self._cpostags[i - 1]

    def postags(self, i):
        """
        Return POSTAG for i'th word.
        i is 1-based.
        """
        if i <= 0:
            raise IndexError()
        return self._postags[i - 1]

    def feats(self, i):
        """
        Return FEATS for i'th word, a list of string features.
        i is 1-based.
        """
        if i <= 0:
            raise IndexError()
        return self._feats[i - 1]

    def heads(self, i):
        """
        Return HEAD for i'th word.
        i and result are 1-based; result 0 means "root node".
        """
        if i <= 0:
            raise IndexError()
        return self._heads[i - 1]

    def deprels(self, i):
        """
        Return DEPREL for i'th word.
        i is 1-based.
        """
        if i <= 0:
            raise IndexError()
        return self._deprels[i - 1]

    def children(self, i):
        """
        Return a list of children for i'th word.
        i is 1-based; 0 means "root node".
        """
        if i < 0:
            raise IndexError()
        return self._children[i]

    def children_recursive(self, i):
        """
        Return a list of all descendants (children, grandchildren, etc.) for
        i'th word, each child immediately followed by its own descendants.
        i is 1-based; 0 means "root node".
        """
        # Explicit stack instead of recursion: same pre-order, but no
        # recursion-depth limit on very deep trees.
        result = []
        stack = list(reversed(self.children(i)))
        while stack:
            node = stack.pop()
            result.append(node)
            stack.extend(reversed(self.children(node)))
        return result

    # - Mutators - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def append(self, forms, lemmas, cpostags, postags, feats, heads, deprels):
        """
        Append new nodes to the tree.
        Arguments are the same as in constructor.
        """
        self.__init__(
            self._forms + list(forms),
            self._lemmas + list(lemmas),
            self._cpostags + list(cpostags),
            self._postags + list(postags),
            self._feats + list(feats),
            self._heads + list(heads),
            self._deprels + list(deprels)
            )

    def reorder(self, new_index_by_old_index):
        """
        Reorder nodes in the tree.
        Indices are 0-based.

        Raise ValueError unless the argument is a permutation of
        0 .. len(tree) - 1.
        """
        N = len(self)
        new_indices = list(new_index_by_old_index)

        # Check remapping.
        # - No index should be occupied twice.
        # - No index should be left unset.
        # (Compared as sets, because on Python 3 a list never equals a
        # 'range' object.)
        if len(new_indices) != N or set(new_indices) != set(range(N)):
            raise ValueError('invalid reordering: %r' % new_indices)

        # Reorder tree.
        forms = [None] * N
        lemmas = [None] * N
        cpostags = [None] * N
        postags = [None] * N
        feats = [None] * N
        heads = [None] * N
        deprels = [None] * N

        for old_index, new_index in enumerate(new_indices):
            # Heads are 1-based with 0 for the root; indices here are 0-based.
            old_head = self._heads[old_index]
            if old_head == 0:
                new_head = 0
            else:
                new_head = new_indices[old_head - 1] + 1

            forms[new_index] = self._forms[old_index]
            lemmas[new_index] = self._lemmas[old_index]
            cpostags[new_index] = self._cpostags[old_index]
            postags[new_index] = self._postags[old_index]
            feats[new_index] = self._feats[old_index]
            heads[new_index] = new_head
            deprels[new_index] = self._deprels[old_index]

        # Update.
        self.__init__(forms, lemmas, cpostags, postags, feats, heads, deprels)

    def delete(self, nodes):
        """
        Delete specified nodes from the tree.
        Lift the arcs of the orphaned nodes until their heads are non-deleted.

        'nodes' is a 1-based index, or a set/list/tuple of them.
        Raise IndexError if an index is out of range.
        """
        # Check indices.
        N = len(self)
        if not isinstance(nodes, (set, list, tuple)):
            nodes = [nodes]
        if not all(0 < node <= N for node in nodes):
            raise IndexError()

        # Reparent orphaned nodes.
        # Lift the arc until the parent is non-deleted node.
        # If all parents are deleted, we will hit the root eventually.
        deleted = set(nodes)
        alive_heads = [None] * N
        for node in range(1, N + 1):
            head = self.heads(node)
            while head in deleted:
                head = self.heads(head)
            alive_heads[node - 1] = head

        # Remap.
        new_nodes = {0: 0}
        new_node = 1

        for node in range(1, N + 1):
            if node in deleted:
                continue
            new_nodes[node] = new_node
            new_node += 1

        # Gather non-deleted stuff.
        forms = []
        lemmas = []
        cpostags = []
        postags = []
        feats = []
        heads = []
        deprels = []

        for node in range(1, N + 1):
            if node in deleted:
                continue
            forms.append(self.forms(node))
            lemmas.append(self.lemmas(node))
            cpostags.append(self.cpostags(node))
            postags.append(self.postags(node))
            feats.append(self.feats(node))
            heads.append(new_nodes[alive_heads[node - 1]])
            deprels.append(self.deprels(node))

        # Construct new tree.
        self.__init__(forms, lemmas, cpostags, postags, feats, heads, deprels)

    def set_head(self, node, head):
        """
        Make 'head' the head of the 'node'.
        If that breaks tree-ness (e.g. creates a cycle), raise ValueError.
        """
        # Check indices.
        if head in [node] + self.children_recursive(node):
            msg = 'future head %i is a (possibly indirect) child of %i'
            raise ValueError(msg % (head, node))
        if node <= 0 or head < 0:
            raise IndexError()

        # Set head.
        heads = self._heads[:]
        heads[node - 1] = head

        # Construct new tree.
        self.__init__(
            self._forms,
            self._lemmas,
            self._cpostags,
            self._postags,
            self._feats,
            heads,
            self._deprels
            )

    def append_copy(self, nodes):
        """
        Append a copy of gathered-together nodes at the end of the tree.
        For every node, preserve the parent unless the parent was copied too.

        'nodes' is a 1-based index, or a set/list/tuple of them.
        Raise IndexError if an index is out of range.
        """
        # Check indices.
        N = len(self)
        if not isinstance(nodes, (set, list, tuple)):
            nodes = [nodes]
        if not all(0 < node <= N for node in nodes):
            raise IndexError()

        # Determine where we want the new nodes.
        copied = sorted(nodes)
        new_nodes = {0: 0}
        new_node = N + 1
        for node in copied:
            new_nodes[node] = new_node
            new_node += 1

        # Prepare to append.
        forms = []
        lemmas = []
        cpostags = []
        postags = []
        feats = []
        heads = []
        deprels = []

        for node in copied:
            # A copied head is redirected to its copy; any other head
            # (including the root, which maps to itself) is kept.
            head = self.heads(node)
            head = new_nodes.get(head, head)

            forms.append(self.forms(node))
            lemmas.append(self.lemmas(node))
            cpostags.append(self.cpostags(node))
            postags.append(self.postags(node))
            feats.append(self.feats(node))
            heads.append(head)
            deprels.append(self.deprels(node))

        # Append.
        self.append(forms, lemmas, cpostags, postags, feats, heads, deprels)

    BEFORE = '-'
    AFTER = '+'

    def move(self, nodes, anchor, where):
        """
        Move gathered-together nodes before or after the given anchor node.

        Moving nodes in a tree is basically a reordering.
        Return index remapping that would've been sufficient for reorder().
        (I.e. new_index_by_old_index).

        'nodes' is a 1-based index, or a set/list/tuple of them.
        'anchor' is a 1-based index; 0 (the root) puts the nodes at the very
        beginning, and 'where' is not consulted in that case.
        'where' can be either "-" (Tree.BEFORE) or "+" (Tree.AFTER).

        Raise IndexError if an index is out of range, ValueError if 'where'
        is invalid for a non-root anchor.
        """
        N = len(self)
        if not isinstance(nodes, (set, list, tuple)):
            nodes = [nodes]
        if not all(0 < node <= N for node in nodes):
            raise IndexError()
        if not 0 <= anchor <= N:
            raise IndexError()
        if anchor != 0 and where not in (self.BEFORE, self.AFTER):
            raise ValueError('invalid position: %r' % (where,))

        # Compose a reordering.
        what = set(nodes)
        new_indices = [None] * N
        new_index = 0

        # Nodes up to, but not including, anchor.
        for node in range(1, anchor):
            if node in what:
                continue
            new_indices[node - 1] = new_index
            new_index += 1

        # Anchor (move after).
        if where == self.AFTER and anchor != 0:
            new_indices[anchor - 1] = new_index
            new_index += 1

        # New nodes.
        # (The anchor may itself be among them; it is placed separately.)
        for node in sorted(what):
            if node != anchor:
                new_indices[node - 1] = new_index
                new_index += 1

        # Anchor (move before).
        if where == self.BEFORE and anchor != 0:
            new_indices[anchor - 1] = new_index
            new_index += 1

        # Nodes from anchor (not including) to the end.
        for node in range(anchor + 1, N + 1):
            if node in what:
                continue
            new_indices[node - 1] = new_index
            new_index += 1

        # Reorder.
        self.reorder(new_indices)
        return new_indices

21 | 22 | # ~ An arrow looks like this ~ 23 | # _ 24 | # __-- 25 | # __\ __-- 26 | # arrow __-- \ __-- ++ 27 | # sz __-- \ __-- ++ 28 | # __-- \ -- ++ 29 | # \-- o ++ ----- arrow spread 30 | # \ oooo ++ angle 31 | # \ ooooooo ++ 32 | # \ oooooooooo______________________++__________________v____ 33 | # ooooooooooooo __________________________________________|____ 34 | # | oooooooo|o ^ 35 | # | oooo|oo | 36 | # | |ooo arc width 37 | # | | o 38 | # |<--------->| 39 | # middle sz 40 | 41 | # Height of a "flight level" for arcs. 42 | _ARC_HEIGHT_UNIT = _SMALL_FONT * 1.2 43 | 44 | # Horizontal distance between endpoint of an incoming arc 45 | # and starting point of an outgoing arc of the same word. 46 | _PORT_OFFSET = _BIG_FONT / 2. 47 | 48 | _COLOR_BIG = '#000' # Color for inactive big text. 49 | _COLOR_BIG_H0 = '#888' # Very dim highlight color for big text. 50 | _COLOR_BIG_H1 = '#c00' # Dim highlight color for big text. 51 | _COLOR_BIG_H2 = '#f00' # Bright highlight color for big text. 52 | _COLOR_BIG_HU = '#08c' # User-highlighted big text. 53 | _COLOR_SMALL = '#444' # Color for incative small text. 54 | _COLOR_SMALL_H0 = '#666' # Very dim highlight color for small text. 55 | _COLOR_SMALL_H1 = '#800' # Dim highlight color for small text. 56 | _COLOR_SMALL_H2 = '#900' # Bright highlight color for small text 57 | _COLOR_SMALL_HU = '#048' # User-highlighted small text. 58 | 59 | # SVG-wide stylesheet. 
60 | _STYLE = u"""\ 61 | """ % ( 90 | _TRANSITION, _SMALL_FONT, _SMALL_FONT, # Generic 91 | _BIG_FONT, _COLOR_BIG, _SMALL_FONT, _COLOR_SMALL, # .big, .small 92 | _COLOR_BIG_HU, _COLOR_SMALL_HU, # .user-hl 93 | _SMALL_FONT, _COLOR_SMALL, # .role 94 | _ARC_WIDTH, _COLOR_BIG, # .arc 95 | # Label highlight 96 | _COLOR_BIG_H2, _COLOR_SMALL_H2, _COLOR_BIG_H2, _COLOR_BIG_H2, _COLOR_BIG_H2, 97 | _COLOR_BIG_H2, _COLOR_BIG_H2, _COLOR_BIG_H2 # Arc hover 98 | ) 99 | 100 | _PROLOGUE_HTML = u"""\ 101 | 102 | 103 | 104 | 105 | 108 | %s 109 | 110 | 111 | """ % _STYLE 112 | 113 | _EPILOGUE_HTML = u"""\ 114 | 115 | 116 | """ 117 | 118 | # Styles that are applied to a tree node when its immediate head is 119 | # hovered over. 120 | _HEAD_HOVER_STYLES = [ 121 | u'.w%%i > text.big { fill: %s; }' % (_COLOR_BIG_H1,), 122 | u'.w%%i > text.small { fill: %s; }' % (_COLOR_SMALL_H1,), 123 | u'.a%%i > text.role { fill: %s; }' % (_COLOR_BIG_H1,), 124 | u'.a%%i > path.arc { stroke: %s; }' % (_COLOR_BIG_H1,), 125 | u'.a%%i > path.arrow { fill: %s; }' % (_COLOR_BIG_H1,) 126 | ] 127 | 128 | # Styles that are applied to a tree node when its parent is hovered over, 129 | _PARENT_HOVER_STYLES = [ 130 | u'.w%%i > text.big { fill: %s; }' % (_COLOR_BIG_H0,), 131 | u'.w%%i > text.small { fill: %s; }' % (_COLOR_SMALL_H0,), 132 | u'.a%%i > text.role { fill: %s; }' % (_COLOR_BIG_H0,), 133 | u'.a%%i > path.arc { stroke: %s; }' % (_COLOR_BIG_H0,), 134 | u'.a%%i > path.arrow { fill: %s; }' % (_COLOR_BIG_H0,) 135 | ] 136 | 137 | ## ----------------------------------------------------------------------------- 138 | # Utilities 139 | 140 | def _label(tree, node, fields): 141 | """ 142 | Compose a label for i'th word of a tree, according to 'fields'. 143 | Return a string (possibly multiline). 144 | """ 145 | # Form (always). 146 | label = tree.forms(node) 147 | 148 | # Lemma. 149 | if 'lemma' in fields: 150 | label += u'\n' + tree.lemmas(node) 151 | 152 | # Postags. 
153 | postags = [] 154 | if 'cpostag' in fields: 155 | postags.append(tree.cpostags(node)) 156 | if 'postag' in fields: 157 | postags.append(tree.postags(node)) 158 | if postags: 159 | label += u'\n' + u'/'.join(postags) 160 | 161 | # Features. 162 | if 'feats' in fields: 163 | label += u'\n' + u'|'.join(tree.feats(node)) 164 | 165 | return cgi.escape(label) 166 | 167 | def _label_height(text): 168 | """ 169 | Return label text height. 170 | First line is in big font, other lines are in small font. 171 | """ 172 | return _BIG_FONT + _SMALL_LINE * text.count(u'\n') 173 | 174 | def _label_width(text): 175 | """ 176 | Return label text width. 177 | Width of a single glyph is considered to be equal to font size. 178 | First line is in big font, other lines are in small font. 179 | """ 180 | width = 0 181 | for lineno, line in enumerate(text.split(u'\n')): 182 | size = [_BIG_FONT, _SMALL_FONT][lineno > 0] # Cool idiom, huh? 183 | width = max(width, size * len(line)) 184 | return width 185 | 186 | # ~ Arcs are composed of two circular segments and a straight line. ~ 187 | # Segments touch the word at a specific angle. 188 | 189 | def _arc_radius(height_in_units): 190 | """ 191 | Return radius of a circular segment of an arc of a given height level. 192 | """ 193 | return height_in_units * _ARC_HEIGHT_UNIT / (1 - math.cos(_ANGLE)) 194 | 195 | def _arc_min_length(height_in_units): 196 | """ 197 | Return minimal horizontal size for an arc of a given "flight level". 
198 | """ 199 | return 2 * _arc_radius(height_in_units) * math.sin(_ANGLE) 200 | 201 | def _parent_arc_start_offset(tree, node): 202 | head = tree.heads(node) 203 | head_head = tree.heads(head) 204 | 205 | projective = ( 206 | (head_head < head < node) or 207 | (node < head < head_head) or 208 | (head_head < node < head) or 209 | (head < node < head_head) 210 | ) 211 | 212 | if projective: 213 | if node < head: 214 | return -_PORT_OFFSET 215 | else: 216 | return +_PORT_OFFSET 217 | else: 218 | if head_head < head: 219 | return +_PORT_OFFSET 220 | else: 221 | return -_PORT_OFFSET 222 | 223 | def _draw_label(file, text, x, y, css_class): 224 | """ 225 | Draw a multiline label at given position. 226 | Enclose elements in a . 227 | """ 228 | width = _label_width(text) 229 | height = _label_height(text) 230 | 231 | # Start a group. 232 | file.write(u' \n' % css_class) 233 | 234 | # Invisible hover-rectangle. 235 | # Makes it easier to hover over the label. 236 | file.write(u' \n' % 237 | (x - width / 2, y, width, height)) 238 | 239 | # Lines of text. 240 | y += _BIG_FONT 241 | for lineno, line in enumerate(text.split(u'\n')): 242 | file.write(u' %s\n' % 243 | (x, y, 'big' if lineno == 0 else 'small', line)) 244 | y += _SMALL_LINE 245 | 246 | # End a group. 247 | file.write(u' \n') 248 | 249 | def _draw_root_arc(file, x, y, height_in_units, deprel, css_class): 250 | """ 251 | Draw a vertical "arc from the root" to the node at (x, y). 252 | Enclose elements in a . 253 | """ 254 | height = height_in_units * _ARC_HEIGHT_UNIT 255 | 256 | # Start. 257 | file.write(u' \n' % css_class) 258 | 259 | # Path. 260 | path = 'M %i %i L %i %i' % (x, y, x, y - height) 261 | file.write(u' \n' % path) 262 | file.write(u' \n' % path) 263 | 264 | # Arrow. 265 | _draw_arrow(file, x, y, math.pi / 2) 266 | 267 | # Role. 268 | deprel = cgi.escape(deprel) 269 | file.write(u' %s\n' % 270 | (x, y - height - 0.2 * _SMALL_FONT, deprel)) 271 | 272 | # End. 
273 | file.write(u' \n') 274 | 275 | def _draw_arc(file, start_x, end_x, y, height_in_units, deprel, css_class): 276 | """ 277 | Draw an arc from the node at (start_x, y) to the node at (end_x, y). 278 | Enclose elements in a . 279 | """ 280 | height = height_in_units * _ARC_HEIGHT_UNIT 281 | radius = _arc_radius(height_in_units) 282 | length = _arc_min_length(height_in_units) 283 | 284 | # Start. 285 | file.write(u' \n' % css_class) 286 | 287 | # Path. 288 | path = ( 289 | 'M %.2f %.2f' 290 | 'A %.2f %.2f 0 0 1 %.2f %.2f' 291 | 'L %.2f %.2f' 292 | 'A %.2f %.2f 0 0 1 %.2f %.2f' 293 | ) % ( 294 | min(start_x, end_x), y, 295 | radius, radius, min(start_x, end_x) + length / 2, y - height, 296 | max(start_x, end_x) - length / 2, y - height, 297 | radius, radius, max(start_x, end_x), y 298 | ) 299 | file.write(u' \n' % path) 300 | file.write(u' \n' % path) 301 | 302 | # Arrow. 303 | arrow_angle = _ANGLE if start_x > end_x else math.pi - _ANGLE 304 | _draw_arrow(file, end_x, y, arrow_angle) 305 | 306 | # Role. 307 | deprel = cgi.escape(deprel) 308 | file.write(u' %s\n' % 309 | ((start_x + end_x) / 2, y - height - 0.2 * _SMALL_FONT, deprel)) 310 | 311 | # End. 312 | file.write(u' \n') 313 | 314 | def _draw_arrow(file, tip_x, tip_y, angle): 315 | """ 316 | Draw an arrow with a tip at (tip_x, tip_y), "attacking" the surface at a 317 | given angle. 318 | """ 319 | # Offset the tip. 320 | tip_x -= _ARROW_SIZE * 0.2 * math.cos(angle) 321 | tip_y += _ARROW_SIZE * 0.2 * math.sin(angle) 322 | 323 | # Draw the arrow. 
324 | path = ( 325 | 'M %.2f %.2f' 326 | 'L %.2f %.2f' 327 | 'L %.2f %.2f' 328 | 'L %.2f %.2f' 329 | 'Z' 330 | ) % ( 331 | tip_x, tip_y, 332 | tip_x + _ARROW_SIZE * math.cos(angle - _ARROW_SPREAD), 333 | tip_y - _ARROW_SIZE * math.sin(angle - _ARROW_SPREAD), 334 | tip_x + _ARROW_SIZE_MIDDLE * math.cos(angle), 335 | tip_y - _ARROW_SIZE_MIDDLE * math.sin(angle), 336 | tip_x + _ARROW_SIZE * math.cos(angle + _ARROW_SPREAD), 337 | tip_y - _ARROW_SIZE * math.sin(angle + _ARROW_SPREAD), 338 | ) 339 | file.write(u' \n' % (path,)) 340 | 341 | ## ----------------------------------------------------------------------------- 342 | # Main 343 | 344 | def write_prologue_html(file): 345 | file.write(_PROLOGUE_HTML) 346 | 347 | _UID = 0 348 | 349 | def write_tree_html(file, tree, fields=[], highlight_nodes=[], static=False): 350 | N = len(tree) 351 | if N == 0: 352 | return 353 | 354 | # Collect all tree arcs. 355 | arcs = [(node, tree.heads(node)) for node in range(1, N + 1)] 356 | arc_length = lambda arc: abs(arc[0] - arc[1]) 357 | 358 | # Determine height of every arc: 1, 2, 3, etc. 359 | # At each position, track the occupied flight levels. 360 | arc_heights = [0] * N 361 | occupied = [set() for i in range(N)] 362 | 363 | # Assign lower levels to arcs sequentially, starting from shorter arcs. 364 | for arc in sorted(arcs, key=arc_length): 365 | node, head = arc 366 | start, end = min(arc) - 1, max(arc) 367 | 368 | # Skip arcs from the root (they go vertically). 369 | if head == 0: 370 | continue 371 | 372 | # Determine the occupied flight levels below arc. 373 | positions = occupied[start+1:end-1] + [set()] 374 | occupied_below_arc = set.union(*positions) 375 | 376 | # Find the first available flight level. 377 | level = 1 378 | while level in occupied_below_arc: 379 | level += 1 380 | 381 | # Remember the height of the arc. 382 | arc_heights[node - 1] = level 383 | for pos in range(start, end): 384 | occupied[pos].add(level) 385 | 386 | # Assign height for root arcs. 
387 | root_height = max(arc_heights) + 1 388 | for i in range(N): 389 | if arc_heights[i] == 0: 390 | arc_heights[i] = root_height 391 | 392 | # Get and measure labels. 393 | labels = [_label(tree, node, fields) for node in range(1, N + 1)] 394 | label_widths = map(_label_width, labels) 395 | label_heights = map(_label_height, labels) 396 | 397 | # Determine words' centers. 398 | centers = [] 399 | start = _BIG_FONT 400 | for width in label_widths: 401 | centers.append(start + width / 2) 402 | start += width + _BIG_FONT 403 | 404 | # Shift words' centers to accomodate arcs. 405 | for node, head in arcs: 406 | if head == 0: 407 | continue 408 | 409 | # Compute real margin and minimal required margin. 410 | start, end = sorted((node, head)) 411 | margin = centers[end - 1] - centers[start - 1] - 2 * _PORT_OFFSET 412 | min_margin = _arc_min_length(arc_heights[node - 1]) 413 | 414 | # Shift words to the right. 415 | if margin < min_margin: 416 | for i in range(end - 1, len(centers)): 417 | centers[i] += min_margin - margin 418 | 419 | # Compute width and height. 420 | baseline = _BIG_FONT + (max(arc_heights) + 1) * _ARC_HEIGHT_UNIT 421 | svg_width = centers[-1] + label_widths[-1] / 2 + _BIG_FONT 422 | svg_height = baseline + max(label_heights) + _BIG_FONT 423 | 424 | # Assign UID. 425 | global _UID 426 | uid = 'svg%i' % _UID 427 | _UID += 1 428 | 429 | # Start drawing. 430 | file.write(u' \n' % 431 | (svg_width, svg_height, uid)) 432 | 433 | # Write hover styles. 434 | if not static: 435 | file.write(u' \n') 450 | 451 | # Write text and arcs in topsorted order. 452 | queue = tree.children(0)[:] 453 | i = 0 454 | 455 | while i < len(queue): 456 | node = queue[i] 457 | head = tree.heads(node) 458 | center = centers[node - 1] 459 | height = arc_heights[node - 1] 460 | deprel = tree.deprels(node) 461 | label_cls = 'w%i' % node 462 | if node in highlight_nodes: 463 | label_cls += ' user-hl' 464 | arc_cls = 'a%i' % node 465 | 466 | # Draw label. 
467 | _draw_label(file, labels[node - 1], center, baseline, label_cls) 468 | 469 | # Draw arc. 470 | if head == 0: 471 | _draw_root_arc(file, center, baseline, height, deprel, arc_cls) 472 | else: 473 | head_center = centers[head - 1] 474 | head_center += _parent_arc_start_offset(tree, node) 475 | _draw_arc(file, head_center, center, baseline, height, deprel, arc_cls) 476 | 477 | # Enqueue children. 478 | queue += tree.children(node) 479 | i += 1 480 | 481 | # Done. 482 | file.write(u' \n') 483 | 484 | def write_epilogue_html(file): 485 | file.write(_EPILOGUE_HTML) 486 | -------------------------------------------------------------------------------- /dep_tregex/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import cgi 5 | import codecs 6 | import collections 7 | import random 8 | import sys 9 | import tempfile 10 | import webbrowser 11 | 12 | from dep_tregex.conll import * 13 | from dep_tregex.tree_script import * 14 | from dep_tregex.tree_to_html import * 15 | 16 | ## ---------------------------------------------------------------------------- 17 | # Actions 18 | 19 | # - Extract words - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 20 | 21 | def words(): 22 | for tree in read_trees_conll(sys.stdin): 23 | forms = [tree.forms(i) for i in range(1, len(tree) + 1)] 24 | print(u' '.join(forms)) 25 | 26 | # - Count trees - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 27 | 28 | def wc(): 29 | num = 0 30 | for tree in read_trees_conll(sys.stdin): 31 | num += 1 32 | print(num) 33 | 34 | # - N'th tree - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 35 | 36 | def nth(num): 37 | trees_read = 0 38 | for i, tree in enumerate(read_trees_conll(sys.stdin)): 39 | if i + 1 == num: 40 | write_tree_conll(sys.stdout, tree) 41 | 42 | # - Head - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 43 | 44 | 
def tail(num):
    """
    Print the last 'num' trees read from stdin.

    All of stdin is read, but at most 'num' trees are kept in memory at a
    time. With num == 0 nothing is printed.

    num: number of trailing trees to print; collections.deque raises
         ValueError for a negative value
    """
    # A bounded deque drops its oldest element by itself when full, so no
    # manual popleft() is needed. The manual one raised IndexError on the
    # first tree when num == 0.
    last_trees = collections.deque(read_trees_conll(sys.stdin), maxlen=num)

    for tree in last_trees:
        write_tree_conll(sys.stdout, tree)
def _grep_html(pattern, limit, fields, file):
    """
    Read trees from stdin, and print those who match the pattern as HTML,
    matched nodes highlighted.

    Two warnings may be written to stderr, each at most once:
    - _LIMIT_MSG, when a matching tree is dropped because 'limit' trees
      have already been printed;
    - _HL_LIMIT_MSG, when the first tree drawn without hover highlighting
      (tree number _HL_LIMIT, 0-based) is about to be printed.

    pattern: pattern to match against
    limit: maximal number of trees to print
    fields: CoNLL fields to print in trees
    file: file to write HTML to
    """
    # Parse pattern.
    pattern = parse_pattern(pattern)
    write_prologue_html(file)
    printed = 0
    limit_reported = False

    for tree in read_trees_conll(sys.stdin):
        # Past the limit nothing more is drawn. Keep consuming stdin (as
        # before) but skip the matching work.
        if limit_reported:
            continue

        # Match.
        matches = [
            node for node in range(1, len(tree) + 1)
            if pattern.match(tree, node, {})
        ]
        if not matches:
            continue

        # Respect the limits. Both checks run only for matching trees, so
        # a warning is emitted only when a tree is actually affected, and
        # never repeated for non-matching trees in between.
        if printed >= limit:
            print(_LIMIT_MSG % printed, file=sys.stderr)
            limit_reported = True
            continue
        if printed == _HL_LIMIT:
            print(_HL_LIMIT_MSG % printed, file=sys.stderr)

        # Draw.
        static = printed >= _HL_LIMIT
        write_tree_html(file, tree, fields, matches, static)
        printed += 1

    write_epilogue_html(file)
223 | for tree in read_trees_conll(sys.stdin): 224 | tree = run_tree_scripts(tree, scripts) 225 | write_tree_conll(sys.stdout, tree) 226 | 227 | # - Gdb - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 228 | 229 | _GDB_STYLES = """ 230 | 257 | 258 | """ 259 | 260 | def _gdb_tree(scripts, scripts_text, tree, fields, file): 261 | """ 262 | Debug-print a single tree transformation: visualize step-by-step script 263 | application as HTML and write to file. 264 | 265 | Don't write HTML prologue or epilogue. 266 | 267 | scripts: list of TreeScript 268 | scripts_text: text for 'scripts' 269 | tree: tree to apply scripts to 270 | fields: CoNLL fields to print in trees 271 | file: file to write HTML to 272 | """ 273 | 274 | # Original tree. 275 | file.write(u'

Original tree

\n') 276 | write_tree_html(file, tree, fields) 277 | 278 | # Whole script. 279 | file.write(u'

Whole script

\n') 280 | file.write(u'
%s
\n' % cgi.escape(scripts_text)) 281 | 282 | # Construct a tree state. 283 | backrefs_map = {} 284 | state = TreeState(tree, backrefs_map) 285 | exc = None 286 | 287 | # Apply scripts and log everything we can. 288 | for script_no, script in enumerate(scripts): 289 | # Reset the state 290 | state.unmark_all() 291 | for node in range(0, len(state.tree) + 1): 292 | state.mark(node) 293 | 294 | while True: 295 | backrefs_map.clear() 296 | 297 | # Find matching node. 298 | node = 0 299 | while node <= len(state.tree): 300 | if state.marked(node): 301 | if script.pattern.match(state.tree, node, backrefs_map): 302 | break 303 | node += 1 304 | 305 | # If no matching node, move on to the next script. 306 | if node == len(state.tree) + 1: 307 | break 308 | 309 | # Report the match. 310 | start, end, line, col = script.pos 311 | file.write(u'

Matched rule #%i

' % (script_no + 1)) 312 | file.write(u'
(at line %i, col %i)
\n' % 313 | (line, col)) 314 | file.write(u'
%s
\n' % cgi.escape(script.text)) 315 | file.write(u'

Match

\n') 316 | write_tree_html(file, tree, fields, [node]) 317 | 318 | # Apply all actions and print tree after each step. 319 | state.unmark(node) 320 | for action_no, action in enumerate(script.actions): 321 | start, end, line, col = action.pos 322 | 323 | # Try to apply the action. 324 | try: 325 | action.apply(state) 326 | except TreeActionError as e: 327 | exc = e 328 | 329 | # If not succeeded, print the exception. 330 | if exc: 331 | file.write( 332 | u'

Error action #%i

\n' % 333 | (action_no + 1)) 334 | file.write( 335 | u'
(at line %i, col %i)
\n' % 336 | (line, col)) 337 | file.write(u'
%s
\n' % cgi.escape(action.text)) 338 | file.write('
') 339 | file.write( 340 | u'
%s
\n' % 341 | cgi.escape(exc.msg)) 342 | break # Action loop. 343 | 344 | # Otherwise, print the tree. 345 | else: 346 | file.write( 347 | u'

After action #%i

\n' % 348 | (action_no + 1)) 349 | file.write( 350 | u'
(at line %i, col %i)
\n' % 351 | (line, col)) 352 | file.write(u'
%s
\n' % cgi.escape(action.text)) 353 | write_tree_html(file, tree, fields) 354 | 355 | if exc: 356 | break # Node loop. 357 | 358 | if exc: 359 | break # Script loop. 360 | 361 | # Final tree. 362 | if not exc: 363 | file.write(u'

Final tree

def _gdb(scripts_filename, fields, file):
    """
    Debug-print the first tree from stdin: write to 'file' an HTML page
    showing how the scripts are applied to it step by step.

    Only the first tree is used. If stdin yields no trees, nothing at all
    is written to 'file'.

    scripts_filename: path to scripts
    fields: CoNLL fields to print in trees
    file: file to write HTML to
    """

    # Load and parse the scripts before touching stdin.
    with open(scripts_filename, 'rt') as scripts_file:
        raw_text = scripts_file.read()
    scripts_text = raw_text.decode('utf-8')
    scripts = parse_scripts(scripts_text)

    # Fetch the single tree to debug.
    trees = iter(read_trees_conll(sys.stdin))
    try:
        first_tree = next(trees)
    except StopIteration:
        return

    # Emit the page.
    write_prologue_html(file)
    file.write(_GDB_STYLES)
    _gdb_tree(scripts, scripts_text, first_tree, fields, file)
    write_epilogue_html(file)
if __name__ == '__main__':
    # Command-line entry point: one argparse sub-command per action defined
    # above. The chosen sub-command name ends up in 'args.cmd'.
    parser = argparse.ArgumentParser('python -mdep_tregex')
    subparsers = parser.add_subparsers(dest='cmd')

    def _add_html_arguments(p, limit=True):
        """
        Add the options shared by HTML-producing sub-commands to parser 'p'.

        limit: if True, also add the '--limit' option.
        """
        if limit:
            p.add_argument('--limit', help='draw only first N trees', type=int,
                           metavar='N', default=10)
        p.add_argument('--lemma', help='include LEMMA field',
                       action='store_true')
        p.add_argument('--cpostag', help='include CPOSTAG field',
                       action='store_true')
        p.add_argument('--postag', help='include POSTAG field',
                       action='store_true')
        p.add_argument('--feats', help='include FEATS field',
                       action='store_true')
        p.add_argument('--print', help="don't open in browser, print to stdout",
                       action='store_true')
        p.add_argument('--reuse-tab', help='reuse already opened browser tabs',
                       action='store_true')

    def _fields_from_args(args):
        """
        Return the names of the CoNLL fields requested via flags, in the
        fixed order lemma, cpostag, postag, feats.
        """
        names = ('lemma', 'cpostag', 'postag', 'feats')
        return [name for name in names if getattr(args, name)]

    # Words
    words_p = subparsers.add_parser('words', help='extract words from tree')

    # Wc.
    wc_p = subparsers.add_parser('wc', help='count trees')

    # Nth
    nth_p = subparsers.add_parser('nth', help='print only Nth tree')
    nth_p.add_argument('N', help="print N'th tree (1-based)", type=int)

    # Head
    head_p = subparsers.add_parser('head', help='print only first N trees')
    head_p.add_argument('N', help='print first N trees (1-based)', type=int)

    # Tail. N is kept as a string so that a leading '+' can be detected.
    tail_p = subparsers.add_parser('tail', help='print only last N trees')
    tail_p.add_argument('N', help='print last N trees (1-based)', type=str)

    # Shuffle.
    shuf_p = subparsers.add_parser('shuf', help='shuffle trees')

    # Grep.
    grep_p = subparsers.add_parser('grep', help='filter trees by pattern')
    grep_p.add_argument('PATTERN', help='dep-tregex pattern')
    grep_p.add_argument('--html', help='view matches in browser',
                        action='store_true')
    _add_html_arguments(grep_p)

    # Sed.
    sed_p = subparsers.add_parser('sed', help='apply tree scripts to trees')
    sed_p.add_argument('FILE', help='scripts file')

    # Html
    html_p = subparsers.add_parser('html', help='view trees in browser')
    _add_html_arguments(html_p)

    # Gdb.
    gdb_p = subparsers.add_parser('gdb', help='view step-by-step invocation')
    gdb_p.add_argument('FILE', help='scripts file')
    _add_html_arguments(gdb_p, limit=False)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # Print help when no arguments.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    if args.cmd == 'words':
        words()

    elif args.cmd == 'wc':
        wc()

    elif args.cmd == 'nth':
        if args.N <= 0:
            nth_p.error('N has to be positive')
        nth(args.N)

    elif args.cmd == 'head':
        if args.N <= 0:
            head_p.error('N has to be positive')
        head(args.N)

    elif args.cmd == 'tail':
        try:
            n = int(args.N)
        except ValueError:
            tail_p.error('invalid integer N: %r' % args.N)
        if n <= 0:
            tail_p.error('N has to be positive')

        # 'tail N' prints the last N trees; 'tail +N' prints trees N, N+1,
        # etc.
        if args.N[0] != '+':
            tail(n)
        else:
            not_head(n)

    elif args.cmd == 'shuf':
        shuf()

    elif args.cmd == 'grep':
        # The limit only matters for HTML output. Validate it there, as the
        # 'html' command does, instead of silently drawing nothing.
        if args.html and args.limit <= 0:
            grep_p.error('--limit has to be positive')
        fields = _fields_from_args(args)
        new = not args.reuse_tab
        grep(args.PATTERN, args.html, args.limit, fields, not args.print, new)

    elif args.cmd == 'sed':
        sed(args.FILE)

    elif args.cmd == 'html':
        if args.limit <= 0:
            html_p.error('--limit has to be positive')
        fields = _fields_from_args(args)
        html(args.limit, fields, not args.print, not args.reuse_tab)

    elif args.cmd == 'gdb':
        fields = _fields_from_args(args)
        gdb(args.FILE, fields, not args.print, not args.reuse_tab)
def _first_marked_match(state, pattern, backrefs_map):
    """
    Return the lowest-numbered marked node (0 included) of 'state.tree' on
    which 'pattern' matches, or None if there is no such node.
    """
    candidate = 0
    while candidate <= len(state.tree):
        if state.marked(candidate) and \
                pattern.match(state.tree, candidate, backrefs_map):
            return candidate
        candidate += 1
    return None

def run_tree_scripts(tree, scripts):
    """
    Run every script over a copy of the tree and return the resulting tree.

    - Scripts run one after another; each script may fire several times
      before the next one starts.
    - When a script starts, all nodes present at that moment (node 0
      included) are marked as its "original" nodes.
    - The script repeatedly picks the first marked node its pattern matches,
      unmarks that node and applies all of its actions, so it fires at most
      once per "original" node.
    - The script is finished once no marked node matches.

    tree: tree to transform; the scripts work on copy.copy(tree)
    scripts: list of TreeScript
    """
    backrefs_map = {}
    state = TreeState(copy.copy(tree), backrefs_map)

    for script in scripts:
        # Mark every node that exists right now.
        state.unmark_all()
        for original in range(0, len(state.tree) + 1):
            state.mark(original)

        while True:
            # Backreferences never carry over from one match to the next.
            backrefs_map.clear()

            matched = _first_marked_match(state, script.pattern, backrefs_map)
            if matched is None:
                # Nothing left for this script; go to the next one.
                break

            # Each node triggers the script once.
            state.unmark(matched)
            for action in script.actions:
                action.apply(state)

    return state.tree
145 | start, end = t.lexer.lexmatch.span(0) 146 | line = t.lexer.lineno 147 | last_newline = t.lexer.lexdata.rfind(u'\n', 0, t.lexpos) 148 | col = (t.lexpos - last_newline) 149 | 150 | # Embed position into value. 151 | t.value = (t.value, (start, end, line, col)) 152 | 153 | def t_ID(t): 154 | r'[_a-zA-Z][_a-zA-Z0-9]*' 155 | t.type = cls.KEYWORDS.get(t.value, 'ID') 156 | track(t) 157 | return t 158 | 159 | def t_STRING(t): 160 | r'"[^"]*"|' "'[^']*'" 161 | t.value = t.value[1:-1] 162 | track(t) 163 | return t 164 | 165 | def t_REGEX(t): 166 | r'/[^/]*/[ig]*' 167 | ignore_case = False 168 | anywhere = False 169 | while t.value[-1] in 'ig': 170 | if t.value[-1] == 'i': 171 | ignore_case = True 172 | if t.value[-1] == 'g': 173 | anywhere = True 174 | t.value = t.value[:-1] 175 | t.value = (t.value[1:-1], ignore_case, anywhere) 176 | track(t) 177 | return t 178 | 179 | def t_EQUALS(t): 180 | r'==' 181 | track(t) 182 | return t 183 | 184 | def t_BINARY_OP(t): 185 | track(t) 186 | return t 187 | binary_ops = sorted(cls.BINARY_OPS.keys(), key=len, reverse=True) 188 | t_BINARY_OP.__doc__ = '|'.join(map(re.escape, binary_ops)) 189 | 190 | def t_COMMAND_SEP(t): 191 | r'::' 192 | track(t) 193 | return t 194 | 195 | def t_LPAREN(t): 196 | r'\(' 197 | track(t) 198 | return t 199 | 200 | def t_RPAREN(t): 201 | r'\)' 202 | track(t) 203 | return t 204 | 205 | def t_LBRACE(t): 206 | r'\{' 207 | track(t) 208 | return t 209 | 210 | def t_RBRACE(t): 211 | r'\}' 212 | track(t) 213 | return t 214 | 215 | def t_SEMICOLON(t): 216 | r';' 217 | track(t) 218 | return t 219 | 220 | t_ignore_COMMENT = r'\#.*' 221 | 222 | def t_newline(t): 223 | r'\n+' 224 | t.lexer.lineno += len(t.value) 225 | 226 | def t_error(t): 227 | line = t.lexer.lineno 228 | last_newline = t.lexer.lexdata.rfind(u'\n', 0, t.lexpos) 229 | col = (t.lexpos - last_newline) 230 | c = t.value[0:1] 231 | msg = '(at line %i, col %i) invalid character %r' % (line, col, c) 232 | raise LexerError(msg) 233 | 234 | return 
ply.lex.lex() 235 | 236 | @classmethod 237 | def make_parser(cls, start): 238 | tokens = cls.TOKENS 239 | 240 | def untrack(p): 241 | s, pos = [None], [None] 242 | for i in range(1, len(p)): 243 | s.append(p[i][0]) 244 | pos.append(p[i][1]) 245 | 246 | known_pos = filter(bool, pos) 247 | if not known_pos: 248 | p0_pos = None 249 | else: 250 | start_0, end_0, line_0, col_0 = known_pos[0] 251 | start_n, end_n, line_n, col_n = known_pos[-1] 252 | p0_pos = (start_0, end_n, line_0, col_0) 253 | pos[0] = p0_pos 254 | 255 | return s, pos 256 | 257 | def track(p, pos): 258 | p[0] = (p[0], pos[0]) 259 | 260 | def p_error(p): 261 | if p: 262 | start, end, line, col = p.value[1] 263 | val = p.value[0] 264 | msg = '(at line %i, col %i) unexpected token %r' % \ 265 | (line, col, val) 266 | else: 267 | msg = 'unexpected end of file' 268 | raise ParserError(msg) 269 | 270 | def p_tree_scripts(p): 271 | """ 272 | tree_scripts : 273 | | tree_script tree_scripts 274 | """ 275 | s, pos = untrack(p) 276 | if len(p) == 1: 277 | p[0] = [] 278 | else: 279 | p[0] = [s[1]] + s[2] 280 | track(p, pos) 281 | 282 | def p_tree_pattern(p): 283 | """ 284 | tree_pattern : ID 285 | | ID condition 286 | | LPAREN tree_pattern RPAREN 287 | """ 288 | s, pos = untrack(p) 289 | if len(p) == 2: 290 | p[0] = SetBackref(s[1], NotRoot(AlwaysTrue())) 291 | elif len(p) == 3: 292 | p[0] = SetBackref(s[1], NotRoot(s[2])) 293 | elif len(p) == 4: 294 | p[0] = s[2] 295 | p[0].pos = pos[0] 296 | track(p, pos) 297 | 298 | def p_tree_script(p): 299 | """ 300 | tree_script : LBRACE tree_pattern COMMAND_SEP actions RBRACE 301 | """ 302 | s, pos = untrack(p) 303 | p[0] = TreeScript(s[2], s[4]) 304 | p[0].pos = pos[0] 305 | track(p, pos) 306 | 307 | def p_actions(p): 308 | """ 309 | actions : 310 | | action SEMICOLON actions 311 | """ 312 | s, pos = untrack(p) 313 | if len(p) == 1: 314 | p[0] = [] 315 | else: 316 | s[1].pos = pos[1] 317 | p[0] = [s[1]] + s[3] 318 | track(p, pos) 319 | 320 | def p_condition(p): 321 | """ 
322 | condition : condition_or 323 | """ 324 | s, pos = untrack(p) 325 | p[0] = s[1] 326 | track(p, pos) 327 | 328 | def p_condition_or(p): 329 | """ 330 | condition_or : condition_and or_conditions 331 | """ 332 | s, pos = untrack(p) 333 | condition_and = s[1] 334 | or_conditions = s[2] 335 | 336 | if not or_conditions: 337 | p[0] = condition_and 338 | else: 339 | p[0] = Or([condition_and] + or_conditions) 340 | track(p, pos) 341 | 342 | def p_or_conditions(p): 343 | """ 344 | or_conditions : 345 | | OR condition_and or_conditions 346 | """ 347 | s, pos = untrack(p) 348 | if len(p) == 1: 349 | p[0] = [] 350 | else: 351 | p[0] = [s[2]] + s[3] 352 | track(p, pos) 353 | 354 | def p_condition_and(p): 355 | """ 356 | condition_and : condition_not and_conditions 357 | """ 358 | s, pos = untrack(p) 359 | condition_not = s[1] 360 | and_conditions = s[2] 361 | 362 | if not and_conditions: 363 | p[0] = condition_not 364 | else: 365 | p[0] = And([condition_not] + and_conditions) 366 | track(p, pos) 367 | 368 | def p_and_conditions(p): 369 | """ 370 | and_conditions : 371 | | AND condition_not and_conditions 372 | """ 373 | s, pos = untrack(p) 374 | if len(p) == 1: 375 | p[0] = [] 376 | else: 377 | p[0] = [s[2]] + s[3] 378 | track(p, pos) 379 | 380 | def p_condition_not(p): 381 | """ 382 | condition_not : condition_op 383 | | NOT condition_op 384 | """ 385 | s, pos = untrack(p) 386 | if len(p) == 2: 387 | p[0] = s[1] 388 | else: 389 | p[0] = Not(s[2]) 390 | track(p, pos) 391 | 392 | def p_condition_op_parens(p): 393 | """ 394 | condition_op : LPAREN condition RPAREN 395 | """ 396 | s, pos = untrack(p) 397 | p[0] = s[2] 398 | track(p, pos) 399 | 400 | def p_condition_op_binary(p): 401 | """ 402 | condition_op : BINARY_OP tree_pattern 403 | """ 404 | s, pos = untrack(p) 405 | p[0] = cls.BINARY_OPS[s[1]](s[2]) 406 | track(p, pos) 407 | 408 | def p_condition_op_equals(p): 409 | """ 410 | condition_op : EQUALS ID 411 | """ 412 | s, pos = untrack(p) 413 | p[0] = EqualsBackref(s[2]) 
414 | track(p, pos) 415 | 416 | def p_condition_op_attr(p): 417 | """ 418 | condition_op : attr string_condition 419 | """ 420 | s, pos = untrack(p) 421 | if s[1] == 'feats': 422 | p[0] = FeatsMatch(pred_fn=s[2]) 423 | else: 424 | p[0] = AttrMatches(attr=s[1], pred_fn=s[2]) 425 | track(p, pos) 426 | 427 | def p_condition_op_is_top(p): 428 | """ 429 | condition_op : IS_TOP 430 | """ 431 | s, pos = untrack(p) 432 | p[0] = IsTop() 433 | track(p, pos) 434 | 435 | def p_condition_op_is_leaf(p): 436 | """ 437 | condition_op : IS_LEAF 438 | """ 439 | s, pos = untrack(p) 440 | p[0] = IsLeaf() 441 | track(p, pos) 442 | 443 | def p_condition_op_can_head(p): 444 | """ 445 | condition_op : CAN_HEAD ID 446 | """ 447 | s, pos = untrack(p) 448 | p[0] = CanHead(s[2]) 449 | track(p, pos) 450 | 451 | def p_condition_op_can_be_headed_by(p): 452 | """ 453 | condition_op : CAN_BE_HEADED_BY ID 454 | """ 455 | s, pos = untrack(p) 456 | p[0] = CanBeHeadedBy(s[2]) 457 | track(p, pos) 458 | 459 | def p_action_copy_move(p): 460 | """ 461 | action : COPY selector ID where selector ID 462 | | MOVE selector ID where selector ID 463 | """ 464 | s, pos = untrack(p) 465 | kwargs = { 466 | 'what': s[3], 467 | 'sel_what': s[2], 468 | 'where': s[4], 469 | 'anchor': s[6], 470 | 'sel_anchor': s[5] 471 | } 472 | 473 | if s[1] == 'copy': 474 | p[0] = Copy(**kwargs) 475 | else: 476 | p[0] = Move(**kwargs) 477 | track(p, pos) 478 | 479 | def p_action_delete(p): 480 | """ 481 | action : DELETE selector ID 482 | """ 483 | s, pos = untrack(p) 484 | p[0] = Delete(what=s[3], sel_what=s[2]) 485 | track(p, pos) 486 | 487 | def p_action_set(p): 488 | """ 489 | action : SET attr ID STRING 490 | """ 491 | s, pos = untrack(p) 492 | if s[2] == 'feats': 493 | newval = s[4].split(u'|') 494 | else: 495 | newval = s[4] 496 | newval_fn = lambda x, newval=newval: newval 497 | p[0] = MutateAttr(s[3], '_' + s[2], newval_fn) 498 | track(p, pos) 499 | 500 | def p_action_set_head(p): 501 | """ 502 | action : SET_HEAD ID 
HEADED_BY ID 503 | | SET_HEAD ID HEADS ID 504 | | TRY_SET_HEAD ID HEADED_BY ID 505 | | TRY_SET_HEAD ID HEADS ID 506 | """ 507 | s, pos = untrack(p) 508 | raise_ = (s[1] == 'set_head') 509 | if s[3] == 'headed_by': 510 | node, head = s[2], s[4] 511 | else: 512 | node, head = s[4], s[2] 513 | p[0] = SetHead(node=node, head=head, raise_on_invalid_head=raise_) 514 | track(p, pos) 515 | 516 | def p_action_group(p): 517 | """ 518 | action : GROUP ID ID 519 | """ 520 | s, pos = untrack(p) 521 | p[0] = GroupTogether(s[2], s[3]) 522 | track(p, pos) 523 | 524 | def p_attr(p): 525 | """ 526 | attr : FORM 527 | | LEMMA 528 | | CPOSTAG 529 | | POSTAG 530 | | FEATS 531 | | DEPREL 532 | """ 533 | s, pos = untrack(p) 534 | p[0] = { 535 | 'form': 'forms', 536 | 'lemma': 'lemmas', 537 | 'cpostag': 'cpostags', 538 | 'postag': 'postags', 539 | 'feats': 'feats', 540 | 'deprel': 'deprels' 541 | }[s[1]] 542 | track(p, pos) 543 | 544 | def p_string_condition_str(p): 545 | """ 546 | string_condition : STRING 547 | """ 548 | s, pos = untrack(p) 549 | p[0] = lambda x, string=s[1]: x == string 550 | track(p, pos) 551 | 552 | def p_string_condition_regex(p): 553 | """ 554 | string_condition : REGEX 555 | """ 556 | s, pos = untrack(p) 557 | pattern, ignore_case, anywhere = s[1] 558 | r = compile_regex(pattern, ignore_case, anywhere) 559 | p[0] = lambda x, r=r: r.search(x) 560 | track(p, pos) 561 | 562 | def p_selector(p): 563 | """ 564 | selector : NODE 565 | | GROUP 566 | """ 567 | s, pos = untrack(p) 568 | if s[1] == 'node': 569 | p[0] = NODE 570 | else: 571 | p[0] = GROUP 572 | track(p, pos) 573 | 574 | def p_where(p): 575 | """ 576 | where : BEFORE 577 | | AFTER 578 | """ 579 | s, pos = untrack(p) 580 | if s[1] == 'before': 581 | p[0] = Tree.BEFORE 582 | else: 583 | p[0] = Tree.AFTER 584 | track(p, pos) 585 | 586 | return ply.yacc.yacc( 587 | debug=0, 588 | write_tables=0, 589 | errorlog=ply.yacc.NullLogger() 590 | ) 591 | 592 | def __init__(self, start): 593 | self.lexer = 
def parse_scripts(text):
    """
    Parse a text containing several tree scripts.
    Return list of TreeScript objects.

    Every returned script, its pattern and each of its actions gets a
    'text' attribute: the slice of 'text' covered by the object's 'pos'.
    """

    # Compile parser on-demand.
    global _TREE_SCRIPT_PARSER
    if _TREE_SCRIPT_PARSER is None:
        _TREE_SCRIPT_PARSER = _TreeScriptParser(start='tree_scripts')

    def attach_text(obj):
        # 'pos' is (start, end, line, col); only the offsets are needed.
        start, end, _line, _col = obj.pos
        obj.text = text[start:end]

    # Parse.
    scripts = _TREE_SCRIPT_PARSER.parse(text)

    # Augment scripts, patterns and actions with their source text.
    for script in scripts:
        attach_text(script)
        attach_text(script.pattern)
        for action in script.actions:
            attach_text(action)

    return scripts