├── changes ├── .gitkeep ├── 114.feature.rst ├── 110.feature.rst ├── 116.feature.rst └── 118.feature.rst ├── rita ├── engine │ ├── __init__.py │ ├── translate_rust.py │ ├── translate_spacy.py │ └── translate_standalone.py ├── modules │ ├── __init__.py │ ├── regex.py │ ├── orth.py │ ├── tag.py │ ├── fuzzy.py │ ├── pluralize.py │ └── names.py ├── types.py ├── run.py ├── precompile.py ├── __init__.py ├── lexer.py ├── shortcuts.py ├── config.py ├── macros.py ├── utils.py ├── parser.py └── preprocess.py ├── .github ├── FUNDING.yml └── workflows │ ├── github-actions-deployment.yaml │ └── github-actions-main.yaml ├── examples ├── simple-import.rita ├── cyclical-import.rita ├── match-with-escaped-string.rita ├── excluding-word.rita ├── fuzzy-matching.rita ├── cars.txt ├── cheap-phones.rita ├── complex-number.rita ├── dress-match.rita └── color-car.rita ├── docs ├── assets │ ├── logo-1.png │ ├── logo-2.png │ ├── jetbrains.png │ ├── logo-100px.png │ └── jetbrains.svg ├── integration.md ├── index.md ├── extend.md ├── advanced.md ├── config.md ├── macros.md ├── engines.md ├── syntax.md ├── modules.md └── quickstart.md ├── Makefile ├── setup.cfg ├── .coveragerc ├── tests ├── test_utils.py ├── test_precompile.py ├── test_config.py ├── utils.py ├── test_run.py ├── test_lexer.py ├── test_parser.py ├── test_examples.py └── test_rules.py ├── mypy.ini ├── mkdocs.yml ├── changes_template.md ├── tox.ini ├── LICENSE ├── .gitignore ├── extra └── sublimetext │ └── RITA.sublime-syntax ├── pyproject.toml ├── README.md ├── CHANGELOG.md └── poetry.lock /changes/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rita/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rita/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [zaibacu] 2 | -------------------------------------------------------------------------------- /examples/simple-import.rita: -------------------------------------------------------------------------------- 1 | @import "examples/simple-match.rita" -------------------------------------------------------------------------------- /examples/cyclical-import.rita: -------------------------------------------------------------------------------- 1 | @import "examples/cyclical-import.rita" -------------------------------------------------------------------------------- /changes/114.feature.rst: -------------------------------------------------------------------------------- 1 | Add spaCy wildcard instead of REGEX when using ANY 2 | -------------------------------------------------------------------------------- /examples/match-with-escaped-string.rita: -------------------------------------------------------------------------------- 1 | {WORD("5\""), WORD("Phone")} -> MARK("PHONE") -------------------------------------------------------------------------------- /docs/assets/logo-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/logo-1.png 
-------------------------------------------------------------------------------- /docs/assets/logo-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/logo-2.png -------------------------------------------------------------------------------- /examples/excluding-word.rita: -------------------------------------------------------------------------------- 1 | {WORD("Weather"), WORD("is"), WORD("cold")!}->MARK("GOOD_WEATHER") -------------------------------------------------------------------------------- /docs/assets/jetbrains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/jetbrains.png -------------------------------------------------------------------------------- /examples/fuzzy-matching.rita: -------------------------------------------------------------------------------- 1 | !IMPORT("rita.modules.fuzzy") 2 | 3 | FUZZY("squirrel") -> MARK("CRITTER") -------------------------------------------------------------------------------- /docs/assets/logo-100px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/logo-100px.png -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON=python 2 | 3 | benchmark: 4 | ${PYTHON} -m pytest --benchmark-only tests/ --benchmark-autosave 5 | -------------------------------------------------------------------------------- /changes/110.feature.rst: -------------------------------------------------------------------------------- 1 | Type Hints for core to improve robustness. Extra CI step to check for errors is added as well 2 | -------------------------------------------------------------------------------- /changes/116.feature.rst: -------------------------------------------------------------------------------- 1 | Add "+" operator by default when building spaCy `ENTITY(...)` to make it easier to read and understand. 
-------------------------------------------------------------------------------- /examples/cars.txt: -------------------------------------------------------------------------------- 1 | BMW 2 | Audi 3 | VW 4 | Toyota 5 | Mazda 6 | Opel 7 | Ford 8 | Alfa Romeo 9 | Peugeot 10 | Fiat 11 | Nissan 12 | Subaru 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [tool:pytest] 5 | addopts = --benchmark-skip 6 | 7 | [flake8] 8 | max-line-length = 160 9 | 10 | [aliases] 11 | test=pytest -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | rita 5 | 6 | omit = rita/engine/translate_rust.py 7 | 8 | [report] 9 | show_missing = True 10 | omit = rita/engine/translate_rust.py 11 | -------------------------------------------------------------------------------- /examples/cheap-phones.rita: -------------------------------------------------------------------------------- 1 | inexpensive = {"secondary", "inexpensive", "cheap"} 2 | 3 | {IN_LIST(inexpensive), WORD("cell")?, WORD("phone")}->MARK("CHEAP_PHONE") 4 | {WORD("good"), WORD("value")}->MARK("CHEAP_PHONE") -------------------------------------------------------------------------------- /examples/complex-number.rita: -------------------------------------------------------------------------------- 1 | Complex_Number = { NUM+, WORD("/")?, NUM? } 2 | {PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH") 3 | {PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT") -------------------------------------------------------------------------------- /rita/types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Mapping, Tuple, List, AnyStr 2 | 3 | opts = Mapping[Any, Any] 4 | RuleData = Tuple[AnyStr, List[Any], AnyStr] 5 | Patterns = List[RuleData] 6 | RuleGroup = Tuple[AnyStr, Patterns] 7 | Rules = List[RuleGroup] 8 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from rita.utils import deaccent 2 | 3 | 4 | class TestDeaccent(object): 5 | def test_lithuanian(self): 6 | assert deaccent("Šarūnas") == "Sarunas" 7 | assert deaccent("Kęstutis") == "Kestutis" 8 | assert deaccent("Ąžuolas") == "Azuolas" 9 | -------------------------------------------------------------------------------- /changes/118.feature.rst: -------------------------------------------------------------------------------- 1 | Use "IN" operator when defining ARRAYS in spaCy 2 | 3 | Also, from now on, we can define arrays directly inside a macro: 4 | ``` 5 | IN_LIST("one", "two", "three") 6 | ``` 7 | 8 | Which is equal to: 9 | ``` 10 | numbers = {"one", "two", "three"} 11 | IN_LIST(numbers) 12 | ``` -------------------------------------------------------------------------------- /rita/modules/regex.py: -------------------------------------------------------------------------------- 1 | from rita.utils import ExtendedOp 2 | 3 | 4 | def REGEX(regex_pattern, config, op=None): 5 | """ 6 | Matches words based on a Regex pattern 7 | e.g.
all words that start with an 'a' would be 8 | REGEX("^a") 9 | """ 10 | new_op = ExtendedOp(op) 11 | new_op.local_regex_override = True 12 | return "regex", regex_pattern, new_op 13 | -------------------------------------------------------------------------------- /rita/modules/orth.py: -------------------------------------------------------------------------------- 1 | from rita.utils import ExtendedOp 2 | 3 | 4 | def ORTH(value, config, op=None): 5 | """ 6 | Ignores the case-insensitive configuration and checks words as written, 7 | i.e. case-sensitively, even if the configuration is case-insensitive 8 | """ 9 | new_op = ExtendedOp(op) 10 | new_op.case_sensitive_override = True 11 | return "orth", value, new_op 12 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | exclude=parsetab.py 4 | 5 | [mypy-inflect] 6 | ignore_missing_imports = True 7 | 8 | [mypy-spacy] 9 | ignore_missing_imports = True 10 | 11 | [mypy-spacy.pipeline] 12 | ignore_missing_imports = True 13 | 14 | [mypy-ply] 15 | ignore_missing_imports = True 16 | 17 | [mypy-ply.yacc] 18 | ignore_missing_imports = True 19 | 20 | [mypy-ply.lex] 21 | ignore_missing_imports = True 22 | -------------------------------------------------------------------------------- /examples/dress-match.rita: -------------------------------------------------------------------------------- 1 | cuts = {"fitted", "wide-cut"} 2 | lengths = {"short", "long", "calf-length", "knee-length"} 3 | fabric_types = {"soft", "airy", "crinkled"} 4 | fabrics = {"velour", "chiffon", "knit", "woven", "stretch"} 5 | 6 | {IN_LIST(cuts)?, IN_LIST(lengths), WORD("dress")}->MARK("DRESS_TYPE") 7 | {IN_LIST(lengths), IN_LIST(cuts), WORD("dress")}->MARK("DRESS_TYPE") 8 | {IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK("DRESS_FABRIC") -------------------------------------------------------------------------------- /examples/color-car.rita: -------------------------------------------------------------------------------- 1 | cars = LOAD("examples/cars.txt") # Load items from file 2 | colors = {"red", "green", "blue", "white", "black"} # Declare items inline 3 | 4 | {IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR") # If the first token is in the list `colors` and the second one is the word `car`, label it 5 | 6 | {IN_LIST(cars), WORD+} -> MARK("CAR_MODEL") # If the first token is in the list `cars` and is followed by 1..N words, label it 7 | 8 | {ENTITY("PERSON"), LEMMA("like"), WORD} -> MARK("LIKED_ACTION") # If the first token is a Person, followed by any word whose lemma is `like`, label it -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: RITA DSL 2 | site_author: Šarūnas Navickas 3 | site_description: A DSL which allows writing rules for doing NLP 4 | repo_url: https://github.com/zaibacu/rita-dsl 5 | nav: 6 | - Home: index.md 7 | - Quickstart: quickstart.md 8 | - Syntax: syntax.md 9 | - Macros: macros.md 10 | - Engines: engines.md 11 | - Modules: modules.md 12 | - Extending: extend.md 13 | - Config: config.md 14 | - Advanced: advanced.md 15 | - Integrating into IDEs: integration.md 16 | theme: readthedocs 17 | markdown_extensions: 18 | - toc: 19 | permalink: True 20 | plugins: 21 | - search 22 | -------------------------------------------------------------------------------- /docs/integration.md:
-------------------------------------------------------------------------------- 1 | # Integration 2 | 3 | This section is dedicated to providing links to plugins which make life easier when using the RITA language. If you have created one, feel free to add it to the list 4 | 5 | ## IDEA (IntelliJ, PyCharm and others) 6 | 7 | [Rita-Language](https://plugins.jetbrains.com/plugin/15011-rita-language) - a simple syntax markup plugin 8 | 9 | ## SublimeText3 10 | 11 | The plugin can be found in `extra/sublimetext/RITA.sublime-syntax` - a simple syntax markup plugin. It can be installed by copying it to the `Packages/User/` directory of SublimeText (for macOS it's: `cd ~/Library/Application\ Support/Sublime\ Text\ 3/Packages/User`) -------------------------------------------------------------------------------- /changes_template.md: -------------------------------------------------------------------------------- 1 | {% for section, _ in sections.items() %} 2 | {% set underline = underlines[0] %} 3 | {% if section %} 4 | {{section}} 5 | {{ underline * section|length }} 6 | {% set underline = underlines[1] %} 7 | {% endif %} 8 | 9 | {% if sections[section] %} 10 | {% for category, val in definitions.items() if category in sections[section]%} 11 | {{ definitions[category]['name'] }} 12 | {{ underline * definitions[category]['name']|length }} 13 | 14 | {% for text, values in sections[section][category].items() %} 15 | - {{ text }} 16 | {{ values|join(',\n ') }} 17 | {% endfor %} 18 | 19 | {% endfor %} 20 | {% else %} 21 | No significant changes. 22 | 23 | {% endif %} 24 | {% endfor %} 25 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ![Rita Logo](assets/logo-2.png) 2 | # RITA DSL 3 | 4 | This is a language, loosely based on the [Apache UIMA RUTA](https://uima.apache.org/ruta.html) language, focused on writing manual language rules, which compiles into [spaCy](https://github.com/explosion/spaCy)-compatible patterns. These patterns can be used for doing [manual NER](https://spacy.io/api/entityruler) as well as in other processes, like retokenizing and pure matching 5 | 6 | - [Live Demo](https://rita-dsl.io/#demo) 7 | - [Simple Chat bot example](https://repl.it/talk/share/Simple-chatbot-done-with-Rita/53471) 8 | - [Documentation](http://rita-dsl.readthedocs.io/) 9 | - [QuickStart](https://rita-dsl.readthedocs.io/en/latest/quickstart/) 10 | - [Language Syntax Plugin for IntelliJ-based IDEs](https://plugins.jetbrains.com/plugin/15011-rita-language) 11 | -------------------------------------------------------------------------------- /docs/extend.md: -------------------------------------------------------------------------------- 1 | # Extending 2 | 3 | Custom modules can be loaded via `!IMPORT()` 4 | 5 | Example of a basic fuzzy matcher: 6 | 7 | ``` 8 | !IMPORT("rita.modules.fuzzy") 9 | 10 | FUZZY("squirrel") -> MARK("CRITTER") 11 | ``` 12 | 13 | The code can be seen in: [fuzzy.py](https://github.com/zaibacu/rita-dsl/blob/master/rita/modules/fuzzy.py) 14 | 15 | After the import is done, custom macros defined in the imported module can be executed, as shown in the sketch below. 16 | 17 | ## Interface for a custom Macro 18 | 19 | Each macro must have at least two arguments 20 | 21 | - `op` - custom handling of the `?`, `*` and `+` operators. If it has no use, the argument can be defined as `def (*args, op=None)` and simply ignored inside the code 22 | 23 | - `context` - context is either a `dict` or a `list` used to store results 24 | 25 | All other arguments should be defined at the start
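As a concrete reference, below is a minimal sketch of what a custom module could look like, modeled on the shape of [fuzzy.py](https://github.com/zaibacu/rita-dsl/blob/master/rita/modules/fuzzy.py); the module name `mymacros.py` and the `UPPERCASED` macro are made up for illustration:

```python
# mymacros.py - a made-up module; load it with !IMPORT("mymacros")
from rita.macros import resolve_value
from rita.utils import ExtendedOp


def UPPERCASED(name, config, op=None):
    """
    Matches the given word and its upper-cased variant,
    e.g. UPPERCASED("nasa") covers both "nasa" and "NASA"
    """
    initial = resolve_value(name, config=config)
    # "any_of" with a list of variants is the same result shape
    # that rita.modules.fuzzy.FUZZY returns
    return "any_of", [initial, initial.upper()], ExtendedOp(op)
```

With that file on the Python path, `!IMPORT("mymacros")` followed by `UPPERCASED("nasa") -> MARK("AGENCY")` should work like any built-in macro.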
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38, py39, spacy3, changelog 3 | isolated_build = True 4 | 5 | [testenv] 6 | deps = 7 | codecov 8 | ply: ply==3.11 9 | spacy >= 3.0.0 10 | inflect 11 | pytest 12 | pytest-mock 13 | pytest-benchmark 14 | commands = 15 | python -m spacy download en 16 | python -m pytest tests 17 | codecov --token="{env:CODECOV_TOKEN}" 18 | 19 | [testenv:changelog] 20 | basepython = python3.9 21 | skip_install = true 22 | deps = 23 | towncrier 24 | commands = towncrier --draft 25 | 26 | [testenv:lint] 27 | basepython = python3.9 28 | skip_install = true 29 | deps = flake8 30 | commands = 31 | flake8 rita/ --exclude=rita/parsetab.py 32 | flake8 tests/ 33 | 34 | [testenv:mypy] 35 | basepython = python3.9 36 | skip_install = true 37 | deps = mypy 38 | commands = 39 | mypy rita/ 40 | -------------------------------------------------------------------------------- /docs/advanced.md: -------------------------------------------------------------------------------- 1 | # Importing other rule files 2 | 3 | When the corpus of rules becomes too large, it is possible to split it into multiple files. 4 | It can be done simply like this: 5 | 6 | ``` 7 | @import "<path>" 8 | ``` 9 | 10 | Eg.: 11 | ``` 12 | @import "examples/simple-match.rita" 13 | ``` 14 | 15 | # Reusing patterns 16 | 17 | You can define (since version 0.5.0+) a pattern as a variable: 18 | 19 | ``` 20 | ComplexNumber = {NUM+, WORD("/")?, NUM?} 21 | 22 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT") 23 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH") 24 | ``` 25 | 26 | # Alias 27 | 28 | You can alias frequently used macros to make their names shorter: 29 | 30 | ``` 31 | numbers = {"one", "two", "three"} 32 | @alias IN_LIST IL 33 | 34 | IL(numbers) -> MARK("NUMBER") 35 | ``` 36 | 37 | Now using "IL" will actually call the "IN_LIST" macro.
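Alias expansion happens in the precompile step, before parsing, so it can be inspected directly; this sketch mirrors the alias test in `tests/test_precompile.py`:

```python
from rita.precompile import precompile

rules = """
numbers = {"one", "two", "three"}
@alias IN_LIST IL
@alias MARK M

IL(numbers)->M("HELLO")
"""

# Prints the rules with the @alias lines removed and
# IN_LIST(...) / MARK(...) restored in place of IL / M
print(precompile(rules.strip()))
```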
-------------------------------------------------------------------------------- /rita/modules/tag.py: -------------------------------------------------------------------------------- 1 | from rita.utils import ExtendedOp 2 | 3 | 4 | def TAG(tag, config, op=None): 5 | """ 6 | For generating POS/TAG patterns based on a Regex 7 | e.g. TAG("^NN|^JJ") for nouns or adjectives 8 | """ 9 | values = {"tag": tag} 10 | return "tag", values, ExtendedOp(op) 11 | 12 | 13 | def TAG_WORD(tag, value, config, op=None): 14 | """ 15 | For generating TAG patterns with a word or a list 16 | e.g. match only "proposed" when it is a verb in the sentence (and not an adjective): 17 | TAG_WORD("^VB", "proposed") 18 | e.g. match a list of words only to verbs: 19 | words = {"perceived", "proposed"} 20 | {TAG_WORD("^VB", words)?}->MARK("LABEL") 21 | """ 22 | values = {"tag": tag} 23 | if isinstance(value, list): 24 | values["list"] = value 25 | else: 26 | values["word"] = value 27 | return "tag", values, ExtendedOp(op) 28 | -------------------------------------------------------------------------------- /tests/test_precompile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rita.precompile import precompile 4 | 5 | from utils import raw_compare 6 | 7 | 8 | def test_rule_import(): 9 | rules = """ 10 | @import "examples/color-car.rita" 11 | """ 12 | 13 | result = precompile(rules.strip()) 14 | with open("examples/color-car.rita", "r") as f: 15 | assert result == f.read() 16 | 17 | 18 | def test_cyclical_import(): 19 | rules = """ 20 | @import "examples/cyclical-import.rita" 21 | """ 22 | 23 | with pytest.raises(RuntimeError): 24 | precompile(rules) 25 | 26 | 27 | def test_alias(): 28 | rules = """ 29 | numbers = {"one", "two", "three"} 30 | @alias IN_LIST IL 31 | @alias MARK M 32 | 33 | IL(numbers)->M("HELLO") 34 | """ 35 | 36 | expected = """ 37 | numbers = {"one", "two", "three"} 38 | 39 | IN_LIST(numbers)->MARK("HELLO") 40 | """ 41 | 42 | result = precompile(rules.strip()) 43 | raw_compare(expected, result) 44 | -------------------------------------------------------------------------------- /.github/workflows/github-actions-deployment.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | types: 10 | - ready_for_review 11 | - review_requested 12 | 13 | jobs: 14 | deployment: 15 | name: "Deployment" 16 | runs-on: "ubuntu-latest" 17 | strategy: 18 | matrix: 19 | python-version: [ '3.9' ] 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Setup python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | architecture: x64 27 | - name: Install Poetry 28 | run: | 29 | curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python - 30 | 31 | - name: Build 32 | run: poetry build 33 | 34 | - name: Set Token 35 | run: poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} 36 | 37 | - name: Poetry Publish 38 | run: poetry publish -------------------------------------------------------------------------------- /.github/workflows/github-actions-main.yaml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | on: [push] 3 | jobs: 4 | Testing: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | python-version: [ '3.9' ] 9 | name: Testing on Python ${{ matrix.python-version }} 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Setup python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: ${{ matrix.python-version }} 16 | architecture: x64 17 | - run: pip install tox 18 | - run: CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} tox -e py39 19 | 20 | CheckCode: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: [ '3.9' ] 25 | name: CheckCode 26 | steps: 27 | - uses: actions/checkout@v2 28 | - uses: actions/setup-python@v2 29 | with: 30 | python-version: '3.9' 31 | architecture: 'x64' 32 | - run: pip install tox 33 | - run: CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} tox -e lint 34 | - run: CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} tox -e
mypy 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Šarūnas Navickas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rita.config import with_config, SessionConfig 4 | 5 | 6 | @pytest.fixture 7 | def cfg(): 8 | return SessionConfig() 9 | 10 | 11 | @with_config 12 | def test_config_decorator(config): 13 | assert config 14 | 15 | 16 | def test_registered_engines(cfg): 17 | assert len(cfg.available_engines) > 0 18 | 19 | 20 | def test_registered_engines_has_spacy(cfg): 21 | pytest.importorskip("spacy", minversion="2.1") 22 | from rita.engine.translate_spacy import compile_rules 23 | assert len(cfg.available_engines) == 3 24 | assert cfg.default_engine == compile_rules 25 | 26 | 27 | def test_default_values(cfg): 28 | assert cfg.ignore_case 29 | assert cfg.implicit_punct 30 | assert not cfg.implicit_hyphon 31 | 32 | cfg.ignore_case = False 33 | assert not cfg.ignore_case 34 | 35 | cfg.implicit_punct = False 36 | assert not cfg.implicit_punct 37 | 38 | cfg.implicit_hyphon = True 39 | assert cfg.implicit_hyphon 40 | 41 | 42 | def test_register_module(cfg): 43 | cfg.register_module("rita.modules.fuzzy") 44 | 45 | assert len(cfg.modules) == 1 46 | -------------------------------------------------------------------------------- /rita/run.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import logging 4 | 5 | import rita 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def main(): 12 | from rita.utils import RitaJSONEncoder 13 | parser = argparse.ArgumentParser( 14 | description="Compile rita -> spaCy patterns" 15 | ) 16 | 17 | parser.add_argument("-f", help=".rita rules file") 18 | parser.add_argument( 19 | "out", 20 | help="output .jsonl file to store rules" 21 | ) 22 | parser.add_argument("--debug", help="debug mode", action="store_true") 23 | parser.add_argument("--engine", help="Engine to use when compiling rules", default="spacy") 24 | args = parser.parse_args() 25 | 26 | if args.debug: 27 | logging.basicConfig(level=logging.DEBUG) 28 | else: 29 | 
logging.basicConfig(level=logging.INFO) 30 | 31 | logger.info("Compiling rules using {} engine".format(args.engine)) 32 | 33 | patterns = rita.compile(args.f, use_engine=args.engine) 34 | 35 | with open(args.out, "w") as f: 36 | for pattern in patterns: 37 | f.write(json.dumps(pattern, cls=RitaJSONEncoder) + "\n") -------------------------------------------------------------------------------- /rita/modules/fuzzy.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | from rita.macros import resolve_value 5 | from rita.utils import ExtendedOp 6 | 7 | 8 | char_translation = dict( 9 | [(c * 2, "{0}{{1,2}}".format(c)) for c in string.ascii_lowercase] 10 | ) 11 | 12 | find_re = "|".join(["({0})".format(s) for (s, _) in char_translation.items()]) 13 | 14 | slang = {"you": "u", "for": "4", "are": "r", "you are": "ur", "you're": "ur"} 15 | 16 | 17 | def permutations(initial): 18 | # return the initial value as-is 19 | yield initial 20 | 21 | # if we have double letters, like `oo`, we can guess that: 22 | # - the user can sometimes enter both 23 | # - and sometimes only a single one 24 | double_letters = re.sub( 25 | find_re, 26 | lambda x: char_translation[x.group(0)], 27 | initial 28 | ) 29 | yield double_letters 30 | 31 | # if we have a simple word, we can add a slang alternative 32 | if initial in slang: 33 | yield r"\s{0}\s".format(slang[initial]) 34 | 35 | 36 | def FUZZY(name, config, op=None): 37 | initial = resolve_value(name, config=config) 38 | return "fuzzy", list(permutations(initial.lower())), ExtendedOp(op) -------------------------------------------------------------------------------- /docs/config.md: -------------------------------------------------------------------------------- 1 | # Config 2 | 3 | Configuration is mostly applied on a per-rule basis, meaning that different rules can have different configuration while running in the same process. 4 | 5 | ## Syntax 6 | 7 | Configuration is intended to be done from within the rule, like so: 8 | 9 | ``` 10 | !CONFIG("ignore_case", "Y") 11 | ``` 12 | 13 | The first argument is the config key, the second is the value. `"1"`, `"Y"` and `"T"` result in `True`; `"0"`, `"N"` and `"F"` - in `False` 14 | 15 | ## Configurations 16 | 17 | | Setting | Default | Description | 18 | |--------------------|----------------------|-------------------------------------------------------------------------------| 19 | | implicit_punct |`T` |Automatically adds punctuation characters `,.!:\;` to the rules | 20 | | ignore_case |`T` |All rules are case-insensitive | 21 | | deaccent |`T` |If provided a word with accented letters, use two versions - with and without them | 22 | | implicit_hyphon |`F` |Automatically adds hyphen characters `-` to the rules. Enabling implicit_hyphon disables implicit_punct |
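Since configuration travels with the rule file, switching a setting is just a matter of adding the `!CONFIG` line; a minimal sketch, assuming the spaCy engine is installed (the rule itself is made up for illustration):

```python
import rita

rules = """
!CONFIG("ignore_case", "N")

{WORD("IEEE"), WORD("conference")}->MARK("EVENT")
"""

# This compilation run is case-sensitive; other rule files compiled
# in the same process keep their own settings.
patterns = rita.compile_string(rules, use_engine="spacy")
```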
-------------------------------------------------------------------------------- /rita/precompile.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | from functools import partial 5 | from typing import Match 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | MAX_DEPTH = 5 12 | ALIAS_PATTERN = re.compile(r"@alias\s+(?P<original>(\w|[_])+)\s+(?P<alias>(\w|[_])+)") 13 | 14 | 15 | def handle_import(m: Match, depth: int = 0) -> str: 16 | path = m.group("path") 17 | logger.debug("Importing: {}".format(path)) 18 | with open(path, "r") as f: 19 | return precompile(f.read(), depth+1) 20 | 21 | 22 | def precompile(raw: str, depth: int = 0) -> str: 23 | if depth > MAX_DEPTH: 24 | raise RuntimeError( 25 | "Maximum depth limit has been reached. " 26 | "Please check if you don't have cyclical imports" 27 | ) 28 | 29 | raw = re.sub( 30 | r"@import\s+[\"'](?P<path>(\w|[/\-.])+)[\"']", 31 | partial(handle_import, depth=depth), 32 | raw 33 | ) 34 | 35 | for m in ALIAS_PATTERN.finditer(raw): 36 | # Delete alias definition 37 | full = m.group(0) 38 | raw = raw.replace(full, "") 39 | 40 | original = m.group("original") 41 | alias = m.group("alias") 42 | raw = re.sub(r"(?:(\s|->|{{))(?P<alias>{})([\(])".format(alias), r"\1{}(".format(original), raw) 43 | 44 | return raw 45 | -------------------------------------------------------------------------------- /rita/modules/pluralize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | try: 4 | import inflect 5 | except ImportError: 6 | logging.exception( 7 | "Pluralize module requires 'inflect' package to be installed. " 8 | "Install it and try again" 9 | ) 10 | sys.exit(1) 11 | 12 | from rita.macros import resolve_value 13 | from rita.utils import flatten, ExtendedOp 14 | 15 | 16 | def pluralizing(initial_list): 17 | """ 18 | For a list of nouns, it will return a list of the plurals and the initial nouns 19 | """ 20 | p = inflect.engine() 21 | plurals = [p.plural(word) for word in initial_list] 22 | return initial_list + plurals 23 | 24 | 25 | def PLURALIZE(*args, config, op=None): 26 | """ 27 | For a noun or a list of nouns, it will match any singular or plural word 28 | Usage for a single word, e.g.: 29 | PLURALIZE("car") 30 | Usage for lists, e.g.: 31 | vehicles = {"car", "bicycle", "ship"} 32 | PLURALIZE(vehicles) 33 | Will work even for regex or if the lemmatizer of spaCy makes an error 34 | Depends on the Python inflect package https://pypi.org/project/inflect/ 35 | """ 36 | if isinstance(args[0], list): 37 | initial_list = [resolve_value(arg, config=config) 38 | for arg in flatten(args)] 39 | else: 40 | initial_list = [args[0]] 41 | return "any_of", pluralizing(initial_list), ExtendedOp(op) 42 | -------------------------------------------------------------------------------- /rita/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | from types import GeneratorType 5 | 6 | from rita.config import with_config 7 | from rita.preprocess import preprocess_rules 8 | from rita.precompile import precompile 9 | from rita.utils import timer, Timer 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | __version__ = (0, 7, 4, os.getenv("VERSION_PATCH")) 15 | 16 | 17 | def get_version(): 18 | normalized = list([i for i in __version__ if i is not None]) 19 | if len(normalized) == 4: 20 | return
"{0}.{1}.{2}-{3}".format(*normalized) 21 | else: 22 | return "{0}.{1}.{2}".format(*normalized) 23 | 24 | 25 | @with_config 26 | def compile_string(raw, config, use_engine=None, **kwargs): 27 | from rita.parser import RitaParser 28 | t = Timer("Compilation") 29 | for k, v in kwargs.items(): 30 | config.set_variable(k, v) 31 | 32 | with timer("Parsing"): 33 | parser = RitaParser(config) 34 | parser.build() 35 | root = parser.parse(precompile(raw)) 36 | 37 | logger.debug(root) 38 | if use_engine: 39 | compile_rules = config.set_engine(use_engine) 40 | else: 41 | compile_rules = config.default_engine 42 | 43 | with timer("Preprocessing"): 44 | rules = list(preprocess_rules(root, config)) 45 | 46 | with timer("Compiling"): 47 | result = compile_rules(rules, config, **kwargs) 48 | 49 | if isinstance(result, GeneratorType): 50 | patterns = list(result) 51 | t.stop(debug=False) 52 | return patterns 53 | else: 54 | t.stop(debug=False) 55 | return result 56 | 57 | 58 | def compile(fname, use_engine=None, **kwargs): 59 | with open(fname, "r") as f: 60 | raw = f.read() 61 | 62 | return compile_string(raw, use_engine=use_engine, **kwargs) 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | 107 | # yacc-lex 108 | *.out 109 | parsetab.py -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | import rita 5 | 6 | from rita.shortcuts import setup_spacy 7 | 8 | 9 | def load_rules(rules_path): 10 | with open(rules_path, "r") as f: 11 | return f.read() 12 | 13 | 14 | def spacy_engine(rules, **kwargs): 15 | spacy = pytest.importorskip("spacy", minversion="2.1") 16 | nlp = spacy.load("en_core_web_sm") 17 | setup_spacy(nlp, rules_string=rules, override_ents=True, **kwargs) 18 | patterns = rita.compile_string(rules, **kwargs) 19 | print(patterns) 20 | 21 | def parse(text): 22 | doc = nlp(text) 23 | return list([(e.text, e.label_) for e in doc.ents]) 24 | return parse 25 | 26 | 27 | def standalone_engine(rules, **kwargs): 28 | parser = rita.compile_string(rules, use_engine="standalone", **kwargs) 29 | print(parser.patterns) 30 | 31 | def parse(text): 32 | results = list(parser.execute(text, include_submatches=False)) 33 | return list([(r["text"], r["label"]) for r in results]) 34 | return parse 35 | 36 | 37 | def rust_engine(rules, **kwargs): 38 | from rita.engine.translate_rust import load_lib 39 | lib = load_lib() 40 | if lib is None: 41 | pytest.skip("Missing rita-rust dynamic lib, skipping related tests") 42 | print("Trying to run: {}".format(rules)) 43 | parser = rita.compile_string(rules, use_engine="rust", **kwargs) 44 | print(parser.patterns) 45 | 46 | def parse(text): 47 | results = list(parser.execute(text, include_submatches=False)) 48 | return list([(r["text"], r["label"]) for r in results]) 49 | return parse 50 | 51 | 52 | def normalize_output(r): 53 | return re.sub(r"\s+", " ", r.strip().replace("\n", "")) 54 | 55 | 56 | def raw_compare(r1, r2): 57 | r1 = normalize_output(r1) 58 | r2 = normalize_output(r2) 59 | 60 | assert r1 == r2 61 | -------------------------------------------------------------------------------- /extra/sublimetext/RITA.sublime-syntax: -------------------------------------------------------------------------------- 1 | %YAML 1.2 2 | --- 3 | # http://www.sublimetext.com/docs/syntax.html 4 | name: Rita 5 | file_extensions: 6 | - rita 7 | scope: source.rita 8 | contexts: 9 | variables: 10 | - match: \b[a-z_]+\b 11 | scope: variable.parameter.rita 12 | keywords: 13 | - 
match: \b[A-Z_]+\b 14 | scope: keyword.control.rita 15 | 16 | - match: \( 17 | push: args 18 | 19 | - match: \) 20 | scope: invalid.illegal.stray-bracket-end 21 | 22 | main: 23 | - include: variables 24 | - match: '#' 25 | scope: punctuation.definition.comment.rita 26 | push: line_comment 27 | 28 | - match: "{" 29 | push: pattern 30 | 31 | - match: "}" 32 | scope: invalid.illegal.stray-bracket-end 33 | 34 | - match: -> 35 | push: mark 36 | 37 | - match: = 38 | push: assign_variable 39 | 40 | string: 41 | - meta_scope: string.quoted.double.rita 42 | - match: \\. 43 | scope: constant.character.escape.rita 44 | - match: '"' 45 | pop: true 46 | 47 | line_comment: 48 | - meta_scope: comment.line.rita 49 | - match: $ 50 | pop: true 51 | 52 | args: 53 | - include: variables 54 | - match: '"' 55 | push: string 56 | - match: ',' 57 | scope: punctuation.separator.comma.rita 58 | - match: \) 59 | pop: true 60 | 61 | pattern: 62 | - include: keywords 63 | - match: "}" 64 | pop: true 65 | 66 | assign_variable: 67 | - match: '"' 68 | push: string 69 | - match: ',' 70 | scope: punctuation.separator.comma.rita 71 | - match: "{" 72 | - match: "}" 73 | pop: true 74 | 75 | mark: 76 | - match: \bMARK\b 77 | scope: keyword.control.rita 78 | 79 | - match: '"' 80 | push: string 81 | 82 | - match: \( 83 | 84 | - match: \) 85 | pop: true 86 | 87 | -------------------------------------------------------------------------------- /rita/lexer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import ply.lex as lex 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class RitaLexer(object): 8 | tokens = [ 9 | "KEYWORD", 10 | "LITERAL", 11 | "NAME", 12 | "LBRACKET", 13 | "RBRACKET", 14 | "LPAREN", 15 | "RPAREN", 16 | "ARROW", 17 | "COMMA", 18 | "MODIF_QMARK", 19 | "MODIF_STAR", 20 | "MODIF_PLUS", 21 | "ASSIGN", 22 | "EXEC", 23 | "PIPE", 24 | ] 25 | 26 | literals = ["{", "}", "(", ")", '"', ",", "=", "!", "|"] 27 | 28 | t_ignore = " \t" 29 | t_ignore_COMMENT = r"\#.*" 30 | t_ARROW = "->" 31 | t_LBRACKET = "{" 32 | t_RBRACKET = "}" 33 | t_LPAREN = r"\(" 34 | t_RPAREN = r"\)" 35 | t_COMMA = "," 36 | t_MODIF_QMARK = r"\?" 37 | t_MODIF_STAR = r"\*" 38 | t_MODIF_PLUS = r"\+" 39 | t_EXEC = r"!" 
40 | t_ASSIGN = r"=" 41 | t_PIPE = r"\|" 42 | 43 | # Define a rule so we can track line numbers 44 | def t_newline(self, t): 45 | r"\n+" 46 | t.lexer.lineno += len(t.value) 47 | 48 | def t_KEYWORD(self, t): 49 | r"[A-Z_]{3,}" 50 | return t 51 | 52 | def t_LITERAL(self, t): 53 | r'("|\')(\\.|.)+?("|\')' 54 | t.value = t.value[1:-1] 55 | return t 56 | 57 | def t_NAME(self, t): 58 | r"\w+" 59 | return t 60 | 61 | def t_error(self, t): 62 | logger.error("Invalid Token: {}".format(t.value[0])) 63 | t.lexer.skip(1) 64 | 65 | def build(self, **kwargs): 66 | self.lexer = lex.lex(module=self, errorlog=logger, **kwargs) 67 | return self.lexer 68 | 69 | def tokenize(self, data): 70 | self.lexer.input(data) 71 | while True: 72 | t = self.lexer.token() 73 | if t is None: 74 | break 75 | yield t 76 | -------------------------------------------------------------------------------- /rita/modules/names.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from rita.macros import resolve_value 4 | from rita.utils import flatten, ExtendedOp 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | STOP_NAMES = {"von", "van", "de", "dos"} 10 | 11 | 12 | def trim_name(name): 13 | if name in STOP_NAMES: 14 | return name 15 | return name[0] + r"\." 16 | 17 | 18 | def trim_seniority(name): 19 | if name.lower() == "junior": 20 | return r"jr\." 21 | elif name.lower() == "senior": 22 | return r"sr\." 23 | else: 24 | return name 25 | 26 | 27 | def remove_empty(x): 28 | return x.strip() != "" 29 | 30 | 31 | def generate_names(initial_list): 32 | """ 33 | Generates variations of names 34 | Eg. {First Middle Last; First M. Last; F. M. Last} 35 | """ 36 | for name in initial_list: 37 | yield name.strip(), 38 | 39 | buff = name.strip().split(" ") 40 | if len(buff) == 2: 41 | yield trim_name(buff[0]), buff[1] 42 | elif len(buff) == 3: 43 | if buff[2].lower() == "junior" or buff[2].lower() == "senior": 44 | yield buff[0], buff[1], trim_seniority(buff[2]) 45 | else: 46 | yield buff[0], trim_name(buff[1]), buff[2] 47 | yield trim_name(buff[0]), trim_name(buff[1]), buff[2] 48 | 49 | 50 | def NAMES(*args, config, op=None): 51 | if isinstance(args[0], list): 52 | initial_list = [resolve_value(arg, config=config) 53 | for arg in flatten(args)] 54 | else: 55 | initial_list = [args[0]] 56 | 57 | names = list([" ".join(filter(remove_empty, names)) 58 | for names in generate_names(initial_list)]) 59 | logger.debug("Generated list of names: {}".format(names)) 60 | new_op = ExtendedOp(op) 61 | new_op.case_sensitive_override = True 62 | return "any_of", names, new_op 63 | -------------------------------------------------------------------------------- /rita/shortcuts.py: -------------------------------------------------------------------------------- 1 | import rita 2 | 3 | 4 | def setup_spacy(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): 5 | import spacy 6 | major, _, _ = spacy.__version__.split(".") 7 | if major == "2": 8 | return _spacy_v2(model, patterns, rules_path, rules_string, override_ents) 9 | elif major == "3": 10 | return _spacy_v3(model, patterns, rules_path, rules_string, override_ents) 11 | else: 12 | raise RuntimeError("Unsupported spaCy version: {}".format(major)) 13 | 14 | 15 | def _spacy_v2(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): 16 | from spacy.pipeline import EntityRuler 17 | ruler = EntityRuler(model, overwrite_ents=override_ents) 18 | if not patterns: 19 | if rules_path: 20 | patterns =
rita.compile(rules_path, use_engine="spacy") 21 | elif rules_string: 22 | patterns = rita.compile_string(rules_string, use_engine="spacy") 23 | else: 24 | raise RuntimeError("Please provide rules. Either `patterns`, `rules_path` or `rules_string`") 25 | 26 | ruler.add_patterns(patterns) 27 | else: 28 | ruler.from_disk(patterns) 29 | 30 | model.add_pipe(ruler) 31 | return model 32 | 33 | 34 | def _spacy_v3(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): 35 | ruler = model.add_pipe("entity_ruler", config={"overwrite_ents": override_ents, "validate": True}) 36 | if not patterns: 37 | if rules_path: 38 | patterns = rita.compile(rules_path, use_engine="spacy") 39 | elif rules_string: 40 | patterns = rita.compile_string(rules_string, use_engine="spacy") 41 | else: 42 | raise RuntimeError("Please provide rules. Either `patterns`, `rules_path` or `rules_string`") 43 | 44 | ruler.add_patterns(patterns) 45 | else: 46 | ruler.from_disk(patterns) 47 | return model 48 | -------------------------------------------------------------------------------- /docs/macros.md: -------------------------------------------------------------------------------- 1 | # Macros 2 | 3 | `ARG = Literal | Macro | Variable` 4 | 5 | `ARGS = Array of ARG` 6 | 7 | | Name | Arguments | Modifiers | Description | 8 | |---------|----------------------|-----------|---------------------------------------------| 9 | | ANY |`None` |`?` `*` `+`|Placeholder for any kind of text | 10 | | WORD |`ARG`(Optional) |`?` `*` `+`|Placeholder for any kind of word | 11 | | NUM |`ARG`(Optional) |`?` `*` `+`|Placeholder for any kind of number | 12 | | PUNCT |`None` |`?` `*` `+`|Placeholder for punctuation | 13 | | POS |`ARG` |`?` `*` `+`|Match by Part of Speech | 14 | | LEMMA |`ARG` |`?` `*` `+`|Match by Lemma | 15 | | ENTITY |`ARG` |`?` `*` `+`|Match by Entity Type, eg. `PERSON` | 16 | | PATTERN |`ARGS` |`None` |Wrapper for multiple rules. **Covered by standard syntax, can be ignored** | 17 | | IN_LIST |`ARGS` |`?` `*` `+`|Match any of the defined values | 18 | | PREFIX |`ARGS` |`None` |Adds a prefix to the next word or list | 19 | | LOAD |`ARG` |`None` |Load an array from a file. Each line = new element| 20 | | MARK |`ARG` |`None` |Mark the given pattern with a label | 21 | | ASSIGN |`Literal`, `ARG` |`None` |Assign a value to a variable. **Covered by standard syntax, can be ignored** | 22 | | EXEC |`ARG` |`None` |Execute a macro. **Covered by standard syntax, can be ignored** | 23 | | IMPORT |`Literal` |`None` |Import a custom module, allowing custom macros to be executed| 24 | | CONFIG | `Literal`, `Literal` |`None` |Allows modifying a config value |
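To make the table concrete, here is a small sketch combining several of these macros, adapted from `examples/color-car.rita` in this repository:

```python
import rita

# LOAD, IN_LIST, WORD and MARK working together
rules = """
cars = LOAD("examples/cars.txt")
colors = {"red", "green", "blue", "white", "black"}

{IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR")
{IN_LIST(cars), WORD+} -> MARK("CAR_MODEL")
"""

patterns = rita.compile_string(rules, use_engine="spacy")
```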
-------------------------------------------------------------------------------- /docs/engines.md: -------------------------------------------------------------------------------- 1 | # Engines 2 | 3 | In RITA, an `engine` is the system the rules are compiled to, and which does the heavy lifting after that. 4 | 5 | Currently there are three engines: 6 | 7 | ## spaCy 8 | 9 | Activated by using `rita.compile(<path>, use_engine="spacy")` 10 | 11 | Using this engine, all of the RITA rules will be compiled into spaCy patterns, which can be natively used by spaCy in various scenarios. 12 | Most often - to improve NER (Named Entity Recognition) by adding additional entities derived from your rules 13 | 14 | It requires the spaCy package to be installed (`pip install spacy`), and to actually use it later a language model needs to be downloaded (`python -m spacy download <language>`) 15 | 16 | ## Standalone 17 | 18 | Activated by using `rita.compile(<path>, use_engine="standalone")`. It compiles into pure regex and can be used with zero dependencies. 19 | By default, it uses Python's `re` library. Since version `0.5.10`, you can provide a custom regex implementation to use, 20 | e.g. the regex package: `rita.compile(<path>, use_engine="standalone", regex_impl=regex)` 21 | 22 | It is very lightweight and very fast (compared to spaCy), however it lacks some functionality which only a proper language model can bring: 23 | - Patterns by entity (PERSON, ORGANIZATION, etc.) 24 | - Patterns by Lemmas 25 | - Patterns by POS (Part Of Speech) 26 | 27 | Only generic things, like WORD and NUMBER, can be matched. 28 | 29 | 30 | ## Rust (new in `0.6.0`) 31 | 32 | Only an interface exists inside this codebase; the engine itself is proprietary. 33 | 34 | In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code, and that provides a large performance boost. 35 | It is proprietary because there are various caveats: the engine itself is a bit more fragile and needs to be tinkered with to be optimized for a very specific case 36 | (e.g. few long texts with many matches vs a lot of short texts with few matches).
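The engines differ in what `compile` hands back; a sketch of both paths, based on the helpers in `tests/utils.py`:

```python
import rita

rules = """
{WORD("good"), WORD("value")}->MARK("CHEAP_PHONE")
"""

# spaCy engine: a list of patterns for spaCy's EntityRuler
patterns = rita.compile_string(rules, use_engine="spacy")

# Standalone engine: a ready regex-backed parser
# (pass regex_impl=regex here to swap in the third-party regex package)
parser = rita.compile_string(rules, use_engine="standalone")
for match in parser.execute("This phone is a good value"):
    print(match["text"], match["label"])
```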
name = "Deprecations" 63 | showcontent = true 64 | 65 | [[tool.towncrier.type]] 66 | directory = "feature" 67 | name = "Features" 68 | showcontent = true 69 | 70 | [[tool.towncrier.type]] 71 | directory = "fix" 72 | name = "Fix" 73 | showcontent = true 74 | -------------------------------------------------------------------------------- /tests/test_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import tempfile 5 | 6 | import pytest 7 | import rita 8 | 9 | from rita.run import main 10 | from rita.shortcuts import setup_spacy 11 | 12 | 13 | def test_help(mocker): 14 | sys.argv = [ 15 | "rita-dsl" 16 | "--help" 17 | ] 18 | 19 | 20 | def test_debug(mocker): 21 | sys.argv = [ 22 | "rita-dsl" 23 | "--debug" 24 | ] 25 | 26 | 27 | def test_simple_compile(mocker): 28 | sys.argv = [ 29 | "rita-dsl", 30 | "-f", 31 | "examples/cheap-phones.rita", 32 | "output.jsonl" 33 | ] 34 | main() 35 | 36 | 37 | def test_simple_spacy_compile(mocker): 38 | sys.argv = [ 39 | "rita-dsl", 40 | "-f", 41 | "examples/cheap-phones.rita", 42 | "--engine=spacy", 43 | "output.jsonl" 44 | ] 45 | main() 46 | 47 | 48 | def test_simple_standalone_compile(mocker): 49 | sys.argv = [ 50 | "rita-dsl", 51 | "-f", 52 | "examples/cheap-phones.rita", 53 | "--engine=standalone", 54 | "output.jsonl" 55 | ] 56 | main() 57 | 58 | 59 | def test_shortcuts_spacy_inline(): 60 | spacy = pytest.importorskip("spacy", minversion="2.1") 61 | nlp = spacy.load("en_core_web_sm") 62 | rules = """ 63 | {WORD("TEST")}->MARK("TEST") 64 | """ 65 | setup_spacy(nlp, rules_string=rules) 66 | 67 | 68 | def test_shortcuts_spacy_file(): 69 | spacy = pytest.importorskip("spacy", minversion="2.1") 70 | nlp = spacy.load("en_core_web_sm") 71 | setup_spacy(nlp, rules_path="examples/color-car.rita") 72 | 73 | 74 | def test_shortcuts_spacy_compiled(): 75 | spacy = pytest.importorskip("spacy", minversion="2.1") 76 | nlp = spacy.load("en_core_web_sm") 77 | tmp = tempfile.NamedTemporaryFile(mode="w", encoding="UTF-8", suffix=".jsonl", delete=False) 78 | patterns = rita.compile("examples/color-car.rita") 79 | for pattern in patterns: 80 | tmp.write(json.dumps(pattern) + "\n") 81 | tmp.flush() 82 | tmp.close() 83 | setup_spacy(nlp, patterns=tmp.name) 84 | os.unlink(tmp.name) 85 | 86 | 87 | def test_shortcuts_spacy_giving_no_rules(): 88 | spacy = pytest.importorskip("spacy", minversion="2.1") 89 | nlp = spacy.load("en_core_web_sm") 90 | with pytest.raises(RuntimeError): 91 | setup_spacy(nlp) 92 | -------------------------------------------------------------------------------- /docs/syntax.md: -------------------------------------------------------------------------------- 1 | # Syntax guide 2 | 3 | ## The basic building blocks 4 | 5 | You have `LITERAL` which is any kind of string behind quotes, eg.: 6 | 7 | ``` 8 | "Hello world!" 9 | ``` 10 | 11 | You have `MACRO` which is main backbone of everything. 12 | 13 | Using parenthesis, you can pass arguments to macro: 14 | ``` 15 | LOAD("path/filename.txt") # Load a text file 16 | ``` 17 | 18 | if macro doesn't require any, you can simply call it 19 | 20 | ``` 21 | WORD # Declare, that you'll have any kind of word 22 | ``` 23 | 24 | Also, macro can have modifier (if it supports it) 25 | 26 | ``` 27 | WORD+ # Declare, that you'll have 1..N words 28 | WORD* # Declare, that you'll have 0..N words 29 | WORD? # Declare, that you'll have 1 or no words 30 | WORD! 
# Declare that you want to ignore this word 31 | ``` 32 | 33 | More examples: 34 | 35 | ``` 36 | WORD("cat") # Declare that you'll have the exact word `cat` 37 | 38 | {"red", "green", "blue"} # Declare an array of words 39 | ``` 40 | 41 | **NOTE** All of the MACROS are spelled in capital letters 42 | 43 | And finally you have `VARIABLE`. First you must declare it, and later you can use it just by spelling its name 44 | 45 | ``` 46 | CarModels = LOAD("path/models.txt") 47 | 48 | # ... 49 | 50 | IN_LIST(CarModels) # Check if a token is inside the list of car models we provided 51 | ``` 52 | 53 | If used directly inside a macro, an array can be written with simple commas 54 | 55 | ``` 56 | IN_LIST("audi", "toyota", "bmw", "honda", "nissan", "ford") 57 | ``` 58 | 59 | 60 | For our declarations to make any sense, we need to build an expression. More on that in the next topic. 61 | 62 | ## Expressions 63 | 64 | This language is built on expressions. 65 | One expression means: 66 | 67 | a) A single rule defining an entity 68 | 69 | b) A single variable declaration 70 | 71 | A rule expression ends with an arrow `->`, e.g.: 72 | 73 | `WORD("something") -> MARK("SOMETHING_LABEL")` 74 | 75 | with the MACRO `MARK` we're assigning a label to the rule 76 | 77 | A variable declaration expression ends with an equals sign `=`, e.g.: 78 | ``` 79 | a = "Apple" 80 | ``` 81 | 82 | When building a rule, you may want to combine several rules into one; you can use the array builder for that: 83 | 84 | ``` 85 | {IN_LIST({"red", "green", "blue", "white", "black", "silver", "brown"}), WORD("car")} -> MARK("CAR_COLOR") 86 | ``` 87 | 88 | we're saying: `If any of these color words is present in the text and is followed by the word "car", we assume this part can be labeled as "CAR_COLOR"` 89 | 90 | ## Logical variants 91 | 92 | You can say that your rule expects either `word1` or `word2`. Usually this would be achieved by writing two separate rules, but there's an easier way: 93 | ``` 94 | {WORD("word1")|WORD("word2")} 95 | ``` 96 | 97 | The pipe character (`|`) marks a logical `OR`, meaning that either the right or the left side can be matched. It works only on the surface level; if you want nested logic - write separate rules.
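To see what an expression actually compiles to, a small rule file can be fed through the compiler and the result inspected; a sketch (the exact output depends on the engine and configuration):

```python
import rita

rules = """
{WORD("word1")|WORD("word2")} -> MARK("EITHER_WORD")
"""

# With the spaCy engine, each rule expression becomes one pattern dict
# suitable for spaCy's EntityRuler
for pattern in rita.compile_string(rules, use_engine="spacy"):
    print(pattern)
```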
20 | 21 | Usage: 22 | ``` 23 | !IMPORT("rita.modules.fuzzy") 24 | 25 | FUZZY("squirrel") -> MARK("CRITTER") 26 | ``` 27 | 28 | ## Pluralize 29 | 30 | Takes a list of words (or a single word) and creates the plural version of each. 31 | 32 | Requires the `inflect` library (`pip install inflect`). Works only on English words. 33 | 34 | Usage: 35 | 36 | ``` 37 | !IMPORT("rita.modules.pluralize") 38 | 39 | vehicles={"car", "motorbike", "bicycle", "ship", "plane"} 40 | {NUM, PLURALIZE(vehicles)}->MARK("VEHICLES") 41 | ``` 42 | 43 | ## Tag 44 | 45 | This module offers two new macros: `TAG` and `TAG_WORD`. 46 | 47 | 48 | `TAG` is used for generating POS/TAG patterns based on a Regex, 49 | e.g. `TAG("^NN|^JJ")` for nouns or adjectives. 50 | 51 | Works only with the spaCy engine 52 | 53 | Usage: 54 | 55 | ``` 56 | !IMPORT("rita.modules.tag") 57 | 58 | {WORD*, TAG("^NN|^JJ")}->MARK("TAGGED_MATCH") 59 | ``` 60 | 61 | `TAG_WORD` is for generating TAG patterns with a word or a list. 62 | 63 | e.g. match "proposed" only when it is a verb in the sentence (and not an adjective): 64 | 65 | ``` 66 | !IMPORT("rita.modules.tag") 67 | 68 | TAG_WORD("^VB", "proposed") 69 | ``` 70 | 71 | or e.g. match a list of words only when they are verbs: 72 | 73 | ``` 74 | !IMPORT("rita.modules.tag") 75 | 76 | words = {"perceived", "proposed"} 77 | {TAG_WORD("^VB", words)?}->MARK("LABEL") 78 | ``` 79 | 80 | ## Orth 81 | 82 | Ignores the case-insensitive configuration and checks words as written - 83 | that is, case-sensitively, even if the configuration is case-insensitive. 84 | Especially useful for acronyms and proper names. 85 | 86 | Works only with the spaCy engine 87 | 88 | Usage: 89 | 90 | ``` 91 | !IMPORT("rita.modules.orth") 92 | 93 | {ORTH("IEEE")}->MARK("TAGGED_MATCH") 94 | ``` 95 | 96 | ## Regex 97 | 98 | Matches words based on a Regex pattern, 99 | e.g. all words that start with an 'a' would be 100 | `REGEX("^a")` 101 | 102 | ``` 103 | !IMPORT("rita.modules.regex") 104 | 105 | {REGEX("^a")}->MARK("TAGGED_MATCH") 106 | ``` 107 | 108 | ## Names 109 | 110 | Takes a list of full person names (First + Last, or First Middle Last) and generates shortened variations, 111 | eg. F. Last, First M. Last, F. M. Last etc. 
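A rough sketch of what this could look like end-to-end (a minimal sketch, assuming the standalone engine; the exact shortened variants the module generates may differ):

```python
import rita

# Minimal sketch - assumes the standalone engine; the module is expected to
# expand "Jon Jones" into shortened variants such as "J. Jones".
patterns = rita.compile_string("""
!IMPORT("rita.modules.names")

names = {"Jon Jones"}
NAMES(names)->MARK("NAME_MATCH")
""", use_engine="standalone")

# A shortened form should be labeled as well:
print(list(patterns.execute("The belt is still held by J. Jones.")))
```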
112 | 113 | ``` 114 | !IMPORT("rita.modules.names") 115 | 116 | names = {"Roy Jones junior", "Roy Jones senior", "Juan-Claude van Damme", "Jon Jones"} 117 | NAMES(names)->MARK("NAME_MATCH") 118 | ``` 119 | 120 | Useful when matching against a fixed set of names -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Rita Logo](docs/assets/logo-2.png) 2 | 3 | # RITA DSL 4 | 5 | [![Documentation Status](https://readthedocs.org/projects/rita-dsl/badge/?version=latest)](http://rita-dsl.readthedocs.io/?badge=latest) 6 | [![codecov](https://codecov.io/gh/zaibacu/rita-dsl/branch/master/graph/badge.svg)](https://codecov.io/gh/zaibacu/rita-dsl) 7 | [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) 8 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/zaibacu/rita-dsl/graphs/commit-activity) 9 | [![PyPI version fury.io](https://badge.fury.io/py/rita-dsl.svg)](https://pypi.python.org/pypi/rita-dsl/) 10 | [![PyPI download month](https://img.shields.io/pypi/dm/rita-dsl.svg)](https://pypi.python.org/pypi/rita-dsl/) 11 | [![GitHub license](https://img.shields.io/github/license/zaibacu/rita-dsl.svg)](https://github.com/zaibacu/rita-dsl/blob/master/LICENSE) 12 | 13 | This is a language, loosely based on [Apache UIMA RUTA](https://uima.apache.org/ruta.html), focused on writing manual language rules, which compile into either [spaCy](https://github.com/explosion/spaCy)-compatible patterns or pure regex. These patterns can be used for [manual NER](https://spacy.io/api/entityruler) as well as in other processes, like retokenizing and pure matching 14 | 15 | ## An Introduction Video 16 | [![Intro](https://img.youtube.com/vi/GScerMeWz68/0.jpg)](https://www.youtube.com/watch?v=GScerMeWz68) 17 | 18 | ## Links 19 | - [Website](https://rita-dsl.io/) 20 | - [Simple Chat bot example](https://repl.it/talk/share/Simple-chatbot-done-with-Rita/53471) 21 | - [Documentation](http://rita-dsl.readthedocs.io/) 22 | - [QuickStart](https://rita-dsl.readthedocs.io/en/latest/quickstart/) 23 | - [Language Syntax Plugin for IntelijJ based IDEs](https://plugins.jetbrains.com/plugin/15011-rita-language) 24 | 25 | ## Support 26 | 27 | [![reddit](https://img.shields.io/reddit/subreddit-subscribers/ritaDSL?style=social)](https://www.reddit.com/r/ritaDSL/) 28 | [![Gitter](https://badges.gitter.im/rita-dsl/community.svg)](https://gitter.im/rita-dsl/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) 29 | 30 | If you need consulting or some custom work done, you can [Contact Us](mailto:info@griaustinis.lt) 31 | 32 | ## Install 33 | 34 | `pip install rita-dsl` 35 | 36 | ## Simple Rules example 37 | 38 | ```python 39 | rules = """ 40 | cuts = {"fitted", "wide-cut"} 41 | lengths = {"short", "long", "calf-length", "knee-length"} 42 | fabric_types = {"soft", "airy", "crinkled"} 43 | fabrics = {"velour", "chiffon", "knit", "woven", "stretch"} 44 | 45 | {IN_LIST(cuts)?, IN_LIST(lengths), WORD("dress")}->MARK("DRESS_TYPE") 46 | {IN_LIST(lengths), IN_LIST(cuts), WORD("dress")}->MARK("DRESS_TYPE") 47 | {IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK("DRESS_FABRIC") 48 | """ 49 | ``` 50 | 51 | ### Loading in spaCy 52 | ```python 53 | import spacy 54 | from rita.shortcuts import setup_spacy 55 | 56 | 57 | nlp = spacy.load("en") 58 | setup_spacy(nlp, rules_string=rules) 59 | ``` 60 | 61 | And 
using it: 62 | ```python 63 | >>> r = nlp("She was wearing a short wide-cut dress") 64 | >>> [{"label": e.label_, "text": e.text} for e in r.ents] 65 | [{'label': 'DRESS_TYPE', 'text': 'short wide-cut dress'}] 66 | ``` 67 | 68 | ### Loading using Regex (standalone) 69 | ```python 70 | import rita 71 | 72 | patterns = rita.compile_string(rules, use_engine="standalone") 73 | ``` 74 | 75 | And using it: 76 | ```python 77 | >>> list(patterns.execute("She was wearing a short wide-cut dress")) 78 | [{'end': 38, 'label': 'DRESS_TYPE', 'start': 18, 'text': 'short wide-cut dress'}] 79 | ``` 80 | -------------------------------------------------------------------------------- /rita/config.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import logging 3 | from importlib import import_module 4 | from typing import Any, Callable 5 | 6 | try: 7 | from rita.engine.translate_spacy import compile_rules as spacy_engine 8 | except ImportError: 9 | pass 10 | 11 | from rita.engine.translate_standalone import compile_rules as standalone_engine 12 | from rita.engine.translate_rust import compile_rules as rust_engine 13 | 14 | from rita.utils import SingletonMixin 15 | from rita.types import opts, Rules 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | CompileFN = Callable[[Rules, "Config", opts], Any] 21 | 22 | 23 | class Config(SingletonMixin): 24 | def __init__(self): 25 | self.available_engines = [] 26 | self.engines_by_key = {} 27 | self.current_engine = None 28 | 29 | try: 30 | self.register_engine(1, "spacy", spacy_engine) 31 | except NameError: 32 | # spacy_engine is not imported 33 | pass 34 | self.register_engine(2, "standalone", standalone_engine) 35 | self.register_engine(3, "rust", rust_engine) 36 | 37 | def register_engine(self, priority: int, key: str, compile_fn: CompileFN) -> None: 38 | self.available_engines.append((priority, key, compile_fn)) 39 | self.engines_by_key[key] = compile_fn 40 | self.available_engines.sort(key=operator.itemgetter(0))  # sort in place, keeping engines ordered by priority 41 | 42 | @property 43 | def default_engine(self) -> CompileFN: 44 | (_, key, compile_fn) = self.available_engines[0] 45 | self.current_engine = key 46 | return compile_fn 47 | 48 | def set_engine(self, key: str) -> CompileFN: 49 | self.current_engine = key 50 | return self.engines_by_key[key] 51 | 52 | @property 53 | def list_branching(self) -> bool: 54 | if self.current_engine == "spacy": 55 | return True 56 | 57 | return False 58 | 59 | 60 | class SessionConfig(object): 61 | def __init__(self): 62 | self._root = Config() 63 | self.modules = [] 64 | # Default config 65 | self._data = { 66 | "ignore_case": True, 67 | "implicit_punct": True, 68 | "deaccent": True, 69 | "implicit_hyphon": False, 70 | } 71 | self.variables = {} 72 | self._nested_group_count = 0 73 | 74 | def register_module(self, mod_name: str) -> None: 75 | logger.debug("Importing module: {}".format(mod_name)) 76 | self.modules.append(import_module(mod_name)) 77 | 78 | def set_variable(self, k: str, v: Any) -> None: 79 | self.variables[k] = v 80 | 81 | def get_variable(self, k: str) -> Any: 82 | return self.variables[k] 83 | 84 | def __getattr__(self, name): 85 | if name == "_root": 86 | return self._root 87 | 88 | elif name in self._data: 89 | return self._data[name] 90 | 91 | return getattr(self._root, name) 92 | 93 | def set_config(self, k, v): 94 | # Handle booleans first 95 | if v.upper() in ["1", "T", "Y"]: 96 | self._data[k] = True 97 | elif v.upper() in ["0", "F", "N"]: 98 | self._data[k] = False 99 | 
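# Anything that is not a recognized boolean token falls through and is stored verbatim: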
else: 100 | self._data[k] = v 101 | 102 | def new_nested_group_id(self): 103 | self._nested_group_count += 1 104 | return self._nested_group_count 105 | 106 | 107 | def with_config(fn): 108 | def wrapper(*args, **kwargs): 109 | config = SessionConfig() 110 | return fn(*args, config=config, **kwargs) 111 | 112 | return wrapper 113 | -------------------------------------------------------------------------------- /tests/test_lexer.py: -------------------------------------------------------------------------------- 1 | from rita.lexer import RitaLexer 2 | 3 | 4 | def test_tokenize_any_macro_wo_args_wo_type(): 5 | lex = RitaLexer() 6 | lex.build() 7 | 8 | tokens = list(lex.tokenize("ANY")) 9 | assert len(tokens) == 1 10 | token = tokens[0] 11 | assert token.type == "KEYWORD" 12 | assert token.value == "ANY" 13 | 14 | 15 | def test_tokenize_any_macro_wo_args_w_type(): 16 | lex = RitaLexer() 17 | lex.build() 18 | 19 | tokens = list(lex.tokenize('ANY -> MARK("Placeholder")')) 20 | assert len(tokens) == 6 21 | t0 = tokens[0] 22 | assert t0.type == "KEYWORD" 23 | assert t0.value == "ANY" 24 | 25 | assert tokens[1].type == "ARROW" 26 | 27 | t2 = tokens[2] 28 | 29 | assert t2.type == "KEYWORD" 30 | assert t2.value == "MARK" 31 | 32 | t3 = tokens[4] 33 | 34 | assert t3.type == "LITERAL" 35 | assert t3.value == "Placeholder" 36 | 37 | 38 | def test_tokenize_assign_literal(): 39 | lex = RitaLexer() 40 | lex.build() 41 | 42 | tokens = list(lex.tokenize('Test = "Test"')) 43 | 44 | assert len(tokens) == 3 45 | 46 | assert tokens[0].type == "NAME" 47 | assert tokens[1].type == "ASSIGN" 48 | assert tokens[2].type == "LITERAL" 49 | 50 | 51 | def test_tokenize_assign_macro(): 52 | lex = RitaLexer() 53 | lex.build() 54 | 55 | tokens = list(lex.tokenize('Test = WORD("Test")')) 56 | 57 | assert len(tokens) == 6 58 | 59 | assert tokens[0].type == "NAME" 60 | assert tokens[1].type == "ASSIGN" 61 | assert tokens[2].type == "KEYWORD" 62 | assert tokens[4].type == "LITERAL" 63 | 64 | 65 | def test_tokenize_exec_macro(): 66 | lex = RitaLexer() 67 | lex.build() 68 | tokens = list(lex.tokenize('!IMPORT("module.test")')) 69 | assert len(tokens) == 5 70 | assert tokens[0].type == "EXEC" 71 | assert tokens[1].type == "KEYWORD" 72 | assert tokens[3].type == "LITERAL" 73 | 74 | 75 | def test_tokenize_two_exec_macros(): 76 | lex = RitaLexer() 77 | lex.build() 78 | tokens = list( 79 | lex.tokenize( 80 | """ 81 | !CONFIG("setting.1", "1") 82 | !CONFIG("setting.2", "0") 83 | """ 84 | ) 85 | ) 86 | assert len(tokens) == 14 87 | assert tokens[0].type == "EXEC" 88 | assert tokens[1].type == "KEYWORD" 89 | assert tokens[3].type == "LITERAL" 90 | assert tokens[5].type == "LITERAL" 91 | 92 | assert tokens[7].type == "EXEC" 93 | assert tokens[8].type == "KEYWORD" 94 | assert tokens[10].type == "LITERAL" 95 | assert tokens[12].type == "LITERAL" 96 | 97 | 98 | def test_tokenize_list_w_one_item(): 99 | lex = RitaLexer() 100 | lex.build() 101 | 102 | tokens = list( 103 | lex.tokenize( 104 | """ 105 | members = { "first" } 106 | """ 107 | ) 108 | ) 109 | 110 | assert tokens[0].type == "NAME" 111 | assert tokens[1].type == "ASSIGN" 112 | assert tokens[3].type == "LITERAL" 113 | 114 | 115 | def test_tokenize_variable_w_escape(): 116 | lex = RitaLexer() 117 | lex.build() 118 | 119 | tokens = list( 120 | lex.tokenize(r'WORD("Hello \"World\"") -> MARK("GREETING")') 121 | ) 122 | 123 | print(tokens[2]) 124 | 125 | assert tokens[0].type == "KEYWORD" 126 | assert tokens[2].type == "LITERAL" 127 | assert tokens[4].type == "ARROW" 128 | assert 
tokens[5].type == "KEYWORD" 129 | 130 | 131 | def test_pattern_in_variable(): 132 | lex = RitaLexer() 133 | lex.build() 134 | 135 | tokens = list( 136 | lex.tokenize(r'COMPLEX_NUMBER = {NUM+, WORD("/")?, NUM}') 137 | ) 138 | 139 | assert len(tokens) == 14 140 | -------------------------------------------------------------------------------- /rita/macros.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import types 3 | 4 | from rita.utils import flatten, ExtendedOp 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def resolve_value(obj, config): 10 | logger.debug("Resolving value: {0}".format(obj)) 11 | 12 | if isinstance(obj, str): 13 | return obj 14 | 15 | elif isinstance(obj, tuple): 16 | return obj 17 | 18 | elif isinstance(obj, list): 19 | return obj 20 | 21 | elif isinstance(obj, types.GeneratorType): 22 | return "either", list(obj), ExtendedOp(None) 23 | 24 | return obj(config=config) 25 | 26 | 27 | def ANY(config, op=None): 28 | return "any", None, ExtendedOp(op) 29 | 30 | 31 | def PUNCT(config, op=None): 32 | return "punct", None, ExtendedOp(op) 33 | 34 | 35 | def MARK(type_, obj, config, op=None): 36 | return { 37 | "label": resolve_value(type_, config=config), 38 | "data": resolve_value(obj, config=config) 39 | } 40 | 41 | 42 | def LOAD(*args, config): 43 | fpath = resolve_value(args[0], config=config) 44 | with open(fpath, "r") as f: 45 | return list([line.strip() for line in f.readlines()]) 46 | 47 | 48 | def ASSIGN(k, v, config, op=None): 49 | logger.debug("Assigning: {0} -> {1}".format(k, v)) 50 | config.set_variable(k, resolve_value(v, config=config)) 51 | 52 | 53 | def IN_LIST(*args, config, op=None): 54 | return "any_of", [resolve_value(arg, config=config) 55 | for arg in flatten(args)], ExtendedOp(op) 56 | 57 | 58 | def PATTERN(*args, config, op=None): 59 | context = [] 60 | for arg in args: 61 | result = resolve_value(arg, config=config) 62 | if isinstance(result, list): 63 | context.append(NESTED(result, config, op)) 64 | else: 65 | context.append(result) 66 | 67 | return context 68 | 69 | 70 | def NESTED(children, config, op=None): 71 | return "nested", children, op 72 | 73 | 74 | def WORD(*args, config, op=None): 75 | if len(args) == 1: 76 | literal = resolve_value(args[0], config=config) 77 | return "value", literal, ExtendedOp(op) 78 | elif len(args) == 0: 79 | return "regex", r"((\w|['_-])+)", ExtendedOp(op) 80 | 81 | 82 | def NUM(*args, config, op=None): 83 | if len(args) == 1: 84 | literal = resolve_value(args[0], config=config) 85 | return "value", literal, ExtendedOp(op) 86 | elif len(args) == 0: 87 | return "regex", r"((\d+[\.,]\d+)|(\d+))", ExtendedOp(op) 88 | 89 | 90 | def POS(*args, config, op=None): 91 | if len(args) == 1: 92 | return "pos", resolve_value(args[0], config=config), ExtendedOp(op) 93 | else: 94 | return "pos", [resolve_value(arg, config=config) for arg in args], ExtendedOp(op) 95 | 96 | 97 | def LEMMA(name, config, op=None): 98 | return "lemma", resolve_value(name, config=config), ExtendedOp(op) 99 | 100 | 101 | def ENTITY(*args, config, op=None): 102 | if len(args) == 1: 103 | return "entity", resolve_value(args[0], config=config), ExtendedOp(op) 104 | else: 105 | return "entity", [resolve_value(arg, config=config) for arg in args], ExtendedOp(op) 106 | 107 | 108 | def PREFIX(name, config, op=None): 109 | return "prefix", resolve_value(name, config=config), ExtendedOp(op) 110 | 111 | 112 | def IMPORT(module, config): 113 | mod_name = resolve_value(module, config=config) 114 | 
config.register_module(mod_name) 115 | 116 | 117 | def CONFIG(setting, value, config): 118 | logger.debug("Config {0} -> {1}".format(setting, value)) 119 | config.set_config(setting, resolve_value(value, config=config)) 120 | 121 | 122 | def EXEC(obj, config): 123 | return resolve_value(obj, config=config) 124 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | Install it via `pip install rita-dsl` 3 | 4 | You can start defining rules by creating a file with the extension `*.rita` 5 | 6 | Below is a complete example which can be used as a reference point 7 | 8 | ``` 9 | cars = LOAD("examples/cars.txt") # Load items from file 10 | colors = {"red", "green", "blue", "white", "black"} # Declare items inline 11 | 12 | {IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR") # If first token is in list `colors` and second one is word `car`, label it 13 | 14 | {IN_LIST(cars), WORD+} -> MARK("CAR_MODEL") # If first token is in list `cars` and is followed by 1..N words, label it 15 | 16 | {ENTITY("PERSON"), LEMMA("like"), WORD} -> MARK("LIKED_ACTION") # If first token is a Person, followed by a word with lemma `like` and then any word, label it 17 | ``` 18 | 19 | Now you can compile these rules: `rita -f <your-rules>.rita output.jsonl` 20 | 21 | # Using compiled rules 22 | 23 | ## spaCy backend 24 | 25 | ### NEW in 0.4.0: Shortcuts to simplify life: 26 | ``` 27 | import spacy 28 | from rita.shortcuts import setup_spacy 29 | 30 | nlp = spacy.load("en_core_web_sm") 31 | setup_spacy(nlp, ...) 32 | ``` 33 | 34 | If compiling rules from a string: 35 | `setup_spacy(nlp, rules_string=rules)` 36 | If loading rules from a `.rita` file: 37 | `setup_spacy(nlp, rules_path="examples/color-car.rita")` 38 | If loading from spaCy compiled rules: 39 | `setup_spacy(nlp, patterns="rules.jsonl")` 40 | 41 | ### Doing it manually 42 | ```python 43 | import spacy 44 | from spacy.pipeline import EntityRuler 45 | 46 | nlp = spacy.load("en_core_web_sm") 47 | ruler = EntityRuler(nlp, overwrite_ents=True) 48 | ruler.from_disk("output.jsonl") 49 | nlp.add_pipe(ruler) 50 | ``` 51 | 52 | Every time you parse text with spaCy, it will run the usual workflow and apply these rules 53 | 54 | ```python 55 | text = """ 56 | Johny Silver was driving a red car. It was BMW X6 Mclass. Johny likes driving it very much. 
57 | """ 58 | 59 | doc = nlp(text) 60 | 61 | entities = [(e.text, e.label_) for e in doc.ents] 62 | print(entities) 63 | 64 | assert entities[0] == ("Johny Silver", "PERSON") # Normal NER 65 | assert entities[1] == ("red car", "CAR_COLOR") # Our first rule 66 | assert entities[2] == ("BMW X6 Mclass", "CAR_MODEL") # Our second rule 67 | assert entities[3] == ("Johny likes driving", "LIKED_ACTION") # Our third rule 68 | ``` 69 | 70 | Alternativelly, if `rita` is used as a dependency in project and you prefer to compile rules dynamically, you can do: 71 | 72 | ```python 73 | import rita 74 | import spacy 75 | from spacy.pipeline import EntityRuler 76 | 77 | nlp = spacy.load("en_core_web_sm") 78 | ruler = EntityRuler(nlp, overwrite_ents=True) 79 | 80 | patterns = rita.compile("examples/color-car.rita") 81 | 82 | ruler.add_patterns(patterns) 83 | nlp.add_pipe(ruler) 84 | ``` 85 | 86 | If you don't want to use file to store rules, they can be compiled directly from string 87 | 88 | ```python 89 | patterns = rita.compile_string(""" 90 | {WORD("Hello"), WORD("World")}->MARK("GREETING") 91 | """) 92 | ``` 93 | 94 | 95 | ## Standalone Version 96 | 97 | While it is highly recommended to use it with spaCy as a base, there can be cases when pure python regex is the only option. 98 | 99 | You can pass rule compilation function explicitly. This concrete function will build regular expressions and create executor which accepts raw text and returns list of results. 100 | 101 | Here's a test covering this case 102 | 103 | ```python 104 | def test_standalone_simple(): 105 | patterns = rita.compile("examples/simple-match.rita", use_engine="standalone") 106 | results = list(patterns.execute("Donald Trump was elected President in 2016 defeating Hilary Clinton.")) 107 | assert len(results) == 2 108 | entities = list([(r["text"], r["label"]) for r in results]) 109 | 110 | assert entities[0] == ("Donald Trump was elected", "WON_ELECTION") 111 | assert entities[1] == ("defeating Hilary Clinton", "LOST_ELECTION") 112 | ``` 113 | 114 | **Since version** `0.5.10`: custom regex implementation can be given. Either to boost performance, or to improve matches. By default, standard Python `re` is used. 
115 | 116 | It can be passed like this: 117 | 118 | ```python 119 | import rita 120 | import regex 121 | patterns = rita.compile("examples/simple-match.rita", use_engine="standalone", regex_impl=regex) 122 | ``` 123 | -------------------------------------------------------------------------------- /docs/assets/jetbrains.svg: -------------------------------------------------------------------------------- (SVG markup omitted - JetBrains logo) -------------------------------------------------------------------------------- /rita/engine/translate_rust.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from platform import system 5 | 6 | from ctypes import (c_char_p, c_int, c_uint, c_long, Structure, cdll, POINTER) 7 | from typing import Any, TYPE_CHECKING, Tuple, List, AnyStr 8 | 9 | from rita.engine.translate_standalone import rules_to_patterns, RuleExecutor 10 | from rita.types import Rules 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | field = Tuple[AnyStr, Any] 15 | fields = List[field] 16 | 17 | if TYPE_CHECKING: 18 | # We cannot simply import SessionConfig because of cyclic imports 19 | from rita.config import SessionConfig 20 | 21 | 22 | class NamedRangeResult(Structure): 23 | _fields_ = [ 24 | ("start", c_long), 25 | ("end", c_long), 26 | ("name", c_char_p), 27 | ] 28 | 29 | 30 | class ResultEntity(Structure): 31 | _fields_ = [ 32 | ("label", c_char_p), 33 | ("start", c_long), 34 | ("end", c_long), 35 | ("sub_count", c_uint), 36 | ] 37 | 38 | 39 | class Result(Structure): 40 | _fields_ = [ 41 | ("count", c_uint) 42 | ] 43 | 44 | 45 | class Context(Structure): 46 | _fields_: fields = [] 47 | 48 | 49 | def load_lib(): 50 | try: 51 | os_name = system() 52 | if os_name == "Windows": 53 | lib = cdll.LoadLibrary("rita_rust.dll") 54 | elif os_name == "Darwin": 55 | lib = cdll.LoadLibrary("librita_rust.dylib") 56 | else: 57 | lib = cdll.LoadLibrary("librita_rust.so") 58 | lib.compile.restype = POINTER(Context) 59 | lib.execute.argtypes = [POINTER(Context), c_char_p] 60 | lib.execute.restype = POINTER(Result) 61 | lib.clean_env.argtypes = [POINTER(Context)] 62 | lib.clean_result.argtypes = [POINTER(Result)] 63 | lib.read_result.argtypes = [POINTER(Result), c_int] 64 | lib.read_result.restype = POINTER(ResultEntity) 65 | lib.read_submatch.argtypes = [POINTER(ResultEntity), c_int] 66 | lib.read_submatch.restype = POINTER(NamedRangeResult) 67 | return lib 68 | except Exception as ex: 69 | logger.error("Failed to load rita-rust library, reason: {}\n\n" 70 | "Most likely you don't have the required shared library to use it".format(ex)) 71 | 72 | 73 | class RustRuleExecutor(RuleExecutor): 74 | def __init__(self, patterns, config: "SessionConfig"): 75 | self.config = config 76 | self.context = None 77 | 78 | self.lib = load_lib() 79 | self.patterns = [self._build_regex_str(label, rules) 80 | for label, rules in patterns] 81 | 82 | self.compile() 83 | 84 | @staticmethod 85 | def _build_regex_str(label, rules): 86 | indexed_rules = ["(?P<s{}>{})".format(i, r) if not r.startswith("(?P<") else r  # named submatch groups; the "s{}" name prefix is an assumption 87 | for i, r in enumerate(rules)] 88 | return r"(?P<{0}>{1})".format(label, "".join(indexed_rules)) 89 | 90 | def compile(self): 91 | flag = 0 if self.config.ignore_case else 1 92 | c_array = (c_char_p * len(self.patterns))(*list([p.encode("UTF-8") for p 
in self.patterns])) 93 | self.context = self.lib.compile(c_array, len(c_array), flag) 94 | return self.context 95 | 96 | def execute(self, text, include_submatches=True): 97 | result_ptr = self.lib.execute(self.context, text.encode("UTF-8")) 98 | count = result_ptr[0].count 99 | for i in range(0, count): 100 | match_ptr = self.lib.read_result(result_ptr, i) 101 | match = match_ptr[0] 102 | matched_text = text[match.start:match.end].strip() 103 | 104 | def parse_subs(): 105 | k = match.sub_count 106 | for j in range(0, k): 107 | s = self.lib.read_submatch(match_ptr, j)[0] 108 | start = s.start 109 | end = s.end 110 | sub_text = text[start:end] 111 | 112 | if sub_text.strip() == "": 113 | continue 114 | 115 | yield { 116 | "text": sub_text.strip(), 117 | "start": start, 118 | "end": end, 119 | "key": s.name.decode("UTF-8"), 120 | } 121 | 122 | yield { 123 | "start": match.start, 124 | "end": match.end, 125 | "text": matched_text, 126 | "label": match.label.decode("UTF-8"), 127 | "submatches": list(parse_subs()) if include_submatches else [] 128 | } 129 | 130 | def clean_context(self): 131 | self.lib.clean_env(self.context) 132 | 133 | @staticmethod 134 | def load(path): 135 | from rita.config import SessionConfig 136 | config = SessionConfig() 137 | with open(path, "r") as f: 138 | patterns = [(obj["label"], obj["rules"]) 139 | for obj in map(json.loads, f.readlines())] 140 | return RustRuleExecutor(patterns, config) 141 | 142 | 143 | def compile_rules(rules: Rules, config: "SessionConfig", **kwargs) -> RustRuleExecutor: 144 | logger.info("Using rita-rust rule implementation") 145 | patterns = [rules_to_patterns(*group, config=config) for group in rules] 146 | executor = RustRuleExecutor(patterns, config) 147 | return executor 148 | -------------------------------------------------------------------------------- /rita/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from contextlib import contextmanager 4 | from unicodedata import normalize, category 5 | from itertools import cycle, chain 6 | from time import time 7 | from json import JSONEncoder 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Node(object): 13 | """ 14 | A utility structure; it has no meaning outside this module. 
15 | It lets us specify a single path showing how it branches; 16 | by doing `unwrap` we get multiple lists, one for each possible variation 17 | """ 18 | def __init__(self, data=None): 19 | self.data = data 20 | self.children = [] 21 | self.next_node = None 22 | self.children_cycle = None 23 | self.ref_count = 0 24 | self.depth = 0 25 | self.current = None 26 | 27 | def add_child(self, c): 28 | self.children.append(Node(c)) 29 | self.reset_cycle() 30 | 31 | def add_next(self, node): 32 | self.next_node = node 33 | 34 | @property 35 | def child(self): 36 | # Corner case of 0 depth 37 | if self.depth == 0: 38 | result = self.current 39 | self.next_child() 40 | return result 41 | 42 | if self.ref_count >= self.depth: 43 | self.next_child() 44 | self.ref_count = 0 45 | else: 46 | self.ref_count += 1 47 | return self.current 48 | 49 | def next_child(self): 50 | self.current = next(self.children_cycle) 51 | 52 | def reset_cycle(self): 53 | self.children_cycle = cycle(self.children) 54 | self.current = next(self.children_cycle) 55 | 56 | def unwrap(self): 57 | variants = 1 58 | current = self 59 | while current is not None: 60 | variants *= current.weight 61 | current = current.next_node 62 | 63 | logger.debug("Total variants: {}".format(variants)) 64 | 65 | for i in range(0, variants): 66 | result = [] 67 | current = self 68 | while current is not None: 69 | if current.data: 70 | result.append(current.data) 71 | if len(current.children) > 0: 72 | c = current.child 73 | result.append(c.data) 74 | current = current.next_node 75 | yield result 76 | 77 | @property 78 | def weight(self): 79 | if len(self.children) == 0: 80 | return 1 81 | else: 82 | return len(self.children) 83 | 84 | def __repr__(self): 85 | return "{data}[{children}] -> {next_node}".format( 86 | data=self.data, 87 | children=", ".join(map(str, self.children)), 88 | next_node=str(self.next_node) 89 | ) 90 | 91 | 92 | class SingletonMixin(object): 93 | _instance = None 94 | 95 | def __new__(class_, *args, **kwargs): 96 | if not isinstance(class_._instance, class_): 97 | class_._instance = object.__new__(class_, *args, **kwargs) 98 | return class_._instance 99 | 100 | 101 | def deaccent(text): 102 | return normalize("NFC", 103 | "".join(c 104 | for c in normalize("NFD", text) 105 | if category(c) != "Mn")) 106 | 107 | 108 | def flatten(lst, shallow=False): 109 | def explode(v): 110 | if callable(v): 111 | return v() 112 | else: 113 | return v 114 | 115 | if len(lst) > 1 and not shallow: 116 | return lst 117 | 118 | new_lst = map(explode, lst) 119 | if shallow: 120 | return new_lst 121 | else: 122 | return chain(*new_lst) 123 | 124 | 125 | class ExtendedOp(object): 126 | def __init__(self, op=None): 127 | self.case_sensitive_override = False 128 | self.local_regex_override = False 129 | if isinstance(op, ExtendedOp): 130 | self.op = op.op 131 | self.case_sensitive_override = op.case_sensitive_override 132 | self.local_regex_override = op.local_regex_override 133 | else: 134 | self.op = op 135 | 136 | @property 137 | def value(self): 138 | return self.op 139 | 140 | def empty(self): 141 | return self.op is None or self.op.strip() == "" 142 | 143 | def ignore_case(self, config): 144 | if self.case_sensitive_override: 145 | return False 146 | else: 147 | return config.ignore_case 148 | 149 | def __str__(self): 150 | if self.op: 151 | return self.op 152 | return "" 153 | 154 | def __repr__(self): 155 | return str(self) 156 | 157 | def __eq__(self, other): 158 | if type(other) == str: 159 | return self.op == other 160 | 
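# Otherwise compare the full state: the operator plus both override flags.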
161 | return ( 162 | self.op == other.op and 163 | self.case_sensitive_override == other.case_sensitive_override and 164 | self.local_regex_override == other.local_regex_override 165 | ) 166 | 167 | 168 | class Timer(object): 169 | def __init__(self, title): 170 | self.title = title 171 | self.ts = time() 172 | 173 | def stop(self, debug=True): 174 | now = time() 175 | delta = int((now - self.ts) * 1000)  # convert to ms before truncating 176 | msg = "{} took {}ms".format(self.title, delta) 177 | if debug: 178 | logger.debug(msg) 179 | else: 180 | logger.info(msg) 181 | return delta 182 | 183 | 184 | @contextmanager 185 | def timer(title): 186 | t = Timer(title) 187 | yield 188 | t.stop() 189 | 190 | 191 | class RitaJSONEncoder(JSONEncoder): 192 | def default(self, o): 193 | if isinstance(o, ExtendedOp): 194 | return o.op 195 | return o.__dict__ 196 | -------------------------------------------------------------------------------- /rita/engine/translate_spacy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from functools import partial 4 | from typing import Any, TYPE_CHECKING, Mapping, Callable, Generator, AnyStr 5 | 6 | from rita.utils import ExtendedOp 7 | from rita.types import Rules, Patterns 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | SpacyPattern = Generator[Mapping[AnyStr, Any], None, None] 12 | ParseFn = Callable[[Any, "SessionConfig", ExtendedOp], SpacyPattern] 13 | 14 | if TYPE_CHECKING: 15 | # We cannot simply import SessionConfig because of cyclic imports 16 | from rita.config import SessionConfig 17 | 18 | 19 | def any_of_parse(lst, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 20 | if op.ignore_case(config): 21 | normalized = sorted([item.lower() 22 | for item in lst]) 23 | base = {"LOWER": {"IN": normalized}} 24 | else: 25 | base = {"LOWER": {"IN": sorted(lst)}} 26 | 27 | if not op.empty(): 28 | base["OP"] = op.value 29 | yield base 30 | 31 | 32 | def regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 33 | if op.ignore_case(config): 34 | d = {"LOWER": {"REGEX": r.lower()}} 35 | else: 36 | d = {"TEXT": {"REGEX": r}} 37 | 38 | if not op.empty(): 39 | d["OP"] = op.value 40 | yield d 41 | 42 | 43 | def fuzzy_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 44 | # TODO: build permutations 45 | d = {"LOWER": {"REGEX": "({0})[.,?;!]?".format("|".join(r))}} 46 | if not op.empty(): 47 | d["OP"] = op.value 48 | yield d 49 | 50 | 51 | def generic_parse(tag, value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 52 | d = {} 53 | if isinstance(value, list) and len(value) > 1: 54 | value = {"IN": value} 55 | 56 | d[tag] = value 57 | 58 | if not op.empty(): 59 | d["OP"] = op.value 60 | yield d 61 | 62 | 63 | def entity_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 64 | tag = "ENT_TYPE" 65 | if op.empty(): 66 | op.op = "+" 67 | return generic_parse(tag, value, config, op) 68 | 69 | 70 | def punct_parse(_, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 71 | d = dict() 72 | d["IS_PUNCT"] = True 73 | if not op.empty(): 74 | d["OP"] = op.value 75 | yield d 76 | 77 | 78 | def any_parse(_, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 79 | d = dict() 80 | if not op.empty(): 81 | d["OP"] = op.value 82 | yield d 83 | 84 | 85 | def phrase_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 86 | """ 87 | TODO: Does not support operators 88 | """ 89 | splitter = next((s for s in ["-", " "] 90 | if s in value), None) 91 | if splitter: 92 
| buff = value.split(splitter) 93 | yield next(orth_parse(buff[0], config=config, op=ExtendedOp())) 94 | for b in buff[1:]: 95 | if splitter != " ": 96 | yield next(orth_parse(splitter, config=config, op=ExtendedOp())) 97 | yield next(orth_parse(b, config=config, op=ExtendedOp())) 98 | else: 99 | yield next(orth_parse(value, config=config, op=ExtendedOp())) 100 | 101 | 102 | def tag_parse(values, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 103 | """ 104 | For generating POS/TAG patterns based on a Regex 105 | e.g. TAG("^NN|^JJ") for adjectives or nouns 106 | also deals with TAG_WORD for tag and word or tag and list 107 | """ 108 | d = {"TAG": {"REGEX": values["tag"]}} 109 | if "word" in values: 110 | if op.ignore_case(config): 111 | d["LOWER"] = values["word"].lower() 112 | else: 113 | d["TEXT"] = values["word"] 114 | elif "list" in values: 115 | lst = values["list"] 116 | if op.ignore_case(config): 117 | normalized = sorted([item.lower() 118 | for item in lst]) 119 | d["LOWER"] = {"REGEX": r"^({0})$".format("|".join(normalized))} 120 | else: 121 | d["TEXT"] = {"REGEX": r"^({0})$".format("|".join(sorted(lst)))} 122 | if not op.empty(): 123 | d["OP"] = op.value 124 | yield d 125 | 126 | 127 | def nested_parse(values, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 128 | from rita.macros import resolve_value 129 | results = rules_to_patterns("", [resolve_value(v, config=config) 130 | for v in values], config=config) 131 | return results["pattern"] 132 | 133 | 134 | def orth_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 135 | d = {} 136 | 137 | if op.ignore_case(config): 138 | d["LOWER"] = value.lower() 139 | else: 140 | d["ORTH"] = value 141 | 142 | if not op.empty(): 143 | d["OP"] = op.value 144 | yield d 145 | 146 | 147 | PARSERS: Mapping[str, ParseFn] = { 148 | "any_of": any_of_parse, 149 | "any": any_parse, 150 | "value": orth_parse, 151 | "regex": regex_parse, 152 | "entity": entity_parse, 153 | "lemma": partial(generic_parse, "LEMMA"), 154 | "pos": partial(generic_parse, "POS"), 155 | "punct": punct_parse, 156 | "fuzzy": fuzzy_parse, 157 | "phrase": phrase_parse, 158 | "tag": tag_parse, 159 | "nested": nested_parse, 160 | "orth": orth_parse, 161 | } 162 | 163 | 164 | def rules_to_patterns(label: str, data: Patterns, config: "SessionConfig"): 165 | logger.debug(data) 166 | return { 167 | "label": label, 168 | "pattern": [p 169 | for (t, d, op) in data 170 | for p in PARSERS[t](d, config, ExtendedOp(op))], 171 | } 172 | 173 | 174 | def compile_rules(rules: Rules, config: "SessionConfig", **kwargs): 175 | logger.info("Using spaCy rules implementation") 176 | return [rules_to_patterns(label, patterns, config=config) 177 | for (label, patterns) in rules] 178 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 0.7.0 (2021-02-02) 2 | **************************** 3 | 4 | Features 5 | -------- 6 | 7 | - The `standalone` engine will now return a submatches list containing start and end for each part of a match 8 | #93 9 | - Partially covered https://github.com/zaibacu/rita-dsl/issues/70 10 | 11 | Allow nested patterns, like: 12 | 13 | .. 
code-block:: 14 | 15 | num_with_fractions = {NUM, WORD("-")?, IN_LIST(fractions)} 16 | complex_number = {NUM|PATTERN(num_with_fractions)} 17 | 18 | {PATTERN(complex_number)}->MARK("NUMBER") 19 | #95 20 | - Submatches for rita-rust engine 21 | #96 22 | - Regex module which allows to specify word pattern, eg. `REGEX(^a)` means word must start with letter "a" 23 | 24 | Implemented by: Roland M. Mueller (https://github.com/rolandmueller) 25 | #101 26 | - ORTH module which allows you to specify case sensitive entry while rest of the rules ignores case. Used for acronyms and proper names 27 | 28 | Implemented by: Roland M. Mueller (https://github.com/rolandmueller) 29 | #102 30 | - Additional macro for `tag` module, allowing to tag specific word/list of words 31 | 32 | Implemented by: Roland M. Mueller (https://github.com/rolandmueller) 33 | #103 34 | - Added `names` module which allows to generate person names variations 35 | #105 36 | - spaCy v3 Support 37 | #109 38 | 39 | Fix 40 | --- 41 | 42 | - Optimizations for Rust Engine 43 | 44 | - No need for passing text forward and backward, we can calculate from text[start:end] 45 | 46 | - Grouping and sorting logic can be done in binary code 47 | #88 48 | - Fix NUM parsing bug 49 | #90 50 | - Switch from `(^\s)` to `\b` when doing `IN_LIST`. Should solve several corner cases 51 | #91 52 | - Fix floating point number matching 53 | #92 54 | - revert #91 changes. Keep old way for word boundary 55 | #94 56 | 57 | 58 | 0.6.0 (2020-08-29) 59 | **************************** 60 | 61 | Features 62 | -------- 63 | 64 | - Implemented ability to alias macros, eg.: 65 | 66 | .. code-block:: 67 | 68 | numbers = {"one", "two", "three"} 69 | @alias IN_LIST IL 70 | 71 | IL(numbers) -> MARK("NUMBER") 72 | 73 | Now using "IL" will actually call "IN_LIST" macro. 74 | #66 75 | - introduce the TAG element as a module. Needs a new parser for the SpaCy translate. 76 | Would allow more flexible matching of detailed part-of-speech tag, like all adjectives or nouns: TAG("^NN|^JJ"). 77 | 78 | Implemented by: 79 | Roland M. Mueller (https://github.com/rolandmueller) 80 | #81 81 | - Add a new module for a PLURALIZE tag 82 | For a noun or a list of nouns, it will match any singular or plural word. 83 | 84 | Implemented by: 85 | Roland M. Mueller (https://github.com/rolandmueller) 86 | #82 87 | - Add a new Configuration implicit_hyphon (default false) for automatically adding hyphon characters - to the rules. 88 | 89 | Implemented by: 90 | Roland M. Mueller (https://github.com/rolandmueller) 91 | #84 92 | - Allow to give custom regex impl. By default `re` is used 93 | #86 94 | - An interface to be able to use rust engine. 95 | 96 | In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code and that provides large performance boost. 97 | It is proprietary, because there are various caveats, engine itself is a bit more fragile and needs to be tinkered to be optimized to very specific case 98 | (eg. few long texts with many matches vs a lot short texts with few matches). 
99 | #87 100 | 101 | Fix 102 | --- 103 | 104 | - Fix `-` bug when it is used as a standalone word 105 | #71 106 | - Fix regex matching, when shortest word is selected from IN_LIST 107 | #72 108 | - Fix IN_LIST regex so that it wouldn't take part of word 109 | #75 110 | - Fix IN_LIST operation bug - it was ignoring them 111 | #77 112 | - Use list branching only when using spaCy Engine 113 | #80 114 | 115 | 116 | 0.5.0 (2020-06-18) 117 | **************************** 118 | 119 | Features 120 | -------- 121 | 122 | - Added `PREFIX` macro which allows to attach word in front of list items or words 123 | #47 124 | - Allow to pass variables directly when doing `compile` and `compile_string` 125 | #51 126 | - Allow to compile (and later load) rules using rita CLI while using standalone engine (spacy is already supported) 127 | #53 128 | - Added ability to import rule files into rule file. Recursive import is supported as well. 129 | #55 130 | - Added possibility to define pattern as a variable and reuse it in other patterns: 131 | 132 | Example: 133 | .. code-block:: RITA 134 | 135 | ComplexNumber = {NUM+, WORD("/")?, NUM?} 136 | 137 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT") 138 | 139 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH") 140 | #64 141 | 142 | Fix 143 | --- 144 | 145 | - Fix issue with multiple wildcard words using standalone engine 146 | #46 147 | - Don't crash when no rules are provided 148 | #50 149 | - Fix Number and ANY-OF parsing 150 | #59 151 | - Allow escape characters inside LITERAL 152 | #62 153 | 154 | 155 | 0.4.0 (2020-01-25) 156 | **************************** 157 | 158 | Features 159 | -------- 160 | 161 | - Support for deaccent. In general, if accented version of word is given, both deaccented and accented will be used to match. To turn it off - `!CONFIG("deaccent", "N")` 162 | #38 163 | - Added shortcuts module to simplify injecting into spaCy 164 | #42 165 | 166 | Fix 167 | --- 168 | 169 | - Fix issue regarding Spacy rules with `IN_LIST` and using case-sensitive mode. 
It was creating Regex pattern which is not valid spacy pattern 170 | #40 171 | 172 | 173 | 0.3.2 (2019-12-19) 174 | *********************** 175 | 176 | Features 177 | -------- 178 | 179 | - - Introduced `towncrier` to track changes 180 | - Added linter `flake8` 181 | - Refactored code to match `pep8` 182 | #32 183 | 184 | Fix 185 | --- 186 | 187 | - - Fix WORD split by `-` 188 | 189 | - Split by ` ` (empty space) as well 190 | 191 | - Coverage score increase 192 | #35 193 | 194 | 195 | -------------------------------------------------------------------------------- /rita/parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import ply.yacc as yacc 4 | 5 | from functools import partial 6 | 7 | from rita.lexer import RitaLexer 8 | from rita import macros 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def stub(*args, **kwargs): 14 | return None 15 | 16 | 17 | def either(a, b): 18 | yield a 19 | yield b 20 | 21 | 22 | def load_macro(name, config): 23 | try: 24 | return partial(getattr(macros, name), config=config) 25 | except Exception: 26 | pass 27 | 28 | def lazy_load(*args, **kwargs): 29 | logger.info(config.modules) 30 | for mod in config.modules: 31 | try: 32 | fn = getattr(mod, name) 33 | return fn(*args, **kwargs) 34 | except Exception as ex: 35 | logger.error(ex) 36 | continue 37 | 38 | raise RuntimeError("MACRO {} not loaded".format(name)) 39 | 40 | return lazy_load 41 | 42 | 43 | def var_wrapper(variable, config): 44 | def wrapper(*args, **kwargs): 45 | logger.debug("Variables: {}".format(config.variables)) 46 | return config.get_variable(variable) 47 | 48 | return wrapper 49 | 50 | 51 | class RitaParser(object): 52 | tokens = RitaLexer.tokens 53 | precedence = ( 54 | ("nonassoc", "ARROW"), 55 | ("nonassoc", "PIPE"), 56 | ("nonassoc", "COMMA"), 57 | ("left", "EXEC"), 58 | ("left", "ASSIGN"), 59 | ("left", "RBRACKET", "LBRACKET", "LPAREN", "RPAREN"), 60 | ("left", "KEYWORD", "NAME", "LITERAL"), 61 | ("right", "MODIF_QMARK", "MODIF_STAR", "MODIF_PLUS"), 62 | ) 63 | 64 | def __init__(self, config): 65 | self.config = config 66 | self.lexer = None 67 | self.parser = None 68 | 69 | def p_document(self, p): 70 | """ 71 | DOCUMENT : MACRO_CHAIN 72 | | MACRO_EXEC 73 | | VARIABLE 74 | """ 75 | logger.debug("Building initial document {}".format(p[1])) 76 | p[0] = [p[1]] 77 | 78 | def p_document_list(self, p): 79 | """ 80 | DOCUMENT : DOCUMENT MACRO_CHAIN 81 | | DOCUMENT MACRO_EXEC 82 | | DOCUMENT VARIABLE 83 | """ 84 | logger.debug("Extending document {}".format(p[2])) 85 | p[0] = p[1] + [p[2]] 86 | 87 | def p_macro_chain(self, p): 88 | " MACRO_CHAIN : MACRO ARROW MACRO " 89 | logger.debug("Have {0} -> {1}".format(p[1], p[3])) 90 | p[0] = partial( 91 | p[3], 92 | macros.PATTERN(p[1], config=self.config), 93 | config=self.config 94 | ) 95 | 96 | def p_macro_chain_from_array(self, p): 97 | " MACRO_CHAIN : ARRAY ARROW MACRO " 98 | logger.debug("Have {0} -> {1}".format(p[1], p[3])) 99 | p[0] = partial( 100 | p[3], 101 | macros.PATTERN(*p[1], config=self.config), 102 | config=self.config 103 | ) 104 | 105 | def p_macro_exec(self, p): 106 | " MACRO_EXEC : EXEC MACRO " 107 | logger.debug("Exec {0}".format(p[2])) 108 | macros.EXEC(p[2], config=self.config) 109 | p[0] = stub 110 | 111 | def p_macro_w_modif(self, p): 112 | """ 113 | MACRO : MACRO MODIF_PLUS 114 | | MACRO MODIF_STAR 115 | | MACRO MODIF_QMARK 116 | | MACRO EXEC 117 | """ 118 | logger.debug("Adding modifier to Macro {}".format(p[1])) 119 | fn = p[1] 120 | p[0] = 
partial(fn, op=p[2]) 121 | 122 | def p_macro_wo_args(self, p): 123 | " MACRO : KEYWORD " 124 | fn = load_macro(p[1], config=self.config) 125 | logger.debug("Parsing macro (w/o args): {}".format(p[1])) 126 | p[0] = fn 127 | 128 | def p_macro_w_args(self, p): 129 | " MACRO : KEYWORD LPAREN ARGS RPAREN " 130 | logger.debug("Parsing macro: {0}, args: {1}".format(p[1], p[3])) 131 | fn = load_macro(p[1], config=self.config) 132 | p[0] = partial(fn, *p[3]) 133 | 134 | def p_macro_from_array(self, p): 135 | """ 136 | MACRO : KEYWORD ARRAY 137 | | KEYWORD ARG_ARRAY 138 | """ 139 | logger.debug("Parsing macro: {0}, args: {1}".format(p[1], p[2])) 140 | fn = load_macro(p[1], config=self.config) 141 | p[0] = partial(fn, *p[2]) 142 | 143 | def p_array(self, p): 144 | " ARRAY : LBRACKET ARGS RBRACKET " 145 | p[0] = p[2] 146 | 147 | def p_arg_array(self, p): 148 | " ARG_ARRAY : LPAREN ARGS RPAREN " 149 | p[0] = p[2] 150 | 151 | def p_variable(self, p): 152 | " VARIABLE_NAME : NAME " 153 | p[0] = var_wrapper(p[1], self.config) 154 | 155 | def p_variable_from_args(self, p): 156 | " VARIABLE : NAME ASSIGN ARGS " 157 | if len(p[3]) == 1: 158 | macros.ASSIGN(p[1], p[3][0], config=self.config) 159 | else: 160 | macros.ASSIGN(p[1], p[3], config=self.config) 161 | 162 | p[0] = stub 163 | 164 | def p_either(self, p): 165 | " ARG : ARG PIPE ARG " 166 | p[0] = either(p[1], p[3]) 167 | 168 | def p_arg_list(self, p): 169 | " ARGS : ARGS COMMA ARG " 170 | p[0] = p[1] + [p[3]] 171 | 172 | def p_args(self, p): 173 | " ARGS : ARG " 174 | p[0] = [p[1]] 175 | 176 | def p_arg(self, p): 177 | " ARG : LITERAL " 178 | p[0] = p[1] 179 | 180 | def p_arg_from_macro(self, p): 181 | " ARG : MACRO " 182 | p[0] = p[1] 183 | 184 | def p_arg_from_var(self, p): 185 | " ARG : VARIABLE_NAME " 186 | p[0] = p[1]() 187 | 188 | def p_arg_from_array(self, p): 189 | " ARGS : ARRAY " 190 | p[0] = p[1] 191 | 192 | def p_error(self, p): 193 | if p: 194 | logger.error("Syntax error at '{}'".format(p.value)) 195 | else: 196 | logger.error("p is null") 197 | 198 | def build(self, **kwargs): 199 | self.lexer = RitaLexer().build(**kwargs) 200 | self.parser = yacc.yacc(module=self, errorlog=logger, **kwargs) 201 | 202 | def parse(self, data): 203 | if data.strip() == "": 204 | return [] 205 | 206 | return self.parser.parse(r"{}".format(data), lexer=self.lexer, debug=logger) 207 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | 4 | from rita.parser import RitaParser 5 | from rita.config import SessionConfig 6 | from rita.utils import ExtendedOp 7 | 8 | 9 | @pytest.fixture 10 | def config(): 11 | return SessionConfig() 12 | 13 | 14 | def test_parser_empty_rules(config): 15 | p = RitaParser(config) 16 | p.build() 17 | results = p.parse("") 18 | assert len(results) == 0 19 | 20 | 21 | def test_parser_any_macro_wo_args_w_type(config): 22 | p = RitaParser(config) 23 | p.build() 24 | 25 | results = p.parse('ANY -> MARK("PlaceHolder")') 26 | assert len(results) == 1 27 | 28 | 29 | def test_parser_any_macro_w_args_w_type(config): 30 | p = RitaParser(config) 31 | p.build() 32 | 33 | results = p.parse('{WORD("arg1")} -> MARK("PlaceHolder")') 34 | assert len(results) == 1 35 | 36 | 37 | def test_parser_nested_macro(config): 38 | p = RitaParser(config) 39 | p.build() 40 | 41 | results = p.parse('{ANY, WORD("test")} -> MARK("Test")') 42 | assert len(results) == 1 43 | for result in results: 44 
| print(result()) 45 | 46 | 47 | def test_parser_assign_literal_and_ignore_it(config): 48 | p = RitaParser(config) 49 | p.build(debug=True) 50 | 51 | results = p.parse( 52 | """ 53 | my_variable = "Test" 54 | 55 | {WORD("something")} -> MARK("TEST") 56 | """ 57 | ) 58 | assert len(results) == 2 59 | 60 | rules = results[1]() 61 | 62 | print(rules) 63 | assert {"label": "TEST", "data": [("value", "something", ExtendedOp())]} == rules 64 | 65 | 66 | def test_parser_assign_literal_and_use_it(config): 67 | p = RitaParser(config) 68 | p.build(debug=True) 69 | 70 | results = p.parse( 71 | """ 72 | my_variable = "Test" 73 | 74 | {WORD(my_variable)} -> MARK("TEST") 75 | """ 76 | ) 77 | assert len(results) == 2 78 | 79 | rules = results[1]() 80 | 81 | print(rules) 82 | assert {"label": "TEST", "data": [("value", "Test", ExtendedOp())]} == rules 83 | 84 | 85 | def test_parser_just_assign_macro(config): 86 | p = RitaParser(config) 87 | p.build(debug=True) 88 | 89 | results = p.parse( 90 | """ 91 | x = WORD("Test") 92 | """ 93 | ) 94 | assert len(results) == 1 95 | 96 | 97 | def test_parser_assign_two_variables(config): 98 | p = RitaParser(config) 99 | p.build(debug=True) 100 | 101 | results = p.parse( 102 | """ 103 | a = "A" 104 | b = "B" 105 | """ 106 | ) 107 | assert len(results) == 2 108 | 109 | 110 | def test_parser_assign_macro_and_use_it(config): 111 | p = RitaParser(config) 112 | p.build(debug=True) 113 | 114 | results = p.parse( 115 | """ 116 | my_variable = WORD("Test") 117 | 118 | {my_variable} -> MARK("TEST") 119 | """ 120 | ) 121 | assert len(results) == 2 122 | 123 | rules = results[1]() 124 | 125 | print(rules) 126 | assert {"label": "TEST", "data": [("value", "Test", ExtendedOp())]} == rules 127 | 128 | 129 | def test_parser_import_module(config): 130 | p = RitaParser(config) 131 | p.build(debug=True) 132 | 133 | results = p.parse( 134 | """ 135 | IMPORT("rita.modules.fuzzy") -> EXEC 136 | 137 | FUZZY("test") -> MARK("FUZZY_MATCH") 138 | """ 139 | ) 140 | 141 | assert len(results) == 2 142 | 143 | 144 | def test_parser_import_module_shortcut(config, caplog): 145 | caplog.set_level(logging.INFO) 146 | p = RitaParser(config) 147 | p.build(debug=True) 148 | 149 | results = p.parse( 150 | """ 151 | !IMPORT("rita.modules.fuzzy") 152 | 153 | FUZZY("test") -> MARK("FUZZY_MATCH") 154 | """ 155 | ) 156 | 157 | assert len(results) == 2 158 | 159 | 160 | def test_parser_config(config): 161 | p = RitaParser(config) 162 | p.build(debug=True) 163 | 164 | p.parse( 165 | """ 166 | !CONFIG("foo", "bar") 167 | !CONFIG("testing", "1") 168 | """ 169 | ) 170 | 171 | assert config.foo == "bar" 172 | assert config.testing 173 | 174 | 175 | def test_parser_list_w_one_item(config): 176 | p = RitaParser(config) 177 | p.build(debug=True) 178 | 179 | results = p.parse( 180 | """ 181 | members = { "one" } 182 | 183 | IN_LIST(members) -> MARK("MEMBER") 184 | """ 185 | ) 186 | 187 | assert len(results) == 2 188 | 189 | 190 | def test_parser_list_w_two_items(config): 191 | p = RitaParser(config) 192 | p.build(debug=True) 193 | 194 | results = p.parse( 195 | """ 196 | members = {"one", "two"} 197 | 198 | IN_LIST(members) -> MARK("MEMBER") 199 | """ 200 | ) 201 | 202 | assert len(results) == 2 203 | 204 | 205 | def test_parser_literal_w_escape(config): 206 | p = RitaParser(config) 207 | p.build(debug=True) 208 | 209 | results = p.parse( 210 | r'WORD("Hello \"WORLD\"") -> MARK("TEST")' 211 | ) 212 | 213 | assert len(results) == 1 214 | 215 | 216 | def test_parser_pattern_in_variable(config): 217 | p = RitaParser(config) 
218 | p.build(debug=True) 219 | 220 | results = p.parse( 221 | ''' 222 | Complex_Number = { NUM+, WORD("/")?, NUM? } 223 | {PATTERN(Complex_Number), WORD("inch")}->MARK("WIDTH") 224 | ''' 225 | ) 226 | 227 | print(results) 228 | assert len(results) == 2 229 | 230 | 231 | def test_pattern_with_escaped_characters(config): 232 | p = RitaParser(config) 233 | p.build(debug=True) 234 | 235 | results = p.parse( 236 | ''' 237 | special = { '"', "*", "-" } 238 | IN_LIST(special)->MARK("TEST") 239 | ''' 240 | ) 241 | 242 | assert len(results) > 0 243 | 244 | rules = results[1]() 245 | 246 | assert {"label": "TEST", "data": [("any_of", ["\"", "*", "-"], ExtendedOp())]} == rules 247 | 248 | 249 | def test_parser_array_as_argument(config): 250 | p = RitaParser(config) 251 | p.build(debug=True) 252 | 253 | results = p.parse( 254 | ''' 255 | special = { '"', "*", "-" } 256 | POS(special)->MARK("TEST") 257 | ''' 258 | ) 259 | 260 | assert len(results) > 0 261 | rules = results[1]() 262 | assert {"label": "TEST", "data": [("pos", ["\"", "*", "-"], ExtendedOp())]} == rules 263 | 264 | 265 | def test_parser_inline_array_as_argument(config): 266 | p = RitaParser(config) 267 | p.build(debug=True) 268 | 269 | results = p.parse( 270 | ''' 271 | POS('"', "*", "-")->MARK("TEST") 272 | ''' 273 | ) 274 | 275 | assert len(results) > 0 276 | rules = results[0]() 277 | assert {"label": "TEST", "data": [("pos", ["\"", "*", "-"], ExtendedOp())]} == rules 278 | 279 | 280 | def test_parser_inline_array_as_inlist_argument(config): 281 | p = RitaParser(config) 282 | p.build(debug=True) 283 | 284 | results = p.parse( 285 | ''' 286 | IN_LIST('one', "two", "three")->MARK("TEST") 287 | ''' 288 | ) 289 | 290 | assert len(results) > 0 291 | rules = results[0]() 292 | assert {"label": "TEST", "data": [("any_of", ["one", "two", "three"], ExtendedOp())]} == rules 293 | -------------------------------------------------------------------------------- /rita/engine/translate_standalone.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import json 4 | 5 | from functools import partial 6 | from itertools import groupby, chain 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | from typing import Any, TYPE_CHECKING, Mapping, Callable 9 | 10 | from rita.utils import ExtendedOp 11 | from rita.types import Rules, Patterns 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | ParseFn = Callable[[Any, "SessionConfig", ExtendedOp], str] 16 | 17 | 18 | if TYPE_CHECKING: 19 | # We cannot simply import SessionConfig because of cyclic imports 20 | from rita.config import SessionConfig 21 | 22 | 23 | def apply_operator(syntax, op: ExtendedOp) -> str: 24 | if op.empty(): 25 | return syntax 26 | 27 | elif str(op) == "!": # A bit complicated one 28 | return (r"((?!{})\w+)".format(syntax 29 | .rstrip(")") 30 | .lstrip("("))) 31 | else: 32 | return syntax + str(op) 33 | 34 | 35 | def any_of_parse(lst, config: "SessionConfig", op: ExtendedOp) -> str: 36 | clause = r"((^|\s)(({0})\s?))".format("|".join(sorted(lst, key=lambda x: (-len(x), x)))) 37 | return apply_operator(clause, op) 38 | 39 | 40 | def regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> str: 41 | if op.local_regex_override: 42 | return local_regex_parse(r, config, op) 43 | else: 44 | initial = "(" + r + r"\s?" 
+ ")" 45 | return apply_operator(initial, op) 46 | 47 | 48 | def local_regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> str: 49 | if r[0] == "^" and r[-1] == "$": # Fully strictly defined string? 50 | pattern = r[1:-1] 51 | elif r[0] == "^": # We define start of the string 52 | pattern = r[1:] + r"\w*" 53 | elif r[-1] == "$": # We define end of string 54 | pattern = r"\w*" + r[:-1] 55 | else: # We define string inside word 56 | pattern = r"\w*" + r + r"\w*" 57 | 58 | initial = "(" + r"\b" + pattern + r"\b" + r"\s?" + ")" 59 | return apply_operator(initial, op) 60 | 61 | 62 | def not_supported(key, *args, **kwargs) -> str: 63 | raise RuntimeError( 64 | "Rule '{0}' is not supported in standalone mode" 65 | .format(key) 66 | ) 67 | 68 | 69 | def person_parse(config: "SessionConfig", op: ExtendedOp) -> str: 70 | return apply_operator(r"([A-Z]\w+\s?)", op) 71 | 72 | 73 | def entity_parse(value, config: "SessionConfig", op: ExtendedOp) -> str: 74 | if value == "PERSON": 75 | return person_parse(config, op=op) 76 | else: 77 | return not_supported(value) 78 | 79 | 80 | def punct_parse(_, config: "SessionConfig", op: ExtendedOp) -> str: 81 | return apply_operator(r"([.,!;?:]\s?)", op) 82 | 83 | 84 | def word_parse(value, config: "SessionConfig", op: ExtendedOp) -> str: 85 | initial = r"({}\s?)".format(value) 86 | return apply_operator(initial, op) 87 | 88 | 89 | def fuzzy_parse(r, config: "SessionConfig", op: ExtendedOp) -> str: 90 | # TODO: build premutations 91 | return apply_operator(r"({0})[.,?;!]?".format("|".join(r)), op) 92 | 93 | 94 | def phrase_parse(value, config: "SessionConfig", op: ExtendedOp) -> str: 95 | return apply_operator(r"({}\s?)".format(value), op) 96 | 97 | 98 | def nested_parse(values, config: "SessionConfig", op: ExtendedOp) -> str: 99 | from rita.macros import resolve_value 100 | (_, patterns) = rules_to_patterns("", [resolve_value(v, config=config) 101 | for v in values], config=config) 102 | return r"(?P{})".format(config.new_nested_group_id(), "".join(patterns)) 103 | 104 | 105 | def any_parse(_, config: "SessionConfig", op: ExtendedOp) -> str: 106 | return regex_parse(r".*", config, op) 107 | 108 | 109 | PARSERS: Mapping[str, ParseFn] = { 110 | "any_of": any_of_parse, 111 | "any": any_parse, 112 | "value": word_parse, 113 | "regex": regex_parse, 114 | "entity": entity_parse, 115 | "lemma": partial(not_supported, "LEMMA"), 116 | "pos": partial(not_supported, "POS"), 117 | "punct": punct_parse, 118 | "fuzzy": fuzzy_parse, 119 | "phrase": phrase_parse, 120 | "nested": nested_parse, 121 | } 122 | 123 | 124 | def rules_to_patterns(label: str, data: Patterns, config: "SessionConfig"): 125 | logger.debug("data: {}".format(data)) 126 | 127 | def gen(): 128 | """ 129 | Implicitly add spaces between rules 130 | """ 131 | if len(data) == 0: 132 | return 133 | 134 | yield data[0] 135 | 136 | for (t, d, op) in data[1:]: 137 | yield t, d, op 138 | 139 | return ( 140 | label, 141 | [PARSERS[t](d, config, op) for (t, d, op) in gen()], 142 | ) 143 | 144 | 145 | class RuleExecutor(object): 146 | def __init__(self, patterns, config, regex_impl=re, max_workers=4): 147 | self.config = config 148 | self.regex_impl = regex_impl 149 | self.patterns = [self.compile(label, rules) 150 | for label, rules in patterns] 151 | self.raw_patterns = patterns 152 | self.max_workers = max_workers 153 | 154 | def compile(self, label, rules): 155 | flags = self.regex_impl.DOTALL 156 | if self.config.ignore_case: 157 | flags = flags | self.regex_impl.IGNORECASE 158 | 159 | indexed_rules = 
["(?P{})".format(i, r) if not r.startswith("(?P<") else r 160 | for i, r in enumerate(rules)] 161 | regex_str = r"(?P<{0}>{1})".format(label, "".join(indexed_rules)) 162 | try: 163 | return self.regex_impl.compile(regex_str, flags) 164 | except Exception as ex: 165 | logger.exception("Failed to compile: '{0}', Reason: \n{1}".format(regex_str, str(ex))) 166 | return None 167 | 168 | def _match_task(self, pattern, text, include_submatches): 169 | def gen(): 170 | for match in pattern.finditer(text): 171 | def submatches(): 172 | for k, v in match.groupdict().items(): 173 | if not v or v.strip() == "": 174 | continue 175 | yield { 176 | "key": k, 177 | "text": v.strip(), 178 | "start": match.start(k), 179 | "end": match.end(k) 180 | } 181 | 182 | yield { 183 | "start": match.start(), 184 | "end": match.end(), 185 | "text": match.group().strip(), 186 | "label": match.lastgroup, 187 | "submatches": sorted(list(submatches()), key=lambda x: x["start"]) if include_submatches else [] 188 | } 189 | return list(gen()) 190 | 191 | def _results(self, text, include_submatches): 192 | with ThreadPoolExecutor(self.max_workers) as executor: 193 | tasks = [executor.submit(self._match_task, p, text, include_submatches) 194 | for p in self.patterns] 195 | for future in as_completed(tasks): 196 | yield future.result(timeout=1) 197 | 198 | def execute(self, text, include_submatches=True): 199 | results = sorted(chain(*self._results(text, include_submatches)), key=lambda x: x["start"]) 200 | for k, g in groupby(results, lambda x: x["start"]): 201 | group = list(g) 202 | if len(group) == 1: 203 | yield group[0] 204 | else: 205 | data = sorted(group, key=lambda x: -x["end"]) 206 | yield data[0] 207 | 208 | @staticmethod 209 | def load(path, regex_impl=re): 210 | from rita.config import SessionConfig 211 | config = SessionConfig() 212 | with open(path, "r") as f: 213 | patterns = [(obj["label"], obj["rules"]) 214 | for obj in map(json.loads, f.readlines())] 215 | return RuleExecutor(patterns, config, regex_impl=regex_impl) 216 | 217 | def save(self, path): 218 | with open(path, "w") as f: 219 | for pattern in self: 220 | f.write("{0}\n".format(json.dumps(pattern))) 221 | 222 | def __iter__(self): 223 | for label, rules in self.raw_patterns: 224 | yield {"label": label, "rules": rules} 225 | 226 | 227 | def compile_rules(rules: Rules, config: "SessionConfig", regex_impl=re, **kwargs) -> RuleExecutor: 228 | logger.info("Using standalone rule implementation") 229 | patterns = [rules_to_patterns(*group, config=config) for group in rules] 230 | executor = RuleExecutor(patterns, config, regex_impl=regex_impl) 231 | return executor 232 | -------------------------------------------------------------------------------- /rita/preprocess.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from functools import reduce 4 | from typing import Any, Mapping, Callable, List 5 | 6 | from rita.utils import Node, deaccent, ExtendedOp 7 | from rita.types import RuleGroup, Rules 8 | from rita.config import SessionConfig 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | Pipeline = Callable[[Rules, "SessionConfig"], Rules] 13 | 14 | 15 | def handle_prefix(rules: Rules, config: SessionConfig): 16 | """ 17 | If we have a prefix and rule following it, we apply this prefix on that rule 18 | """ 19 | def apply_prefix(pattern, prefix): 20 | (name, args, op) = pattern 21 | if name == "any_of": 22 | return (name, list(["{0}{1}".format(prefix, item) 23 | for item in args]), op) 24 | 
elif name == "value": 25 | return name, "{0}{1}".format(prefix, args), op 26 | else: 27 | logger.warning("Don't know how to apply prefix on: {}".format(name)) 28 | return pattern 29 | 30 | def gen(): 31 | prefix = None 32 | for p in pattern: 33 | (name, args, op) = p 34 | if name == "prefix": 35 | prefix = args 36 | else: 37 | if prefix: 38 | yield apply_prefix(p, prefix) 39 | prefix = None 40 | else: 41 | yield p 42 | for group_label, pattern in rules: 43 | yield group_label, list(gen()) 44 | 45 | 46 | def handle_deaccent(rules: Rules, config: SessionConfig): 47 | """ 48 | If we get accented word, eg: {WORD("naïve"), WORD("bayes")} 49 | In case of word, it should become list => {IN_LIST({"naïve", "naive"}), WORD("bayes")} 50 | In case of list, it should extend list with extra items for accented and not accented versions 51 | """ 52 | for group_label, pattern in rules: 53 | def gen(): 54 | for p in pattern: 55 | (name, args, op) = p 56 | if name == "value": 57 | (v1, v2) = (args, deaccent(args)) 58 | if v1 != v2: 59 | yield "any_of", (v1, v2,), op 60 | else: 61 | yield p 62 | elif name == "any_of": 63 | def items(): 64 | for w in args: 65 | (v1, v2) = (w, deaccent(w)) 66 | if v1 != v2: 67 | yield v1 68 | yield v2 69 | else: 70 | yield v1 71 | 72 | yield "any_of", list(items()), op 73 | else: 74 | yield p 75 | 76 | yield group_label, list(gen()) 77 | 78 | 79 | def add_implicit_punct(rules: Rules, config: SessionConfig): 80 | """ 81 | When writing rule, 82 | user usually doesn't care about some punct characters between words. 83 | We add them implicitly (unless this setting is turned off) 84 | """ 85 | for group_label, pattern in rules: 86 | def gen(): 87 | for p in pattern: 88 | yield p 89 | yield "punct", None, ExtendedOp("?") 90 | 91 | if len(pattern) == 1: 92 | yield group_label, pattern 93 | else: 94 | yield group_label, list(gen())[:-1] 95 | 96 | 97 | def add_implicit_hyphon(rules: Rules, config: SessionConfig): 98 | """ 99 | When writing rule, 100 | user usually doesn't care about hyphon characters - between words. 101 | """ 102 | for group_label, pattern in rules: 103 | def gen(): 104 | for p in pattern: 105 | yield p 106 | yield "value", "-", ExtendedOp("?") 107 | 108 | if len(pattern) == 1: 109 | yield group_label, pattern 110 | else: 111 | yield group_label, list(gen())[:-1] 112 | 113 | 114 | def handle_multi_word(rules: Rules, config: SessionConfig): 115 | """ 116 | spaCy splits everything in tokens. 117 | Words with dash ends up in different tokens. 
118 | 
119 | We don't want the user to even have to care about this, 
120 | so we make it work implicitly 
121 | 
122 | WORD("Knee-length") => WORD("Knee"), WORD("-"), WORD("length") 
123 | """ 
124 | for group_label, pattern in rules: 
125 | def gen(): 
126 | for p in pattern: 
127 | (name, args, op) = p 
128 | if name == "value" and is_complex(args): 
129 | yield "phrase", args, op 
130 | else: 
131 | yield p 
132 | 
133 | yield group_label, list(gen()) 
134 | 
135 | 
136 | def is_complex(arg: str) -> bool: 
137 | # if we want to use `-` as a word 
138 | if arg.strip() == "-": 
139 | return False 
140 | 
141 | splitters = ["-", " "] 
142 | return any([s in arg 
143 | for s in splitters]) 
144 | 
145 | 
146 | def has_complex(args: List[str]) -> bool: 
147 | """ 
148 | Tells if any of the arguments will be affected by the tokenizer 
149 | """ 
150 | return any([is_complex(a) 
151 | for a in args]) 
152 | 
153 | 
154 | def branch_pattern(pattern, config: SessionConfig): 
155 | """ 
156 | Creates a separate pattern list for each possible permutation 
157 | """ 
158 | root = Node() 
159 | current = root 
160 | depth = 0 
161 | for idx, p in enumerate(pattern): 
162 | if p[0] == "either": 
163 | n = Node() 
164 | current.add_next(n) 
165 | current = n 
166 | current.depth = depth 
167 | for e in p[1]: 
168 | current.add_child(e(config=config)) 
169 | depth += 1 
170 | elif p[0] == "any_of" and has_complex(p[1]): 
171 | _all = set(p[1]) 
172 | _complex = set(filter(is_complex, _all)) 
173 | simple = _all - _complex 
174 | n = Node() 
175 | current.add_next(n) 
176 | current = n 
177 | current.depth = depth 
178 | if len(simple) > 0: 
179 | current.add_child(("any_of", simple, p[2])) 
180 | for c in sorted(_complex): 
181 | current.add_child(("phrase", c, p[2])) 
182 | depth += 1 
183 | else: 
184 | n = Node(p) 
185 | current.add_next(n) 
186 | current = n 
187 | current.depth = depth 
188 | 
189 | for p in root.unwrap(): 
190 | yield p 
191 | 
192 | 
193 | def handle_rule_branching(rules: Rules, config: SessionConfig): 
194 | """ 
195 | If we have an OR statement, e.g. `WORD(w1)|WORD(w2)`, 
196 | the generic approach is to clone the rule and use w1 in one copy, w2 in the other. 
197 | It may be overkill, but some situations are not covered 
198 | by a simpler approach 
199 | """ 
200 | for group_label, pattern in rules: 
201 | # Covering WORD(w1)|WORD(w2) case 
202 | if any([p == "either" 
203 | for (p, _, _) in pattern]): 
204 | for p in branch_pattern(pattern, config): 
205 | yield group_label, p 
206 | 
207 | # Covering case when there are complex items in list 
208 | elif config.list_branching and any([p == "any_of" and has_complex(o) 
209 | for (p, o, _) in pattern]): 
210 | for p in branch_pattern(pattern, config): 
211 | yield group_label, p 
212 | else: 
213 | yield group_label, pattern 
214 | 
215 | 
216 | def dummy(rules: Rules, config: SessionConfig): 
217 | """ 
218 | Placeholder which does nothing 
219 | """ 
220 | logger.debug("Initial rules: {}".format(rules)) 
221 | return rules 
222 | 
223 | 
224 | def rule_tuple(d: Mapping[str, Any]) -> RuleGroup: 
225 | return d["label"], d["data"] 
226 | 
227 | 
228 | def expand_patterns(rules: Rules, config: SessionConfig): 
229 | """ 
230 | We can have situations where a pattern contains another pattern (via a Variable).
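(illustrative, mirroring the DSL examples elsewhere in this repo: Inner = {NUM, WORD("kg")} used as {PATTERN(Inner), WORD("net")})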
231 | We want to expand this inner pattern and prepend it to the outer pattern 
232 | """ 
233 | for group_label, pattern in rules: 
234 | def gen(): 
235 | for p in pattern: 
236 | if type(p) is tuple: 
237 | (k, other, op) = p 
238 | if k == "nested": 
239 | fn = other[0][0] 
240 | children = other[0][1] 
241 | yield fn, children, op 
242 | else: 
243 | yield p 
244 | else: 
245 | yield p 
246 | 
247 | yield group_label, list(gen()) 
248 | 
249 | 
250 | def flatten_2nd_level_nested(rules: Rules, config: SessionConfig): 
251 | """ 
252 | 1st level of nesting: use PATTERN(...) inside your rule 
253 | 2nd level of nesting: a PATTERN(...) which itself contains a PATTERN(...) and so on (recursively) 
254 | 
255 | we want to resolve only up to the 1st level 
256 | """ 
257 | 
258 | for group_label, pattern in rules: 
259 | def gen(): 
260 | for p in pattern: 
261 | if type(p) is list: 
262 | for item in p: 
263 | yield item 
264 | else: 
265 | yield p 
266 | 
267 | yield group_label, list(gen()) 
268 | 
269 | 
270 | def preprocess_rules(root, config: SessionConfig) -> Rules: 
271 | logger.info("Preprocessing rules") 
272 | 
273 | rules = [rule_tuple(doc()) 
274 | for doc in root 
275 | if doc and doc()] 
276 | 
277 | pipeline = [ 
278 | dummy, 
279 | expand_patterns, 
280 | handle_deaccent, 
281 | handle_rule_branching, 
282 | flatten_2nd_level_nested, 
283 | handle_multi_word, 
284 | handle_prefix 
285 | ] 
286 | 
287 | if config.implicit_hyphon: 
288 | logger.info("Adding implicit hyphens") 
289 | pipeline.append(add_implicit_hyphon) 
290 | elif config.implicit_punct: 
291 | logger.info("Adding implicit punctuation") 
292 | pipeline.append(add_implicit_punct) 
293 | 
294 | return reduce(lambda acc, p: p(acc, config), pipeline, rules) 
295 | 
-------------------------------------------------------------------------------- 
/tests/test_examples.py: 
-------------------------------------------------------------------------------- 
1 | import pytest 
2 | 
3 | import rita 
4 | 
5 | from utils import spacy_engine, standalone_engine, rust_engine, load_rules 
6 | 
7 | 
8 | @pytest.fixture(scope="session") 
9 | def bench_text(): 
10 | # TODO: think of new case for testing 
11 | pass 
12 | 
13 | 
14 | @pytest.mark.parametrize('engine', [spacy_engine]) 
15 | def test_color_car(engine): 
16 | text = "John Silver was driving a red car. It was BMW X6 Mclass. John likes driving it very much."
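# spaCy engine only: color-car.rita presumably relies on model-provided
# NER/POS information which the standalone and rust engines don't offer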
17 | parser = engine(load_rules("examples/color-car.rita")) 18 | entities = set(parser(text)) 19 | print(entities) 20 | 21 | expected = set([ 22 | ("John Silver", "PERSON"), # Normal NER 23 | ("red car", "CAR_COLOR"), # Our first rule 24 | ("BMW X6 Mclass", "CAR_MODEL"), # Our second rule 25 | ("John likes driving", "LIKED_ACTION") # Our third rule 26 | ]) 27 | 28 | assert entities.issuperset(expected) 29 | 30 | 31 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 32 | def test_fuzzy_matching(engine): 33 | parser = engine(""" 34 | !IMPORT("rita.modules.fuzzy") 35 | 36 | FUZZY("squirrel") -> MARK("CRITTER") 37 | """) 38 | 39 | # Check if we're matching with capitalized word 40 | t1 = """ 41 | Squirrel just walked outside 42 | """ 43 | 44 | entities = parser(t1) 45 | 46 | assert len(entities) == 1 47 | assert entities[0] == ("Squirrel", "CRITTER") 48 | 49 | # Check if we're matching with all CAPS 50 | t2 = """ 51 | there's a SQUIRREL 52 | """ 53 | 54 | entities = parser(t2) 55 | 56 | assert len(entities) == 1 57 | assert entities[0] == ("SQUIRREL", "CRITTER") 58 | 59 | 60 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 61 | def test_election(engine): 62 | parser = engine( 63 | """ 64 | {ENTITY("PERSON")+, WORD("was"), WORD("elected")}->MARK("WON_ELECTION") 65 | {WORD("defeating"), ENTITY("PERSON")+}->MARK("LOST_ELECTION") 66 | """ 67 | ) 68 | text = "Donald Trump was elected President in 2016 defeating Hilary Clinton." 69 | 70 | entities = set(parser(text)) 71 | expected = set([ 72 | ("Donald Trump was elected", "WON_ELECTION"), 73 | ("defeating Hilary Clinton", "LOST_ELECTION"), 74 | ]) 75 | print(entities) 76 | 77 | assert entities.issuperset(expected) 78 | 79 | 80 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 81 | def test_dash_case(engine): 82 | parser = engine(load_rules("examples/dress-match.rita")) 83 | text = """ 84 | Fitted, knee-length dress in soft velour 85 | """ 86 | 87 | entities = set(parser(text)) 88 | print(entities) 89 | expected = set([ 90 | ("Fitted, knee-length dress", "DRESS_TYPE"), 91 | ("soft velour", "DRESS_FABRIC"), 92 | ]) 93 | 94 | assert entities.issuperset(expected) 95 | 96 | 97 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) 98 | def test_exclude_word(engine): 99 | # Rust engine doesn't work here, because Re2 doesn't support backtracking operator 100 | parser = engine(load_rules("examples/excluding-word.rita")) 101 | 102 | t1 = "weather is awesome" 103 | t2 = "weather is cold" 104 | 105 | r1 = parser(t1) 106 | r2 = parser(t2) 107 | 108 | assert r1[0] == ("weather is awesome", "GOOD_WEATHER") 109 | assert len(r2) == 0 110 | 111 | 112 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 113 | def test_escape_string(engine): 114 | # If it compiles - good enough 115 | engine(load_rules("examples/match-with-escaped-string.rita")) 116 | 117 | 118 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 119 | def test_case_sensitive(engine): 120 | parser = engine( 121 | """ 122 | !CONFIG("ignore_case", "N") 123 | 124 | variants = {"Bitcoin", "BTC", "Bitcoin Cash"} 125 | 126 | {IN_LIST(variants)}->MARK("CRYPTO") 127 | """ 128 | ) 129 | 130 | text = """ 131 | A bitcoin mining magnate has proposed a new development fund for Bitcoin Cash. 
132 | According to BTC.TOP CEO Jiang Zhuoer, the scheme will 'tax' Bitcoin Cash mining rewards 133 | in an effort to increase funding for Bitcoin Cash infrastructure. 134 | """ 135 | 136 | results = parser(text) 137 | print(results) 138 | filtered = list([r 139 | for r in results 140 | if r[1] == "CRYPTO"]) 141 | 142 | assert len(filtered) > 0 143 | assert filtered[0] == ("Bitcoin Cash", "CRYPTO") 144 | 145 | 146 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 147 | def test_with_implicit_hyphon(engine): 148 | parser = engine( 149 | """ 150 | !CONFIG("implicit_punct", "N") 151 | !CONFIG("implicit_hyphon", "Y") 152 | {WORD("Hello"), WORD("World")}->MARK("HYPHON_LABEL") 153 | WORD("Hello")->MARK("HELLO_LABEL") 154 | """ 155 | ) 156 | 157 | text = "Hello - world!" 158 | results = parser(text) 159 | print(results) 160 | 161 | assert len(results) == 1 162 | assert results[0] == ("Hello - world", "HYPHON_LABEL") 163 | 164 | 165 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 166 | def test_without_implicit_hyphon(engine): 167 | parser = engine( 168 | """ 169 | !CONFIG("implicit_punct", "N") 170 | !CONFIG("implicit_hyphon", "N") 171 | {WORD("Hello"), WORD("World")}->MARK("HYPHON_LABEL") 172 | WORD("Hello")->MARK("HELLO_LABEL") 173 | """ 174 | ) 175 | 176 | text = "Hello - world!" 177 | results = parser(text) 178 | print(results) 179 | 180 | assert len(results) == 1 181 | assert results[0] == ("Hello", "HELLO_LABEL") 182 | 183 | 184 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 185 | def test_prefix(engine): 186 | parser = engine( 187 | """ 188 | science = {"mathematics", "physics"} 189 | {PREFIX("meta"), IN_LIST(science)}->MARK("META_SCIENCE") 190 | {PREFIX("pseudo"), WORD("science")}->MARK("PSEUDO_SCIENCE") 191 | """ 192 | ) 193 | 194 | text = """ 195 | This paper is full of metaphysics and pseudoscience 196 | """ 197 | 198 | results = parser(text) 199 | print(results) 200 | assert results[0] == ("metaphysics", "META_SCIENCE") 201 | assert results[1] == ("pseudoscience", "PSEUDO_SCIENCE") 202 | 203 | 204 | @pytest.mark.parametrize('engine', ["standalone", "rust"]) 205 | def test_compile_context(engine): 206 | if engine == "rust": 207 | from rita.engine.translate_rust import load_lib 208 | lib = load_lib() 209 | if lib is None: 210 | pytest.skip("Missing rita-rust dynamic lib, skipping related tests") 211 | rules = """ 212 | 213 | {WORD*, IN_LIST(companies), WORD*}->MARK("SUSPISCIOUS_COMPANY") 214 | """ 215 | parser = rita.compile_string(rules, use_engine=engine, companies=["CompanyA", "CompanyB"]) 216 | print(parser.patterns) 217 | 218 | results = list(parser.execute("CompanyB is doing it's dirty work.")) 219 | assert results[0] == { 220 | "start": 0, 221 | "end": 33, 222 | "label": "SUSPISCIOUS_COMPANY", 223 | "text": "CompanyB is doing it's dirty work", 224 | "submatches": [ 225 | { 226 | "start": 0, 227 | "end": 33, 228 | "key": "SUSPISCIOUS_COMPANY", 229 | "text": "CompanyB is doing it's dirty work" 230 | }, 231 | { 232 | "start": 0, 233 | "end": 9, 234 | "key": "s2", 235 | "text": "CompanyB" 236 | }, 237 | { 238 | "start": 9, 239 | "end": 33, 240 | "key": "s4", 241 | "text": "is doing it's dirty work" 242 | } 243 | ], 244 | } 245 | 246 | 247 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 248 | def test_benchmark(benchmark, engine, bench_text): 249 | """ 250 | These tests will only run if parameters: 251 | `--benchmark-enable` or 252 | 
`--benchmark-only` 253 | are added 254 | """ 255 | parser = engine(load_rules("examples/cheap-phones.rita")) 256 | 257 | def parse_rows(parser, rows): 258 | for r in rows: 259 | parser(r) 260 | 261 | benchmark.pedantic( 262 | parse_rows, 263 | args=(parser, bench_text), 264 | iterations=3, 265 | rounds=3 266 | ) 267 | 268 | 269 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 270 | def test_variable_pattern(engine): 271 | parser = engine(""" 272 | Complex_Number = { NUM+, WORD("/")?, NUM? } 273 | {PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH") 274 | {PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT") 275 | """) 276 | text = """ 277 | It is 17 1/2 inches width and 10 inches height 278 | """ 279 | 280 | results = parser(text) 281 | assert len(results) == 2 282 | 283 | 284 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 285 | def test_inlist_longest(engine): 286 | parser = engine(""" 287 | units = {"m", "mm", "cm"} 288 | dimensions = {"width", "height", "length"} 289 | {IN_LIST(dimensions), NUM, IN_LIST(units)}->MARK("TEST") 290 | """) 291 | 292 | text = """ 293 | width 10 mm 294 | """ 295 | 296 | results = parser(text) 297 | 298 | assert len(results) == 1 299 | (result, label) = results[0] 300 | assert result == "width 10 mm" 301 | 302 | 303 | @pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 304 | def test_inlist_word_based(engine): 305 | parser = engine(""" 306 | units = {"m", "mm", "cm", "inches", "in"} 307 | {IN_LIST(units), NUM}->MARK("TEST") 308 | """) 309 | 310 | text = """ 311 | twin 20 turbo 312 | """ 313 | 314 | results = parser(text) 315 | print(results) 316 | assert len(results) == 0 317 | 318 | 319 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 320 | def test_pluralize(engine): 321 | pytest.importorskip("inflect") 322 | parser = engine(""" 323 | !IMPORT("rita.modules.pluralize") 324 | 325 | vehicles={"car", "motorbike", "bicycle", "ship", "plane"} 326 | {NUM, PLURALIZE(vehicles)}->MARK("VEHICLES") 327 | """) 328 | 329 | text = """ 330 | There were 7 cars, 2 motorbikes, 1 ship, 1 bicycle and 9 planes 331 | """ 332 | 333 | results = set([text 334 | for text, label in parser(text) 335 | if label == "VEHICLES"]) 336 | print(results) 337 | 338 | assert len(results) == 5 339 | assert {"7 cars", "2 motorbikes", "1 ship", "1 bicycle", "9 planes"} == results 340 | 341 | 342 | @pytest.mark.parametrize('engine', [spacy_engine]) 343 | def test_orth_example(engine): 344 | parser = engine(""" 345 | !IMPORT("rita.modules.orth") 346 | 347 | {ORTH("IEEE")}->MARK("TAGGED_MATCH") 348 | {ORTH("ISO")?}->MARK("TAGGED_MATCH") 349 | """) 350 | 351 | text = """ 352 | it should match IEEE or ISO, but should ignore ieee. 
353 | """ 354 | 355 | results = set([text 356 | for text, label in parser(text) 357 | if label == "TAGGED_MATCH"]) 358 | 359 | print(results) 360 | assert len(results) == 2 361 | assert {"IEEE", "ISO"} == results 362 | 363 | 364 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 365 | def test_regex_module_start(engine): 366 | parser = engine(""" 367 | !IMPORT("rita.modules.regex") 368 | 369 | {REGEX("^a")}->MARK("TAGGED_MATCH") 370 | """) 371 | 372 | text = """ 373 | there are many letters in the alphabet 374 | """ 375 | 376 | results = set([text 377 | for text, label in parser(text) 378 | if label == "TAGGED_MATCH"]) 379 | 380 | print(results) 381 | assert len(results) == 2 382 | assert {"are", "alphabet"} == results 383 | 384 | 385 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 386 | def test_regex_module_end(engine): 387 | parser = engine(""" 388 | !IMPORT("rita.modules.regex") 389 | 390 | {REGEX("e$")}->MARK("TAGGED_MATCH") 391 | """) 392 | 393 | text = """ 394 | there are many letters in the alphabet 395 | """ 396 | 397 | results = set([text 398 | for text, label in parser(text) 399 | if label == "TAGGED_MATCH"]) 400 | 401 | print(results) 402 | assert len(results) == 3 403 | assert {"there", "are", "the"} == results 404 | 405 | 406 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 407 | def test_regex_module_middle(engine): 408 | parser = engine(""" 409 | !IMPORT("rita.modules.regex") 410 | 411 | {REGEX("et")}->MARK("TAGGED_MATCH") 412 | """) 413 | 414 | text = """ 415 | there are many letters in the alphabet 416 | """ 417 | 418 | results = set([text 419 | for text, label in parser(text) 420 | if label == "TAGGED_MATCH"]) 421 | 422 | print(results) 423 | assert len(results) == 2 424 | assert {"letters", "alphabet"} == results 425 | 426 | 427 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 428 | def test_regex_module_strict(engine): 429 | parser = engine(""" 430 | !IMPORT("rita.modules.regex") 431 | 432 | {REGEX("^the$")}->MARK("TAGGED_MATCH") 433 | """) 434 | 435 | text = """ 436 | there are many letters in the alphabet 437 | """ 438 | 439 | results = set([text 440 | for text, label in parser(text) 441 | if label == "TAGGED_MATCH"]) 442 | 443 | print(results) 444 | assert len(results) == 1 445 | assert {"the"} == results 446 | 447 | 448 | @pytest.mark.parametrize('engine', [standalone_engine]) 449 | def test_custom_regex_impl(engine): 450 | import re 451 | 452 | class MyMatch(object): 453 | def __init__(self, result): 454 | self.result = result 455 | 456 | def start(self): 457 | return 0 458 | 459 | def end(self): 460 | return len(self.result) 461 | 462 | def group(self): 463 | return self.result 464 | 465 | def groupdict(self): 466 | return {} 467 | 468 | @property 469 | def lastgroup(self): 470 | return "TEST_MATCH" 471 | 472 | class MyCustomRegex(object): 473 | DOTALL = re.DOTALL 474 | IGNORECASE = re.IGNORECASE 475 | 476 | def compile(self, *args, **kwargs): 477 | return self 478 | 479 | def match(self, *args, **kwargs): 480 | return self 481 | 482 | def search(self, *args, **kwargs): 483 | return self 484 | 485 | def finditer(self, text): 486 | return [MyMatch("Hello new REGEX")] 487 | 488 | parser = engine(""" 489 | {WORD("Hello"), WORD("new"), WORD("regex")}->MARK("TEST_MATCH") 490 | """, regex_impl=MyCustomRegex()) 491 | 492 | results = parser("Hello new REGEX!") 493 | 494 | assert len(results) == 1 495 | 496 | 497 | 
@pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 498 | def test_complex_number_match(engine): 499 | parser = engine(""" 500 | fractions={"1 / 2", "3 / 4", "1 / 8", "3 / 8", "5 / 8", "7 / 8", "1 / 16", "3 / 16", "5 / 16", "7 / 16", "9 / 16", 501 | "11 / 16", "13 / 16", "15 / 16", "1 / 32", "3 / 32", "5 / 32", "7 / 32", "9 / 32", "11 / 32", "13 / 32", "15 / 32", 502 | "17 / 32", "19 / 32", "21 / 32", "23 / 32", "25 / 32", "27 / 32", "29 / 32", "31 / 32"} 503 | 504 | num_with_fractions = {NUM, WORD("-")?, IN_LIST(fractions)} 505 | complex_number = {NUM|PATTERN(num_with_fractions)} 506 | 507 | {WORD("length"), PATTERN(complex_number)}->MARK("NUMBER") 508 | """) 509 | 510 | simple_number = parser("length 50 cm") 511 | assert len(simple_number) == 1 512 | assert ("length 50", "NUMBER") == simple_number[0] 513 | 514 | complex_number = parser('length 10 1 / 2 "') 515 | 516 | assert len(complex_number) == 1 517 | assert ("length 10 1 / 2", "NUMBER") == complex_number[0] 518 | 519 | 520 | @pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 521 | def test_simple_float_number_match(engine): 522 | parser = engine(""" 523 | NUM->MARK("NUMBER") 524 | """) 525 | 526 | assert parser("25")[0] == ("25", "NUMBER") 527 | assert parser("25.7")[0] == ("25.7", "NUMBER") 528 | assert parser("19,6")[0] == ("19,6", "NUMBER") 529 | 530 | 531 | @pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 532 | def test_invalid_entity(engine): 533 | with pytest.raises(RuntimeError): 534 | engine(""" 535 | ENTITY("ORG")->MARK("ORG_MATCH") 536 | """) 537 | 538 | 539 | @pytest.mark.parametrize('engine', [spacy_engine]) 540 | def test_multiple_pos(engine): 541 | parser = engine(""" 542 | {POS("VERB", "NOUN")}->MARK("POS_MATCH") 543 | """) 544 | 545 | text = """ 546 | Here we have a verb: doing and noun: stuff 547 | """ 548 | 549 | results = set([text 550 | for text, label in parser(text) 551 | if label == "POS_MATCH"]) 552 | 553 | print(results) 554 | assert len(results) == 5 555 | assert {"noun", "have", "verb", "doing", "stuff"} == results 556 | 557 | 558 | @pytest.mark.parametrize('engine', [spacy_engine]) 559 | def test_multiple_entities(engine): 560 | parser = engine(""" 561 | {ENTITY("PERSON", "ORG")}->MARK("ENTITY_MATCH") 562 | """) 563 | 564 | text = """ 565 | John has been working at AT&T for the past year 566 | """ 567 | 568 | results = set([text 569 | for text, label in parser(text) 570 | if label == "ENTITY_MATCH"]) 571 | 572 | print(results) 573 | assert len(results) == 2 574 | assert {"AT&T", "John"} == results 575 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 
5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "21.2.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] 19 | docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.4" 26 | description = "Cross-platform colored terminal text." 27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [[package]] 32 | name = "coverage" 33 | version = "5.5" 34 | description = "Code coverage measurement for Python" 35 | category = "dev" 36 | optional = false 37 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 38 | 39 | [package.extras] 40 | toml = ["toml"] 41 | 42 | [[package]] 43 | name = "importlib-metadata" 44 | version = "2.1.1" 45 | description = "Read metadata from Python packages" 46 | category = "dev" 47 | optional = false 48 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" 49 | 50 | [package.dependencies] 51 | zipp = ">=0.5" 52 | 53 | [package.extras] 54 | docs = ["sphinx", "rst.linker"] 55 | testing = ["packaging", "pep517", "unittest2", "importlib-resources (>=1.3)"] 56 | 57 | [[package]] 58 | name = "more-itertools" 59 | version = "8.10.0" 60 | description = "More routines for operating on iterables, beyond itertools" 61 | category = "dev" 62 | optional = false 63 | python-versions = ">=3.5" 64 | 65 | [[package]] 66 | name = "packaging" 67 | version = "20.9" 68 | description = "Core utilities for Python packages" 69 | category = "dev" 70 | optional = false 71 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 72 | 73 | [package.dependencies] 74 | pyparsing = ">=2.0.2" 75 | 76 | [[package]] 77 | name = "pathlib2" 78 | version = "2.3.6" 79 | description = "Object-oriented filesystem paths" 80 | category = "dev" 81 | optional = false 82 | python-versions = "*" 83 | 84 | [package.dependencies] 85 | six = "*" 86 | 87 | [[package]] 88 | name = "pluggy" 89 | version = "0.13.1" 90 | description = "plugin and hook calling mechanisms for python" 91 | category = "dev" 92 | optional = false 93 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 94 | 95 | [package.dependencies] 96 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 97 | 98 | [package.extras] 99 | dev = ["pre-commit", "tox"] 100 | 101 | [[package]] 102 | name = "ply" 103 | version = "3.11" 104 | description = "Python Lex & Yacc" 105 | category = "main" 106 | optional = false 107 | python-versions = "*" 108 | 109 | [[package]] 110 | name = "py" 111 | version = "1.10.0" 112 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 113 | category = "dev" 114 | optional = false 115 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 116 | 
117 | [[package]] 118 | name = "py-cpuinfo" 119 | version = "8.0.0" 120 | description = "Get CPU info with pure Python 2 & 3" 121 | category = "dev" 122 | optional = false 123 | python-versions = "*" 124 | 125 | [[package]] 126 | name = "pyparsing" 127 | version = "2.4.7" 128 | description = "Python parsing module" 129 | category = "dev" 130 | optional = false 131 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 132 | 133 | [[package]] 134 | name = "pytest" 135 | version = "5.4.3" 136 | description = "pytest: simple powerful testing with Python" 137 | category = "dev" 138 | optional = false 139 | python-versions = ">=3.5" 140 | 141 | [package.dependencies] 142 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 143 | attrs = ">=17.4.0" 144 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 145 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 146 | more-itertools = ">=4.0.0" 147 | packaging = "*" 148 | pathlib2 = {version = ">=2.2.0", markers = "python_version < \"3.6\""} 149 | pluggy = ">=0.12,<1.0" 150 | py = ">=1.5.0" 151 | wcwidth = "*" 152 | 153 | [package.extras] 154 | checkqa-mypy = ["mypy (==v0.761)"] 155 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 156 | 157 | [[package]] 158 | name = "pytest-benchmark" 159 | version = "3.4.1" 160 | description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." 161 | category = "dev" 162 | optional = false 163 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 164 | 165 | [package.dependencies] 166 | py-cpuinfo = "*" 167 | pytest = ">=3.8" 168 | 169 | [package.extras] 170 | aspect = ["aspectlib"] 171 | elasticsearch = ["elasticsearch"] 172 | histogram = ["pygal", "pygaljs"] 173 | 174 | [[package]] 175 | name = "pytest-cov" 176 | version = "2.12.1" 177 | description = "Pytest plugin for measuring coverage." 
178 | category = "dev" 179 | optional = false 180 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 181 | 182 | [package.dependencies] 183 | coverage = ">=5.2.1" 184 | pytest = ">=4.6" 185 | toml = "*" 186 | 187 | [package.extras] 188 | testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] 189 | 190 | [[package]] 191 | name = "pytest-mock" 192 | version = "2.0.0" 193 | description = "Thin-wrapper around the mock package for easier use with py.test" 194 | category = "dev" 195 | optional = false 196 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 197 | 198 | [package.dependencies] 199 | pytest = ">=2.7" 200 | 201 | [package.extras] 202 | dev = ["pre-commit", "tox"] 203 | 204 | [[package]] 205 | name = "six" 206 | version = "1.16.0" 207 | description = "Python 2 and 3 compatibility utilities" 208 | category = "dev" 209 | optional = false 210 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 211 | 212 | [[package]] 213 | name = "toml" 214 | version = "0.10.2" 215 | description = "Python Library for Tom's Obvious, Minimal Language" 216 | category = "dev" 217 | optional = false 218 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 219 | 220 | [[package]] 221 | name = "wcwidth" 222 | version = "0.2.5" 223 | description = "Measures the displayed width of unicode strings in a terminal" 224 | category = "dev" 225 | optional = false 226 | python-versions = "*" 227 | 228 | [[package]] 229 | name = "zipp" 230 | version = "1.2.0" 231 | description = "Backport of pathlib-compatible object wrapper for zip files" 232 | category = "dev" 233 | optional = false 234 | python-versions = ">=2.7" 235 | 236 | [package.extras] 237 | docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] 238 | testing = ["pathlib2", "unittest2", "jaraco.itertools", "func-timeout"] 239 | 240 | [metadata] 241 | lock-version = "1.1" 242 | python-versions = "^3.5" 243 | content-hash = "b43cc00e376732988dd656db2e2321f17b14fb6a1bd2caec1319e128ef76d8fa" 244 | 245 | [metadata.files] 246 | atomicwrites = [ 247 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 248 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 249 | ] 250 | attrs = [ 251 | {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, 252 | {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, 253 | ] 254 | colorama = [ 255 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 256 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 257 | ] 258 | coverage = [ 259 | {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, 260 | {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, 261 | {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, 262 | {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, 263 | {file = 
"coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, 264 | {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, 265 | {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, 266 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, 267 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, 268 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, 269 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, 270 | {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, 271 | {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, 272 | {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, 273 | {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, 274 | {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, 275 | {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, 276 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, 277 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, 278 | {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, 279 | {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, 280 | {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, 281 | {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, 282 | {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, 283 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, 284 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, 285 | {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, 286 | {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, 287 | {file = "coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, 288 | {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, 289 | {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, 290 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, 291 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, 292 | {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, 293 | {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, 294 | {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, 295 | {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, 296 | {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, 297 | {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, 298 | {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, 299 | {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, 300 | {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, 301 | {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, 302 | {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, 303 | {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, 304 | {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, 305 | {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, 306 | {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, 307 | {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, 308 | {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, 309 | {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, 310 | {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, 311 | ] 312 | importlib-metadata = [ 313 | {file = "importlib_metadata-2.1.1-py2.py3-none-any.whl", hash = "sha256:c2d6341ff566f609e89a2acb2db190e5e1d23d5409d6cc8d2fe34d72443876d4"}, 314 | {file = "importlib_metadata-2.1.1.tar.gz", hash = 
"sha256:b8de9eff2b35fb037368f28a7df1df4e6436f578fa74423505b6c6a778d5b5dd"}, 315 | ] 316 | more-itertools = [ 317 | {file = "more-itertools-8.10.0.tar.gz", hash = "sha256:1debcabeb1df793814859d64a81ad7cb10504c24349368ccf214c664c474f41f"}, 318 | {file = "more_itertools-8.10.0-py3-none-any.whl", hash = "sha256:56ddac45541718ba332db05f464bebfb0768110111affd27f66e0051f276fa43"}, 319 | ] 320 | packaging = [ 321 | {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, 322 | {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, 323 | ] 324 | pathlib2 = [ 325 | {file = "pathlib2-2.3.6-py2.py3-none-any.whl", hash = "sha256:3a130b266b3a36134dcc79c17b3c7ac9634f083825ca6ea9d8f557ee6195c9c8"}, 326 | {file = "pathlib2-2.3.6.tar.gz", hash = "sha256:7d8bcb5555003cdf4a8d2872c538faa3a0f5d20630cb360e518ca3b981795e5f"}, 327 | ] 328 | pluggy = [ 329 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 330 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 331 | ] 332 | ply = [ 333 | {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, 334 | {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, 335 | ] 336 | py = [ 337 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 338 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 339 | ] 340 | py-cpuinfo = [ 341 | {file = "py-cpuinfo-8.0.0.tar.gz", hash = "sha256:5f269be0e08e33fd959de96b34cd4aeeeacac014dd8305f70eb28d06de2345c5"}, 342 | ] 343 | pyparsing = [ 344 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 345 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 346 | ] 347 | pytest = [ 348 | {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"}, 349 | {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"}, 350 | ] 351 | pytest-benchmark = [ 352 | {file = "pytest-benchmark-3.4.1.tar.gz", hash = "sha256:40e263f912de5a81d891619032983557d62a3d85843f9a9f30b98baea0cd7b47"}, 353 | {file = "pytest_benchmark-3.4.1-py2.py3-none-any.whl", hash = "sha256:36d2b08c4882f6f997fd3126a3d6dfd70f3249cde178ed8bbc0b73db7c20f809"}, 354 | ] 355 | pytest-cov = [ 356 | {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, 357 | {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, 358 | ] 359 | pytest-mock = [ 360 | {file = "pytest-mock-2.0.0.tar.gz", hash = "sha256:b35eb281e93aafed138db25c8772b95d3756108b601947f89af503f8c629413f"}, 361 | {file = "pytest_mock-2.0.0-py2.py3-none-any.whl", hash = "sha256:cb67402d87d5f53c579263d37971a164743dc33c159dfb4fb4a86f37c5552307"}, 362 | ] 363 | six = [ 364 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 365 | {file = 
"six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 366 | ] 367 | toml = [ 368 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 369 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 370 | ] 371 | wcwidth = [ 372 | {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, 373 | {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, 374 | ] 375 | zipp = [ 376 | {file = "zipp-1.2.0-py2.py3-none-any.whl", hash = "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"}, 377 | {file = "zipp-1.2.0.tar.gz", hash = "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1"}, 378 | ] 379 | -------------------------------------------------------------------------------- /tests/test_rules.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import re 3 | import tempfile 4 | 5 | import pytest 6 | import rita 7 | 8 | 9 | class TestSpacy(object): 10 | @property 11 | def punct(self): 12 | return {'IS_PUNCT': True, 'OP': '?'} 13 | 14 | def compiler(self, rules): 15 | pytest.importorskip("spacy", minversion="2.1") 16 | return rita.compile_string(rules, use_engine="spacy") 17 | 18 | def test_punct(self): 19 | rules = self.compiler('PUNCT->MARK("SOME_PUNCT")') 20 | print(rules) 21 | assert len(rules) == 1 22 | assert rules[0] == { 23 | "pattern": [{"IS_PUNCT": True}], 24 | "label": "SOME_PUNCT" 25 | } 26 | 27 | def test_number(self): 28 | rules = self.compiler('NUM("42")->MARK("SOME_NUMBER")') 29 | print(rules) 30 | assert len(rules) == 1 31 | assert rules[0] == { 32 | "pattern": [{"LOWER": "42"}], 33 | "label": "SOME_NUMBER" 34 | } 35 | 36 | def test_pos(self): 37 | rules = self.compiler('POS("VERB")->MARK("SOME_POS")') 38 | print(rules) 39 | assert len(rules) == 1 40 | assert rules[0] == { 41 | "pattern": [{"POS": "VERB"}], 42 | "label": "SOME_POS" 43 | } 44 | 45 | def test_single_word(self): 46 | rules = self.compiler('WORD("Test")->MARK("SOME_LABEL")') 47 | print(rules) 48 | assert len(rules) == 1 49 | assert rules[0] == { 50 | "pattern": [{"LOWER": "test"}], 51 | "label": "SOME_LABEL" 52 | } 53 | 54 | def test_multiple_words(self): 55 | rules = self.compiler(''' 56 | words = {"test1", "test2"} 57 | IN_LIST(words)->MARK("MULTI_LABEL") 58 | ''') 59 | print(rules) 60 | assert len(rules) == 1 61 | assert rules[0] == { 62 | "pattern": [{"LOWER": {"IN": ["test1", "test2"]}}], 63 | "label": "MULTI_LABEL" 64 | } 65 | 66 | def test_simple_pattern(self): 67 | rules = self.compiler(''' 68 | {WORD("test1"), WORD("test2")}->MARK("SIMPLE_PATTERN") 69 | ''') 70 | print(rules) 71 | assert len(rules) == 1 72 | assert rules[0] == { 73 | "pattern": [{"LOWER": "test1"}, self.punct, {"LOWER": "test2"}], 74 | "label": "SIMPLE_PATTERN" 75 | } 76 | 77 | def test_or_branch(self): 78 | rules = self.compiler(''' 79 | {WORD("test1")|WORD("test2")}->MARK("SPLIT_LABEL") 80 | ''') 81 | print(rules) 82 | assert len(rules) == 2 83 | assert rules[0] == { 84 | "pattern": [{"LOWER": "test1"}], 85 | "label": "SPLIT_LABEL" 86 | } 87 | assert rules[1] == { 88 | "pattern": [{"LOWER": "test2"}], 89 | "label": "SPLIT_LABEL" 90 | } 91 | 92 | def test_or_branch_multi(self): 93 | rules = self.compiler(''' 94 | 
{WORD("test1")|WORD("test2"),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 95 | ''') 96 | print(rules) 97 | assert len(rules) == 4 98 | assert rules[0] == { 99 | "pattern": [{"LOWER": "test1"}, self.punct, {"LOWER": "test3"}], 100 | "label": "MULTI_SPLIT_LABEL" 101 | } 102 | assert rules[1] == { 103 | "pattern": [{"LOWER": "test2"}, self.punct, {"LOWER": "test3"}], 104 | "label": "MULTI_SPLIT_LABEL" 105 | } 106 | assert rules[2] == { 107 | "pattern": [{"LOWER": "test1"}, self.punct, {"LOWER": "test4"}], 108 | "label": "MULTI_SPLIT_LABEL" 109 | } 110 | assert rules[3] == { 111 | "pattern": [{"LOWER": "test2"}, self.punct, {"LOWER": "test4"}], 112 | "label": "MULTI_SPLIT_LABEL" 113 | } 114 | 115 | def test_or_branch_multi_w_single(self): 116 | rules = self.compiler(''' 117 | numbers={"one", "two", "three"} 118 | {WORD("test1")|WORD("test2"),IN_LIST(numbers),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 119 | ''') 120 | print(rules) 121 | assert len(rules) == 4 122 | list_items = {"LOWER": {"IN": ["one", "three", "two"]}} 123 | assert rules[0] == { 124 | "pattern": [{"LOWER": "test1"}, self.punct, list_items, self.punct, {"LOWER": "test3"}], 125 | "label": "MULTI_SPLIT_LABEL" 126 | } 127 | assert rules[1] == { 128 | "pattern": [{"LOWER": "test2"}, self.punct, list_items, self.punct, {"LOWER": "test3"}], "label": "MULTI_SPLIT_LABEL"} 129 | assert rules[2] == { 130 | "pattern": [{"LOWER": "test1"}, self.punct, list_items, self.punct, {"LOWER": "test4"}], 131 | "label": "MULTI_SPLIT_LABEL" 132 | } 133 | assert rules[3] == { 134 | "pattern": [{"LOWER": "test2"}, self.punct, list_items, self.punct, {"LOWER": "test4"}], 135 | "label": "MULTI_SPLIT_LABEL" 136 | } 137 | 138 | def test_branching_list(self): 139 | rules = self.compiler(''' 140 | items={"test1", "test2", "test-3", "test4"} 141 | {IN_LIST(items)}->MARK("SPLIT_LIST") 142 | ''') 143 | print(rules) 144 | assert len(rules) == 2 145 | assert rules[0] == { 146 | "label": "SPLIT_LIST", 147 | "pattern": [{"LOWER": {"IN": ["test1", "test2", "test4"]}}] 148 | } 149 | assert rules[1] == { 150 | "label": "SPLIT_LIST", 151 | "pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "3"}] 152 | } 153 | 154 | def test_double_branching_list(self): 155 | rules = self.compiler(''' 156 | items={"test1", "test2", "test-3", "test4", "test-5"} 157 | {IN_LIST(items)}->MARK("SPLIT_LIST") 158 | ''') 159 | print(rules) 160 | assert len(rules) == 3 161 | assert rules[0] == { 162 | "label": "SPLIT_LIST", 163 | "pattern": [{"LOWER": {"IN": ["test1", "test2", "test4"]}}] 164 | } 165 | assert rules[1] == { 166 | "label": "SPLIT_LIST", 167 | "pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "3"}] 168 | } 169 | assert rules[2] == { 170 | "label": "SPLIT_LIST", 171 | "pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "5"}] 172 | } 173 | 174 | def test_word_with_spaces(self): 175 | rules = self.compiler(''' 176 | WORD("test1 test2")->MARK("SPLIT_WORD") 177 | ''') 178 | print(rules) 179 | # It should be split into two: WORD("test1"), WORD("test2") 180 | assert len(rules) == 1 181 | assert rules[0] == { 182 | "label": "SPLIT_WORD", 183 | "pattern": [{"LOWER": "test1"}, {"LOWER": "test2"}] 184 | } 185 | 186 | def test_word_with_dash(self): 187 | rules = self.compiler(''' 188 | WORD("test1-test2")->MARK("SPLIT_WORD") 189 | ''') 190 | print(rules) 191 | # It should be split into two: WORD("test1"), WORD("test2") 192 | assert len(rules) == 1 193 | assert rules[0] == { 194 | "label": "SPLIT_WORD", 195 | "pattern": [{"LOWER": "test1"}, {"LOWER": 
"-"}, {"LOWER": "test2"}] 196 | } 197 | 198 | def test_word_with_accent(self): 199 | rules = self.compiler(''' 200 | WORD("Šarūnas")->MARK("TWO_WORDS") 201 | ''') 202 | print(rules) 203 | assert len(rules) == 1 204 | assert rules[0] == { 205 | "label": "TWO_WORDS", 206 | "pattern": [{"LOWER": {"IN": ["sarunas", "šarūnas"]}}] 207 | } 208 | 209 | def test_list_with_accent(self): 210 | rules = self.compiler(''' 211 | names={"Jonas", "Jurgis", "Šarūnas"} 212 | IN_LIST(names)->MARK("EXTENDED_LIST") 213 | ''') 214 | print(rules) 215 | assert len(rules) == 1 216 | assert rules[0] == { 217 | "label": "EXTENDED_LIST", 218 | "pattern": [{"LOWER": {"IN": ["jonas", "jurgis", "sarunas", "šarūnas"]}}] 219 | } 220 | 221 | def test_prefix_on_word(self): 222 | rules = self.compiler(''' 223 | {PREFIX("meta"), WORD("physics")}->MARK("META_WORD") 224 | ''') 225 | print(rules) 226 | assert len(rules) == 1 227 | assert rules[0] == { 228 | "label": "META_WORD", 229 | "pattern": [{"LOWER": "metaphysics"}] 230 | } 231 | 232 | def test_prefix_on_list(self): 233 | rules = self.compiler(''' 234 | science = {"physics", "mathematics"} 235 | {PREFIX("meta"), IN_LIST(science)}->MARK("META_LIST") 236 | ''') 237 | print(rules) 238 | assert len(rules) == 1 239 | assert rules[0] == { 240 | "label": "META_LIST", 241 | "pattern": [{"LOWER": {"IN": ["metamathematics", "metaphysics"]}}] 242 | } 243 | 244 | def test_prefix_on_unknown_type(self): 245 | rules = self.compiler(''' 246 | {PREFIX("test"), ANY}->MARK("NOT_VALID") 247 | ''') 248 | print(rules) 249 | assert len(rules) == 1 250 | assert rules[0] == { 251 | "label": "NOT_VALID", 252 | "pattern": [{}] 253 | } 254 | 255 | def test_multiple_optionals(self): 256 | rules = self.compiler(""" 257 | {NUM+, WORD("-")?, NUM?, WORD("/")?, NUM?}->MARK("NUMBER_PATTERN") 258 | """) 259 | print(rules) 260 | assert len(rules) == 1 261 | assert rules[0] == { 262 | "label": "NUMBER_PATTERN", 263 | "pattern": [ 264 | {"LOWER": {"REGEX": "((\\d+[\\.,]\\d+)|(\\d+))"}, "OP": "+"}, 265 | {"IS_PUNCT": True, "OP": "?"}, 266 | {"LOWER": "-", "OP": "?"}, 267 | {"IS_PUNCT": True, "OP": "?"}, 268 | {"LOWER": {"REGEX": "((\\d+[\\.,]\\d+)|(\\d+))"}, "OP": "?"}, 269 | {"IS_PUNCT": True, "OP": "?"}, 270 | {"LOWER": "/", "OP": "?"}, 271 | {"IS_PUNCT": True, "OP": "?"}, 272 | {"LOWER": {"REGEX": "((\\d+[\\.,]\\d+)|(\\d+))"}, "OP": "?"}, 273 | ] 274 | } 275 | 276 | def test_optional_list(self): 277 | rules = self.compiler(""" 278 | elements = {"one", "two"} 279 | {IN_LIST(elements)?}->MARK("OPTIONAL_LIST") 280 | """) 281 | 282 | print(rules) 283 | 284 | assert len(rules) == 1 285 | assert rules[0] == { 286 | "label": "OPTIONAL_LIST", 287 | "pattern": [{"LOWER": {"IN": ["one", "two"]}, "OP": "?"}] 288 | } 289 | 290 | def test_tag_module(self): 291 | rules = self.compiler(""" 292 | !IMPORT("rita.modules.tag") 293 | 294 | TAG("^NN|^JJ")->MARK("TEST_TAG") 295 | """) 296 | 297 | print(rules) 298 | 299 | assert len(rules) == 1 300 | assert rules[0] == { 301 | "label": "TEST_TAG", 302 | "pattern": [{"TAG": {"REGEX": "^NN|^JJ"}}] 303 | } 304 | 305 | def test_tag_word(self): 306 | rules = self.compiler(""" 307 | !IMPORT("rita.modules.tag") 308 | 309 | TAG_WORD("^VB", "proposed")->MARK("TEST_TAG") 310 | """) 311 | 312 | print(rules) 313 | 314 | assert len(rules) == 1 315 | assert rules[0] == { 316 | "label": "TEST_TAG", 317 | "pattern": [{"LOWER": "proposed", "TAG": {"REGEX": "^VB"}}] 318 | } 319 | 320 | def test_tag_list(self): 321 | rules = self.compiler(""" 322 | !IMPORT("rita.modules.tag") 323 | 324 | words = 
{"perceived", "proposed"} 325 | {TAG_WORD("^VB", words)}->MARK("TEST_TAG") 326 | """) 327 | 328 | print(rules) 329 | 330 | assert len(rules) == 1 331 | assert rules[0] == { 332 | "label": "TEST_TAG", 333 | "pattern": [{"LOWER": {"REGEX": "^(perceived|proposed)$"}, "TAG": {"REGEX": "^VB"}}] 334 | } 335 | 336 | def test_tags_case_sensitive(self): 337 | rules = self.compiler(""" 338 | !CONFIG("ignore_case", "F") 339 | !IMPORT("rita.modules.tag") 340 | 341 | words = {"perceived", "proposed"} 342 | TAG_WORD("^VB", "proposed")->MARK("TEST_TAG") 343 | {TAG_WORD("^VB", words)}->MARK("TEST_TAG") 344 | """) 345 | 346 | print(rules) 347 | 348 | assert len(rules) == 2 349 | assert rules == [ 350 | { 351 | "label": "TEST_TAG", 352 | "pattern": [{"TEXT": "proposed", "TAG": {"REGEX": "^VB"}}] 353 | }, 354 | { 355 | "label": "TEST_TAG", 356 | "pattern": [{"TEXT": {"REGEX": "^(perceived|proposed)$"}, "TAG": {"REGEX": "^VB"}}] 357 | } 358 | ] 359 | 360 | def test_generate_names(self): 361 | rules = self.compiler(""" 362 | !IMPORT("rita.modules.names") 363 | 364 | names = {"Roy Jones junior", "Roy Jones senior", "Juan-Claude van Damme", "Jon Jones"} 365 | NAMES(names)->MARK("NAME_MATCH") 366 | NAMES("Kazushi Sakuraba")->MARK("NAME_MATCH") 367 | """) 368 | 369 | print(rules) 370 | assert len(rules) == 10 371 | 372 | def test_any_tag(self): 373 | rules = self.compiler(""" 374 | ANY -> MARK("ANYTHING_GOES_HERE") 375 | """) 376 | print(rules) 377 | assert len(rules) == 1 378 | assert rules == [{"label": "ANYTHING_GOES_HERE", "pattern": [{}]}] 379 | 380 | def test_entity_tag_default(self): 381 | rules = self.compiler(""" 382 | ENTITY("PERSON")->MARK("PERSON_FOUND") 383 | """) 384 | print(rules) 385 | assert len(rules) == 1 386 | assert rules == [{"label": "PERSON_FOUND", "pattern": [{"ENT_TYPE": "PERSON", "OP": "+"}]}] 387 | 388 | def test_entity_tag_override(self): 389 | rules = self.compiler(""" 390 | {ENTITY("PERSON")*}->MARK("PERSON_FOUND") 391 | """) 392 | print(rules) 393 | assert len(rules) == 1 394 | assert rules == [{"label": "PERSON_FOUND", "pattern": [{"ENT_TYPE": "PERSON", "OP": "*"}]}] 395 | 396 | 397 | class TestStandalone(object): 398 | @property 399 | def punct(self): 400 | return re.compile(r"[.,!;?:]") 401 | 402 | @property 403 | def flags(self): 404 | return re.DOTALL | re.IGNORECASE 405 | 406 | def compiler(self, rules): 407 | return rita.compile_string(rules, use_engine="standalone").patterns 408 | 409 | def test_punct(self): 410 | rules = self.compiler('PUNCT->MARK("SOME_PUNCT")') 411 | print(rules) 412 | assert len(rules) == 1 413 | assert rules[0] == re.compile(r"(?P(?P([.,!;?:]\s?)))", self.flags) 414 | 415 | def test_number(self): 416 | rules = self.compiler('NUM("42")->MARK("SOME_NUMBER")') 417 | print(rules) 418 | assert len(rules) == 1 419 | assert rules[0] == re.compile(r"(?P(?P(42\s?)))", self.flags) 420 | 421 | def test_single_word(self): 422 | rules = self.compiler('WORD("Test")->MARK("SOME_LABEL")') 423 | print(rules) 424 | assert len(rules) == 1 425 | assert rules[0] == re.compile(r"(?P(?P(Test\s?)))", self.flags) 426 | 427 | def test_multiple_words(self): 428 | rules = self.compiler(''' 429 | words = {"test1", "test2"} 430 | IN_LIST(words)->MARK("MULTI_LABEL") 431 | ''') 432 | print(rules) 433 | assert len(rules) == 1 434 | assert rules[0] == re.compile(r"(?P(?P((^|\s)((test1|test2)\s?))))", self.flags) 435 | 436 | def test_simple_pattern(self): 437 | rules = self.compiler(''' 438 | {WORD("test1"), WORD("test2")}->MARK("SIMPLE_PATTERN") 439 | ''') 440 | print(rules) 441 | 
assert len(rules) == 1 442 | assert rules[0] == re.compile( 443 | r"(?P<SIMPLE_PATTERN>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test2\s?)))", 444 | self.flags 445 | ) 446 | 447 | def test_or_branch(self): 448 | rules = self.compiler(''' 449 | {WORD("test1")|WORD("test2")}->MARK("SPLIT_LABEL") 450 | ''') 451 | print(rules) 452 | assert len(rules) == 2 453 | assert rules[0] == re.compile(r"(?P<SPLIT_LABEL>(?P<s0>(test1\s?)))", self.flags) 454 | assert rules[1] == re.compile(r"(?P<SPLIT_LABEL>(?P<s0>(test2\s?)))", self.flags) 455 | 456 | def test_or_branch_multi(self): 457 | rules = self.compiler(''' 458 | {WORD("test1")|WORD("test2"),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 459 | ''') 460 | print(rules) 461 | assert len(rules) == 4 462 | assert rules[0] == re.compile( 463 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test3\s?)))", 464 | self.flags 465 | ) 466 | 467 | assert rules[1] == re.compile( 468 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test3\s?)))", 469 | self.flags 470 | ) 471 | 472 | assert rules[2] == re.compile( 473 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test4\s?)))", 474 | self.flags 475 | ) 476 | 477 | assert rules[3] == re.compile( 478 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test4\s?)))", 479 | self.flags 480 | ) 481 | 482 | def test_or_branch_multi_w_single(self): 483 | rules = self.compiler(''' 484 | numbers={"one", "two", "three"} 485 | {WORD("test1")|WORD("test2"),IN_LIST(numbers),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 486 | ''') 487 | print(rules) 488 | assert len(rules) == 4 489 | assert rules[0] == re.compile( 490 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)" 491 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test3\s?)))", 492 | self.flags 493 | ) 494 | 495 | assert rules[1] == re.compile( 496 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)" 497 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test3\s?)))", 498 | self.flags 499 | ) 500 | 501 | assert rules[2] == re.compile( 502 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)" 503 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test4\s?)))", 504 | self.flags 505 | ) 506 | 507 | assert rules[3] == re.compile( 508 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)" 509 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test4\s?)))", 510 | self.flags 511 | ) 512 | 513 | def test_word_with_accent(self): 514 | rules = self.compiler(''' 515 | WORD("Šarūnas")->MARK("TWO_WORDS") 516 | ''') 517 | print(rules) 518 | assert len(rules) == 1 519 | assert rules[0] == re.compile( 520 | r"(?P<TWO_WORDS>(?P<s0>((^|\s)((Sarunas|Šarūnas)\s?))))", 521 | self.flags 522 | ) 523 | 524 | def test_list_with_accent(self): 525 | rules = self.compiler(''' 526 | names={"Jonas", "Jurgis", "Šarūnas"} 527 | IN_LIST(names)->MARK("EXTENDED_LIST") 528 | ''') 529 | print(rules) 530 | assert len(rules) == 1 531 | assert rules[0] == re.compile( 532 | r"(?P<EXTENDED_LIST>(?P<s0>((^|\s)((Sarunas|Šarūnas|Jurgis|Jonas)\s?))))", 533 | self.flags 534 | ) 535 | 536 | def test_double_op(self): 537 | rules = self.compiler(''' 538 | WORD+->MARK("DOUBLE_OP") 539 | ''') 540 | print(rules) 541 | assert len(rules) == 1 542 | assert rules[0] == re.compile( 543 | r"(?P<DOUBLE_OP>(?P<s0>(((\w|['_-])+)\s?)+))", 544 | self.flags 545 | ) 546 | 547 | def test_prefix_on_word(self): 548 | rules = self.compiler(''' 549 | {PREFIX("meta"), WORD("physics")}->MARK("META_WORD") 550 | ''') 551 | print(rules) 552 | assert len(rules) == 1 553 | assert rules[0] == re.compile(r"(?P<META_WORD>(?P<s0>(metaphysics\s?)))", self.flags) 554 | 555 | def test_prefix_on_list(self): 556 | rules = self.compiler(''' 557 | science = {"physics", "mathematics"} 558 |
{PREFIX("meta"), IN_LIST(science)}->MARK("META_LIST") 559 | ''') 560 | print(rules) 561 | assert len(rules) == 1 562 | assert rules[0] == re.compile( 563 | r"(?P(?P((^|\s)((metamathematics|metaphysics)\s?))))", 564 | self.flags 565 | ) 566 | 567 | def test_prefix_on_unknown_type(self): 568 | rules = self.compiler(''' 569 | {PREFIX("test"), ANY}->MARK("NOT_VALID") 570 | ''') 571 | print(rules) 572 | assert len(rules) == 1 573 | assert rules[0] == re.compile(r"(?P(?P(.*\s?)))", self.flags) 574 | 575 | def test_save_and_load_rules_from_file(self): 576 | rules = ''' 577 | {WORD("Hello"), WORD("world")}->MARK("HELLO") 578 | ''' 579 | engine = rita.compile_string(rules, use_engine="standalone") 580 | with tempfile.TemporaryDirectory() as tmpdir: 581 | rules_path = os.path.join(tmpdir, "rules-example.json") 582 | engine.save(rules_path) 583 | engine.load(rules_path) 584 | engine.execute("Hello world") 585 | 586 | def test_optional_list(self): 587 | rules = self.compiler(""" 588 | elements = {"one", "two"} 589 | {IN_LIST(elements)?}->MARK("OPTIONAL_LIST") 590 | """) 591 | 592 | print(rules) 593 | 594 | assert len(rules) == 1 595 | assert rules[0] == re.compile(r"(?P(?P((^|\s)((one|two)\s?))?))", self.flags) 596 | 597 | def test_complex_list(self): 598 | rules = self.compiler(""" 599 | fractions={"1 / 2", "3 / 4", "1 / 8", "3 / 8", "5 / 8", "7 / 8", "1 / 16", "3 / 16", 600 | "5 / 16", "7 / 16", "9 / 16", "11 / 16", "13 / 16", "15 / 16", "1 / 32", 601 | "3 / 32", "5 / 32", "7 / 32", "9 / 32", "11 / 32", "13 / 32", "15 / 32", 602 | "17 / 32", "19 / 32", "21 / 32", "23 / 32", "25 / 32", "27 / 32", 603 | "29 / 32", "31 / 32"} 604 | {NUM+, WORD("-")?, IN_LIST(fractions)?}->MARK("COMPLEX_NUMBER") 605 | """) 606 | 607 | print(rules) 608 | 609 | assert len(rules) == 1 610 | 611 | def test_generate_names(self): 612 | rules = self.compiler(""" 613 | !IMPORT("rita.modules.names") 614 | 615 | names = {"Roy Jones junior", "Roy Jones senior", "Juan-Claude van Damme", "Jon Jones"} 616 | NAMES(names)->MARK("NAME_MATCH") 617 | NAMES("Kazushi Sakuraba")->MARK("NAME_MATCH") 618 | """) 619 | 620 | print(rules) 621 | assert len(rules) == 2 622 | --------------------------------------------------------------------------------