├── changes ├── .gitkeep ├── 114.feature.rst ├── 110.feature.rst ├── 116.feature.rst └── 118.feature.rst ├── rita ├── engine │ ├── __init__.py │ ├── translate_rust.py │ ├── translate_spacy.py │ └── translate_standalone.py ├── modules │ ├── __init__.py │ ├── regex.py │ ├── orth.py │ ├── tag.py │ ├── fuzzy.py │ ├── pluralize.py │ └── names.py ├── types.py ├── run.py ├── precompile.py ├── __init__.py ├── lexer.py ├── shortcuts.py ├── config.py ├── macros.py ├── utils.py ├── parser.py └── preprocess.py ├── .github ├── FUNDING.yml └── workflows │ ├── github-actions-deployment.yaml │ └── github-actions-main.yaml ├── examples ├── simple-import.rita ├── cyclical-import.rita ├── match-with-escaped-string.rita ├── excluding-word.rita ├── fuzzy-matching.rita ├── cars.txt ├── cheap-phones.rita ├── complex-number.rita ├── dress-match.rita └── color-car.rita ├── docs ├── assets │ ├── logo-1.png │ ├── logo-2.png │ ├── jetbrains.png │ ├── logo-100px.png │ └── jetbrains.svg ├── integration.md ├── index.md ├── extend.md ├── advanced.md ├── config.md ├── macros.md ├── engines.md ├── syntax.md ├── modules.md └── quickstart.md ├── Makefile ├── setup.cfg ├── .coveragerc ├── tests ├── test_utils.py ├── test_precompile.py ├── test_config.py ├── utils.py ├── test_run.py ├── test_lexer.py ├── test_parser.py ├── test_examples.py └── test_rules.py ├── mypy.ini ├── mkdocs.yml ├── changes_template.md ├── tox.ini ├── LICENSE ├── .gitignore ├── extra └── sublimetext │ └── RITA.sublime-syntax ├── pyproject.toml ├── README.md ├── CHANGELOG.md └── poetry.lock /changes/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rita/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rita/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [zaibacu] 2 | -------------------------------------------------------------------------------- /examples/simple-import.rita: -------------------------------------------------------------------------------- 1 | @import "examples/simple-match.rita" -------------------------------------------------------------------------------- /examples/cyclical-import.rita: -------------------------------------------------------------------------------- 1 | @import "examples/cyclical-import.rita" -------------------------------------------------------------------------------- /changes/114.feature.rst: -------------------------------------------------------------------------------- 1 | Add spaCy wildcard instead of REGEX when using ANY 2 | -------------------------------------------------------------------------------- /examples/match-with-escaped-string.rita: -------------------------------------------------------------------------------- 1 | {WORD("5\""), WORD("Phone")} -> MARK("PHONE") -------------------------------------------------------------------------------- /docs/assets/logo-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/logo-1.png 
-------------------------------------------------------------------------------- /docs/assets/logo-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/logo-2.png -------------------------------------------------------------------------------- /examples/excluding-word.rita: -------------------------------------------------------------------------------- 1 | {WORD("Weather"), WORD("is"), WORD("cold")!}->MARK("GOOD_WEATHER") -------------------------------------------------------------------------------- /docs/assets/jetbrains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/jetbrains.png -------------------------------------------------------------------------------- /examples/fuzzy-matching.rita: -------------------------------------------------------------------------------- 1 | !IMPORT("rita.modules.fuzzy") 2 | 3 | FUZZY("squirrel") -> MARK("CRITTER") -------------------------------------------------------------------------------- /docs/assets/logo-100px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zaibacu/rita-dsl/HEAD/docs/assets/logo-100px.png -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON=python 2 | 3 | benchmark: 4 | ${PYTHON} -m pytest --benchmark-only tests/ --benchmark-autosave 5 | -------------------------------------------------------------------------------- /changes/110.feature.rst: -------------------------------------------------------------------------------- 1 | Type Hints for core to improve robustness. Extra CI step to check for errors is added as well 2 | -------------------------------------------------------------------------------- /changes/116.feature.rst: -------------------------------------------------------------------------------- 1 | Add "+" operator by default when building spaCy `ENTITY(...)` to make it easier to read and understand. 
-------------------------------------------------------------------------------- /examples/cars.txt: -------------------------------------------------------------------------------- 1 | BMW 2 | Audi 3 | VW 4 | Toyota 5 | Mazda 6 | Opel 7 | Ford 8 | Alfa Romeo 9 | Peugeot 10 | Fiat 11 | Nissan 12 | Subaru 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [tool:pytest] 5 | addopts = --benchmark-skip 6 | 7 | [flake8] 8 | max-line-length = 160 9 | 10 | [aliases] 11 | test=pytest -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | rita 5 | 6 | omit = rita/engine/translate_rust.py 7 | 8 | [report] 9 | show_missing = True 10 | omit = rita/engine/translate_rust.py 11 | -------------------------------------------------------------------------------- /examples/cheap-phones.rita: -------------------------------------------------------------------------------- 1 | inexpensive = {"secondary", "inexpensive", "cheap"} 2 | 3 | {IN_LIST(inexpensive), WORD("cell")?, WORD("phone")}->MARK("CHEAP_PHONE") 4 | {WORD("good"), WORD("value")}->MARK("CHEAP_PHONE") -------------------------------------------------------------------------------- /examples/complex-number.rita: -------------------------------------------------------------------------------- 1 | Complex_Number = { NUM+, WORD("/")?, NUM? } 2 | {PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH") 3 | {PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT") -------------------------------------------------------------------------------- /rita/types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Mapping, Tuple, List, AnyStr 2 | 3 | opts = Mapping[Any, Any] 4 | RuleData = Tuple[AnyStr, List[Any], AnyStr] 5 | Patterns = List[RuleData] 6 | RuleGroup = Tuple[AnyStr, Patterns] 7 | Rules = List[RuleGroup] 8 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from rita.utils import deaccent 2 | 3 | 4 | class TestDeaccent(object): 5 | def test_lithuanian(self): 6 | assert deaccent("Šarūnas") == "Sarunas" 7 | assert deaccent("Kęstutis") == "Kestutis" 8 | assert deaccent("Ąžuolas") == "Azuolas" 9 | -------------------------------------------------------------------------------- /changes/118.feature.rst: -------------------------------------------------------------------------------- 1 | Use "IN" operator when defining ARRAYS in spaCy 2 | 3 | Also, from now on, we can define arrays directly inside a macro: 4 | ``` 5 | IN_LIST("one", "two", "three") 6 | ``` 7 | 8 | Which is equal to: 9 | ``` 10 | numbers = {"one", "two", "three"} 11 | IN_LIST(numbers) 12 | ``` -------------------------------------------------------------------------------- /rita/modules/regex.py: -------------------------------------------------------------------------------- 1 | from rita.utils import ExtendedOp 2 | 3 | 4 | def REGEX(regex_pattern, config, op=None): 5 | """ 6 | Matches words based on a Regex pattern 7 | e.g.
all words that start with an 'a' would be 8 | REGEX("^a") 9 | """ 10 | new_op = ExtendedOp(op) 11 | new_op.local_regex_override = True 12 | return "regex", regex_pattern, new_op 13 | -------------------------------------------------------------------------------- /rita/modules/orth.py: -------------------------------------------------------------------------------- 1 | from rita.utils import ExtendedOp 2 | 3 | 4 | def ORTH(value, config, op=None): 5 | """ 6 | Ignores the case-insensitive configuration and checks words as written, 7 | i.e. case-sensitively, even if the configuration is case-insensitive 8 | """ 9 | new_op = ExtendedOp(op) 10 | new_op.case_sensitive_override = True 11 | return "orth", value, new_op 12 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | exclude=parsetab.py 4 | 5 | [mypy-inflect] 6 | ignore_missing_imports = True 7 | 8 | [mypy-spacy] 9 | ignore_missing_imports = True 10 | 11 | [mypy-spacy.pipeline] 12 | ignore_missing_imports = True 13 | 14 | [mypy-ply] 15 | ignore_missing_imports = True 16 | 17 | [mypy-ply.yacc] 18 | ignore_missing_imports = True 19 | 20 | [mypy-ply.lex] 21 | ignore_missing_imports = True 22 | -------------------------------------------------------------------------------- /examples/dress-match.rita: -------------------------------------------------------------------------------- 1 | cuts = {"fitted", "wide-cut"} 2 | lengths = {"short", "long", "calf-length", "knee-length"} 3 | fabric_types = {"soft", "airy", "crinkled"} 4 | fabrics = {"velour", "chiffon", "knit", "woven", "stretch"} 5 | 6 | {IN_LIST(cuts)?, IN_LIST(lengths), WORD("dress")}->MARK("DRESS_TYPE") 7 | {IN_LIST(lengths), IN_LIST(cuts), WORD("dress")}->MARK("DRESS_TYPE") 8 | {IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK("DRESS_FABRIC") -------------------------------------------------------------------------------- /examples/color-car.rita: -------------------------------------------------------------------------------- 1 | cars = LOAD("examples/cars.txt") # Load items from file 2 | colors = {"red", "green", "blue", "white", "black"} # Declare items inline 3 | 4 | {IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR") # If the first token is in the list `colors` and the second one is the word `car`, label it 5 | 6 | {IN_LIST(cars), WORD+} -> MARK("CAR_MODEL") # If the first token is in the list `cars` and is followed by 1..N words, label it 7 | 8 | {ENTITY("PERSON"), LEMMA("like"), WORD} -> MARK("LIKED_ACTION") # If the first token is a Person, followed by any word whose lemma is `like`, label it -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: RITA DSL 2 | site_author: Šarūnas Navickas 3 | site_description: A DSL which allows writing rules for doing NLP 4 | repo_url: https://github.com/zaibacu/rita-dsl 5 | nav: 6 | - Home: index.md 7 | - Quickstart: quickstart.md 8 | - Syntax: syntax.md 9 | - Macros: macros.md 10 | - Engines: engines.md 11 | - Modules: modules.md 12 | - Extending: extend.md 13 | - Config: config.md 14 | - Advanced: advanced.md 15 | - Integrating into IDEs: integration.md 16 | theme: readthedocs 17 | markdown_extensions: 18 | - toc: 19 | permalink: True 20 | plugins: 21 | - search 22 | -------------------------------------------------------------------------------- /docs/integration.md:
-------------------------------------------------------------------------------- 1 | # Integration 2 | 3 | This section is dedicated to providing links to plugins which make life easier when using the RITA language. If you have created one, feel free to add it to the list 4 | 5 | ## IDEA (IntelliJ, PyCharm and others) 6 | 7 | [Rita-Language](https://plugins.jetbrains.com/plugin/15011-rita-language) - a simple syntax markup plugin 8 | 9 | ## SublimeText3 10 | 11 | The plugin can be found in `extra/sublimetext/RITA.sublime-syntax` - a simple syntax markup plugin. It can be installed by copying it to the `Packages/User/` directory of SublimeText (for macOS it's: `cd ~/Library/Application\ Support/Sublime\ Text\ 3/Packages/User`) -------------------------------------------------------------------------------- /changes_template.md: -------------------------------------------------------------------------------- 1 | {% for section, _ in sections.items() %} 2 | {% set underline = underlines[0] %} 3 | {% if section %} 4 | {{section}} 5 | {{ underline * section|length }} 6 | {% set underline = underlines[1] %} 7 | {% endif %} 8 | 9 | {% if sections[section] %} 10 | {% for category, val in definitions.items() if category in sections[section]%} 11 | {{ definitions[category]['name'] }} 12 | {{ underline * definitions[category]['name']|length }} 13 | 14 | {% for text, values in sections[section][category].items() %} 15 | - {{ text }} 16 | {{ values|join(',\n ') }} 17 | {% endfor %} 18 | 19 | {% endfor %} 20 | {% else %} 21 | No significant changes. 22 | 23 | {% endif %} 24 | {% endfor %} 25 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ![Rita Logo](assets/logo-2.png) 2 | # RITA DSL 3 | 4 | This is a language, loosely based on the [Apache UIMA RUTA](https://uima.apache.org/ruta.html) language, focused on writing manual language rules, which compiles into [spaCy](https://github.com/explosion/spaCy)-compatible patterns. These patterns can be used for doing [manual NER](https://spacy.io/api/entityruler) as well as in other processes, like retokenizing and pure matching 5 | 6 | - [Live Demo](https://rita-dsl.io/#demo) 7 | - [Simple Chat bot example](https://repl.it/talk/share/Simple-chatbot-done-with-Rita/53471) 8 | - [Documentation](http://rita-dsl.readthedocs.io/) 9 | - [QuickStart](https://rita-dsl.readthedocs.io/en/latest/quickstart/) 10 | - [Language Syntax Plugin for IntelliJ-based IDEs](https://plugins.jetbrains.com/plugin/15011-rita-language) 11 | -------------------------------------------------------------------------------- /docs/extend.md: -------------------------------------------------------------------------------- 1 | # Extending 2 | 3 | Custom modules can be loaded via `!IMPORT()` 4 | 5 | Example of a basic fuzzy matcher: 6 | 7 | ``` 8 | !IMPORT("rita.modules.fuzzy") 9 | 10 | FUZZY("squirrel") -> MARK("CRITTER") 11 | ``` 12 | 13 | The code can be seen in: [fuzzy.py](https://github.com/zaibacu/rita-dsl/blob/master/rita/modules/fuzzy.py) 14 | 15 | After the import is done, custom macros defined in the imported module can be executed, as shown in the sketch below. 16 | 17 | ## Interface for a custom Macro 18 | 19 | Each macro must have at least two arguments 20 | 21 | - `op` - custom handling of the `?`, `*` and `+` operators. If it has no use, the argument can be defined as `def (*args, op=None)` and simply ignored inside the code 22 | 23 | - `context` - context is either a `dict` or a `list` used to store results 24 | 25 | All other arguments should be defined at the start
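As a concrete reference, below is a minimal sketch of what a custom module could look like, modeled on the shape of [fuzzy.py](https://github.com/zaibacu/rita-dsl/blob/master/rita/modules/fuzzy.py); the module name `mymacros.py` and the `UPPERCASED` macro are made up for illustration:

```python
# mymacros.py - a made-up module; load it with !IMPORT("mymacros")
from rita.macros import resolve_value
from rita.utils import ExtendedOp


def UPPERCASED(name, config, op=None):
    """
    Matches the given word and its upper-cased variant,
    e.g. UPPERCASED("nasa") covers both "nasa" and "NASA"
    """
    initial = resolve_value(name, config=config)
    # "any_of" with a list of variants is the same result shape
    # that rita.modules.fuzzy.FUZZY returns
    return "any_of", [initial, initial.upper()], ExtendedOp(op)
```

With that file on the Python path, `!IMPORT("mymacros")` followed by `UPPERCASED("nasa") -> MARK("AGENCY")` should work like any built-in macro.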
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38, py39, spacy3, changelog 3 | isolated_build = True 4 | 5 | [testenv] 6 | deps = 7 | codecov 8 | ply: ply==3.11 9 | spacy >= 3.0.0 10 | inflect 11 | pytest 12 | pytest-mock 13 | pytest-benchmark 14 | commands = 15 | python -m spacy download en 16 | python -m pytest tests 17 | codecov --token="{env:CODECOV_TOKEN}" 18 | 19 | [testenv:changelog] 20 | basepython = python3.9 21 | skip_install = true 22 | deps = 23 | towncrier 24 | commands = towncrier --draft 25 | 26 | [testenv:lint] 27 | basepython = python3.9 28 | skip_install = true 29 | deps = flake8 30 | commands = 31 | flake8 rita/ --exclude=rita/parsetab.py 32 | flake8 tests/ 33 | 34 | [testenv:mypy] 35 | basepython = python3.9 36 | skip_install = true 37 | deps = mypy 38 | commands = 39 | mypy rita/ 40 | -------------------------------------------------------------------------------- /docs/advanced.md: -------------------------------------------------------------------------------- 1 | # Importing other rule files 2 | 3 | When the corpus of rules becomes too large, it is possible to split it into multiple files. 4 | It can be done simply like this: 5 | 6 | ``` 7 | @import "<path>" 8 | ``` 9 | 10 | Eg.: 11 | ``` 12 | @import "examples/simple-match.rita" 13 | ``` 14 | 15 | # Reusing patterns 16 | 17 | You can define (since version 0.5.0+) a pattern as a variable: 18 | 19 | ``` 20 | ComplexNumber = {NUM+, WORD("/")?, NUM?} 21 | 22 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT") 23 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH") 24 | ``` 25 | 26 | # Alias 27 | 28 | You can alias frequently used macros to make their names shorter: 29 | 30 | ``` 31 | numbers = {"one", "two", "three"} 32 | @alias IN_LIST IL 33 | 34 | IL(numbers) -> MARK("NUMBER") 35 | ``` 36 | 37 | Now using "IL" will actually call the "IN_LIST" macro.
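Alias expansion happens in the precompile step, before parsing, so it can be inspected directly; this sketch mirrors the alias test in `tests/test_precompile.py`:

```python
from rita.precompile import precompile

rules = """
numbers = {"one", "two", "three"}
@alias IN_LIST IL
@alias MARK M

IL(numbers)->M("HELLO")
"""

# Prints the rules with the @alias lines removed and
# IN_LIST(...) / MARK(...) restored in place of IL / M
print(precompile(rules.strip()))
```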
-------------------------------------------------------------------------------- /rita/modules/tag.py: -------------------------------------------------------------------------------- 1 | from rita.utils import ExtendedOp 2 | 3 | 4 | def TAG(tag, config, op=None): 5 | """ 6 | For generating POS/TAG patterns based on a Regex 7 | e.g. TAG("^NN|^JJ") for nouns or adjectives 8 | """ 9 | values = {"tag": tag} 10 | return "tag", values, ExtendedOp(op) 11 | 12 | 13 | def TAG_WORD(tag, value, config, op=None): 14 | """ 15 | For generating TAG patterns with a word or a list 16 | e.g. match only "proposed" when it is a verb in the sentence (and not an adjective): 17 | TAG_WORD("^VB", "proposed") 18 | e.g. match a list of words only to verbs: 19 | words = {"perceived", "proposed"} 20 | {TAG_WORD("^VB", words)?}->MARK("LABEL") 21 | """ 22 | values = {"tag": tag} 23 | if isinstance(value, list): 24 | values["list"] = value 25 | else: 26 | values["word"] = value 27 | return "tag", values, ExtendedOp(op) 28 | -------------------------------------------------------------------------------- /tests/test_precompile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rita.precompile import precompile 4 | 5 | from utils import raw_compare 6 | 7 | 8 | def test_rule_import(): 9 | rules = """ 10 | @import "examples/color-car.rita" 11 | """ 12 | 13 | result = precompile(rules.strip()) 14 | with open("examples/color-car.rita", "r") as f: 15 | assert result == f.read() 16 | 17 | 18 | def test_cyclical_import(): 19 | rules = """ 20 | @import "examples/cyclical-import.rita" 21 | """ 22 | 23 | with pytest.raises(RuntimeError): 24 | precompile(rules) 25 | 26 | 27 | def test_alias(): 28 | rules = """ 29 | numbers = {"one", "two", "three"} 30 | @alias IN_LIST IL 31 | @alias MARK M 32 | 33 | IL(numbers)->M("HELLO") 34 | """ 35 | 36 | expected = """ 37 | numbers = {"one", "two", "three"} 38 | 39 | IN_LIST(numbers)->MARK("HELLO") 40 | """ 41 | 42 | result = precompile(rules.strip()) 43 | raw_compare(expected, result) 44 | -------------------------------------------------------------------------------- /.github/workflows/github-actions-deployment.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | types: 10 | - ready_for_review 11 | - review_requested 12 | 13 | jobs: 14 | deployment: 15 | name: "Deployment" 16 | runs-on: "ubuntu-latest" 17 | strategy: 18 | matrix: 19 | python-version: [ '3.9' ] 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Setup python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | architecture: x64 27 | - name: Install Poetry 28 | run: | 29 | curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python - 30 | 31 | - name: Build 32 | run: poetry build 33 | 34 | - name: Set Token 35 | run: poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} 36 | 37 | - name: Poetry Publish 38 | run: poetry publish -------------------------------------------------------------------------------- /.github/workflows/github-actions-main.yaml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | on: [push] 3 | jobs: 4 | Testing: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | python-version: [ '3.9' ] 9 | name: Testing on Python ${{ matrix.python-version }} 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Setup python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: ${{ matrix.python-version }} 16 | architecture: x64 17 | - run: pip install tox 18 | - run: CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} tox -e py39 19 | 20 | CheckCode: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: [ '3.9' ] 25 | name: CheckCode 26 | steps: 27 | - uses: actions/checkout@v2 28 | - uses: actions/setup-python@v2 29 | with: 30 | python-version: '3.9' 31 | architecture: 'x64' 32 | - run: pip install tox 33 | - run: CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} tox -e lint 34 | - run: CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} tox -e
mypy 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Šarūnas Navickas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rita.config import with_config, SessionConfig 4 | 5 | 6 | @pytest.fixture 7 | def cfg(): 8 | return SessionConfig() 9 | 10 | 11 | @with_config 12 | def test_config_decorator(config): 13 | assert config 14 | 15 | 16 | def test_registered_engines(cfg): 17 | assert len(cfg.available_engines) > 0 18 | 19 | 20 | def test_registered_engines_has_spacy(cfg): 21 | pytest.importorskip("spacy", minversion="2.1") 22 | from rita.engine.translate_spacy import compile_rules 23 | assert len(cfg.available_engines) == 3 24 | assert cfg.default_engine == compile_rules 25 | 26 | 27 | def test_default_values(cfg): 28 | assert cfg.ignore_case 29 | assert cfg.implicit_punct 30 | assert not cfg.implicit_hyphon 31 | 32 | cfg.ignore_case = False 33 | assert not cfg.ignore_case 34 | 35 | cfg.implicit_punct = False 36 | assert not cfg.implicit_punct 37 | 38 | cfg.implicit_hyphon = True 39 | assert cfg.implicit_hyphon 40 | 41 | 42 | def test_register_module(cfg): 43 | cfg.register_module("rita.modules.fuzzy") 44 | 45 | assert len(cfg.modules) == 1 46 | -------------------------------------------------------------------------------- /rita/run.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import logging 4 | 5 | import rita 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def main(): 12 | from rita.utils import RitaJSONEncoder 13 | parser = argparse.ArgumentParser( 14 | description="Compile rita -> spaCy patterns" 15 | ) 16 | 17 | parser.add_argument("-f", help=".rita rules file") 18 | parser.add_argument( 19 | "out", 20 | help="output .jsonl file to store rules" 21 | ) 22 | parser.add_argument("--debug", help="debug mode", action="store_true") 23 | parser.add_argument("--engine", help="Engine to use when compiling rules", default="spacy") 24 | args = parser.parse_args() 25 | 26 | if args.debug: 27 | logging.basicConfig(level=logging.DEBUG) 28 | else: 29 | 
logging.basicConfig(level=logging.INFO) 30 | 31 | logger.info("Compiling rules using {} engine".format(args.engine)) 32 | 33 | patterns = rita.compile(args.f, use_engine=args.engine) 34 | 35 | with open(args.out, "w") as f: 36 | for pattern in patterns: 37 | f.write(json.dumps(pattern, cls=RitaJSONEncoder) + "\n") -------------------------------------------------------------------------------- /rita/modules/fuzzy.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | from rita.macros import resolve_value 5 | from rita.utils import ExtendedOp 6 | 7 | 8 | char_translation = dict( 9 | [(c * 2, "{0}{{1,2}}".format(c)) for c in string.ascii_lowercase] 10 | ) 11 | 12 | find_re = "|".join(["({0})".format(s) for (s, _) in char_translation.items()]) 13 | 14 | slang = {"you": "u", "for": "4", "are": "r", "you are": "ur", "you're": "ur"} 15 | 16 | 17 | def permutations(initial): 18 | # return the initial value as-is 19 | yield initial 20 | 21 | # if we have double letters, like `oo`, we can guess that: 22 | # - the user can sometimes enter both 23 | # - and sometimes only a single one 24 | double_letters = re.sub( 25 | find_re, 26 | lambda x: char_translation[x.group(0)], 27 | initial 28 | ) 29 | yield double_letters 30 | 31 | # if we have a simple word, we can add a slang alternative 32 | if initial in slang: 33 | yield r"\s{0}\s".format(slang[initial]) 34 | 35 | 36 | def FUZZY(name, config, op=None): 37 | initial = resolve_value(name, config=config) 38 | return "fuzzy", list(permutations(initial.lower())), ExtendedOp(op) -------------------------------------------------------------------------------- /docs/config.md: -------------------------------------------------------------------------------- 1 | # Config 2 | 3 | Configuration is mostly applied on a per-rule basis, meaning that different rules can have different configuration while running in the same process. 4 | 5 | ## Syntax 6 | 7 | Configuration is intended to be done from within the rule, like so: 8 | 9 | ``` 10 | !CONFIG("ignore_case", "Y") 11 | ``` 12 | 13 | The first argument is the config key, the second is the value. `"1"`, `"Y"` and `"T"` result in `True`; `"0"`, `"N"` and `"F"` - in `False` 14 | 15 | ## Configurations 16 | 17 | | Setting | Default | Description | 18 | |--------------------|----------------------|-------------------------------------------------------------------------------| 19 | | implicit_punct |`T` |Automatically adds punctuation characters `,.!:\;` to the rules | 20 | | ignore_case |`T` |All rules are case-insensitive | 21 | | deaccent |`T` |If provided a word with accented letters, use two versions - with and without them | 22 | | implicit_hyphon |`F` |Automatically adds hyphen characters `-` to the rules. Enabling implicit_hyphon disables implicit_punct |
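Since configuration travels with the rule file, switching a setting is just a matter of adding the `!CONFIG` line; a minimal sketch, assuming the spaCy engine is installed (the rule itself is made up for illustration):

```python
import rita

rules = """
!CONFIG("ignore_case", "N")

{WORD("IEEE"), WORD("conference")}->MARK("EVENT")
"""

# This compilation run is case-sensitive; other rule files compiled
# in the same process keep their own settings.
patterns = rita.compile_string(rules, use_engine="spacy")
```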
-------------------------------------------------------------------------------- /rita/precompile.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | from functools import partial 5 | from typing import Match 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | MAX_DEPTH = 5 12 | ALIAS_PATTERN = re.compile(r"@alias\s+(?P<original>(\w|[_])+)\s+(?P<alias>(\w|[_])+)") 13 | 14 | 15 | def handle_import(m: Match, depth: int = 0) -> str: 16 | path = m.group("path") 17 | logger.debug("Importing: {}".format(path)) 18 | with open(path, "r") as f: 19 | return precompile(f.read(), depth+1) 20 | 21 | 22 | def precompile(raw: str, depth: int = 0) -> str: 23 | if depth > MAX_DEPTH: 24 | raise RuntimeError( 25 | "Maximum depth limit has been reached. " 26 | "Please check if you don't have cyclical imports" 27 | ) 28 | 29 | raw = re.sub( 30 | r"@import\s+[\"'](?P<path>(\w|[/\-.])+)[\"']", 31 | partial(handle_import, depth=depth), 32 | raw 33 | ) 34 | 35 | for m in ALIAS_PATTERN.finditer(raw): 36 | # Delete alias definition 37 | full = m.group(0) 38 | raw = raw.replace(full, "") 39 | 40 | original = m.group("original") 41 | alias = m.group("alias") 42 | raw = re.sub(r"(?:(\s|->|{{))(?P<alias>{})([\(])".format(alias), r"\1{}(".format(original), raw) 43 | 44 | return raw 45 | -------------------------------------------------------------------------------- /rita/modules/pluralize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | try: 4 | import inflect 5 | except ImportError: 6 | logging.exception( 7 | "Pluralize module requires 'inflect' package to be installed. " 8 | "Install it and try again" 9 | ) 10 | sys.exit(1) 11 | 12 | from rita.macros import resolve_value 13 | from rita.utils import flatten, ExtendedOp 14 | 15 | 16 | def pluralizing(initial_list): 17 | """ 18 | For a list of nouns, it will return a list of the plurals and the initial nouns 19 | """ 20 | p = inflect.engine() 21 | plurals = [p.plural(word) for word in initial_list] 22 | return initial_list + plurals 23 | 24 | 25 | def PLURALIZE(*args, config, op=None): 26 | """ 27 | For a noun or a list of nouns, it will match any singular or plural word 28 | Usage for a single word, e.g.: 29 | PLURALIZE("car") 30 | Usage for lists, e.g.: 31 | vehicles = {"car", "bicycle", "ship"} 32 | PLURALIZE(vehicles) 33 | Will work even for regex or if the lemmatizer of spaCy makes an error 34 | Depends on the Python inflect package https://pypi.org/project/inflect/ 35 | """ 36 | if isinstance(args[0], list): 37 | initial_list = [resolve_value(arg, config=config) 38 | for arg in flatten(args)] 39 | else: 40 | initial_list = [args[0]] 41 | return "any_of", pluralizing(initial_list), ExtendedOp(op) 42 | -------------------------------------------------------------------------------- /rita/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | from types import GeneratorType 5 | 6 | from rita.config import with_config 7 | from rita.preprocess import preprocess_rules 8 | from rita.precompile import precompile 9 | from rita.utils import timer, Timer 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | __version__ = (0, 7, 4, os.getenv("VERSION_PATCH")) 15 | 16 | 17 | def get_version(): 18 | normalized = list([i for i in __version__ if i is not None]) 19 | if len(normalized) == 4: 20 | return
"{0}.{1}.{2}-{3}".format(*normalized) 21 | else: 22 | return "{0}.{1}.{2}".format(*normalized) 23 | 24 | 25 | @with_config 26 | def compile_string(raw, config, use_engine=None, **kwargs): 27 | from rita.parser import RitaParser 28 | t = Timer("Compilation") 29 | for k, v in kwargs.items(): 30 | config.set_variable(k, v) 31 | 32 | with timer("Parsing"): 33 | parser = RitaParser(config) 34 | parser.build() 35 | root = parser.parse(precompile(raw)) 36 | 37 | logger.debug(root) 38 | if use_engine: 39 | compile_rules = config.set_engine(use_engine) 40 | else: 41 | compile_rules = config.default_engine 42 | 43 | with timer("Preprocessing"): 44 | rules = list(preprocess_rules(root, config)) 45 | 46 | with timer("Compiling"): 47 | result = compile_rules(rules, config, **kwargs) 48 | 49 | if isinstance(result, GeneratorType): 50 | patterns = list(result) 51 | t.stop(debug=False) 52 | return patterns 53 | else: 54 | t.stop(debug=False) 55 | return result 56 | 57 | 58 | def compile(fname, use_engine=None, **kwargs): 59 | with open(fname, "r") as f: 60 | raw = f.read() 61 | 62 | return compile_string(raw, use_engine=use_engine, **kwargs) 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | 107 | # yacc-lex 108 | *.out 109 | parsetab.py -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | import rita 5 | 6 | from rita.shortcuts import setup_spacy 7 | 8 | 9 | def load_rules(rules_path): 10 | with open(rules_path, "r") as f: 11 | return f.read() 12 | 13 | 14 | def spacy_engine(rules, **kwargs): 15 | spacy = pytest.importorskip("spacy", minversion="2.1") 16 | nlp = spacy.load("en_core_web_sm") 17 | setup_spacy(nlp, rules_string=rules, override_ents=True, **kwargs) 18 | patterns = rita.compile_string(rules, **kwargs) 19 | print(patterns) 20 | 21 | def parse(text): 22 | doc = nlp(text) 23 | return list([(e.text, e.label_) for e in doc.ents]) 24 | return parse 25 | 26 | 27 | def standalone_engine(rules, **kwargs): 28 | parser = rita.compile_string(rules, use_engine="standalone", **kwargs) 29 | print(parser.patterns) 30 | 31 | def parse(text): 32 | results = list(parser.execute(text, include_submatches=False)) 33 | return list([(r["text"], r["label"]) for r in results]) 34 | return parse 35 | 36 | 37 | def rust_engine(rules, **kwargs): 38 | from rita.engine.translate_rust import load_lib 39 | lib = load_lib() 40 | if lib is None: 41 | pytest.skip("Missing rita-rust dynamic lib, skipping related tests") 42 | print("Trying to run: {}".format(rules)) 43 | parser = rita.compile_string(rules, use_engine="rust", **kwargs) 44 | print(parser.patterns) 45 | 46 | def parse(text): 47 | results = list(parser.execute(text, include_submatches=False)) 48 | return list([(r["text"], r["label"]) for r in results]) 49 | return parse 50 | 51 | 52 | def normalize_output(r): 53 | return re.sub(r"\s+", " ", r.strip().replace("\n", "")) 54 | 55 | 56 | def raw_compare(r1, r2): 57 | r1 = normalize_output(r1) 58 | r2 = normalize_output(r2) 59 | 60 | assert r1 == r2 61 | -------------------------------------------------------------------------------- /extra/sublimetext/RITA.sublime-syntax: -------------------------------------------------------------------------------- 1 | %YAML 1.2 2 | --- 3 | # http://www.sublimetext.com/docs/syntax.html 4 | name: Rita 5 | file_extensions: 6 | - rita 7 | scope: source.rita 8 | contexts: 9 | variables: 10 | - match: \b[a-z_]+\b 11 | scope: variable.parameter.rita 12 | keywords: 13 | - 
match: \b[A-Z_]+\b 14 | scope: keyword.control.rita 15 | 16 | - match: \( 17 | push: args 18 | 19 | - match: \) 20 | scope: invalid.illegal.stray-bracket-end 21 | 22 | main: 23 | - include: variables 24 | - match: '#' 25 | scope: punctuation.definition.comment.rita 26 | push: line_comment 27 | 28 | - match: "{" 29 | push: pattern 30 | 31 | - match: "}" 32 | scope: invalid.illegal.stray-bracket-end 33 | 34 | - match: -> 35 | push: mark 36 | 37 | - match: = 38 | push: assign_variable 39 | 40 | string: 41 | - meta_scope: string.quoted.double.rita 42 | - match: \\. 43 | scope: constant.character.escape.rita 44 | - match: '"' 45 | pop: true 46 | 47 | line_comment: 48 | - meta_scope: comment.line.rita 49 | - match: $ 50 | pop: true 51 | 52 | args: 53 | - include: variables 54 | - match: '"' 55 | push: string 56 | - match: ',' 57 | scope: punctuation.separator.comma.rita 58 | - match: \) 59 | pop: true 60 | 61 | pattern: 62 | - include: keywords 63 | - match: "}" 64 | pop: true 65 | 66 | assign_variable: 67 | - match: '"' 68 | push: string 69 | - match: ',' 70 | scope: punctuation.separator.comma.rita 71 | - match: "{" 72 | - match: "}" 73 | pop: true 74 | 75 | mark: 76 | - match: \bMARK\b 77 | scope: keyword.control.rita 78 | 79 | - match: '"' 80 | push: string 81 | 82 | - match: \( 83 | 84 | - match: \) 85 | pop: true 86 | 87 | -------------------------------------------------------------------------------- /rita/lexer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import ply.lex as lex 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class RitaLexer(object): 8 | tokens = [ 9 | "KEYWORD", 10 | "LITERAL", 11 | "NAME", 12 | "LBRACKET", 13 | "RBRACKET", 14 | "LPAREN", 15 | "RPAREN", 16 | "ARROW", 17 | "COMMA", 18 | "MODIF_QMARK", 19 | "MODIF_STAR", 20 | "MODIF_PLUS", 21 | "ASSIGN", 22 | "EXEC", 23 | "PIPE", 24 | ] 25 | 26 | literals = ["{", "}", "(", ")", '"', ",", "=", "!", "|"] 27 | 28 | t_ignore = " \t" 29 | t_ignore_COMMENT = r"\#.*" 30 | t_ARROW = "->" 31 | t_LBRACKET = "{" 32 | t_RBRACKET = "}" 33 | t_LPAREN = r"\(" 34 | t_RPAREN = r"\)" 35 | t_COMMA = "," 36 | t_MODIF_QMARK = r"\?" 37 | t_MODIF_STAR = r"\*" 38 | t_MODIF_PLUS = r"\+" 39 | t_EXEC = r"!" 
40 | t_ASSIGN = r"=" 41 | t_PIPE = r"\|" 42 | 43 | # Define a rule so we can track line numbers 44 | def t_newline(self, t): 45 | r"\n+" 46 | t.lexer.lineno += len(t.value) 47 | 48 | def t_KEYWORD(self, t): 49 | r"[A-Z_]{3,}" 50 | return t 51 | 52 | def t_LITERAL(self, t): 53 | r'("|\')(\\.|.)+?("|\')' 54 | t.value = t.value[1:-1] 55 | return t 56 | 57 | def t_NAME(self, t): 58 | r"\w+" 59 | return t 60 | 61 | def t_error(self, t): 62 | logger.error("Invalid Token: {}".format(t.value[0])) 63 | t.lexer.skip(1) 64 | 65 | def build(self, **kwargs): 66 | self.lexer = lex.lex(module=self, errorlog=logger, **kwargs) 67 | return self.lexer 68 | 69 | def tokenize(self, data): 70 | self.lexer.input(data) 71 | while True: 72 | t = self.lexer.token() 73 | if t is None: 74 | break 75 | yield t 76 | -------------------------------------------------------------------------------- /rita/modules/names.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from rita.macros import resolve_value 4 | from rita.utils import flatten, ExtendedOp 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | STOP_NAMES = {"von", "van", "de", "dos"} 10 | 11 | 12 | def trim_name(name): 13 | if name in STOP_NAMES: 14 | return name 15 | return name[0] + r"\." 16 | 17 | 18 | def trim_seniority(name): 19 | if name.lower() == "junior": 20 | return r"jr\." 21 | elif name.lower() == "senior": 22 | return r"sr\." 23 | else: 24 | return name 25 | 26 | 27 | def remove_empty(x): 28 | return x.strip() != "" 29 | 30 | 31 | def generate_names(initial_list): 32 | """ 33 | Generates variations of names 34 | Eg. {First Middle Last; First M. Last; F. M. Last} 35 | """ 36 | for name in initial_list: 37 | yield name.strip(), 38 | 39 | buff = name.strip().split(" ") 40 | if len(buff) == 2: 41 | yield trim_name(buff[0]), buff[1] 42 | elif len(buff) == 3: 43 | if buff[2].lower() == "junior" or buff[2].lower() == "senior": 44 | yield buff[0], buff[1], trim_seniority(buff[2]) 45 | else: 46 | yield buff[0], trim_name(buff[1]), buff[2] 47 | yield trim_name(buff[0]), trim_name(buff[1]), buff[2] 48 | 49 | 50 | def NAMES(*args, config, op=None): 51 | if isinstance(args[0], list): 52 | initial_list = [resolve_value(arg, config=config) 53 | for arg in flatten(args)] 54 | else: 55 | initial_list = [args[0]] 56 | 57 | names = list([" ".join(filter(remove_empty, names)) 58 | for names in generate_names(initial_list)]) 59 | logger.debug("Generated list of names: {}".format(names)) 60 | new_op = ExtendedOp(op) 61 | new_op.case_sensitive_override = True 62 | return "any_of", names, new_op 63 | -------------------------------------------------------------------------------- /rita/shortcuts.py: -------------------------------------------------------------------------------- 1 | import rita 2 | 3 | 4 | def setup_spacy(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): 5 | import spacy 6 | major, _, _ = spacy.__version__.split(".") 7 | if major == "2": 8 | return _spacy_v2(model, patterns, rules_path, rules_string, override_ents) 9 | elif major == "3": 10 | return _spacy_v3(model, patterns, rules_path, rules_string, override_ents) 11 | else: 12 | raise RuntimeError("Unsupported spaCy version: {}".format(major)) 13 | 14 | 15 | def _spacy_v2(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): 16 | from spacy.pipeline import EntityRuler 17 | ruler = EntityRuler(model, overwrite_ents=override_ents) 18 | if not patterns: 19 | if rules_path: 20 | patterns =
rita.compile(rules_path, use_engine="spacy") 21 | elif rules_string: 22 | patterns = rita.compile_string(rules_string, use_engine="spacy") 23 | else: 24 | raise RuntimeError("Please provide rules. Either `patterns`, `rules_path` or `rules_string`") 25 | 26 | ruler.add_patterns(patterns) 27 | else: 28 | ruler.from_disk(patterns) 29 | 30 | model.add_pipe(ruler) 31 | return model 32 | 33 | 34 | def _spacy_v3(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): 35 | ruler = model.add_pipe("entity_ruler", config={"overwrite_ents": override_ents, "validate": True}) 36 | if not patterns: 37 | if rules_path: 38 | patterns = rita.compile(rules_path, use_engine="spacy") 39 | elif rules_string: 40 | patterns = rita.compile_string(rules_string, use_engine="spacy") 41 | else: 42 | raise RuntimeError("Please provide rules. Either `patterns`, `rules_path` or `rules_string`") 43 | 44 | ruler.add_patterns(patterns) 45 | else: 46 | ruler.from_disk(patterns) 47 | return model 48 | -------------------------------------------------------------------------------- /docs/macros.md: -------------------------------------------------------------------------------- 1 | # Macros 2 | 3 | `ARG = Literal | Macro | Variable` 4 | 5 | `ARGS = Array of ARG` 6 | 7 | | Name | Arguments | Modifiers | Description | 8 | |---------|----------------------|-----------|---------------------------------------------| 9 | | ANY |`None` |`?` `*` `+`|Placeholder for any kind of text | 10 | | WORD |`ARG`(Optional) |`?` `*` `+`|Placeholder for any kind of word | 11 | | NUM |`ARG`(Optional) |`?` `*` `+`|Placeholder for any kind of number | 12 | | PUNCT |`None` |`?` `*` `+`|Placeholder for punctuation | 13 | | POS |`ARG` |`?` `*` `+`|Match by Part of Speech | 14 | | LEMMA |`ARG` |`?` `*` `+`|Match by Lemma | 15 | | ENTITY |`ARG` |`?` `*` `+`|Match by Entity Type, eg. `PERSON` | 16 | | PATTERN |`ARGS` |`None` |Wrapper for multiple rules. **Covered by standard syntax, can be ignored** | 17 | | IN_LIST |`ARGS` |`?` `*` `+`|Match any of the defined values | 18 | | PREFIX |`ARGS` |`None` |Adds a prefix to the next word or list | 19 | | LOAD |`ARG` |`None` |Load an array from a file. Each line = new element| 20 | | MARK |`ARG` |`None` |Mark the given pattern with a label | 21 | | ASSIGN |`Literal`, `ARG` |`None` |Assign a value to a variable. **Covered by standard syntax, can be ignored** | 22 | | EXEC |`ARG` |`None` |Execute a macro. **Covered by standard syntax, can be ignored** | 23 | | IMPORT |`Literal` |`None` |Import a custom module, allowing custom macros to be executed| 24 | | CONFIG | `Literal`, `Literal` |`None` |Allows modifying a config value |
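To make the table concrete, here is a small sketch combining several of these macros, adapted from `examples/color-car.rita` in this repository:

```python
import rita

# LOAD, IN_LIST, WORD and MARK working together
rules = """
cars = LOAD("examples/cars.txt")
colors = {"red", "green", "blue", "white", "black"}

{IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR")
{IN_LIST(cars), WORD+} -> MARK("CAR_MODEL")
"""

patterns = rita.compile_string(rules, use_engine="spacy")
```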
-------------------------------------------------------------------------------- /docs/engines.md: -------------------------------------------------------------------------------- 1 | # Engines 2 | 3 | In RITA, an `engine` is the system the rules are compiled to, and which does the heavy lifting after that. 4 | 5 | Currently there are three engines: 6 | 7 | ## spaCy 8 | 9 | Activated by using `rita.compile(<path>, use_engine="spacy")` 10 | 11 | Using this engine, all of the RITA rules will be compiled into spaCy patterns, which can be natively used by spaCy in various scenarios. 12 | Most often - to improve NER (Named Entity Recognition) by adding additional entities derived from your rules 13 | 14 | It requires the spaCy package to be installed (`pip install spacy`), and to actually use it later a language model needs to be downloaded (`python -m spacy download <language>`) 15 | 16 | ## Standalone 17 | 18 | Activated by using `rita.compile(<path>, use_engine="standalone")`. It compiles into pure regex and can be used with zero dependencies. 19 | By default, it uses Python's `re` library. Since version `0.5.10`, you can provide a custom regex implementation to use, 20 | e.g. the regex package: `rita.compile(<path>, use_engine="standalone", regex_impl=regex)` 21 | 22 | It is very lightweight and very fast (compared to spaCy), however it lacks some functionality which only a proper language model can bring: 23 | - Patterns by entity (PERSON, ORGANIZATION, etc.) 24 | - Patterns by Lemmas 25 | - Patterns by POS (Part Of Speech) 26 | 27 | Only generic things, like WORD and NUMBER, can be matched. 28 | 29 | 30 | ## Rust (new in `0.6.0`) 31 | 32 | Only an interface exists inside this codebase; the engine itself is proprietary. 33 | 34 | In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code, and that provides a large performance boost. 35 | It is proprietary because there are various caveats: the engine itself is a bit more fragile and needs to be tinkered with to be optimized for a very specific case 36 | (e.g. few long texts with many matches vs a lot of short texts with few matches).
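The engines differ in what `compile` hands back; a sketch of both paths, based on the helpers in `tests/utils.py`:

```python
import rita

rules = """
{WORD("good"), WORD("value")}->MARK("CHEAP_PHONE")
"""

# spaCy engine: a list of patterns for spaCy's EntityRuler
patterns = rita.compile_string(rules, use_engine="spacy")

# Standalone engine: a ready regex-backed parser
# (pass regex_impl=regex here to swap in the third-party regex package)
parser = rita.compile_string(rules, use_engine="standalone")
for match in parser.execute("This phone is a good value"):
    print(match["text"], match["label"])
```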
name = "Deprecations" 63 | showcontent = true 64 | 65 | [[tool.towncrier.type]] 66 | directory = "feature" 67 | name = "Features" 68 | showcontent = true 69 | 70 | [[tool.towncrier.type]] 71 | directory = "fix" 72 | name = "Fix" 73 | showcontent = true 74 | -------------------------------------------------------------------------------- /tests/test_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import tempfile 5 | 6 | import pytest 7 | import rita 8 | 9 | from rita.run import main 10 | from rita.shortcuts import setup_spacy 11 | 12 | 13 | def test_help(mocker): 14 | sys.argv = [ 15 | "rita-dsl" 16 | "--help" 17 | ] 18 | 19 | 20 | def test_debug(mocker): 21 | sys.argv = [ 22 | "rita-dsl" 23 | "--debug" 24 | ] 25 | 26 | 27 | def test_simple_compile(mocker): 28 | sys.argv = [ 29 | "rita-dsl", 30 | "-f", 31 | "examples/cheap-phones.rita", 32 | "output.jsonl" 33 | ] 34 | main() 35 | 36 | 37 | def test_simple_spacy_compile(mocker): 38 | sys.argv = [ 39 | "rita-dsl", 40 | "-f", 41 | "examples/cheap-phones.rita", 42 | "--engine=spacy", 43 | "output.jsonl" 44 | ] 45 | main() 46 | 47 | 48 | def test_simple_standalone_compile(mocker): 49 | sys.argv = [ 50 | "rita-dsl", 51 | "-f", 52 | "examples/cheap-phones.rita", 53 | "--engine=standalone", 54 | "output.jsonl" 55 | ] 56 | main() 57 | 58 | 59 | def test_shortcuts_spacy_inline(): 60 | spacy = pytest.importorskip("spacy", minversion="2.1") 61 | nlp = spacy.load("en_core_web_sm") 62 | rules = """ 63 | {WORD("TEST")}->MARK("TEST") 64 | """ 65 | setup_spacy(nlp, rules_string=rules) 66 | 67 | 68 | def test_shortcuts_spacy_file(): 69 | spacy = pytest.importorskip("spacy", minversion="2.1") 70 | nlp = spacy.load("en_core_web_sm") 71 | setup_spacy(nlp, rules_path="examples/color-car.rita") 72 | 73 | 74 | def test_shortcuts_spacy_compiled(): 75 | spacy = pytest.importorskip("spacy", minversion="2.1") 76 | nlp = spacy.load("en_core_web_sm") 77 | tmp = tempfile.NamedTemporaryFile(mode="w", encoding="UTF-8", suffix=".jsonl", delete=False) 78 | patterns = rita.compile("examples/color-car.rita") 79 | for pattern in patterns: 80 | tmp.write(json.dumps(pattern) + "\n") 81 | tmp.flush() 82 | tmp.close() 83 | setup_spacy(nlp, patterns=tmp.name) 84 | os.unlink(tmp.name) 85 | 86 | 87 | def test_shortcuts_spacy_giving_no_rules(): 88 | spacy = pytest.importorskip("spacy", minversion="2.1") 89 | nlp = spacy.load("en_core_web_sm") 90 | with pytest.raises(RuntimeError): 91 | setup_spacy(nlp) 92 | -------------------------------------------------------------------------------- /docs/syntax.md: -------------------------------------------------------------------------------- 1 | # Syntax guide 2 | 3 | ## The basic building blocks 4 | 5 | You have `LITERAL` which is any kind of string behind quotes, eg.: 6 | 7 | ``` 8 | "Hello world!" 9 | ``` 10 | 11 | You have `MACRO` which is main backbone of everything. 12 | 13 | Using parenthesis, you can pass arguments to macro: 14 | ``` 15 | LOAD("path/filename.txt") # Load a text file 16 | ``` 17 | 18 | if macro doesn't require any, you can simply call it 19 | 20 | ``` 21 | WORD # Declare, that you'll have any kind of word 22 | ``` 23 | 24 | Also, macro can have modifier (if it supports it) 25 | 26 | ``` 27 | WORD+ # Declare, that you'll have 1..N words 28 | WORD* # Declare, that you'll have 0..N words 29 | WORD? # Declare, that you'll have 1 or no words 30 | WORD! 
# Declare that you want to ignore this word 31 | ``` 32 | 33 | More examples: 34 | 35 | ``` 36 | WORD("cat") # Declare that you'll have the exact word `cat` 37 | 38 | {"red", "green", "blue"} # Declare an array of words 39 | ``` 40 | 41 | **NOTE** All of the MACROS are spelled in capital letters 42 | 43 | And finally you have `VARIABLE`. First you must declare it, and later you can use it just by spelling its name 44 | 45 | ``` 46 | CarModels = LOAD("path/models.txt") 47 | 48 | # ... 49 | 50 | IN_LIST(CarModels) # Check if a token is inside the list of car models we provided 51 | ``` 52 | 53 | If used directly inside a macro, an array can be written with simple commas 54 | 55 | ``` 56 | IN_LIST("audi", "toyota", "bmw", "honda", "nissan", "ford") 57 | ``` 58 | 59 | 60 | For our declarations to make any sense, we need to build an expression. More on that in the next topic. 61 | 62 | ## Expressions 63 | 64 | This language is built on expressions. 65 | One expression means: 66 | 67 | a) A single rule defining an entity 68 | 69 | b) A single variable declaration 70 | 71 | A rule expression ends with an arrow `->`, e.g.: 72 | 73 | `WORD("something") -> MARK("SOMETHING_LABEL")` 74 | 75 | with the MACRO `MARK` we're assigning a label to the rule 76 | 77 | A variable declaration expression ends with an equals sign `=`, e.g.: 78 | ``` 79 | a = "Apple" 80 | ``` 81 | 82 | When building a rule, you may want to combine several rules into one; you can use the array builder for that: 83 | 84 | ``` 85 | {IN_LIST({"red", "green", "blue", "white", "black", "silver", "brown"}), WORD("car")} -> MARK("CAR_COLOR") 86 | ``` 87 | 88 | we're saying: `If any of these color words is present in the text and is followed by the word "car", we assume this part can be labeled as "CAR_COLOR"` 89 | 90 | ## Logical variants 91 | 92 | You can say that your rule expects either `word1` or `word2`. Usually this would be achieved by writing two separate rules, but there's an easier way: 93 | ``` 94 | {WORD("word1")|WORD("word2")} 95 | ``` 96 | 97 | The pipe character (`|`) marks a logical `OR`, meaning that either the right or the left side can be matched. It works only on the surface level; if you want nested logic - write separate rules.
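To see what an expression actually compiles to, a small rule file can be fed through the compiler and the result inspected; a sketch (the exact output depends on the engine and configuration):

```python
import rita

rules = """
{WORD("word1")|WORD("word2")} -> MARK("EITHER_WORD")
"""

# With the spaCy engine, each rule expression becomes one pattern dict
# suitable for spaCy's EntityRuler
for pattern in rita.compile_string(rules, use_engine="spacy"):
    print(pattern)
```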
20 | 21 | Usage: 22 | ``` 23 | !IMPORT("rita.modules.fuzzy") 24 | 25 | FUZZY("squirrel") -> MARK("CRITTER") 26 | ``` 27 | 28 | ## Pluralize 29 | 30 | Takes a list of words (or a single word) and creates the plural version of each. 31 | 32 | Requires the `inflect` library (`pip install inflect`). Works only on English words. 33 | 34 | Usage: 35 | 36 | ``` 37 | !IMPORT("rita.modules.pluralize") 38 | 39 | vehicles={"car", "motorbike", "bicycle", "ship", "plane"} 40 | {NUM, PLURALIZE(vehicles)}->MARK("VEHICLES") 41 | ``` 42 | 43 | ## Tag 44 | 45 | This module offers two new macros: `TAG` and `TAG_WORD`. 46 | 47 | 48 | `TAG` is used for generating POS/TAG patterns based on a Regex, 49 | e.g. `TAG("^NN|^JJ")` for nouns or adjectives. 50 | 51 | Works only with the spaCy engine 52 | 53 | Usage: 54 | 55 | ``` 56 | !IMPORT("rita.modules.tag") 57 | 58 | {WORD*, TAG("^NN|^JJ")}->MARK("TAGGED_MATCH") 59 | ``` 60 | 61 | `TAG_WORD` is for generating TAG patterns with a word or a list. 62 | 63 | e.g. match "proposed" only when it is a verb in the sentence (and not an adjective): 64 | 65 | ``` 66 | !IMPORT("rita.modules.tag") 67 | 68 | TAG_WORD("^VB", "proposed") 69 | ``` 70 | 71 | or e.g. match a list of words only when they are verbs: 72 | 73 | ``` 74 | !IMPORT("rita.modules.tag") 75 | 76 | words = {"perceived", "proposed"} 77 | {TAG_WORD("^VB", words)?}->MARK("LABEL") 78 | ``` 79 | 80 | ## Orth 81 | 82 | Ignores the case-insensitive configuration and checks words as written - 83 | that is, case-sensitively, even if the configuration is case-insensitive. 84 | Especially useful for acronyms and proper names. 85 | 86 | Works only with the spaCy engine 87 | 88 | Usage: 89 | 90 | ``` 91 | !IMPORT("rita.modules.orth") 92 | 93 | {ORTH("IEEE")}->MARK("TAGGED_MATCH") 94 | ``` 95 | 96 | ## Regex 97 | 98 | Matches words based on a Regex pattern, 99 | e.g. all words that start with an 'a' would be 100 | `REGEX("^a")` 101 | 102 | ``` 103 | !IMPORT("rita.modules.regex") 104 | 105 | {REGEX("^a")}->MARK("TAGGED_MATCH") 106 | ``` 107 | 108 | ## Names 109 | 110 | Takes a list of full person names (First + Last, or First Middle Last) and generates shortened variations, 111 | eg. F. Last, First M. Last, F. M. Last etc. 
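A rough sketch of what this could look like end-to-end (a minimal sketch, assuming the standalone engine; the exact shortened variants the module generates may differ):

```python
import rita

# Minimal sketch - assumes the standalone engine; the module is expected to
# expand "Jon Jones" into shortened variants such as "J. Jones".
patterns = rita.compile_string("""
!IMPORT("rita.modules.names")

names = {"Jon Jones"}
NAMES(names)->MARK("NAME_MATCH")
""", use_engine="standalone")

# A shortened form should be labeled as well:
print(list(patterns.execute("The belt is still held by J. Jones.")))
```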
112 | 113 | ``` 114 | !IMPORT("rita.modules.names") 115 | 116 | names = {"Roy Jones junior", "Roy Jones senior", "Juan-Claude van Damme", "Jon Jones"} 117 | NAMES(names)->MARK("NAME_MATCH") 118 | ``` 119 | 120 | Useful when matching against a fixed set of names -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Rita Logo](docs/assets/logo-2.png) 2 | 3 | # RITA DSL 4 | 5 | [![Documentation Status](https://readthedocs.org/projects/rita-dsl/badge/?version=latest)](http://rita-dsl.readthedocs.io/?badge=latest) 6 | [![codecov](https://codecov.io/gh/zaibacu/rita-dsl/branch/master/graph/badge.svg)](https://codecov.io/gh/zaibacu/rita-dsl) 7 | [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) 8 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/zaibacu/rita-dsl/graphs/commit-activity) 9 | [![PyPI version fury.io](https://badge.fury.io/py/rita-dsl.svg)](https://pypi.python.org/pypi/rita-dsl/) 10 | [![PyPI download month](https://img.shields.io/pypi/dm/rita-dsl.svg)](https://pypi.python.org/pypi/rita-dsl/) 11 | [![GitHub license](https://img.shields.io/github/license/zaibacu/rita-dsl.svg)](https://github.com/zaibacu/rita-dsl/blob/master/LICENSE) 12 | 13 | This is a language, loosely based on [Apache UIMA RUTA](https://uima.apache.org/ruta.html), focused on writing manual language rules, which compile into either [spaCy](https://github.com/explosion/spaCy)-compatible patterns or pure regex. These patterns can be used for [manual NER](https://spacy.io/api/entityruler) as well as in other processes, like retokenizing and pure matching 14 | 15 | ## An Introduction Video 16 | [![Intro](https://img.youtube.com/vi/GScerMeWz68/0.jpg)](https://www.youtube.com/watch?v=GScerMeWz68) 17 | 18 | ## Links 19 | - [Website](https://rita-dsl.io/) 20 | - [Simple Chat bot example](https://repl.it/talk/share/Simple-chatbot-done-with-Rita/53471) 21 | - [Documentation](http://rita-dsl.readthedocs.io/) 22 | - [QuickStart](https://rita-dsl.readthedocs.io/en/latest/quickstart/) 23 | - [Language Syntax Plugin for IntelijJ based IDEs](https://plugins.jetbrains.com/plugin/15011-rita-language) 24 | 25 | ## Support 26 | 27 | [![reddit](https://img.shields.io/reddit/subreddit-subscribers/ritaDSL?style=social)](https://www.reddit.com/r/ritaDSL/) 28 | [![Gitter](https://badges.gitter.im/rita-dsl/community.svg)](https://gitter.im/rita-dsl/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) 29 | 30 | If you need consulting or some custom work done, you can [Contact Us](mailto:info@griaustinis.lt) 31 | 32 | ## Install 33 | 34 | `pip install rita-dsl` 35 | 36 | ## Simple Rules example 37 | 38 | ```python 39 | rules = """ 40 | cuts = {"fitted", "wide-cut"} 41 | lengths = {"short", "long", "calf-length", "knee-length"} 42 | fabric_types = {"soft", "airy", "crinkled"} 43 | fabrics = {"velour", "chiffon", "knit", "woven", "stretch"} 44 | 45 | {IN_LIST(cuts)?, IN_LIST(lengths), WORD("dress")}->MARK("DRESS_TYPE") 46 | {IN_LIST(lengths), IN_LIST(cuts), WORD("dress")}->MARK("DRESS_TYPE") 47 | {IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK("DRESS_FABRIC") 48 | """ 49 | ``` 50 | 51 | ### Loading in spaCy 52 | ```python 53 | import spacy 54 | from rita.shortcuts import setup_spacy 55 | 56 | 57 | nlp = spacy.load("en") 58 | setup_spacy(nlp, rules_string=rules) 59 | ``` 60 | 61 | And 
using it: 62 | ```python 63 | >>> r = nlp("She was wearing a short wide-cut dress") 64 | >>> [{"label": e.label_, "text": e.text} for e in r.ents] 65 | [{'label': 'DRESS_TYPE', 'text': 'short wide-cut dress'}] 66 | ``` 67 | 68 | ### Loading using Regex (standalone) 69 | ```python 70 | import rita 71 | 72 | patterns = rita.compile_string(rules, use_engine="standalone") 73 | ``` 74 | 75 | And using it: 76 | ```python 77 | >>> list(patterns.execute("She was wearing a short wide-cut dress")) 78 | [{'end': 38, 'label': 'DRESS_TYPE', 'start': 18, 'text': 'short wide-cut dress'}] 79 | ``` 80 | -------------------------------------------------------------------------------- /rita/config.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import logging 3 | from importlib import import_module 4 | from typing import Any, Callable 5 | 6 | try: 7 | from rita.engine.translate_spacy import compile_rules as spacy_engine 8 | except ImportError: 9 | pass 10 | 11 | from rita.engine.translate_standalone import compile_rules as standalone_engine 12 | from rita.engine.translate_rust import compile_rules as rust_engine 13 | 14 | from rita.utils import SingletonMixin 15 | from rita.types import opts, Rules 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | CompileFN = Callable[[Rules, "Config", opts], Any] 21 | 22 | 23 | class Config(SingletonMixin): 24 | def __init__(self): 25 | self.available_engines = [] 26 | self.engines_by_key = {} 27 | self.current_engine = None 28 | 29 | try: 30 | self.register_engine(1, "spacy", spacy_engine) 31 | except NameError: 32 | # spacy_engine is not imported 33 | pass 34 | self.register_engine(2, "standalone", standalone_engine) 35 | self.register_engine(3, "rust", rust_engine) 36 | 37 | def register_engine(self, priority: int, key: str, compile_fn: CompileFN) -> None: 38 | self.available_engines.append((priority, key, compile_fn)) 39 | self.engines_by_key[key] = compile_fn 40 | self.available_engines.sort(key=operator.itemgetter(0))  # sort in place, keeping engines ordered by priority 41 | 42 | @property 43 | def default_engine(self) -> CompileFN: 44 | (_, key, compile_fn) = self.available_engines[0] 45 | self.current_engine = key 46 | return compile_fn 47 | 48 | def set_engine(self, key: str) -> CompileFN: 49 | self.current_engine = key 50 | return self.engines_by_key[key] 51 | 52 | @property 53 | def list_branching(self) -> bool: 54 | if self.current_engine == "spacy": 55 | return True 56 | 57 | return False 58 | 59 | 60 | class SessionConfig(object): 61 | def __init__(self): 62 | self._root = Config() 63 | self.modules = [] 64 | # Default config 65 | self._data = { 66 | "ignore_case": True, 67 | "implicit_punct": True, 68 | "deaccent": True, 69 | "implicit_hyphon": False, 70 | } 71 | self.variables = {} 72 | self._nested_group_count = 0 73 | 74 | def register_module(self, mod_name: str) -> None: 75 | logger.debug("Importing module: {}".format(mod_name)) 76 | self.modules.append(import_module(mod_name)) 77 | 78 | def set_variable(self, k: str, v: Any) -> None: 79 | self.variables[k] = v 80 | 81 | def get_variable(self, k: str) -> Any: 82 | return self.variables[k] 83 | 84 | def __getattr__(self, name): 85 | if name == "_root": 86 | return self._root 87 | 88 | elif name in self._data: 89 | return self._data[name] 90 | 91 | return getattr(self._root, name) 92 | 93 | def set_config(self, k, v): 94 | # Handle booleans first 95 | if v.upper() in ["1", "T", "Y"]: 96 | self._data[k] = True 97 | elif v.upper() in ["0", "F", "N"]: 98 | self._data[k] = False 99 | 
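# Anything that is not a recognized boolean token falls through and is stored verbatim: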
else: 100 | self._data[k] = v 101 | 102 | def new_nested_group_id(self): 103 | self._nested_group_count += 1 104 | return self._nested_group_count 105 | 106 | 107 | def with_config(fn): 108 | def wrapper(*args, **kwargs): 109 | config = SessionConfig() 110 | return fn(*args, config=config, **kwargs) 111 | 112 | return wrapper 113 | -------------------------------------------------------------------------------- /tests/test_lexer.py: -------------------------------------------------------------------------------- 1 | from rita.lexer import RitaLexer 2 | 3 | 4 | def test_tokenize_any_macro_wo_args_wo_type(): 5 | lex = RitaLexer() 6 | lex.build() 7 | 8 | tokens = list(lex.tokenize("ANY")) 9 | assert len(tokens) == 1 10 | token = tokens[0] 11 | assert token.type == "KEYWORD" 12 | assert token.value == "ANY" 13 | 14 | 15 | def test_tokenize_any_macro_wo_args_w_type(): 16 | lex = RitaLexer() 17 | lex.build() 18 | 19 | tokens = list(lex.tokenize('ANY -> MARK("Placeholder")')) 20 | assert len(tokens) == 6 21 | t0 = tokens[0] 22 | assert t0.type == "KEYWORD" 23 | assert t0.value == "ANY" 24 | 25 | assert tokens[1].type == "ARROW" 26 | 27 | t2 = tokens[2] 28 | 29 | assert t2.type == "KEYWORD" 30 | assert t2.value == "MARK" 31 | 32 | t3 = tokens[4] 33 | 34 | assert t3.type == "LITERAL" 35 | assert t3.value == "Placeholder" 36 | 37 | 38 | def test_tokenize_assign_literal(): 39 | lex = RitaLexer() 40 | lex.build() 41 | 42 | tokens = list(lex.tokenize('Test = "Test"')) 43 | 44 | assert len(tokens) == 3 45 | 46 | assert tokens[0].type == "NAME" 47 | assert tokens[1].type == "ASSIGN" 48 | assert tokens[2].type == "LITERAL" 49 | 50 | 51 | def test_tokenize_assign_macro(): 52 | lex = RitaLexer() 53 | lex.build() 54 | 55 | tokens = list(lex.tokenize('Test = WORD("Test")')) 56 | 57 | assert len(tokens) == 6 58 | 59 | assert tokens[0].type == "NAME" 60 | assert tokens[1].type == "ASSIGN" 61 | assert tokens[2].type == "KEYWORD" 62 | assert tokens[4].type == "LITERAL" 63 | 64 | 65 | def test_tokenize_exec_macro(): 66 | lex = RitaLexer() 67 | lex.build() 68 | tokens = list(lex.tokenize('!IMPORT("module.test")')) 69 | assert len(tokens) == 5 70 | assert tokens[0].type == "EXEC" 71 | assert tokens[1].type == "KEYWORD" 72 | assert tokens[3].type == "LITERAL" 73 | 74 | 75 | def test_tokenize_two_exec_macros(): 76 | lex = RitaLexer() 77 | lex.build() 78 | tokens = list( 79 | lex.tokenize( 80 | """ 81 | !CONFIG("setting.1", "1") 82 | !CONFIG("setting.2", "0") 83 | """ 84 | ) 85 | ) 86 | assert len(tokens) == 14 87 | assert tokens[0].type == "EXEC" 88 | assert tokens[1].type == "KEYWORD" 89 | assert tokens[3].type == "LITERAL" 90 | assert tokens[5].type == "LITERAL" 91 | 92 | assert tokens[7].type == "EXEC" 93 | assert tokens[8].type == "KEYWORD" 94 | assert tokens[10].type == "LITERAL" 95 | assert tokens[12].type == "LITERAL" 96 | 97 | 98 | def test_tokenize_list_w_one_item(): 99 | lex = RitaLexer() 100 | lex.build() 101 | 102 | tokens = list( 103 | lex.tokenize( 104 | """ 105 | members = { "first" } 106 | """ 107 | ) 108 | ) 109 | 110 | assert tokens[0].type == "NAME" 111 | assert tokens[1].type == "ASSIGN" 112 | assert tokens[3].type == "LITERAL" 113 | 114 | 115 | def test_tokenize_variable_w_escape(): 116 | lex = RitaLexer() 117 | lex.build() 118 | 119 | tokens = list( 120 | lex.tokenize(r'WORD("Hello \"World\"") -> MARK("GREETING")') 121 | ) 122 | 123 | print(tokens[2]) 124 | 125 | assert tokens[0].type == "KEYWORD" 126 | assert tokens[2].type == "LITERAL" 127 | assert tokens[4].type == "ARROW" 128 | assert 
tokens[5].type == "KEYWORD" 129 | 130 | 131 | def test_pattern_in_variable(): 132 | lex = RitaLexer() 133 | lex.build() 134 | 135 | tokens = list( 136 | lex.tokenize(r'COMPLEX_NUMBER = {NUM+, WORD("/")?, NUM}') 137 | ) 138 | 139 | assert len(tokens) == 14 140 | -------------------------------------------------------------------------------- /rita/macros.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import types 3 | 4 | from rita.utils import flatten, ExtendedOp 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def resolve_value(obj, config): 10 | logger.debug("Resolving value: {0}".format(obj)) 11 | 12 | if isinstance(obj, str): 13 | return obj 14 | 15 | elif isinstance(obj, tuple): 16 | return obj 17 | 18 | elif isinstance(obj, list): 19 | return obj 20 | 21 | elif isinstance(obj, types.GeneratorType): 22 | return "either", list(obj), ExtendedOp(None) 23 | 24 | return obj(config=config) 25 | 26 | 27 | def ANY(config, op=None): 28 | return "any", None, ExtendedOp(op) 29 | 30 | 31 | def PUNCT(config, op=None): 32 | return "punct", None, ExtendedOp(op) 33 | 34 | 35 | def MARK(type_, obj, config, op=None): 36 | return { 37 | "label": resolve_value(type_, config=config), 38 | "data": resolve_value(obj, config=config) 39 | } 40 | 41 | 42 | def LOAD(*args, config): 43 | fpath = resolve_value(args[0], config=config) 44 | with open(fpath, "r") as f: 45 | return list([line.strip() for line in f.readlines()]) 46 | 47 | 48 | def ASSIGN(k, v, config, op=None): 49 | logger.debug("Assigning: {0} -> {1}".format(k, v)) 50 | config.set_variable(k, resolve_value(v, config=config)) 51 | 52 | 53 | def IN_LIST(*args, config, op=None): 54 | return "any_of", [resolve_value(arg, config=config) 55 | for arg in flatten(args)], ExtendedOp(op) 56 | 57 | 58 | def PATTERN(*args, config, op=None): 59 | context = [] 60 | for arg in args: 61 | result = resolve_value(arg, config=config) 62 | if isinstance(result, list): 63 | context.append(NESTED(result, config, op)) 64 | else: 65 | context.append(result) 66 | 67 | return context 68 | 69 | 70 | def NESTED(children, config, op=None): 71 | return "nested", children, op 72 | 73 | 74 | def WORD(*args, config, op=None): 75 | if len(args) == 1: 76 | literal = resolve_value(args[0], config=config) 77 | return "value", literal, ExtendedOp(op) 78 | elif len(args) == 0: 79 | return "regex", r"((\w|['_-])+)", ExtendedOp(op) 80 | 81 | 82 | def NUM(*args, config, op=None): 83 | if len(args) == 1: 84 | literal = resolve_value(args[0], config=config) 85 | return "value", literal, ExtendedOp(op) 86 | elif len(args) == 0: 87 | return "regex", r"((\d+[\.,]\d+)|(\d+))", ExtendedOp(op) 88 | 89 | 90 | def POS(*args, config, op=None): 91 | if len(args) == 1: 92 | return "pos", resolve_value(args[0], config=config), ExtendedOp(op) 93 | else: 94 | return "pos", [resolve_value(arg, config=config) for arg in args], ExtendedOp(op) 95 | 96 | 97 | def LEMMA(name, config, op=None): 98 | return "lemma", resolve_value(name, config=config), ExtendedOp(op) 99 | 100 | 101 | def ENTITY(*args, config, op=None): 102 | if len(args) == 1: 103 | return "entity", resolve_value(args[0], config=config), ExtendedOp(op) 104 | else: 105 | return "entity", [resolve_value(arg, config=config) for arg in args], ExtendedOp(op) 106 | 107 | 108 | def PREFIX(name, config, op=None): 109 | return "prefix", resolve_value(name, config=config), ExtendedOp(op) 110 | 111 | 112 | def IMPORT(module, config): 113 | mod_name = resolve_value(module, config=config) 114 | 
config.register_module(mod_name) 115 | 116 | 117 | def CONFIG(setting, value, config): 118 | logger.debug("Config {0} -> {1}".format(setting, value)) 119 | config.set_config(setting, resolve_value(value, config=config)) 120 | 121 | 122 | def EXEC(obj, config): 123 | return resolve_value(obj, config=config) 124 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | Install it via `pip install rita-dsl` 3 | 4 | You can start defining rules by creating a file with the extension `*.rita` 5 | 6 | Below is a complete example which can be used as a reference point 7 | 8 | ``` 9 | cars = LOAD("examples/cars.txt") # Load items from file 10 | colors = {"red", "green", "blue", "white", "black"} # Declare items inline 11 | 12 | {IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR") # If first token is in list `colors` and second one is word `car`, label it 13 | 14 | {IN_LIST(cars), WORD+} -> MARK("CAR_MODEL") # If first token is in list `cars` and is followed by 1..N words, label it 15 | 16 | {ENTITY("PERSON"), LEMMA("like"), WORD} -> MARK("LIKED_ACTION") # If first token is a Person, followed by a word with lemma `like` and then any word, label it 17 | ``` 18 | 19 | Now you can compile these rules: `rita -f <your-rules>.rita output.jsonl` 20 | 21 | # Using compiled rules 22 | 23 | ## spaCy backend 24 | 25 | ### NEW in 0.4.0: Shortcuts to simplify life: 26 | ``` 27 | import spacy 28 | from rita.shortcuts import setup_spacy 29 | 30 | nlp = spacy.load("en_core_web_sm") 31 | setup_spacy(nlp, ...) 32 | ``` 33 | 34 | If compiling rules from a string: 35 | `setup_spacy(nlp, rules_string=rules)` 36 | If loading rules from a `.rita` file: 37 | `setup_spacy(nlp, rules_path="examples/color-car.rita")` 38 | If loading from spaCy compiled rules: 39 | `setup_spacy(nlp, patterns="rules.jsonl")` 40 | 41 | ### Doing it manually 42 | ```python 43 | import spacy 44 | from spacy.pipeline import EntityRuler 45 | 46 | nlp = spacy.load("en_core_web_sm") 47 | ruler = EntityRuler(nlp, overwrite_ents=True) 48 | ruler.from_disk("output.jsonl") 49 | nlp.add_pipe(ruler) 50 | ``` 51 | 52 | Every time you parse text with spaCy, it will run the usual workflow and apply these rules 53 | 54 | ```python 55 | text = """ 56 | Johny Silver was driving a red car. It was BMW X6 Mclass. Johny likes driving it very much. 
57 | """ 58 | 59 | doc = nlp(text) 60 | 61 | entities = [(e.text, e.label_) for e in doc.ents] 62 | print(entities) 63 | 64 | assert entities[0] == ("Johny Silver", "PERSON") # Normal NER 65 | assert entities[1] == ("red car", "CAR_COLOR") # Our first rule 66 | assert entities[2] == ("BMW X6 Mclass", "CAR_MODEL") # Our second rule 67 | assert entities[3] == ("Johny likes driving", "LIKED_ACTION") # Our third rule 68 | ``` 69 | 70 | Alternativelly, if `rita` is used as a dependency in project and you prefer to compile rules dynamically, you can do: 71 | 72 | ```python 73 | import rita 74 | import spacy 75 | from spacy.pipeline import EntityRuler 76 | 77 | nlp = spacy.load("en_core_web_sm") 78 | ruler = EntityRuler(nlp, overwrite_ents=True) 79 | 80 | patterns = rita.compile("examples/color-car.rita") 81 | 82 | ruler.add_patterns(patterns) 83 | nlp.add_pipe(ruler) 84 | ``` 85 | 86 | If you don't want to use file to store rules, they can be compiled directly from string 87 | 88 | ```python 89 | patterns = rita.compile_string(""" 90 | {WORD("Hello"), WORD("World")}->MARK("GREETING") 91 | """) 92 | ``` 93 | 94 | 95 | ## Standalone Version 96 | 97 | While it is highly recommended to use it with spaCy as a base, there can be cases when pure python regex is the only option. 98 | 99 | You can pass rule compilation function explicitly. This concrete function will build regular expressions and create executor which accepts raw text and returns list of results. 100 | 101 | Here's a test covering this case 102 | 103 | ```python 104 | def test_standalone_simple(): 105 | patterns = rita.compile("examples/simple-match.rita", use_engine="standalone") 106 | results = list(patterns.execute("Donald Trump was elected President in 2016 defeating Hilary Clinton.")) 107 | assert len(results) == 2 108 | entities = list([(r["text"], r["label"]) for r in results]) 109 | 110 | assert entities[0] == ("Donald Trump was elected", "WON_ELECTION") 111 | assert entities[1] == ("defeating Hilary Clinton", "LOST_ELECTION") 112 | ``` 113 | 114 | **Since version** `0.5.10`: custom regex implementation can be given. Either to boost performance, or to improve matches. By default, standard Python `re` is used. 
115 | 116 | It can be passed like this: 117 | 118 | ```python 119 | import rita 120 | import regex 121 | patterns = rita.compile("examples/simple-match.rita", use_engine="standalone", regex_impl=regex) 122 | ``` 123 | -------------------------------------------------------------------------------- /docs/assets/jetbrains.svg: -------------------------------------------------------------------------------- (SVG markup omitted - JetBrains logo) -------------------------------------------------------------------------------- /rita/engine/translate_rust.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from platform import system 5 | 6 | from ctypes import (c_char_p, c_int, c_uint, c_long, Structure, cdll, POINTER) 7 | from typing import Any, TYPE_CHECKING, Tuple, List, AnyStr 8 | 9 | from rita.engine.translate_standalone import rules_to_patterns, RuleExecutor 10 | from rita.types import Rules 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | field = Tuple[AnyStr, Any] 15 | fields = List[field] 16 | 17 | if TYPE_CHECKING: 18 | # We cannot simply import SessionConfig because of cyclic imports 19 | from rita.config import SessionConfig 20 | 21 | 22 | class NamedRangeResult(Structure): 23 | _fields_ = [ 24 | ("start", c_long), 25 | ("end", c_long), 26 | ("name", c_char_p), 27 | ] 28 | 29 | 30 | class ResultEntity(Structure): 31 | _fields_ = [ 32 | ("label", c_char_p), 33 | ("start", c_long), 34 | ("end", c_long), 35 | ("sub_count", c_uint), 36 | ] 37 | 38 | 39 | class Result(Structure): 40 | _fields_ = [ 41 | ("count", c_uint) 42 | ] 43 | 44 | 45 | class Context(Structure): 46 | _fields_: fields = [] 47 | 48 | 49 | def load_lib(): 50 | try: 51 | os_name = system() 52 | if os_name == "Windows": 53 | lib = cdll.LoadLibrary("rita_rust.dll") 54 | elif os_name == "Darwin": 55 | lib = cdll.LoadLibrary("librita_rust.dylib") 56 | else: 57 | lib = cdll.LoadLibrary("librita_rust.so") 58 | lib.compile.restype = POINTER(Context) 59 | lib.execute.argtypes = [POINTER(Context), c_char_p] 60 | lib.execute.restype = POINTER(Result) 61 | lib.clean_env.argtypes = [POINTER(Context)] 62 | lib.clean_result.argtypes = [POINTER(Result)] 63 | lib.read_result.argtypes = [POINTER(Result), c_int] 64 | lib.read_result.restype = POINTER(ResultEntity) 65 | lib.read_submatch.argtypes = [POINTER(ResultEntity), c_int] 66 | lib.read_submatch.restype = POINTER(NamedRangeResult) 67 | return lib 68 | except Exception as ex: 69 | logger.error("Failed to load rita-rust library, reason: {}\n\n" 70 | "Most likely you don't have the required shared library to use it".format(ex)) 71 | 72 | 73 | class RustRuleExecutor(RuleExecutor): 74 | def __init__(self, patterns, config: "SessionConfig"): 75 | self.config = config 76 | self.context = None 77 | 78 | self.lib = load_lib() 79 | self.patterns = [self._build_regex_str(label, rules) 80 | for label, rules in patterns] 81 | 82 | self.compile() 83 | 84 | @staticmethod 85 | def _build_regex_str(label, rules): 86 | indexed_rules = ["(?P<s{}>{})".format(i, r) if not r.startswith("(?P<") else r  # named submatch groups; the "s{}" name prefix is an assumption 87 | for i, r in enumerate(rules)] 88 | return r"(?P<{0}>{1})".format(label, "".join(indexed_rules)) 89 | 90 | def compile(self): 91 | flag = 0 if self.config.ignore_case else 1 92 | c_array = (c_char_p * len(self.patterns))(*list([p.encode("UTF-8") for p 
in self.patterns])) 93 | self.context = self.lib.compile(c_array, len(c_array), flag) 94 | return self.context 95 | 96 | def execute(self, text, include_submatches=True): 97 | result_ptr = self.lib.execute(self.context, text.encode("UTF-8")) 98 | count = result_ptr[0].count 99 | for i in range(0, count): 100 | match_ptr = self.lib.read_result(result_ptr, i) 101 | match = match_ptr[0] 102 | matched_text = text[match.start:match.end].strip() 103 | 104 | def parse_subs(): 105 | k = match.sub_count 106 | for j in range(0, k): 107 | s = self.lib.read_submatch(match_ptr, j)[0] 108 | start = s.start 109 | end = s.end 110 | sub_text = text[start:end] 111 | 112 | if sub_text.strip() == "": 113 | continue 114 | 115 | yield { 116 | "text": sub_text.strip(), 117 | "start": start, 118 | "end": end, 119 | "key": s.name.decode("UTF-8"), 120 | } 121 | 122 | yield { 123 | "start": match.start, 124 | "end": match.end, 125 | "text": matched_text, 126 | "label": match.label.decode("UTF-8"), 127 | "submatches": list(parse_subs()) if include_submatches else [] 128 | } 129 | 130 | def clean_context(self): 131 | self.lib.clean_env(self.context) 132 | 133 | @staticmethod 134 | def load(path): 135 | from rita.config import SessionConfig 136 | config = SessionConfig() 137 | with open(path, "r") as f: 138 | patterns = [(obj["label"], obj["rules"]) 139 | for obj in map(json.loads, f.readlines())] 140 | return RustRuleExecutor(patterns, config) 141 | 142 | 143 | def compile_rules(rules: Rules, config: "SessionConfig", **kwargs) -> RustRuleExecutor: 144 | logger.info("Using rita-rust rule implementation") 145 | patterns = [rules_to_patterns(*group, config=config) for group in rules] 146 | executor = RustRuleExecutor(patterns, config) 147 | return executor 148 | -------------------------------------------------------------------------------- /rita/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from contextlib import contextmanager 4 | from unicodedata import normalize, category 5 | from itertools import cycle, chain 6 | from time import time 7 | from json import JSONEncoder 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Node(object): 13 | """ 14 | A utility structure; it has no meaning outside this module. 
15 | It lets us specify a single path showing how it branches; 16 | by doing `unwrap` we get multiple lists, one for each possible variation 17 | """ 18 | def __init__(self, data=None): 19 | self.data = data 20 | self.children = [] 21 | self.next_node = None 22 | self.children_cycle = None 23 | self.ref_count = 0 24 | self.depth = 0 25 | self.current = None 26 | 27 | def add_child(self, c): 28 | self.children.append(Node(c)) 29 | self.reset_cycle() 30 | 31 | def add_next(self, node): 32 | self.next_node = node 33 | 34 | @property 35 | def child(self): 36 | # Corner case of 0 depth 37 | if self.depth == 0: 38 | result = self.current 39 | self.next_child() 40 | return result 41 | 42 | if self.ref_count >= self.depth: 43 | self.next_child() 44 | self.ref_count = 0 45 | else: 46 | self.ref_count += 1 47 | return self.current 48 | 49 | def next_child(self): 50 | self.current = next(self.children_cycle) 51 | 52 | def reset_cycle(self): 53 | self.children_cycle = cycle(self.children) 54 | self.current = next(self.children_cycle) 55 | 56 | def unwrap(self): 57 | variants = 1 58 | current = self 59 | while current is not None: 60 | variants *= current.weight 61 | current = current.next_node 62 | 63 | logger.debug("Total variants: {}".format(variants)) 64 | 65 | for i in range(0, variants): 66 | result = [] 67 | current = self 68 | while current is not None: 69 | if current.data: 70 | result.append(current.data) 71 | if len(current.children) > 0: 72 | c = current.child 73 | result.append(c.data) 74 | current = current.next_node 75 | yield result 76 | 77 | @property 78 | def weight(self): 79 | if len(self.children) == 0: 80 | return 1 81 | else: 82 | return len(self.children) 83 | 84 | def __repr__(self): 85 | return "{data}[{children}] -> {next_node}".format( 86 | data=self.data, 87 | children=", ".join(map(str, self.children)), 88 | next_node=str(self.next_node) 89 | ) 90 | 91 | 92 | class SingletonMixin(object): 93 | _instance = None 94 | 95 | def __new__(class_, *args, **kwargs): 96 | if not isinstance(class_._instance, class_): 97 | class_._instance = object.__new__(class_, *args, **kwargs) 98 | return class_._instance 99 | 100 | 101 | def deaccent(text): 102 | return normalize("NFC", 103 | "".join(c 104 | for c in normalize("NFD", text) 105 | if category(c) != "Mn")) 106 | 107 | 108 | def flatten(lst, shallow=False): 109 | def explode(v): 110 | if callable(v): 111 | return v() 112 | else: 113 | return v 114 | 115 | if len(lst) > 1 and not shallow: 116 | return lst 117 | 118 | new_lst = map(explode, lst) 119 | if shallow: 120 | return new_lst 121 | else: 122 | return chain(*new_lst) 123 | 124 | 125 | class ExtendedOp(object): 126 | def __init__(self, op=None): 127 | self.case_sensitive_override = False 128 | self.local_regex_override = False 129 | if isinstance(op, ExtendedOp): 130 | self.op = op.op 131 | self.case_sensitive_override = op.case_sensitive_override 132 | self.local_regex_override = op.local_regex_override 133 | else: 134 | self.op = op 135 | 136 | @property 137 | def value(self): 138 | return self.op 139 | 140 | def empty(self): 141 | return self.op is None or self.op.strip() == "" 142 | 143 | def ignore_case(self, config): 144 | if self.case_sensitive_override: 145 | return False 146 | else: 147 | return config.ignore_case 148 | 149 | def __str__(self): 150 | if self.op: 151 | return self.op 152 | return "" 153 | 154 | def __repr__(self): 155 | return str(self) 156 | 157 | def __eq__(self, other): 158 | if type(other) == str: 159 | return self.op == other 160 | 
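# Otherwise compare the full state: the operator plus both override flags.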
161 | return ( 162 | self.op == other.op and 163 | self.case_sensitive_override == other.case_sensitive_override and 164 | self.local_regex_override == other.local_regex_override 165 | ) 166 | 167 | 168 | class Timer(object): 169 | def __init__(self, title): 170 | self.title = title 171 | self.ts = time() 172 | 173 | def stop(self, debug=True): 174 | now = time() 175 | delta = int((now - self.ts) * 1000)  # convert to ms before truncating 176 | msg = "{} took {}ms".format(self.title, delta) 177 | if debug: 178 | logger.debug(msg) 179 | else: 180 | logger.info(msg) 181 | return delta 182 | 183 | 184 | @contextmanager 185 | def timer(title): 186 | t = Timer(title) 187 | yield 188 | t.stop() 189 | 190 | 191 | class RitaJSONEncoder(JSONEncoder): 192 | def default(self, o): 193 | if isinstance(o, ExtendedOp): 194 | return o.op 195 | return o.__dict__ 196 | -------------------------------------------------------------------------------- /rita/engine/translate_spacy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from functools import partial 4 | from typing import Any, TYPE_CHECKING, Mapping, Callable, Generator, AnyStr 5 | 6 | from rita.utils import ExtendedOp 7 | from rita.types import Rules, Patterns 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | SpacyPattern = Generator[Mapping[AnyStr, Any], None, None] 12 | ParseFn = Callable[[Any, "SessionConfig", ExtendedOp], SpacyPattern] 13 | 14 | if TYPE_CHECKING: 15 | # We cannot simply import SessionConfig because of cyclic imports 16 | from rita.config import SessionConfig 17 | 18 | 19 | def any_of_parse(lst, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 20 | if op.ignore_case(config): 21 | normalized = sorted([item.lower() 22 | for item in lst]) 23 | base = {"LOWER": {"IN": normalized}} 24 | else: 25 | base = {"LOWER": {"IN": sorted(lst)}} 26 | 27 | if not op.empty(): 28 | base["OP"] = op.value 29 | yield base 30 | 31 | 32 | def regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 33 | if op.ignore_case(config): 34 | d = {"LOWER": {"REGEX": r.lower()}} 35 | else: 36 | d = {"TEXT": {"REGEX": r}} 37 | 38 | if not op.empty(): 39 | d["OP"] = op.value 40 | yield d 41 | 42 | 43 | def fuzzy_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 44 | # TODO: build permutations 45 | d = {"LOWER": {"REGEX": "({0})[.,?;!]?".format("|".join(r))}} 46 | if not op.empty(): 47 | d["OP"] = op.value 48 | yield d 49 | 50 | 51 | def generic_parse(tag, value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 52 | d = {} 53 | if isinstance(value, list) and len(value) > 1: 54 | value = {"IN": value} 55 | 56 | d[tag] = value 57 | 58 | if not op.empty(): 59 | d["OP"] = op.value 60 | yield d 61 | 62 | 63 | def entity_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 64 | tag = "ENT_TYPE" 65 | if op.empty(): 66 | op.op = "+" 67 | return generic_parse(tag, value, config, op) 68 | 69 | 70 | def punct_parse(_, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 71 | d = dict() 72 | d["IS_PUNCT"] = True 73 | if not op.empty(): 74 | d["OP"] = op.value 75 | yield d 76 | 77 | 78 | def any_parse(_, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 79 | d = dict() 80 | if not op.empty(): 81 | d["OP"] = op.value 82 | yield d 83 | 84 | 85 | def phrase_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 86 | """ 87 | TODO: Does not support operators 88 | """ 89 | splitter = next((s for s in ["-", " "] 90 | if s in value), None) 91 | if splitter: 92 
| buff = value.split(splitter) 93 | yield next(orth_parse(buff[0], config=config, op=ExtendedOp())) 94 | for b in buff[1:]: 95 | if splitter != " ": 96 | yield next(orth_parse(splitter, config=config, op=ExtendedOp())) 97 | yield next(orth_parse(b, config=config, op=ExtendedOp())) 98 | else: 99 | yield next(orth_parse(value, config=config, op=ExtendedOp())) 100 | 101 | 102 | def tag_parse(values, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 103 | """ 104 | For generating POS/TAG patterns based on a Regex 105 | e.g. TAG("^NN|^JJ") for adjectives or nouns 106 | also deals with TAG_WORD for tag and word or tag and list 107 | """ 108 | d = {"TAG": {"REGEX": values["tag"]}} 109 | if "word" in values: 110 | if op.ignore_case(config): 111 | d["LOWER"] = values["word"].lower() 112 | else: 113 | d["TEXT"] = values["word"] 114 | elif "list" in values: 115 | lst = values["list"] 116 | if op.ignore_case(config): 117 | normalized = sorted([item.lower() 118 | for item in lst]) 119 | d["LOWER"] = {"REGEX": r"^({0})$".format("|".join(normalized))} 120 | else: 121 | d["TEXT"] = {"REGEX": r"^({0})$".format("|".join(sorted(lst)))} 122 | if not op.empty(): 123 | d["OP"] = op.value 124 | yield d 125 | 126 | 127 | def nested_parse(values, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 128 | from rita.macros import resolve_value 129 | results = rules_to_patterns("", [resolve_value(v, config=config) 130 | for v in values], config=config) 131 | return results["pattern"] 132 | 133 | 134 | def orth_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern: 135 | d = {} 136 | 137 | if op.ignore_case(config): 138 | d["LOWER"] = value.lower() 139 | else: 140 | d["ORTH"] = value 141 | 142 | if not op.empty(): 143 | d["OP"] = op.value 144 | yield d 145 | 146 | 147 | PARSERS: Mapping[str, ParseFn] = { 148 | "any_of": any_of_parse, 149 | "any": any_parse, 150 | "value": orth_parse, 151 | "regex": regex_parse, 152 | "entity": entity_parse, 153 | "lemma": partial(generic_parse, "LEMMA"), 154 | "pos": partial(generic_parse, "POS"), 155 | "punct": punct_parse, 156 | "fuzzy": fuzzy_parse, 157 | "phrase": phrase_parse, 158 | "tag": tag_parse, 159 | "nested": nested_parse, 160 | "orth": orth_parse, 161 | } 162 | 163 | 164 | def rules_to_patterns(label: str, data: Patterns, config: "SessionConfig"): 165 | logger.debug(data) 166 | return { 167 | "label": label, 168 | "pattern": [p 169 | for (t, d, op) in data 170 | for p in PARSERS[t](d, config, ExtendedOp(op))], 171 | } 172 | 173 | 174 | def compile_rules(rules: Rules, config: "SessionConfig", **kwargs): 175 | logger.info("Using spaCy rules implementation") 176 | return [rules_to_patterns(label, patterns, config=config) 177 | for (label, patterns) in rules] 178 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 0.7.0 (2021-02-02) 2 | **************************** 3 | 4 | Features 5 | -------- 6 | 7 | - The `standalone` engine will now return a submatches list containing start and end for each part of a match 8 | #93 9 | - Partially covered https://github.com/zaibacu/rita-dsl/issues/70 10 | 11 | Allow nested patterns, like: 12 | 13 | .. 
code-block:: 14 | 15 | num_with_fractions = {NUM, WORD("-")?, IN_LIST(fractions)} 16 | complex_number = {NUM|PATTERN(num_with_fractions)} 17 | 18 | {PATTERN(complex_number)}->MARK("NUMBER") 19 | #95 20 | - Submatches for rita-rust engine 21 | #96 22 | - Regex module which allows to specify word pattern, eg. `REGEX(^a)` means word must start with letter "a" 23 | 24 | Implemented by: Roland M. Mueller (https://github.com/rolandmueller) 25 | #101 26 | - ORTH module which allows you to specify case sensitive entry while rest of the rules ignores case. Used for acronyms and proper names 27 | 28 | Implemented by: Roland M. Mueller (https://github.com/rolandmueller) 29 | #102 30 | - Additional macro for `tag` module, allowing to tag specific word/list of words 31 | 32 | Implemented by: Roland M. Mueller (https://github.com/rolandmueller) 33 | #103 34 | - Added `names` module which allows to generate person names variations 35 | #105 36 | - spaCy v3 Support 37 | #109 38 | 39 | Fix 40 | --- 41 | 42 | - Optimizations for Rust Engine 43 | 44 | - No need for passing text forward and backward, we can calculate from text[start:end] 45 | 46 | - Grouping and sorting logic can be done in binary code 47 | #88 48 | - Fix NUM parsing bug 49 | #90 50 | - Switch from `(^\s)` to `\b` when doing `IN_LIST`. Should solve several corner cases 51 | #91 52 | - Fix floating point number matching 53 | #92 54 | - revert #91 changes. Keep old way for word boundary 55 | #94 56 | 57 | 58 | 0.6.0 (2020-08-29) 59 | **************************** 60 | 61 | Features 62 | -------- 63 | 64 | - Implemented ability to alias macros, eg.: 65 | 66 | .. code-block:: 67 | 68 | numbers = {"one", "two", "three"} 69 | @alias IN_LIST IL 70 | 71 | IL(numbers) -> MARK("NUMBER") 72 | 73 | Now using "IL" will actually call "IN_LIST" macro. 74 | #66 75 | - introduce the TAG element as a module. Needs a new parser for the SpaCy translate. 76 | Would allow more flexible matching of detailed part-of-speech tag, like all adjectives or nouns: TAG("^NN|^JJ"). 77 | 78 | Implemented by: 79 | Roland M. Mueller (https://github.com/rolandmueller) 80 | #81 81 | - Add a new module for a PLURALIZE tag 82 | For a noun or a list of nouns, it will match any singular or plural word. 83 | 84 | Implemented by: 85 | Roland M. Mueller (https://github.com/rolandmueller) 86 | #82 87 | - Add a new Configuration implicit_hyphon (default false) for automatically adding hyphon characters - to the rules. 88 | 89 | Implemented by: 90 | Roland M. Mueller (https://github.com/rolandmueller) 91 | #84 92 | - Allow to give custom regex impl. By default `re` is used 93 | #86 94 | - An interface to be able to use rust engine. 95 | 96 | In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code and that provides large performance boost. 97 | It is proprietary, because there are various caveats, engine itself is a bit more fragile and needs to be tinkered to be optimized to very specific case 98 | (eg. few long texts with many matches vs a lot short texts with few matches). 
99 | #87 100 | 101 | Fix 102 | --- 103 | 104 | - Fix `-` bug when it is used as a standalone word 105 | #71 106 | - Fix regex matching, when shortest word is selected from IN_LIST 107 | #72 108 | - Fix IN_LIST regex so that it wouldn't take part of word 109 | #75 110 | - Fix IN_LIST operation bug - it was ignoring them 111 | #77 112 | - Use list branching only when using spaCy Engine 113 | #80 114 | 115 | 116 | 0.5.0 (2020-06-18) 117 | **************************** 118 | 119 | Features 120 | -------- 121 | 122 | - Added `PREFIX` macro which allows to attach word in front of list items or words 123 | #47 124 | - Allow to pass variables directly when doing `compile` and `compile_string` 125 | #51 126 | - Allow to compile (and later load) rules using rita CLI while using standalone engine (spacy is already supported) 127 | #53 128 | - Added ability to import rule files into rule file. Recursive import is supported as well. 129 | #55 130 | - Added possibility to define pattern as a variable and reuse it in other patterns: 131 | 132 | Example: 133 | .. code-block:: RITA 134 | 135 | ComplexNumber = {NUM+, WORD("/")?, NUM?} 136 | 137 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT") 138 | 139 | {PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH") 140 | #64 141 | 142 | Fix 143 | --- 144 | 145 | - Fix issue with multiple wildcard words using standalone engine 146 | #46 147 | - Don't crash when no rules are provided 148 | #50 149 | - Fix Number and ANY-OF parsing 150 | #59 151 | - Allow escape characters inside LITERAL 152 | #62 153 | 154 | 155 | 0.4.0 (2020-01-25) 156 | **************************** 157 | 158 | Features 159 | -------- 160 | 161 | - Support for deaccent. In general, if accented version of word is given, both deaccented and accented will be used to match. To turn it off - `!CONFIG("deaccent", "N")` 162 | #38 163 | - Added shortcuts module to simplify injecting into spaCy 164 | #42 165 | 166 | Fix 167 | --- 168 | 169 | - Fix issue regarding Spacy rules with `IN_LIST` and using case-sensitive mode. 
It was creating Regex pattern which is not valid spacy pattern 170 | #40 171 | 172 | 173 | 0.3.2 (2019-12-19) 174 | *********************** 175 | 176 | Features 177 | -------- 178 | 179 | - - Introduced `towncrier` to track changes 180 | - Added linter `flake8` 181 | - Refactored code to match `pep8` 182 | #32 183 | 184 | Fix 185 | --- 186 | 187 | - - Fix WORD split by `-` 188 | 189 | - Split by ` ` (empty space) as well 190 | 191 | - Coverage score increase 192 | #35 193 | 194 | 195 | -------------------------------------------------------------------------------- /rita/parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import ply.yacc as yacc 4 | 5 | from functools import partial 6 | 7 | from rita.lexer import RitaLexer 8 | from rita import macros 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def stub(*args, **kwargs): 14 | return None 15 | 16 | 17 | def either(a, b): 18 | yield a 19 | yield b 20 | 21 | 22 | def load_macro(name, config): 23 | try: 24 | return partial(getattr(macros, name), config=config) 25 | except Exception: 26 | pass 27 | 28 | def lazy_load(*args, **kwargs): 29 | logger.info(config.modules) 30 | for mod in config.modules: 31 | try: 32 | fn = getattr(mod, name) 33 | return fn(*args, **kwargs) 34 | except Exception as ex: 35 | logger.error(ex) 36 | continue 37 | 38 | raise RuntimeError("MACRO {} not loaded".format(name)) 39 | 40 | return lazy_load 41 | 42 | 43 | def var_wrapper(variable, config): 44 | def wrapper(*args, **kwargs): 45 | logger.debug("Variables: {}".format(config.variables)) 46 | return config.get_variable(variable) 47 | 48 | return wrapper 49 | 50 | 51 | class RitaParser(object): 52 | tokens = RitaLexer.tokens 53 | precedence = ( 54 | ("nonassoc", "ARROW"), 55 | ("nonassoc", "PIPE"), 56 | ("nonassoc", "COMMA"), 57 | ("left", "EXEC"), 58 | ("left", "ASSIGN"), 59 | ("left", "RBRACKET", "LBRACKET", "LPAREN", "RPAREN"), 60 | ("left", "KEYWORD", "NAME", "LITERAL"), 61 | ("right", "MODIF_QMARK", "MODIF_STAR", "MODIF_PLUS"), 62 | ) 63 | 64 | def __init__(self, config): 65 | self.config = config 66 | self.lexer = None 67 | self.parser = None 68 | 69 | def p_document(self, p): 70 | """ 71 | DOCUMENT : MACRO_CHAIN 72 | | MACRO_EXEC 73 | | VARIABLE 74 | """ 75 | logger.debug("Building initial document {}".format(p[1])) 76 | p[0] = [p[1]] 77 | 78 | def p_document_list(self, p): 79 | """ 80 | DOCUMENT : DOCUMENT MACRO_CHAIN 81 | | DOCUMENT MACRO_EXEC 82 | | DOCUMENT VARIABLE 83 | """ 84 | logger.debug("Extending document {}".format(p[2])) 85 | p[0] = p[1] + [p[2]] 86 | 87 | def p_macro_chain(self, p): 88 | " MACRO_CHAIN : MACRO ARROW MACRO " 89 | logger.debug("Have {0} -> {1}".format(p[1], p[3])) 90 | p[0] = partial( 91 | p[3], 92 | macros.PATTERN(p[1], config=self.config), 93 | config=self.config 94 | ) 95 | 96 | def p_macro_chain_from_array(self, p): 97 | " MACRO_CHAIN : ARRAY ARROW MACRO " 98 | logger.debug("Have {0} -> {1}".format(p[1], p[3])) 99 | p[0] = partial( 100 | p[3], 101 | macros.PATTERN(*p[1], config=self.config), 102 | config=self.config 103 | ) 104 | 105 | def p_macro_exec(self, p): 106 | " MACRO_EXEC : EXEC MACRO " 107 | logger.debug("Exec {0}".format(p[2])) 108 | macros.EXEC(p[2], config=self.config) 109 | p[0] = stub 110 | 111 | def p_macro_w_modif(self, p): 112 | """ 113 | MACRO : MACRO MODIF_PLUS 114 | | MACRO MODIF_STAR 115 | | MACRO MODIF_QMARK 116 | | MACRO EXEC 117 | """ 118 | logger.debug("Adding modifier to Macro {}".format(p[1])) 119 | fn = p[1] 120 | p[0] = 
partial(fn, op=p[2]) 121 | 122 | def p_macro_wo_args(self, p): 123 | " MACRO : KEYWORD " 124 | fn = load_macro(p[1], config=self.config) 125 | logger.debug("Parsing macro (w/o args): {}".format(p[1])) 126 | p[0] = fn 127 | 128 | def p_macro_w_args(self, p): 129 | " MACRO : KEYWORD LPAREN ARGS RPAREN " 130 | logger.debug("Parsing macro: {0}, args: {1}".format(p[1], p[3])) 131 | fn = load_macro(p[1], config=self.config) 132 | p[0] = partial(fn, *p[3]) 133 | 134 | def p_macro_from_array(self, p): 135 | """ 136 | MACRO : KEYWORD ARRAY 137 | | KEYWORD ARG_ARRAY 138 | """ 139 | logger.debug("Parsing macro: {0}, args: {1}".format(p[1], p[2])) 140 | fn = load_macro(p[1], config=self.config) 141 | p[0] = partial(fn, *p[2]) 142 | 143 | def p_array(self, p): 144 | " ARRAY : LBRACKET ARGS RBRACKET " 145 | p[0] = p[2] 146 | 147 | def p_arg_array(self, p): 148 | " ARG_ARRAY : LPAREN ARGS RPAREN " 149 | p[0] = p[2] 150 | 151 | def p_variable(self, p): 152 | " VARIABLE_NAME : NAME " 153 | p[0] = var_wrapper(p[1], self.config) 154 | 155 | def p_variable_from_args(self, p): 156 | " VARIABLE : NAME ASSIGN ARGS " 157 | if len(p[3]) == 1: 158 | macros.ASSIGN(p[1], p[3][0], config=self.config) 159 | else: 160 | macros.ASSIGN(p[1], p[3], config=self.config) 161 | 162 | p[0] = stub 163 | 164 | def p_either(self, p): 165 | " ARG : ARG PIPE ARG " 166 | p[0] = either(p[1], p[3]) 167 | 168 | def p_arg_list(self, p): 169 | " ARGS : ARGS COMMA ARG " 170 | p[0] = p[1] + [p[3]] 171 | 172 | def p_args(self, p): 173 | " ARGS : ARG " 174 | p[0] = [p[1]] 175 | 176 | def p_arg(self, p): 177 | " ARG : LITERAL " 178 | p[0] = p[1] 179 | 180 | def p_arg_from_macro(self, p): 181 | " ARG : MACRO " 182 | p[0] = p[1] 183 | 184 | def p_arg_from_var(self, p): 185 | " ARG : VARIABLE_NAME " 186 | p[0] = p[1]() 187 | 188 | def p_arg_from_array(self, p): 189 | " ARGS : ARRAY " 190 | p[0] = p[1] 191 | 192 | def p_error(self, p): 193 | if p: 194 | logger.error("Syntax error at '{}'".format(p.value)) 195 | else: 196 | logger.error("p is null") 197 | 198 | def build(self, **kwargs): 199 | self.lexer = RitaLexer().build(**kwargs) 200 | self.parser = yacc.yacc(module=self, errorlog=logger, **kwargs) 201 | 202 | def parse(self, data): 203 | if data.strip() == "": 204 | return [] 205 | 206 | return self.parser.parse(r"{}".format(data), lexer=self.lexer, debug=logger) 207 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | 4 | from rita.parser import RitaParser 5 | from rita.config import SessionConfig 6 | from rita.utils import ExtendedOp 7 | 8 | 9 | @pytest.fixture 10 | def config(): 11 | return SessionConfig() 12 | 13 | 14 | def test_parser_empty_rules(config): 15 | p = RitaParser(config) 16 | p.build() 17 | results = p.parse("") 18 | assert len(results) == 0 19 | 20 | 21 | def test_parser_any_macro_wo_args_w_type(config): 22 | p = RitaParser(config) 23 | p.build() 24 | 25 | results = p.parse('ANY -> MARK("PlaceHolder")') 26 | assert len(results) == 1 27 | 28 | 29 | def test_parser_any_macro_w_args_w_type(config): 30 | p = RitaParser(config) 31 | p.build() 32 | 33 | results = p.parse('{WORD("arg1")} -> MARK("PlaceHolder")') 34 | assert len(results) == 1 35 | 36 | 37 | def test_parser_nested_macro(config): 38 | p = RitaParser(config) 39 | p.build() 40 | 41 | results = p.parse('{ANY, WORD("test")} -> MARK("Test")') 42 | assert len(results) == 1 43 | for result in results: 44 
| print(result()) 45 | 46 | 47 | def test_parser_assign_literal_and_ignore_it(config): 48 | p = RitaParser(config) 49 | p.build(debug=True) 50 | 51 | results = p.parse( 52 | """ 53 | my_variable = "Test" 54 | 55 | {WORD("something")} -> MARK("TEST") 56 | """ 57 | ) 58 | assert len(results) == 2 59 | 60 | rules = results[1]() 61 | 62 | print(rules) 63 | assert {"label": "TEST", "data": [("value", "something", ExtendedOp())]} == rules 64 | 65 | 66 | def test_parser_assign_literal_and_use_it(config): 67 | p = RitaParser(config) 68 | p.build(debug=True) 69 | 70 | results = p.parse( 71 | """ 72 | my_variable = "Test" 73 | 74 | {WORD(my_variable)} -> MARK("TEST") 75 | """ 76 | ) 77 | assert len(results) == 2 78 | 79 | rules = results[1]() 80 | 81 | print(rules) 82 | assert {"label": "TEST", "data": [("value", "Test", ExtendedOp())]} == rules 83 | 84 | 85 | def test_parser_just_assign_macro(config): 86 | p = RitaParser(config) 87 | p.build(debug=True) 88 | 89 | results = p.parse( 90 | """ 91 | x = WORD("Test") 92 | """ 93 | ) 94 | assert len(results) == 1 95 | 96 | 97 | def test_parser_assign_two_variables(config): 98 | p = RitaParser(config) 99 | p.build(debug=True) 100 | 101 | results = p.parse( 102 | """ 103 | a = "A" 104 | b = "B" 105 | """ 106 | ) 107 | assert len(results) == 2 108 | 109 | 110 | def test_parser_assign_macro_and_use_it(config): 111 | p = RitaParser(config) 112 | p.build(debug=True) 113 | 114 | results = p.parse( 115 | """ 116 | my_variable = WORD("Test") 117 | 118 | {my_variable} -> MARK("TEST") 119 | """ 120 | ) 121 | assert len(results) == 2 122 | 123 | rules = results[1]() 124 | 125 | print(rules) 126 | assert {"label": "TEST", "data": [("value", "Test", ExtendedOp())]} == rules 127 | 128 | 129 | def test_parser_import_module(config): 130 | p = RitaParser(config) 131 | p.build(debug=True) 132 | 133 | results = p.parse( 134 | """ 135 | IMPORT("rita.modules.fuzzy") -> EXEC 136 | 137 | FUZZY("test") -> MARK("FUZZY_MATCH") 138 | """ 139 | ) 140 | 141 | assert len(results) == 2 142 | 143 | 144 | def test_parser_import_module_shortcut(config, caplog): 145 | caplog.set_level(logging.INFO) 146 | p = RitaParser(config) 147 | p.build(debug=True) 148 | 149 | results = p.parse( 150 | """ 151 | !IMPORT("rita.modules.fuzzy") 152 | 153 | FUZZY("test") -> MARK("FUZZY_MATCH") 154 | """ 155 | ) 156 | 157 | assert len(results) == 2 158 | 159 | 160 | def test_parser_config(config): 161 | p = RitaParser(config) 162 | p.build(debug=True) 163 | 164 | p.parse( 165 | """ 166 | !CONFIG("foo", "bar") 167 | !CONFIG("testing", "1") 168 | """ 169 | ) 170 | 171 | assert config.foo == "bar" 172 | assert config.testing 173 | 174 | 175 | def test_parser_list_w_one_item(config): 176 | p = RitaParser(config) 177 | p.build(debug=True) 178 | 179 | results = p.parse( 180 | """ 181 | members = { "one" } 182 | 183 | IN_LIST(members) -> MARK("MEMBER") 184 | """ 185 | ) 186 | 187 | assert len(results) == 2 188 | 189 | 190 | def test_parser_list_w_two_items(config): 191 | p = RitaParser(config) 192 | p.build(debug=True) 193 | 194 | results = p.parse( 195 | """ 196 | members = {"one", "two"} 197 | 198 | IN_LIST(members) -> MARK("MEMBER") 199 | """ 200 | ) 201 | 202 | assert len(results) == 2 203 | 204 | 205 | def test_parser_literal_w_escape(config): 206 | p = RitaParser(config) 207 | p.build(debug=True) 208 | 209 | results = p.parse( 210 | r'WORD("Hello \"WORLD\"") -> MARK("TEST")' 211 | ) 212 | 213 | assert len(results) == 1 214 | 215 | 216 | def test_parser_pattern_in_variable(config): 217 | p = RitaParser(config) 
218 | p.build(debug=True) 219 | 220 | results = p.parse( 221 | ''' 222 | Complex_Number = { NUM+, WORD("/")?, NUM? } 223 | {PATTERN(Complex_Number), WORD("inch")}->MARK("WIDTH") 224 | ''' 225 | ) 226 | 227 | print(results) 228 | assert len(results) == 2 229 | 230 | 231 | def test_pattern_with_escaped_characters(config): 232 | p = RitaParser(config) 233 | p.build(debug=True) 234 | 235 | results = p.parse( 236 | ''' 237 | special = { '"', "*", "-" } 238 | IN_LIST(special)->MARK("TEST") 239 | ''' 240 | ) 241 | 242 | assert len(results) > 0 243 | 244 | rules = results[1]() 245 | 246 | assert {"label": "TEST", "data": [("any_of", ["\"", "*", "-"], ExtendedOp())]} == rules 247 | 248 | 249 | def test_parser_array_as_argument(config): 250 | p = RitaParser(config) 251 | p.build(debug=True) 252 | 253 | results = p.parse( 254 | ''' 255 | special = { '"', "*", "-" } 256 | POS(special)->MARK("TEST") 257 | ''' 258 | ) 259 | 260 | assert len(results) > 0 261 | rules = results[1]() 262 | assert {"label": "TEST", "data": [("pos", ["\"", "*", "-"], ExtendedOp())]} == rules 263 | 264 | 265 | def test_parser_inline_array_as_argument(config): 266 | p = RitaParser(config) 267 | p.build(debug=True) 268 | 269 | results = p.parse( 270 | ''' 271 | POS('"', "*", "-")->MARK("TEST") 272 | ''' 273 | ) 274 | 275 | assert len(results) > 0 276 | rules = results[0]() 277 | assert {"label": "TEST", "data": [("pos", ["\"", "*", "-"], ExtendedOp())]} == rules 278 | 279 | 280 | def test_parser_inline_array_as_inlist_argument(config): 281 | p = RitaParser(config) 282 | p.build(debug=True) 283 | 284 | results = p.parse( 285 | ''' 286 | IN_LIST('one', "two", "three")->MARK("TEST") 287 | ''' 288 | ) 289 | 290 | assert len(results) > 0 291 | rules = results[0]() 292 | assert {"label": "TEST", "data": [("any_of", ["one", "two", "three"], ExtendedOp())]} == rules 293 | -------------------------------------------------------------------------------- /rita/engine/translate_standalone.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import json 4 | 5 | from functools import partial 6 | from itertools import groupby, chain 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | from typing import Any, TYPE_CHECKING, Mapping, Callable 9 | 10 | from rita.utils import ExtendedOp 11 | from rita.types import Rules, Patterns 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | ParseFn = Callable[[Any, "SessionConfig", ExtendedOp], str] 16 | 17 | 18 | if TYPE_CHECKING: 19 | # We cannot simply import SessionConfig because of cyclic imports 20 | from rita.config import SessionConfig 21 | 22 | 23 | def apply_operator(syntax, op: ExtendedOp) -> str: 24 | if op.empty(): 25 | return syntax 26 | 27 | elif str(op) == "!": # A bit complicated one 28 | return (r"((?!{})\w+)".format(syntax 29 | .rstrip(")") 30 | .lstrip("("))) 31 | else: 32 | return syntax + str(op) 33 | 34 | 35 | def any_of_parse(lst, config: "SessionConfig", op: ExtendedOp) -> str: 36 | clause = r"((^|\s)(({0})\s?))".format("|".join(sorted(lst, key=lambda x: (-len(x), x)))) 37 | return apply_operator(clause, op) 38 | 39 | 40 | def regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> str: 41 | if op.local_regex_override: 42 | return local_regex_parse(r, config, op) 43 | else: 44 | initial = "(" + r + r"\s?" 
+ ")" 45 | return apply_operator(initial, op) 46 | 47 | 48 | def local_regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> str: 49 | if r[0] == "^" and r[-1] == "$": # Fully strictly defined string? 50 | pattern = r[1:-1] 51 | elif r[0] == "^": # We define start of the string 52 | pattern = r[1:] + r"\w*" 53 | elif r[-1] == "$": # We define end of string 54 | pattern = r"\w*" + r[:-1] 55 | else: # We define string inside word 56 | pattern = r"\w*" + r + r"\w*" 57 | 58 | initial = "(" + r"\b" + pattern + r"\b" + r"\s?" + ")" 59 | return apply_operator(initial, op) 60 | 61 | 62 | def not_supported(key, *args, **kwargs) -> str: 63 | raise RuntimeError( 64 | "Rule '{0}' is not supported in standalone mode" 65 | .format(key) 66 | ) 67 | 68 | 69 | def person_parse(config: "SessionConfig", op: ExtendedOp) -> str: 70 | return apply_operator(r"([A-Z]\w+\s?)", op) 71 | 72 | 73 | def entity_parse(value, config: "SessionConfig", op: ExtendedOp) -> str: 74 | if value == "PERSON": 75 | return person_parse(config, op=op) 76 | else: 77 | return not_supported(value) 78 | 79 | 80 | def punct_parse(_, config: "SessionConfig", op: ExtendedOp) -> str: 81 | return apply_operator(r"([.,!;?:]\s?)", op) 82 | 83 | 84 | def word_parse(value, config: "SessionConfig", op: ExtendedOp) -> str: 85 | initial = r"({}\s?)".format(value) 86 | return apply_operator(initial, op) 87 | 88 | 89 | def fuzzy_parse(r, config: "SessionConfig", op: ExtendedOp) -> str: 90 | # TODO: build premutations 91 | return apply_operator(r"({0})[.,?;!]?".format("|".join(r)), op) 92 | 93 | 94 | def phrase_parse(value, config: "SessionConfig", op: ExtendedOp) -> str: 95 | return apply_operator(r"({}\s?)".format(value), op) 96 | 97 | 98 | def nested_parse(values, config: "SessionConfig", op: ExtendedOp) -> str: 99 | from rita.macros import resolve_value 100 | (_, patterns) = rules_to_patterns("", [resolve_value(v, config=config) 101 | for v in values], config=config) 102 | return r"(?P{})".format(config.new_nested_group_id(), "".join(patterns)) 103 | 104 | 105 | def any_parse(_, config: "SessionConfig", op: ExtendedOp) -> str: 106 | return regex_parse(r".*", config, op) 107 | 108 | 109 | PARSERS: Mapping[str, ParseFn] = { 110 | "any_of": any_of_parse, 111 | "any": any_parse, 112 | "value": word_parse, 113 | "regex": regex_parse, 114 | "entity": entity_parse, 115 | "lemma": partial(not_supported, "LEMMA"), 116 | "pos": partial(not_supported, "POS"), 117 | "punct": punct_parse, 118 | "fuzzy": fuzzy_parse, 119 | "phrase": phrase_parse, 120 | "nested": nested_parse, 121 | } 122 | 123 | 124 | def rules_to_patterns(label: str, data: Patterns, config: "SessionConfig"): 125 | logger.debug("data: {}".format(data)) 126 | 127 | def gen(): 128 | """ 129 | Implicitly add spaces between rules 130 | """ 131 | if len(data) == 0: 132 | return 133 | 134 | yield data[0] 135 | 136 | for (t, d, op) in data[1:]: 137 | yield t, d, op 138 | 139 | return ( 140 | label, 141 | [PARSERS[t](d, config, op) for (t, d, op) in gen()], 142 | ) 143 | 144 | 145 | class RuleExecutor(object): 146 | def __init__(self, patterns, config, regex_impl=re, max_workers=4): 147 | self.config = config 148 | self.regex_impl = regex_impl 149 | self.patterns = [self.compile(label, rules) 150 | for label, rules in patterns] 151 | self.raw_patterns = patterns 152 | self.max_workers = max_workers 153 | 154 | def compile(self, label, rules): 155 | flags = self.regex_impl.DOTALL 156 | if self.config.ignore_case: 157 | flags = flags | self.regex_impl.IGNORECASE 158 | 159 | indexed_rules = 
["(?P{})".format(i, r) if not r.startswith("(?P<") else r 160 | for i, r in enumerate(rules)] 161 | regex_str = r"(?P<{0}>{1})".format(label, "".join(indexed_rules)) 162 | try: 163 | return self.regex_impl.compile(regex_str, flags) 164 | except Exception as ex: 165 | logger.exception("Failed to compile: '{0}', Reason: \n{1}".format(regex_str, str(ex))) 166 | return None 167 | 168 | def _match_task(self, pattern, text, include_submatches): 169 | def gen(): 170 | for match in pattern.finditer(text): 171 | def submatches(): 172 | for k, v in match.groupdict().items(): 173 | if not v or v.strip() == "": 174 | continue 175 | yield { 176 | "key": k, 177 | "text": v.strip(), 178 | "start": match.start(k), 179 | "end": match.end(k) 180 | } 181 | 182 | yield { 183 | "start": match.start(), 184 | "end": match.end(), 185 | "text": match.group().strip(), 186 | "label": match.lastgroup, 187 | "submatches": sorted(list(submatches()), key=lambda x: x["start"]) if include_submatches else [] 188 | } 189 | return list(gen()) 190 | 191 | def _results(self, text, include_submatches): 192 | with ThreadPoolExecutor(self.max_workers) as executor: 193 | tasks = [executor.submit(self._match_task, p, text, include_submatches) 194 | for p in self.patterns] 195 | for future in as_completed(tasks): 196 | yield future.result(timeout=1) 197 | 198 | def execute(self, text, include_submatches=True): 199 | results = sorted(chain(*self._results(text, include_submatches)), key=lambda x: x["start"]) 200 | for k, g in groupby(results, lambda x: x["start"]): 201 | group = list(g) 202 | if len(group) == 1: 203 | yield group[0] 204 | else: 205 | data = sorted(group, key=lambda x: -x["end"]) 206 | yield data[0] 207 | 208 | @staticmethod 209 | def load(path, regex_impl=re): 210 | from rita.config import SessionConfig 211 | config = SessionConfig() 212 | with open(path, "r") as f: 213 | patterns = [(obj["label"], obj["rules"]) 214 | for obj in map(json.loads, f.readlines())] 215 | return RuleExecutor(patterns, config, regex_impl=regex_impl) 216 | 217 | def save(self, path): 218 | with open(path, "w") as f: 219 | for pattern in self: 220 | f.write("{0}\n".format(json.dumps(pattern))) 221 | 222 | def __iter__(self): 223 | for label, rules in self.raw_patterns: 224 | yield {"label": label, "rules": rules} 225 | 226 | 227 | def compile_rules(rules: Rules, config: "SessionConfig", regex_impl=re, **kwargs) -> RuleExecutor: 228 | logger.info("Using standalone rule implementation") 229 | patterns = [rules_to_patterns(*group, config=config) for group in rules] 230 | executor = RuleExecutor(patterns, config, regex_impl=regex_impl) 231 | return executor 232 | -------------------------------------------------------------------------------- /rita/preprocess.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from functools import reduce 4 | from typing import Any, Mapping, Callable, List 5 | 6 | from rita.utils import Node, deaccent, ExtendedOp 7 | from rita.types import RuleGroup, Rules 8 | from rita.config import SessionConfig 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | Pipeline = Callable[[Rules, "SessionConfig"], Rules] 13 | 14 | 15 | def handle_prefix(rules: Rules, config: SessionConfig): 16 | """ 17 | If we have a prefix and rule following it, we apply this prefix on that rule 18 | """ 19 | def apply_prefix(pattern, prefix): 20 | (name, args, op) = pattern 21 | if name == "any_of": 22 | return (name, list(["{0}{1}".format(prefix, item) 23 | for item in args]), op) 24 | 
elif name == "value": 25 | return name, "{0}{1}".format(prefix, args), op 26 | else: 27 | logger.warning("Don't know how to apply prefix on: {}".format(name)) 28 | return pattern 29 | 30 | def gen(): 31 | prefix = None 32 | for p in pattern: 33 | (name, args, op) = p 34 | if name == "prefix": 35 | prefix = args 36 | else: 37 | if prefix: 38 | yield apply_prefix(p, prefix) 39 | prefix = None 40 | else: 41 | yield p 42 | for group_label, pattern in rules: 43 | yield group_label, list(gen()) 44 | 45 | 46 | def handle_deaccent(rules: Rules, config: SessionConfig): 47 | """ 48 | If we get accented word, eg: {WORD("naïve"), WORD("bayes")} 49 | In case of word, it should become list => {IN_LIST({"naïve", "naive"}), WORD("bayes")} 50 | In case of list, it should extend list with extra items for accented and not accented versions 51 | """ 52 | for group_label, pattern in rules: 53 | def gen(): 54 | for p in pattern: 55 | (name, args, op) = p 56 | if name == "value": 57 | (v1, v2) = (args, deaccent(args)) 58 | if v1 != v2: 59 | yield "any_of", (v1, v2,), op 60 | else: 61 | yield p 62 | elif name == "any_of": 63 | def items(): 64 | for w in args: 65 | (v1, v2) = (w, deaccent(w)) 66 | if v1 != v2: 67 | yield v1 68 | yield v2 69 | else: 70 | yield v1 71 | 72 | yield "any_of", list(items()), op 73 | else: 74 | yield p 75 | 76 | yield group_label, list(gen()) 77 | 78 | 79 | def add_implicit_punct(rules: Rules, config: SessionConfig): 80 | """ 81 | When writing rule, 82 | user usually doesn't care about some punct characters between words. 83 | We add them implicitly (unless this setting is turned off) 84 | """ 85 | for group_label, pattern in rules: 86 | def gen(): 87 | for p in pattern: 88 | yield p 89 | yield "punct", None, ExtendedOp("?") 90 | 91 | if len(pattern) == 1: 92 | yield group_label, pattern 93 | else: 94 | yield group_label, list(gen())[:-1] 95 | 96 | 97 | def add_implicit_hyphon(rules: Rules, config: SessionConfig): 98 | """ 99 | When writing rule, 100 | user usually doesn't care about hyphon characters - between words. 101 | """ 102 | for group_label, pattern in rules: 103 | def gen(): 104 | for p in pattern: 105 | yield p 106 | yield "value", "-", ExtendedOp("?") 107 | 108 | if len(pattern) == 1: 109 | yield group_label, pattern 110 | else: 111 | yield group_label, list(gen())[:-1] 112 | 113 | 114 | def handle_multi_word(rules: Rules, config: SessionConfig): 115 | """ 116 | spaCy splits everything in tokens. 117 | Words with dash ends up in different tokens. 
118 | 
119 | We don't want the user to even have to care about this, 
120 | so we make it work implicitly 
121 | 
122 | WORD("Knee-length") => WORD("Knee"), WORD("-"), WORD("length") 
123 | """ 
124 | for group_label, pattern in rules: 
125 | def gen(): 
126 | for p in pattern: 
127 | (name, args, op) = p 
128 | if name == "value" and is_complex(args): 
129 | yield "phrase", args, op 
130 | else: 
131 | yield p 
132 | 
133 | yield group_label, list(gen()) 
134 | 
135 | 
136 | def is_complex(arg: str) -> bool: 
137 | # if we want to use `-` as a word 
138 | if arg.strip() == "-": 
139 | return False 
140 | 
141 | splitters = ["-", " "] 
142 | return any([s in arg 
143 | for s in splitters]) 
144 | 
145 | 
146 | def has_complex(args: List[str]) -> bool: 
147 | """ 
148 | Tells if any of the arguments will be affected by the tokenizer 
149 | """ 
150 | return any([is_complex(a) 
151 | for a in args]) 
152 | 
153 | 
154 | def branch_pattern(pattern, config: SessionConfig): 
155 | """ 
156 | Creates a separate pattern list for each possible permutation 
157 | """ 
158 | root = Node() 
159 | current = root 
160 | depth = 0 
161 | for idx, p in enumerate(pattern): 
162 | if p[0] == "either": 
163 | n = Node() 
164 | current.add_next(n) 
165 | current = n 
166 | current.depth = depth 
167 | for e in p[1]: 
168 | current.add_child(e(config=config)) 
169 | depth += 1 
170 | elif p[0] == "any_of" and has_complex(p[1]): 
171 | _all = set(p[1]) 
172 | _complex = set(filter(is_complex, _all)) 
173 | simple = _all - _complex 
174 | n = Node() 
175 | current.add_next(n) 
176 | current = n 
177 | current.depth = depth 
178 | if len(simple) > 0: 
179 | current.add_child(("any_of", simple, p[2])) 
180 | for c in sorted(_complex): 
181 | current.add_child(("phrase", c, p[2])) 
182 | depth += 1 
183 | else: 
184 | n = Node(p) 
185 | current.add_next(n) 
186 | current = n 
187 | current.depth = depth 
188 | 
189 | for p in root.unwrap(): 
190 | yield p 
191 | 
192 | 
193 | def handle_rule_branching(rules: Rules, config: SessionConfig): 
194 | """ 
195 | If we have an OR statement, e.g. `WORD(w1)|WORD(w2)`, 
196 | the generic approach is to clone the rule and use w1 in one copy, w2 in the other. 
197 | It may be overkill, but some situations are not covered 
198 | by a simpler approach 
199 | """ 
200 | for group_label, pattern in rules: 
201 | # Covering WORD(w1)|WORD(w2) case 
202 | if any([p == "either" 
203 | for (p, _, _) in pattern]): 
204 | for p in branch_pattern(pattern, config): 
205 | yield group_label, p 
206 | 
207 | # Covering case when there are complex items in list 
208 | elif config.list_branching and any([p == "any_of" and has_complex(o) 
209 | for (p, o, _) in pattern]): 
210 | for p in branch_pattern(pattern, config): 
211 | yield group_label, p 
212 | else: 
213 | yield group_label, pattern 
214 | 
215 | 
216 | def dummy(rules: Rules, config: SessionConfig): 
217 | """ 
218 | Placeholder which does nothing 
219 | """ 
220 | logger.debug("Initial rules: {}".format(rules)) 
221 | return rules 
222 | 
223 | 
224 | def rule_tuple(d: Mapping[str, Any]) -> RuleGroup: 
225 | return d["label"], d["data"] 
226 | 
227 | 
228 | def expand_patterns(rules: Rules, config: SessionConfig): 
229 | """ 
230 | We can have situations where a pattern contains another pattern (via a Variable).
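(illustrative, mirroring the DSL examples elsewhere in this repo: Inner = {NUM, WORD("kg")} used as {PATTERN(Inner), WORD("net")})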
231 | We want to expand this inner pattern and prepend it to the outer pattern 
232 | """ 
233 | for group_label, pattern in rules: 
234 | def gen(): 
235 | for p in pattern: 
236 | if type(p) is tuple: 
237 | (k, other, op) = p 
238 | if k == "nested": 
239 | fn = other[0][0] 
240 | children = other[0][1] 
241 | yield fn, children, op 
242 | else: 
243 | yield p 
244 | else: 
245 | yield p 
246 | 
247 | yield group_label, list(gen()) 
248 | 
249 | 
250 | def flatten_2nd_level_nested(rules: Rules, config: SessionConfig): 
251 | """ 
252 | 1st level of nesting: use PATTERN(...) inside your rule 
253 | 2nd level of nesting: a PATTERN(...) which itself contains a PATTERN(...) and so on (recursively) 
254 | 
255 | we want to resolve only up to the 1st level 
256 | """ 
257 | 
258 | for group_label, pattern in rules: 
259 | def gen(): 
260 | for p in pattern: 
261 | if type(p) is list: 
262 | for item in p: 
263 | yield item 
264 | else: 
265 | yield p 
266 | 
267 | yield group_label, list(gen()) 
268 | 
269 | 
270 | def preprocess_rules(root, config: SessionConfig) -> Rules: 
271 | logger.info("Preprocessing rules") 
272 | 
273 | rules = [rule_tuple(doc()) 
274 | for doc in root 
275 | if doc and doc()] 
276 | 
277 | pipeline = [ 
278 | dummy, 
279 | expand_patterns, 
280 | handle_deaccent, 
281 | handle_rule_branching, 
282 | flatten_2nd_level_nested, 
283 | handle_multi_word, 
284 | handle_prefix 
285 | ] 
286 | 
287 | if config.implicit_hyphon: 
288 | logger.info("Adding implicit hyphens") 
289 | pipeline.append(add_implicit_hyphon) 
290 | elif config.implicit_punct: 
291 | logger.info("Adding implicit punctuation") 
292 | pipeline.append(add_implicit_punct) 
293 | 
294 | return reduce(lambda acc, p: p(acc, config), pipeline, rules) 
295 | 
-------------------------------------------------------------------------------- 
/tests/test_examples.py: 
-------------------------------------------------------------------------------- 
1 | import pytest 
2 | 
3 | import rita 
4 | 
5 | from utils import spacy_engine, standalone_engine, rust_engine, load_rules 
6 | 
7 | 
8 | @pytest.fixture(scope="session") 
9 | def bench_text(): 
10 | # TODO: think of new case for testing 
11 | pass 
12 | 
13 | 
14 | @pytest.mark.parametrize('engine', [spacy_engine]) 
15 | def test_color_car(engine): 
16 | text = "John Silver was driving a red car. It was BMW X6 Mclass. John likes driving it very much."
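# spaCy engine only: color-car.rita presumably relies on model-provided
# NER/POS information which the standalone and rust engines don't offer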
17 | parser = engine(load_rules("examples/color-car.rita")) 18 | entities = set(parser(text)) 19 | print(entities) 20 | 21 | expected = set([ 22 | ("John Silver", "PERSON"), # Normal NER 23 | ("red car", "CAR_COLOR"), # Our first rule 24 | ("BMW X6 Mclass", "CAR_MODEL"), # Our second rule 25 | ("John likes driving", "LIKED_ACTION") # Our third rule 26 | ]) 27 | 28 | assert entities.issuperset(expected) 29 | 30 | 31 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 32 | def test_fuzzy_matching(engine): 33 | parser = engine(""" 34 | !IMPORT("rita.modules.fuzzy") 35 | 36 | FUZZY("squirrel") -> MARK("CRITTER") 37 | """) 38 | 39 | # Check if we're matching with capitalized word 40 | t1 = """ 41 | Squirrel just walked outside 42 | """ 43 | 44 | entities = parser(t1) 45 | 46 | assert len(entities) == 1 47 | assert entities[0] == ("Squirrel", "CRITTER") 48 | 49 | # Check if we're matching with all CAPS 50 | t2 = """ 51 | there's a SQUIRREL 52 | """ 53 | 54 | entities = parser(t2) 55 | 56 | assert len(entities) == 1 57 | assert entities[0] == ("SQUIRREL", "CRITTER") 58 | 59 | 60 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 61 | def test_election(engine): 62 | parser = engine( 63 | """ 64 | {ENTITY("PERSON")+, WORD("was"), WORD("elected")}->MARK("WON_ELECTION") 65 | {WORD("defeating"), ENTITY("PERSON")+}->MARK("LOST_ELECTION") 66 | """ 67 | ) 68 | text = "Donald Trump was elected President in 2016 defeating Hilary Clinton." 69 | 70 | entities = set(parser(text)) 71 | expected = set([ 72 | ("Donald Trump was elected", "WON_ELECTION"), 73 | ("defeating Hilary Clinton", "LOST_ELECTION"), 74 | ]) 75 | print(entities) 76 | 77 | assert entities.issuperset(expected) 78 | 79 | 80 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 81 | def test_dash_case(engine): 82 | parser = engine(load_rules("examples/dress-match.rita")) 83 | text = """ 84 | Fitted, knee-length dress in soft velour 85 | """ 86 | 87 | entities = set(parser(text)) 88 | print(entities) 89 | expected = set([ 90 | ("Fitted, knee-length dress", "DRESS_TYPE"), 91 | ("soft velour", "DRESS_FABRIC"), 92 | ]) 93 | 94 | assert entities.issuperset(expected) 95 | 96 | 97 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) 98 | def test_exclude_word(engine): 99 | # Rust engine doesn't work here, because Re2 doesn't support backtracking operator 100 | parser = engine(load_rules("examples/excluding-word.rita")) 101 | 102 | t1 = "weather is awesome" 103 | t2 = "weather is cold" 104 | 105 | r1 = parser(t1) 106 | r2 = parser(t2) 107 | 108 | assert r1[0] == ("weather is awesome", "GOOD_WEATHER") 109 | assert len(r2) == 0 110 | 111 | 112 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 113 | def test_escape_string(engine): 114 | # If it compiles - good enough 115 | engine(load_rules("examples/match-with-escaped-string.rita")) 116 | 117 | 118 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 119 | def test_case_sensitive(engine): 120 | parser = engine( 121 | """ 122 | !CONFIG("ignore_case", "N") 123 | 124 | variants = {"Bitcoin", "BTC", "Bitcoin Cash"} 125 | 126 | {IN_LIST(variants)}->MARK("CRYPTO") 127 | """ 128 | ) 129 | 130 | text = """ 131 | A bitcoin mining magnate has proposed a new development fund for Bitcoin Cash. 
132 | According to BTC.TOP CEO Jiang Zhuoer, the scheme will 'tax' Bitcoin Cash mining rewards 133 | in an effort to increase funding for Bitcoin Cash infrastructure. 134 | """ 135 | 136 | results = parser(text) 137 | print(results) 138 | filtered = list([r 139 | for r in results 140 | if r[1] == "CRYPTO"]) 141 | 142 | assert len(filtered) > 0 143 | assert filtered[0] == ("Bitcoin Cash", "CRYPTO") 144 | 145 | 146 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 147 | def test_with_implicit_hyphon(engine): 148 | parser = engine( 149 | """ 150 | !CONFIG("implicit_punct", "N") 151 | !CONFIG("implicit_hyphon", "Y") 152 | {WORD("Hello"), WORD("World")}->MARK("HYPHON_LABEL") 153 | WORD("Hello")->MARK("HELLO_LABEL") 154 | """ 155 | ) 156 | 157 | text = "Hello - world!" 158 | results = parser(text) 159 | print(results) 160 | 161 | assert len(results) == 1 162 | assert results[0] == ("Hello - world", "HYPHON_LABEL") 163 | 164 | 165 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 166 | def test_without_implicit_hyphon(engine): 167 | parser = engine( 168 | """ 169 | !CONFIG("implicit_punct", "N") 170 | !CONFIG("implicit_hyphon", "N") 171 | {WORD("Hello"), WORD("World")}->MARK("HYPHON_LABEL") 172 | WORD("Hello")->MARK("HELLO_LABEL") 173 | """ 174 | ) 175 | 176 | text = "Hello - world!" 177 | results = parser(text) 178 | print(results) 179 | 180 | assert len(results) == 1 181 | assert results[0] == ("Hello", "HELLO_LABEL") 182 | 183 | 184 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 185 | def test_prefix(engine): 186 | parser = engine( 187 | """ 188 | science = {"mathematics", "physics"} 189 | {PREFIX("meta"), IN_LIST(science)}->MARK("META_SCIENCE") 190 | {PREFIX("pseudo"), WORD("science")}->MARK("PSEUDO_SCIENCE") 191 | """ 192 | ) 193 | 194 | text = """ 195 | This paper is full of metaphysics and pseudoscience 196 | """ 197 | 198 | results = parser(text) 199 | print(results) 200 | assert results[0] == ("metaphysics", "META_SCIENCE") 201 | assert results[1] == ("pseudoscience", "PSEUDO_SCIENCE") 202 | 203 | 204 | @pytest.mark.parametrize('engine', ["standalone", "rust"]) 205 | def test_compile_context(engine): 206 | if engine == "rust": 207 | from rita.engine.translate_rust import load_lib 208 | lib = load_lib() 209 | if lib is None: 210 | pytest.skip("Missing rita-rust dynamic lib, skipping related tests") 211 | rules = """ 212 | 213 | {WORD*, IN_LIST(companies), WORD*}->MARK("SUSPISCIOUS_COMPANY") 214 | """ 215 | parser = rita.compile_string(rules, use_engine=engine, companies=["CompanyA", "CompanyB"]) 216 | print(parser.patterns) 217 | 218 | results = list(parser.execute("CompanyB is doing it's dirty work.")) 219 | assert results[0] == { 220 | "start": 0, 221 | "end": 33, 222 | "label": "SUSPISCIOUS_COMPANY", 223 | "text": "CompanyB is doing it's dirty work", 224 | "submatches": [ 225 | { 226 | "start": 0, 227 | "end": 33, 228 | "key": "SUSPISCIOUS_COMPANY", 229 | "text": "CompanyB is doing it's dirty work" 230 | }, 231 | { 232 | "start": 0, 233 | "end": 9, 234 | "key": "s2", 235 | "text": "CompanyB" 236 | }, 237 | { 238 | "start": 9, 239 | "end": 33, 240 | "key": "s4", 241 | "text": "is doing it's dirty work" 242 | } 243 | ], 244 | } 245 | 246 | 247 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 248 | def test_benchmark(benchmark, engine, bench_text): 249 | """ 250 | These tests will only run if parameters: 251 | `--benchmark-enable` or 252 | 
`--benchmark-only` 253 | are added 254 | """ 255 | parser = engine(load_rules("examples/cheap-phones.rita")) 256 | 257 | def parse_rows(parser, rows): 258 | for r in rows: 259 | parser(r) 260 | 261 | benchmark.pedantic( 262 | parse_rows, 263 | args=(parser, bench_text), 264 | iterations=3, 265 | rounds=3 266 | ) 267 | 268 | 269 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 270 | def test_variable_pattern(engine): 271 | parser = engine(""" 272 | Complex_Number = { NUM+, WORD("/")?, NUM? } 273 | {PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH") 274 | {PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT") 275 | """) 276 | text = """ 277 | It is 17 1/2 inches width and 10 inches height 278 | """ 279 | 280 | results = parser(text) 281 | assert len(results) == 2 282 | 283 | 284 | @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) 285 | def test_inlist_longest(engine): 286 | parser = engine(""" 287 | units = {"m", "mm", "cm"} 288 | dimensions = {"width", "height", "length"} 289 | {IN_LIST(dimensions), NUM, IN_LIST(units)}->MARK("TEST") 290 | """) 291 | 292 | text = """ 293 | width 10 mm 294 | """ 295 | 296 | results = parser(text) 297 | 298 | assert len(results) == 1 299 | (result, label) = results[0] 300 | assert result == "width 10 mm" 301 | 302 | 303 | @pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 304 | def test_inlist_word_based(engine): 305 | parser = engine(""" 306 | units = {"m", "mm", "cm", "inches", "in"} 307 | {IN_LIST(units), NUM}->MARK("TEST") 308 | """) 309 | 310 | text = """ 311 | twin 20 turbo 312 | """ 313 | 314 | results = parser(text) 315 | print(results) 316 | assert len(results) == 0 317 | 318 | 319 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 320 | def test_pluralize(engine): 321 | pytest.importorskip("inflect") 322 | parser = engine(""" 323 | !IMPORT("rita.modules.pluralize") 324 | 325 | vehicles={"car", "motorbike", "bicycle", "ship", "plane"} 326 | {NUM, PLURALIZE(vehicles)}->MARK("VEHICLES") 327 | """) 328 | 329 | text = """ 330 | There were 7 cars, 2 motorbikes, 1 ship, 1 bicycle and 9 planes 331 | """ 332 | 333 | results = set([text 334 | for text, label in parser(text) 335 | if label == "VEHICLES"]) 336 | print(results) 337 | 338 | assert len(results) == 5 339 | assert {"7 cars", "2 motorbikes", "1 ship", "1 bicycle", "9 planes"} == results 340 | 341 | 342 | @pytest.mark.parametrize('engine', [spacy_engine]) 343 | def test_orth_example(engine): 344 | parser = engine(""" 345 | !IMPORT("rita.modules.orth") 346 | 347 | {ORTH("IEEE")}->MARK("TAGGED_MATCH") 348 | {ORTH("ISO")?}->MARK("TAGGED_MATCH") 349 | """) 350 | 351 | text = """ 352 | it should match IEEE or ISO, but should ignore ieee. 
353 | """ 354 | 355 | results = set([text 356 | for text, label in parser(text) 357 | if label == "TAGGED_MATCH"]) 358 | 359 | print(results) 360 | assert len(results) == 2 361 | assert {"IEEE", "ISO"} == results 362 | 363 | 364 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 365 | def test_regex_module_start(engine): 366 | parser = engine(""" 367 | !IMPORT("rita.modules.regex") 368 | 369 | {REGEX("^a")}->MARK("TAGGED_MATCH") 370 | """) 371 | 372 | text = """ 373 | there are many letters in the alphabet 374 | """ 375 | 376 | results = set([text 377 | for text, label in parser(text) 378 | if label == "TAGGED_MATCH"]) 379 | 380 | print(results) 381 | assert len(results) == 2 382 | assert {"are", "alphabet"} == results 383 | 384 | 385 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 386 | def test_regex_module_end(engine): 387 | parser = engine(""" 388 | !IMPORT("rita.modules.regex") 389 | 390 | {REGEX("e$")}->MARK("TAGGED_MATCH") 391 | """) 392 | 393 | text = """ 394 | there are many letters in the alphabet 395 | """ 396 | 397 | results = set([text 398 | for text, label in parser(text) 399 | if label == "TAGGED_MATCH"]) 400 | 401 | print(results) 402 | assert len(results) == 3 403 | assert {"there", "are", "the"} == results 404 | 405 | 406 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 407 | def test_regex_module_middle(engine): 408 | parser = engine(""" 409 | !IMPORT("rita.modules.regex") 410 | 411 | {REGEX("et")}->MARK("TAGGED_MATCH") 412 | """) 413 | 414 | text = """ 415 | there are many letters in the alphabet 416 | """ 417 | 418 | results = set([text 419 | for text, label in parser(text) 420 | if label == "TAGGED_MATCH"]) 421 | 422 | print(results) 423 | assert len(results) == 2 424 | assert {"letters", "alphabet"} == results 425 | 426 | 427 | @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) 428 | def test_regex_module_strict(engine): 429 | parser = engine(""" 430 | !IMPORT("rita.modules.regex") 431 | 432 | {REGEX("^the$")}->MARK("TAGGED_MATCH") 433 | """) 434 | 435 | text = """ 436 | there are many letters in the alphabet 437 | """ 438 | 439 | results = set([text 440 | for text, label in parser(text) 441 | if label == "TAGGED_MATCH"]) 442 | 443 | print(results) 444 | assert len(results) == 1 445 | assert {"the"} == results 446 | 447 | 448 | @pytest.mark.parametrize('engine', [standalone_engine]) 449 | def test_custom_regex_impl(engine): 450 | import re 451 | 452 | class MyMatch(object): 453 | def __init__(self, result): 454 | self.result = result 455 | 456 | def start(self): 457 | return 0 458 | 459 | def end(self): 460 | return len(self.result) 461 | 462 | def group(self): 463 | return self.result 464 | 465 | def groupdict(self): 466 | return {} 467 | 468 | @property 469 | def lastgroup(self): 470 | return "TEST_MATCH" 471 | 472 | class MyCustomRegex(object): 473 | DOTALL = re.DOTALL 474 | IGNORECASE = re.IGNORECASE 475 | 476 | def compile(self, *args, **kwargs): 477 | return self 478 | 479 | def match(self, *args, **kwargs): 480 | return self 481 | 482 | def search(self, *args, **kwargs): 483 | return self 484 | 485 | def finditer(self, text): 486 | return [MyMatch("Hello new REGEX")] 487 | 488 | parser = engine(""" 489 | {WORD("Hello"), WORD("new"), WORD("regex")}->MARK("TEST_MATCH") 490 | """, regex_impl=MyCustomRegex()) 491 | 492 | results = parser("Hello new REGEX!") 493 | 494 | assert len(results) == 1 495 | 496 | 497 | 
@pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 498 | def test_complex_number_match(engine): 499 | parser = engine(""" 500 | fractions={"1 / 2", "3 / 4", "1 / 8", "3 / 8", "5 / 8", "7 / 8", "1 / 16", "3 / 16", "5 / 16", "7 / 16", "9 / 16", 501 | "11 / 16", "13 / 16", "15 / 16", "1 / 32", "3 / 32", "5 / 32", "7 / 32", "9 / 32", "11 / 32", "13 / 32", "15 / 32", 502 | "17 / 32", "19 / 32", "21 / 32", "23 / 32", "25 / 32", "27 / 32", "29 / 32", "31 / 32"} 503 | 504 | num_with_fractions = {NUM, WORD("-")?, IN_LIST(fractions)} 505 | complex_number = {NUM|PATTERN(num_with_fractions)} 506 | 507 | {WORD("length"), PATTERN(complex_number)}->MARK("NUMBER") 508 | """) 509 | 510 | simple_number = parser("length 50 cm") 511 | assert len(simple_number) == 1 512 | assert ("length 50", "NUMBER") == simple_number[0] 513 | 514 | complex_number = parser('length 10 1 / 2 "') 515 | 516 | assert len(complex_number) == 1 517 | assert ("length 10 1 / 2", "NUMBER") == complex_number[0] 518 | 519 | 520 | @pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 521 | def test_simple_float_number_match(engine): 522 | parser = engine(""" 523 | NUM->MARK("NUMBER") 524 | """) 525 | 526 | assert parser("25")[0] == ("25", "NUMBER") 527 | assert parser("25.7")[0] == ("25.7", "NUMBER") 528 | assert parser("19,6")[0] == ("19,6", "NUMBER") 529 | 530 | 531 | @pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) 532 | def test_invalid_entity(engine): 533 | with pytest.raises(RuntimeError): 534 | engine(""" 535 | ENTITY("ORG")->MARK("ORG_MATCH") 536 | """) 537 | 538 | 539 | @pytest.mark.parametrize('engine', [spacy_engine]) 540 | def test_multiple_pos(engine): 541 | parser = engine(""" 542 | {POS("VERB", "NOUN")}->MARK("POS_MATCH") 543 | """) 544 | 545 | text = """ 546 | Here we have a verb: doing and noun: stuff 547 | """ 548 | 549 | results = set([text 550 | for text, label in parser(text) 551 | if label == "POS_MATCH"]) 552 | 553 | print(results) 554 | assert len(results) == 5 555 | assert {"noun", "have", "verb", "doing", "stuff"} == results 556 | 557 | 558 | @pytest.mark.parametrize('engine', [spacy_engine]) 559 | def test_multiple_entities(engine): 560 | parser = engine(""" 561 | {ENTITY("PERSON", "ORG")}->MARK("ENTITY_MATCH") 562 | """) 563 | 564 | text = """ 565 | John has been working at AT&T for the past year 566 | """ 567 | 568 | results = set([text 569 | for text, label in parser(text) 570 | if label == "ENTITY_MATCH"]) 571 | 572 | print(results) 573 | assert len(results) == 2 574 | assert {"AT&T", "John"} == results 575 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 
5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "21.2.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] 19 | docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.4" 26 | description = "Cross-platform colored terminal text." 27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [[package]] 32 | name = "coverage" 33 | version = "5.5" 34 | description = "Code coverage measurement for Python" 35 | category = "dev" 36 | optional = false 37 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 38 | 39 | [package.extras] 40 | toml = ["toml"] 41 | 42 | [[package]] 43 | name = "importlib-metadata" 44 | version = "2.1.1" 45 | description = "Read metadata from Python packages" 46 | category = "dev" 47 | optional = false 48 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" 49 | 50 | [package.dependencies] 51 | zipp = ">=0.5" 52 | 53 | [package.extras] 54 | docs = ["sphinx", "rst.linker"] 55 | testing = ["packaging", "pep517", "unittest2", "importlib-resources (>=1.3)"] 56 | 57 | [[package]] 58 | name = "more-itertools" 59 | version = "8.10.0" 60 | description = "More routines for operating on iterables, beyond itertools" 61 | category = "dev" 62 | optional = false 63 | python-versions = ">=3.5" 64 | 65 | [[package]] 66 | name = "packaging" 67 | version = "20.9" 68 | description = "Core utilities for Python packages" 69 | category = "dev" 70 | optional = false 71 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 72 | 73 | [package.dependencies] 74 | pyparsing = ">=2.0.2" 75 | 76 | [[package]] 77 | name = "pathlib2" 78 | version = "2.3.6" 79 | description = "Object-oriented filesystem paths" 80 | category = "dev" 81 | optional = false 82 | python-versions = "*" 83 | 84 | [package.dependencies] 85 | six = "*" 86 | 87 | [[package]] 88 | name = "pluggy" 89 | version = "0.13.1" 90 | description = "plugin and hook calling mechanisms for python" 91 | category = "dev" 92 | optional = false 93 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 94 | 95 | [package.dependencies] 96 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 97 | 98 | [package.extras] 99 | dev = ["pre-commit", "tox"] 100 | 101 | [[package]] 102 | name = "ply" 103 | version = "3.11" 104 | description = "Python Lex & Yacc" 105 | category = "main" 106 | optional = false 107 | python-versions = "*" 108 | 109 | [[package]] 110 | name = "py" 111 | version = "1.10.0" 112 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 113 | category = "dev" 114 | optional = false 115 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 116 | 
117 | [[package]] 118 | name = "py-cpuinfo" 119 | version = "8.0.0" 120 | description = "Get CPU info with pure Python 2 & 3" 121 | category = "dev" 122 | optional = false 123 | python-versions = "*" 124 | 125 | [[package]] 126 | name = "pyparsing" 127 | version = "2.4.7" 128 | description = "Python parsing module" 129 | category = "dev" 130 | optional = false 131 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 132 | 133 | [[package]] 134 | name = "pytest" 135 | version = "5.4.3" 136 | description = "pytest: simple powerful testing with Python" 137 | category = "dev" 138 | optional = false 139 | python-versions = ">=3.5" 140 | 141 | [package.dependencies] 142 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 143 | attrs = ">=17.4.0" 144 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 145 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 146 | more-itertools = ">=4.0.0" 147 | packaging = "*" 148 | pathlib2 = {version = ">=2.2.0", markers = "python_version < \"3.6\""} 149 | pluggy = ">=0.12,<1.0" 150 | py = ">=1.5.0" 151 | wcwidth = "*" 152 | 153 | [package.extras] 154 | checkqa-mypy = ["mypy (==v0.761)"] 155 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 156 | 157 | [[package]] 158 | name = "pytest-benchmark" 159 | version = "3.4.1" 160 | description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." 161 | category = "dev" 162 | optional = false 163 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 164 | 165 | [package.dependencies] 166 | py-cpuinfo = "*" 167 | pytest = ">=3.8" 168 | 169 | [package.extras] 170 | aspect = ["aspectlib"] 171 | elasticsearch = ["elasticsearch"] 172 | histogram = ["pygal", "pygaljs"] 173 | 174 | [[package]] 175 | name = "pytest-cov" 176 | version = "2.12.1" 177 | description = "Pytest plugin for measuring coverage." 
178 | category = "dev" 179 | optional = false 180 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 181 | 182 | [package.dependencies] 183 | coverage = ">=5.2.1" 184 | pytest = ">=4.6" 185 | toml = "*" 186 | 187 | [package.extras] 188 | testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] 189 | 190 | [[package]] 191 | name = "pytest-mock" 192 | version = "2.0.0" 193 | description = "Thin-wrapper around the mock package for easier use with py.test" 194 | category = "dev" 195 | optional = false 196 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 197 | 198 | [package.dependencies] 199 | pytest = ">=2.7" 200 | 201 | [package.extras] 202 | dev = ["pre-commit", "tox"] 203 | 204 | [[package]] 205 | name = "six" 206 | version = "1.16.0" 207 | description = "Python 2 and 3 compatibility utilities" 208 | category = "dev" 209 | optional = false 210 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 211 | 212 | [[package]] 213 | name = "toml" 214 | version = "0.10.2" 215 | description = "Python Library for Tom's Obvious, Minimal Language" 216 | category = "dev" 217 | optional = false 218 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 219 | 220 | [[package]] 221 | name = "wcwidth" 222 | version = "0.2.5" 223 | description = "Measures the displayed width of unicode strings in a terminal" 224 | category = "dev" 225 | optional = false 226 | python-versions = "*" 227 | 228 | [[package]] 229 | name = "zipp" 230 | version = "1.2.0" 231 | description = "Backport of pathlib-compatible object wrapper for zip files" 232 | category = "dev" 233 | optional = false 234 | python-versions = ">=2.7" 235 | 236 | [package.extras] 237 | docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] 238 | testing = ["pathlib2", "unittest2", "jaraco.itertools", "func-timeout"] 239 | 240 | [metadata] 241 | lock-version = "1.1" 242 | python-versions = "^3.5" 243 | content-hash = "b43cc00e376732988dd656db2e2321f17b14fb6a1bd2caec1319e128ef76d8fa" 244 | 245 | [metadata.files] 246 | atomicwrites = [ 247 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 248 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 249 | ] 250 | attrs = [ 251 | {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, 252 | {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, 253 | ] 254 | colorama = [ 255 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 256 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 257 | ] 258 | coverage = [ 259 | {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, 260 | {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, 261 | {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, 262 | {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, 263 | {file = 
"coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, 264 | {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, 265 | {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, 266 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, 267 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, 268 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, 269 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, 270 | {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, 271 | {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, 272 | {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, 273 | {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, 274 | {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, 275 | {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, 276 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, 277 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, 278 | {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, 279 | {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, 280 | {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, 281 | {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, 282 | {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, 283 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, 284 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, 285 | {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, 286 | {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, 287 | {file = "coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, 288 | {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, 289 | {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, 290 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, 291 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, 292 | {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, 293 | {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, 294 | {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, 295 | {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, 296 | {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, 297 | {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, 298 | {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, 299 | {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, 300 | {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, 301 | {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, 302 | {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, 303 | {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, 304 | {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, 305 | {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, 306 | {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, 307 | {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, 308 | {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, 309 | {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, 310 | {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, 311 | ] 312 | importlib-metadata = [ 313 | {file = "importlib_metadata-2.1.1-py2.py3-none-any.whl", hash = "sha256:c2d6341ff566f609e89a2acb2db190e5e1d23d5409d6cc8d2fe34d72443876d4"}, 314 | {file = "importlib_metadata-2.1.1.tar.gz", hash = 
"sha256:b8de9eff2b35fb037368f28a7df1df4e6436f578fa74423505b6c6a778d5b5dd"}, 315 | ] 316 | more-itertools = [ 317 | {file = "more-itertools-8.10.0.tar.gz", hash = "sha256:1debcabeb1df793814859d64a81ad7cb10504c24349368ccf214c664c474f41f"}, 318 | {file = "more_itertools-8.10.0-py3-none-any.whl", hash = "sha256:56ddac45541718ba332db05f464bebfb0768110111affd27f66e0051f276fa43"}, 319 | ] 320 | packaging = [ 321 | {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, 322 | {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, 323 | ] 324 | pathlib2 = [ 325 | {file = "pathlib2-2.3.6-py2.py3-none-any.whl", hash = "sha256:3a130b266b3a36134dcc79c17b3c7ac9634f083825ca6ea9d8f557ee6195c9c8"}, 326 | {file = "pathlib2-2.3.6.tar.gz", hash = "sha256:7d8bcb5555003cdf4a8d2872c538faa3a0f5d20630cb360e518ca3b981795e5f"}, 327 | ] 328 | pluggy = [ 329 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 330 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 331 | ] 332 | ply = [ 333 | {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, 334 | {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, 335 | ] 336 | py = [ 337 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 338 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 339 | ] 340 | py-cpuinfo = [ 341 | {file = "py-cpuinfo-8.0.0.tar.gz", hash = "sha256:5f269be0e08e33fd959de96b34cd4aeeeacac014dd8305f70eb28d06de2345c5"}, 342 | ] 343 | pyparsing = [ 344 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 345 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 346 | ] 347 | pytest = [ 348 | {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"}, 349 | {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"}, 350 | ] 351 | pytest-benchmark = [ 352 | {file = "pytest-benchmark-3.4.1.tar.gz", hash = "sha256:40e263f912de5a81d891619032983557d62a3d85843f9a9f30b98baea0cd7b47"}, 353 | {file = "pytest_benchmark-3.4.1-py2.py3-none-any.whl", hash = "sha256:36d2b08c4882f6f997fd3126a3d6dfd70f3249cde178ed8bbc0b73db7c20f809"}, 354 | ] 355 | pytest-cov = [ 356 | {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, 357 | {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, 358 | ] 359 | pytest-mock = [ 360 | {file = "pytest-mock-2.0.0.tar.gz", hash = "sha256:b35eb281e93aafed138db25c8772b95d3756108b601947f89af503f8c629413f"}, 361 | {file = "pytest_mock-2.0.0-py2.py3-none-any.whl", hash = "sha256:cb67402d87d5f53c579263d37971a164743dc33c159dfb4fb4a86f37c5552307"}, 362 | ] 363 | six = [ 364 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 365 | {file = 
"six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 366 | ] 367 | toml = [ 368 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 369 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 370 | ] 371 | wcwidth = [ 372 | {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, 373 | {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, 374 | ] 375 | zipp = [ 376 | {file = "zipp-1.2.0-py2.py3-none-any.whl", hash = "sha256:e0d9e63797e483a30d27e09fffd308c59a700d365ec34e93cc100844168bf921"}, 377 | {file = "zipp-1.2.0.tar.gz", hash = "sha256:c70410551488251b0fee67b460fb9a536af8d6f9f008ad10ac51f615b6a521b1"}, 378 | ] 379 | -------------------------------------------------------------------------------- /tests/test_rules.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import re 3 | import tempfile 4 | 5 | import pytest 6 | import rita 7 | 8 | 9 | class TestSpacy(object): 10 | @property 11 | def punct(self): 12 | return {'IS_PUNCT': True, 'OP': '?'} 13 | 14 | def compiler(self, rules): 15 | pytest.importorskip("spacy", minversion="2.1") 16 | return rita.compile_string(rules, use_engine="spacy") 17 | 18 | def test_punct(self): 19 | rules = self.compiler('PUNCT->MARK("SOME_PUNCT")') 20 | print(rules) 21 | assert len(rules) == 1 22 | assert rules[0] == { 23 | "pattern": [{"IS_PUNCT": True}], 24 | "label": "SOME_PUNCT" 25 | } 26 | 27 | def test_number(self): 28 | rules = self.compiler('NUM("42")->MARK("SOME_NUMBER")') 29 | print(rules) 30 | assert len(rules) == 1 31 | assert rules[0] == { 32 | "pattern": [{"LOWER": "42"}], 33 | "label": "SOME_NUMBER" 34 | } 35 | 36 | def test_pos(self): 37 | rules = self.compiler('POS("VERB")->MARK("SOME_POS")') 38 | print(rules) 39 | assert len(rules) == 1 40 | assert rules[0] == { 41 | "pattern": [{"POS": "VERB"}], 42 | "label": "SOME_POS" 43 | } 44 | 45 | def test_single_word(self): 46 | rules = self.compiler('WORD("Test")->MARK("SOME_LABEL")') 47 | print(rules) 48 | assert len(rules) == 1 49 | assert rules[0] == { 50 | "pattern": [{"LOWER": "test"}], 51 | "label": "SOME_LABEL" 52 | } 53 | 54 | def test_multiple_words(self): 55 | rules = self.compiler(''' 56 | words = {"test1", "test2"} 57 | IN_LIST(words)->MARK("MULTI_LABEL") 58 | ''') 59 | print(rules) 60 | assert len(rules) == 1 61 | assert rules[0] == { 62 | "pattern": [{"LOWER": {"IN": ["test1", "test2"]}}], 63 | "label": "MULTI_LABEL" 64 | } 65 | 66 | def test_simple_pattern(self): 67 | rules = self.compiler(''' 68 | {WORD("test1"), WORD("test2")}->MARK("SIMPLE_PATTERN") 69 | ''') 70 | print(rules) 71 | assert len(rules) == 1 72 | assert rules[0] == { 73 | "pattern": [{"LOWER": "test1"}, self.punct, {"LOWER": "test2"}], 74 | "label": "SIMPLE_PATTERN" 75 | } 76 | 77 | def test_or_branch(self): 78 | rules = self.compiler(''' 79 | {WORD("test1")|WORD("test2")}->MARK("SPLIT_LABEL") 80 | ''') 81 | print(rules) 82 | assert len(rules) == 2 83 | assert rules[0] == { 84 | "pattern": [{"LOWER": "test1"}], 85 | "label": "SPLIT_LABEL" 86 | } 87 | assert rules[1] == { 88 | "pattern": [{"LOWER": "test2"}], 89 | "label": "SPLIT_LABEL" 90 | } 91 | 92 | def test_or_branch_multi(self): 93 | rules = self.compiler(''' 94 | 
{WORD("test1")|WORD("test2"),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 95 | ''') 96 | print(rules) 97 | assert len(rules) == 4 98 | assert rules[0] == { 99 | "pattern": [{"LOWER": "test1"}, self.punct, {"LOWER": "test3"}], 100 | "label": "MULTI_SPLIT_LABEL" 101 | } 102 | assert rules[1] == { 103 | "pattern": [{"LOWER": "test2"}, self.punct, {"LOWER": "test3"}], 104 | "label": "MULTI_SPLIT_LABEL" 105 | } 106 | assert rules[2] == { 107 | "pattern": [{"LOWER": "test1"}, self.punct, {"LOWER": "test4"}], 108 | "label": "MULTI_SPLIT_LABEL" 109 | } 110 | assert rules[3] == { 111 | "pattern": [{"LOWER": "test2"}, self.punct, {"LOWER": "test4"}], 112 | "label": "MULTI_SPLIT_LABEL" 113 | } 114 | 115 | def test_or_branch_multi_w_single(self): 116 | rules = self.compiler(''' 117 | numbers={"one", "two", "three"} 118 | {WORD("test1")|WORD("test2"),IN_LIST(numbers),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 119 | ''') 120 | print(rules) 121 | assert len(rules) == 4 122 | list_items = {"LOWER": {"IN": ["one", "three", "two"]}} 123 | assert rules[0] == { 124 | "pattern": [{"LOWER": "test1"}, self.punct, list_items, self.punct, {"LOWER": "test3"}], 125 | "label": "MULTI_SPLIT_LABEL" 126 | } 127 | assert rules[1] == { 128 | "pattern": [{"LOWER": "test2"}, self.punct, list_items, self.punct, {"LOWER": "test3"}], "label": "MULTI_SPLIT_LABEL"} 129 | assert rules[2] == { 130 | "pattern": [{"LOWER": "test1"}, self.punct, list_items, self.punct, {"LOWER": "test4"}], 131 | "label": "MULTI_SPLIT_LABEL" 132 | } 133 | assert rules[3] == { 134 | "pattern": [{"LOWER": "test2"}, self.punct, list_items, self.punct, {"LOWER": "test4"}], 135 | "label": "MULTI_SPLIT_LABEL" 136 | } 137 | 138 | def test_branching_list(self): 139 | rules = self.compiler(''' 140 | items={"test1", "test2", "test-3", "test4"} 141 | {IN_LIST(items)}->MARK("SPLIT_LIST") 142 | ''') 143 | print(rules) 144 | assert len(rules) == 2 145 | assert rules[0] == { 146 | "label": "SPLIT_LIST", 147 | "pattern": [{"LOWER": {"IN": ["test1", "test2", "test4"]}}] 148 | } 149 | assert rules[1] == { 150 | "label": "SPLIT_LIST", 151 | "pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "3"}] 152 | } 153 | 154 | def test_double_branching_list(self): 155 | rules = self.compiler(''' 156 | items={"test1", "test2", "test-3", "test4", "test-5"} 157 | {IN_LIST(items)}->MARK("SPLIT_LIST") 158 | ''') 159 | print(rules) 160 | assert len(rules) == 3 161 | assert rules[0] == { 162 | "label": "SPLIT_LIST", 163 | "pattern": [{"LOWER": {"IN": ["test1", "test2", "test4"]}}] 164 | } 165 | assert rules[1] == { 166 | "label": "SPLIT_LIST", 167 | "pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "3"}] 168 | } 169 | assert rules[2] == { 170 | "label": "SPLIT_LIST", 171 | "pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "5"}] 172 | } 173 | 174 | def test_word_with_spaces(self): 175 | rules = self.compiler(''' 176 | WORD("test1 test2")->MARK("SPLIT_WORD") 177 | ''') 178 | print(rules) 179 | # It should be split into two: WORD("test1"), WORD("test2") 180 | assert len(rules) == 1 181 | assert rules[0] == { 182 | "label": "SPLIT_WORD", 183 | "pattern": [{"LOWER": "test1"}, {"LOWER": "test2"}] 184 | } 185 | 186 | def test_word_with_dash(self): 187 | rules = self.compiler(''' 188 | WORD("test1-test2")->MARK("SPLIT_WORD") 189 | ''') 190 | print(rules) 191 | # It should be split into two: WORD("test1"), WORD("test2") 192 | assert len(rules) == 1 193 | assert rules[0] == { 194 | "label": "SPLIT_WORD", 195 | "pattern": [{"LOWER": "test1"}, {"LOWER": 
"-"}, {"LOWER": "test2"}] 196 | } 197 | 198 | def test_word_with_accent(self): 199 | rules = self.compiler(''' 200 | WORD("Šarūnas")->MARK("TWO_WORDS") 201 | ''') 202 | print(rules) 203 | assert len(rules) == 1 204 | assert rules[0] == { 205 | "label": "TWO_WORDS", 206 | "pattern": [{"LOWER": {"IN": ["sarunas", "šarūnas"]}}] 207 | } 208 | 209 | def test_list_with_accent(self): 210 | rules = self.compiler(''' 211 | names={"Jonas", "Jurgis", "Šarūnas"} 212 | IN_LIST(names)->MARK("EXTENDED_LIST") 213 | ''') 214 | print(rules) 215 | assert len(rules) == 1 216 | assert rules[0] == { 217 | "label": "EXTENDED_LIST", 218 | "pattern": [{"LOWER": {"IN": ["jonas", "jurgis", "sarunas", "šarūnas"]}}] 219 | } 220 | 221 | def test_prefix_on_word(self): 222 | rules = self.compiler(''' 223 | {PREFIX("meta"), WORD("physics")}->MARK("META_WORD") 224 | ''') 225 | print(rules) 226 | assert len(rules) == 1 227 | assert rules[0] == { 228 | "label": "META_WORD", 229 | "pattern": [{"LOWER": "metaphysics"}] 230 | } 231 | 232 | def test_prefix_on_list(self): 233 | rules = self.compiler(''' 234 | science = {"physics", "mathematics"} 235 | {PREFIX("meta"), IN_LIST(science)}->MARK("META_LIST") 236 | ''') 237 | print(rules) 238 | assert len(rules) == 1 239 | assert rules[0] == { 240 | "label": "META_LIST", 241 | "pattern": [{"LOWER": {"IN": ["metamathematics", "metaphysics"]}}] 242 | } 243 | 244 | def test_prefix_on_unknown_type(self): 245 | rules = self.compiler(''' 246 | {PREFIX("test"), ANY}->MARK("NOT_VALID") 247 | ''') 248 | print(rules) 249 | assert len(rules) == 1 250 | assert rules[0] == { 251 | "label": "NOT_VALID", 252 | "pattern": [{}] 253 | } 254 | 255 | def test_multiple_optionals(self): 256 | rules = self.compiler(""" 257 | {NUM+, WORD("-")?, NUM?, WORD("/")?, NUM?}->MARK("NUMBER_PATTERN") 258 | """) 259 | print(rules) 260 | assert len(rules) == 1 261 | assert rules[0] == { 262 | "label": "NUMBER_PATTERN", 263 | "pattern": [ 264 | {"LOWER": {"REGEX": "((\\d+[\\.,]\\d+)|(\\d+))"}, "OP": "+"}, 265 | {"IS_PUNCT": True, "OP": "?"}, 266 | {"LOWER": "-", "OP": "?"}, 267 | {"IS_PUNCT": True, "OP": "?"}, 268 | {"LOWER": {"REGEX": "((\\d+[\\.,]\\d+)|(\\d+))"}, "OP": "?"}, 269 | {"IS_PUNCT": True, "OP": "?"}, 270 | {"LOWER": "/", "OP": "?"}, 271 | {"IS_PUNCT": True, "OP": "?"}, 272 | {"LOWER": {"REGEX": "((\\d+[\\.,]\\d+)|(\\d+))"}, "OP": "?"}, 273 | ] 274 | } 275 | 276 | def test_optional_list(self): 277 | rules = self.compiler(""" 278 | elements = {"one", "two"} 279 | {IN_LIST(elements)?}->MARK("OPTIONAL_LIST") 280 | """) 281 | 282 | print(rules) 283 | 284 | assert len(rules) == 1 285 | assert rules[0] == { 286 | "label": "OPTIONAL_LIST", 287 | "pattern": [{"LOWER": {"IN": ["one", "two"]}, "OP": "?"}] 288 | } 289 | 290 | def test_tag_module(self): 291 | rules = self.compiler(""" 292 | !IMPORT("rita.modules.tag") 293 | 294 | TAG("^NN|^JJ")->MARK("TEST_TAG") 295 | """) 296 | 297 | print(rules) 298 | 299 | assert len(rules) == 1 300 | assert rules[0] == { 301 | "label": "TEST_TAG", 302 | "pattern": [{"TAG": {"REGEX": "^NN|^JJ"}}] 303 | } 304 | 305 | def test_tag_word(self): 306 | rules = self.compiler(""" 307 | !IMPORT("rita.modules.tag") 308 | 309 | TAG_WORD("^VB", "proposed")->MARK("TEST_TAG") 310 | """) 311 | 312 | print(rules) 313 | 314 | assert len(rules) == 1 315 | assert rules[0] == { 316 | "label": "TEST_TAG", 317 | "pattern": [{"LOWER": "proposed", "TAG": {"REGEX": "^VB"}}] 318 | } 319 | 320 | def test_tag_list(self): 321 | rules = self.compiler(""" 322 | !IMPORT("rita.modules.tag") 323 | 324 | words = 
{"perceived", "proposed"} 325 | {TAG_WORD("^VB", words)}->MARK("TEST_TAG") 326 | """) 327 | 328 | print(rules) 329 | 330 | assert len(rules) == 1 331 | assert rules[0] == { 332 | "label": "TEST_TAG", 333 | "pattern": [{"LOWER": {"REGEX": "^(perceived|proposed)$"}, "TAG": {"REGEX": "^VB"}}] 334 | } 335 | 336 | def test_tags_case_sensitive(self): 337 | rules = self.compiler(""" 338 | !CONFIG("ignore_case", "F") 339 | !IMPORT("rita.modules.tag") 340 | 341 | words = {"perceived", "proposed"} 342 | TAG_WORD("^VB", "proposed")->MARK("TEST_TAG") 343 | {TAG_WORD("^VB", words)}->MARK("TEST_TAG") 344 | """) 345 | 346 | print(rules) 347 | 348 | assert len(rules) == 2 349 | assert rules == [ 350 | { 351 | "label": "TEST_TAG", 352 | "pattern": [{"TEXT": "proposed", "TAG": {"REGEX": "^VB"}}] 353 | }, 354 | { 355 | "label": "TEST_TAG", 356 | "pattern": [{"TEXT": {"REGEX": "^(perceived|proposed)$"}, "TAG": {"REGEX": "^VB"}}] 357 | } 358 | ] 359 | 360 | def test_generate_names(self): 361 | rules = self.compiler(""" 362 | !IMPORT("rita.modules.names") 363 | 364 | names = {"Roy Jones junior", "Roy Jones senior", "Juan-Claude van Damme", "Jon Jones"} 365 | NAMES(names)->MARK("NAME_MATCH") 366 | NAMES("Kazushi Sakuraba")->MARK("NAME_MATCH") 367 | """) 368 | 369 | print(rules) 370 | assert len(rules) == 10 371 | 372 | def test_any_tag(self): 373 | rules = self.compiler(""" 374 | ANY -> MARK("ANYTHING_GOES_HERE") 375 | """) 376 | print(rules) 377 | assert len(rules) == 1 378 | assert rules == [{"label": "ANYTHING_GOES_HERE", "pattern": [{}]}] 379 | 380 | def test_entity_tag_default(self): 381 | rules = self.compiler(""" 382 | ENTITY("PERSON")->MARK("PERSON_FOUND") 383 | """) 384 | print(rules) 385 | assert len(rules) == 1 386 | assert rules == [{"label": "PERSON_FOUND", "pattern": [{"ENT_TYPE": "PERSON", "OP": "+"}]}] 387 | 388 | def test_entity_tag_override(self): 389 | rules = self.compiler(""" 390 | {ENTITY("PERSON")*}->MARK("PERSON_FOUND") 391 | """) 392 | print(rules) 393 | assert len(rules) == 1 394 | assert rules == [{"label": "PERSON_FOUND", "pattern": [{"ENT_TYPE": "PERSON", "OP": "*"}]}] 395 | 396 | 397 | class TestStandalone(object): 398 | @property 399 | def punct(self): 400 | return re.compile(r"[.,!;?:]") 401 | 402 | @property 403 | def flags(self): 404 | return re.DOTALL | re.IGNORECASE 405 | 406 | def compiler(self, rules): 407 | return rita.compile_string(rules, use_engine="standalone").patterns 408 | 409 | def test_punct(self): 410 | rules = self.compiler('PUNCT->MARK("SOME_PUNCT")') 411 | print(rules) 412 | assert len(rules) == 1 413 | assert rules[0] == re.compile(r"(?P(?P([.,!;?:]\s?)))", self.flags) 414 | 415 | def test_number(self): 416 | rules = self.compiler('NUM("42")->MARK("SOME_NUMBER")') 417 | print(rules) 418 | assert len(rules) == 1 419 | assert rules[0] == re.compile(r"(?P(?P(42\s?)))", self.flags) 420 | 421 | def test_single_word(self): 422 | rules = self.compiler('WORD("Test")->MARK("SOME_LABEL")') 423 | print(rules) 424 | assert len(rules) == 1 425 | assert rules[0] == re.compile(r"(?P(?P(Test\s?)))", self.flags) 426 | 427 | def test_multiple_words(self): 428 | rules = self.compiler(''' 429 | words = {"test1", "test2"} 430 | IN_LIST(words)->MARK("MULTI_LABEL") 431 | ''') 432 | print(rules) 433 | assert len(rules) == 1 434 | assert rules[0] == re.compile(r"(?P(?P((^|\s)((test1|test2)\s?))))", self.flags) 435 | 436 | def test_simple_pattern(self): 437 | rules = self.compiler(''' 438 | {WORD("test1"), WORD("test2")}->MARK("SIMPLE_PATTERN") 439 | ''') 440 | print(rules) 441 | 
assert len(rules) == 1 442 | assert rules[0] == re.compile( 443 | r"(?P<SIMPLE_PATTERN>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test2\s?)))", 444 | self.flags 445 | ) 446 | 447 | def test_or_branch(self): 448 | rules = self.compiler(''' 449 | {WORD("test1")|WORD("test2")}->MARK("SPLIT_LABEL") 450 | ''') 451 | print(rules) 452 | assert len(rules) == 2 453 | assert rules[0] == re.compile(r"(?P<SPLIT_LABEL>(?P<s0>(test1\s?)))", self.flags) 454 | assert rules[1] == re.compile(r"(?P<SPLIT_LABEL>(?P<s0>(test2\s?)))", self.flags) 455 | 456 | def test_or_branch_multi(self): 457 | rules = self.compiler(''' 458 | {WORD("test1")|WORD("test2"),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 459 | ''') 460 | print(rules) 461 | assert len(rules) == 4 462 | assert rules[0] == re.compile( 463 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test3\s?)))", 464 | self.flags 465 | ) 466 | 467 | assert rules[1] == re.compile( 468 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test3\s?)))", 469 | self.flags 470 | ) 471 | 472 | assert rules[2] == re.compile( 473 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test4\s?)))", 474 | self.flags 475 | ) 476 | 477 | assert rules[3] == re.compile( 478 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)(?P<s2>(test4\s?)))", 479 | self.flags 480 | ) 481 | 482 | def test_or_branch_multi_w_single(self): 483 | rules = self.compiler(''' 484 | numbers={"one", "two", "three"} 485 | {WORD("test1")|WORD("test2"),IN_LIST(numbers),WORD("test3")|WORD("test4")}->MARK("MULTI_SPLIT_LABEL") 486 | ''') 487 | print(rules) 488 | assert len(rules) == 4 489 | assert rules[0] == re.compile( 490 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)" 491 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test3\s?)))", 492 | self.flags 493 | ) 494 | 495 | assert rules[1] == re.compile( 496 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)" 497 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test3\s?)))", 498 | self.flags 499 | ) 500 | 501 | assert rules[2] == re.compile( 502 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test1\s?))(?P<s1>([.,!;?:]\s?)?)" 503 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test4\s?)))", 504 | self.flags 505 | ) 506 | 507 | assert rules[3] == re.compile( 508 | r"(?P<MULTI_SPLIT_LABEL>(?P<s0>(test2\s?))(?P<s1>([.,!;?:]\s?)?)" 509 | r"(?P<s2>((^|\s)((three|one|two)\s?)))(?P<s3>([.,!;?:]\s?)?)(?P<s4>(test4\s?)))", 510 | self.flags 511 | ) 512 | 513 | def test_word_with_accent(self): 514 | rules = self.compiler(''' 515 | WORD("Šarūnas")->MARK("TWO_WORDS") 516 | ''') 517 | print(rules) 518 | assert len(rules) == 1 519 | assert rules[0] == re.compile( 520 | r"(?P<TWO_WORDS>(?P<s0>((^|\s)((Sarunas|Šarūnas)\s?))))", 521 | self.flags 522 | ) 523 | 524 | def test_list_with_accent(self): 525 | rules = self.compiler(''' 526 | names={"Jonas", "Jurgis", "Šarūnas"} 527 | IN_LIST(names)->MARK("EXTENDED_LIST") 528 | ''') 529 | print(rules) 530 | assert len(rules) == 1 531 | assert rules[0] == re.compile( 532 | r"(?P<EXTENDED_LIST>(?P<s0>((^|\s)((Sarunas|Šarūnas|Jurgis|Jonas)\s?))))", 533 | self.flags 534 | ) 535 | 536 | def test_double_op(self): 537 | rules = self.compiler(''' 538 | WORD+->MARK("DOUBLE_OP") 539 | ''') 540 | print(rules) 541 | assert len(rules) == 1 542 | assert rules[0] == re.compile( 543 | r"(?P<DOUBLE_OP>(?P<s0>(((\w|['_-])+)\s?)+))", 544 | self.flags 545 | ) 546 | 547 | def test_prefix_on_word(self): 548 | rules = self.compiler(''' 549 | {PREFIX("meta"), WORD("physics")}->MARK("META_WORD") 550 | ''') 551 | print(rules) 552 | assert len(rules) == 1 553 | assert rules[0] == re.compile(r"(?P<META_WORD>(?P<s0>(metaphysics\s?)))", self.flags) 554 | 555 | def test_prefix_on_list(self): 556 | rules = self.compiler(''' 557 | science = {"physics", "mathematics"} 558 |
{PREFIX("meta"), IN_LIST(science)}->MARK("META_LIST") 559 | ''') 560 | print(rules) 561 | assert len(rules) == 1 562 | assert rules[0] == re.compile( 563 | r"(?P(?P((^|\s)((metamathematics|metaphysics)\s?))))", 564 | self.flags 565 | ) 566 | 567 | def test_prefix_on_unknown_type(self): 568 | rules = self.compiler(''' 569 | {PREFIX("test"), ANY}->MARK("NOT_VALID") 570 | ''') 571 | print(rules) 572 | assert len(rules) == 1 573 | assert rules[0] == re.compile(r"(?P(?P(.*\s?)))", self.flags) 574 | 575 | def test_save_and_load_rules_from_file(self): 576 | rules = ''' 577 | {WORD("Hello"), WORD("world")}->MARK("HELLO") 578 | ''' 579 | engine = rita.compile_string(rules, use_engine="standalone") 580 | with tempfile.TemporaryDirectory() as tmpdir: 581 | rules_path = os.path.join(tmpdir, "rules-example.json") 582 | engine.save(rules_path) 583 | engine.load(rules_path) 584 | engine.execute("Hello world") 585 | 586 | def test_optional_list(self): 587 | rules = self.compiler(""" 588 | elements = {"one", "two"} 589 | {IN_LIST(elements)?}->MARK("OPTIONAL_LIST") 590 | """) 591 | 592 | print(rules) 593 | 594 | assert len(rules) == 1 595 | assert rules[0] == re.compile(r"(?P(?P((^|\s)((one|two)\s?))?))", self.flags) 596 | 597 | def test_complex_list(self): 598 | rules = self.compiler(""" 599 | fractions={"1 / 2", "3 / 4", "1 / 8", "3 / 8", "5 / 8", "7 / 8", "1 / 16", "3 / 16", 600 | "5 / 16", "7 / 16", "9 / 16", "11 / 16", "13 / 16", "15 / 16", "1 / 32", 601 | "3 / 32", "5 / 32", "7 / 32", "9 / 32", "11 / 32", "13 / 32", "15 / 32", 602 | "17 / 32", "19 / 32", "21 / 32", "23 / 32", "25 / 32", "27 / 32", 603 | "29 / 32", "31 / 32"} 604 | {NUM+, WORD("-")?, IN_LIST(fractions)?}->MARK("COMPLEX_NUMBER") 605 | """) 606 | 607 | print(rules) 608 | 609 | assert len(rules) == 1 610 | 611 | def test_generate_names(self): 612 | rules = self.compiler(""" 613 | !IMPORT("rita.modules.names") 614 | 615 | names = {"Roy Jones junior", "Roy Jones senior", "Juan-Claude van Damme", "Jon Jones"} 616 | NAMES(names)->MARK("NAME_MATCH") 617 | NAMES("Kazushi Sakuraba")->MARK("NAME_MATCH") 618 | """) 619 | 620 | print(rules) 621 | assert len(rules) == 2 622 | --------------------------------------------------------------------------------