├── .coveragerc ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── dev-requirements.txt ├── pylintrc ├── pyproject.toml ├── rebulk ├── __init__.py ├── __version__.py ├── builder.py ├── chain.py ├── debug.py ├── formatters.py ├── introspector.py ├── loose.py ├── match.py ├── pattern.py ├── processors.py ├── rebulk.py ├── remodule.py ├── rules.py ├── test │ ├── __init__.py │ ├── default_rules_module.py │ ├── rebulk_rules_module.py │ ├── rules_module.py │ ├── test_chain.py │ ├── test_debug.py │ ├── test_introspector.py │ ├── test_loose.py │ ├── test_match.py │ ├── test_pattern.py │ ├── test_processors.py │ ├── test_rebulk.py │ ├── test_rules.py │ ├── test_toposort.py │ └── test_validators.py ├── toposort.py ├── utils.py └── validators.py ├── requirements.txt ├── runtests.py ├── setup.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | include = 4 | rebulk/* 5 | omit = 6 | rebulk/__version__.py 7 | rebulk/test/* 8 | [report] 9 | exclude_lines = 10 | pragma: no cover -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | push: ~ 4 | pull_request: ~ 5 | jobs: 6 | build: 7 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 8 | runs-on: ubuntu-latest 9 | 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.8", "pypy-3.9", "pypy-3.10" ] 14 | regex: [ "1", "0" ] 15 | 16 | steps: 17 | - name: Setup python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | 25 | - name: Git User config 26 
| run: | 27 | git config --global user.email "action@github.com" 28 | git config --global user.name "github-actions" 29 | 30 | - name: Install Dependencies 31 | run: | 32 | pip install -e .[dev,test] 33 | pip install coveralls 34 | 35 | - name: Install regex 36 | run: | 37 | pip install regex 38 | if: ${{ matrix.regex == '1' }} 39 | 40 | - run: pylint rebulk 41 | 42 | - run: coverage run -m pytest 43 | env: 44 | REBULK_REGEX_ENABLED: ${{ matrix.regex }} 45 | 46 | - run: python setup.py build 47 | 48 | - name: Coveralls 49 | run: coveralls 50 | env: 51 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} 52 | 53 | commitlint: 54 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 55 | runs-on: ubuntu-latest 56 | steps: 57 | - uses: actions/checkout@v4 58 | with: 59 | fetch-depth: 0 60 | - uses: wagoid/commitlint-github-action@v5 61 | 62 | release: 63 | if: ${{ github.ref == 'refs/heads/master' && github.event_name == 'push' }} 64 | needs: build 65 | 66 | runs-on: ubuntu-latest 67 | 68 | strategy: 69 | fail-fast: false 70 | matrix: 71 | python-version: [ 3.12 ] 72 | 73 | steps: 74 | - name: Setup python ${{ matrix.python-version }} 75 | uses: actions/setup-python@v4 76 | with: 77 | python-version: ${{ matrix.python-version }} 78 | 79 | - name: Checkout 80 | uses: actions/checkout@v4 81 | with: 82 | fetch-depth: 0 83 | 84 | - name: Git User config 85 | run: | 86 | git config --global user.email "action@github.com" 87 | git config --global user.name "github-actions" 88 | 89 | - name: Install Dependencies 90 | run: pip install -e .[dev,test] 91 | 92 | - name: Install python-semantic-release and twine 93 | run: pip install python-semantic-release twine 94 | 95 | - name: Bump version 96 | run: semantic-release version 97 | env: 98 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 99 | 100 | - name: Upload to pypi 101 | run: twine upload --username "__token__" --password "${PYPI_TOKEN}" dist/* 102 | env: 103 | PYPI_TOKEN: 
${{ secrets.PYPI_TOKEN }} 104 | 105 | - name: Publish release 106 | run: semantic-release publish 107 | env: 108 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 109 | 110 | - name: Merge master to develop 111 | uses: robotology/gh-action-nightly-merge@v1.4.0 112 | with: 113 | stable_branch: 'master' 114 | development_branch: 'develop' 115 | allow_ff: true 116 | user_name: github-actions 117 | env: 118 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 119 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | dist/ 5 | build/ 6 | 7 | # Python dist 8 | *.egg-info/ 9 | .eggs/ 10 | 11 | # Coverage 12 | .coverage 13 | 14 | # PyEnv 15 | .python-version 16 | 17 | # Tox 18 | .tox/ 19 | 20 | # py.test 21 | lastfailed 22 | 23 | # Jetbrain 24 | *.iml 25 | .idea/ 26 | 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | 5 | 6 | ## v3.2.0 (2023-02-18) 7 | ### Feature 8 | * **dependencies:** Add python 3.11 support and drop python 3.6 support ([`e4cb0d8`](https://github.com/Toilal/rebulk/commit/e4cb0d854cd8ea80da9abe46d2b3405a873e2020)) 9 | 10 | ### Fix 11 | * Remove pytest-runner from setup_requires ([`4483d17`](https://github.com/Toilal/rebulk/commit/4483d1777f6a61d20ed83da760663aec67e22042)) 12 | 13 | ## v3.1.0 (2021-11-04) 14 | ### Feature 15 | * **defaults:** Add overrides support ([#25](https://github.com/Toilal/rebulk/issues/25)) ([`f79e5ea`](https://github.com/Toilal/rebulk/commit/f79e5eab0806787ff19a4c668bf9f88413b67288)) 16 | * **python:** Add python 3.10 support, drop python 3.5 support ([`a5e6eb7`](https://github.com/Toilal/rebulk/commit/a5e6eb7bba979ee51e1c6c1e186bd224c989dfdc)) 17 | 18 | ## v3.0.1 (2020-12-25) 19 | ### 
Fix 20 | * **package:** Fix broken package `No such file or directory: 'CHANGELOG.md'` ([#24](https://github.com/Toilal/rebulk/issues/24)) ([`33895ff`](https://github.com/Toilal/rebulk/commit/33895ff358ff5051768fb98d4e840691e7af9bdf)) 21 | 22 | ### Documentation 23 | * **readme:** Add semantic release badge ([`78baca0`](https://github.com/Toilal/rebulk/commit/78baca0c529083d7f583ffec58aeb23734d67ce5)) 24 | * **readme:** Fix title ([`d5d4db5`](https://github.com/Toilal/rebulk/commit/d5d4db5cd7f6e2cb1308acd26bfb98838815fad4)) 25 | 26 | ## v3.0.0 (2020-12-23) 27 | ### Feature 28 | * **regex:** Replace REGEX_DISABLED environment variable with REBULK_REGEX_ENABLED ([`d5a8cad`](https://github.com/Toilal/rebulk/commit/d5a8cad6281533ee549a46ca70e1a25e5777eda3)) 29 | * Add python 3.8/3.9 support, drop python 2.7/3.4 support ([`048a15f`](https://github.com/Toilal/rebulk/commit/048a15f90833ba8d33ea84d56e9955d31b514dc3)) 30 | 31 | ### Breaking 32 | * regex module is now disabled by default, even if it's available in the python interpreter. You have to set REBULK_REGEX_ENABLED=1 in your environment to enable it, as this module may cause some issues. 
([`d5a8cad`](https://github.com/Toilal/rebulk/commit/d5a8cad6281533ee549a46ca70e1a25e5777eda3)) 33 | * Python 2.7 and 3.4 support have been dropped ([`048a15f`](https://github.com/Toilal/rebulk/commit/048a15f90833ba8d33ea84d56e9955d31b514dc3)) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Rémi Alvergnat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.py 2 | include *.txt 3 | include *.ini 4 | include *.md 5 | include .coveragerc 6 | include LICENSE 7 | include pylintrc 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ReBulk 2 | ====== 3 | 4 | [![Latest Version](http://img.shields.io/pypi/v/rebulk.svg)](https://pypi.python.org/pypi/rebulk) 5 | [![MIT License](http://img.shields.io/badge/license-MIT-blue.svg)](https://pypi.python.org/pypi/rebulk) 6 | [![Build Status](https://img.shields.io/github/workflow/status/Toilal/rebulk/ci)](https://github.com/Toilal/rebulk/actions?query=workflow%3Aci) 7 | [![Coveralls](http://img.shields.io/coveralls/Toilal/rebulk.svg)](https://coveralls.io/r/Toilal/rebulk?branch=master) 8 | [![semantic-release](https://img.shields.io/badge/%20%20%F0%9F%93%A6%F0%9F%9A%80-semantic--release-e10079.svg)](https://github.com/relekang/python-semantic-release) 9 | 10 | 11 | ReBulk is a python library that performs advanced searches in strings 12 | that would be hard to implement using [re 13 | module](https://docs.python.org/3/library/re.html) or [String 14 | methods](https://docs.python.org/3/library/stdtypes.html#str) only. 15 | 16 | It includes some features like `Patterns`, `Match`, `Rule` that allow 17 | developers to build a custom and complex string matcher using a readable 18 | and extendable API. 19 | 20 | This project is hosted on GitHub: <https://github.com/Toilal/rebulk> 21 | 22 | Install 23 | ======= 24 | 25 | ```sh 26 | $ pip install rebulk 27 | ``` 28 | 29 | Usage 30 | ===== 31 | 32 | Regular expression, string and function based patterns are declared in a 33 | `Rebulk` object. It uses a fluent API to chain `string`, `regex`, and 34 | `functional` methods to define various pattern types. 
35 | 36 | ```python 37 | >>> from rebulk import Rebulk 38 | >>> bulk = Rebulk().string('brown').regex(r'qu\w+').functional(lambda s: (20, 25)) 39 | ``` 40 | 41 | When `Rebulk` object is fully configured, you can call `matches` method 42 | with an input string to retrieve all `Match` objects found by registered 43 | pattern. 44 | 45 | ```python 46 | >>> bulk.matches("The quick brown fox jumps over the lazy dog") 47 | [, , ] 48 | ``` 49 | 50 | If multiple `Match` objects are found at the same position, only the 51 | longer one is kept. 52 | 53 | ```python 54 | >>> bulk = Rebulk().string('lakers').string('la') 55 | >>> bulk.matches("the lakers are from la") 56 | [, ] 57 | ``` 58 | 59 | String Patterns 60 | =============== 61 | 62 | String patterns are based on 63 | [str.find](https://docs.python.org/3/library/stdtypes.html#str.find) 64 | method to find matches, but returns all matches in the string. 65 | `ignore_case` can be enabled to ignore case. 66 | 67 | ```python 68 | >>> Rebulk().string('la').matches("lalalilala") 69 | [, , , ] 70 | 71 | >>> Rebulk().string('la').matches("LalAlilAla") 72 | [] 73 | 74 | >>> Rebulk().string('la', ignore_case=True).matches("LalAlilAla") 75 | [, , , ] 76 | ``` 77 | 78 | You can define several patterns with a single `string` method call. 79 | 80 | ```python 81 | >>> Rebulk().string('Winter', 'coming').matches("Winter is coming...") 82 | [, ] 83 | ``` 84 | 85 | Regular Expression Patterns 86 | =========================== 87 | 88 | Regular Expression patterns are based on a compiled regular expression. 89 | [re.finditer](https://docs.python.org/3/library/re.html#re.finditer) 90 | method is used to find matches. 91 | 92 | If [regex module](https://pypi.python.org/pypi/regex) is available, it 93 | can be used by rebulk instead of default [re 94 | module](https://docs.python.org/3/library/re.html). Enable it with `REBULK_REGEX_ENABLED=1` environment variable. 
95 | 96 | ```python 97 | >>> Rebulk().regex(r'l\w').matches("lolita") 98 | [, ] 99 | ``` 100 | 101 | You can define several patterns with a single `regex` method call. 102 | 103 | ```python 104 | >>> Rebulk().regex(r'Wint\wr', r'com\w{3}').matches("Winter is coming...") 105 | [, ] 106 | ``` 107 | 108 | All keyword arguments from 109 | [re.compile](https://docs.python.org/3/library/re.html#re.compile) are 110 | supported. 111 | 112 | ```python 113 | >>> import re # import required for flags constant 114 | >>> Rebulk().regex('L[A-Z]KERS', flags=re.IGNORECASE) \ 115 | ... .matches("The LaKeRs are from La") 116 | [] 117 | 118 | >>> Rebulk().regex('L[A-Z]', 'L[A-Z]KERS', flags=re.IGNORECASE) \ 119 | ... .matches("The LaKeRs are from La") 120 | [, ] 121 | 122 | >>> Rebulk().regex(('L[A-Z]', re.IGNORECASE), ('L[a-z]KeRs')) \ 123 | ... .matches("The LaKeRs are from La") 124 | [, ] 125 | ``` 126 | 127 | If [regex module](https://pypi.python.org/pypi/regex) is available, it 128 | automatically supports repeated captures. 129 | 130 | ```python 131 | >>> # If regex module is available, repeated_captures is True by default. 132 | >>> matches = Rebulk().regex(r'(\d+)(?:-(\d+))+').matches("01-02-03-04") 133 | >>> matches[0].children # doctest:+SKIP 134 | [<01:(0, 2)>, <02:(3, 5)>, <03:(6, 8)>, <04:(9, 11)>] 135 | 136 | >>> # If regex module is not available, or if repeated_captures is forced to False. 137 | >>> matches = Rebulk().regex(r'(\d+)(?:-(\d+))+', repeated_captures=False) \ 138 | ... .matches("01-02-03-04") 139 | >>> matches[0].children 140 | [<01:(0, 2)+initiator=01-02-03-04>, <04:(9, 11)+initiator=01-02-03-04>] 141 | ``` 142 | 143 | - `abbreviations` 144 | 145 | Defined as a list of 2-tuple, each tuple is an abbreviation. It 146 | simply replace `tuple[0]` with `tuple[1]` in the expression. 147 | 148 | \>\>\> Rebulk().regex(r\'Custom-separators\', 149 | abbreviations=\[(\"-\", r\"\[W\_\]+\")\])\... 
150 | .matches(\"Custom\_separators using-abbreviations\") 151 | \[\\] 152 | 153 | Functional Patterns 154 | =================== 155 | 156 | Functional Patterns are based on the evaluation of a function. 157 | 158 | The function should have the same parameters as `Rebulk.matches` method, 159 | that is the input string, and must return at least start index and end 160 | index of the `Match` object. 161 | 162 | ```python 163 | >>> def func(string): 164 | ... index = string.find('?') 165 | ... if index > -1: 166 | ... return 0, index - 11 167 | >>> Rebulk().functional(func).matches("Why do simple ? Forget about it ...") 168 | [] 169 | ``` 170 | 171 | You can also return a dict of keywords arguments for `Match` object. 172 | 173 | You can define several patterns with a single `functional` method call, 174 | and function used can return multiple matches. 175 | 176 | Chain Patterns 177 | ============== 178 | 179 | Chain Patterns are ordered composition of string, functional and regex 180 | patterns. Repeater can be set to define repetition on chain part. 181 | 182 | ```python 183 | >>> r = Rebulk().regex_defaults(flags=re.IGNORECASE)\ 184 | ... .defaults(children=True, formatter={'episode': int, 'version': int})\ 185 | ... .chain()\ 186 | ... .regex(r'e(?P\d{1,4})').repeater(1)\ 187 | ... .regex(r'v(?P\d+)').repeater('?')\ 188 | ... .regex(r'[ex-](?P\d{1,4})').repeater('*')\ 189 | ... .close() # .repeater(1) could be omitted as it's the default behavior 190 | >>> r.matches("This is E14v2-15-16-17").to_dict() # converts matches to dict 191 | MatchesDict([('episode', [14, 15, 16, 17]), ('version', 2)]) 192 | ``` 193 | 194 | Patterns parameters 195 | =================== 196 | 197 | All patterns have options that can be given as keyword arguments. 198 | 199 | - `validator` 200 | 201 | Function to validate `Match` value given by the pattern. Can also be 202 | a `dict`, to use `validator` with pattern named with key. 
203 | 204 | ```python 205 | >>> def check_leap_year(match): 206 | ... return int(match.value) in [1980, 1984, 1988] 207 | >>> matches = Rebulk().regex(r'\d{4}', validator=check_leap_year) \ 208 | ... .matches("In year 1982 ...") 209 | >>> len(matches) 210 | 0 211 | >>> matches = Rebulk().regex(r'\d{4}', validator=check_leap_year) \ 212 | ... .matches("In year 1984 ...") 213 | >>> len(matches) 214 | 1 215 | ``` 216 | 217 | Some base validator functions are available in `rebulk.validators` 218 | module. Most of those functions have to be configured using 219 | `functools.partial` to map them to a function accepting a single `match` 220 | argument. 221 | 222 | - `formatter` 223 | 224 | Function to convert `Match` value given by the pattern. Can also be 225 | a `dict`, to use `formatter` with matches named with key. 226 | 227 | ```python 228 | >>> def year_formatter(value): 229 | ... return int(value) 230 | >>> matches = Rebulk().regex(r'\d{4}', formatter=year_formatter) \ 231 | ... .matches("In year 1982 ...") 232 | >>> isinstance(matches[0].value, int) 233 | True 234 | ``` 235 | 236 | - `pre_match_processor` / `post_match_processor` 237 | 238 | Function to mutate or invalidate a match generated by a pattern. 239 | 240 | Function has a single parameter which is the Match object. If 241 | function returns False, it will be considered as an invalid match. 242 | If function returns a match instance, it will replace the original 243 | match with this instance in the process. 244 | 245 | - `post_processor` 246 | 247 | Function to change the default output of the pattern. Function 248 | parameters are Matches list and Pattern object. 249 | 250 | - `name` 251 | 252 | The name of the pattern. It is automatically passed to `Match` 253 | objects generated by this pattern. 254 | 255 | - `tags` 256 | 257 | A list of strings that qualify this pattern. 258 | 259 | - `value` 260 | 261 | Override value property for generated `Match` objects. 
Can also be a 262 | `dict`, to use `value` with pattern named with key. 263 | 264 | - `validate_all` 265 | 266 | By default, validator is called for returned `Match` objects only. 267 | Enable this option to validate them all, parent and children 268 | included. 269 | 270 | - `format_all` 271 | 272 | By default, formatter is called for returned `Match` values only. 273 | Enable this option to format them all, parent and children included. 274 | 275 | - `disabled` 276 | 277 | A `function(context)` to disable the pattern if returning `True`. 278 | 279 | - `children` 280 | 281 | If `True`, all children `Match` objects will be retrieved instead of 282 | a single parent `Match` object. 283 | 284 | - `private` 285 | 286 | If `True`, `Match` objects generated from this pattern are available 287 | internally only. They will be removed at the end of `Rebulk.matches` 288 | method call. 289 | 290 | - `private_parent` 291 | 292 | Force parent matches to be returned and flag them as private. 293 | 294 | - `private_children` 295 | 296 | Force children matches to be returned and flag them as private. 297 | 298 | - `private_names` 299 | 300 | Matches names that will be declared as private 301 | 302 | - `ignore_names` 303 | 304 | Matches names that will be ignored from the pattern output, after 305 | validation. 306 | 307 | - `marker` 308 | 309 | If `true`, `Match` objects generated from this pattern will be 310 | markers matches instead of standard matches. They won\'t be included 311 | in `Matches` sequence, but will be available in `Matches.markers` 312 | sequence (see `Markers` section). 313 | 314 | Match 315 | ===== 316 | 317 | A `Match` object is the result created by a registered pattern. 318 | 319 | It has a `value` property defined, and position indices are available 320 | through `start`, `end` and `span` properties. 
321 | 322 | In some cases, it contains children `Match` objects in `children` 323 | property, and each child `Match` object references its parent in `parent` 324 | property. Also, a `name` property can be defined for the match. 325 | 326 | If groups are defined in a Regular Expression pattern, each group match 327 | will be converted to a single `Match` object. If a group has a name 328 | defined (`(?P<name>group)`), it is set as `name` property in a child 329 | `Match` object. The whole regexp match (`re.group(0)`) will be converted 330 | to the main `Match` object, and all subgroups (1, 2, \... n) will be 331 | converted to `children` matches of the main `Match` object. 332 | 333 | ```python 334 | >>> matches = Rebulk() \ 335 | ... .regex(r"One, (?P<one>\w+), Two, (?P<two>\w+), Three, (?P<three>\w+)") \ 336 | ... .matches("Zero, 0, One, 1, Two, 2, Three, 3, Four, 4") 337 | >>> matches 338 | [<One, 1, Two, 2, Three, 3:(9, 33)>] 339 | >>> for child in matches[0].children: 340 | ... '%s = %s' % (child.name, child.value) 341 | 'one = 1' 342 | 'two = 2' 343 | 'three = 3' 344 | ``` 345 | 346 | It\'s possible to retrieve only children by using `children` parameter. 347 | You can also customize the way structure is generated with `every`, 348 | `private_parent` and `private_children` parameters. 349 | 350 | ```python 351 | >>> matches = Rebulk() \ 352 | ... .regex(r"One, (?P<one>\w+), Two, (?P<two>\w+), Three, (?P<three>\w+)", children=True) \ 353 | ... .matches("Zero, 0, One, 1, Two, 2, Three, 3, Four, 4") 354 | >>> matches 355 | [<1:(14, 15)+name=one+initiator=One, 1, Two, 2, Three, 3>, <2:(22, 23)+name=two+initiator=One, 1, Two, 2, Three, 3>, <3:(32, 33)+name=three+initiator=One, 1, Two, 2, Three, 3>] 356 | ``` 357 | 358 | Match object has the following properties that can be given to Pattern 359 | objects 360 | 361 | - `formatter` 362 | 363 | Function to convert `Match` value given by the pattern. Can also be 364 | a `dict`, to use `formatter` with matches named with key. 365 | 366 | ```python 367 | >>> def year_formatter(value): 368 | ... 
return int(value) 369 | >>> matches = Rebulk().regex(r'\d{4}', formatter=year_formatter) \ 370 | ... .matches("In year 1982 ...") 371 | >>> isinstance(matches[0].value, int) 372 | True 373 | ``` 374 | 375 | - `format_all` 376 | 377 | By default, formatter is called for returned `Match` values only. 378 | Enable this option to format them all, parent and children included. 379 | 380 | - `conflict_solver` 381 | 382 | A `function(match, conflicting_match)` used to solve conflict. 383 | Returned object will be removed from matches by `ConflictSolver` 384 | default rule. If `__default__` string is returned, it will fallback 385 | to default behavior keeping longer match. 386 | 387 | Matches 388 | ======= 389 | 390 | A `Matches` object holds the result of `Rebulk.matches` method call. 391 | It\'s a sequence of `Match` objects and it behaves like a list. 392 | 393 | All methods accepts a `predicate` function to filter `Match` objects 394 | using a callable, and an `index` int to retrieve a single element from 395 | default returned matches. 396 | 397 | It has the following additional methods and properties on it. 398 | 399 | - `starting(index, predicate=None, index=None)` 400 | 401 | Retrieves a list of `Match` objects that starts at given index. 402 | 403 | - `ending(index, predicate=None, index=None)` 404 | 405 | Retrieves a list of `Match` objects that ends at given index. 406 | 407 | - `previous(match, predicate=None, index=None)` 408 | 409 | Retrieves a list of `Match` objects that are previous and nearest to 410 | match. 411 | 412 | - `next(match, predicate=None, index=None)` 413 | 414 | Retrieves a list of `Match` objects that are next and nearest to 415 | match. 416 | 417 | - `tagged(tag, predicate=None, index=None)` 418 | 419 | Retrieves a list of `Match` objects that have the given tag defined. 420 | 421 | - `named(name, predicate=None, index=None)` 422 | 423 | Retrieves a list of `Match` objects that have the given name. 
424 | 425 | - `range(start=0, end=None, predicate=None, index=None)` 426 | 427 | Retrieves a list of `Match` objects for given range, sorted from 428 | start to end. 429 | 430 | - `holes(start=0, end=None, formatter=None, ignore=None, predicate=None, index=None)` 431 | 432 | Retrieves a list of *hole* `Match` objects for given range. A hole 433 | match is created for each range where no match is available. 434 | 435 | - `conflicting(match, predicate=None, index=None)` 436 | 437 | Retrieves a list of `Match` objects that conflicts with given match. 438 | 439 | - `chain_before(self, position, seps, start=0, predicate=None, index=None)`: 440 | 441 | Retrieves a list of chained matches, before position, matching 442 | predicate and separated by characters from seps only. 443 | 444 | - `chain_after(self, position, seps, end=None, predicate=None, index=None)`: 445 | 446 | Retrieves a list of chained matches, after position, matching 447 | predicate and separated by characters from seps only. 448 | 449 | - `at_match(match, predicate=None, index=None)` 450 | 451 | Retrieves a list of `Match` objects at the same position as match. 452 | 453 | - `at_span(span, predicate=None, index=None)` 454 | 455 | Retrieves a list of `Match` objects from given (start, end) tuple. 456 | 457 | - `at_index(pos, predicate=None, index=None)` 458 | 459 | Retrieves a list of `Match` objects from given position. 460 | 461 | - `names` 462 | 463 | Retrieves a sequence of all `Match.name` properties. 464 | 465 | - `tags` 466 | 467 | Retrieves a sequence of all `Match.tags` properties. 468 | 469 | - `to_dict(details=False, first_value=False, enforce_list=False)` 470 | 471 | Convert to an ordered dict, with `Match.name` as key and 472 | `Match.value` as value. 
473 | 474 | It\'s a subclass of 475 | [OrderedDict](https://docs.python.org/2/library/collections.html#collections.OrderedDict), 476 | that contains a `matches` property which is a dict with `Match.name` 477 | as key and list of `Match` objects as value. 478 | 479 | If `first_value` is `True` and distinct values are found for the 480 | same name, value will be wrapped to a list. If `False`, first value 481 | only will be kept and values lists can be retrieved with 482 | `values_list` which is a dict with `Match.name` as key and list of 483 | `Match.value` as value. 484 | 485 | if `enforce_list` is `True`, all values will be wrapped to a list, 486 | even if a single value is found. 487 | 488 | If `details` is True, `Match.value` objects are replaced with 489 | complete `Match` object. 490 | 491 | - `markers` 492 | 493 | A custom `Matches` sequences specialized for `markers` matches (see 494 | below) 495 | 496 | Markers 497 | ======= 498 | 499 | If you have defined some patterns with `markers` property, then 500 | `Matches.markers` points to a special `Matches` sequence that contains 501 | only `markers` matches. This sequence supports all methods from 502 | `Matches`. 503 | 504 | Markers matches are not intended to be used in final result, but can be 505 | used to implement a `Rule`. 506 | 507 | Rules 508 | ===== 509 | 510 | Rules are a convenient and readable way to implement advanced 511 | conditional logic involving several `Match` objects. When a rule is 512 | triggered, it can perform an action on `Matches` object, like filtering 513 | out, adding additional tags or renaming. 514 | 515 | Rules are implemented by extending the abstract `Rule` class. They are 516 | registered using `Rebulk.rule` method by giving either a `Rule` 517 | instance, a `Rule` class or a module containing `Rule classes` only. 518 | 519 | For a rule to be triggered, `Rule.when` method must return `True`, or a 520 | non empty list of `Match` objects, or any other truthy object. 
When 521 | triggered, `Rule.then` method is called to perform the action with 522 | `when_response` parameter defined as the response of `Rule.when` call. 523 | 524 | Instead of implementing `Rule.then` method, you can define `consequence` 525 | class property with a Consequence class or instance, like 526 | `RemoveMatch`, `RenameMatch` or `AppendMatch`. You can also use a list 527 | of consequences when required: `when_response` must then be iterable, 528 | and elements of this iterable will be given to each consequence in the 529 | same order. 530 | 531 | When many rules are registered, it can be useful to set `priority` class 532 | variable to define a priority integer between all rule executions 533 | (higher priorities will be executed first). You can also define 534 | `dependency` to declare another Rule class as dependency for the current 535 | rule, meaning that it will be executed before. 536 | 537 | For all rules with the same `priority` value, `when` is called before, 538 | and `then` is called after all. 539 | 540 | ```python 541 | >>> from rebulk import Rule, RemoveMatch 542 | 543 | >>> class FirstOnlyRule(Rule): 544 | ... consequence = RemoveMatch 545 | ... 546 | ... def when(self, matches, context): 547 | ... grabbed = matches.named("grabbed", 0) 548 | ... if grabbed and matches.previous(grabbed): 549 | ... 
return grabbed 550 | 551 | >>> rebulk = Rebulk() 552 | 553 | >>> rebulk.regex("This match(.*?)grabbed", name="grabbed") 554 | <...Rebulk object ...> 555 | >>> rebulk.regex("if it's(.*?)first match", private=True) 556 | <...Rebulk object at ...> 557 | >>> rebulk.rules(FirstOnlyRule) 558 | <...Rebulk object at ...> 559 | 560 | >>> rebulk.matches("This match is grabbed only if it's the first match") 561 | [] 562 | >>> rebulk.matches("if it's NOT the first match, This match is NOT grabbed") 563 | [] 564 | ``` 565 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | -e .[dev,test] 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.semantic_release] 2 | version_variables = ["rebulk/__version__.py:__version__"] 3 | commit_message = "chore(release): release v{version}" 4 | commit_author = "github-actions " 5 | 6 | [tool.pytest.ini_options] 7 | addopts = "--ignore=setup.py --ignore=build --doctest-modules --doctest-glob='README.rst'" 8 | -------------------------------------------------------------------------------- /rebulk/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Define simple search patterns in bulk to perform advanced matching on any string. 
@contextmanager
def overrides(kwargs):
    """
    Shield explicitly listed keyword arguments from later modifications.

    The optional ``overrides`` entry (an iterable of key names) is removed from ``kwargs``. The current
    values of those keys are saved when entering the context and written back to ``kwargs`` when the
    context exits normally, undoing whatever happened to them in between.

    :param kwargs: keyword arguments, modified in place
    :type kwargs: dict
    :return: context manager yielding the saved values
    :rtype: contextmanager[dict]
    :raises KeyError: if a key listed in ``overrides`` is missing from ``kwargs``
    """
    protected_keys = kwargs.pop('overrides', None) or ()
    snapshot = {key: kwargs[key] for key in protected_keys}

    yield snapshot

    kwargs.update(snapshot)
def build_chain(self, **kwargs):
    """
    Builds a new patterns chain, without registering it.

    Chain defaults, then global defaults, are applied to ``kwargs``; keys listed in the ``overrides``
    keyword argument get their given value back afterwards. The chain is created with this builder as
    parent and receives deep copies of all defaults of this builder, so changing defaults on the chain
    doesn't alter this builder.

    :param kwargs: keyword arguments given to the Chain constructor
    :type kwargs: dict
    :return: the new chain
    :rtype: Chain
    """
    from .chain import Chain  # pylint:disable=import-outside-toplevel,cyclic-import

    with overrides(kwargs):
        for fallback in (self._chain_defaults, self._defaults):
            set_defaults(fallback, kwargs)

    new_chain = Chain(self, **kwargs)
    inherited = ('_defaults', '_regex_defaults', '_functional_defaults', '_string_defaults', '_chain_defaults')
    for attribute in inherited:
        setattr(new_chain, attribute, deepcopy(getattr(self, attribute)))

    return new_chain
def pattern(self, *pattern):
    """
    Append a single pattern to the chain, wrapped in a new chain part.

    :param pattern: exactly one pattern
    :type pattern: BasePattern
    :return: the chain part created for the pattern
    :rtype: ChainPart
    :raises ValueError: if no pattern or more than one pattern is given
    """
    count = len(pattern)
    if count == 0:
        raise ValueError("One pattern should be given to the chain")
    if count > 1:
        raise ValueError("Only one pattern can be given to the chain")
    new_part = ChainPart(self, pattern[0])
    self.parts.append(new_part)
    return new_part
def _to_next_chain_part(self, chain_part, chain_part_matches, raw_chain_part_matches, chain_found,
                        input_string, chain_input_string, offset, current_chain_matches):
    """
    Consume the matches of a chain part and move the chain cursor after them.

    :param chain_part: chain part that produced the matches
    :type chain_part: ChainPart
    :param chain_part_matches: matches of the chain part
    :type chain_part_matches: list[Match]
    :param raw_chain_part_matches: raw matches of the chain part
    :type raw_chain_part_matches: list[Match]
    :param chain_found: current value of the "chain found" flag
    :type chain_found: bool
    :param input_string: the whole input string
    :type input_string: str
    :param chain_input_string: remaining part of the input string
    :type chain_input_string: str
    :param offset: current position in input_string
    :type offset: int
    :param current_chain_matches: matches collected so far for the chain, extended in place
    :type current_chain_matches: list[Match]
    :return: updated (chain_found, chain_input_string, offset)
    :rtype: tuple
    """
    # Matches bound to another input string are re-bound to input_string and shifted by offset.
    for part_matches in (chain_part_matches, raw_chain_part_matches):
        Chain._fix_matches_offset(part_matches, input_string, offset)

    if not raw_chain_part_matches:
        return chain_found, chain_input_string, offset

    matches_by_index = self._group_by_match_index(chain_part_matches)
    raw_matches_by_index = self._group_by_match_index(raw_chain_part_matches)

    for match_index, raw_group in raw_matches_by_index.items():
        chain_found = True
        # The cursor always moves after the last raw match of the group, even for hidden parts.
        offset = raw_group[-1].raw_end
        chain_input_string = input_string[offset:]

        if chain_part.is_hidden:
            continue
        candidates = matches_by_index.get(match_index, [])
        if self._chain_breaker_eval(current_chain_matches + candidates):
            current_chain_matches.extend(candidates)
    return chain_found, chain_input_string, offset
match.children: 131 | last_pattern = match.children[-1].pattern 132 | last_pattern_groups = self._group_by_match_index( 133 | [child_ for child_ in match.children if child_.pattern == last_pattern] 134 | ) 135 | 136 | if last_pattern_groups: 137 | original_children = Matches(match.children) 138 | original_end = match.end 139 | 140 | for index in reversed(list(last_pattern_groups)): 141 | last_matches = last_pattern_groups[index] 142 | for last_match in last_matches: 143 | match.children.remove(last_match) 144 | match.end = match.children[-1].end if match.children else match.start 145 | ret = super()._process_match(match, match_index, child=child) 146 | if ret: 147 | return True 148 | 149 | match.children = original_children 150 | match.end = original_end 151 | 152 | return False 153 | 154 | def _build_chain_match(self, current_chain_matches, input_string): 155 | start = None 156 | end = None 157 | for match in current_chain_matches: 158 | if start is None or start > match.start: 159 | start = match.start 160 | if end is None or end < match.end: 161 | end = match.end 162 | match = call(Match, start, end, pattern=self, input_string=input_string, **self._match_kwargs) 163 | for chain_match in current_chain_matches: 164 | if chain_match.children: 165 | for child in chain_match.children: 166 | match.children.append(child) 167 | if chain_match not in match.children: 168 | match.children.append(chain_match) 169 | chain_match.parent = match 170 | return match 171 | 172 | def _chain_breaker_eval(self, matches): 173 | return not self.chain_breaker or not self.chain_breaker(Matches(matches)) 174 | 175 | @staticmethod 176 | def _fix_matches_offset(chain_part_matches, input_string, offset): 177 | for chain_part_match in chain_part_matches: 178 | if chain_part_match.input_string != input_string: 179 | chain_part_match.input_string = input_string 180 | chain_part_match.end += offset 181 | chain_part_match.start += offset 182 | if chain_part_match.children: 183 | 
Chain._fix_matches_offset(chain_part_match.children, input_string, offset) 184 | 185 | @staticmethod 186 | def _group_by_match_index(matches): 187 | grouped_matches_dict = {} 188 | for match_index, match in itertools.groupby(matches, lambda m: m.match_index): 189 | grouped_matches_dict[match_index] = list(match) 190 | return grouped_matches_dict 191 | 192 | @property 193 | def match_options(self): 194 | return {} 195 | 196 | @property 197 | def patterns(self): 198 | return [self] 199 | 200 | def __repr__(self): 201 | defined = "" 202 | if self.defined_at: 203 | defined = f"@{self.defined_at}" 204 | return f"<{self.__class__.__name__}{defined}:{self.parts}>" 205 | 206 | 207 | class ChainPart(BasePattern): 208 | """ 209 | Part of a pattern chain. 210 | """ 211 | 212 | def __init__(self, chain, pattern): 213 | self._chain = chain 214 | self.pattern = pattern 215 | self.repeater_start = 1 216 | self.repeater_end = 1 217 | self._hidden = False 218 | 219 | @property 220 | def _is_chain_start(self): 221 | return self._chain.parts[0] == self 222 | 223 | def matches(self, input_string, context=None, with_raw_matches=False): 224 | matches, raw_matches = self.pattern.matches(input_string, context=context, with_raw_matches=True) 225 | 226 | matches = self._truncate_repeater(matches, input_string) 227 | raw_matches = self._truncate_repeater(raw_matches, input_string) 228 | 229 | self._validate_repeater(raw_matches) 230 | 231 | if with_raw_matches: 232 | return matches, raw_matches 233 | 234 | return matches 235 | 236 | def _truncate_repeater(self, matches, input_string): 237 | if not matches: 238 | return matches 239 | 240 | if not self._is_chain_start: 241 | separator = input_string[0:matches[0].initiator.raw_start] 242 | if separator: 243 | return [] 244 | 245 | j = 1 246 | for i in range(0, len(matches) - 1): 247 | separator = input_string[matches[i].initiator.raw_end: 248 | matches[i + 1].initiator.raw_start] 249 | if separator: 250 | break 251 | j += 1 252 | truncated = 
def repeater(self, value):
    """
    Define the repeater of the current chain part.

    Supported values:

    - an int, or any value accepted by ``int()``: exactly this number of repetitions
    - ``'+'``: one or more
    - ``'*'``: zero or more
    - ``'?'``: zero or one
    - ``'{m,n}'``, ``'{m,}'``, ``'{,n}'``: between m and n repetitions, a missing m being 0 and a
      missing n being unlimited. The comma is optional in the expression, so ``'{m}'`` is read like
      ``'{m,}'``.

    Any other string leaves the repeater unchanged.

    :param value: repeater definition
    :type value: int or str
    :return: self
    :rtype: ChainPart
    """
    try:
        count = int(value)
    except ValueError:
        pass
    else:
        self.repeater_start = count
        self.repeater_end = count
        return self

    # Single if/elif chain: each symbol is handled once and only unknown values reach the
    # bounds expression parsing.
    if value == '+':
        self.repeater_start = 1
        self.repeater_end = None
    elif value == '*':
        self.repeater_start = 0
        self.repeater_end = None
    elif value == '?':
        self.repeater_start = 0
        self.repeater_end = 1
    else:
        bounds = re.match(r'\{\s*(\d*)\s*,?\s*(\d*)\s*\}', value)
        if bounds:
            start, end = bounds.groups()
            # '{}' and '{,}' define nothing: keep the current repeater.
            if start or end:
                self.repeater_start = int(start) if start else 0
                self.repeater_end = int(end) if end else None
    return self
def defined_at():
    """
    Get definition location of a pattern or a match (outside of rebulk package).

    The call stack is walked from the current frame to its callers, until a frame whose module
    ``__package__`` differs from the one of this module is found. A frame without ``__package__`` in its
    globals also stops the walk and is used as the location.

    :return: location of the first frame found outside of this package, or None if DEBUG is disabled
        or if no such frame exists
    :rtype: Frame or None
    """
    if not DEBUG:
        return None

    frame = inspect.currentframe()
    try:
        while frame:
            try:
                if frame.f_globals['__package__'] != __package__:
                    break
            except KeyError:  # pragma:no cover
                # No package information on this frame: stop here. Workaround for python 3.3.
                break
            frame = frame.f_back
        if frame is None:
            # Whole stack belongs to this package, or the interpreter gives no frame:
            # there is no location to report.
            return None
        return Frame(frame.f_lineno,
                     frame.f_globals.get('__package__'),
                     frame.f_globals.get('__name__'),
                     frame.f_code.co_filename)
    finally:
        # Release the frame reference whatever happens, as advised by inspect documentation.
        del frame
class PatternDescription(Description):
    """
    Description of a pattern.

    Properties are grouped by name. They come, by order of precedence, from the ``properties`` declared
    on the pattern, from the ``value`` match option, or from the pattern type itself.
    """
    def __init__(self, pattern):  # pylint:disable=too-many-branches
        """
        :param pattern: pattern to describe
        :type pattern: Pattern
        """
        self.pattern = pattern
        self._properties = defaultdict(list)

        if pattern.properties:
            for key, values in pattern.properties.items():
                extend_safe(self._properties[key], values)
        elif 'value' in pattern.match_options:
            self._properties[pattern.name].append(pattern.match_options['value'])
        elif isinstance(pattern, StringPattern):
            extend_safe(self._properties[pattern.name], pattern.patterns)
        elif isinstance(pattern, RePattern):
            # None is registered as the value for regex and functional patterns
            # (presumably because the value is only known when matching).
            if pattern.name and pattern.name not in pattern.private_names:
                extend_safe(self._properties[pattern.name], [None])
            if not pattern.private_children:
                for regex_pattern in pattern.patterns:
                    # Only group names matter here, group indexes are not used.
                    for group_name in regex_pattern.groupindex:
                        if group_name not in pattern.private_names:
                            extend_safe(self._properties[group_name], [None])
        elif isinstance(pattern, FunctionalPattern):
            if pattern.name and pattern.name not in pattern.private_names:
                extend_safe(self._properties[pattern.name], [None])

    @property
    def properties(self):
        """
        Properties for this pattern.
        :return: values that can be generated, grouped by property name
        :rtype: dict
        """
        return self._properties
91 | """ 92 | def __init__(self, rebulk, context=None): 93 | self.patterns = [PatternDescription(pattern) for pattern in rebulk.effective_patterns(context) 94 | if not pattern.private and not pattern.marker] 95 | self.rules = [RuleDescription(rule) for rule in rebulk.effective_rules(context)] 96 | 97 | @property 98 | def properties(self): 99 | """ 100 | Properties for Introspection results. 101 | :return: 102 | :rtype: 103 | """ 104 | properties = defaultdict(list) 105 | for pattern in self.patterns: 106 | for key, values in pattern.properties.items(): 107 | extend_safe(properties[key], values) 108 | for rule in self.rules: 109 | for key, values in rule.properties.items(): 110 | extend_safe(properties[key], values) 111 | return properties 112 | 113 | 114 | def introspect(rebulk, context=None): 115 | """ 116 | Introspect a Rebulk instance to grab defined objects and properties that can be generated. 117 | :param rebulk: 118 | :type rebulk: Rebulk 119 | :param context: 120 | :type context: 121 | :return: Introspection instance 122 | :rtype: Introspection 123 | """ 124 | return Introspection(rebulk, context) 125 | -------------------------------------------------------------------------------- /rebulk/loose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Various utilities functions 5 | """ 6 | 7 | import sys 8 | 9 | from inspect import isclass 10 | try: 11 | from inspect import getfullargspec as getargspec 12 | 13 | _FULLARGSPEC_SUPPORTED = True 14 | except ImportError: 15 | _FULLARGSPEC_SUPPORTED = False 16 | from inspect import getargspec 17 | 18 | from .utils import is_iterable 19 | 20 | if sys.version_info < (3, 4, 0): # pragma: no cover 21 | def _constructor(class_): 22 | """ 23 | Retrieves constructor from given class 24 | 25 | :param class_: 26 | :type class_: class 27 | :return: constructor from given class 28 | :rtype: callable 29 | """ 30 | return 
def argspec_args(argspec, constructor, *args, **kwargs):
    """
    Return (args, kwargs) matching the argspec object

    Keyword arguments are kept when the callable declares them, either as regular or as keyword-only
    parameters, or when it accepts ``**kwargs``. Positional arguments in excess are dropped unless the
    callable accepts ``*args``.

    :param argspec: argspec to use, as returned by inspect.getfullargspec
    :type argspec: argspec
    :param constructor: is it a constructor ? If so, the first declared parameter is not counted
        when truncating positional arguments.
    :type constructor: bool
    :param args: candidate positional arguments
    :type args:
    :param kwargs: candidate keyword arguments
    :type kwargs:
    :return: (args, kwargs) matching the function signature
    :rtype: tuple
    """
    if argspec.varkw:
        call_kwarg = kwargs
    else:
        # getfullargspec lists keyword-only parameters apart from regular ones: both are valid keywords.
        accepted = set(argspec.args)
        accepted.update(argspec.kwonlyargs or ())
        call_kwarg = {name: value for name, value in kwargs.items() if name in accepted}
    if argspec.varargs:
        call_args = args
    else:
        call_args = args[:len(argspec.args) - (1 if constructor else 0)]
    return call_args, call_kwarg
def ensure_dict(param, default_value, default_key=None):
    """
    Normalize given parameter to a (dict, default value) couple.

    A falsy parameter is replaced by default_value. If the result is a dict, it is returned as is along
    with default_value. Otherwise, it is wrapped in a dict under default_key, and it also becomes the
    returned default value when truthy.

    :param param: a dict, or a single value
    :type param:
    :param default_value: value to use when param is falsy
    :type default_value:
    :param default_key: key to use when a single value is wrapped in a dict
    :type default_key:
    :return: (dict, default value)
    :rtype: tuple
    """
    effective = param if param else default_value
    if isinstance(effective, dict):
        return effective, default_value
    promoted = effective if effective else default_value
    return {default_key: effective}, promoted
def set_defaults(defaults, kwargs, override=False):
    """
    Copy entries of defaults dict into kwargs dict.

    A ``clear`` entry is always removed from defaults; when its value is truthy, kwargs is emptied first.
    For a key defined on both sides, list values are concatenated (defaults first) and dict values are
    merged recursively. A key is then assigned the default value if it's missing from kwargs, or if
    override is set.

    :param defaults: values to apply. A ``clear`` entry is removed from this dict.
    :type defaults: dict
    :param kwargs: dict to update in place
    :type kwargs: dict
    :param override: if True, default values replace the ones already defined in kwargs
    :type override: bool
    :return:
    :rtype:
    """
    if defaults.pop('clear', None):
        kwargs.clear()
    for name, default in defaults.items():
        if name in kwargs:
            current = kwargs[name]
            if isinstance(default, list) and isinstance(current, list):
                kwargs[name] = list(default) + current
            elif isinstance(default, dict) and isinstance(current, dict):
                set_defaults(default, current)
        if override or name not in kwargs:
            kwargs[name] = default
def __init__(self, name=None, tags=None, formatter=None, value=None, validator=None, children=False, every=False,
             private_parent=False, private_children=False, private=False, private_names=None, ignore_names=None,
             marker=False, format_all=False, validate_all=False, disabled=lambda context: False, log_level=None,
             properties=None, post_processor=None, pre_match_processor=None, post_match_processor=None, **kwargs):
    """
    Store the configuration of this pattern.

    :param name: Name of this pattern
    :type name: str
    :param tags: List of tags related to this pattern
    :type tags: list[str]
    :param formatter: dict (name, func) of formatters to use with this pattern. name is the match name to
    support, and func a function(input_string) that returns the formatted string. A single formatter function
    can also be passed as a shortcut for {None: formatter}. The formatted string will be set in Match.value.
    :type formatter: dict[str, func] || func
    :param value: dict (name, value) of values to use with this pattern. name is the match name to support,
    and value an object for the match value. A single object can also be passed as a shortcut for
    {None: value}. The value will be set in Match.value.
    :type value: dict[str, object] || object
    :param validator: dict (name, func) of validators to use with this pattern. name is the match name to
    support, and func a function(match) that returns a boolean. A single validator function can also be
    passed as a shortcut for {None: validator}. If the return value is False, the match will be ignored.
    :type validator: dict[str, func] || func
    :param children: generates children instead of parent
    :type children: bool
    :param every: generates both parent and children.
    :type every: bool
    :param private_parent: force return of parent and flag parent matches as private.
    :type private_parent: bool
    :param private_children: force return of children and flag children matches as private.
    :type private_children: bool
    :param private: flag this pattern as being private.
    :type private: bool
    :param private_names: names of matches to flag as private.
    :type private_names: list[str]
    :param ignore_names: names of matches to drop after validation.
    :type ignore_names: list[str]
    :param marker: flag this pattern as being a marker.
    :type marker: bool
    :param format_all: if True, pattern will format every match in the hierarchy (even match not yield).
    :type format_all: bool
    :param validate_all: if True, pattern will validate every match in the hierarchy (even match not yield).
    :type validate_all: bool
    :param disabled: if True, this pattern is disabled. Can also be a function(context).
    :type disabled: bool|function
    :param log_level: Log level associated to this pattern
    :type log_level: int
    :param properties: stored as-is and returned by the ``properties`` property
    :type properties:
    :param post_processor: Post processing function (ignored unless callable)
    :type post_processor: func
    :param pre_match_processor: Pre match processing function (ignored unless callable)
    :type pre_match_processor: func
    :param post_match_processor: Post match processing function (ignored unless callable)
    :type post_match_processor: func
    """
    # pylint:disable=too-many-locals,unused-argument
    # Identity and per-name configuration (formatter / value / validator).
    self.name = name
    self.tags = ensure_list(tags)
    self.formatters, self._default_formatter = ensure_dict(formatter, default_formatter)
    self.values, self._default_value = ensure_dict(value, None)
    self.validators, self._default_validator = ensure_dict(validator, allways_true)

    # Flags controlling which matches are produced and how they are flagged.
    self.children = children
    self.every = every
    self.private = private
    self.private_parent = private_parent
    self.private_children = private_children
    self.private_names = private_names or []
    self.ignore_names = ignore_names or []
    self.marker = marker
    self.format_all = format_all
    self.validate_all = validate_all

    # A plain value is wrapped so that ``self.disabled`` can always be called with a context.
    self.disabled = disabled if callable(disabled) else (lambda context: disabled)

    self._log_level = log_level
    self._properties = properties
    # Called directly from __init__ on purpose: keep the call site where the original had it.
    self.defined_at = debug.defined_at()

    # Processors that are not callable are discarded.
    for attribute, processor in (('post_processor', post_processor),
                                 ('pre_match_processor', pre_match_processor),
                                 ('post_match_processor', post_match_processor)):
        setattr(self, attribute, processor if callable(processor) else None)
def matches(self, input_string, context=None, with_raw_matches=False):
    """
    Computes all matches for a given input

    Every base pattern from ``self.patterns`` is searched in turn. Each raw match is numbered with its position
    among the raw matches of its own base pattern, then dispatched through ``_process_matches``. The collected
    result finally goes through ``_post_process_matches``.

    :param input_string: the string to parse
    :type input_string: str
    :param context: the context
    :type context: dict
    :param with_raw_matches: if True, the raw (unprocessed) matches are returned too
    :type with_raw_matches: bool
    :return: processed matches, or a tuple (processed matches, raw matches) when with_raw_matches is set
    :rtype: list[Match] or tuple
    """
    # pylint: disable=too-many-branches
    processed = []
    raw_matches = []

    for base_pattern in self.patterns:
        # The index restarts at 0 for each base pattern.
        for position, raw_match in enumerate(self._match(base_pattern, input_string, context)):
            raw_matches.append(raw_match)
            processed.extend(self._process_matches(raw_match, position))

    processed = self._post_process_matches(processed)

    return (processed, raw_matches) if with_raw_matches else processed
187 | :param match: 188 | :type match: 189 | :return: 190 | :rtype: 191 | """ 192 | return not self.children or self.every 193 | 194 | @staticmethod 195 | def _match_config_property_keys(match, child=False): 196 | if match.name: 197 | yield match.name 198 | if child: 199 | yield '__children__' 200 | else: 201 | yield '__parent__' 202 | yield None 203 | 204 | @staticmethod 205 | def _process_match_index(match, match_index): 206 | """ 207 | Process match index from this pattern process state. 208 | 209 | :param match: 210 | :return: 211 | """ 212 | match.match_index = match_index 213 | 214 | def _process_match_private(self, match, child=False): 215 | """ 216 | Process match privacy from this pattern configuration. 217 | 218 | :param match: 219 | :param child: 220 | :return: 221 | """ 222 | 223 | if match.name and match.name in self.private_names or \ 224 | not child and self.private_parent or \ 225 | child and self.private_children: 226 | match.private = True 227 | 228 | def _process_match_value(self, match, child=False): 229 | """ 230 | Process match value from this pattern configuration. 231 | :param match: 232 | :return: 233 | """ 234 | keys = self._match_config_property_keys(match, child=child) 235 | pattern_value = get_first_defined(self.values, keys, self._default_value) 236 | if pattern_value: 237 | match.value = pattern_value 238 | 239 | def _process_match_formatter(self, match, child=False): 240 | """ 241 | Process match formatter from this pattern configuration. 242 | 243 | :param match: 244 | :return: 245 | """ 246 | included = self._should_include_children if child else self._should_include_parent 247 | if included or self.format_all: 248 | keys = self._match_config_property_keys(match, child=child) 249 | match.formatter = get_first_defined(self.formatters, keys, self._default_formatter) 250 | 251 | def _process_match_validator(self, match, child=False): 252 | """ 253 | Process match validation from this pattern configuration. 
254 | 255 | :param match: 256 | :return: True if match is validated by the configured validator, False otherwise. 257 | """ 258 | included = self._should_include_children if child else self._should_include_parent 259 | if included or self.validate_all: 260 | keys = self._match_config_property_keys(match, child=child) 261 | validator = get_first_defined(self.validators, keys, self._default_validator) 262 | if validator and not validator(match): 263 | return False 264 | return True 265 | 266 | def _process_match(self, match, match_index, child=False): 267 | """ 268 | Process match from this pattern by setting all properties from defined configuration 269 | (index, private, value, formatter, validator, ...). 270 | 271 | :param match: 272 | :type match: 273 | :return: True if match is validated by the configured validator, False otherwise. 274 | :rtype: 275 | """ 276 | self._process_match_index(match, match_index) 277 | self._process_match_private(match, child) 278 | self._process_match_value(match, child) 279 | self._process_match_formatter(match, child) 280 | return self._process_match_validator(match, child) 281 | 282 | @staticmethod 283 | def _process_match_processor(match, processor): 284 | if processor: 285 | ret = processor(match) 286 | if ret is not None: 287 | return ret 288 | return match 289 | 290 | def _process_matches(self, match, match_index): 291 | """ 292 | Process and generate all matches for the given unprocessed match. 293 | :param match: 294 | :param match_index: 295 | :return: Process and dispatched matches. 
296 | """ 297 | match = self._process_match_processor(match, self.pre_match_processor) 298 | if not match: 299 | return 300 | 301 | if not self._process_match(match, match_index): 302 | return 303 | 304 | for child in match.children: 305 | if not self._process_match(child, match_index, child=True): 306 | return 307 | 308 | match = self._process_match_processor(match, self.post_match_processor) 309 | if not match: 310 | return 311 | 312 | if (self._should_include_parent or self.private_parent) and match.name not in self.ignore_names: 313 | yield match 314 | if self._should_include_children or self.private_children: 315 | children = [x for x in match.children if x.name not in self.ignore_names] 316 | for child in children: 317 | yield child 318 | 319 | def _post_process_matches(self, matches): 320 | """ 321 | Post process matches with user defined function 322 | :param matches: 323 | :type matches: 324 | :return: 325 | :rtype: 326 | """ 327 | if self.post_processor: 328 | return self.post_processor(matches, self) 329 | return matches 330 | 331 | @property 332 | @abstractmethod 333 | def patterns(self): # pragma: no cover 334 | """ 335 | List of base patterns defined 336 | 337 | :return: A list of base patterns 338 | :rtype: list 339 | """ 340 | 341 | @property 342 | def properties(self): 343 | """ 344 | Properties names and values that can ben retrieved by this pattern. 345 | :return: 346 | :rtype: 347 | """ 348 | if self._properties: 349 | return self._properties 350 | return {} 351 | 352 | @property 353 | @abstractmethod 354 | def match_options(self): # pragma: no cover 355 | """ 356 | dict of default options for generated Match objects 357 | 358 | :return: **options to pass to Match constructor 359 | :rtype: dict 360 | """ 361 | 362 | @abstractmethod 363 | def _match(self, pattern, input_string, context=None): # pragma: no cover 364 | """ 365 | Computes all unprocess matches for a given pattern and input. 
class StringPattern(Pattern):
    """
    Pattern searching for one or many literal strings.
    """

    def __init__(self, *patterns, **kwargs):
        """
        :param patterns: strings to search for
        :param kwargs: pattern options; also forwarded to ``find_all`` and, once filtered, to ``Match``
        """
        super().__init__(**kwargs)
        self._kwargs = kwargs
        self._match_kwargs = filter_match_kwargs(kwargs)
        self._patterns = patterns

    @property
    def patterns(self):
        """
        Strings searched by this pattern.
        """
        return self._patterns

    @property
    def match_options(self):
        """
        Options passed to every Match built by this pattern.
        """
        return self._match_kwargs

    def _match(self, pattern, input_string, context=None):
        """
        Generate a Match for each position of pattern reported by ``find_all``; falsy matches are skipped.
        """
        for start in find_all(input_string, pattern, **self._kwargs):
            end = start + len(pattern)
            candidate = Match(start, end, pattern=self, input_string=input_string, **self._match_kwargs)
            if not candidate:
                continue
            yield candidate
def __init__(self, *patterns, **kwargs):
    """
    Compile the given regular expressions.

    Each pattern can be a string (compiled through ``call(re.compile, ...)`` with this pattern kwargs),
    a dict of ``re.compile`` keyword arguments, an iterable of ``re.compile`` positional arguments, or any
    other object, which is stored untouched. When ``abbreviations`` is given, each (key, replacement)
    pair is substituted in string patterns and in the ``pattern`` entry of dict patterns before compiling.

    :param patterns: regular expressions to search for
    :param kwargs: pattern options. ``repeated_captures`` defaults to REGEX_ENABLED, ``abbreviations``
    is a list of (key, replacement) pairs.
    :raise NotImplementedError: if repeated_captures is requested while the regex module is not enabled
    """
    super().__init__(**kwargs)
    self.repeated_captures = REGEX_ENABLED
    if 'repeated_captures' in kwargs:
        self.repeated_captures = kwargs.get('repeated_captures')
    if self.repeated_captures and not REGEX_ENABLED:  # pragma: no cover
        raise NotImplementedError("repeated_capture is available only with regex module.")
    self.abbreviations = kwargs.get('abbreviations', [])
    self._kwargs = kwargs
    self._match_kwargs = filter_match_kwargs(kwargs)
    self._children_match_kwargs = filter_match_kwargs(kwargs, children=True)
    self._patterns = []
    for pattern in patterns:
        if isinstance(pattern, str):
            if self.abbreviations and pattern:
                for key, replacement in self.abbreviations:
                    pattern = pattern.replace(key, replacement)
            pattern = call(re.compile, pattern, **self._kwargs)
        elif isinstance(pattern, dict):
            if self.abbreviations and 'pattern' in pattern:
                # Work on a copy: the dict belongs to the caller and must not be rewritten in place.
                pattern = dict(pattern)
                for key, replacement in self.abbreviations:
                    pattern['pattern'] = pattern['pattern'].replace(key, replacement)
            pattern = re.compile(**pattern)
        elif hasattr(pattern, '__iter__'):
            pattern = re.compile(*pattern)
        self._patterns.append(pattern)
466 | name = names.get(i, main_match.name) 467 | if self.repeated_captures: 468 | for start, end in match_object.spans(i): 469 | child_match = Match(start, end, name=name, parent=main_match, pattern=self, 470 | input_string=input_string, **self._children_match_kwargs) 471 | if child_match: 472 | main_match.children.append(child_match) 473 | else: 474 | start, end = match_object.span(i) 475 | if start > -1 and end > -1: 476 | child_match = Match(start, end, name=name, parent=main_match, pattern=self, 477 | input_string=input_string, **self._children_match_kwargs) 478 | if child_match: 479 | main_match.children.append(child_match) 480 | 481 | if main_match: 482 | yield main_match 483 | 484 | 485 | class FunctionalPattern(Pattern): 486 | """ 487 | Definition of one or many functional pattern to search for. 488 | """ 489 | 490 | def __init__(self, *patterns, **kwargs): 491 | super().__init__(**kwargs) 492 | self._patterns = patterns 493 | self._kwargs = kwargs 494 | self._match_kwargs = filter_match_kwargs(kwargs) 495 | 496 | @property 497 | def patterns(self): 498 | return self._patterns 499 | 500 | @property 501 | def match_options(self): 502 | return self._match_kwargs 503 | 504 | def _match(self, pattern, input_string, context=None): 505 | ret = call(pattern, input_string, context, **self._kwargs) 506 | if ret: 507 | if not is_iterable(ret) or isinstance(ret, dict) \ 508 | or (is_iterable(ret) and hasattr(ret, '__getitem__') and isinstance(ret[0], int)): 509 | args_iterable = [ret] 510 | else: 511 | args_iterable = ret 512 | for args in args_iterable: 513 | if isinstance(args, dict): 514 | options = args 515 | options.pop('input_string', None) 516 | options.pop('pattern', None) 517 | if self._match_kwargs: 518 | options = self._match_kwargs.copy() 519 | options.update(args) 520 | match = Match(pattern=self, input_string=input_string, **options) 521 | if match: 522 | yield match 523 | else: 524 | kwargs = self._match_kwargs 525 | if isinstance(args[-1], dict): 526 | 
def filter_match_kwargs(kwargs, children=False):
    """
    Build the keyword arguments usable for Match construction.

    A copy of kwargs is returned without the ``pattern``, ``start``, ``end``, ``parent``, ``formatter`` and
    ``value`` keys. For children matches, ``name`` is dropped too. The given dict is not modified.

    :param kwargs: pattern keyword arguments
    :type kwargs: dict
    :param children: flag to filter for children matches
    :type children: bool
    :return: A filtered copy of kwargs
    :rtype: dict
    """
    excluded = ('pattern', 'start', 'end', 'parent', 'formatter', 'value')
    if children:
        excluded += ('name',)
    filtered = kwargs.copy()
    for unwanted in excluded:
        filtered.pop(unwanted, None)
    return filtered
41 | """ 42 | priority = PRE_PROCESS 43 | 44 | consequence = RemoveMatch 45 | 46 | @property 47 | def default_conflict_solver(self): 48 | """ 49 | Default conflict solver to use. 50 | """ 51 | return _default_conflict_solver 52 | 53 | def when(self, matches, context): 54 | # pylint:disable=too-many-nested-blocks 55 | to_remove_matches = IdentitySet() 56 | 57 | public_matches = [match for match in matches if not match.private] 58 | public_matches.sort(key=len) 59 | 60 | for match in public_matches: 61 | conflicting_matches = matches.conflicting(match) 62 | 63 | if conflicting_matches: 64 | # keep the match only if it's the longest 65 | conflicting_matches = [conflicting_match for conflicting_match in conflicting_matches if 66 | not conflicting_match.private] 67 | conflicting_matches.sort(key=len) 68 | 69 | for conflicting_match in conflicting_matches: 70 | conflict_solvers = [(self.default_conflict_solver, False)] 71 | 72 | if match.conflict_solver: 73 | conflict_solvers.append((match.conflict_solver, False)) 74 | if conflicting_match.conflict_solver: 75 | conflict_solvers.append((conflicting_match.conflict_solver, True)) 76 | 77 | for conflict_solver, reverse in reversed(conflict_solvers): 78 | if reverse: 79 | to_remove = conflict_solver(conflicting_match, match) 80 | else: 81 | to_remove = conflict_solver(match, conflicting_match) 82 | if to_remove == DEFAULT: 83 | continue 84 | if to_remove and to_remove not in to_remove_matches: 85 | both_matches = [match, conflicting_match] 86 | both_matches.remove(to_remove) 87 | to_keep = both_matches[0] 88 | 89 | if to_keep not in to_remove_matches: 90 | log(self.log_level, "Conflicting match %s will be removed in favor of match %s", 91 | to_remove, to_keep) 92 | 93 | to_remove_matches.add(to_remove) 94 | break 95 | return to_remove_matches 96 | 97 | 98 | class PrivateRemover(Rule): 99 | """ 100 | Removes private matches rule. 
def when(self, matches, context):
    """
    Collect every match flagged as private.

    :param matches: matches to inspect
    :param context: unused
    :return: list of matches whose ``private`` attribute is truthy
    :rtype: list
    """
    private_matches = []
    for candidate in matches:
        if candidate.private:
            private_matches.append(candidate)
    return private_matches
def matches(self, string, context=None):
    """
    Search for all matches with current configuration against the given string.

    Patterns are applied first, then rules are executed on the resulting matches.

    :param string: string to search into
    :type string: str
    :param context: context to use; an empty dict is used when None
    :type context: dict
    :return: A custom list of matches
    :rtype: Matches
    """
    found = Matches(input_string=string)
    effective_context = {} if context is None else context

    self._matches_patterns(found, effective_context)
    self._execute_rules(found, effective_context)

    return found
def effective_patterns(self, context=None):
    """
    Get effective patterns for this rebulk object and its children.

    The result starts with a copy of this object's own patterns, then receives the patterns of each child
    rebulk that is not disabled for the given context.

    :param context: context passed to the ``disabled`` function of each child rebulk
    :type context: dict
    :return: a new list of patterns
    :rtype: list
    """
    collected = list(self._patterns)
    # Lazy filter: each child is checked right before its patterns are merged.
    enabled_children = (child for child in self._rebulks if not child.disabled(context))
    for child in enabled_children:
        extend_safe(collected, child._patterns)
    return collected
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Uniform re module

Exposes ``re`` (the third-party ``regex`` module when requested and importable, the standard ``re`` module
otherwise) and ``REGEX_ENABLED``, which tells which one is in use.
"""
# pylint: disable-all
import os
import logging

# ``log`` is the bound ``Logger.log`` method: it must be called as log(level, message).
log = logging.getLogger(__name__).log

REGEX_ENABLED = False
if os.environ.get('REBULK_REGEX_ENABLED') in ["1", "true", "True", "Y"]:
    try:
        import regex as re
        REGEX_ENABLED = True
    except ImportError:
        # ``log.warning`` does not exist on a bound method and would raise AttributeError right here,
        # hiding the fallback; pass the level explicitly instead.
        log(logging.WARNING, 'regex module is not available. Unset REBULK_REGEX_ENABLED environment variable, or install regex module to enabled it.')
        import re
else:
    import re
def __init__(self, log_level=None):
    """
    Record where the rule was defined and set its log level.

    :param log_level: log level for this rule. When None, a ``log_level`` attribute already available on the
    instance (for instance defined on the class) is kept, and ``debug.LOG_LEVEL`` is used otherwise.
    :type log_level: int
    """
    self.defined_at = debug.defined_at()
    if log_level is not None:
        # An explicit level was previously dropped silently; honor it.
        self.log_level = log_level
    elif not hasattr(self, 'log_level'):
        self.log_level = debug.LOG_LEVEL
class Rule(CustomRule):
    """
    Rule whose action is delegated to one or many consequences.
    """
    # pylint:disable=abstract-method

    # A Consequence class or instance, or an iterable of them.
    consequence = None

    def then(self, matches, when_response, context):
        """
        Run the configured consequence(s).

        With a single consequence, it receives the whole when response. With an iterable of consequences,
        each one receives the next item of the when response (a non-iterable response is wrapped in a list).
        Consequences given as classes are instantiated without arguments.

        :param matches:
        :type matches: rebulk.match.Matches
        :param when_response: return object from when call.
        :type when_response: object
        :param context:
        :type context:
        """
        assert self.consequence

        def instantiate(candidate):
            return candidate() if inspect.isclass(candidate) else candidate  # pylint:disable=not-callable

        if not is_iterable(self.consequence):
            instantiate(self.consequence).then(matches, when_response, context)
            return

        responses = when_response if is_iterable(when_response) else [when_response]
        remaining = iter(responses)
        for candidate in self.consequence:  # pylint: disable=not-an-iterable
            instantiate(candidate).then(matches, next(remaining), context)
match_name=None): 147 | self.match_name = match_name 148 | 149 | def then(self, matches, when_response, context): 150 | if is_iterable(when_response): 151 | ret = [] 152 | when_response = list(when_response) 153 | for match in when_response: 154 | if match not in matches: 155 | if self.match_name: 156 | match.name = self.match_name 157 | matches.append(match) 158 | ret.append(match) 159 | return ret 160 | if self.match_name: 161 | when_response.name = self.match_name 162 | if when_response not in matches: 163 | matches.append(when_response) 164 | return when_response 165 | 166 | 167 | class RenameMatch(Consequence): # pylint: disable=abstract-method 168 | """ 169 | Rename matches returned by then 170 | """ 171 | def __init__(self, match_name): 172 | self.match_name = match_name 173 | self.remove = RemoveMatch() 174 | self.append = AppendMatch() 175 | 176 | def then(self, matches, when_response, context): 177 | removed = self.remove.then(matches, when_response, context) 178 | if is_iterable(removed): 179 | removed = list(removed) 180 | for match in removed: 181 | match.name = self.match_name 182 | elif removed: 183 | removed.name = self.match_name 184 | if removed: 185 | self.append.then(matches, removed, context) 186 | 187 | 188 | class AppendTags(Consequence): # pylint: disable=abstract-method 189 | """ 190 | Add tags to returned matches 191 | """ 192 | def __init__(self, tags): 193 | self.tags = tags 194 | self.remove = RemoveMatch() 195 | self.append = AppendMatch() 196 | 197 | def then(self, matches, when_response, context): 198 | removed = self.remove.then(matches, when_response, context) 199 | if is_iterable(removed): 200 | removed = list(removed) 201 | for match in removed: 202 | match.tags.extend(self.tags) 203 | elif removed: 204 | removed.tags.extend(self.tags) # pylint: disable=no-member 205 | if removed: 206 | self.append.then(matches, removed, context) 207 | 208 | 209 | class RemoveTags(Consequence): # pylint: disable=abstract-method 210 | """ 211 | 
Remove tags from returned matches 212 | """ 213 | def __init__(self, tags): 214 | self.tags = tags 215 | self.remove = RemoveMatch() 216 | self.append = AppendMatch() 217 | 218 | def then(self, matches, when_response, context): 219 | removed = self.remove.then(matches, when_response, context) 220 | if is_iterable(removed): 221 | removed = list(removed) 222 | for match in removed: 223 | for tag in self.tags: 224 | if tag in match.tags: 225 | match.tags.remove(tag) 226 | elif removed: 227 | for tag in self.tags: 228 | if tag in removed.tags: # pylint: disable=no-member 229 | removed.tags.remove(tag) # pylint: disable=no-member 230 | if removed: 231 | self.append.then(matches, removed, context) 232 | 233 | 234 | class Rules(list): 235 | """ 236 | list of rules ready to execute. 237 | """ 238 | 239 | def __init__(self, *rules): 240 | super().__init__() 241 | self.load(*rules) 242 | 243 | def load(self, *rules): 244 | """ 245 | Load rules from a Rule module, class or instance 246 | 247 | :param rules: 248 | :type rules: 249 | :return: 250 | :rtype: 251 | """ 252 | for rule in rules: 253 | if inspect.ismodule(rule): 254 | self.load_module(rule) 255 | elif inspect.isclass(rule): 256 | self.load_class(rule) 257 | else: 258 | self.append(rule) 259 | 260 | def load_module(self, module): 261 | """ 262 | Load a rules module 263 | 264 | :param module: 265 | :type module: 266 | :return: 267 | :rtype: 268 | """ 269 | # pylint: disable=unused-variable 270 | for name, obj in inspect.getmembers(module, 271 | lambda member: hasattr(member, '__module__') 272 | and member.__module__ == module.__name__ 273 | and inspect.isclass): 274 | self.load_class(obj) 275 | 276 | def load_class(self, class_): 277 | """ 278 | Load a Rule class. 279 | 280 | :param class_: 281 | :type class_: 282 | :return: 283 | :rtype: 284 | """ 285 | self.append(class_()) 286 | 287 | def execute_all_rules(self, matches, context): 288 | """ 289 | Execute all rules from this rules list. 
All when condition with same priority will be performed before 290 | calling then actions. 291 | 292 | :param matches: 293 | :type matches: 294 | :param context: 295 | :type context: 296 | :return: 297 | :rtype: 298 | """ 299 | ret = [] 300 | for priority, priority_rules in groupby(sorted(self), lambda rule: rule.priority): 301 | sorted_rules = toposort_rules(list(priority_rules)) # Group by dependency graph toposort 302 | for rules_group in sorted_rules: 303 | rules_group = list(sorted(rules_group, key=self.index)) # Sort rules group based on initial ordering. 304 | group_log_level = None 305 | for rule in rules_group: 306 | if group_log_level is None or group_log_level < rule.log_level: 307 | group_log_level = rule.log_level 308 | log(group_log_level, "%s independent rule(s) at priority %s.", len(rules_group), priority) 309 | for rule in rules_group: 310 | when_response = execute_rule(rule, matches, context) 311 | if when_response is not None: 312 | ret.append((rule, when_response)) 313 | 314 | return ret 315 | 316 | 317 | def execute_rule(rule, matches, context): 318 | """ 319 | Execute the given rule. 320 | :param rule: 321 | :type rule: 322 | :param matches: 323 | :type matches: 324 | :param context: 325 | :type context: 326 | :return: 327 | :rtype: 328 | """ 329 | if rule.enabled(context): 330 | log(rule.log_level, "Checking rule condition: %s", rule) 331 | when_response = rule.when(matches, context) 332 | if when_response: 333 | log(rule.log_level, "Rule was triggered: %s", when_response) 334 | log(rule.log_level, "Running rule consequence: %s %s", rule, when_response) 335 | rule.then(matches, when_response, context) 336 | return when_response 337 | else: 338 | log(rule.log_level, "Rule is disabled: %s", rule) 339 | 340 | def toposort_rules(rules): 341 | """ 342 | Sort given rules using toposort with dependency parameter. 
343 | :param rules: 344 | :type rules: 345 | :return: 346 | :rtype: 347 | """ 348 | graph = {} 349 | class_dict = {} 350 | for rule in rules: 351 | if rule.__class__ in class_dict: 352 | raise ValueError(f"Duplicate class rules are not allowed: {rule.__class__}") 353 | class_dict[rule.__class__] = rule 354 | for rule in rules: 355 | if not is_iterable(rule.dependency) and rule.dependency: 356 | rule_dependencies = [rule.dependency] 357 | else: 358 | rule_dependencies = rule.dependency 359 | dependencies = set() 360 | if rule_dependencies: 361 | for dependency in rule_dependencies: 362 | if inspect.isclass(dependency): 363 | dependency = class_dict.get(dependency) 364 | if dependency: 365 | dependencies.add(dependency) 366 | graph[rule] = dependencies 367 | return toposort(graph) 368 | -------------------------------------------------------------------------------- /rebulk/test/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring 4 | -------------------------------------------------------------------------------- /rebulk/test/default_rules_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition 4 | from ..match import Match 5 | from ..rules import Rule, RemoveMatch, AppendMatch, RenameMatch, AppendTags, RemoveTags 6 | 7 | 8 | class RuleRemove0(Rule): 9 | consequence = RemoveMatch 10 | def when(self, matches, context): 11 | return matches[0] 12 | 13 | 14 | class RuleAppend0(Rule): 15 | consequence = AppendMatch() 16 | def when(self, matches, context): 17 | return Match(5, 10) 18 | 19 | class RuleRename0(Rule): 20 | consequence = [RenameMatch('renamed')] 21 | def when(self, matches, context): 22 | return [Match(5, 10, name="original")] 
23 | 24 | class RuleRemove1(Rule): 25 | consequence = [RemoveMatch()] 26 | def when(self, matches, context): 27 | return [matches[0]] 28 | 29 | class RuleAppend1(Rule): 30 | consequence = [AppendMatch] 31 | def when(self, matches, context): 32 | return [Match(5, 10)] 33 | 34 | class RuleRename1(Rule): 35 | consequence = RenameMatch('renamed') 36 | def when(self, matches, context): 37 | return [Match(5, 10, name="original")] 38 | 39 | class RuleAppend2(Rule): 40 | consequence = [AppendMatch('renamed')] 41 | properties = {'renamed': [None]} 42 | def when(self, matches, context): 43 | return [Match(5, 10)] 44 | 45 | class RuleRename2(Rule): 46 | consequence = RenameMatch('renamed') 47 | def when(self, matches, context): 48 | return Match(5, 10, name="original") 49 | 50 | class RuleAppend3(Rule): 51 | consequence = AppendMatch('renamed') 52 | properties = {'renamed': [None]} 53 | def when(self, matches, context): 54 | return [Match(5, 10)] 55 | 56 | class RuleRename3(Rule): 57 | consequence = [RenameMatch('renamed')] 58 | def when(self, matches, context): 59 | return Match(5, 10, name="original") 60 | 61 | class RuleAppendTags0(Rule): 62 | consequence = AppendTags(['new-tag']) 63 | def when(self, matches, context): 64 | return matches.named('tags', 0) 65 | 66 | class RuleRemoveTags0(Rule): 67 | consequence = RemoveTags(['new-tag']) 68 | def when(self, matches, context): 69 | return matches.named('tags', 0) 70 | 71 | class RuleAppendTags1(Rule): 72 | consequence = AppendTags(['new-tag']) 73 | def when(self, matches, context): 74 | return matches.named('tags') 75 | 76 | class RuleRemoveTags1(Rule): 77 | consequence = RemoveTags(['new-tag']) 78 | def when(self, matches, context): 79 | return matches.named('tags') 80 | -------------------------------------------------------------------------------- /rebulk/test/rebulk_rules_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 
pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition 4 | from rebulk.rules import Rule, RemoveMatch, CustomRule 5 | 6 | 7 | class RemoveAllButLastYear(Rule): 8 | consequence = RemoveMatch 9 | def when(self, matches, context): 10 | entries = matches.named('year') 11 | return entries[:-1] 12 | 13 | 14 | class PrefixedSuffixedYear(CustomRule): 15 | def when(self, matches, context): 16 | toRemove = [] 17 | years = matches.named('year') 18 | for year in years: 19 | if not matches.previous(year, lambda p: p.name == 'yearPrefix') and \ 20 | not matches.next(year, lambda n: n.name == 'yearSuffix'): 21 | toRemove.append(year) 22 | return toRemove 23 | 24 | def then(self, matches, when_response, context): 25 | for to_remove in when_response: 26 | matches.remove(to_remove) 27 | 28 | 29 | class PrefixedSuffixedYearNoLambda(Rule): 30 | consequence = RemoveMatch 31 | def when(self, matches, context): 32 | toRemove = [] 33 | years = matches.named('year') 34 | for year in years: 35 | if not [m for m in matches.previous(year) if m.name == 'yearPrefix'] and \ 36 | not [m for m in matches.next(year) if m.name == 'yearSuffix']: 37 | toRemove.append(year) 38 | return toRemove 39 | -------------------------------------------------------------------------------- /rebulk/test/rules_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition 4 | from ..match import Match 5 | from ..rules import Rule 6 | 7 | 8 | class Rule3(Rule): 9 | def when(self, matches, context): 10 | return context.get('when') 11 | 12 | def then(self, matches, when_response, context): 13 | assert when_response in [True, False] 14 | matches.append(Match(3, 4)) 15 | 16 | 17 | class Rule2(Rule): 18 | dependency = Rule3 19 | 20 | def when(self, matches, context): 21 | return True 22 | 23 | def then(self, matches, 
when_response, context): 24 | assert when_response 25 | matches.append(Match(3, 4)) 26 | 27 | 28 | class Rule1(Rule): 29 | dependency = Rule2 30 | 31 | def when(self, matches, context): 32 | return True 33 | 34 | def then(self, matches, when_response, context): 35 | assert when_response 36 | matches.clear() 37 | 38 | 39 | class Rule0(Rule): 40 | dependency = Rule1 41 | 42 | def when(self, matches, context): 43 | return True 44 | 45 | def then(self, matches, when_response, context): 46 | assert when_response 47 | matches.append(Match(3, 4)) 48 | 49 | 50 | class Rule1Disabled(Rule1): 51 | name = "Disabled Rule1" 52 | 53 | def enabled(self, context): 54 | return False 55 | -------------------------------------------------------------------------------- /rebulk/test/test_chain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition, cyclic-import 4 | import re 5 | from functools import partial 6 | 7 | from rebulk.pattern import FunctionalPattern, StringPattern, RePattern 8 | from ..rebulk import Rebulk 9 | from ..validators import chars_surround 10 | 11 | 12 | def test_chain_close(): 13 | rebulk = Rebulk() 14 | ret = rebulk.chain().close() 15 | 16 | assert ret == rebulk 17 | assert len(rebulk.effective_patterns()) == 1 18 | 19 | 20 | def test_build_chain(): 21 | rebulk = Rebulk() 22 | 23 | def digit(input_string): 24 | i = input_string.find("1849") 25 | if i > -1: 26 | return i, i + len("1849") 27 | 28 | ret = rebulk.chain() \ 29 | .functional(digit) \ 30 | .string("test").repeater(2) \ 31 | .string("x").repeater('{1,3}') \ 32 | .string("optional").repeater('?') \ 33 | .regex("f?x").repeater('+') \ 34 | .close() 35 | 36 | assert ret == rebulk 37 | assert len(rebulk.effective_patterns()) == 1 38 | 39 | chain = rebulk.effective_patterns()[0] 40 | 41 | assert len(chain.parts) == 5 42 | 43 | assert 
isinstance(chain.parts[0].pattern, FunctionalPattern) 44 | assert chain.parts[0].repeater_start == 1 45 | assert chain.parts[0].repeater_end == 1 46 | 47 | assert isinstance(chain.parts[1].pattern, StringPattern) 48 | assert chain.parts[1].repeater_start == 2 49 | assert chain.parts[1].repeater_end == 2 50 | 51 | assert isinstance(chain.parts[2].pattern, StringPattern) 52 | assert chain.parts[2].repeater_start == 1 53 | assert chain.parts[2].repeater_end == 3 54 | 55 | assert isinstance(chain.parts[3].pattern, StringPattern) 56 | assert chain.parts[3].repeater_start == 0 57 | assert chain.parts[3].repeater_end == 1 58 | 59 | assert isinstance(chain.parts[4].pattern, RePattern) 60 | assert chain.parts[4].repeater_start == 1 61 | assert chain.parts[4].repeater_end is None 62 | 63 | 64 | def test_chain_defaults(): 65 | rebulk = Rebulk() 66 | rebulk.defaults(validator=lambda x: x.value.startswith('t'), ignore_names=['testIgnore'], children=True) 67 | 68 | rebulk.chain() \ 69 | .regex("(?Ptest)") \ 70 | .regex(" ").repeater("*") \ 71 | .regex("(?Pbest)") \ 72 | .regex(" ").repeater("*") \ 73 | .regex("(?PtestIgnore)") 74 | matches = rebulk.matches("test best testIgnore") 75 | 76 | assert len(matches) == 1 77 | assert matches[0].name == "test" 78 | 79 | 80 | def test_chain_with_validators(): 81 | def chain_validator(match): 82 | return match.value.startswith('t') and match.value.endswith('t') 83 | 84 | def default_validator(match): 85 | return match.value.startswith('t') and match.value.endswith('g') 86 | 87 | def custom_validator(match): 88 | return match.value.startswith('b') and match.value.endswith('t') 89 | 90 | rebulk = Rebulk() 91 | rebulk.defaults(children=True, validator=default_validator) 92 | 93 | rebulk.chain(validate_all=True, validator={'__parent__': chain_validator}) \ 94 | .regex("(?Ptesting)", validator=default_validator).repeater("+") \ 95 | .regex(" ").repeater("+") \ 96 | .regex("(?Pbest)", validator=custom_validator).repeater("+") 97 | matches = 
rebulk.matches("some testing best end") 98 | 99 | assert len(matches) == 2 100 | assert matches[0].name == "test" 101 | assert matches[1].name == "best" 102 | 103 | 104 | def test_matches_docs(): 105 | rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE) \ 106 | .defaults(children=True, formatter={'episode': int, 'version': int}) \ 107 | .chain() \ 108 | .regex(r'e(?P\d{1,4})').repeater(1) \ 109 | .regex(r'v(?P\d+)').repeater('?') \ 110 | .regex(r'[ex-](?P\d{1,4})').repeater('*') \ 111 | .close() # .repeater(1) could be omitted as it's the default behavior 112 | 113 | result = rebulk.matches("This is E14v2-15-16-17").to_dict() # converts matches to dict 114 | 115 | assert 'episode' in result 116 | assert result['episode'] == [14, 15, 16, 17] 117 | assert 'version' in result 118 | assert result['version'] == 2 119 | 120 | 121 | def test_matches(): 122 | rebulk = Rebulk() 123 | 124 | def digit(input_string): 125 | i = input_string.find("1849") 126 | if i > -1: 127 | return i, i + len("1849") 128 | 129 | input_string = "1849testtestxxfixfux_foxabc1849testtestxoptionalfoxabc" 130 | 131 | chain = rebulk.chain() \ 132 | .functional(digit) \ 133 | .string("test").hidden().repeater(2) \ 134 | .string("x").hidden().repeater('{1,3}') \ 135 | .string("optional").hidden().repeater('?') \ 136 | .regex("f.?x", name='result').repeater('+') \ 137 | .close() 138 | 139 | matches = chain.matches(input_string) 140 | 141 | assert len(matches) == 2 142 | children = matches[0].children 143 | 144 | assert children[0].value == '1849' 145 | assert children[1].value == 'fix' 146 | assert children[2].value == 'fux' 147 | 148 | children = matches[1].children 149 | assert children[0].value == '1849' 150 | assert children[1].value == 'fox' 151 | 152 | input_string = "_1850testtestxoptionalfoxabc" 153 | matches = chain.matches(input_string) 154 | 155 | assert len(matches) == 0 156 | 157 | input_string = "_1849testtesttesttestxoptionalfoxabc" 158 | matches = chain.matches(input_string) 159 | 160 
| assert len(matches) == 0 161 | 162 | input_string = "_1849testtestxxxxoptionalfoxabc" 163 | matches = chain.matches(input_string) 164 | 165 | assert len(matches) == 0 166 | 167 | input_string = "_1849testtestoptionalfoxabc" 168 | matches = chain.matches(input_string) 169 | 170 | assert len(matches) == 0 171 | 172 | input_string = "_1849testtestxoptionalabc" 173 | matches = chain.matches(input_string) 174 | 175 | assert len(matches) == 0 176 | 177 | input_string = "_1849testtestxoptionalfaxabc" 178 | matches = chain.matches(input_string) 179 | 180 | assert len(matches) == 1 181 | children = matches[0].children 182 | 183 | assert children[0].value == '1849' 184 | assert children[1].value == 'fax' 185 | 186 | 187 | def test_matches_2(): 188 | rebulk = Rebulk() \ 189 | .regex_defaults(flags=re.IGNORECASE) \ 190 | .defaults(children=True, formatter={'episode': int, 'version': int}) \ 191 | .chain() \ 192 | .regex(r'e(?P\d{1,4})') \ 193 | .regex(r'v(?P\d+)').repeater('?') \ 194 | .regex(r'[ex-](?P\d{1,4})').repeater('*') \ 195 | .close() 196 | 197 | matches = rebulk.matches("This is E14v2-15E16x17") 198 | assert len(matches) == 5 199 | 200 | assert matches[0].name == 'episode' 201 | assert matches[0].value == 14 202 | 203 | assert matches[1].name == 'version' 204 | assert matches[1].value == 2 205 | 206 | assert matches[2].name == 'episode' 207 | assert matches[2].value == 15 208 | 209 | assert matches[3].name == 'episode' 210 | assert matches[3].value == 16 211 | 212 | assert matches[4].name == 'episode' 213 | assert matches[4].value == 17 214 | 215 | 216 | def test_matches_3(): 217 | alt_dash = (r'@', r'[\W_]') # abbreviation 218 | 219 | match_names = ['season', 'episode'] 220 | other_names = ['screen_size', 'video_codec', 'audio_codec', 'audio_channels', 'container', 'date'] 221 | 222 | rebulk = Rebulk() 223 | rebulk.defaults(formatter={'season': int, 'episode': int}, 224 | tags=['SxxExx'], 225 | abbreviations=[alt_dash], 226 | private_names=['episodeSeparator', 
'seasonSeparator'], 227 | children=True, 228 | private_parent=True, 229 | conflict_solver=lambda match, other: match 230 | if match.name in match_names and other.name in other_names 231 | else '__default__') 232 | 233 | rebulk.chain() \ 234 | .defaults(children=True, private_parent=True) \ 235 | .regex(r'(?P\d+)@?x@?(?P\d+)') \ 236 | .regex(r'(?Px|-|\+|&)(?P\d+)').repeater('*') \ 237 | .close() \ 238 | .chain() \ 239 | .defaults(children=True, private_parent=True) \ 240 | .regex(r'S(?P\d+)@?(?:xE|Ex|E|x)@?(?P\d+)') \ 241 | .regex(r'(?:(?PxE|Ex|E|x|-|\+|&)(?P\d+))').repeater('*') \ 242 | .close() \ 243 | .chain() \ 244 | .defaults(children=True, private_parent=True) \ 245 | .regex(r'S(?P\d+)') \ 246 | .regex(r'(?PS|-|\+|&)(?P\d+)').repeater('*') 247 | 248 | matches = rebulk.matches("test-01x02-03") 249 | assert len(matches) == 3 250 | 251 | assert matches[0].name == 'season' 252 | assert matches[0].value == 1 253 | 254 | assert matches[1].name == 'episode' 255 | assert matches[1].value == 2 256 | 257 | assert matches[2].name == 'episode' 258 | assert matches[2].value == 3 259 | 260 | matches = rebulk.matches("test-S01E02-03") 261 | 262 | assert len(matches) == 3 263 | assert matches[0].name == 'season' 264 | assert matches[0].value == 1 265 | 266 | assert matches[1].name == 'episode' 267 | assert matches[1].value == 2 268 | 269 | assert matches[2].name == 'episode' 270 | assert matches[2].value == 3 271 | 272 | matches = rebulk.matches("test-S01-02-03-04") 273 | 274 | assert len(matches) == 4 275 | assert matches[0].name == 'season' 276 | assert matches[0].value == 1 277 | 278 | assert matches[1].name == 'season' 279 | assert matches[1].value == 2 280 | 281 | assert matches[2].name == 'season' 282 | assert matches[2].value == 3 283 | 284 | assert matches[3].name == 'season' 285 | assert matches[3].value == 4 286 | 287 | 288 | def test_matches_4(): 289 | seps_surround = partial(chars_surround, " ") 290 | 291 | rebulk = Rebulk() 292 | 
rebulk.regex_defaults(flags=re.IGNORECASE) 293 | rebulk.defaults(validate_all=True, children=True) 294 | rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], private_parent=True) 295 | 296 | rebulk.chain(validator={'__parent__': seps_surround}, formatter={'episode': int, 'version': int}) \ 297 | .defaults(formatter={'episode': int, 'version': int}) \ 298 | .regex(r'e(?P\d{1,4})') \ 299 | .regex(r'v(?P\d+)').repeater('?') \ 300 | .regex(r'(?Pe|x|-)(?P\d{1,4})').repeater('*') 301 | 302 | matches = rebulk.matches("Some Series E01E02E03") 303 | assert len(matches) == 3 304 | 305 | assert matches[0].value == 1 306 | assert matches[1].value == 2 307 | assert matches[2].value == 3 308 | 309 | 310 | def test_matches_5(): 311 | seps_surround = partial(chars_surround, " ") 312 | 313 | rebulk = Rebulk() 314 | rebulk.regex_defaults(flags=re.IGNORECASE) 315 | 316 | rebulk.chain(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True, 317 | validator={'__parent__': seps_surround}, children=True, private_parent=True, 318 | formatter={'episode': int, 'version': int}) \ 319 | .defaults(children=True, private_parent=True) \ 320 | .regex(r'e(?P\d{1,4})') \ 321 | .regex(r'v(?P\d+)').repeater('?') \ 322 | .regex(r'(?Pe|x|-)(?P\d{1,4})').repeater('{2,3}') 323 | 324 | matches = rebulk.matches("Some Series E01E02E03") 325 | assert len(matches) == 3 326 | 327 | matches = rebulk.matches("Some Series E01E02") 328 | assert len(matches) == 0 329 | 330 | matches = rebulk.matches("Some Series E01E02E03E04E05E06") # Parent can't be validated, so no results at all 331 | assert len(matches) == 0 332 | 333 | 334 | def test_matches_6(): 335 | rebulk = Rebulk() 336 | rebulk.regex_defaults(flags=re.IGNORECASE) 337 | rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True, 338 | validator=None, children=True, private_parent=True) 339 | 340 | rebulk.chain(formatter={'episode': int, 'version': int}) \ 341 | .defaults(children=True, 
private_parent=True) \ 342 | .regex(r'e(?P\d{1,4})') \ 343 | .regex(r'v(?P\d+)').repeater('?') \ 344 | .regex(r'(?Pe|x|-)(?P\d{1,4})').repeater('{2,3}') 345 | 346 | matches = rebulk.matches("Some Series E01E02E03") 347 | assert len(matches) == 3 348 | 349 | matches = rebulk.matches("Some Series E01E02") 350 | assert len(matches) == 0 351 | 352 | matches = rebulk.matches("Some Series E01E02E03E04E05E06") # No validator on parent, so it should give 4 episodes. 353 | assert len(matches) == 4 354 | 355 | 356 | def test_matches_7(): 357 | seps_surround = partial(chars_surround, ' .-/') 358 | rebulk = Rebulk() 359 | rebulk.regex_defaults(flags=re.IGNORECASE) 360 | rebulk.defaults(children=True, private_parent=True) 361 | 362 | rebulk.chain(). \ 363 | regex(r'S(?P\d+)', validate_all=True, validator={'__parent__': seps_surround}). \ 364 | regex(r'[ -](?P\d+)', validator=seps_surround).repeater('*') 365 | 366 | matches = rebulk.matches("Some S01") 367 | assert len(matches) == 1 368 | matches[0].value = 1 369 | 370 | matches = rebulk.matches("Some S01-02") 371 | assert len(matches) == 2 372 | matches[0].value = 1 373 | matches[1].value = 2 374 | 375 | matches = rebulk.matches("programs4/Some S01-02") 376 | assert len(matches) == 2 377 | matches[0].value = 1 378 | matches[1].value = 2 379 | 380 | matches = rebulk.matches("programs4/SomeS01middle.S02-03.andS04here") 381 | assert len(matches) == 2 382 | matches[0].value = 2 383 | matches[1].value = 3 384 | 385 | matches = rebulk.matches("Some 02.and.S04-05.here") 386 | assert len(matches) == 2 387 | matches[0].value = 4 388 | matches[1].value = 5 389 | 390 | 391 | def test_chain_breaker(): 392 | def chain_breaker(matches): 393 | seasons = matches.named('season') 394 | if len(seasons) > 1: 395 | if seasons[-1].value - seasons[-2].value > 10: 396 | return True 397 | return False 398 | 399 | seps_surround = partial(chars_surround, ' .-/') 400 | rebulk = Rebulk() 401 | rebulk.regex_defaults(flags=re.IGNORECASE) 402 | 
rebulk.defaults(children=True, private_parent=True, formatter={'season': int}) 403 | 404 | rebulk.chain(chain_breaker=chain_breaker). \ 405 | regex(r'S(?P\d+)', validate_all=True, validator={'__parent__': seps_surround}). \ 406 | regex(r'[ -](?P\d+)', validator=seps_surround).repeater('*') 407 | 408 | matches = rebulk.matches("Some S01-02-03-50-51") 409 | assert len(matches) == 3 410 | matches[0].value = 1 411 | matches[1].value = 2 412 | matches[2].value = 3 413 | 414 | 415 | def test_chain_breaker_defaults(): 416 | def chain_breaker(matches): 417 | seasons = matches.named('season') 418 | if len(seasons) > 1: 419 | if seasons[-1].value - seasons[-2].value > 10: 420 | return True 421 | return False 422 | 423 | seps_surround = partial(chars_surround, ' .-/') 424 | rebulk = Rebulk() 425 | rebulk.regex_defaults(flags=re.IGNORECASE) 426 | rebulk.defaults(chain_breaker=chain_breaker, children=True, private_parent=True, formatter={'season': int}) 427 | 428 | rebulk.chain(). \ 429 | regex(r'S(?P\d+)', validate_all=True, validator={'__parent__': seps_surround}). \ 430 | regex(r'[ -](?P\d+)', validator=seps_surround).repeater('*') 431 | 432 | matches = rebulk.matches("Some S01-02-03-50-51") 433 | assert len(matches) == 3 434 | matches[0].value = 1 435 | matches[1].value = 2 436 | matches[2].value = 3 437 | 438 | 439 | def test_chain_breaker_defaults2(): 440 | def chain_breaker(matches): 441 | seasons = matches.named('season') 442 | if len(seasons) > 1: 443 | if seasons[-1].value - seasons[-2].value > 10: 444 | return True 445 | return False 446 | 447 | seps_surround = partial(chars_surround, ' .-/') 448 | rebulk = Rebulk() 449 | rebulk.regex_defaults(flags=re.IGNORECASE) 450 | rebulk.chain_defaults(chain_breaker=chain_breaker) 451 | rebulk.defaults(children=True, private_parent=True, formatter={'season': int}) 452 | 453 | rebulk.chain(). \ 454 | regex(r'S(?P\d+)', validate_all=True, validator={'__parent__': seps_surround}). 
\ 455 | regex(r'[ -](?P\d+)', validator=seps_surround).repeater('*') 456 | 457 | matches = rebulk.matches("Some S01-02-03-50-51") 458 | assert len(matches) == 3 459 | matches[0].value = 1 460 | matches[1].value = 2 461 | matches[2].value = 3 462 | -------------------------------------------------------------------------------- /rebulk/test/test_debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, protected-access, invalid-name, len-as-condition 4 | 5 | from .default_rules_module import RuleRemove0 6 | from .. import debug 7 | from ..match import Match 8 | from ..pattern import StringPattern 9 | from ..rebulk import Rebulk 10 | 11 | 12 | class TestDebug: 13 | # request.addfinalizer(disable_debug) 14 | 15 | debug.DEBUG = True 16 | pattern = StringPattern(1, 3, value="es") 17 | 18 | match = Match(1, 3, value="es") 19 | rule = RuleRemove0() 20 | 21 | input_string = "This is a debug test" 22 | rebulk = Rebulk().string("debug") \ 23 | .string("is") 24 | 25 | matches = rebulk.matches(input_string) 26 | debug.DEBUG = False 27 | 28 | @classmethod 29 | def setup_class(cls): 30 | debug.DEBUG = True 31 | 32 | @classmethod 33 | def teardown_class(cls): 34 | debug.DEBUG = False 35 | 36 | def test_pattern(self): 37 | assert self.pattern.defined_at.lineno > 0 38 | assert self.pattern.defined_at.name == 'rebulk.test.test_debug' 39 | assert self.pattern.defined_at.filename.endswith('test_debug.py') 40 | 41 | assert str(self.pattern.defined_at).startswith('test_debug.py#L') 42 | assert repr(self.pattern).startswith(' 0 46 | assert self.match.defined_at.name == 'rebulk.test.test_debug' 47 | assert self.match.defined_at.filename.endswith('test_debug.py') 48 | 49 | assert str(self.match.defined_at).startswith('test_debug.py#L') 50 | 51 | def test_rule(self): 52 | assert self.rule.defined_at.lineno > 0 53 | assert 
self.rule.defined_at.name == 'rebulk.test.test_debug' 54 | assert self.rule.defined_at.filename.endswith('test_debug.py') 55 | 56 | assert str(self.rule.defined_at).startswith('test_debug.py#L') 57 | assert repr(self.rule).startswith(' 0 61 | assert self.rebulk._patterns[0].defined_at.name == 'rebulk.test.test_debug' 62 | assert self.rebulk._patterns[0].defined_at.filename.endswith('test_debug.py') 63 | 64 | assert str(self.rebulk._patterns[0].defined_at).startswith('test_debug.py#L') 65 | 66 | assert self.rebulk._patterns[1].defined_at.lineno > 0 67 | assert self.rebulk._patterns[1].defined_at.name == 'rebulk.test.test_debug' 68 | assert self.rebulk._patterns[1].defined_at.filename.endswith('test_debug.py') 69 | 70 | assert str(self.rebulk._patterns[1].defined_at).startswith('test_debug.py#L') 71 | 72 | assert self.matches[0].defined_at == self.rebulk._patterns[0].defined_at # pylint: disable=no-member 73 | assert self.matches[1].defined_at == self.rebulk._patterns[1].defined_at # pylint: disable=no-member 74 | 75 | def test_repr(self): 76 | str(self.matches) 77 | -------------------------------------------------------------------------------- /rebulk/test/test_introspector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Introspector tests 5 | """ 6 | # pylint: disable=pointless-statement,missing-docstring,protected-access,invalid-name,len-as-condition 7 | from ..rebulk import Rebulk 8 | from .. 
import introspector 9 | from .default_rules_module import RuleAppend2, RuleAppend3 10 | 11 | 12 | def test_string_introspector(): 13 | rebulk = Rebulk().string('One', 'Two', 'Three', name='first').string('1', '2', '3', name='second') 14 | 15 | introspected = introspector.introspect(rebulk, None) 16 | 17 | assert len(introspected.patterns) == 2 18 | 19 | first_properties = introspected.patterns[0].properties 20 | assert len(first_properties) == 1 21 | first_properties['first'] == ['One', 'Two', 'Three'] 22 | 23 | second_properties = introspected.patterns[1].properties 24 | assert len(second_properties) == 1 25 | second_properties['second'] == ['1', '2', '3'] 26 | 27 | properties = introspected.properties 28 | assert len(properties) == 2 29 | assert properties['first'] == first_properties['first'] 30 | assert properties['second'] == second_properties['second'] 31 | 32 | 33 | def test_string_properties(): 34 | rebulk = Rebulk()\ 35 | .string('One', 'Two', 'Three', name='first', properties={'custom': ['One']})\ 36 | .string('1', '2', '3', name='second', properties={'custom': [1]}) 37 | 38 | introspected = introspector.introspect(rebulk, None) 39 | 40 | assert len(introspected.patterns) == 2 41 | assert len(introspected.rules) == 2 42 | 43 | first_properties = introspected.patterns[0].properties 44 | assert len(first_properties) == 1 45 | first_properties['custom'] == ['One'] 46 | 47 | second_properties = introspected.patterns[1].properties 48 | assert len(second_properties) == 1 49 | second_properties['custom'] == [1] 50 | 51 | properties = introspected.properties 52 | assert len(properties) == 1 53 | assert properties['custom'] == ['One', 1] 54 | 55 | 56 | def test_various_pattern(): 57 | rebulk = Rebulk()\ 58 | .regex('One', 'Two', 'Three', name='first', value="string") \ 59 | .string('1', '2', '3', name='second', value="digit") \ 60 | .string('4', '5', '6', name='third') \ 61 | .string('private', private=True) \ 62 | .functional(lambda string: (0, 5), name='func', 
value='test') \ 63 | .regex('One', 'Two', 'Three', name='regex_name') \ 64 | .regex('(?P<one>One)(?P<two>Two)(?P<three>Three)') \ 65 | .functional(lambda string: (6, 10), name='func2') \ 66 | .string('7', name='third') 67 | 68 | introspected = introspector.introspect(rebulk, None) 69 | 70 | assert len(introspected.patterns) == 8 71 | assert len(introspected.rules) == 2 72 | 73 | first_properties = introspected.patterns[0].properties 74 | assert len(first_properties) == 1 75 | first_properties['first'] == ['string'] 76 | 77 | second_properties = introspected.patterns[1].properties 78 | assert len(second_properties) == 1 79 | second_properties['second'] == ['digit'] 80 | 81 | third_properties = introspected.patterns[2].properties 82 | assert len(third_properties) == 1 83 | third_properties['third'] == ['4', '5', '6'] 84 | 85 | func_properties = introspected.patterns[3].properties 86 | assert len(func_properties) == 1 87 | func_properties['func'] == ['test'] 88 | 89 | regex_name_properties = introspected.patterns[4].properties 90 | assert len(regex_name_properties) == 1 91 | regex_name_properties['regex_name'] == [None] 92 | 93 | regex_groups_properties = introspected.patterns[5].properties 94 | assert len(regex_groups_properties) == 3 95 | regex_groups_properties['one'] == [None] 96 | regex_groups_properties['two'] == [None] 97 | regex_groups_properties['three'] == [None] 98 | 99 | func2_properties = introspected.patterns[6].properties 100 | assert len(func2_properties) == 1 101 | func2_properties['func2'] == [None] 102 | 103 | append_third_properties = introspected.patterns[7].properties 104 | assert len(append_third_properties) == 1 105 | append_third_properties['third'] == [None] 106 | 107 | properties = introspected.properties 108 | assert len(properties) == 9 109 | assert properties['first'] == first_properties['first'] 110 | assert properties['second'] == second_properties['second'] 111 | assert properties['third'] == third_properties['third'] + append_third_properties['third']
112 | assert properties['func'] == func_properties['func'] 113 | assert properties['regex_name'] == regex_name_properties['regex_name'] 114 | assert properties['one'] == regex_groups_properties['one'] 115 | assert properties['two'] == regex_groups_properties['two'] 116 | assert properties['three'] == regex_groups_properties['three'] 117 | assert properties['func2'] == func2_properties['func2'] 118 | 119 | 120 | def test_rule_properties(): 121 | rebulk = Rebulk(default_rules=False).rules(RuleAppend2, RuleAppend3) 122 | 123 | introspected = introspector.introspect(rebulk, None) 124 | 125 | assert len(introspected.rules) == 2 126 | assert len(introspected.patterns) == 0 127 | 128 | rule_properties = introspected.rules[0].properties 129 | assert len(rule_properties) == 1 130 | assert rule_properties['renamed'] == [None] 131 | 132 | rule_properties = introspected.rules[1].properties 133 | assert len(rule_properties) == 1 134 | assert rule_properties['renamed'] == [None] 135 | 136 | properties = introspected.properties 137 | assert len(properties) == 1 138 | assert properties['renamed'] == [None] 139 | -------------------------------------------------------------------------------- /rebulk/test/test_loose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition 4 | 5 | from ..loose import call 6 | 7 | 8 | def test_loose_function(): 9 | 10 | def func(v1, v2, v3=3, v4=4): 11 | return v1 + v2 + v3 + v4 12 | 13 | assert call(func, 1, 2) == func(1, 2) 14 | assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5) 15 | assert call(func, 1, 2, v3=4, v4=5) == func(1, 2, v3=4, v4=5) 16 | assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4) 17 | assert call(func, 1, 2, 3, 4, more=5) == func(1, 2, 3, 4) 18 | 19 | 20 | def test_loose_varargs_function(): 21 | def func(v1, v2, *args): 22 | return v1 + v2 + args[0] if 
len(args) > 0 else 3 + args[1] if len(args) > 1 else 4 23 | 24 | assert call(func, 1, 2) == func(1, 2) 25 | assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5) 26 | assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4) 27 | 28 | 29 | def test_loose_kwargs_function(): 30 | def func(v1, v2, **kwargs): 31 | return v1 + v2 + kwargs.get('v3', 3) + kwargs.get('v4', 4) 32 | 33 | assert call(func, v1=1, v2=2) == func(v1=1, v2=2) 34 | assert call(func, v1=1, v2=2, v3=3, v4=5) == func(v1=1, v2=2, v3=3, v4=5) 35 | 36 | 37 | def test_loose_class(): 38 | class Dummy: 39 | def __init__(self, v1, v2, v3=3, v4=4): 40 | self.v1 = v1 41 | self.v2 = v2 42 | self.v3 = v3 43 | self.v4 = v4 44 | 45 | def call(self): 46 | return self.v1 + self.v2 + self.v3 + self.v4 47 | 48 | assert call(Dummy, 1, 2).call() == Dummy(1, 2).call() 49 | assert call(Dummy, 1, 2, 3, 5).call() == Dummy(1, 2, 3, 5).call() 50 | assert call(Dummy, 1, 2, v3=4, v4=5).call() == Dummy(1, 2, v3=4, v4=5).call() 51 | assert call(Dummy, 1, 2, 3, 4, 5).call() == Dummy(1, 2, 3, 4).call() 52 | assert call(Dummy, 1, 2, 3, 4, more=5).call() == Dummy(1, 2, 3, 4).call() 53 | 54 | 55 | def test_loose_varargs_class(): 56 | class Dummy: 57 | def __init__(self, v1, v2, *args): 58 | self.v1 = v1 59 | self.v2 = v2 60 | self.v3 = args[0] if len(args) > 0 else 3 61 | self.v4 = args[1] if len(args) > 1 else 4 62 | 63 | def call(self): 64 | return self.v1 + self.v2 + self.v3 + self.v4 65 | 66 | assert call(Dummy, 1, 2).call() == Dummy(1, 2).call() 67 | assert call(Dummy, 1, 2, 3, 5).call() == Dummy(1, 2, 3, 5).call() 68 | assert call(Dummy, 1, 2, 3, 4, 5).call() == Dummy(1, 2, 3, 4).call() 69 | 70 | 71 | def test_loose_kwargs_class(): 72 | class Dummy: 73 | def __init__(self, v1, v2, **kwargs): 74 | self.v1 = v1 75 | self.v2 = v2 76 | self.v3 = kwargs.get('v3', 3) 77 | self.v4 = kwargs.get('v4', 4) 78 | 79 | def call(self): 80 | return self.v1 + self.v2 + self.v3 + self.v4 81 | 82 | assert call(Dummy, v1=1, v2=2).call() == Dummy(v1=1, 
v2=2).call() 83 | assert call(Dummy, v1=1, v2=2, v3=3, v4=5).call() == Dummy(v1=1, v2=2, v3=3, v4=5).call() 84 | -------------------------------------------------------------------------------- /rebulk/test/test_match.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, unneeded-not, len-as-condition 4 | 5 | import pytest 6 | 7 | from ..match import Match, Matches 8 | from ..pattern import StringPattern, RePattern 9 | from ..formatters import formatters 10 | 11 | 12 | class TestMatchClass: 13 | def test_repr(self): 14 | match1 = Match(1, 3, value="es") 15 | 16 | assert repr(match1) == '' 17 | 18 | match2 = Match(0, 4, value="test", private=True, name="abc", tags=['one', 'two']) 19 | 20 | assert repr(match2) == '' 21 | 22 | def test_names(self): 23 | parent = Match(0, 10, name="test") 24 | parent.children.append(Match(0, 10, name="child1", parent=parent)) 25 | parent.children.append(Match(0, 10, name="child2", parent=parent)) 26 | 27 | assert set(parent.names) == set(["child1", "child2"]) 28 | 29 | def test_equality(self): 30 | match1 = Match(1, 3, value="es") 31 | match2 = Match(1, 3, value="es") 32 | 33 | other = object() 34 | 35 | assert hash(match1) == hash(match2) 36 | assert hash(match1) != hash(other) 37 | 38 | assert match1 == match2 39 | assert not match1 == other 40 | 41 | def test_inequality(self): 42 | match1 = Match(0, 2, value="te") 43 | match2 = Match(2, 4, value="st") 44 | match3 = Match(0, 2, value="other") 45 | 46 | other = object() 47 | 48 | assert hash(match1) != hash(match2) 49 | assert hash(match1) != hash(match3) 50 | 51 | assert match1 != other 52 | assert match1 != match2 53 | assert match1 != match3 54 | 55 | def test_length(self): 56 | match1 = Match(0, 4, value="test") 57 | match2 = Match(0, 2, value="spanIsUsed") 58 | 59 | assert len(match1) == 4 60 | assert len(match2) == 2 61 | 62 | def 
test_compare(self): 63 | match1 = Match(0, 2, value="te") 64 | match2 = Match(2, 4, value="st") 65 | 66 | other = object() 67 | 68 | assert match1 < match2 69 | assert match1 <= match2 70 | 71 | assert match2 > match1 72 | assert match2 >= match1 73 | 74 | with pytest.raises(TypeError): 75 | match1 < other 76 | 77 | with pytest.raises(TypeError): 78 | match1 <= other 79 | 80 | with pytest.raises(TypeError): 81 | match1 > other 82 | 83 | with pytest.raises(TypeError): 84 | match1 >= other 85 | 86 | 87 | def test_value(self): 88 | match1 = Match(1, 3) 89 | match1.value = "test" 90 | 91 | assert match1.value == "test" 92 | 93 | 94 | class TestMatchesClass: 95 | match1 = Match(0, 2, value="te", name="start") 96 | match2 = Match(2, 3, value="s", tags="tag1") 97 | match3 = Match(3, 4, value="t", tags=["tag1", "tag2"]) 98 | match4 = Match(2, 4, value="st", name="end") 99 | 100 | def test_tag(self): 101 | matches = Matches() 102 | matches.append(self.match1) 103 | matches.append(self.match2) 104 | matches.append(self.match3) 105 | matches.append(self.match4) 106 | 107 | assert "start" in matches.names 108 | assert "end" in matches.names 109 | 110 | assert "tag1" in matches.tags 111 | assert "tag2" in matches.tags 112 | 113 | assert self.match3.tagged("tag1") 114 | assert not self.match3.tagged("start") 115 | 116 | tag1 = matches.tagged("tag1") 117 | assert len(tag1) == 2 118 | assert tag1[0] == self.match2 119 | assert tag1[1] == self.match3 120 | 121 | tag2 = matches.tagged("tag2") 122 | assert len(tag2) == 1 123 | assert tag2[0] == self.match3 124 | 125 | start = matches.named("start") 126 | assert len(start) == 1 127 | assert start[0] == self.match1 128 | 129 | end = matches.named("end") 130 | assert len(end) == 1 131 | assert end[0] == self.match4 132 | 133 | def test_base(self): 134 | matches = Matches() 135 | matches.append(self.match1) 136 | 137 | assert len(matches) == 1 138 | assert repr(matches) == repr([self.match1]) 139 | assert list(matches.starting(0)) == 
[self.match1] 140 | assert list(matches.ending(2)) == [self.match1] 141 | 142 | matches.append(self.match2) 143 | matches.append(self.match3) 144 | matches.append(self.match4) 145 | 146 | assert len(matches) == 4 147 | assert list(matches.starting(2)) == [self.match2, self.match4] 148 | assert list(matches.starting(3)) == [self.match3] 149 | assert list(matches.ending(3)) == [self.match2] 150 | assert list(matches.ending(4)) == [self.match3, self.match4] 151 | assert list(matches.range()) == [self.match1, self.match2, self.match4, self.match3] 152 | assert list(matches.range(0)) == [self.match1, self.match2, self.match4, self.match3] 153 | assert list(matches.range(0, 3)) == [self.match1, self.match2, self.match4] 154 | assert list(matches.range(2, 3)) == [self.match2, self.match4] 155 | assert list(matches.range(3, 4)) == [self.match4, self.match3] 156 | 157 | matches.remove(self.match1) 158 | assert len(matches) == 3 159 | assert len(matches.starting(0)) == 0 160 | assert len(matches.ending(2)) == 0 161 | 162 | matches.clear() 163 | 164 | assert len(matches) == 0 165 | assert len(matches.starting(0)) == 0 166 | assert len(matches.starting(2)) == 0 167 | assert len(matches.starting(3)) == 0 168 | assert len(matches.ending(2)) == 0 169 | assert len(matches.ending(3)) == 0 170 | assert len(matches.ending(4)) == 0 171 | 172 | def test_get_slices(self): 173 | matches = Matches() 174 | matches.append(self.match1) 175 | matches.append(self.match2) 176 | matches.append(self.match3) 177 | matches.append(self.match4) 178 | 179 | slice_matches = matches[1:3] 180 | 181 | assert isinstance(slice_matches, Matches) 182 | 183 | assert len(slice_matches) == 2 184 | assert slice_matches[0] == self.match2 185 | assert slice_matches[1] == self.match3 186 | 187 | def test_remove_slices(self): 188 | matches = Matches() 189 | matches.append(self.match1) 190 | matches.append(self.match2) 191 | matches.append(self.match3) 192 | matches.append(self.match4) 193 | 194 | del matches[1:3] 195 
| 196 | assert len(matches) == 2 197 | assert matches[0] == self.match1 198 | assert matches[1] == self.match4 199 | 200 | def test_set_slices(self): 201 | matches = Matches() 202 | matches.append(self.match1) 203 | matches.append(self.match2) 204 | matches.append(self.match3) 205 | matches.append(self.match4) 206 | 207 | matches[1:3] = self.match1, self.match4 208 | 209 | assert len(matches) == 4 210 | assert matches[0] == self.match1 211 | assert matches[1] == self.match1 212 | assert matches[2] == self.match4 213 | assert matches[3] == self.match4 214 | 215 | def test_set_index(self): 216 | matches = Matches() 217 | matches.append(self.match1) 218 | matches.append(self.match2) 219 | matches.append(self.match3) 220 | 221 | matches[1] = self.match4 222 | 223 | assert len(matches) == 3 224 | assert matches[0] == self.match1 225 | assert matches[1] == self.match4 226 | assert matches[2] == self.match3 227 | 228 | def test_constructor(self): 229 | matches = Matches([self.match1, self.match2, self.match3, self.match4]) 230 | 231 | assert len(matches) == 4 232 | assert list(matches.starting(0)) == [self.match1] 233 | assert list(matches.ending(2)) == [self.match1] 234 | assert list(matches.starting(2)) == [self.match2, self.match4] 235 | assert list(matches.starting(3)) == [self.match3] 236 | assert list(matches.ending(3)) == [self.match2] 237 | assert list(matches.ending(4)) == [self.match3, self.match4] 238 | 239 | def test_constructor_kwargs(self): 240 | matches = Matches([self.match1, self.match2, self.match3, self.match4], input_string="test") 241 | 242 | assert len(matches) == 4 243 | assert matches.input_string == "test" 244 | assert list(matches.starting(0)) == [self.match1] 245 | assert list(matches.ending(2)) == [self.match1] 246 | assert list(matches.starting(2)) == [self.match2, self.match4] 247 | assert list(matches.starting(3)) == [self.match3] 248 | assert list(matches.ending(3)) == [self.match2] 249 | assert list(matches.ending(4)) == [self.match3, 
self.match4] 250 | 251 | def test_crop(self): 252 | input_string = "abcdefghijklmnopqrstuvwxyz" 253 | 254 | match1 = Match(1, 10, input_string=input_string) 255 | match2 = Match(0, 2, input_string=input_string) 256 | match3 = Match(8, 15, input_string=input_string) 257 | 258 | ret = match1.crop([match2, match3.span]) 259 | 260 | assert len(ret) == 1 261 | 262 | assert ret[0].span == (2, 8) 263 | assert ret[0].value == "cdefgh" 264 | 265 | ret = match1.crop((1, 10)) 266 | assert len(ret) == 0 267 | 268 | ret = match1.crop((1, 3)) 269 | assert len(ret) == 1 270 | assert ret[0].span == (3, 10) 271 | 272 | ret = match1.crop((7, 10)) 273 | assert len(ret) == 1 274 | assert ret[0].span == (1, 7) 275 | 276 | ret = match1.crop((0, 12)) 277 | assert len(ret) == 0 278 | 279 | ret = match1.crop((4, 6)) 280 | assert len(ret) == 2 281 | 282 | assert ret[0].span == (1, 4) 283 | assert ret[1].span == (6, 10) 284 | 285 | ret = match1.crop([(3, 5), (7, 9)]) 286 | assert len(ret) == 3 287 | 288 | assert ret[0].span == (1, 3) 289 | assert ret[1].span == (5, 7) 290 | assert ret[2].span == (9, 10) 291 | 292 | def test_split(self): 293 | input_string = "123 +word1 - word2 + word3 456" 294 | match = Match(3, len(input_string) - 3, input_string=input_string) 295 | splitted = match.split(" -+") 296 | 297 | assert len(splitted) == 3 298 | assert [split.value for split in splitted] == ["word1", "word2", "word3"] 299 | 300 | 301 | class TestMaches: 302 | def test_names(self): 303 | input_string = "One Two Three" 304 | 305 | matches = Matches() 306 | 307 | matches.extend(StringPattern("One", name="1-str", tags=["One", "str"]).matches(input_string)) 308 | matches.extend(RePattern("One", name="1-re", tags=["One", "re"]).matches(input_string)) 309 | matches.extend(StringPattern("Two", name="2-str", tags=["Two", "str"]).matches(input_string)) 310 | matches.extend(RePattern("Two", name="2-re", tags=["Two", "re"]).matches(input_string)) 311 | matches.extend(StringPattern("Three", name="3-str", 
tags=["Three", "str"]).matches(input_string)) 312 | matches.extend(RePattern("Three", name="3-re", tags=["Three", "re"]).matches(input_string)) 313 | 314 | assert set(matches.names) == set(["1-str", "1-re", "2-str", "2-re", "3-str", "3-re"]) 315 | 316 | def test_filters(self): 317 | input_string = "One Two Three" 318 | 319 | matches = Matches() 320 | 321 | matches.extend(StringPattern("One", name="1-str", tags=["One", "str"]).matches(input_string)) 322 | matches.extend(RePattern("One", name="1-re", tags=["One", "re"]).matches(input_string)) 323 | matches.extend(StringPattern("Two", name="2-str", tags=["Two", "str"]).matches(input_string)) 324 | matches.extend(RePattern("Two", name="2-re", tags=["Two", "re"]).matches(input_string)) 325 | matches.extend(StringPattern("Three", name="3-str", tags=["Three", "str"]).matches(input_string)) 326 | matches.extend(RePattern("Three", name="3-re", tags=["Three", "re"]).matches(input_string)) 327 | 328 | selection = matches.starting(0) 329 | assert len(selection) == 2 330 | 331 | selection = matches.starting(0, lambda m: "str" in m.tags) 332 | assert len(selection) == 1 333 | assert selection[0].pattern.name == "1-str" 334 | 335 | selection = matches.ending(7, predicate=lambda m: "str" in m.tags) 336 | assert len(selection) == 1 337 | assert selection[0].pattern.name == "2-str" 338 | 339 | selection = matches.previous(matches.named("2-str")[0]) 340 | assert len(selection) == 2 341 | assert selection[0].pattern.name == "1-str" 342 | assert selection[1].pattern.name == "1-re" 343 | 344 | selection = matches.previous(matches.named("2-str", 0), lambda m: "str" in m.tags) 345 | assert len(selection) == 1 346 | assert selection[0].pattern.name == "1-str" 347 | 348 | selection = matches.next(matches.named("2-str", 0)) 349 | assert len(selection) == 2 350 | assert selection[0].pattern.name == "3-str" 351 | assert selection[1].pattern.name == "3-re" 352 | 353 | selection = matches.next(matches.named("2-str", 0), index=0, predicate=lambda 
m: "re" in m.tags) 354 | assert selection is not None 355 | assert selection.pattern.name == "3-re" 356 | 357 | selection = matches.next(matches.named("2-str", index=0), lambda m: "re" in m.tags) 358 | assert len(selection) == 1 359 | assert selection[0].pattern.name == "3-re" 360 | 361 | selection = matches.named("2-str", lambda m: "re" in m.tags) 362 | assert len(selection) == 0 363 | 364 | selection = matches.named("2-re", lambda m: "re" in m.tags, 0) 365 | assert selection is not None 366 | assert selection.name == "2-re" # pylint:disable=no-member 367 | 368 | selection = matches.named("2-re", lambda m: "re" in m.tags) 369 | assert len(selection) == 1 370 | assert selection[0].name == "2-re" 371 | 372 | selection = matches.named("2-re", lambda m: "re" in m.tags, index=1000) 373 | assert selection is None 374 | 375 | def test_raw(self): 376 | input_string = "0123456789" 377 | 378 | match = Match(0, 10, input_string=input_string, formatter=lambda s: s*2) 379 | 380 | assert match.value == match.raw * 2 381 | assert match.raw == input_string 382 | 383 | match.raw_end = 9 384 | match.raw_start = 1 385 | 386 | assert match.value == match.raw * 2 387 | assert match.raw == input_string[1:9] 388 | 389 | match.raw_end = None 390 | match.raw_start = None 391 | 392 | assert match.value == match.raw * 2 393 | assert match.raw == input_string 394 | 395 | 396 | def test_formatter_chain(self): 397 | input_string = "100" 398 | 399 | match = Match(0, 3, input_string=input_string, formatter=formatters(int, lambda s: s*2, lambda s: s+10)) 400 | 401 | assert match.raw == input_string 402 | assert match.value == 100 * 2 + 10 403 | 404 | 405 | def test_to_dict(self): 406 | input_string = "One Two Two Three" 407 | 408 | matches = Matches() 409 | 410 | matches.extend(StringPattern("One", name="1", tags=["One", "str"]).matches(input_string)) 411 | matches.extend(RePattern("One", name="1", tags=["One", "re"]).matches(input_string)) 412 | matches.extend(StringPattern("Two", name="2", 
tags=["Two", "str"]).matches(input_string)) 413 | matches.extend(RePattern("Two", name="2", tags=["Two", "re"]).matches(input_string)) 414 | matches.extend(RePattern("Two", name="2", tags=["Two", "reBis"]).matches(input_string)) 415 | matches.extend(StringPattern("Three", name="3", tags=["Three", "str"]).matches(input_string)) 416 | matches.extend(RePattern("Three", name="3bis", tags=["Three", "re"]).matches(input_string)) 417 | matches.extend(RePattern(r"(\w+)", name="words").matches(input_string)) 418 | 419 | kvalues = matches.to_dict(first_value=True) 420 | assert kvalues == {"1": "One", 421 | "2": "Two", 422 | "3": "Three", 423 | "3bis": "Three", 424 | "words": "One"} 425 | assert kvalues.values_list["words"] == ["One", "Two", "Three"] 426 | 427 | kvalues = matches.to_dict(enforce_list=True) 428 | assert kvalues["words"] == ["One", "Two", "Three"] 429 | 430 | kvalues = matches.to_dict(details=True) 431 | assert kvalues["1"].value == "One" 432 | 433 | assert len(kvalues["2"]) == 2 434 | assert kvalues["2"][0].value == "Two" 435 | assert kvalues["2"][1].value == "Two" 436 | 437 | assert kvalues["3"].value == "Three" 438 | assert kvalues["3bis"].value == "Three" 439 | 440 | assert len(kvalues["words"]) == 4 441 | assert kvalues["words"][0].value == "One" 442 | assert kvalues["words"][1].value == "Two" 443 | assert kvalues["words"][2].value == "Two" 444 | assert kvalues["words"][3].value == "Three" 445 | 446 | kvalues = matches.to_dict(details=True) 447 | assert kvalues["1"].value == "One" 448 | 449 | assert len(kvalues.values_list["2"]) == 2 450 | assert kvalues.values_list["2"][0].value == "Two" 451 | assert kvalues.values_list["2"][1].value == "Two" 452 | 453 | assert kvalues["3"].value == "Three" 454 | assert kvalues["3bis"].value == "Three" 455 | 456 | assert len(kvalues.values_list["words"]) == 4 457 | assert kvalues.values_list["words"][0].value == "One" 458 | assert kvalues.values_list["words"][1].value == "Two" 459 | assert 
kvalues.values_list["words"][2].value == "Two" 460 | assert kvalues.values_list["words"][3].value == "Three" 461 | 462 | def test_chains(self): 463 | input_string = "wordX 10 20 30 40 wordA, wordB, wordC 70 80 wordX" 464 | 465 | matches = Matches(input_string=input_string) 466 | 467 | matches.extend(RePattern(r"\d+", name="digit").matches(input_string)) 468 | matches.extend(RePattern("[a-zA-Z]+", name="word").matches(input_string)) 469 | 470 | assert len(matches) == 11 471 | 472 | a_start = input_string.find('wordA') 473 | 474 | b_start = input_string.find('wordB') 475 | b_end = b_start + len('wordB') 476 | 477 | c_start = input_string.find('wordC') 478 | c_end = c_start + len('wordC') 479 | 480 | chain_before = matches.chain_before(b_start, " ,", predicate=lambda match: match.name == "word") 481 | assert len(chain_before) == 1 482 | assert chain_before[0].value == 'wordA' 483 | 484 | chain_before = matches.chain_before(Match(b_start, b_start), " ,", predicate=lambda match: match.name == "word") 485 | assert len(chain_before) == 1 486 | assert chain_before[0].value == 'wordA' 487 | 488 | chain_before = matches.chain_before(b_start, " ,", predicate=lambda match: match.name == "digit") 489 | assert len(chain_before) == 0 490 | 491 | chain_before = matches.chain_before(a_start, " ,", predicate=lambda match: match.name == "digit") 492 | assert len(chain_before) == 4 493 | assert [match.value for match in chain_before] == ["40", "30", "20", "10"] 494 | 495 | chain_after = matches.chain_after(b_end, " ,", predicate=lambda match: match.name == "word") 496 | assert len(chain_after) == 1 497 | assert chain_after[0].value == 'wordC' 498 | 499 | chain_after = matches.chain_after(Match(b_end, b_end), " ,", predicate=lambda match: match.name == "word") 500 | assert len(chain_after) == 1 501 | assert chain_after[0].value == 'wordC' 502 | 503 | chain_after = matches.chain_after(b_end, " ,", predicate=lambda match: match.name == "digit") 504 | assert len(chain_after) == 0 505 | 
506 | chain_after = matches.chain_after(c_end, " ,", predicate=lambda match: match.name == "digit") 507 | assert len(chain_after) == 2 508 | assert [match.value for match in chain_after] == ["70", "80"] 509 | 510 | chain_after = matches.chain_after(c_end, " ,", end=10000, predicate=lambda match: match.name == "digit") 511 | assert len(chain_after) == 2 512 | assert [match.value for match in chain_after] == ["70", "80"] 513 | 514 | def test_holes(self): 515 | input_string = '1'*10+'2'*10+'3'*10+'4'*10+'5'*10+'6'*10+'7'*10 516 | 517 | hole1 = Match(0, 10, input_string=input_string) 518 | hole2 = Match(20, 30, input_string=input_string) 519 | hole3 = Match(30, 40, input_string=input_string) 520 | hole4 = Match(60, 70, input_string=input_string) 521 | 522 | matches = Matches([hole1, hole2], input_string=input_string) 523 | matches.append(hole3) 524 | matches.append(hole4) 525 | 526 | holes = list(matches.holes()) 527 | assert len(holes) == 2 528 | assert holes[0].span == (10, 20) 529 | assert holes[0].value == '2'*10 530 | assert holes[1].span == (40, 60) 531 | assert holes[1].value == '5' * 10 + '6' * 10 532 | 533 | holes = list(matches.holes(5, 15)) 534 | assert len(holes) == 1 535 | assert holes[0].span == (10, 15) 536 | assert holes[0].value == '2'*5 537 | 538 | holes = list(matches.holes(5, 15, formatter=lambda value: "formatted")) 539 | assert len(holes) == 1 540 | assert holes[0].span == (10, 15) 541 | assert holes[0].value == "formatted" 542 | 543 | holes = list(matches.holes(5, 15, predicate=lambda hole: False)) 544 | assert len(holes) == 0 545 | 546 | def test_holes_empty(self): 547 | input_string = "Test hole on empty matches" 548 | matches = Matches(input_string=input_string) 549 | holes = matches.holes() 550 | assert len(holes) == 1 551 | assert holes[0].value == input_string 552 | 553 | def test_holes_seps(self): 554 | input_string = "Test hole - with many separators + included" 555 | match = StringPattern("many").matches(input_string) 556 | 557 | matches 
= Matches(match, input_string) 558 | holes = matches.holes() 559 | 560 | assert len(holes) == 2 561 | 562 | holes = matches.holes(seps="-+") 563 | 564 | assert len(holes) == 4 565 | assert [hole.value for hole in holes] == ["Test hole ", " with ", " separators ", " included"] 566 | -------------------------------------------------------------------------------- /rebulk/test/test_processors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition 4 | 5 | from ..pattern import StringPattern, RePattern 6 | from ..processors import ConflictSolver 7 | from ..rules import execute_rule 8 | from ..match import Matches 9 | 10 | 11 | def test_conflict_1(): 12 | input_string = "abcdefghijklmnopqrstuvwxyz" 13 | 14 | pattern = StringPattern("ijklmn", "kl", "abcdef", "ab", "ef", "yz") 15 | matches = Matches(pattern.matches(input_string)) 16 | 17 | execute_rule(ConflictSolver(), matches, None) 18 | 19 | values = [x.value for x in matches] 20 | 21 | assert values == ["ijklmn", "abcdef", "yz"] 22 | 23 | 24 | def test_conflict_2(): 25 | input_string = "abcdefghijklmnopqrstuvwxyz" 26 | 27 | pattern = StringPattern("ijklmn", "jklmnopqrst") 28 | matches = Matches(pattern.matches(input_string)) 29 | 30 | execute_rule(ConflictSolver(), matches, None) 31 | 32 | values = [x.value for x in matches] 33 | 34 | assert values == ["jklmnopqrst"] 35 | 36 | 37 | def test_conflict_3(): 38 | input_string = "abcdefghijklmnopqrstuvwxyz" 39 | 40 | pattern = StringPattern("ijklmnopqrst", "jklmnopqrst") 41 | matches = Matches(pattern.matches(input_string)) 42 | 43 | execute_rule(ConflictSolver(), matches, None) 44 | 45 | values = [x.value for x in matches] 46 | 47 | assert values == ["ijklmnopqrst"] 48 | 49 | 50 | def test_conflict_4(): 51 | input_string = "123456789" 52 | 53 | pattern = StringPattern("123", "456789") 54 | matches = 
Matches(pattern.matches(input_string)) 55 | 56 | execute_rule(ConflictSolver(), matches, None) 57 | 58 | values = [x.value for x in matches] 59 | assert values == ["123", "456789"] 60 | 61 | 62 | def test_conflict_5(): 63 | input_string = "123456789" 64 | 65 | pattern = StringPattern("123456", "789") 66 | matches = Matches(pattern.matches(input_string)) 67 | 68 | execute_rule(ConflictSolver(), matches, None) 69 | 70 | values = [x.value for x in matches] 71 | assert values == ["123456", "789"] 72 | 73 | 74 | def test_prefer_longer_parent(): 75 | input_string = "xxx.1x02.xxx" 76 | 77 | re1 = RePattern("([0-9]+)x([0-9]+)", name='prefer', children=True, formatter=int) 78 | re2 = RePattern("x([0-9]+)", name='skip', children=True) 79 | 80 | matches = Matches(re1.matches(input_string)) 81 | matches.extend(re2.matches(input_string)) 82 | 83 | execute_rule(ConflictSolver(), matches, None) 84 | assert len(matches) == 2 85 | assert matches[0].value == 1 86 | assert matches[1].value == 2 87 | 88 | 89 | def test_conflict_solver_1(): 90 | input_string = "123456789" 91 | 92 | re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__') 93 | re2 = StringPattern("34567") 94 | 95 | matches = Matches(re1.matches(input_string)) 96 | matches.extend(re2.matches(input_string)) 97 | 98 | execute_rule(ConflictSolver(), matches, None) 99 | assert len(matches) == 1 100 | assert matches[0].value == "2345678" 101 | 102 | 103 | def test_conflict_solver_2(): 104 | input_string = "123456789" 105 | 106 | re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__') 107 | re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) 108 | 109 | matches = Matches(re1.matches(input_string)) 110 | matches.extend(re2.matches(input_string)) 111 | 112 | execute_rule(ConflictSolver(), matches, None) 113 | assert len(matches) == 1 114 | assert matches[0].value == "34567" 115 | 116 | 117 | def test_conflict_solver_3(): 118 | 
input_string = "123456789" 119 | 120 | re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: match) 121 | re2 = StringPattern("34567") 122 | 123 | matches = Matches(re1.matches(input_string)) 124 | matches.extend(re2.matches(input_string)) 125 | 126 | execute_rule(ConflictSolver(), matches, None) 127 | assert len(matches) == 1 128 | assert matches[0].value == "34567" 129 | 130 | 131 | def test_conflict_solver_4(): 132 | input_string = "123456789" 133 | 134 | re1 = StringPattern("2345678") 135 | re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) 136 | 137 | matches = Matches(re1.matches(input_string)) 138 | matches.extend(re2.matches(input_string)) 139 | 140 | execute_rule(ConflictSolver(), matches, None) 141 | assert len(matches) == 1 142 | assert matches[0].value == "34567" 143 | 144 | 145 | def test_conflict_solver_5(): 146 | input_string = "123456789" 147 | 148 | re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: conflicting) 149 | re2 = StringPattern("34567") 150 | 151 | matches = Matches(re1.matches(input_string)) 152 | matches.extend(re2.matches(input_string)) 153 | 154 | execute_rule(ConflictSolver(), matches, None) 155 | assert len(matches) == 1 156 | assert matches[0].value == "2345678" 157 | 158 | 159 | def test_conflict_solver_6(): 160 | input_string = "123456789" 161 | 162 | re1 = StringPattern("2345678") 163 | re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) 164 | 165 | matches = Matches(re1.matches(input_string)) 166 | matches.extend(re2.matches(input_string)) 167 | 168 | execute_rule(ConflictSolver(), matches, None) 169 | assert len(matches) == 1 170 | assert matches[0].value == "34567" 171 | 172 | 173 | def test_conflict_solver_7(): 174 | input_string = "102" 175 | 176 | re1 = StringPattern("102") 177 | re2 = StringPattern("02") 178 | 179 | matches = Matches(re2.matches(input_string)) 180 | matches.extend(re1.matches(input_string)) 181 | 
182 | execute_rule(ConflictSolver(), matches, None) 183 | assert len(matches) == 1 184 | assert matches[0].value == "102" 185 | 186 | 187 | def test_unresolved(): 188 | input_string = "123456789" 189 | 190 | re1 = StringPattern("23456") 191 | re2 = StringPattern("34567") 192 | 193 | matches = Matches(re1.matches(input_string)) 194 | matches.extend(re2.matches(input_string)) 195 | 196 | execute_rule(ConflictSolver(), matches, None) 197 | assert len(matches) == 2 198 | 199 | re1 = StringPattern("34567") 200 | re2 = StringPattern("2345678", conflict_solver=lambda match, conflicting: None) 201 | 202 | matches = Matches(re1.matches(input_string)) 203 | matches.extend(re2.matches(input_string)) 204 | 205 | execute_rule(ConflictSolver(), matches, None) 206 | assert len(matches) == 2 207 | 208 | re1 = StringPattern("34567", conflict_solver=lambda match, conflicting: None) 209 | re2 = StringPattern("2345678") 210 | 211 | matches = Matches(re1.matches(input_string)) 212 | matches.extend(re2.matches(input_string)) 213 | 214 | execute_rule(ConflictSolver(), matches, None) 215 | assert len(matches) == 2 216 | -------------------------------------------------------------------------------- /rebulk/test/test_rebulk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition 4 | 5 | from ..rebulk import Rebulk 6 | from ..rules import Rule 7 | from . 
import rebulk_rules_module as rm 8 | 9 | 10 | def test_rebulk_simple(): 11 | rebulk = Rebulk() 12 | 13 | rebulk.string("quick") 14 | rebulk.regex("f.x") 15 | 16 | def func(input_string): 17 | i = input_string.find("over") 18 | if i > -1: 19 | return i, i + len("over") 20 | 21 | rebulk.functional(func) 22 | 23 | input_string = "The quick brown fox jumps over the lazy dog" 24 | 25 | matches = rebulk.matches(input_string) 26 | assert len(matches) == 3 27 | 28 | assert matches[0].value == "quick" 29 | assert matches[1].value == "fox" 30 | assert matches[2].value == "over" 31 | 32 | 33 | def test_rebulk_composition(): 34 | rebulk = Rebulk() 35 | 36 | rebulk.string("quick") 37 | rebulk.rebulk(Rebulk().regex("f.x")) 38 | 39 | rebulk.rebulk(Rebulk(disabled=lambda context: True).functional(lambda string: None)) 40 | 41 | input_string = "The quick brown fox jumps over the lazy dog" 42 | 43 | matches = rebulk.matches(input_string) 44 | assert len(matches) == 2 45 | 46 | assert matches[0].value == "quick" 47 | assert matches[1].value == "fox" 48 | 49 | 50 | def test_rebulk_context(): 51 | rebulk = Rebulk() 52 | 53 | context = {'nostring': True, 'word': 'lazy'} 54 | 55 | rebulk.string("quick", disabled=lambda context: context.get('nostring', False)) 56 | rebulk.regex("f.x", disabled=lambda context: context.get('noregex', False)) 57 | 58 | def func(input_string, context): 59 | word = context.get('word', 'over') 60 | i = input_string.find(word) 61 | if i > -1: 62 | return i, i + len(word) 63 | 64 | rebulk.functional(func) 65 | 66 | input_string = "The quick brown fox jumps over the lazy dog" 67 | 68 | matches = rebulk.matches(input_string, context) 69 | assert len(matches) == 2 70 | 71 | assert matches[0].value == "fox" 72 | assert matches[1].value == "lazy" 73 | 74 | 75 | def test_rebulk_prefer_longer(): 76 | input_string = "The quick brown fox jumps over the lazy dog" 77 | 78 | matches = Rebulk().string("quick").string("own").regex("br.{2}n").matches(input_string) 79 | 80 | 
assert len(matches) == 2 81 | 82 | assert matches[0].value == "quick" 83 | assert matches[1].value == "brown" 84 | 85 | 86 | def test_rebulk_defaults(): 87 | input_string = "The quick brown fox jumps over the lazy dog" 88 | 89 | def func(input_string): 90 | i = input_string.find("fox") 91 | if i > -1: 92 | return i, i + len("fox") 93 | 94 | matches = Rebulk()\ 95 | .string_defaults(name="string", tags=["a", "b"])\ 96 | .regex_defaults(name="regex") \ 97 | .functional_defaults(name="functional") \ 98 | .string("quick", tags=["c"])\ 99 | .functional(func)\ 100 | .regex("br.{2}n") \ 101 | .matches(input_string) 102 | assert matches[0].name == "string" 103 | assert matches[0].tags == ["a", "b", "c"] 104 | assert matches[1].name == "functional" 105 | assert matches[2].name == "regex" 106 | 107 | matches = Rebulk() \ 108 | .defaults(name="default", tags=["0"])\ 109 | .string_defaults(name="string", tags=["a", "b"]) \ 110 | .functional_defaults(name="functional", tags=["1"]) \ 111 | .string("quick", tags=["c"]) \ 112 | .functional(func) \ 113 | .regex("br.{2}n") \ 114 | .matches(input_string) 115 | assert matches[0].name == "string" 116 | assert matches[0].tags == ["0", "a", "b", "c"] 117 | assert matches[1].name == "functional" 118 | assert matches[1].tags == ["0", "1"] 119 | assert matches[2].name == "default" 120 | assert matches[2].tags == ["0"] 121 | 122 | 123 | def test_rebulk_defaults_overrides(): 124 | input_string = "The quick brown fox jumps over the lazy dog" 125 | 126 | def func(input_string): 127 | i = input_string.find("fox") 128 | if i > -1: 129 | return i, i + len("fox") 130 | 131 | matches = Rebulk() \ 132 | .string_defaults(name="string", tags=["a", "b"]) \ 133 | .regex_defaults(name="regex", tags=["d"]) \ 134 | .functional_defaults(name="functional") \ 135 | .string("quick", tags=["c"], overrides=["tags"]) \ 136 | .functional(func) \ 137 | .regex("br.{2}n") \ 138 | .matches(input_string) 139 | assert matches[0].name == "string" 140 | assert 
matches[0].tags == ["c"] 141 | assert matches[1].name == "functional" 142 | assert matches[2].name == "regex" 143 | assert matches[2].tags == ["d"] 144 | 145 | matches = Rebulk() \ 146 | .defaults(name="default", tags=["0"]) \ 147 | .string_defaults(name="string", tags=["a", "b"]) \ 148 | .functional_defaults(name="functional", tags=["1"]) \ 149 | .string("quick", tags=["c"]) \ 150 | .functional(func) \ 151 | .regex("br.{2}n") \ 152 | .matches(input_string) 153 | assert matches[0].name == "string" 154 | assert matches[0].tags == ["0", "a", "b", "c"] 155 | assert matches[1].name == "functional" 156 | assert matches[1].tags == ["0", "1"] 157 | assert matches[2].name == "default" 158 | assert matches[2].tags == ["0"] 159 | 160 | 161 | def test_rebulk_rebulk(): 162 | input_string = "The quick brown fox jumps over the lazy dog" 163 | 164 | base = Rebulk().string("quick") 165 | child = Rebulk().string("own").regex("br.{2}n") 166 | 167 | matches = base.rebulk(child).matches(input_string) 168 | 169 | assert len(matches) == 2 170 | 171 | assert matches[0].value == "quick" 172 | assert matches[1].value == "brown" 173 | 174 | 175 | def test_rebulk_no_default(): 176 | input_string = "The quick brown fox jumps over the lazy dog" 177 | 178 | matches = Rebulk(default_rules=False).string("quick").string("own").regex("br.{2}n").matches(input_string) 179 | 180 | assert len(matches) == 3 181 | 182 | assert matches[0].value == "quick" 183 | assert matches[1].value == "own" 184 | assert matches[2].value == "brown" 185 | 186 | 187 | def test_rebulk_empty_match(): 188 | input_string = "The quick brown fox jumps over the lazy dog" 189 | 190 | matches = Rebulk(default_rules=False).string("quick").string("own").regex("br(.*?)own", children=True)\ 191 | .matches(input_string) 192 | 193 | assert len(matches) == 2 194 | 195 | assert matches[0].value == "quick" 196 | assert matches[1].value == "own" 197 | 198 | 199 | def test_rebulk_tags_names(): 200 | rebulk = Rebulk() 201 | 202 | 
rebulk.string("quick", name="str", tags=["first", "other"]) 203 | rebulk.regex("f.x", tags="other") 204 | 205 | def func(input_string): 206 | i = input_string.find("over") 207 | if i > -1: 208 | return i, i + len("over"), {'tags': ['custom']} 209 | 210 | rebulk.functional(func, name="fn") 211 | 212 | def func2(input_string): 213 | i = input_string.find("lazy") 214 | if i > -1: 215 | return {'start': i, 'end': i + len("lazy"), 'tags': ['custom']} 216 | 217 | rebulk.functional(func2, name="fn") 218 | 219 | input_string = "The quick brown fox jumps over the lazy dog" 220 | 221 | matches = rebulk.matches(input_string) 222 | assert len(matches) == 4 223 | 224 | assert len(matches.named("str")) == 1 225 | assert len(matches.named("fn")) == 2 226 | assert len(matches.named("false")) == 0 227 | assert len(matches.tagged("false")) == 0 228 | assert len(matches.tagged("first")) == 1 229 | assert len(matches.tagged("other")) == 2 230 | assert len(matches.tagged("custom")) == 2 231 | 232 | 233 | def test_rebulk_rules_1(): 234 | rebulk = Rebulk() 235 | 236 | rebulk.regex(r'\d{4}', name="year") 237 | rebulk.rules(rm.RemoveAllButLastYear) 238 | 239 | matches = rebulk.matches("1984 keep only last 1968 entry 1982 case") 240 | assert len(matches) == 1 241 | assert matches[0].value == "1982" 242 | 243 | 244 | def test_rebulk_rules_2(): 245 | rebulk = Rebulk() 246 | 247 | rebulk.regex(r'\d{4}', name="year") 248 | rebulk.string(r'year', name="yearPrefix", private=True) 249 | rebulk.string(r'keep', name="yearSuffix", private=True) 250 | rebulk.rules(rm.PrefixedSuffixedYear) 251 | 252 | matches = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982") 253 | assert len(matches) == 2 254 | assert matches[0].value == "1984" 255 | assert matches[1].value == "1968" 256 | 257 | 258 | def test_rebulk_rules_3(): 259 | rebulk = Rebulk() 260 | 261 | rebulk.regex(r'\d{4}', name="year") 262 | rebulk.string(r'year', name="yearPrefix", private=True) 263 | 
rebulk.string(r'keep', name="yearSuffix", private=True) 264 | rebulk.rules(rm.PrefixedSuffixedYearNoLambda) 265 | 266 | matches = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982") 267 | assert len(matches) == 2 268 | assert matches[0].value == "1984" 269 | assert matches[1].value == "1968" 270 | 271 | 272 | def test_rebulk_rules_4(): 273 | class FirstOnlyRule(Rule): 274 | def when(self, matches, context): 275 | grabbed = matches.named("grabbed", 0) 276 | if grabbed and matches.previous(grabbed): 277 | return grabbed 278 | 279 | def then(self, matches, when_response, context): 280 | matches.remove(when_response) 281 | 282 | rebulk = Rebulk() 283 | 284 | rebulk.regex("This match (.*?)grabbed", name="grabbed") 285 | rebulk.regex("if it's (.*?)first match", private=True) 286 | 287 | rebulk.rules(FirstOnlyRule) 288 | 289 | matches = rebulk.matches("This match is grabbed only if it's the first match") 290 | assert len(matches) == 1 291 | assert matches[0].value == "This match is grabbed" 292 | 293 | matches = rebulk.matches("if it's NOT the first match, This match is NOT grabbed") 294 | assert len(matches) == 0 295 | 296 | 297 | class TestMarkers: 298 | def test_one_marker(self): 299 | class MarkerRule(Rule): 300 | def when(self, matches, context): 301 | word_match = matches.named("word", 0) 302 | marker = matches.markers.at_match(word_match, lambda marker: marker.name == "mark1", 0) 303 | if not marker: 304 | return word_match 305 | 306 | def then(self, matches, when_response, context): 307 | matches.remove(when_response) 308 | 309 | rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ 310 | .regex(r'\[.*?\]', marker=True, name="mark2") \ 311 | .string("word", name="word") \ 312 | .rules(MarkerRule) 313 | 314 | matches = rebulk.matches("grab (word) only if it's in parenthesis") 315 | 316 | assert len(matches) == 1 317 | assert matches[0].value == "word" 318 | 319 | matches = rebulk.matches("don't grab [word] if it's in 
braket") 320 | assert len(matches) == 0 321 | 322 | matches = rebulk.matches("don't grab word at all") 323 | assert len(matches) == 0 324 | 325 | def test_multiple_marker(self): 326 | class MarkerRule(Rule): 327 | def when(self, matches, context): 328 | word_match = matches.named("word", 0) 329 | marker = matches.markers.at_match(word_match, 330 | lambda marker: marker.name in ["mark1", "mark2"]) 331 | if len(marker) < 2: 332 | return word_match 333 | 334 | def then(self, matches, when_response, context): 335 | matches.remove(when_response) 336 | 337 | rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ 338 | .regex(r'\[.*?\]', marker=True, name="mark2") \ 339 | .regex("w.*?d", name="word") \ 340 | .rules(MarkerRule) 341 | 342 | matches = rebulk.matches("[grab (word) only] if it's in parenthesis and brakets") 343 | 344 | assert len(matches) == 1 345 | assert matches[0].value == "word" 346 | 347 | matches = rebulk.matches("[don't grab](word)[if brakets are outside]") 348 | assert len(matches) == 0 349 | 350 | matches = rebulk.matches("(grab w[or)d even] if it's partially in parenthesis and brakets") 351 | assert len(matches) == 1 352 | assert matches[0].value == "w[or)d" 353 | 354 | def test_at_index_marker(self): 355 | class MarkerRule(Rule): 356 | def when(self, matches, context): 357 | word_match = matches.named("word", 0) 358 | marker = matches.markers.at_index(word_match.start, 359 | lambda marker: marker.name == "mark1", 0) 360 | if not marker: 361 | return word_match 362 | 363 | def then(self, matches, when_response, context): 364 | matches.remove(when_response) 365 | 366 | rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ 367 | .regex("w.*?d", name="word") \ 368 | .rules(MarkerRule) 369 | 370 | matches = rebulk.matches("gr(ab wo)rd only if starting of match is inside parenthesis") 371 | 372 | assert len(matches) == 1 373 | assert matches[0].value == "wo)rd" 374 | 375 | matches = rebulk.matches("don't grab wo(rd if starting of 
match is not inside parenthesis") 376 | 377 | assert len(matches) == 0 378 | 379 | def test_remove_marker(self): 380 | class MarkerRule(Rule): 381 | def when(self, matches, context): 382 | marker = matches.markers.named("mark1", 0) 383 | if marker: 384 | return marker 385 | 386 | def then(self, matches, when_response, context): 387 | matches.markers.remove(when_response) 388 | 389 | rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ 390 | .regex("w.*?d", name="word") \ 391 | .rules(MarkerRule) 392 | 393 | matches = rebulk.matches("grab word event (if it's not) inside parenthesis") 394 | 395 | assert len(matches) == 1 396 | assert matches[0].value == "word" 397 | 398 | assert not matches.markers 399 | 400 | 401 | class TestUnicode: 402 | def test_rebulk_simple(self): 403 | input_string = "敏捷的棕色狐狸跳過懶狗" 404 | 405 | rebulk = Rebulk() 406 | 407 | rebulk.string("敏") 408 | rebulk.regex("捷") 409 | 410 | def func(input_string): 411 | i = input_string.find("的") 412 | if i > -1: 413 | return i, i + len("的") 414 | 415 | rebulk.functional(func) 416 | 417 | matches = rebulk.matches(input_string) 418 | assert len(matches) == 3 419 | 420 | assert matches[0].value == "敏" 421 | assert matches[1].value == "捷" 422 | assert matches[2].value == "的" 423 | 424 | 425 | class TestImmutable: 426 | def test_starting(self): 427 | input_string = "The quick brown fox jumps over the lazy dog" 428 | matches = Rebulk().string("quick").string("over").string("fox").matches(input_string) 429 | 430 | for i in range(0, len(input_string)): 431 | starting = matches.starting(i) 432 | for match in list(starting): 433 | starting.remove(match) 434 | 435 | assert len(matches) == 3 436 | 437 | def test_ending(self): 438 | input_string = "The quick brown fox jumps over the lazy dog" 439 | matches = Rebulk().string("quick").string("over").string("fox").matches(input_string) 440 | 441 | for i in range(0, len(input_string)): 442 | starting = matches.ending(i) 443 | for match in list(starting): 444 | 
starting.remove(match) 445 | 446 | assert len(matches) == 3 447 | 448 | def test_named(self): 449 | input_string = "The quick brown fox jumps over the lazy dog" 450 | matches = Rebulk().defaults(name='test').string("quick").string("over").string("fox").matches(input_string) 451 | 452 | named = matches.named('test') 453 | for match in list(named): 454 | named.remove(match) 455 | 456 | assert len(named) == 0 457 | assert len(matches) == 3 458 | -------------------------------------------------------------------------------- /rebulk/test/test_rules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, no-member, len-as-condition 4 | import pytest 5 | from rebulk.test.default_rules_module import RuleRemove0, RuleAppend0, RuleRename0, RuleAppend1, RuleRemove1, \ 6 | RuleRename1, RuleAppend2, RuleRename2, RuleAppend3, RuleRename3, RuleAppendTags0, RuleRemoveTags0, \ 7 | RuleAppendTags1, RuleRemoveTags1 8 | 9 | from ..rules import Rules 10 | from ..match import Matches, Match 11 | 12 | from .rules_module import Rule1, Rule2, Rule3, Rule0, Rule1Disabled 13 | from . 
import rules_module as rm 14 | 15 | 16 | def test_rule_priority(): 17 | matches = Matches([Match(1, 2)]) 18 | 19 | rules = Rules(Rule1, Rule2()) 20 | 21 | rules.execute_all_rules(matches, {}) 22 | assert len(matches) == 0 23 | matches = Matches([Match(1, 2)]) 24 | 25 | rules = Rules(Rule1(), Rule0) 26 | 27 | rules.execute_all_rules(matches, {}) 28 | assert len(matches) == 1 29 | assert matches[0] == Match(3, 4) 30 | 31 | 32 | def test_rules_duplicates(): 33 | matches = Matches([Match(1, 2)]) 34 | 35 | rules = Rules(Rule1, Rule1) 36 | 37 | with pytest.raises(ValueError): 38 | rules.execute_all_rules(matches, {}) 39 | 40 | 41 | def test_rule_disabled(): 42 | matches = Matches([Match(1, 2)]) 43 | 44 | rules = Rules(Rule1Disabled(), Rule2()) 45 | 46 | rules.execute_all_rules(matches, {}) 47 | assert len(matches) == 2 48 | assert matches[0] == Match(1, 2) 49 | assert matches[1] == Match(3, 4) 50 | 51 | 52 | def test_rule_when(): 53 | matches = Matches([Match(1, 2)]) 54 | 55 | rules = Rules(Rule3()) 56 | 57 | rules.execute_all_rules(matches, {'when': False}) 58 | assert len(matches) == 1 59 | assert matches[0] == Match(1, 2) 60 | 61 | matches = Matches([Match(1, 2)]) 62 | 63 | rules.execute_all_rules(matches, {'when': True}) 64 | assert len(matches) == 2 65 | assert matches[0] == Match(1, 2) 66 | assert matches[1] == Match(3, 4) 67 | 68 | 69 | class TestDefaultRules: 70 | def test_remove(self): 71 | rules = Rules(RuleRemove0) 72 | 73 | matches = Matches([Match(1, 2)]) 74 | rules.execute_all_rules(matches, {}) 75 | 76 | assert len(matches) == 0 77 | 78 | rules = Rules(RuleRemove1) 79 | 80 | matches = Matches([Match(1, 2)]) 81 | rules.execute_all_rules(matches, {}) 82 | 83 | assert len(matches) == 0 84 | 85 | def test_append(self): 86 | rules = Rules(RuleAppend0) 87 | 88 | matches = Matches([Match(1, 2)]) 89 | rules.execute_all_rules(matches, {}) 90 | 91 | assert len(matches) == 2 92 | 93 | rules = Rules(RuleAppend1) 94 | 95 | matches = Matches([Match(1, 2)]) 96 | 
rules.execute_all_rules(matches, {}) 97 | 98 | assert len(matches) == 2 99 | 100 | rules = Rules(RuleAppend2) 101 | 102 | matches = Matches([Match(1, 2)]) 103 | rules.execute_all_rules(matches, {}) 104 | 105 | assert len(matches) == 2 106 | assert len(matches.named('renamed')) == 1 107 | 108 | rules = Rules(RuleAppend3) 109 | 110 | matches = Matches([Match(1, 2)]) 111 | rules.execute_all_rules(matches, {}) 112 | 113 | assert len(matches) == 2 114 | assert len(matches.named('renamed')) == 1 115 | 116 | def test_rename(self): 117 | rules = Rules(RuleRename0) 118 | 119 | matches = Matches([Match(1, 2, name='original')]) 120 | rules.execute_all_rules(matches, {}) 121 | 122 | assert len(matches.named('original')) == 1 123 | assert len(matches.named('renamed')) == 0 124 | 125 | rules = Rules(RuleRename1) 126 | 127 | matches = Matches([Match(5, 10, name='original')]) 128 | rules.execute_all_rules(matches, {}) 129 | 130 | assert len(matches.named('original')) == 0 131 | assert len(matches.named('renamed')) == 1 132 | 133 | rules = Rules(RuleRename2) 134 | 135 | matches = Matches([Match(5, 10, name='original')]) 136 | rules.execute_all_rules(matches, {}) 137 | 138 | assert len(matches.named('original')) == 0 139 | assert len(matches.named('renamed')) == 1 140 | 141 | rules = Rules(RuleRename3) 142 | 143 | matches = Matches([Match(5, 10, name='original')]) 144 | rules.execute_all_rules(matches, {}) 145 | 146 | assert len(matches.named('original')) == 0 147 | assert len(matches.named('renamed')) == 1 148 | 149 | def test_append_tags(self): 150 | rules = Rules(RuleAppendTags0) 151 | 152 | matches = Matches([Match(1, 2, name='tags', tags=['other'])]) 153 | rules.execute_all_rules(matches, {}) 154 | 155 | assert len(matches.named('tags')) == 1 156 | assert matches.named('tags', index=0).tags == ['other', 'new-tag'] 157 | 158 | rules = Rules(RuleAppendTags1) 159 | 160 | matches = Matches([Match(1, 2, name='tags', tags=['other'])]) 161 | rules.execute_all_rules(matches, {}) 162 | 
163 | assert len(matches.named('tags')) == 1 164 | assert matches.named('tags', index=0).tags == ['other', 'new-tag'] 165 | 166 | def test_remove_tags(self): 167 | rules = Rules(RuleRemoveTags0) 168 | 169 | matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])]) 170 | rules.execute_all_rules(matches, {}) 171 | 172 | assert len(matches.named('tags')) == 1 173 | assert matches.named('tags', index=0).tags == ['other'] 174 | 175 | rules = Rules(RuleRemoveTags1) 176 | 177 | matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])]) 178 | rules.execute_all_rules(matches, {}) 179 | 180 | assert len(matches.named('tags')) == 1 181 | assert matches.named('tags', index=0).tags == ['other'] 182 | 183 | 184 | def test_rule_module(): 185 | rules = Rules(rm) 186 | 187 | matches = Matches([Match(1, 2)]) 188 | rules.execute_all_rules(matches, {}) 189 | 190 | assert len(matches) == 1 191 | 192 | 193 | def test_rule_repr(): 194 | assert str(Rule0()) == "" 195 | assert str(Rule1()) == "" 196 | assert str(Rule2()) == "" 197 | assert str(Rule1Disabled()) == "" 198 | -------------------------------------------------------------------------------- /rebulk/test/test_toposort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright 2014 True Blade Systems, Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Original:
#   - https://bitbucket.org/ericvsmith/toposort (1.4)
# Modifications:
#   - port to pytest
# pylint: skip-file

import pytest
from ..toposort import toposort, toposort_flatten, CyclicDependency


class TestCase:
    """Behaviour of toposort() on well-formed, empty and cyclic graphs."""

    def test_simple(self):
        """Items come out grouped in dependency layers; self-dependencies are ignored."""
        wanted = [{3, 5, 7}, {8, 11}, {2, 9, 10}]

        layers = list(toposort({2: {11}, 9: {11, 8}, 10: {11, 3}, 11: {7, 5}, 8: {7, 3}}))
        assert layers == wanted

        # make sure self dependencies are ignored
        layers = list(toposort({2: {2, 11}, 9: {11, 8}, 10: {10, 11, 3}, 11: {7, 5}, 8: {7, 3}}))
        assert layers == wanted

        assert list(toposort({1: set()})) == [{1}]
        assert list(toposort({1: {1}})) == [{1}]

    def test_no_dependencies(self):
        """Independent chains and fully independent items are layered correctly."""
        assert list(toposort({1: {2}, 3: {4}, 5: {6}})) == [{2, 4, 6}, {1, 3, 5}]
        assert list(toposort({1: set(), 3: set(), 5: set()})) == [{1, 3, 5}]

    def test_empty(self):
        """An empty mapping yields nothing."""
        assert list(toposort({})) == []

    def test_strings(self):
        """String items are sorted the same way as integers."""
        graph = {
            '2': {'11'},
            '9': {'11', '8'},
            '10': {'11', '3'},
            '11': {'7', '5'},
            '8': {'7', '3'},
        }
        wanted = [{'3', '5', '7'}, {'8', '11'}, {'2', '9', '10'}]
        assert list(toposort(graph)) == wanted

    def test_objects(self):
        """Arbitrary hashable objects can be used as items."""
        o2, o3, o5, o7 = object(), object(), object(), object()
        o8, o9, o10, o11 = object(), object(), object(), object()
        graph = {
            o2: {o11},
            o9: {o11, o8},
            o10: {o11, o3},
            o11: {o7, o5},
            o8: {o7, o3, o8},
        }
        wanted = [{o3, o5, o7}, {o8, o11}, {o2, o9, o10}]
        assert list(toposort(graph)) == wanted

    def test_cycle(self):
        """Direct and indirect cycles raise CyclicDependency."""
        # a simple, 2 element cycle
        with pytest.raises(CyclicDependency):
            list(toposort({1: {2}, 2: {1}}))

        # an indirect cycle
        with pytest.raises(CyclicDependency):
            list(toposort({1: {2}, 2: {3}, 3: {1}}))

    def test_input_not_modified(self):
        """The mapping passed in compares equal to its copy after sorting."""
        data = {
            2: {11},
            9: {11, 8},
            10: {11, 3},
            11: {7, 5},
            8: {7, 3, 8},  # includes something self-referential
        }
        # NOTE: dict.copy() is shallow, so both mappings share the same set objects.
        snapshot = data.copy()
        list(toposort(data))
        assert data == snapshot

    def test_input_not_modified_when_cycle_error(self):
        """The mapping passed in compares equal to its copy after a cycle error."""
        data = {
            1: {2},
            2: {1},
            3: {4},
        }
        snapshot = data.copy()
        with pytest.raises(CyclicDependency):
            list(toposort(data))
        assert data == snapshot


class TestCaseAll:
    """Behaviour of toposort_flatten() against toposort()."""

    def test_sort_flatten(self):
        """Flattened output matches the layers, both sorted and unsorted."""
        data = {
            2: {11},
            9: {11, 8},
            10: {11, 3},
            11: {7, 5},
            8: {7, 3, 8},  # includes something self-referential
        }
        wanted = [{3, 5, 7}, {8, 11}, {2, 9, 10}]
        assert list(toposort(data)) == wanted

        # now check the sorted results
        flattened = []
        for layer in wanted:
            flattened.extend(sorted(layer))
        assert toposort_flatten(data) == flattened

        # and the unsorted results. break the results up into groups to compare them
        actual = toposort_flatten(data, False)
        grouped = [set(actual[0:3]), set(actual[3:5]), set(actual[5:8])]
        assert grouped == wanted

# --------------------------------------------------------------------------------
# /rebulk/test/test_validators.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=pointless-statement, missing-docstring, invalid-name,len-as-condition

from functools import partial

from rebulk.pattern import StringPattern

from ..validators import chars_before, chars_after, chars_surround, validators

chars = ' _.'
left = partial(chars_before, chars)
right = partial(chars_after, chars)
surrounding = partial(chars_surround, chars)


def _word_matches(validator, input_string):
    """Return the list of matches of "word" in input_string kept by validator."""
    pattern = StringPattern("word", validator=validator)
    return list(pattern.matches(input_string))


def test_left_chars():
    """The character before the match must be one of `chars`, or the match starts the string."""
    assert len(_word_matches(left, "xxxwordxxx")) == 0
    assert len(_word_matches(left, "xxx_wordxxx")) == 1
    assert len(_word_matches(left, "wordxxx")) == 1


def test_right_chars():
    """The character after the match must be one of `chars`, or the match ends the string."""
    assert len(_word_matches(right, "xxxwordxxx")) == 0
    assert len(_word_matches(right, "xxxword.xxx")) == 1
    assert len(_word_matches(right, "xxxword")) == 1


def test_surrounding_chars():
    """Both sides of the match must satisfy the character constraint."""
    assert len(_word_matches(surrounding, "xxxword xxx")) == 0
    assert len(_word_matches(surrounding, "xxx.wordxxx")) == 0
    assert len(_word_matches(surrounding, "xxx word_xxx")) == 1
    assert len(_word_matches(surrounding, "word")) == 1


def test_chain():
    """validators(left, right) accepts a match only when both validators do."""
    assert len(_word_matches(validators(left, right), "xxxword xxx")) == 0
    assert len(_word_matches(validators(left, right), "xxx.wordxxx")) == 0
    assert len(_word_matches(validators(left, right), "xxx word_xxx")) == 1
    assert len(_word_matches(validators(left, right), "word")) == 1

# --------------------------------------------------------------------------------
# /rebulk/toposort.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2014 True Blade Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Original: 12 | # - https://bitbucket.org/ericvsmith/toposort (1.4) 13 | # Modifications: 14 | # - merged Pull request #2 for CyclicDependency error 15 | # - import reduce as original name 16 | # - support python 2.6 dict comprehension 17 | 18 | # pylint: skip-file 19 | from functools import reduce 20 | 21 | 22 | class CyclicDependency(ValueError): 23 | def __init__(self, cyclic): 24 | s = 'Cyclic dependencies exist among these items: {0}'.format(', '.join(repr(x) for x in cyclic.items())) 25 | super().__init__(s) 26 | self.cyclic = cyclic 27 | 28 | 29 | def toposort(data): 30 | """ 31 | Dependencies are expressed as a dictionary whose keys are items 32 | and whose values are a set of dependent items. Output is a list of 33 | sets in topological order. The first set consists of items with no 34 | dependences, each subsequent set consists of items that depend upon 35 | items in the preceeding sets. 36 | :param data: 37 | :type data: 38 | :return: 39 | :rtype: 40 | """ 41 | 42 | # Special case empty input. 43 | if len(data) == 0: 44 | return 45 | 46 | # Copy the input so as to leave it unmodified. 47 | data = data.copy() 48 | 49 | # Ignore self dependencies. 50 | for k, v in data.items(): 51 | v.discard(k) 52 | # Find all items that don't depend on anything. 53 | extra_items_in_deps = reduce(set.union, data.values()) - set(data.keys()) 54 | # Add empty dependences where needed. 55 | data.update(dict((item, set()) for item in extra_items_in_deps)) 56 | while True: 57 | ordered = set(item for item, dep in data.items() if len(dep) == 0) 58 | if not ordered: 59 | break 60 | yield ordered 61 | data = dict((item, (dep - ordered)) 62 | for item, dep in data.items() 63 | if item not in ordered) 64 | if len(data) != 0: 65 | raise CyclicDependency(data) 66 | 67 | 68 | def toposort_flatten(data, sort=True): 69 | """ 70 | Returns a single list of dependencies. 
For any set returned by 71 | toposort(), those items are sorted and appended to the result (just to 72 | make the results deterministic). 73 | :param data: 74 | :type data: 75 | :param sort: 76 | :type sort: 77 | :return: Single list of dependencies. 78 | :rtype: list 79 | """ 80 | 81 | result = [] 82 | for d in toposort(data): 83 | result.extend((sorted if sort else list)(d)) 84 | return result 85 | -------------------------------------------------------------------------------- /rebulk/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Various utilities functions 5 | """ 6 | from collections.abc import MutableSet 7 | 8 | from types import GeneratorType 9 | 10 | 11 | def find_all(string, sub, start=None, end=None, ignore_case=False, **kwargs): 12 | """ 13 | Return all indices in string s where substring sub is 14 | found, such that sub is contained in the slice s[start:end]. 15 | 16 | >>> list(find_all('The quick brown fox jumps over the lazy dog', 'fox')) 17 | [16] 18 | 19 | >>> list(find_all('The quick brown fox jumps over the lazy dog', 'mountain')) 20 | [] 21 | 22 | >>> list(find_all('The quick brown fox jumps over the lazy dog', 'The')) 23 | [0] 24 | 25 | >>> list(find_all( 26 | ... 'Carved symbols in a mountain hollow on the bank of an inlet irritated an eccentric person', 27 | ... 'an')) 28 | [44, 51, 70] 29 | 30 | >>> list(find_all( 31 | ... 'Carved symbols in a mountain hollow on the bank of an inlet irritated an eccentric person', 32 | ... 'an', 33 | ... 50, 34 | ... 
60)) 35 | [51] 36 | 37 | :param string: the input string 38 | :type string: str 39 | :param sub: the substring 40 | :type sub: str 41 | :return: all indices in the input string 42 | :rtype: __generator[str] 43 | """ 44 | #pylint: disable=unused-argument 45 | if ignore_case: 46 | sub = sub.lower() 47 | string = string.lower() 48 | while True: 49 | start = string.find(sub, start, end) 50 | if start == -1: 51 | return 52 | yield start 53 | start += len(sub) 54 | 55 | 56 | def get_first_defined(data, keys, default_value=None): 57 | """ 58 | Get the first defined key in data. 59 | :param data: 60 | :type data: 61 | :param keys: 62 | :type keys: 63 | :param default_value: 64 | :type default_value: 65 | :return: 66 | :rtype: 67 | """ 68 | for key in keys: 69 | if key in data: 70 | return data[key] 71 | return default_value 72 | 73 | 74 | def is_iterable(obj): 75 | """ 76 | Are we being asked to look up a list of things, instead of a single thing? 77 | We check for the `__iter__` attribute so that this can cover types that 78 | don't have to be known by this module, such as NumPy arrays. 79 | 80 | Strings, however, should be considered as atomic values to look up, not 81 | iterables. 82 | 83 | We don't need to check for the Python 2 `unicode` type, because it doesn't 84 | have an `__iter__` attribute anyway. 85 | """ 86 | # pylint: disable=consider-using-ternary 87 | return hasattr(obj, '__iter__') and not isinstance(obj, str) or isinstance(obj, GeneratorType) 88 | 89 | 90 | def extend_safe(target, source): 91 | """ 92 | Extends source list to target list only if elements doesn't exists in target list. 
def extend_safe(target, source):
    """
    Append to target every element of source that target doesn't contain yet.

    The target list is modified in place and nothing is returned.

    :param target: list receiving the missing elements
    :type target: list
    :param source: elements to add
    :type source: list
    """
    for candidate in source:
        if candidate in target:
            continue
        target.append(candidate)


class _Ref:
    """
    Reference for IdentitySet.

    Wraps a value so that equality and hashing are based on the identity of
    the wrapped object instead of its own __eq__/__hash__.
    """
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        # Let Python fall back to its default comparison instead of failing
        # with AttributeError when compared to something that is not a _Ref.
        if not isinstance(other, _Ref):
            return NotImplemented
        return self.value is other.value

    def __hash__(self):
        return id(self.value)


class IdentitySet(MutableSet):  # pragma: no cover
    """
    Set based on identity.

    Two elements are the same member only if they are the same object, so
    equal-but-distinct (or unhashable) objects can be stored side by side.
    """
    def __init__(self, items=None):  # pylint: disable=super-init-not-called
        initial = () if items is None else items
        # Members are stored wrapped in _Ref to get identity semantics.
        self.refs = {_Ref(item) for item in initial}

    def __contains__(self, elem):
        return _Ref(elem) in self.refs

    def __iter__(self):
        return (ref.value for ref in self.refs)

    def __len__(self):
        return len(self.refs)

    def add(self, value):
        self.refs.add(_Ref(value))

    def discard(self, value):
        self.refs.discard(_Ref(value))

    def update(self, iterable):
        """
        Update set with iterable

        :param iterable: elements to add to the set
        :type iterable: iterable
        :return: nothing, the set is modified in place
        :rtype: None
        """
        for elem in iterable:
            self.add(elem)

    def __repr__(self):  # pragma: no cover
        return f"{type(self).__name__}({list(self)})"
def chars_before(chars, match):
    """
    Validate the match if the character on its left is in a given sequence.

    A match starting at the beginning of the input string is always valid.

    :param chars: accepted characters
    :type chars:
    :param match: match to validate (uses start and input_string)
    :type match:
    :return: True if the match is valid
    :rtype: bool
    """
    position = match.start
    return position <= 0 or match.input_string[position - 1] in chars


def chars_after(chars, match):
    """
    Validate the match if the character on its right is in a given sequence.

    A match ending at the end of the input string is always valid.

    :param chars: accepted characters
    :type chars:
    :param match: match to validate (uses end and input_string)
    :type match:
    :return: True if the match is valid
    :rtype: bool
    """
    position = match.end
    text = match.input_string
    return position >= len(text) or text[position] in chars


def chars_surround(chars, match):
    """
    Validate the match if both surrounding characters are in a given sequence.

    :param chars: accepted characters
    :type chars:
    :param match: match to validate
    :type match:
    :return: True if the match is valid on both sides
    :rtype: bool
    """
    if not chars_before(chars, match):
        return False
    return chars_after(chars, match)


def validators(*chained_validators):
    """
    Creates a validator chain from several validator functions.

    The returned validator accepts a match only if every chained validator
    accepts it; evaluation stops at the first rejection.

    :param chained_validators: validator functions, each taking a match
    :type chained_validators:
    :return: a single validator function taking a match
    :rtype: callable
    """

    def validator_chain(match):  # pylint:disable=missing-docstring
        return all(check(match) for check in chained_validators)

    return validator_chain


def allways_true(match):  # pylint:disable=unused-argument
    """
    A validator which accepts any match.

    :param match: ignored
    :return: True
    """
    return True
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Packaging script for rebulk.

Reads the long description from README.md and CHANGELOG.md, extracts the
version from rebulk/__version__.py and hands everything to setuptools.
"""

import io
import re

from setuptools import setup, find_packages

with io.open('CHANGELOG.md', encoding='utf-8') as f:
    changelog = f.read()

with io.open('README.md', 'r', encoding='utf-8') as f:
    readme = f.read()

install_requires = ['setuptools;python_version>="3.12"']

native_requires = ['regex']

dev_require = ['pytest', 'pylint', 'tox', 'python-semantic-release', 'twine']

tests_require = ['pytest', 'pylint']

# Read with an explicit encoding, like the other files above, so the result
# does not depend on the locale of the build machine.
with io.open('rebulk/__version__.py', 'r', encoding='utf-8') as f:
    version_match = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]$', f.read(), re.MULTILINE)

# Fail with an explicit message rather than an AttributeError on None.
if version_match is None:
    raise RuntimeError('Unable to find __version__ in rebulk/__version__.py')
version = version_match.group(1)

# NOTE(review): tests_require / test_suite are deprecated setuptools keywords;
# they are kept because tox.ini still runs "setup.py test" - confirm before removing.
args = dict(name='rebulk',
            version=version,
            description='Rebulk - Define simple search patterns in bulk to perform advanced matching on any string.',
            long_description=readme + '\n\n' + changelog,
            long_description_content_type='text/markdown',
            # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
            classifiers=['Development Status :: 5 - Production/Stable',
                         'License :: OSI Approved :: MIT License',
                         'Operating System :: OS Independent',
                         'Intended Audience :: Developers',
                         'Programming Language :: Python :: 3',
                         'Programming Language :: Python :: 3.7',
                         'Programming Language :: Python :: 3.8',
                         'Programming Language :: Python :: 3.9',
                         'Programming Language :: Python :: 3.10',
                         'Programming Language :: Python :: 3.11',
                         'Programming Language :: Python :: 3.12',
                         'Topic :: Software Development :: Libraries :: Python Modules'
                         ],
            keywords='re regexp regular expression search pattern string match',
            author='Rémi Alvergnat',
            author_email='toilal.dev@gmail.com',
            url='https://github.com/Toilal/rebulk/',
            download_url='https://pypi.python.org/packages/source/r/rebulk/rebulk-%s.tar.gz' % version,
            license='MIT',
            packages=find_packages(),
            include_package_data=True,
            install_requires=install_requires,
            tests_require=tests_require,
            test_suite='rebulk.test',
            zip_safe=True,
            extras_require={
                'test': tests_require,
                'dev': dev_require,
                'native': native_requires
            }
            )

setup(**args)