├── .coveragerc
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── dev-requirements.txt
├── pylintrc
├── pyproject.toml
├── rebulk
    ├── __init__.py
    ├── __version__.py
    ├── builder.py
    ├── chain.py
    ├── debug.py
    ├── formatters.py
    ├── introspector.py
    ├── loose.py
    ├── match.py
    ├── pattern.py
    ├── processors.py
    ├── rebulk.py
    ├── remodule.py
    ├── rules.py
    ├── test
    │   ├── __init__.py
    │   ├── default_rules_module.py
    │   ├── rebulk_rules_module.py
    │   ├── rules_module.py
    │   ├── test_chain.py
    │   ├── test_debug.py
    │   ├── test_introspector.py
    │   ├── test_loose.py
    │   ├── test_match.py
    │   ├── test_pattern.py
    │   ├── test_processors.py
    │   ├── test_rebulk.py
    │   ├── test_rules.py
    │   ├── test_toposort.py
    │   └── test_validators.py
    ├── toposort.py
    ├── utils.py
    └── validators.py
├── requirements.txt
├── runtests.py
├── setup.py
└── tox.ini


/.coveragerc:
--------------------------------------------------------------------------------
 1 | # .coveragerc to control coverage.py
 2 | [run]
 3 | include =
 4 |     rebulk/*
 5 | omit =
 6 |     rebulk/__version__.py
 7 |     rebulk/test/*
 8 | [report]
 9 | exclude_lines =
10 |     pragma: no cover


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: ci
  2 | on:
  3 |   push: ~
  4 |   pull_request: ~
  5 | jobs:
  6 |   build:
  7 |     if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
  8 |     runs-on: ubuntu-latest
  9 | 
 10 |     strategy:
 11 |       fail-fast: false
 12 |       matrix:
 13 |         python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.8", "pypy-3.9", "pypy-3.10" ]
 14 |         regex: [ "1", "0" ]
 15 | 
 16 |     steps:
 17 |       - name: Setup python ${{ matrix.python-version }}
 18 |         uses: actions/setup-python@v4
 19 |         with:
 20 |           python-version: ${{ matrix.python-version }}
 21 | 
 22 |       - name: Checkout
 23 |         uses: actions/checkout@v4
 24 | 
 25 |       - name: Git User config
 26 |         run: |
 27 |           git config --global user.email "action@github.com"
 28 |           git config --global user.name "github-actions"
 29 | 
 30 |       - name: Install Dependencies
 31 |         run: |
 32 |           pip install -e .[dev,test]
 33 |           pip install coveralls
 34 | 
 35 |       - name: Install regex
 36 |         run: |
 37 |           pip install regex
 38 |         if: ${{ matrix.regex == '1' }}
 39 | 
 40 |       - run: pylint rebulk
 41 | 
 42 |       - run: coverage run -m pytest
 43 |         env:
 44 |           REBULK_REGEX_ENABLED: ${{ matrix.regex }}
 45 | 
 46 |       - run: python setup.py build
 47 | 
 48 |       - name: Coveralls
 49 |         run: coveralls
 50 |         env:
 51 |           COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
 52 | 
 53 |   commitlint:
 54 |     if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
 55 |     runs-on: ubuntu-latest
 56 |     steps:
 57 |       - uses: actions/checkout@v4
 58 |         with:
 59 |           fetch-depth: 0
 60 |       - uses: wagoid/commitlint-github-action@v5
 61 | 
 62 |   release:
 63 |     if: ${{ github.ref == 'refs/heads/master' && github.event_name == 'push' }}
 64 |     needs: build
 65 | 
 66 |     runs-on: ubuntu-latest
 67 | 
 68 |     strategy:
 69 |       fail-fast: false
 70 |       matrix:
 71 |         python-version: [ 3.12 ]
 72 | 
 73 |     steps:
 74 |       - name: Setup python ${{ matrix.python-version }}
 75 |         uses: actions/setup-python@v4
 76 |         with:
 77 |           python-version: ${{ matrix.python-version }}
 78 | 
 79 |       - name: Checkout
 80 |         uses: actions/checkout@v4
 81 |         with:
 82 |           fetch-depth: 0
 83 | 
 84 |       - name: Git User config
 85 |         run: |
 86 |           git config --global user.email "action@github.com"
 87 |           git config --global user.name "github-actions"
 88 | 
 89 |       - name: Install Dependencies
 90 |         run: pip install -e .[dev,test]
 91 | 
 92 |       - name: Install python-semantic-release and twine
 93 |         run: pip install python-semantic-release twine
 94 | 
 95 |       - name: Bump version
 96 |         run: semantic-release version
 97 |         env:
 98 |           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 99 | 
100 |       - name: Upload to pypi
101 |         run: twine upload --username "__token__" --password "${PYPI_TOKEN}" dist/*
102 |         env:
103 |           PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
104 | 
105 |       - name: Publish release
106 |         run: semantic-release publish
107 |         env:
108 |           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
109 | 
110 |       - name: Merge master to develop
111 |         uses: robotology/gh-action-nightly-merge@v1.4.0
112 |         with:
113 |           stable_branch: 'master'
114 |           development_branch: 'develop'
115 |           allow_ff: true
116 |           user_name: github-actions
117 |         env:
118 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
119 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | dist/
 5 | build/
 6 | 
 7 | # Python dist
 8 | *.egg-info/
 9 | .eggs/
10 | 
11 | # Coverage
12 | .coverage
13 | 
14 | # PyEnv
15 | .python-version
16 | 
17 | # Tox
18 | .tox/
19 | 
20 | # py.test
21 | lastfailed
22 | 
23 | # JetBrains
24 | *.iml
25 | .idea/
26 | 
27 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | Changelog
 2 | =========
 3 | 
 4 | <!--next-version-placeholder-->
 5 | 
 6 | ## v3.2.0 (2023-02-18)
 7 | ### Feature
 8 | * **dependencies:** Add python 3.11 support and drop python 3.6 support ([`e4cb0d8`](https://github.com/Toilal/rebulk/commit/e4cb0d854cd8ea80da9abe46d2b3405a873e2020))
 9 | 
10 | ### Fix
11 | * Remove pytest-runner from setup_requires ([`4483d17`](https://github.com/Toilal/rebulk/commit/4483d1777f6a61d20ed83da760663aec67e22042))
12 | 
13 | ## v3.1.0 (2021-11-04)
14 | ### Feature
15 | * **defaults:** Add overrides support ([#25](https://github.com/Toilal/rebulk/issues/25)) ([`f79e5ea`](https://github.com/Toilal/rebulk/commit/f79e5eab0806787ff19a4c668bf9f88413b67288))
16 | * **python:** Add python 3.10 support, drop python 3.5 support ([`a5e6eb7`](https://github.com/Toilal/rebulk/commit/a5e6eb7bba979ee51e1c6c1e186bd224c989dfdc))
17 | 
18 | ## v3.0.1 (2020-12-25)
19 | ### Fix
20 | * **package:** Fix broken package `No such file or directory: 'CHANGELOG.md'` ([#24](https://github.com/Toilal/rebulk/issues/24)) ([`33895ff`](https://github.com/Toilal/rebulk/commit/33895ff358ff5051768fb98d4e840691e7af9bdf))
21 | 
22 | ### Documentation
23 | * **readme:** Add semantic release badge ([`78baca0`](https://github.com/Toilal/rebulk/commit/78baca0c529083d7f583ffec58aeb23734d67ce5))
24 | * **readme:** Fix title ([`d5d4db5`](https://github.com/Toilal/rebulk/commit/d5d4db5cd7f6e2cb1308acd26bfb98838815fad4))
25 | 
26 | ## v3.0.0 (2020-12-23)
27 | ### Feature
28 | * **regex:** Replace REGEX_DISABLED environment variable with REBULK_REGEX_ENABLED ([`d5a8cad`](https://github.com/Toilal/rebulk/commit/d5a8cad6281533ee549a46ca70e1a25e5777eda3))
29 | * Add python 3.8/3.9 support, drop python 2.7/3.4 support ([`048a15f`](https://github.com/Toilal/rebulk/commit/048a15f90833ba8d33ea84d56e9955d31b514dc3))
30 | 
31 | ### Breaking
32 | * regex module is now disabled by default, even if it's available in the python interpreter. You have to set REBULK_REGEX_ENABLED=1 in your environment to enable it, as this module may cause some issues.  ([`d5a8cad`](https://github.com/Toilal/rebulk/commit/d5a8cad6281533ee549a46ca70e1a25e5777eda3))
33 | * Python 2.7 and 3.4 support have been dropped  ([`048a15f`](https://github.com/Toilal/rebulk/commit/048a15f90833ba8d33ea84d56e9955d31b514dc3))
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Rémi Alvergnat
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.py
2 | include *.txt
3 | include *.ini
4 | include *.md
5 | include .coveragerc
6 | include LICENSE
7 | include pylintrc
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ReBulk
  2 | ======
  3 | 
  4 | [![Latest Version](http://img.shields.io/pypi/v/rebulk.svg)](https://pypi.python.org/pypi/rebulk)
  5 | [![MIT License](http://img.shields.io/badge/license-MIT-blue.svg)](https://pypi.python.org/pypi/rebulk)
  6 | [![Build Status](https://img.shields.io/github/actions/workflow/status/Toilal/rebulk/ci.yml)](https://github.com/Toilal/rebulk/actions?query=workflow%3Aci)
  7 | [![Coveralls](http://img.shields.io/coveralls/Toilal/rebulk.svg)](https://coveralls.io/r/Toilal/rebulk?branch=master)
  8 | [![semantic-release](https://img.shields.io/badge/%20%20%F0%9F%93%A6%F0%9F%9A%80-semantic--release-e10079.svg)](https://github.com/relekang/python-semantic-release)
  9 | 
 10 | 
 11 | ReBulk is a python library that performs advanced searches in strings
 12 | that would be hard to implement using [re
 13 | module](https://docs.python.org/3/library/re.html) or [String
 14 | methods](https://docs.python.org/3/library/stdtypes.html#str) only.
 15 | 
 16 | It includes some features like `Patterns`, `Match`, `Rule` that allow
 17 | developers to build a custom and complex string matcher using a readable
 18 | and extendable API.
 19 | 
 20 | This project is hosted on GitHub: <https://github.com/Toilal/rebulk>
 21 | 
 22 | Install
 23 | =======
 24 | 
 25 | ```sh
 26 | $ pip install rebulk
 27 | ```
 28 | 
 29 | Usage
 30 | =====
 31 | 
 32 | Regular expression, string and function based patterns are declared in a
 33 | `Rebulk` object. It uses a fluent API to chain `string`, `regex`, and
 34 | `functional` methods to define various patterns types.
 35 | 
 36 | ```python
 37 | >>> from rebulk import Rebulk
 38 | >>> bulk = Rebulk().string('brown').regex(r'qu\w+').functional(lambda s: (20, 25))
 39 | ```
 40 | 
 41 | When a `Rebulk` object is fully configured, you can call its `matches` method
 42 | with an input string to retrieve all `Match` objects found by the registered
 43 | patterns.
 44 | 
 45 | ```python
 46 | >>> bulk.matches("The quick brown fox jumps over the lazy dog")
 47 | [<brown:(10, 15)>, <quick:(4, 9)>, <jumps:(20, 25)>]
 48 | ```
 49 | 
 50 | If multiple `Match` objects are found at the same position, only the
 51 | longer one is kept.
 52 | 
 53 | ```python
 54 | >>> bulk = Rebulk().string('lakers').string('la')
 55 | >>> bulk.matches("the lakers are from la")
 56 | [<lakers:(4, 10)>, <la:(20, 22)>]
 57 | ```
 58 | 
 59 | String Patterns
 60 | ===============
 61 | 
 62 | String patterns are based on
 63 | [str.find](https://docs.python.org/3/library/stdtypes.html#str.find)
 64 | method to find matches, but return all matches in the string.
 65 | `ignore_case` can be enabled to ignore case.
 66 | 
 67 | ```python
 68 | >>> Rebulk().string('la').matches("lalalilala")
 69 | [<la:(0, 2)>, <la:(2, 4)>, <la:(6, 8)>, <la:(8, 10)>]
 70 | 
 71 | >>> Rebulk().string('la').matches("LalAlilAla")
 72 | [<la:(8, 10)>]
 73 | 
 74 | >>> Rebulk().string('la', ignore_case=True).matches("LalAlilAla")
 75 | [<La:(0, 2)>, <lA:(2, 4)>, <lA:(6, 8)>, <la:(8, 10)>]
 76 | ```
 77 | 
 78 | You can define several patterns with a single `string` method call.
 79 | 
 80 | ```python
 81 | >>> Rebulk().string('Winter', 'coming').matches("Winter is coming...")
 82 | [<Winter:(0, 6)>, <coming:(10, 16)>]
 83 | ```
 84 | 
 85 | Regular Expression Patterns
 86 | ===========================
 87 | 
 88 | Regular Expression patterns are based on a compiled regular expression.
 89 | [re.finditer](https://docs.python.org/3/library/re.html#re.finditer)
 90 | method is used to find matches.
 91 | 
 92 | If [regex module](https://pypi.python.org/pypi/regex) is available, it
 93 | can be used by rebulk instead of default [re
 94 | module](https://docs.python.org/3/library/re.html). Enable it with `REBULK_REGEX_ENABLED=1` environment variable.
 95 | 
 96 | ```python
 97 | >>> Rebulk().regex(r'l\w').matches("lolita")
 98 | [<lo:(0, 2)>, <li:(2, 4)>]
 99 | ```
100 | 
101 | You can define several patterns with a single `regex` method call.
102 | 
103 | ```python
104 | >>> Rebulk().regex(r'Wint\wr', r'com\w{3}').matches("Winter is coming...")
105 | [<Winter:(0, 6)>, <coming:(10, 16)>]
106 | ```
107 | 
108 | All keyword arguments from
109 | [re.compile](https://docs.python.org/3/library/re.html#re.compile) are
110 | supported.
111 | 
112 | ```python
113 | >>> import re  # import required for flags constant
114 | >>> Rebulk().regex('L[A-Z]KERS', flags=re.IGNORECASE) \
115 | ...         .matches("The LaKeRs are from La")
116 | [<LaKeRs:(4, 10)>]
117 | 
118 | >>> Rebulk().regex('L[A-Z]', 'L[A-Z]KERS', flags=re.IGNORECASE) \
119 | ...         .matches("The LaKeRs are from La")
120 | [<La:(20, 22)>, <LaKeRs:(4, 10)>]
121 | 
122 | >>> Rebulk().regex(('L[A-Z]', re.IGNORECASE), ('L[a-z]KeRs')) \
123 | ...         .matches("The LaKeRs are from La")
124 | [<La:(20, 22)>, <LaKeRs:(4, 10)>]
125 | ```
126 | 
127 | If [regex module](https://pypi.python.org/pypi/regex) is available, it
128 | automatically supports repeated captures.
129 | 
130 | ```python
131 | >>> # If regex module is available, repeated_captures is True by default.
132 | >>> matches = Rebulk().regex(r'(\d+)(?:-(\d+))+').matches("01-02-03-04")
133 | >>> matches[0].children # doctest:+SKIP
134 | [<01:(0, 2)>, <02:(3, 5)>, <03:(6, 8)>, <04:(9, 11)>]
135 | 
136 | >>> # If regex module is not available, or if repeated_captures is forced to False.
137 | >>> matches = Rebulk().regex(r'(\d+)(?:-(\d+))+', repeated_captures=False) \
138 | ...                   .matches("01-02-03-04")
139 | >>> matches[0].children
140 | [<01:(0, 2)+initiator=01-02-03-04>, <04:(9, 11)+initiator=01-02-03-04>]
141 | ```
142 | 
143 | -   `abbreviations`
144 | 
145 |     Defined as a list of 2-tuples, each tuple being an abbreviation. It
146 |     simply replaces `tuple[0]` with `tuple[1]` in the expression.
147 | 
148 |     ```python
149 |     >>> Rebulk().regex(r'Custom-separators', abbreviations=[("-", r"[\W_]+")]).matches("Custom_separators using-abbreviations")
150 |     [<Custom_separators:(0, 17)>]
151 |     ```
152 | 
153 | Functional Patterns
154 | ===================
155 | 
156 | Functional Patterns are based on the evaluation of a function.
157 | 
158 | The function should have the same parameters as `Rebulk.matches` method,
159 | that is the input string, and must return at least start index and end
160 | index of the `Match` object.
161 | 
162 | ```python
163 | >>> def func(string):
164 | ...     index = string.find('?')
165 | ...     if index > -1:
166 | ...         return 0, index - 11
167 | >>> Rebulk().functional(func).matches("Why do simple ? Forget about it ...")
168 | [<Why:(0, 3)>]
169 | ```
170 | 
171 | You can also return a dict of keywords arguments for `Match` object.
172 | 
173 | You can define several patterns with a single `functional` method call,
174 | and function used can return multiple matches.
175 | 
176 | Chain Patterns
177 | ==============
178 | 
179 | Chain Patterns are ordered composition of string, functional and regex
180 | patterns. Repeater can be set to define repetition on chain part.
181 | 
182 | ```python
183 | >>> r = Rebulk().regex_defaults(flags=re.IGNORECASE)\
184 | ...             .defaults(children=True, formatter={'episode': int, 'version': int})\
185 | ...             .chain()\
186 | ...             .regex(r'e(?P<episode>\d{1,4})').repeater(1)\
187 | ...             .regex(r'v(?P<version>\d+)').repeater('?')\
188 | ...             .regex(r'[ex-](?P<episode>\d{1,4})').repeater('*')\
189 | ...             .close() # .repeater(1) could be omitted as it's the default behavior
190 | >>> r.matches("This is E14v2-15-16-17").to_dict()  # converts matches to dict
191 | MatchesDict([('episode', [14, 15, 16, 17]), ('version', 2)])
192 | ```
193 | 
194 | Patterns parameters
195 | ===================
196 | 
197 | All patterns have options that can be given as keyword arguments.
198 | 
199 | -   `validator`
200 | 
201 |     Function to validate `Match` value given by the pattern. Can also be
202 |     a `dict`, to use `validator` with pattern named with key.
203 | 
204 |     ```python
205 |     >>> def check_leap_year(match):
206 |     ...     return int(match.value) in [1980, 1984, 1988]
207 |     >>> matches = Rebulk().regex(r'\d{4}', validator=check_leap_year) \
208 |     ...                   .matches("In year 1982 ...")
209 |     >>> len(matches)
210 |     0
211 |     >>> matches = Rebulk().regex(r'\d{4}', validator=check_leap_year) \
212 |     ...                   .matches("In year 1984 ...")
213 |     >>> len(matches)
214 |     1
215 |     ```
216 | 
217 | Some base validator functions are available in `rebulk.validators`
218 | module. Most of those functions have to be configured using
219 | `functools.partial` to map them to function accepting a single `match`
220 | argument.
221 | 
222 | -   `formatter`
223 | 
224 |     Function to convert `Match` value given by the pattern. Can also be
225 |     a `dict`, to use `formatter` with matches named with key.
226 | 
227 |     ```python
228 |     >>> def year_formatter(value):
229 |     ...     return int(value)
230 |     >>> matches = Rebulk().regex(r'\d{4}', formatter=year_formatter) \
231 |     ...                   .matches("In year 1982 ...")
232 |     >>> isinstance(matches[0].value, int)
233 |     True
234 |     ```
235 | 
236 | -   `pre_match_processor` / `post_match_processor`
237 | 
238 |     Function to mutate or invalidate a match generated by a pattern.
239 | 
240 |     Function has a single parameter which is the Match object. If
241 |     function returns False, it will be considered as an invalid match.
242 |     If function returns a match instance, it will replace the original
243 |     match with this instance in the process.
244 | 
245 | -   `post_processor`
246 | 
247 |     Function to change the default output of the pattern. Function
248 |     parameters are Matches list and Pattern object.
249 | 
250 | -   `name`
251 | 
252 |     The name of the pattern. It is automatically passed to `Match`
253 |     objects generated by this pattern.
254 | 
255 | -   `tags`
256 | 
257 |     A list of string that qualifies this pattern.
258 | 
259 | -   `value`
260 | 
261 |     Override value property for generated `Match` objects. Can also be a
262 |     `dict`, to use `value` with pattern named with key.
263 | 
264 | -   `validate_all`
265 | 
266 |     By default, validator is called for returned `Match` objects only.
267 |     Enable this option to validate them all, parent and children
268 |     included.
269 | 
270 | -   `format_all`
271 | 
272 |     By default, formatter is called for returned `Match` values only.
273 |     Enable this option to format them all, parent and children included.
274 | 
275 | -   `disabled`
276 | 
277 |     A `function(context)` to disable the pattern if returning `True`.
278 | 
279 | -   `children`
280 | 
281 |     If `True`, all children `Match` objects will be retrieved instead of
282 |     a single parent `Match` object.
283 | 
284 | -   `private`
285 | 
286 |     If `True`, `Match` objects generated from this pattern are available
287 |     internally only. They will be removed at the end of `Rebulk.matches`
288 |     method call.
289 | 
290 | -   `private_parent`
291 | 
292 |     Force parent matches to be returned and flag them as private.
293 | 
294 | -   `private_children`
295 | 
296 |     Force children matches to be returned and flag them as private.
297 | 
298 | -   `private_names`
299 | 
300 |     Matches names that will be declared as private
301 | 
302 | -   `ignore_names`
303 | 
304 |     Matches names that will be ignored from the pattern output, after
305 |     validation.
306 | 
307 | -   `marker`
308 | 
309 |     If `True`, `Match` objects generated from this pattern will be
310 |     markers matches instead of standard matches. They won't be included
311 |     in `Matches` sequence, but will be available in `Matches.markers`
312 |     sequence (see `Markers` section).
313 | 
314 | Match
315 | =====
316 | 
317 | A `Match` object is the result created by a registered pattern.
318 | 
319 | It has a `value` property defined, and position indices are available
320 | through `start`, `end` and `span` properties.
321 | 
322 | In some cases, it contains children `Match` objects in `children`
323 | property, and each child `Match` object references its parent in `parent`
324 | property. Also, a `name` property can be defined for the match.
325 | 
326 | If groups are defined in a Regular Expression pattern, each group match
327 | will be converted to a single `Match` object. If a group has a name
328 | defined (`(?P<name>group)`), it is set as `name` property in a child
329 | `Match` object. The whole regexp match (`re.group(0)`) will be converted
330 | to the main `Match` object, and all subgroups (1, 2, \... n) will be
331 | converted to `children` matches of the main `Match` object.
332 | 
333 | ```python
334 | >>> matches = Rebulk() \
335 | ...         .regex(r"One, (?P<one>\w+), Two, (?P<two>\w+), Three, (?P<three>\w+)") \
336 | ...         .matches("Zero, 0, One, 1, Two, 2, Three, 3, Four, 4")
337 | >>> matches
338 | [<One, 1, Two, 2, Three, 3:(9, 33)>]
339 | >>> for child in matches[0].children:
340 | ...     '%s = %s' % (child.name, child.value)
341 | 'one = 1'
342 | 'two = 2'
343 | 'three = 3'
344 | ```
345 | 
346 | It's possible to retrieve only children by using the `children` parameter.
347 | You can also customize the way structure is generated with `every`,
348 | `private_parent` and `private_children` parameters.
349 | 
350 | ```python
351 | >>> matches = Rebulk() \
352 | ...         .regex(r"One, (?P<one>\w+), Two, (?P<two>\w+), Three, (?P<three>\w+)", children=True) \
353 | ...         .matches("Zero, 0, One, 1, Two, 2, Three, 3, Four, 4")
354 | >>> matches
355 | [<1:(14, 15)+name=one+initiator=One, 1, Two, 2, Three, 3>, <2:(22, 23)+name=two+initiator=One, 1, Two, 2, Three, 3>, <3:(32, 33)+name=three+initiator=One, 1, Two, 2, Three, 3>]
356 | ```
357 | 
358 | Match object has the following properties that can be given to Pattern
359 | objects
360 | 
361 | -   `formatter`
362 | 
363 |     Function to convert `Match` value given by the pattern. Can also be
364 |     a `dict`, to use `formatter` with matches named with key.
365 | 
366 |     ```python
367 |     >>> def year_formatter(value):
368 |     ...     return int(value)
369 |     >>> matches = Rebulk().regex(r'\d{4}', formatter=year_formatter) \
370 |     ...                   .matches("In year 1982 ...")
371 |     >>> isinstance(matches[0].value, int)
372 |     True
373 |     ```
374 | 
375 | -   `format_all`
376 | 
377 |     By default, formatter is called for returned `Match` values only.
378 |     Enable this option to format them all, parent and children included.
379 | 
380 | -   `conflict_solver`
381 | 
382 |     A `function(match, conflicting_match)` used to solve conflict.
383 |     Returned object will be removed from matches by `ConflictSolver`
384 |     default rule. If `__default__` string is returned, it will fallback
385 |     to default behavior keeping longer match.
386 | 
387 | Matches
388 | =======
389 | 
390 | A `Matches` object holds the result of `Rebulk.matches` method call.
391 | It\'s a sequence of `Match` objects and it behaves like a list.
392 | 
393 | All methods accept a `predicate` function to filter `Match` objects
394 | using a callable, and an `index` int to retrieve a single element from
395 | default returned matches.
396 | 
397 | It has the following additional methods and properties on it.
398 | 
399 | -   `starting(index, predicate=None, index=None)`
400 | 
401 |     Retrieves a list of `Match` objects that starts at given index.
402 | 
403 | -   `ending(index, predicate=None, index=None)`
404 | 
405 |     Retrieves a list of `Match` objects that ends at given index.
406 | 
407 | -   `previous(match, predicate=None, index=None)`
408 | 
409 |     Retrieves a list of `Match` objects that are previous and nearest to
410 |     match.
411 | 
412 | -   `next(match, predicate=None, index=None)`
413 | 
414 |     Retrieves a list of `Match` objects that are next and nearest to
415 |     match.
416 | 
417 | -   `tagged(tag, predicate=None, index=None)`
418 | 
419 |     Retrieves a list of `Match` objects that have the given tag defined.
420 | 
421 | -   `named(name, predicate=None, index=None)`
422 | 
423 |     Retrieves a list of `Match` objects that have the given name.
424 | 
425 | -   `range(start=0, end=None, predicate=None, index=None)`
426 | 
427 |     Retrieves a list of `Match` objects for given range, sorted from
428 |     start to end.
429 | 
430 | -   `holes(start=0, end=None, formatter=None, ignore=None, predicate=None, index=None)`
431 | 
432 |     Retrieves a list of *hole* `Match` objects for given range. A hole
433 |     match is created for each range where no match is available.
434 | 
435 | -   `conflicting(match, predicate=None, index=None)`
436 | 
437 |     Retrieves a list of `Match` objects that conflicts with given match.
438 | 
439 | -   `chain_before(position, seps, start=0, predicate=None, index=None)`
440 | 
441 |     Retrieves a list of chained matches, before position, matching
442 |     predicate and separated by characters from seps only.
443 | 
444 | -   `chain_after(position, seps, end=None, predicate=None, index=None)`
445 | 
446 |     Retrieves a list of chained matches, after position, matching
447 |     predicate and separated by characters from seps only.
448 | 
449 | -   `at_match(match, predicate=None, index=None)`
450 | 
451 |     Retrieves a list of `Match` objects at the same position as match.
452 | 
453 | -   `at_span(span, predicate=None, index=None)`
454 | 
455 |     Retrieves a list of `Match` objects from given (start, end) tuple.
456 | 
457 | -   `at_index(pos, predicate=None, index=None)`
458 | 
459 |     Retrieves a list of `Match` objects from given position.
460 | 
461 | -   `names`
462 | 
463 |     Retrieves a sequence of all `Match.name` properties.
464 | 
465 | -   `tags`
466 | 
467 |     Retrieves a sequence of all `Match.tags` properties.
468 | 
469 | -   `to_dict(details=False, first_value=False, enforce_list=False)`
470 | 
471 |     Convert to an ordered dict, with `Match.name` as key and
472 |     `Match.value` as value.
473 | 
474 |     It\'s a subclass of
475 |     [OrderedDict](https://docs.python.org/3/library/collections.html#collections.OrderedDict),
476 |     that contains a `matches` property which is a dict with `Match.name`
477 |     as key and list of `Match` objects as value.
478 | 
479 |     If `first_value` is `False` and distinct values are found for the
480 |     same name, value will be wrapped to a list. If `True`, first value
481 |     only will be kept and values lists can be retrieved with
482 |     `values_list` which is a dict with `Match.name` as key and list of
483 |     `Match.value` as value.
484 | 
485 |     If `enforce_list` is `True`, all values will be wrapped to a list,
486 |     even if a single value is found.
487 | 
488 |     If `details` is True, `Match.value` objects are replaced with
489 |     complete `Match` object.
490 | 
491 | -   `markers`
492 | 
493 |     A custom `Matches` sequence specialized for `markers` matches (see
494 |     below)
495 | 
496 | Markers
497 | =======
498 | 
499 | If you have defined some patterns with the `marker` option, then
500 | `Matches.markers` points to a special `Matches` sequence that contains
501 | only `markers` matches. This sequence supports all methods from
502 | `Matches`.
503 | 
504 | Markers matches are not intended to be used in final result, but can be
505 | used to implement a `Rule`.
506 | 
507 | Rules
508 | =====
509 | 
510 | Rules are a convenient and readable way to implement advanced
511 | conditional logic involving several `Match` objects. When a rule is
512 | triggered, it can perform an action on `Matches` object, like filtering
513 | out, adding additional tags or renaming.
514 | 
515 | Rules are implemented by extending the abstract `Rule` class. They are
516 | registered using `Rebulk.rules` method by giving either a `Rule`
517 | instance, a `Rule` class or a module containing `Rule` classes only.
518 | 
519 | For a rule to be triggered, `Rule.when` method must return `True`, or a
520 | non empty list of `Match` objects, or any other truthy object. When
521 | triggered, `Rule.then` method is called to perform the action with
522 | `when_response` parameter defined as the response of `Rule.when` call.
523 | 
524 | Instead of implementing `Rule.then` method, you can define `consequence`
525 | class property with a Consequence class or instance, like
526 | `RemoveMatch`, `RenameMatch` or `AppendMatch`. You can also use a list
527 | of consequences when required: `when_response` must then be iterable,
528 | and elements of this iterable will be given to each consequence in the
529 | same order.
530 | 
531 | When many rules are registered, it can be useful to set `priority` class
532 | variable to define a priority integer between all rule executions
533 | (higher priorities will be executed first). You can also define
534 | `dependency` to declare another Rule class as dependency for the current
535 | rule, meaning that it will be executed before.
536 | 
537 | For all rules with the same `priority` value, `when` is called before,
538 | and `then` is called after all.
539 | 
540 | ```python
541 | >>> from rebulk import Rule, RemoveMatch
542 | 
543 | >>> class FirstOnlyRule(Rule):
544 | ...     consequence = RemoveMatch
545 | ...
546 | ...     def when(self, matches, context):
547 | ...         grabbed = matches.named("grabbed", 0)
548 | ...         if grabbed and matches.previous(grabbed):
549 | ...             return grabbed
550 | 
551 | >>> rebulk = Rebulk()
552 | 
553 | >>> rebulk.regex("This match(.*?)grabbed", name="grabbed")
554 | <...Rebulk object ...>
555 | >>> rebulk.regex("if it's(.*?)first match", private=True)
556 | <...Rebulk object at ...>
557 | >>> rebulk.rules(FirstOnlyRule)
558 | <...Rebulk object at ...>
559 | 
560 | >>> rebulk.matches("This match is grabbed only if it's the first match")
561 | [<This match is grabbed:(0, 21)+name=grabbed>]
562 | >>> rebulk.matches("if it's NOT the first match, This match is NOT grabbed")
563 | []
564 | ```
565 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | -e .[dev,test]
2 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.semantic_release]
2 | version_variables = ["rebulk/__version__.py:__version__"]
3 | commit_message = "chore(release): release v{version}"
4 | commit_author = "github-actions <actions@github.com>"
5 | 
6 | [tool.pytest.ini_options]
7 | addopts = "--ignore=setup.py --ignore=build --doctest-modules --doctest-glob='README.rst'"
8 | 


--------------------------------------------------------------------------------
/rebulk/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Define simple search patterns in bulk to perform advanced matching on any string.
 5 | """
 6 | #  pylint:disable=import-self
 7 | from .rebulk import Rebulk
 8 | from .rules import Rule, CustomRule, AppendMatch, RemoveMatch, RenameMatch, AppendTags, RemoveTags
 9 | from .processors import ConflictSolver, PrivateRemover, POST_PROCESS, PRE_PROCESS
10 | from .pattern import REGEX_ENABLED
11 | 


--------------------------------------------------------------------------------
/rebulk/__version__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Version module
5 | """
6 | # pragma: no cover
7 | __version__ = '3.2.0'
8 | 


--------------------------------------------------------------------------------
/rebulk/builder.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Base builder class for Rebulk
  5 | """
  6 | from abc import ABCMeta, abstractmethod
  7 | from contextlib import contextmanager
  8 | from copy import deepcopy
  9 | from logging import getLogger
 10 | 
 11 | from .loose import set_defaults
 12 | from .pattern import RePattern, StringPattern, FunctionalPattern
 13 | 
 14 | log = getLogger(__name__).log
 15 | 
 16 | 
 17 | @contextmanager
 18 | def overrides(kwargs):
 19 |     """
 20 |     Implements override kwarg to restore initial kwarg arguments from overrides list after set_defaults calls.
 21 |     :param kwargs: keyword arguments dict; its 'overrides' entry (an iterable of keys) is popped on entry
 22 |     :return: context manager yielding the backup dict; the backup is written back into kwargs on normal exit
 23 |     """
 24 |     override_keys = kwargs.pop('overrides', None)
 25 |     backup = {}
 26 |     if override_keys:
 27 |         for override_key in override_keys:
 28 |             backup[override_key] = kwargs[override_key]
 29 | 
 30 |     yield backup
 31 | 
 32 |     kwargs.update(backup)
 33 | 
 34 | 
 35 | class Builder(metaclass=ABCMeta):
 36 |     """
 37 |     Base builder class for patterns
 38 |     """
 39 | 
 40 |     def __init__(self):
 41 |         self._defaults = {}
 42 |         self._regex_defaults = {}
 43 |         self._string_defaults = {}
 44 |         self._functional_defaults = {}
 45 |         self._chain_defaults = {}
 46 | 
 47 |     def reset(self):
 48 |         """
 49 |         Reset all defaults.
 50 | 
 51 |         :return:
 52 |         """
 53 |         self.__init__()  # pylint: disable=unnecessary-dunder-call
 54 | 
 55 |     def defaults(self, **kwargs):
 56 |         """
 57 |         Define default keyword arguments for all patterns
 58 |         :param kwargs:
 59 |         :type kwargs:
 60 |         :return:
 61 |         :rtype:
 62 |         """
 63 |         set_defaults(kwargs, self._defaults, override=True)
 64 |         return self
 65 | 
 66 |     def regex_defaults(self, **kwargs):
 67 |         """
 68 |         Define default keyword arguments for regex patterns.
 69 |         :param kwargs: default keyword arguments stored for patterns built with build_re
 70 |         :type kwargs: dict
 71 |         :return: self
 72 |         :rtype: Builder
 73 |         """
 74 |         set_defaults(kwargs, self._regex_defaults, override=True)
 75 |         return self
 76 | 
 77 |     def string_defaults(self, **kwargs):
 78 |         """
 79 |         Define default keyword arguments for string patterns.
 80 |         :param kwargs:
 81 |         :type kwargs:
 82 |         :return:
 83 |         :rtype:
 84 |         """
 85 |         set_defaults(kwargs, self._string_defaults, override=True)
 86 |         return self
 87 | 
 88 |     def functional_defaults(self, **kwargs):
 89 |         """
 90 |         Define default keyword arguments for functional patterns.
 91 |         :param kwargs:
 92 |         :type kwargs:
 93 |         :return:
 94 |         :rtype:
 95 |         """
 96 |         set_defaults(kwargs, self._functional_defaults, override=True)
 97 |         return self
 98 | 
 99 |     def chain_defaults(self, **kwargs):
100 |         """
101 |         Define default keyword arguments for patterns chain.
102 |         :param kwargs:
103 |         :type kwargs:
104 |         :return:
105 |         :rtype:
106 |         """
107 |         set_defaults(kwargs, self._chain_defaults, override=True)
108 |         return self
109 | 
110 |     def build_re(self, *pattern, **kwargs):
111 |         """
112 |         Builds a new regular expression pattern
113 | 
114 |         :param pattern:
115 |         :type pattern:
116 |         :param kwargs:
117 |         :type kwargs:
118 |         :return:
119 |         :rtype:
120 |         """
121 |         with overrides(kwargs):
122 |             set_defaults(self._regex_defaults, kwargs)
123 |             set_defaults(self._defaults, kwargs)
124 | 
125 |         return RePattern(*pattern, **kwargs)
126 | 
127 |     def build_string(self, *pattern, **kwargs):
128 |         """
129 |         Builds a new string pattern
130 | 
131 |         :param pattern:
132 |         :type pattern:
133 |         :param kwargs:
134 |         :type kwargs:
135 |         :return:
136 |         :rtype:
137 |         """
138 |         with overrides(kwargs):
139 |             set_defaults(self._string_defaults, kwargs)
140 |             set_defaults(self._defaults, kwargs)
141 | 
142 |         return StringPattern(*pattern, **kwargs)
143 | 
144 |     def build_functional(self, *pattern, **kwargs):
145 |         """
146 |         Builds a new functional pattern
147 | 
148 |         :param pattern:
149 |         :type pattern:
150 |         :param kwargs:
151 |         :type kwargs:
152 |         :return:
153 |         :rtype:
154 |         """
155 |         with overrides(kwargs):
156 |             set_defaults(self._functional_defaults, kwargs)
157 |             set_defaults(self._defaults, kwargs)
158 | 
159 |         return FunctionalPattern(*pattern, **kwargs)
160 | 
161 |     def build_chain(self, **kwargs):
162 |         """
163 |         Builds a new patterns chain
164 | 
165 |         The new chain gets this builder as parent and deep copies of all its defaults.
166 | 
167 |         :param kwargs: chain keyword arguments, completed with chain defaults then global defaults
168 |         :type kwargs: dict
169 |         :return: the new chain
170 |         :rtype: Chain
171 |         """
172 |         from .chain import Chain  # pylint:disable=import-outside-toplevel,cyclic-import
173 | 
174 |         with overrides(kwargs):
175 |             set_defaults(self._chain_defaults, kwargs)
176 |             set_defaults(self._defaults, kwargs)
177 | 
178 |         chain = Chain(self, **kwargs)
179 |         chain._defaults = deepcopy(self._defaults)  # pylint: disable=protected-access
180 |         chain._regex_defaults = deepcopy(self._regex_defaults)  # pylint: disable=protected-access
181 |         chain._functional_defaults = deepcopy(self._functional_defaults)  # pylint: disable=protected-access
182 |         chain._string_defaults = deepcopy(self._string_defaults)  # pylint: disable=protected-access
183 |         chain._chain_defaults = deepcopy(self._chain_defaults)  # pylint: disable=protected-access
184 | 
185 |         return chain
186 | 
187 |     @abstractmethod
188 |     def pattern(self, *pattern):
189 |         """
190 |         Register a list of Pattern instance
191 |         :param pattern:
192 |         :return:
193 |         """
194 | 
195 |     def regex(self, *pattern, **kwargs):
196 |         """
197 |         Add re pattern
198 | 
199 |         :param pattern:
200 |         :type pattern:
201 |         :return: self
202 |         :rtype: Rebulk
203 |         """
204 |         return self.pattern(self.build_re(*pattern, **kwargs))
205 | 
206 |     def string(self, *pattern, **kwargs):
207 |         """
208 |         Add string pattern
209 | 
210 |         :param pattern:
211 |         :type pattern:
212 |         :return: self
213 |         :rtype: Rebulk
214 |         """
215 |         return self.pattern(self.build_string(*pattern, **kwargs))
216 | 
217 |     def functional(self, *pattern, **kwargs):
218 |         """
219 |         Add functional pattern
220 | 
221 |         :param pattern:
222 |         :type pattern:
223 |         :return: self
224 |         :rtype: Rebulk
225 |         """
226 |         functional = self.build_functional(*pattern, **kwargs)
227 |         return self.pattern(functional)
228 | 
229 |     def chain(self, **kwargs):
230 |         """
231 |         Add patterns chain, using configuration of this rebulk
232 | 
233 |         The chain is built with build_chain and registered through pattern.
234 | 
235 |         :param kwargs: keyword arguments given to build_chain
236 |         :type kwargs: dict
237 |         :return: the new chain (not self)
238 |         :rtype: Chain
239 |         """
240 |         chain = self.build_chain(**kwargs)
241 |         self.pattern(chain)
242 |         return chain
243 | 


--------------------------------------------------------------------------------
/rebulk/chain.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Chain patterns and handle repeating capture groups
  5 | """
  6 | import itertools
  7 | 
  8 | from .builder import Builder
  9 | from .loose import call
 10 | from .match import Match, Matches
 11 | from .pattern import Pattern, filter_match_kwargs, BasePattern
 12 | from .remodule import re
 13 | 
 14 | 
 15 | class _InvalidChainException(Exception):
 16 |     """
 17 |     Internal exception raised when a chain is not valid
 18 |     """
 19 | 
 20 | 
 21 | class Chain(Pattern, Builder):
 22 |     """
 23 |     Definition of a pattern chain to search for.
 24 |     """
 25 | 
 26 |     def __init__(self, parent, chain_breaker=None, **kwargs):  # pylint: disable=super-init-not-called
 27 |         Builder.__init__(self)
 28 |         call(Pattern.__init__, self, **kwargs)
 29 |         self._kwargs = kwargs
 30 |         self._match_kwargs = filter_match_kwargs(kwargs)
 31 |         if callable(chain_breaker):
 32 |             self.chain_breaker = chain_breaker
 33 |         else:
 34 |             self.chain_breaker = None
 35 |         self.parent = parent
 36 |         self.parts = []
 37 | 
 38 |     def pattern(self, *pattern):
 39 |         """
 40 |         Append a single pattern to this chain as a new ChainPart.
 41 |         :param pattern: exactly one pattern; a ValueError is raised for zero or several patterns
 42 |         :return: the ChainPart wrapping the given pattern
 43 |         """
 44 |         if not pattern:
 45 |             raise ValueError("One pattern should be given to the chain")
 46 |         if len(pattern) > 1:
 47 |             raise ValueError("Only one pattern can be given to the chain")
 48 |         part = ChainPart(self, pattern[0])
 49 |         self.parts.append(part)
 50 |         return part
 51 | 
 52 |     def close(self):
 53 |         """
 54 |         Deeply close the chain
 55 |         :return: Rebulk instance
 56 |         """
 57 |         parent = self.parent
 58 |         while isinstance(parent, Chain):
 59 |             parent = parent.parent
 60 |         return parent
 61 | 
 62 |     def _match(self, pattern, input_string, context=None):
 63 |         # pylint: disable=too-many-locals,too-many-nested-blocks
 64 |         chain_matches = []
 65 |         chain_input_string = input_string
 66 |         offset = 0
 67 |         while offset < len(input_string):
 68 |             chain_found = False
 69 |             current_chain_matches = []
 70 |             valid_chain = True
 71 |             for chain_part in self.parts:
 72 |                 try:
 73 |                     chain_part_matches, raw_chain_part_matches = chain_part.matches(chain_input_string,
 74 |                                                                                     context,
 75 |                                                                                     with_raw_matches=True)
 76 | 
 77 |                     chain_found, chain_input_string, offset = \
 78 |                         self._to_next_chain_part(chain_part, chain_part_matches, raw_chain_part_matches, chain_found,
 79 |                                                  input_string, chain_input_string, offset, current_chain_matches)
 80 |                 except _InvalidChainException:
 81 |                     valid_chain = False
 82 |                     if current_chain_matches:
 83 |                         offset = current_chain_matches[0].raw_end
 84 |                     break
 85 |             if not chain_found:
 86 |                 break
 87 |             if current_chain_matches and valid_chain:
 88 |                 match = self._build_chain_match(current_chain_matches, input_string)
 89 |                 chain_matches.append(match)
 90 | 
 91 |         return chain_matches
 92 | 
 93 |     def _to_next_chain_part(self, chain_part, chain_part_matches, raw_chain_part_matches, chain_found,
 94 |                             input_string, chain_input_string, offset, current_chain_matches):
 95 |         Chain._fix_matches_offset(chain_part_matches, input_string, offset)
 96 |         Chain._fix_matches_offset(raw_chain_part_matches, input_string, offset)
 97 | 
 98 |         if raw_chain_part_matches:
 99 |             grouped_matches_dict = self._group_by_match_index(chain_part_matches)
100 |             grouped_raw_matches_dict = self._group_by_match_index(raw_chain_part_matches)
101 | 
102 |             for match_index, grouped_raw_matches in grouped_raw_matches_dict.items():
103 |                 chain_found = True
104 |                 offset = grouped_raw_matches[-1].raw_end
105 |                 chain_input_string = input_string[offset:]
106 | 
107 |                 if not chain_part.is_hidden:
108 |                     grouped_matches = grouped_matches_dict.get(match_index, [])
109 |                     if self._chain_breaker_eval(current_chain_matches + grouped_matches):
110 |                         current_chain_matches.extend(grouped_matches)
111 |         return chain_found, chain_input_string, offset
112 | 
113 |     def _process_match(self, match, match_index, child=False):
114 |         """
115 |         Handle a match; on failure, retry after dropping trailing child groups of the last pattern.
116 |         :param match: chain match to process; its children and end are modified while retrying
117 |         :type match: Match
118 |         :param match_index: passed through to the parent class implementation
119 |         :type match_index:
120 |         :param child: passed through to the parent class implementation
121 |         :type child: bool
122 |         :return: True as soon as the parent class implementation returns a truthy value, else False
123 |         :rtype: bool
124 |         """
125 |         # pylint: disable=too-many-locals
126 |         ret = super()._process_match(match, match_index, child=child)
127 |         if ret:
128 |             return True
129 | 
130 |         if match.children:
131 |             last_pattern = match.children[-1].pattern
132 |             last_pattern_groups = self._group_by_match_index(
133 |                 [child_ for child_ in match.children if child_.pattern == last_pattern]
134 |             )
135 | 
136 |             if last_pattern_groups:
137 |                 original_children = Matches(match.children)
138 |                 original_end = match.end
139 | 
140 |                 for index in reversed(list(last_pattern_groups)):
141 |                     last_matches = last_pattern_groups[index]
142 |                     for last_match in last_matches:
143 |                         match.children.remove(last_match)
144 |                     match.end = match.children[-1].end if match.children else match.start
145 |                     ret = super()._process_match(match, match_index, child=child)
146 |                     if ret:
147 |                         return True
148 | 
149 |                 match.children = original_children  # no attempt succeeded: restore the initial state
150 |                 match.end = original_end
151 | 
152 |         return False
153 | 
154 |     def _build_chain_match(self, current_chain_matches, input_string):
155 |         start = None
156 |         end = None
157 |         for match in current_chain_matches:
158 |             if start is None or start > match.start:
159 |                 start = match.start
160 |             if end is None or end < match.end:
161 |                 end = match.end
162 |         match = call(Match, start, end, pattern=self, input_string=input_string, **self._match_kwargs)
163 |         for chain_match in current_chain_matches:
164 |             if chain_match.children:
165 |                 for child in chain_match.children:
166 |                     match.children.append(child)
167 |             if chain_match not in match.children:
168 |                 match.children.append(chain_match)
169 |                 chain_match.parent = match
170 |         return match
171 | 
172 |     def _chain_breaker_eval(self, matches):
173 |         return not self.chain_breaker or not self.chain_breaker(Matches(matches))
174 | 
175 |     @staticmethod
176 |     def _fix_matches_offset(chain_part_matches, input_string, offset):
177 |         for chain_part_match in chain_part_matches:
178 |             if chain_part_match.input_string != input_string:
179 |                 chain_part_match.input_string = input_string
180 |                 chain_part_match.end += offset
181 |                 chain_part_match.start += offset
182 |             if chain_part_match.children:
183 |                 Chain._fix_matches_offset(chain_part_match.children, input_string, offset)
184 | 
185 |     @staticmethod
186 |     def _group_by_match_index(matches):
187 |         grouped_matches_dict = {}
188 |         for match_index, match in itertools.groupby(matches, lambda m: m.match_index):
189 |             grouped_matches_dict[match_index] = list(match)
190 |         return grouped_matches_dict
191 | 
192 |     @property
193 |     def match_options(self):
194 |         return {}
195 | 
196 |     @property
197 |     def patterns(self):
198 |         return [self]
199 | 
200 |     def __repr__(self):
201 |         defined = ""
202 |         if self.defined_at:
203 |             defined = f"@{self.defined_at}"
204 |         return f"<{self.__class__.__name__}{defined}:{self.parts}>"
205 | 
206 | 
207 | class ChainPart(BasePattern):
208 |     """
209 |     Part of a pattern chain.
210 |     """
211 | 
212 |     def __init__(self, chain, pattern):
213 |         self._chain = chain
214 |         self.pattern = pattern
215 |         self.repeater_start = 1
216 |         self.repeater_end = 1
217 |         self._hidden = False
218 | 
219 |     @property
220 |     def _is_chain_start(self):
221 |         return self._chain.parts[0] == self
222 | 
223 |     def matches(self, input_string, context=None, with_raw_matches=False):
224 |         matches, raw_matches = self.pattern.matches(input_string, context=context, with_raw_matches=True)
225 | 
226 |         matches = self._truncate_repeater(matches, input_string)
227 |         raw_matches = self._truncate_repeater(raw_matches, input_string)
228 | 
229 |         self._validate_repeater(raw_matches)
230 | 
231 |         if with_raw_matches:
232 |             return matches, raw_matches
233 | 
234 |         return matches
235 | 
236 |     def _truncate_repeater(self, matches, input_string):
237 |         if not matches:
238 |             return matches
239 | 
240 |         if not self._is_chain_start:
241 |             separator = input_string[0:matches[0].initiator.raw_start]
242 |             if separator:
243 |                 return []
244 | 
245 |         j = 1
246 |         for i in range(0, len(matches) - 1):
247 |             separator = input_string[matches[i].initiator.raw_end:
248 |                                      matches[i + 1].initiator.raw_start]
249 |             if separator:
250 |                 break
251 |             j += 1
252 |         truncated = matches[:j]
253 |         if self.repeater_end is not None:
254 |             truncated = [m for m in truncated if m.match_index < self.repeater_end]
255 |         return truncated
256 | 
257 |     def _validate_repeater(self, matches):
258 |         max_match_index = -1
259 |         if matches:
260 |             max_match_index = max(m.match_index for m in matches)
261 |         if max_match_index + 1 < self.repeater_start:
262 |             raise _InvalidChainException
263 | 
264 |     def chain(self):
265 |         """
266 |         Add patterns chain, using configuration from this chain
267 | 
268 |         :return:
269 |         :rtype:
270 |         """
271 |         return self._chain.chain()
272 | 
273 |     def hidden(self, hidden=True):
274 |         """
275 |         Hide chain part results from global chain result
276 | 
277 |         :param hidden:
278 |         :type hidden:
279 |         :return:
280 |         :rtype:
281 |         """
282 |         self._hidden = hidden
283 |         return self
284 | 
285 |     @property
286 |     def is_hidden(self):
287 |         """
288 |         Check if the chain part is hidden
289 |         :return:
290 |         :rtype:
291 |         """
292 |         return self._hidden
293 | 
294 |     def regex(self, *pattern, **kwargs):
295 |         """
296 |         Add re pattern
297 | 
298 |         :param pattern:
299 |         :type pattern:
300 |         :param kwargs:
301 |         :type kwargs:
302 |         :return:
303 |         :rtype:
304 |         """
305 |         return self._chain.regex(*pattern, **kwargs)
306 | 
307 |     def functional(self, *pattern, **kwargs):
308 |         """
309 |         Add functional pattern
310 | 
311 |         :param pattern:
312 |         :type pattern:
313 |         :param kwargs:
314 |         :type kwargs:
315 |         :return:
316 |         :rtype:
317 |         """
318 |         return self._chain.functional(*pattern, **kwargs)
319 | 
320 |     def string(self, *pattern, **kwargs):
321 |         """
322 |         Add string pattern
323 | 
324 |         :param pattern:
325 |         :type pattern:
326 |         :param kwargs:
327 |         :type kwargs:
328 |         :return:
329 |         :rtype:
330 |         """
331 |         return self._chain.string(*pattern, **kwargs)
332 | 
333 |     def close(self):
334 |         """
335 |         Close the chain builder to continue registering other patterns
336 | 
337 |         :return:
338 |         :rtype:
339 |         """
340 |         return self._chain.close()
341 | 
342 |     def repeater(self, value):
343 |         """
344 |         Define the repeater of the current chain part.
345 | 
346 |         :param value:
347 |         :type value:
348 |         :return:
349 |         :rtype:
350 |         """
351 |         try:
352 |             value = int(value)
353 |             self.repeater_start = value
354 |             self.repeater_end = value
355 |             return self
356 |         except ValueError:
357 |             pass
358 |         if value == '+':
359 |             self.repeater_start = 1
360 |             self.repeater_end = None
361 |         if value == '*':
362 |             self.repeater_start = 0
363 |             self.repeater_end = None
364 |         elif value == '?':
365 |             self.repeater_start = 0
366 |             self.repeater_end = 1
367 |         else:
368 |             match = re.match(r'\{\s*(\d*)\s*,?\s*(\d*)\s*\}', value)
369 |             if match:
370 |                 start = match.group(1)
371 |                 end = match.group(2)
372 |                 if start or end:
373 |                     self.repeater_start = int(start) if start else 0
374 |                     self.repeater_end = int(end) if end else None
375 |         return self
376 | 
377 |     def __repr__(self):
378 |         return f"{self.pattern}({{{self.repeater_start},{self.repeater_end}}})"
379 | 


--------------------------------------------------------------------------------
/rebulk/debug.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Debug tools.
 5 | 
 6 | Can be configured by changing the values of these variables.
 7 | 
 8 | DEBUG = False
 9 | Enable this variable to activate debug features (like defined_at parameters). It can slow down Rebulk
10 | 
11 | LOG_LEVEL = logging.DEBUG
12 | Default log level of generated rebulk logs.
13 | """
14 | 
15 | import inspect
16 | import logging
17 | import os
18 | from collections import namedtuple
19 | 
20 | 
21 | DEBUG = False
22 | LOG_LEVEL = logging.DEBUG
23 | 
24 | 
25 | class Frame(namedtuple('Frame', ['lineno', 'package', 'name', 'filename'])):
26 |     """
27 |     Stack frame representation.
28 |     """
29 |     __slots__ = ()
30 | 
31 |     def __repr__(self):
32 |         return f"{os.path.basename(self.filename)}#L{self.lineno}"
33 | 
34 | 
35 | def defined_at():
36 |     """
37 |     Get definition location of a pattern or a match (outside of rebulk package).
38 |     :return:
39 |     :rtype:
40 |     """
41 |     if DEBUG:
42 |         frame = inspect.currentframe()
43 |         while frame:
44 |             try:
45 |                 if frame.f_globals['__package__'] != __package__:
46 |                     break
47 |             except KeyError:  # pragma:no cover
48 |                 # If package is missing, consider we are in. Workaround for python 3.3.
49 |                 break
50 |             frame = frame.f_back
51 |         ret = Frame(frame.f_lineno,
52 |                     frame.f_globals.get('__package__'),
53 |                     frame.f_globals.get('__name__'),
54 |                     frame.f_code.co_filename)
55 |         del frame
56 |         return ret
57 | 


--------------------------------------------------------------------------------
/rebulk/formatters.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Formatter functions to use in patterns.
 5 | 
 6 | All those function have last argument as match.value (str).
 7 | """
 8 | 
 9 | 
10 | def formatters(*chained_formatters):
11 |     """
12 |     Chain formatter functions.
13 |     :param chained_formatters: formatter callables, applied in the given order
14 |     :type chained_formatters: callable
15 |     :return: function passing its input through each formatter in turn and returning the final result
16 |     :rtype: callable
17 |     """
18 | 
19 |     def formatters_chain(input_string):  # pylint:disable=missing-docstring
20 |         for chained_formatter in chained_formatters:
21 |             input_string = chained_formatter(input_string)
22 |         return input_string
23 | 
24 |     return formatters_chain
25 | 
26 | 
27 | def default_formatter(input_string):
28 |     """
29 |     Default formatter
30 |     :param input_string: value to format
31 |     :return: input_string, unchanged
32 |     """
33 |     return input_string
34 | 


--------------------------------------------------------------------------------
/rebulk/introspector.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Introspect rebulk object to retrieve capabilities.
  5 | """
  6 | from abc import ABCMeta, abstractmethod
  7 | from collections import defaultdict
  8 | 
  9 | from .pattern import StringPattern, RePattern, FunctionalPattern
 10 | from .utils import extend_safe
 11 | 
 12 | 
 13 | class Description(metaclass=ABCMeta):
 14 |     """
 15 |     Abstract class for a description.
 16 |     """
 17 |     @property
 18 |     @abstractmethod
 19 |     def properties(self):  # pragma: no cover
 20 |         """
 21 |         Properties of described object.
 22 |         :return: all properties that described object can generate grouped by name.
 23 |         :rtype: dict
 24 |         """
 25 | 
 26 | 
 27 | class PatternDescription(Description):
 28 |     """
 29 |     Description of a pattern.
 30 |     """
 31 |     def __init__(self, pattern):  # pylint:disable=too-many-branches
 32 |         self.pattern = pattern
 33 |         self._properties = defaultdict(list)
 34 | 
 35 |         if pattern.properties:
 36 |             for key, values in pattern.properties.items():
 37 |                 extend_safe(self._properties[key], values)
 38 |         elif 'value' in pattern.match_options:
 39 |             self._properties[pattern.name].append(pattern.match_options['value'])
 40 |         elif isinstance(pattern, StringPattern):
 41 |             extend_safe(self._properties[pattern.name], pattern.patterns)
 42 |         elif isinstance(pattern, RePattern):
 43 |             if pattern.name and pattern.name not in pattern.private_names:
 44 |                 extend_safe(self._properties[pattern.name], [None])
 45 |             if not pattern.private_children:
 46 |                 for regex_pattern in pattern.patterns:
 47 |                     for group_name, values in regex_pattern.groupindex.items():
 48 |                         if group_name not in pattern.private_names:
 49 |                             extend_safe(self._properties[group_name], [None])
 50 |         elif isinstance(pattern, FunctionalPattern):
 51 |             if pattern.name and pattern.name not in pattern.private_names:
 52 |                 extend_safe(self._properties[pattern.name], [None])
 53 | 
 54 | 
 55 |     @property
 56 |     def properties(self):
 57 |         """
 58 |         Properties for this rule.
 59 |         :return:
 60 |         :rtype: dict
 61 |         """
 62 |         return self._properties
 63 | 
 64 | 
 65 | class RuleDescription(Description):
 66 |     """
 67 |     Description of a rule.
 68 |     """
 69 |     def __init__(self, rule):
 70 |         self.rule = rule
 71 | 
 72 |         self._properties = defaultdict(list)
 73 | 
 74 |         if rule.properties:
 75 |             for key, values in rule.properties.items():
 76 |                 extend_safe(self._properties[key], values)
 77 | 
 78 |     @property
 79 |     def properties(self):
 80 |         """
 81 |         Properties for this rule.
 82 |         :return:
 83 |         :rtype: dict
 84 |         """
 85 |         return self._properties
 86 | 
 87 | 
 88 | class Introspection(Description):
 89 |     """
 90 |     Introspection results: descriptions of effective non-private, non-marker patterns and of effective rules.
 91 |     """
 92 |     def __init__(self, rebulk, context=None):
 93 |         self.patterns = [PatternDescription(pattern) for pattern in rebulk.effective_patterns(context)
 94 |                          if not pattern.private and not pattern.marker]
 95 |         self.rules = [RuleDescription(rule) for rule in rebulk.effective_rules(context)]
 96 | 
 97 |     @property
 98 |     def properties(self):
 99 |         """
100 |         Properties for Introspection results.
101 |         :return: values of all described patterns, then of all described rules, merged by property name
102 |         :rtype: dict
103 |         """
104 |         properties = defaultdict(list)
105 |         for pattern in self.patterns:
106 |             for key, values in pattern.properties.items():
107 |                 extend_safe(properties[key], values)
108 |         for rule in self.rules:
109 |             for key, values in rule.properties.items():
110 |                 extend_safe(properties[key], values)
111 |         return properties
112 | 
113 | 
114 | def introspect(rebulk, context=None):
115 |     """
116 |     Introspect a Rebulk instance to describe its effective patterns and rules and their properties.
117 |     :param rebulk: instance to introspect
118 |     :type rebulk: Rebulk
119 |     :param context: context forwarded to rebulk.effective_patterns and rebulk.effective_rules
120 |     :type context:
121 |     :return: Introspection instance
122 |     :rtype: Introspection
123 |     """
124 |     return Introspection(rebulk, context)
125 | 


--------------------------------------------------------------------------------
/rebulk/loose.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Various utility functions
  5 | """
  6 | 
  7 | import sys
  8 | 
  9 | from inspect import isclass
 10 | try:
 11 |     from inspect import getfullargspec as getargspec
 12 | 
 13 |     _FULLARGSPEC_SUPPORTED = True
 14 | except ImportError:
 15 |     _FULLARGSPEC_SUPPORTED = False
 16 |     from inspect import getargspec
 17 | 
 18 | from .utils import is_iterable
 19 | 
 20 | if sys.version_info < (3, 4, 0):  # pragma: no cover
 21 |     def _constructor(class_):
 22 |         """
 23 |         Retrieves constructor from given class (interpreters older than 3.4).
 24 | 
 25 |         :param class_: class to retrieve the constructor from
 26 |         :type class_: class
 27 |         :return: the ``__init__`` method of the given class
 28 |         :rtype: callable
 29 |         """
 30 |         return class_.__init__
 31 | else:  # pragma: no cover
 32 |     def _constructor(class_):
 33 |         """
 34 |         Retrieves constructor from given class (interpreters 3.4 and newer).
 35 | 
 36 |         :param class_: class to retrieve the constructor from
 37 |         :type class_: class
 38 |         :return: the given class itself, unchanged
 39 |         :rtype: callable
 40 |         """
 41 |         return class_
 42 | 
 43 | 
 44 | def call(function, *args, **kwargs):
 45 |     """
 46 |     Call a function or constructor with given args and kwargs after removing args and kwargs that doesn't match
 47 |     function or constructor signature
 48 | 
 49 |     :param function: Function or constructor to call
 50 |     :type function: callable
 51 |     :param args:
 52 |     :type args:
 53 |     :param kwargs:
 54 |     :type kwargs:
 55 |     :return: sale vakye as default function call
 56 |     :rtype: object
 57 |     """
 58 |     func = constructor_args if isclass(function) else function_args
 59 |     call_args, call_kwargs = func(function, *args, ignore_unused=True, **kwargs)  # @see #20
 60 |     return function(*call_args, **call_kwargs)
 61 | 
 62 | 
 63 | def function_args(callable_, *args, **kwargs):
 64 |     """
 65 |     Return (args, kwargs) matching the function signature
 66 | 
 67 |     :param callable: callable to inspect
 68 |     :type callable: callable
 69 |     :param args:
 70 |     :type args:
 71 |     :param kwargs:
 72 |     :type kwargs:
 73 |     :return: (args, kwargs) matching the function signature
 74 |     :rtype: tuple
 75 |     """
 76 |     argspec = getargspec(callable_)  # pylint:disable=deprecated-method
 77 |     return argspec_args(argspec, False, *args, **kwargs)
 78 | 
 79 | 
 80 | def constructor_args(class_, *args, **kwargs):
 81 |     """
 82 |     Return (args, kwargs) matching the function signature
 83 | 
 84 |     :param callable: callable to inspect
 85 |     :type callable: Callable
 86 |     :param args:
 87 |     :type args:
 88 |     :param kwargs:
 89 |     :type kwargs:
 90 |     :return: (args, kwargs) matching the function signature
 91 |     :rtype: tuple
 92 |     """
 93 |     argspec = getargspec(_constructor(class_))  # pylint:disable=deprecated-method
 94 |     return argspec_args(argspec, True, *args, **kwargs)
 95 | 
 96 | 
 97 | def argspec_args(argspec, constructor, *args, **kwargs):
 98 |     """
 99 |     Return (args, kwargs) matching the argspec object
100 | 
101 |     :param argspec: argspec to use
102 |     :type argspec: argspec
103 |     :param constructor: is it a constructor ?
104 |     :type constructor: bool
105 |     :param args:
106 |     :type args:
107 |     :param kwargs:
108 |     :type kwargs:
109 |     :return: (args, kwargs) matching the function signature
110 |     :rtype: tuple
111 |     """
112 |     if argspec.varkw:
113 |         call_kwarg = kwargs
114 |     else:
115 |         call_kwarg = dict((k, kwargs[k]) for k in kwargs if k in argspec.args) # pylint:disable=consider-using-dict-items
116 |     if argspec.varargs:
117 |         call_args = args
118 |     else:
119 |         call_args = args[:len(argspec.args) - (1 if constructor else 0)]
120 |     return call_args, call_kwarg
121 | 
122 | 
123 | if not _FULLARGSPEC_SUPPORTED:
124 |     def argspec_args_legacy(argspec, constructor, *args, **kwargs):
125 |         """
126 |         Return (args, kwargs) matching the argspec object
127 | 
128 |         :param argspec: argspec to use
129 |         :type argspec: argspec
130 |         :param constructor: is it a constructor ?
131 |         :type constructor: bool
132 |         :param args:
133 |         :type args:
134 |         :param kwargs:
135 |         :type kwargs:
136 |         :return: (args, kwargs) matching the function signature
137 |         :rtype: tuple
138 |         """
139 |         if argspec.keywords:
140 |             call_kwarg = kwargs
141 |         else:
142 |             call_kwarg = dict((k, kwargs[k]) for k in kwargs if k in argspec.args) # pylint:disable=consider-using-dict-items
143 |         if argspec.varargs:
144 |             call_args = args
145 |         else:
146 |             call_args = args[:len(argspec.args) - (1 if constructor else 0)]
147 |         return call_args, call_kwarg
148 | 
149 | 
150 |     argspec_args = argspec_args_legacy
151 | 
152 | 
153 | def ensure_list(param):
154 |     """
155 |     Retrieves a list from given parameter.
156 | 
157 |     :param param:
158 |     :type param:
159 |     :return:
160 |     :rtype:
161 |     """
162 |     if not param:
163 |         param = []
164 |     elif not is_iterable(param):
165 |         param = [param]
166 |     return param
167 | 
168 | 
169 | def ensure_dict(param, default_value, default_key=None):
170 |     """
171 |     Retrieves a dict and a default value from given parameter.
172 | 
173 |     if parameter is not a dict, it will be promoted as the default value.
174 | 
175 |     :param param:
176 |     :type param:
177 |     :param default_value:
178 |     :type default_value:
179 |     :param default_key:
180 |     :type default_key:
181 |     :return:
182 |     :rtype:
183 |     """
184 |     if not param:
185 |         param = default_value
186 |     if not isinstance(param, dict):
187 |         if param:
188 |             default_value = param
189 |         return {default_key: param}, default_value
190 |     return param, default_value
191 | 
192 | 
193 | def filter_index(collection, predicate=None, index=None):
194 |     """
195 |     Filter collection with predicate function and index.
196 | 
197 |     If index is not found, returns None.
198 |     :param collection:
199 |     :type collection: collection supporting iteration and slicing
200 |     :param predicate: function to filter the collection with
201 |     :type predicate: function
202 |     :param index: position of a single element to retrieve
203 |     :type index: int
204 |     :return: filtered list, or single element of filtered list if index is defined
205 |     :rtype: list or object
206 |     """
207 |     if index is None and isinstance(predicate, int):
208 |         index = predicate
209 |         predicate = None
210 |     if predicate:
211 |         collection = collection.__class__(filter(predicate, collection))
212 |     if index is not None:
213 |         try:
214 |             collection = collection[index]
215 |         except IndexError:
216 |             collection = None
217 |     return collection
218 | 
219 | 
220 | def set_defaults(defaults, kwargs, override=False):
221 |     """
222 |     Set defaults from defaults dict to kwargs dict
223 | 
224 |     :param override:
225 |     :type override:
226 |     :param defaults:
227 |     :type defaults:
228 |     :param kwargs:
229 |     :type kwargs:
230 |     :return:
231 |     :rtype:
232 |     """
233 |     if 'clear' in defaults.keys() and defaults.pop('clear'):
234 |         kwargs.clear()
235 |     for key, value in defaults.items():
236 |         if key in kwargs:
237 |             if isinstance(value, list) and isinstance(kwargs[key], list):
238 |                 kwargs[key] = list(value) + kwargs[key]
239 |             elif isinstance(value, dict) and isinstance(kwargs[key], dict):
240 |                 set_defaults(value, kwargs[key])
241 |         if key not in kwargs or override:
242 |             kwargs[key] = value
243 | 


--------------------------------------------------------------------------------
/rebulk/pattern.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Abstract pattern class definition along with various implementations (regexp, string, functional)
  5 | """
  6 | # pylint: disable=super-init-not-called,wrong-import-position
  7 | 
  8 | from abc import ABCMeta, abstractmethod
  9 | 
 10 | from . import debug
 11 | from .formatters import default_formatter
 12 | from .loose import call, ensure_list, ensure_dict
 13 | from .match import Match
 14 | from .remodule import re, REGEX_ENABLED
 15 | from .utils import find_all, is_iterable, get_first_defined
 16 | from .validators import allways_true
 17 | 
 18 | 
 19 | class BasePattern(metaclass=ABCMeta):
 20 |     """
 21 |     Base class for Pattern like objects
 22 |     """
 23 | 
 24 |     @abstractmethod
 25 |     def matches(self, input_string, context=None, with_raw_matches=False):
 26 |         """
 27 |         Computes all matches for a given input
 28 | 
 29 |         :param input_string: the string to parse
 30 |         :type input_string: str
 31 |         :param context: the context
 32 |         :type context: dict
 33 |         :param with_raw_matches: should raw matches be returned along with the matches
 34 |         :type with_raw_matches: bool
 35 |         :return: matches based on input_string for this pattern
 36 |         :rtype: iterator[Match]
 37 |         """
 38 | 
 39 | 
 40 | class Pattern(BasePattern, metaclass=ABCMeta):
 41 |     """
 42 |     Definition of a particular pattern to search for.
 43 |     """
 44 | 
 45 |     def __init__(self, name=None, tags=None, formatter=None, value=None, validator=None, children=False, every=False,
 46 |                  private_parent=False, private_children=False, private=False, private_names=None, ignore_names=None,
 47 |                  marker=False, format_all=False, validate_all=False, disabled=lambda context: False, log_level=None,
 48 |                  properties=None, post_processor=None, pre_match_processor=None, post_match_processor=None, **kwargs):
 49 |         """
 50 |         :param name: Name of this pattern
 51 |         :type name: str
 52 |         :param tags: List of tags related to this pattern
 53 |         :type tags: list[str]
 54 |         :param formatter: dict (name, func) of formatter to use with this pattern. name is the match name to support,
 55 |         and func a function(input_string) that returns the formatted string. A single formatter function can also be
 56 |         passed as a shortcut for {None: formatter}. The returned formatted string with be set in Match.value property.
 57 |         :type formatter: dict[str, func] || func
 58 |         :param value: dict (name, value) of value to use with this pattern. name is the match name to support,
 59 |         and value an object for the match value. A single object value can also be
 60 |         passed as a shortcut for {None: value}. The value with be set in Match.value property.
 61 |         :type value: dict[str, object] || object
 62 |         :param validator: dict (name, func) of validator to use with this pattern. name is the match name to support,
 63 |         and func a function(match) that returns the a boolean. A single validator function can also be
 64 |         passed as a shortcut for {None: validator}. If return value is False, match will be ignored.
 65 |         :param children: generates children instead of parent
 66 |         :type children: bool
 67 |         :param every: generates both parent and children.
 68 |         :type every: bool
 69 |         :param private: flag this pattern as beeing private.
 70 |         :type private: bool
 71 |         :param private_parent: force return of parent and flag parent matches as private.
 72 |         :type private_parent: bool
 73 |         :param private_children: force return of children and flag children matches as private.
 74 |         :type private_children: bool
 75 |         :param private_names: force return of named matches as private.
 76 |         :type private_names: bool
 77 |         :param ignore_names: drop some named matches after validation.
 78 |         :type ignore_names: bool
 79 |         :param marker: flag this pattern as beeing a marker.
 80 |         :type private: bool
 81 |         :param format_all if True, pattern will format every match in the hierarchy (even match not yield).
 82 |         :type format_all: bool
 83 |         :param validate_all if True, pattern will validate every match in the hierarchy (even match not yield).
 84 |         :type validate_all: bool
 85 |         :param disabled: if True, this pattern is disabled. Can also be a function(context).
 86 |         :type disabled: bool|function
 87 |         :param log_lvl: Log level associated to this pattern
 88 |         :type log_lvl: int
 89 |         :param post_processor: Post processing function
 90 |         :type post_processor: func
 91 |         :param pre_match_processor: Pre match processing function
 92 |         :type pre_match_processor: func
 93 |         :param post_match_processor: Post match processing function
 94 |         :type post_match_processor: func
 95 |         """
 96 |         # pylint:disable=too-many-locals,unused-argument
 97 |         self.name = name
 98 |         self.tags = ensure_list(tags)
 99 |         self.formatters, self._default_formatter = ensure_dict(formatter, default_formatter)
100 |         self.values, self._default_value = ensure_dict(value, None)
101 |         self.validators, self._default_validator = ensure_dict(validator, allways_true)
102 |         self.every = every
103 |         self.children = children
104 |         self.private = private
105 |         self.private_names = private_names if private_names else []
106 |         self.ignore_names = ignore_names if ignore_names else []
107 |         self.private_parent = private_parent
108 |         self.private_children = private_children
109 |         self.marker = marker
110 |         self.format_all = format_all
111 |         self.validate_all = validate_all
112 |         if not callable(disabled):
113 |             self.disabled = lambda context: disabled
114 |         else:
115 |             self.disabled = disabled
116 |         self._log_level = log_level
117 |         self._properties = properties
118 |         self.defined_at = debug.defined_at()
119 |         if not callable(post_processor):
120 |             self.post_processor = None
121 |         else:
122 |             self.post_processor = post_processor
123 |         if not callable(pre_match_processor):
124 |             self.pre_match_processor = None
125 |         else:
126 |             self.pre_match_processor = pre_match_processor
127 |         if not callable(post_match_processor):
128 |             self.post_match_processor = None
129 |         else:
130 |             self.post_match_processor = post_match_processor
131 | 
132 |     @property
133 |     def log_level(self):
134 |         """
135 |         Log level for this pattern.
136 |         :return:
137 |         :rtype:
138 |         """
139 |         return self._log_level if self._log_level is not None else debug.LOG_LEVEL
140 | 
141 |     def matches(self, input_string, context=None, with_raw_matches=False):
142 |         """
143 |         Computes all matches for a given input
144 | 
145 |         :param input_string: the string to parse
146 |         :type input_string: str
147 |         :param context: the context
148 |         :type context: dict
149 |         :param with_raw_matches: should return details
150 |         :type with_raw_matches: dict
151 |         :return: matches based on input_string for this pattern
152 |         :rtype: iterator[Match]
153 |         """
154 |         # pylint: disable=too-many-branches
155 | 
156 |         matches = []
157 |         raw_matches = []
158 | 
159 |         for pattern in self.patterns:
160 |             match_index = 0
161 |             for match in self._match(pattern, input_string, context):
162 |                 raw_matches.append(match)
163 |                 matches.extend(self._process_matches(match, match_index))
164 |                 match_index += 1
165 | 
166 |         matches = self._post_process_matches(matches)
167 | 
168 |         if with_raw_matches:
169 |             return matches, raw_matches
170 |         return matches
171 | 
172 |     @property
173 |     def _should_include_children(self):
174 |         """
175 |         Check if children matches from this pattern should be included in matches results.
176 | 
177 |         Children are included when either the children or the every option is set.
178 |         :return: self.children when it is truthy, else self.every
179 |         :rtype:
180 |         """
181 |         return self.children or self.every
182 | 
183 |     @property
184 |     def _should_include_parent(self):
185 |         """
186 |         Check if a match from this pattern should be included in matches results.
187 | 
188 |         The parent is left out only when children is set and every is not.
189 |         :return: True when self.children is falsy, else self.every
190 |         :rtype:
191 |         """
192 |         return not self.children or self.every
193 | 
194 |     @staticmethod
195 |     def _match_config_property_keys(match, child=False):
196 |         if match.name:
197 |             yield match.name
198 |         if child:
199 |             yield '__children__'
200 |         else:
201 |             yield '__parent__'
202 |         yield None
203 | 
204 |     @staticmethod
205 |     def _process_match_index(match, match_index):
206 |         """
207 |         Store the given index on the match.
208 | 
209 |         :param match: match to update in place
210 |         :param match_index: value assigned to match.match_index
211 |         """
212 |         match.match_index = match_index
213 | 
214 |     def _process_match_private(self, match, child=False):
215 |         """
216 |         Process match privacy from this pattern configuration.
217 | 
218 |         :param match:
219 |         :param child:
220 |         :return:
221 |         """
222 | 
223 |         if match.name and match.name in self.private_names or \
224 |                 not child and self.private_parent or \
225 |                 child and self.private_children:
226 |             match.private = True
227 | 
228 |     def _process_match_value(self, match, child=False):
229 |         """
230 |         Process match value from this pattern configuration.
231 |         :param match:
232 |         :return:
233 |         """
234 |         keys = self._match_config_property_keys(match, child=child)
235 |         pattern_value = get_first_defined(self.values, keys, self._default_value)
236 |         if pattern_value:
237 |             match.value = pattern_value
238 | 
239 |     def _process_match_formatter(self, match, child=False):
240 |         """
241 |         Process match formatter from this pattern configuration.
242 | 
243 |         :param match:
244 |         :return:
245 |         """
246 |         included = self._should_include_children if child else self._should_include_parent
247 |         if included or self.format_all:
248 |             keys = self._match_config_property_keys(match, child=child)
249 |             match.formatter = get_first_defined(self.formatters, keys, self._default_formatter)
250 | 
251 |     def _process_match_validator(self, match, child=False):
252 |         """
253 |         Process match validation from this pattern configuration.
254 | 
255 |         :param match:
256 |         :return: True if match is validated by the configured validator, False otherwise.
257 |         """
258 |         included = self._should_include_children if child else self._should_include_parent
259 |         if included or self.validate_all:
260 |             keys = self._match_config_property_keys(match, child=child)
261 |             validator = get_first_defined(self.validators, keys, self._default_validator)
262 |             if validator and not validator(match):
263 |                 return False
264 |         return True
265 | 
266 |     def _process_match(self, match, match_index, child=False):
267 |         """
268 |         Process match from this pattern by setting all properties from defined configuration
269 |         (index, private, value, formatter), then run the configured validator.
270 | 
271 |         :param match: match to update in place
272 |         :param match_index: index stored on the match
273 |         :param child: True when match is a child match
274 |         :return: True if match is validated by the configured validator, False otherwise.
275 |         """
276 |         self._process_match_index(match, match_index)
277 |         self._process_match_private(match, child)
278 |         self._process_match_value(match, child)
279 |         self._process_match_formatter(match, child)
280 |         return self._process_match_validator(match, child)
281 | 
282 |     @staticmethod
283 |     def _process_match_processor(match, processor):
284 |         if processor:
285 |             ret = processor(match)
286 |             if ret is not None:
287 |                 return ret
288 |         return match
289 | 
290 |     def _process_matches(self, match, match_index):
291 |         """
292 |         Process and generate all matches for the given unprocessed match.
293 |         :param match:
294 |         :param match_index:
295 |         :return: Process and dispatched matches.
296 |         """
297 |         match = self._process_match_processor(match, self.pre_match_processor)
298 |         if not match:
299 |             return
300 | 
301 |         if not self._process_match(match, match_index):
302 |             return
303 | 
304 |         for child in match.children:
305 |             if not self._process_match(child, match_index, child=True):
306 |                 return
307 | 
308 |         match = self._process_match_processor(match, self.post_match_processor)
309 |         if not match:
310 |             return
311 | 
312 |         if (self._should_include_parent or self.private_parent) and match.name not in self.ignore_names:
313 |             yield match
314 |         if self._should_include_children or self.private_children:
315 |             children = [x for x in match.children if x.name not in self.ignore_names]
316 |             for child in children:
317 |                 yield child
318 | 
319 |     def _post_process_matches(self, matches):
320 |         """
321 |         Post process matches with user defined function
322 |         :param matches:
323 |         :type matches:
324 |         :return:
325 |         :rtype:
326 |         """
327 |         if self.post_processor:
328 |             return self.post_processor(matches, self)
329 |         return matches
330 | 
331 |     @property
332 |     @abstractmethod
333 |     def patterns(self):  # pragma: no cover
334 |         """
335 |         List of base patterns defined; matches() runs _match once for each of them.
336 | 
337 |         :return: A list of base patterns
338 |         :rtype: list
339 |         """
340 | 
341 |     @property
342 |     def properties(self):
343 |         """
344 |         Properties names and values that can ben retrieved by this pattern.
345 |         :return:
346 |         :rtype:
347 |         """
348 |         if self._properties:
349 |             return self._properties
350 |         return {}
351 | 
352 |     @property
353 |     @abstractmethod
354 |     def match_options(self):  # pragma: no cover
355 |         """
356 |         dict of default options for generated Match objects
357 | 
358 |         :return: options to pass as keyword arguments to the Match constructor
359 |         :rtype: dict
360 |         """
361 | 
362 |     @abstractmethod
363 |     def _match(self, pattern, input_string, context=None):  # pragma: no cover
364 |         """
365 |         Computes all unprocessed matches for a given base pattern and input.
366 | 
367 |         :param pattern: the base pattern to use, one item of self.patterns
368 |         :param input_string: the string to parse
369 |         :type input_string: str
370 |         :param context: the context
371 |         :type context: dict
372 |         :return: matches based on input_string for this pattern
373 |         :rtype: iterator[Match]
374 |         """
375 | 
376 |     def __repr__(self):
377 |         defined = ""
378 |         if self.defined_at:
379 |             defined = f"@{self.defined_at}"
380 |         return f"<{self.__class__.__name__}{defined}:{self.__repr__patterns__}>"
381 | 
382 |     @property
383 |     def __repr__patterns__(self):
384 |         return self.patterns
385 | 
386 | 
387 | class StringPattern(Pattern):
388 |     """
389 |     Definition of one or many strings to search for.
390 |     """
391 | 
392 |     def __init__(self, *patterns, **kwargs):
393 |         super().__init__(**kwargs)
394 |         self._patterns = patterns
395 |         self._kwargs = kwargs
396 |         self._match_kwargs = filter_match_kwargs(kwargs)
397 | 
398 |     @property
399 |     def patterns(self):
400 |         return self._patterns
401 | 
402 |     @property
403 |     def match_options(self):
404 |         return self._match_kwargs
405 | 
406 |     def _match(self, pattern, input_string, context=None):
407 |         for index in find_all(input_string, pattern, **self._kwargs):
408 |             match = Match(index, index + len(pattern), pattern=self, input_string=input_string, **self._match_kwargs)
409 |             if match:
410 |                 yield match
411 | 
412 | 
413 | class RePattern(Pattern):
414 |     """
415 |     Definition of one or many regular expression pattern to search for.
416 |     """
417 | 
418 |     def __init__(self, *patterns, **kwargs):
419 |         super().__init__(**kwargs)
420 |         self.repeated_captures = REGEX_ENABLED
421 |         if 'repeated_captures' in kwargs:
422 |             self.repeated_captures = kwargs.get('repeated_captures')
423 |         if self.repeated_captures and not REGEX_ENABLED:  # pragma: no cover
424 |             raise NotImplementedError("repeated_capture is available only with regex module.")
425 |         self.abbreviations = kwargs.get('abbreviations', [])
426 |         self._kwargs = kwargs
427 |         self._match_kwargs = filter_match_kwargs(kwargs)
428 |         self._children_match_kwargs = filter_match_kwargs(kwargs, children=True)
429 |         self._patterns = []
430 |         for pattern in patterns:
431 |             if isinstance(pattern, str):
432 |                 if self.abbreviations and pattern:
433 |                     for key, replacement in self.abbreviations:
434 |                         pattern = pattern.replace(key, replacement)
435 |                 pattern = call(re.compile, pattern, **self._kwargs)
436 |             elif isinstance(pattern, dict):
437 |                 if self.abbreviations and 'pattern' in pattern:
438 |                     for key, replacement in self.abbreviations:
439 |                         pattern['pattern'] = pattern['pattern'].replace(key, replacement)
440 |                 pattern = re.compile(**pattern)
441 |             elif hasattr(pattern, '__iter__'):
442 |                 pattern = re.compile(*pattern)
443 |             self._patterns.append(pattern)
444 | 
445 |     @property
446 |     def patterns(self):
447 |         return self._patterns
448 | 
449 |     @property
450 |     def __repr__patterns__(self):
451 |         return [pattern.pattern for pattern in self.patterns]
452 | 
453 |     @property
454 |     def match_options(self):
455 |         return self._match_kwargs
456 | 
457 |     def _match(self, pattern, input_string, context=None):
458 |         names = dict((v, k) for k, v in pattern.groupindex.items())
459 |         for match_object in pattern.finditer(input_string):
460 |             start = match_object.start()
461 |             end = match_object.end()
462 |             main_match = Match(start, end, pattern=self, input_string=input_string, **self._match_kwargs)
463 | 
464 |             if pattern.groups:
465 |                 for i in range(1, pattern.groups + 1):
466 |                     name = names.get(i, main_match.name)
467 |                     if self.repeated_captures:
468 |                         for start, end in match_object.spans(i):
469 |                             child_match = Match(start, end, name=name, parent=main_match, pattern=self,
470 |                                                 input_string=input_string, **self._children_match_kwargs)
471 |                             if child_match:
472 |                                 main_match.children.append(child_match)
473 |                     else:
474 |                         start, end = match_object.span(i)
475 |                         if start > -1 and end > -1:
476 |                             child_match = Match(start, end, name=name, parent=main_match, pattern=self,
477 |                                                 input_string=input_string, **self._children_match_kwargs)
478 |                             if child_match:
479 |                                 main_match.children.append(child_match)
480 | 
481 |             if main_match:
482 |                 yield main_match
483 | 
484 | 
class FunctionalPattern(Pattern):
    """
    Definition of one or many functional patterns to search for.

    Each pattern is a callable, invoked through the ``call`` helper with the input string, the context and
    the keyword arguments given to the constructor. Its return value is turned into ``Match`` objects.
    """

    def __init__(self, *patterns, **kwargs):
        """
        :param patterns: callables used as patterns
        :type patterns: tuple
        :param kwargs: options forwarded to ``Pattern`` and to the callables; a filtered subset
            (see ``filter_match_kwargs``) is also forwarded to every ``Match`` that is built
        :type kwargs: dict
        """
        super().__init__(**kwargs)
        self._patterns = patterns
        self._kwargs = kwargs
        self._match_kwargs = filter_match_kwargs(kwargs)

    @property
    def patterns(self):
        """
        Callables registered on this pattern.
        """
        return self._patterns

    @property
    def match_options(self):
        """
        Keyword arguments applied to every ``Match`` created by this pattern.
        """
        return self._match_kwargs

    def _match(self, pattern, input_string, context=None):
        """
        Call a single functional pattern and yield the truthy matches built from its result.

        The result may be a single description or an iterable of descriptions. A description is either
        a dict of ``Match`` keyword arguments, or a sequence of positional arguments optionally ending
        with a dict of keyword arguments.

        :param pattern: callable to invoke
        :param input_string: string to search into
        :type input_string: str
        :param context: context forwarded to the callable
        :type context: dict
        :return: generator of matches
        """
        ret = call(pattern, input_string, context, **self._kwargs)
        if ret:
            # A non-iterable, a dict, or an indexable sequence starting with an int describes ONE match.
            if not is_iterable(ret) or isinstance(ret, dict) \
                    or (is_iterable(ret) and hasattr(ret, '__getitem__') and isinstance(ret[0], int)):
                args_iterable = [ret]
            else:
                args_iterable = ret
            for args in args_iterable:
                if isinstance(args, dict):
                    # Work on a copy: the dict belongs to the callable and may be reused between calls,
                    # so it must not lose its 'input_string' / 'pattern' entries as a side effect.
                    options = dict(args)
                    options.pop('input_string', None)
                    options.pop('pattern', None)
                    if self._match_kwargs:
                        merged = self._match_kwargs.copy()
                        merged.update(options)
                        options = merged
                    match = Match(pattern=self, input_string=input_string, **options)
                    if match:
                        yield match
                else:
                    kwargs = self._match_kwargs
                    if isinstance(args[-1], dict):
                        # Trailing dict holds keyword overrides for this specific match.
                        kwargs = dict(kwargs)
                        kwargs.update(args[-1])
                        args = args[:-1]
                    match = Match(*args, pattern=self, input_string=input_string, **kwargs)
                    if match:
                        yield match
533 | 
def filter_match_kwargs(kwargs, children=False):
    """
    Build a copy of ``kwargs`` without the keys that are filtered out for Match construction.

    :param kwargs: keyword arguments to filter; left untouched
    :type kwargs: dict
    :param children: Flag to filter children matches; when set, ``name`` is dropped too
    :type children: bool
    :return: A filtered dict
    :rtype: dict
    """
    excluded = ('pattern', 'start', 'end', 'parent', 'formatter', 'value')
    if children:
        excluded += ('name',)
    filtered = kwargs.copy()
    for key in excluded:
        filtered.pop(key, None)
    return filtered
554 | 


--------------------------------------------------------------------------------
/rebulk/processors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Processor functions
  5 | """
  6 | from logging import getLogger
  7 | 
  8 | from .utils import IdentitySet
  9 | 
 10 | from .rules import Rule, RemoveMatch
 11 | 
 12 | log = getLogger(__name__).log
 13 | 
 14 | DEFAULT = '__default__'
 15 | 
 16 | POST_PROCESS = -2048
 17 | PRE_PROCESS = 2048
 18 | 
 19 | 
 20 | def _default_conflict_solver(match, conflicting_match):
 21 |     """
 22 |     Default conflict solver for matches, shorter matches if they conflicts with longer ones
 23 | 
 24 |     :param conflicting_match:
 25 |     :type conflicting_match:
 26 |     :param match:
 27 |     :type match:
 28 |     :return:
 29 |     :rtype:
 30 |     """
 31 |     if len(conflicting_match.initiator) < len(match.initiator):
 32 |         return conflicting_match
 33 |     if len(match.initiator) < len(conflicting_match.initiator):
 34 |         return match
 35 |     return None
 36 | 
 37 | 
 38 | class ConflictSolver(Rule):
 39 |     """
 40 |     Remove conflicting matches.
 41 |     """
 42 |     priority = PRE_PROCESS
 43 | 
 44 |     consequence = RemoveMatch
 45 | 
 46 |     @property
 47 |     def default_conflict_solver(self):
 48 |         """
 49 |         Default conflict solver to use.
 50 |         """
 51 |         return _default_conflict_solver
 52 | 
 53 |     def when(self, matches, context):
 54 |         # pylint:disable=too-many-nested-blocks
 55 |         to_remove_matches = IdentitySet()
 56 | 
 57 |         public_matches = [match for match in matches if not match.private]
 58 |         public_matches.sort(key=len)
 59 | 
 60 |         for match in public_matches:
 61 |             conflicting_matches = matches.conflicting(match)
 62 | 
 63 |             if conflicting_matches:
 64 |                 # keep the match only if it's the longest
 65 |                 conflicting_matches = [conflicting_match for conflicting_match in conflicting_matches if
 66 |                                        not conflicting_match.private]
 67 |                 conflicting_matches.sort(key=len)
 68 | 
 69 |                 for conflicting_match in conflicting_matches:
 70 |                     conflict_solvers = [(self.default_conflict_solver, False)]
 71 | 
 72 |                     if match.conflict_solver:
 73 |                         conflict_solvers.append((match.conflict_solver, False))
 74 |                     if conflicting_match.conflict_solver:
 75 |                         conflict_solvers.append((conflicting_match.conflict_solver, True))
 76 | 
 77 |                     for conflict_solver, reverse in reversed(conflict_solvers):
 78 |                         if reverse:
 79 |                             to_remove = conflict_solver(conflicting_match, match)
 80 |                         else:
 81 |                             to_remove = conflict_solver(match, conflicting_match)
 82 |                         if to_remove == DEFAULT:
 83 |                             continue
 84 |                         if to_remove and to_remove not in to_remove_matches:
 85 |                             both_matches = [match, conflicting_match]
 86 |                             both_matches.remove(to_remove)
 87 |                             to_keep = both_matches[0]
 88 | 
 89 |                             if to_keep not in to_remove_matches:
 90 |                                 log(self.log_level, "Conflicting match %s will be removed in favor of match %s",
 91 |                                     to_remove, to_keep)
 92 | 
 93 |                                 to_remove_matches.add(to_remove)
 94 |                         break
 95 |         return to_remove_matches
 96 | 
 97 | 
 98 | class PrivateRemover(Rule):
 99 |     """
100 |     Removes private matches rule.
101 |     """
102 |     priority = POST_PROCESS
103 | 
104 |     consequence = RemoveMatch
105 | 
106 |     def when(self, matches, context):
107 |         return [match for match in matches if match.private]
108 | 


--------------------------------------------------------------------------------
/rebulk/rebulk.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Entry point functions and classes for Rebulk
  5 | """
  6 | from logging import getLogger
  7 | 
  8 | from .builder import Builder
  9 | from .match import Matches
 10 | from .processors import ConflictSolver, PrivateRemover
 11 | from .rules import Rules
 12 | from .utils import extend_safe
 13 | 
 14 | log = getLogger(__name__).log
 15 | 
 16 | 
 17 | class Rebulk(Builder):
 18 |     r"""
 19 |     Regular expression, string and function based patterns are declared in a ``Rebulk`` object. It use a fluent API to
 20 |     chain ``string``, ``regex``, and ``functional`` methods to define various patterns types.
 21 | 
 22 |     .. code-block:: python
 23 | 
 24 |         >>> from rebulk import Rebulk
 25 |         >>> bulk = Rebulk().string('brown').regex(r'qu\w+').functional(lambda s: (20, 25))
 26 | 
 27 |     When ``Rebulk`` object is fully configured, you can call ``matches`` method with an input string to retrieve all
 28 |     ``Match`` objects found by registered pattern.
 29 | 
 30 |     .. code-block:: python
 31 | 
 32 |         >>> bulk.matches("The quick brown fox jumps over the lazy dog")
 33 |         [<brown:(10, 15)>, <quick:(4, 9)>, <jumps:(20, 25)>]
 34 | 
 35 |     If multiple ``Match`` objects are found at the same position, only the longer one is kept.
 36 | 
 37 |     .. code-block:: python
 38 | 
 39 |         >>> bulk = Rebulk().string('lakers').string('la')
 40 |         >>> bulk.matches("the lakers are from la")
 41 |         [<lakers:(4, 10)>, <la:(20, 22)>]
 42 |     """
 43 | 
 44 |     # pylint:disable=protected-access
 45 | 
 46 |     def __init__(self, disabled=lambda context: False, default_rules=True):
 47 |         """
 48 |         Creates a new Rebulk object.
 49 |         :param disabled: if True, this pattern is disabled. Can also be a function(context).
 50 |         :type disabled: bool|function
 51 |         :param default_rules: use default rules
 52 |         :type default_rules:
 53 |         :return:
 54 |         :rtype:
 55 |         """
 56 |         super().__init__()
 57 |         if not callable(disabled):
 58 |             self.disabled = lambda context: disabled
 59 |         else:
 60 |             self.disabled = disabled
 61 |         self._patterns = []
 62 |         self._rules = Rules()
 63 |         if default_rules:
 64 |             self.rules(ConflictSolver, PrivateRemover)
 65 |         self._rebulks = []
 66 | 
 67 |     def pattern(self, *pattern):
 68 |         """
 69 |         Add patterns objects
 70 | 
 71 |         :param pattern:
 72 |         :type pattern: rebulk.pattern.Pattern
 73 |         :return: self
 74 |         :rtype: Rebulk
 75 |         """
 76 |         self._patterns.extend(pattern)
 77 |         return self
 78 | 
 79 |     def rules(self, *rules):
 80 |         """
 81 |         Add rules as a module, class or instance.
 82 |         :param rules:
 83 |         :type rules: list[Rule]
 84 |         :return:
 85 |         """
 86 |         self._rules.load(*rules)
 87 |         return self
 88 | 
 89 |     def rebulk(self, *rebulks):
 90 |         """
 91 |         Add a children rebulk object
 92 |         :param rebulks:
 93 |         :type rebulks: Rebulk
 94 |         :return:
 95 |         """
 96 |         self._rebulks.extend(rebulks)
 97 |         return self
 98 | 
 99 |     def matches(self, string, context=None):
100 |         """
101 |         Search for all matches with current configuration against input_string
102 |         :param string: string to search into
103 |         :type string: str
104 |         :param context: context to use
105 |         :type context: dict
106 |         :return: A custom list of matches
107 |         :rtype: Matches
108 |         """
109 |         matches = Matches(input_string=string)
110 |         if context is None:
111 |             context = {}
112 | 
113 |         self._matches_patterns(matches, context)
114 | 
115 |         self._execute_rules(matches, context)
116 | 
117 |         return matches
118 | 
119 |     def effective_rules(self, context=None):
120 |         """
121 |         Get effective rules for this rebulk object and its children.
122 |         :param context:
123 |         :type context:
124 |         :return:
125 |         :rtype:
126 |         """
127 |         rules = Rules()
128 |         rules.extend(self._rules)
129 |         for rebulk in self._rebulks:
130 |             if not rebulk.disabled(context):
131 |                 extend_safe(rules, rebulk._rules)
132 |         return rules
133 | 
134 |     def _execute_rules(self, matches, context):
135 |         """
136 |         Execute rules for this rebulk and children.
137 |         :param matches:
138 |         :type matches:
139 |         :param context:
140 |         :type context:
141 |         :return:
142 |         :rtype:
143 |         """
144 |         if not self.disabled(context):
145 |             rules = self.effective_rules(context)
146 |             rules.execute_all_rules(matches, context)
147 | 
148 |     def effective_patterns(self, context=None):
149 |         """
150 |         Get effective patterns for this rebulk object and its children.
151 |         :param context:
152 |         :type context:
153 |         :return:
154 |         :rtype:
155 |         """
156 |         patterns = list(self._patterns)
157 |         for rebulk in self._rebulks:
158 |             if not rebulk.disabled(context):
159 |                 extend_safe(patterns, rebulk._patterns)
160 |         return patterns
161 | 
162 |     def _matches_patterns(self, matches, context):
163 |         """
164 |         Search for all matches with current paterns agains input_string
165 |         :param matches: matches list
166 |         :type matches: Matches
167 |         :param context: context to use
168 |         :type context: dict
169 |         :return:
170 |         :rtype:
171 |         """
172 |         if not self.disabled(context):
173 |             patterns = self.effective_patterns(context)
174 |             for pattern in patterns:
175 |                 if not pattern.disabled(context):
176 |                     pattern_matches = pattern.matches(matches.input_string, context)
177 |                     if pattern_matches:
178 |                         log(pattern.log_level, "Pattern has %s match(es). (%s)", len(pattern_matches), pattern)
179 |                     else:
180 |                         pass
181 |                         # log(pattern.log_level, "Pattern doesn't match. (%s)" % (pattern,))
182 |                     for match in pattern_matches:
183 |                         if match.marker:
184 |                             log(pattern.log_level, "Marker found. (%s)", match)
185 |                             matches.markers.append(match)
186 |                         else:
187 |                             log(pattern.log_level, "Match found. (%s)", match)
188 |                             matches.append(match)
189 |                 else:
190 |                     log(pattern.log_level, "Pattern is disabled. (%s)", pattern)
191 | 


--------------------------------------------------------------------------------
/rebulk/remodule.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Uniform re module
 5 | """
 6 | # pylint: disable-all
 7 | import os
 8 | import logging
 9 | 
# ``log`` is the bound ``Logger.log`` method: it must be called as ``log(level, message, *args)``.
log = logging.getLogger(__name__).log

# True only when the optional ``regex`` module was requested through the environment AND could be imported.
REGEX_ENABLED = False
if os.environ.get('REBULK_REGEX_ENABLED') in ["1", "true", "True", "Y"]:
    try:
        import regex as re
        REGEX_ENABLED = True
    except ImportError:
        # ``log.warning`` does not exist on a bound method and would raise AttributeError here,
        # hiding the intended fallback to the standard ``re`` module.
        log(logging.WARNING, 'regex module is not available. Unset REBULK_REGEX_ENABLED environment variable, or install regex module to enabled it.')
        import re
else:
    import re
22 | 


--------------------------------------------------------------------------------
/rebulk/rules.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Abstract rule class definition and rule engine implementation
  5 | """
  6 | from abc import ABCMeta, abstractmethod
  7 | import inspect
  8 | from itertools import groupby
  9 | from logging import getLogger
 10 | 
 11 | from .utils import is_iterable
 12 | 
 13 | from .toposort import toposort
 14 | 
 15 | from . import debug
 16 | 
 17 | log = getLogger(__name__).log
 18 | 
 19 | 
 20 | class Consequence(metaclass=ABCMeta):
 21 |     """
 22 |     Definition of a consequence to apply.
 23 |     """
 24 |     @abstractmethod
 25 |     def then(self, matches, when_response, context):  # pragma: no cover
 26 |         """
 27 |         Action implementation.
 28 | 
 29 |         :param matches:
 30 |         :type matches: rebulk.match.Matches
 31 |         :param context:
 32 |         :type context:
 33 |         :param when_response: return object from when call.
 34 |         :type when_response: object
 35 |         :return: True if the action was runned, False if it wasn't.
 36 |         :rtype: bool
 37 |         """
 38 | 
 39 | 
 40 | class Condition(metaclass=ABCMeta):
 41 |     """
 42 |     Definition of a condition to check.
 43 |     """
 44 |     @abstractmethod
 45 |     def when(self, matches, context):  # pragma: no cover
 46 |         """
 47 |         Condition implementation.
 48 | 
 49 |         :param matches:
 50 |         :type matches: rebulk.match.Matches
 51 |         :param context:
 52 |         :type context:
 53 |         :return: truthy if rule should be triggered and execute then action, falsy if it should not.
 54 |         :rtype: object
 55 |         """
 56 | 
 57 | 
 58 | class CustomRule(Condition, Consequence, metaclass=ABCMeta):
 59 |     """
 60 |     Definition of a rule to apply
 61 |     """
 62 |     # pylint: disable=unused-argument, abstract-method
 63 |     priority = 0
 64 |     name = None
 65 |     dependency = None
 66 |     properties = {}
 67 | 
 68 |     def __init__(self, log_level=None):
 69 |         self.defined_at = debug.defined_at()
 70 |         if log_level is None and not hasattr(self, 'log_level'):
 71 |             self.log_level = debug.LOG_LEVEL
 72 | 
 73 |     def enabled(self, context):
 74 |         """
 75 |         Disable rule.
 76 | 
 77 |         :param context:
 78 |         :type context:
 79 |         :return: True if rule is enabled, False if disabled
 80 |         :rtype: bool
 81 |         """
 82 |         return True
 83 | 
 84 |     def __lt__(self, other):
 85 |         return self.priority > other.priority
 86 | 
 87 |     def __repr__(self):
 88 |         defined = ""
 89 |         if self.defined_at:
 90 |             defined = f"@{self.defined_at}"
 91 |         return f"<{self.name if self.name else self.__class__.__name__}{defined}>"
 92 | 
 93 |     def __eq__(self, other):
 94 |         return self.__class__ == other.__class__
 95 | 
 96 |     def __hash__(self):
 97 |         return hash(self.__class__)
 98 | 
 99 | 
class Rule(CustomRule):
    """
    Rule whose action is delegated to the consequence(s) declared in the ``consequence`` attribute.
    """
    # pylint:disable=abstract-method
    consequence = None

    def then(self, matches, when_response, context):
        """
        Run the declared consequence(s), instantiating those given as classes.

        With an iterable of consequences, each one receives the next item of ``when_response``
        (wrapped in a list first when it is not iterable); a single consequence receives
        ``when_response`` as a whole.
        """
        assert self.consequence
        if not is_iterable(self.consequence):
            single = self.consequence
            if inspect.isclass(single):
                single = single()  # pylint:disable=not-callable
            single.then(matches, when_response, context)
            return

        if not is_iterable(when_response):
            when_response = [when_response]
        responses = iter(when_response)
        for declared in self.consequence:  #pylint: disable=not-an-iterable
            action = declared() if inspect.isclass(declared) else declared
            action.then(matches, next(responses), context)
122 | 
123 | 
class RemoveMatch(Consequence):  # pylint: disable=abstract-method
    """
    Remove matches returned by when
    """
    def then(self, matches, when_response, context):
        """
        Remove ``when_response`` (one match or an iterable of matches) from ``matches``.

        :return: the list of matches actually removed for an iterable response; for a single match,
            that match when it was removed and None otherwise.
        """
        if not is_iterable(when_response):
            if when_response in matches:
                matches.remove(when_response)
                return when_response
            return None

        removed = []
        # Snapshot first: the response may be a view that changes while matches are removed.
        for candidate in list(when_response):
            if candidate in matches:
                matches.remove(candidate)
                removed.append(candidate)
        return removed
140 | 
141 | 
class AppendMatch(Consequence):  # pylint: disable=abstract-method
    """
    Append matches returned by when
    """
    def __init__(self, match_name=None):
        """
        :param match_name: when truthy, name given to the appended matches
        """
        self.match_name = match_name

    def then(self, matches, when_response, context):
        """
        Append ``when_response`` (one match or an iterable of matches) to ``matches``, skipping those
        already present.

        :return: the list of matches actually appended for an iterable response; for a single match,
            that match when it was appended and None otherwise.
        """
        if not is_iterable(when_response):
            # A single match is renamed even when it turns out to be already present.
            if self.match_name:
                when_response.name = self.match_name
            if when_response not in matches:
                matches.append(when_response)
                return when_response
            return None

        appended = []
        for candidate in list(when_response):
            if candidate in matches:
                continue
            if self.match_name:
                candidate.name = self.match_name
            matches.append(candidate)
            appended.append(candidate)
        return appended
165 | 
166 | 
class RenameMatch(Consequence):  # pylint: disable=abstract-method
    """
    Rename matches returned by when
    """
    def __init__(self, match_name):
        """
        :param match_name: name to give to the matches
        """
        self.match_name = match_name
        self.remove = RemoveMatch()
        self.append = AppendMatch()

    def then(self, matches, when_response, context):
        """
        Remove the matches, rename the ones that were actually removed, then append them back.
        """
        detached = self.remove.then(matches, when_response, context)
        if is_iterable(detached):
            detached = list(detached)
            targets = detached
        elif detached:
            targets = [detached]
        else:
            targets = []
        for target in targets:
            target.name = self.match_name
        if detached:
            self.append.then(matches, detached, context)
186 | 
187 | 
class AppendTags(Consequence):  # pylint: disable=abstract-method
    """
    Add tags to returned matches
    """
    def __init__(self, tags):
        """
        :param tags: tags to add to the matches
        """
        self.tags = tags
        self.remove = RemoveMatch()
        self.append = AppendMatch()

    def then(self, matches, when_response, context):
        """
        Remove the matches, extend the tags of the ones actually removed, then append them back.
        """
        detached = self.remove.then(matches, when_response, context)
        if is_iterable(detached):
            detached = list(detached)
            targets = detached
        elif detached:
            targets = [detached]
        else:
            targets = []
        for target in targets:
            target.tags.extend(self.tags)  # pylint: disable=no-member
        if detached:
            self.append.then(matches, detached, context)
207 | 
208 | 
class RemoveTags(Consequence):  # pylint: disable=abstract-method
    """
    Remove tags from returned matches
    """
    def __init__(self, tags):
        """
        :param tags: tags to remove from the matches
        """
        self.tags = tags
        self.remove = RemoveMatch()
        self.append = AppendMatch()

    def then(self, matches, when_response, context):
        """
        Remove the matches, strip the configured tags from the ones actually removed (one occurrence
        per configured tag), then append them back.
        """
        detached = self.remove.then(matches, when_response, context)
        if is_iterable(detached):
            detached = list(detached)
            targets = detached
        elif detached:
            targets = [detached]
        else:
            targets = []
        for target in targets:
            for tag in self.tags:
                if tag in target.tags:  # pylint: disable=no-member
                    target.tags.remove(tag)  # pylint: disable=no-member
        if detached:
            self.append.then(matches, detached, context)
232 | 
233 | 
class Rules(list):
    """
    list of rules ready to execute.
    """

    def __init__(self, *rules):
        """
        :param rules: rule modules, classes or instances to load right away
        """
        super().__init__()
        self.load(*rules)

    def load(self, *rules):
        """
        Load rules from a Rule module, class or instance

        Modules are scanned with ``load_module``, classes are instantiated with ``load_class`` and
        anything else is appended as is.

        :param rules: modules, classes or instances
        :type rules:
        :return:
        :rtype:
        """
        for rule in rules:
            if inspect.ismodule(rule):
                self.load_module(rule)
            elif inspect.isclass(rule):
                self.load_class(rule)
            else:
                self.append(rule)

    def load_module(self, module):
        """
        Load a rules module: every class defined in the module itself is instantiated and appended.

        :param module: module to scan
        :type module:
        :return:
        :rtype:
        """
        def is_local_class(member):
            # ``inspect.isclass`` has to be CALLED: the bare function object is always truthy, which
            # let module-level functions through and got them invoked by ``load_class``.
            return inspect.isclass(member) \
                and hasattr(member, '__module__') \
                and member.__module__ == module.__name__

        for _, obj in inspect.getmembers(module, is_local_class):
            self.load_class(obj)

    def load_class(self, class_):
        """
        Load a Rule class: instantiate it without argument and append the instance.

        :param class_: class to instantiate
        :type class_:
        :return:
        :rtype:
        """
        self.append(class_())

    def execute_all_rules(self, matches, context):
        """
        Execute all rules from this rules list.

        Rules are sorted, grouped by priority, and each priority group is split into dependency levels
        by ``toposort_rules``. Inside a level, rules run in the order they were registered in this list.
        Each rule is run with ``execute_rule``.

        :param matches: matches handed to every rule
        :type matches:
        :param context: context handed to every rule
        :type context:
        :return: list of ``(rule, when_response)`` tuples, one for each rule for which ``execute_rule``
            did not return None
        :rtype: list
        """
        ret = []
        for priority, priority_rules in groupby(sorted(self), lambda rule: rule.priority):
            sorted_rules = toposort_rules(list(priority_rules))  # Group by dependency graph toposort
            for rules_group in sorted_rules:
                rules_group = sorted(rules_group, key=self.index)  # Sort rules group based on initial ordering.
                # The group header is logged at the highest level found among its rules.
                group_log_level = max((rule.log_level for rule in rules_group), default=None)
                log(group_log_level, "%s independent rule(s) at priority %s.", len(rules_group), priority)
                for rule in rules_group:
                    when_response = execute_rule(rule, matches, context)
                    if when_response is not None:
                        ret.append((rule, when_response))

        return ret
315 | 
316 | 
def execute_rule(rule, matches, context):
    """
    Execute the given rule: evaluate its condition and, when it is truthy, run its consequence.

    :param rule: rule to execute
    :type rule:
    :param matches: matches handed to the rule
    :type matches:
    :param context: context handed to the rule
    :type context:
    :return: the truthy response of ``rule.when``; None when the rule is disabled or not triggered
    :rtype:
    """
    if not rule.enabled(context):
        log(rule.log_level, "Rule is disabled: %s", rule)
        return None

    log(rule.log_level, "Checking rule condition: %s", rule)
    when_response = rule.when(matches, context)
    if not when_response:
        return None

    log(rule.log_level, "Rule was triggered: %s", when_response)
    log(rule.log_level, "Running rule consequence: %s %s", rule, when_response)
    rule.then(matches, when_response, context)
    return when_response
339 | 
def toposort_rules(rules):
    """
    Sort given rules using toposort with dependency parameter.

    A dependency may be a single item or an iterable of items. Dependencies given as classes are
    resolved to the instance of that class found in ``rules``; those without such an instance are
    ignored.

    :param rules: rule instances, at most one per class
    :type rules:
    :return: result of ``toposort`` applied to the dependency graph
    :rtype:
    :raise ValueError: when two rules share the same class
    """
    by_class = {}
    for rule in rules:
        if rule.__class__ in by_class:
            raise ValueError(f"Duplicate class rules are not allowed: {rule.__class__}")
        by_class[rule.__class__] = rule

    graph = {}
    for rule in rules:
        declared = rule.dependency
        if not is_iterable(declared) and declared:
            declared = [declared]

        resolved = set()
        for dependency in declared or ():
            if inspect.isclass(dependency):
                dependency = by_class.get(dependency)
            if dependency:
                resolved.add(dependency)
        graph[rule] = resolved
    return toposort(graph)
368 | 


--------------------------------------------------------------------------------
/rebulk/test/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # pylint: disable=pointless-statement, missing-docstring
4 | 


--------------------------------------------------------------------------------
/rebulk/test/default_rules_module.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition
 4 | from ..match import Match
 5 | from ..rules import Rule, RemoveMatch, AppendMatch, RenameMatch, AppendTags, RemoveTags
 6 | 
 7 | 
 8 | class RuleRemove0(Rule):
 9 |     consequence = RemoveMatch
10 |     def when(self, matches, context):
11 |         return matches[0]
12 | 
13 | 
class RuleAppend0(Rule):
    """Fixture: AppendMatch given as an instance, fed with a single new match."""
    consequence = AppendMatch()

    def when(self, matches, context):
        new_match = Match(5, 10)
        return new_match
18 | 
class RuleRename0(Rule):
    """Fixture: RenameMatch instance inside a list, fed with a list holding one match."""
    consequence = [RenameMatch('renamed')]

    def when(self, matches, context):
        original = Match(5, 10, name="original")
        return [original]
23 | 
class RuleRemove1(Rule):
    """Fixture: RemoveMatch instance inside a list, fed with a list holding the first match."""
    consequence = [RemoveMatch()]

    def when(self, matches, context):
        first_match = matches[0]
        return [first_match]
28 | 
class RuleAppend1(Rule):
    """Fixture: AppendMatch class inside a list, fed with a list holding one new match."""
    consequence = [AppendMatch]

    def when(self, matches, context):
        new_match = Match(5, 10)
        return [new_match]
33 | 
class RuleRename1(Rule):
    """Fixture: single RenameMatch instance, fed with a list holding one match."""
    consequence = RenameMatch('renamed')

    def when(self, matches, context):
        original = Match(5, 10, name="original")
        return [original]
38 | 
class RuleAppend2(Rule):
    """Fixture: naming AppendMatch instance inside a list, with declared properties."""
    consequence = [AppendMatch('renamed')]
    properties = {'renamed': [None]}

    def when(self, matches, context):
        new_match = Match(5, 10)
        return [new_match]
44 | 
class RuleRename2(Rule):
    """Fixture: single RenameMatch instance, fed with a single match."""
    consequence = RenameMatch('renamed')

    def when(self, matches, context):
        original = Match(5, 10, name="original")
        return original
49 | 
class RuleAppend3(Rule):
    """Fixture: single naming AppendMatch instance, with declared properties."""
    consequence = AppendMatch('renamed')
    properties = {'renamed': [None]}

    def when(self, matches, context):
        new_match = Match(5, 10)
        return [new_match]
55 | 
class RuleRename3(Rule):
    """Fixture: RenameMatch instance inside a list, fed with a single match."""
    consequence = [RenameMatch('renamed')]

    def when(self, matches, context):
        original = Match(5, 10, name="original")
        return original
60 | 
class RuleAppendTags0(Rule):
    """Fixture: adds 'new-tag' to the result of ``matches.named('tags', 0)``."""
    consequence = AppendTags(['new-tag'])

    def when(self, matches, context):
        tagged = matches.named('tags', 0)
        return tagged
65 | 
class RuleRemoveTags0(Rule):
    """Fixture: removes 'new-tag' from the result of ``matches.named('tags', 0)``."""
    consequence = RemoveTags(['new-tag'])

    def when(self, matches, context):
        tagged = matches.named('tags', 0)
        return tagged
70 | 
class RuleAppendTags1(Rule):
    """Fixture: adds 'new-tag' to the result of ``matches.named('tags')``."""
    consequence = AppendTags(['new-tag'])

    def when(self, matches, context):
        tagged = matches.named('tags')
        return tagged
75 | 
class RuleRemoveTags1(Rule):
    """Fixture: removes 'new-tag' from the result of ``matches.named('tags')``."""
    consequence = RemoveTags(['new-tag'])

    def when(self, matches, context):
        tagged = matches.named('tags')
        return tagged
80 | 


--------------------------------------------------------------------------------
/rebulk/test/rebulk_rules_module.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition
 4 | from rebulk.rules import Rule, RemoveMatch, CustomRule
 5 | 
 6 | 
 7 | class RemoveAllButLastYear(Rule):
 8 |     consequence = RemoveMatch
 9 |     def when(self, matches, context):
10 |         entries = matches.named('year')
11 |         return entries[:-1]
12 | 
13 | 
class PrefixedSuffixedYear(CustomRule):
    """Fixture: removes 'year' matches having neither a 'yearPrefix' before nor a 'yearSuffix' after."""

    def when(self, matches, context):
        return [year for year in matches.named('year')
                if not matches.previous(year, lambda p: p.name == 'yearPrefix')
                and not matches.next(year, lambda n: n.name == 'yearSuffix')]

    def then(self, matches, when_response, context):
        for orphan_year in when_response:
            matches.remove(orphan_year)
27 | 
28 | 
class PrefixedSuffixedYearNoLambda(Rule):
    """Fixture: same selection as PrefixedSuffixedYear, filtering neighbours without predicates."""
    consequence = RemoveMatch

    def when(self, matches, context):
        orphans = []
        for year in matches.named('year'):
            if any(m.name == 'yearPrefix' for m in matches.previous(year)):
                continue
            if any(m.name == 'yearSuffix' for m in matches.next(year)):
                continue
            orphans.append(year)
        return orphans
39 | 


--------------------------------------------------------------------------------
/rebulk/test/rules_module.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition
 4 | from ..match import Match
 5 | from ..rules import Rule
 6 | 
 7 | 
 8 | class Rule3(Rule):
 9 |     def when(self, matches, context):
10 |         return context.get('when')
11 | 
12 |     def then(self, matches, when_response, context):
13 |         assert when_response in [True, False]
14 |         matches.append(Match(3, 4))
15 | 
16 | 
class Rule2(Rule):
    """Always-triggered test rule, declared as depending on Rule3, appending Match(3, 4)."""
    dependency = Rule3

    def when(self, matches, context):
        return True

    def then(self, matches, when_response, context):
        assert when_response
        appended = Match(3, 4)
        matches.append(appended)
26 | 
27 | 
class Rule1(Rule):
    """Always-triggered test rule, declared as depending on Rule2, clearing all matches."""
    dependency = Rule2

    def when(self, matches, context):
        return True

    def then(self, matches, when_response, context):
        assert when_response
        # Unlike the sibling rules, this one empties the matches container.
        matches.clear()
37 | 
38 | 
class Rule0(Rule):
    """Always-triggered test rule, declared as depending on Rule1, appending Match(3, 4)."""
    dependency = Rule1

    def when(self, matches, context):
        return True

    def then(self, matches, when_response, context):
        assert when_response
        appended = Match(3, 4)
        matches.append(appended)
48 | 
49 | 
class Rule1Disabled(Rule1):
    """Variant of Rule1 whose ``enabled`` hook always answers False."""
    name = "Disabled Rule1"

    def enabled(self, context):
        # Never enabled, whatever the context holds.
        return False
55 | 


--------------------------------------------------------------------------------
/rebulk/test/test_chain.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition, cyclic-import
  4 | import re
  5 | from functools import partial
  6 | 
  7 | from rebulk.pattern import FunctionalPattern, StringPattern, RePattern
  8 | from ..rebulk import Rebulk
  9 | from ..validators import chars_surround
 10 | 
 11 | 
 12 | def test_chain_close():
 13 |     rebulk = Rebulk()
 14 |     ret = rebulk.chain().close()
 15 | 
 16 |     assert ret == rebulk
 17 |     assert len(rebulk.effective_patterns()) == 1
 18 | 
 19 | 
 20 | def test_build_chain():
 21 |     rebulk = Rebulk()
 22 | 
 23 |     def digit(input_string):
 24 |         i = input_string.find("1849")
 25 |         if i > -1:
 26 |             return i, i + len("1849")
 27 | 
 28 |     ret = rebulk.chain() \
 29 |         .functional(digit) \
 30 |         .string("test").repeater(2) \
 31 |         .string("x").repeater('{1,3}') \
 32 |         .string("optional").repeater('?') \
 33 |         .regex("f?x").repeater('+') \
 34 |         .close()
 35 | 
 36 |     assert ret == rebulk
 37 |     assert len(rebulk.effective_patterns()) == 1
 38 | 
 39 |     chain = rebulk.effective_patterns()[0]
 40 | 
 41 |     assert len(chain.parts) == 5
 42 | 
 43 |     assert isinstance(chain.parts[0].pattern, FunctionalPattern)
 44 |     assert chain.parts[0].repeater_start == 1
 45 |     assert chain.parts[0].repeater_end == 1
 46 | 
 47 |     assert isinstance(chain.parts[1].pattern, StringPattern)
 48 |     assert chain.parts[1].repeater_start == 2
 49 |     assert chain.parts[1].repeater_end == 2
 50 | 
 51 |     assert isinstance(chain.parts[2].pattern, StringPattern)
 52 |     assert chain.parts[2].repeater_start == 1
 53 |     assert chain.parts[2].repeater_end == 3
 54 | 
 55 |     assert isinstance(chain.parts[3].pattern, StringPattern)
 56 |     assert chain.parts[3].repeater_start == 0
 57 |     assert chain.parts[3].repeater_end == 1
 58 | 
 59 |     assert isinstance(chain.parts[4].pattern, RePattern)
 60 |     assert chain.parts[4].repeater_start == 1
 61 |     assert chain.parts[4].repeater_end is None
 62 | 
 63 | 
 64 | def test_chain_defaults():
 65 |     rebulk = Rebulk()
 66 |     rebulk.defaults(validator=lambda x: x.value.startswith('t'), ignore_names=['testIgnore'], children=True)
 67 | 
 68 |     rebulk.chain() \
 69 |         .regex("(?P<test>test)") \
 70 |         .regex(" ").repeater("*") \
 71 |         .regex("(?P<best>best)") \
 72 |         .regex(" ").repeater("*") \
 73 |         .regex("(?P<testIgnore>testIgnore)")
 74 |     matches = rebulk.matches("test best testIgnore")
 75 | 
 76 |     assert len(matches) == 1
 77 |     assert matches[0].name == "test"
 78 | 
 79 | 
 80 | def test_chain_with_validators():
 81 |     def chain_validator(match):
 82 |         return match.value.startswith('t') and match.value.endswith('t')
 83 | 
 84 |     def default_validator(match):
 85 |         return match.value.startswith('t') and match.value.endswith('g')
 86 | 
 87 |     def custom_validator(match):
 88 |         return match.value.startswith('b') and match.value.endswith('t')
 89 | 
 90 |     rebulk = Rebulk()
 91 |     rebulk.defaults(children=True, validator=default_validator)
 92 | 
 93 |     rebulk.chain(validate_all=True, validator={'__parent__': chain_validator}) \
 94 |         .regex("(?P<test>testing)", validator=default_validator).repeater("+") \
 95 |         .regex(" ").repeater("+") \
 96 |         .regex("(?P<best>best)", validator=custom_validator).repeater("+")
 97 |     matches = rebulk.matches("some testing best end")
 98 | 
 99 |     assert len(matches) == 2
100 |     assert matches[0].name == "test"
101 |     assert matches[1].name == "best"
102 | 
103 | 
def test_matches_docs():
    """Episode/version chain example, checked through the dict form of the matches."""
    rebulk = (
        Rebulk().regex_defaults(flags=re.IGNORECASE)
        .defaults(children=True, formatter={'episode': int, 'version': int})
        .chain()
        .regex(r'e(?P<episode>\d{1,4})').repeater(1)
        .regex(r'v(?P<version>\d+)').repeater('?')
        .regex(r'[ex-](?P<episode>\d{1,4})').repeater('*')
        .close()
    )  # .repeater(1) could be omitted as it's the default behavior

    # Convert the matches to a plain dict before checking them.
    as_dict = rebulk.matches("This is E14v2-15-16-17").to_dict()

    assert 'episode' in as_dict
    assert as_dict['episode'] == [14, 15, 16, 17]
    assert 'version' in as_dict
    assert as_dict['version'] == 2
119 | 
120 | 
def test_matches():
    """Run one five-part chain against matching inputs and inputs expected to yield no match."""
    rebulk = Rebulk()

    def digit(input_string):
        position = input_string.find("1849")
        if position > -1:
            return position, position + len("1849")
        return None

    def assert_children_values(match, expected_values):
        # Only the leading children are checked, index by index.
        children = match.children
        for position, expected in enumerate(expected_values):
            assert children[position].value == expected

    input_string = "1849testtestxxfixfux_foxabc1849testtestxoptionalfoxabc"

    chain = (
        rebulk.chain()
        .functional(digit)
        .string("test").hidden().repeater(2)
        .string("x").hidden().repeater('{1,3}')
        .string("optional").hidden().repeater('?')
        .regex("f.?x", name='result').repeater('+')
        .close()
    )

    matches = chain.matches(input_string)

    assert len(matches) == 2
    assert_children_values(matches[0], ('1849', 'fix', 'fux'))
    assert_children_values(matches[1], ('1849', 'fox'))

    # Each of these inputs is expected to produce no match at all.
    unmatched_inputs = (
        "_1850testtestxoptionalfoxabc",
        "_1849testtesttesttestxoptionalfoxabc",
        "_1849testtestxxxxoptionalfoxabc",
        "_1849testtestoptionalfoxabc",
        "_1849testtestxoptionalabc",
    )
    for input_string in unmatched_inputs:
        matches = chain.matches(input_string)
        assert len(matches) == 0

    input_string = "_1849testtestxoptionalfaxabc"
    matches = chain.matches(input_string)

    assert len(matches) == 1
    assert_children_values(matches[0], ('1849', 'fax'))
185 | 
186 | 
def test_matches_2():
    """Episode/version chain: check name and value of each of the five returned matches."""
    rebulk = (
        Rebulk()
        .regex_defaults(flags=re.IGNORECASE)
        .defaults(children=True, formatter={'episode': int, 'version': int})
        .chain()
        .regex(r'e(?P<episode>\d{1,4})')
        .regex(r'v(?P<version>\d+)').repeater('?')
        .regex(r'[ex-](?P<episode>\d{1,4})').repeater('*')
        .close()
    )

    matches = rebulk.matches("This is E14v2-15E16x17")
    assert len(matches) == 5

    expected = (
        ('episode', 14),
        ('version', 2),
        ('episode', 15),
        ('episode', 16),
        ('episode', 17),
    )
    for position, (name, value) in enumerate(expected):
        assert matches[position].name == name
        assert matches[position].value == value
214 | 
215 | 
def test_matches_3():
    """Three season/episode chains registered on one Rebulk, checked on three inputs."""
    alt_dash = (r'@', r'[\W_]')  # abbreviation

    match_names = ['season', 'episode']
    other_names = ['screen_size', 'video_codec', 'audio_codec', 'audio_channels', 'container', 'date']

    def conflict_solver(match, other):
        if match.name in match_names and other.name in other_names:
            return match
        return '__default__'

    rebulk = Rebulk()
    rebulk.defaults(formatter={'season': int, 'episode': int},
                    tags=['SxxExx'],
                    abbreviations=[alt_dash],
                    private_names=['episodeSeparator', 'seasonSeparator'],
                    children=True,
                    private_parent=True,
                    conflict_solver=conflict_solver)

    (
        rebulk.chain()
        .defaults(children=True, private_parent=True)
        .regex(r'(?P<season>\d+)@?x@?(?P<episode>\d+)')
        .regex(r'(?P<episodeSeparator>x|-|\+|&)(?P<episode>\d+)').repeater('*')
        .close()
        .chain()
        .defaults(children=True, private_parent=True)
        .regex(r'S(?P<season>\d+)@?(?:xE|Ex|E|x)@?(?P<episode>\d+)')
        .regex(r'(?:(?P<episodeSeparator>xE|Ex|E|x|-|\+|&)(?P<episode>\d+))').repeater('*')
        .close()
        .chain()
        .defaults(children=True, private_parent=True)
        .regex(r'S(?P<season>\d+)')
        .regex(r'(?P<seasonSeparator>S|-|\+|&)(?P<season>\d+)').repeater('*')
    )

    # Input string -> expected (name, value) of each match, in order.
    scenarios = (
        ("test-01x02-03", (('season', 1), ('episode', 2), ('episode', 3))),
        ("test-S01E02-03", (('season', 1), ('episode', 2), ('episode', 3))),
        ("test-S01-02-03-04", (('season', 1), ('season', 2), ('season', 3), ('season', 4))),
    )
    for input_string, expected in scenarios:
        matches = rebulk.matches(input_string)
        assert len(matches) == len(expected)

        for position, (name, value) in enumerate(expected):
            assert matches[position].name == name
            assert matches[position].value == value
286 | 
287 | 
def test_matches_4():
    """Chain with a space-surround parent validator: three episode values are expected."""
    seps_surround = partial(chars_surround, " ")

    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)
    rebulk.defaults(validate_all=True, children=True)
    rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], private_parent=True)

    int_formatters = {'episode': int, 'version': int}
    (
        rebulk.chain(validator={'__parent__': seps_surround}, formatter=int_formatters)
        .defaults(formatter={'episode': int, 'version': int})
        .regex(r'e(?P<episode>\d{1,4})')
        .regex(r'v(?P<version>\d+)').repeater('?')
        .regex(r'(?P<episodeSeparator>e|x|-)(?P<episode>\d{1,4})').repeater('*')
    )

    matches = rebulk.matches("Some Series E01E02E03")
    assert len(matches) == 3

    for position, expected_value in enumerate((1, 2, 3)):
        assert matches[position].value == expected_value
308 | 
309 | 
def test_matches_5():
    """Chain with a '{2,3}' repeater and a parent validator: count matches for three inputs."""
    seps_surround = partial(chars_surround, " ")

    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)

    chain_options = dict(
        private_names=['episodeSeparator', 'seasonSeparator'],
        validate_all=True,
        validator={'__parent__': seps_surround},
        children=True,
        private_parent=True,
        formatter={'episode': int, 'version': int},
    )
    (
        rebulk.chain(**chain_options)
        .defaults(children=True, private_parent=True)
        .regex(r'e(?P<episode>\d{1,4})')
        .regex(r'v(?P<version>\d+)').repeater('?')
        .regex(r'(?P<episodeSeparator>e|x|-)(?P<episode>\d{1,4})').repeater('{2,3}')
    )

    expected_counts = (
        ("Some Series E01E02E03", 3),
        ("Some Series E01E02", 0),
        # Parent can't be validated, so no results at all
        ("Some Series E01E02E03E04E05E06", 0),
    )
    for input_string, count in expected_counts:
        matches = rebulk.matches(input_string)
        assert len(matches) == count
332 | 
333 | 
def test_matches_6():
    """Same chain shape as the '{2,3}' repeater test, but with ``validator=None`` in defaults."""
    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)
    rebulk.defaults(
        private_names=['episodeSeparator', 'seasonSeparator'],
        validate_all=True,
        validator=None,
        children=True,
        private_parent=True,
    )

    (
        rebulk.chain(formatter={'episode': int, 'version': int})
        .defaults(children=True, private_parent=True)
        .regex(r'e(?P<episode>\d{1,4})')
        .regex(r'v(?P<version>\d+)').repeater('?')
        .regex(r'(?P<episodeSeparator>e|x|-)(?P<episode>\d{1,4})').repeater('{2,3}')
    )

    expected_counts = (
        ("Some Series E01E02E03", 3),
        ("Some Series E01E02", 0),
        # No validator on parent, so it should give 4 episodes.
        ("Some Series E01E02E03E04E05E06", 4),
    )
    for input_string, count in expected_counts:
        matches = rebulk.matches(input_string)
        assert len(matches) == count
354 | 
355 | 
def test_matches_7():
    """
    Season chain validated with separator-surround validators.

    For each input, both the number of matches and the season numbers are
    checked. The season values used to be *assigned* (``matches[0].value = 1``)
    instead of asserted, so they were never verified.
    """
    seps_surround = partial(chars_surround, ' .-/')
    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)
    rebulk.defaults(children=True, private_parent=True)

    rebulk.chain(). \
        regex(r'S(?P<season>\d+)', validate_all=True, validator={'__parent__': seps_surround}). \
        regex(r'[ -](?P<season>\d+)', validator=seps_surround).repeater('*')

    # Input string -> expected season numbers, in match order.
    expectations = (
        ("Some S01", [1]),
        ("Some S01-02", [1, 2]),
        ("programs4/Some S01-02", [1, 2]),
        ("programs4/SomeS01middle.S02-03.andS04here", [2, 3]),
        ("Some 02.and.S04-05.here", [4, 5]),
    )
    for input_string, seasons in expectations:
        matches = rebulk.matches(input_string)
        assert len(matches) == len(seasons)
        # No formatter is configured in this test, so go through int() to
        # compare numerically whatever form the value takes.
        assert [int(match.value) for match in matches] == seasons
389 | 
390 | 
def test_chain_breaker():
    """
    A ``chain_breaker`` passed to ``chain()`` stops the chain when two
    consecutive seasons differ by more than 10.

    The season values are now asserted; they used to be assigned
    (``matches[0].value = 1``), which verified nothing.
    """
    def chain_breaker(matches):
        seasons = matches.named('season')
        if len(seasons) > 1:
            # Break when the gap between the last two seasons exceeds 10.
            if seasons[-1].value - seasons[-2].value > 10:
                return True
        return False

    seps_surround = partial(chars_surround, ' .-/')
    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)
    rebulk.defaults(children=True, private_parent=True, formatter={'season': int})

    rebulk.chain(chain_breaker=chain_breaker). \
        regex(r'S(?P<season>\d+)', validate_all=True, validator={'__parent__': seps_surround}). \
        regex(r'[ -](?P<season>\d+)', validator=seps_surround).repeater('*')

    matches = rebulk.matches("Some S01-02-03-50-51")
    assert len(matches) == 3
    assert matches[0].value == 1
    assert matches[1].value == 2
    assert matches[2].value == 3
413 | 
414 | 
def test_chain_breaker_defaults():
    """
    Same scenario as the explicit chain_breaker test, with the breaker
    supplied through ``Rebulk.defaults`` instead of ``chain()``.

    The season values are now asserted; they used to be assigned
    (``matches[0].value = 1``), which verified nothing.
    """
    def chain_breaker(matches):
        seasons = matches.named('season')
        if len(seasons) > 1:
            # Break when the gap between the last two seasons exceeds 10.
            if seasons[-1].value - seasons[-2].value > 10:
                return True
        return False

    seps_surround = partial(chars_surround, ' .-/')
    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)
    rebulk.defaults(chain_breaker=chain_breaker, children=True, private_parent=True, formatter={'season': int})

    rebulk.chain(). \
        regex(r'S(?P<season>\d+)', validate_all=True, validator={'__parent__': seps_surround}). \
        regex(r'[ -](?P<season>\d+)', validator=seps_surround).repeater('*')

    matches = rebulk.matches("Some S01-02-03-50-51")
    assert len(matches) == 3
    assert matches[0].value == 1
    assert matches[1].value == 2
    assert matches[2].value == 3
437 | 
438 | 
def test_chain_breaker_defaults2():
    """
    Same scenario as the explicit chain_breaker test, with the breaker
    supplied through ``Rebulk.chain_defaults``.

    The season values are now asserted; they used to be assigned
    (``matches[0].value = 1``), which verified nothing.
    """
    def chain_breaker(matches):
        seasons = matches.named('season')
        if len(seasons) > 1:
            # Break when the gap between the last two seasons exceeds 10.
            if seasons[-1].value - seasons[-2].value > 10:
                return True
        return False

    seps_surround = partial(chars_surround, ' .-/')
    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE)
    rebulk.chain_defaults(chain_breaker=chain_breaker)
    rebulk.defaults(children=True, private_parent=True, formatter={'season': int})

    rebulk.chain(). \
        regex(r'S(?P<season>\d+)', validate_all=True, validator={'__parent__': seps_surround}). \
        regex(r'[ -](?P<season>\d+)', validator=seps_surround).repeater('*')

    matches = rebulk.matches("Some S01-02-03-50-51")
    assert len(matches) == 3
    assert matches[0].value == 1
    assert matches[1].value == 2
    assert matches[2].value == 3
462 | 


--------------------------------------------------------------------------------
/rebulk/test/test_debug.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # pylint: disable=pointless-statement, missing-docstring, protected-access, invalid-name, len-as-condition
 4 | 
 5 | from .default_rules_module import RuleRemove0
 6 | from .. import debug
 7 | from ..match import Match
 8 | from ..pattern import StringPattern
 9 | from ..rebulk import Rebulk
10 | 
11 | 
class TestDebug:
    """
    Checks the ``defined_at`` information exposed by patterns, matches, rules
    and Rebulk patterns.

    All fixtures are built in the class body, between ``debug.DEBUG = True``
    and ``debug.DEBUG = False``, i.e. when the class statement itself runs.
    """
    # request.addfinalizer(disable_debug)
    # NOTE(review): the line above looks like a leftover from a fixture-based
    # setup; nothing in this class uses `request` — confirm it can be removed.

    # Fixtures are created while DEBUG is on — presumably so that `defined_at`
    # gets populated; confirm against rebulk.debug.
    debug.DEBUG = True
    pattern = StringPattern(1, 3, value="es")

    match = Match(1, 3, value="es")
    rule = RuleRemove0()

    input_string = "This is a debug test"
    rebulk = Rebulk().string("debug") \
        .string("is")

    matches = rebulk.matches(input_string)
    # Switch DEBUG back off once every fixture exists.
    debug.DEBUG = False

    @classmethod
    def setup_class(cls):
        """Turn debug mode on for the duration of the tests of this class."""
        debug.DEBUG = True

    @classmethod
    def teardown_class(cls):
        """Turn debug mode back off once the tests of this class are done."""
        debug.DEBUG = False

    def test_pattern(self):
        """The class-level pattern reports this module/file in ``defined_at`` and in its repr."""
        assert self.pattern.defined_at.lineno > 0
        assert self.pattern.defined_at.name == 'rebulk.test.test_debug'
        assert self.pattern.defined_at.filename.endswith('test_debug.py')

        assert str(self.pattern.defined_at).startswith('test_debug.py#L')
        assert repr(self.pattern).startswith('<StringPattern@test_debug.py#L')

    def test_match(self):
        """The class-level match reports this module/file in ``defined_at``."""
        assert self.match.defined_at.lineno > 0
        assert self.match.defined_at.name == 'rebulk.test.test_debug'
        assert self.match.defined_at.filename.endswith('test_debug.py')

        assert str(self.match.defined_at).startswith('test_debug.py#L')

    def test_rule(self):
        """The class-level rule reports this module/file in ``defined_at`` and in its repr."""
        assert self.rule.defined_at.lineno > 0
        assert self.rule.defined_at.name == 'rebulk.test.test_debug'
        assert self.rule.defined_at.filename.endswith('test_debug.py')

        assert str(self.rule.defined_at).startswith('test_debug.py#L')
        assert repr(self.rule).startswith('<RuleRemove0@test_debug.py#L')

    def test_rebulk(self):
        """Both patterns registered on the Rebulk, and the matches they produced, carry ``defined_at``."""
        assert self.rebulk._patterns[0].defined_at.lineno > 0
        assert self.rebulk._patterns[0].defined_at.name == 'rebulk.test.test_debug'
        assert self.rebulk._patterns[0].defined_at.filename.endswith('test_debug.py')

        assert str(self.rebulk._patterns[0].defined_at).startswith('test_debug.py#L')

        assert self.rebulk._patterns[1].defined_at.lineno > 0
        assert self.rebulk._patterns[1].defined_at.name == 'rebulk.test.test_debug'
        assert self.rebulk._patterns[1].defined_at.filename.endswith('test_debug.py')

        assert str(self.rebulk._patterns[1].defined_at).startswith('test_debug.py#L')

        # Each match is expected to share `defined_at` with the pattern at the same index.
        assert self.matches[0].defined_at == self.rebulk._patterns[0].defined_at  # pylint: disable=no-member
        assert self.matches[1].defined_at == self.rebulk._patterns[1].defined_at  # pylint: disable=no-member

    def test_repr(self):
        """Smoke test: converting the matches to ``str`` must not raise."""
        str(self.matches)
77 | 


--------------------------------------------------------------------------------
/rebulk/test/test_introspector.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Introspector tests
  5 | """
  6 | # pylint: disable=pointless-statement,missing-docstring,protected-access,invalid-name,len-as-condition
  7 | from ..rebulk import Rebulk
  8 | from .. import introspector
  9 | from .default_rules_module import RuleAppend2, RuleAppend3
 10 | 
 11 | 
def test_string_introspector():
    """
    Introspect a Rebulk holding two named string patterns and check the
    per-pattern and aggregated properties.
    """
    rebulk = Rebulk().string('One', 'Two', 'Three', name='first').string('1', '2', '3', name='second')

    introspected = introspector.introspect(rebulk, None)

    assert len(introspected.patterns) == 2

    first_properties = introspected.patterns[0].properties
    assert len(first_properties) == 1
    # NOTE(review): bare comparison, its result is discarded — this line
    # asserts nothing. Confirm the expected value before adding `assert`.
    first_properties['first'] == ['One', 'Two', 'Three']

    second_properties = introspected.patterns[1].properties
    assert len(second_properties) == 1
    # NOTE(review): bare comparison, same remark as above.
    second_properties['second'] == ['1', '2', '3']

    # Aggregated properties must agree with the per-pattern ones.
    properties = introspected.properties
    assert len(properties) == 2
    assert properties['first'] == first_properties['first']
    assert properties['second'] == second_properties['second']
 31 | 
 32 | 
def test_string_properties():
    """
    Introspect two string patterns declaring an explicit ``properties``
    mapping under the same 'custom' key.
    """
    rebulk = Rebulk()\
        .string('One', 'Two', 'Three', name='first', properties={'custom': ['One']})\
        .string('1', '2', '3', name='second', properties={'custom': [1]})

    introspected = introspector.introspect(rebulk, None)

    assert len(introspected.patterns) == 2
    # No rule is registered explicitly here, yet two are expected —
    # presumably Rebulk's default rules; verify against Rebulk.
    assert len(introspected.rules) == 2

    first_properties = introspected.patterns[0].properties
    assert len(first_properties) == 1
    # NOTE(review): bare comparison, its result is discarded — this line
    # asserts nothing. Confirm the expected value before adding `assert`.
    first_properties['custom'] == ['One']

    second_properties = introspected.patterns[1].properties
    assert len(second_properties) == 1
    # NOTE(review): bare comparison, same remark as above.
    second_properties['custom'] == [1]

    # Both patterns contribute to the single aggregated 'custom' entry.
    properties = introspected.properties
    assert len(properties) == 1
    assert properties['custom'] == ['One', 1]
 54 | 
 55 | 
def test_various_pattern():
    """
    Introspect a Rebulk mixing regex, string and functional patterns.

    NOTE(review): every ``xxx_properties[...] == [...]`` line below is a bare
    comparison whose result is discarded, so those lines assert nothing. The
    expected values they spell out are unverified — confirm each one before
    turning it into an ``assert``.
    """
    rebulk = Rebulk()\
        .regex('One', 'Two', 'Three', name='first', value="string") \
        .string('1', '2', '3', name='second', value="digit") \
        .string('4', '5', '6', name='third') \
        .string('private', private=True) \
        .functional(lambda string: (0, 5), name='func', value='test') \
        .regex('One', 'Two', 'Three', name='regex_name') \
        .regex('(?P<one>One)(?P<two>Two)(?P<three>Three)') \
        .functional(lambda string: (6, 10), name='func2') \
        .string('7', name='third')

    introspected = introspector.introspect(rebulk, None)

    # Nine patterns are declared above but eight are expected — presumably the
    # private=True one is left out; verify against the introspector.
    assert len(introspected.patterns) == 8
    assert len(introspected.rules) == 2

    first_properties = introspected.patterns[0].properties
    assert len(first_properties) == 1
    first_properties['first'] == ['string']  # not asserted

    second_properties = introspected.patterns[1].properties
    assert len(second_properties) == 1
    second_properties['second'] == ['digit']  # not asserted

    third_properties = introspected.patterns[2].properties
    assert len(third_properties) == 1
    third_properties['third'] == ['4', '5', '6']  # not asserted

    func_properties = introspected.patterns[3].properties
    assert len(func_properties) == 1
    func_properties['func'] == ['test']  # not asserted

    regex_name_properties = introspected.patterns[4].properties
    assert len(regex_name_properties) == 1
    regex_name_properties['regex_name'] == [None]  # not asserted

    regex_groups_properties = introspected.patterns[5].properties
    assert len(regex_groups_properties) == 3
    regex_groups_properties['one'] == [None]  # not asserted
    regex_groups_properties['two'] == [None]  # not asserted
    regex_groups_properties['three'] == [None]  # not asserted

    func2_properties = introspected.patterns[6].properties
    assert len(func2_properties) == 1
    func2_properties['func2'] == [None]  # not asserted

    append_third_properties = introspected.patterns[7].properties
    assert len(append_third_properties) == 1
    append_third_properties['third'] == [None]  # not asserted

    # Aggregated properties must agree with the per-pattern ones; 'third' is
    # fed by two patterns, hence the concatenation.
    properties = introspected.properties
    assert len(properties) == 9
    assert properties['first'] == first_properties['first']
    assert properties['second'] == second_properties['second']
    assert properties['third'] == third_properties['third'] + append_third_properties['third']
    assert properties['func'] == func_properties['func']
    assert properties['regex_name'] == regex_name_properties['regex_name']
    assert properties['one'] == regex_groups_properties['one']
    assert properties['two'] == regex_groups_properties['two']
    assert properties['three'] == regex_groups_properties['three']
    assert properties['func2'] == func2_properties['func2']
118 | 
119 | 
def test_rule_properties():
    """Introspect a Rebulk holding only RuleAppend2 and RuleAppend3 (default rules disabled)."""
    rebulk = Rebulk(default_rules=False).rules(RuleAppend2, RuleAppend3)

    introspected = introspector.introspect(rebulk, None)

    assert len(introspected.rules) == 2
    assert len(introspected.patterns) == 0

    # Both rules are expected to expose the same single 'renamed' property.
    for rule_index in (0, 1):
        rule_properties = introspected.rules[rule_index].properties
        assert len(rule_properties) == 1
        assert rule_properties['renamed'] == [None]

    aggregated = introspected.properties
    assert len(aggregated) == 1
    assert aggregated['renamed'] == [None]
139 | 


--------------------------------------------------------------------------------
/rebulk/test/test_loose.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, len-as-condition
 4 | 
 5 | from ..loose import call
 6 | 
 7 | 
 8 | def test_loose_function():
 9 | 
10 |     def func(v1, v2, v3=3, v4=4):
11 |         return v1 + v2 + v3 + v4
12 | 
13 |     assert call(func, 1, 2) == func(1, 2)
14 |     assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5)
15 |     assert call(func, 1, 2, v3=4, v4=5) == func(1, 2, v3=4, v4=5)
16 |     assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4)
17 |     assert call(func, 1, 2, 3, 4, more=5) == func(1, 2, 3, 4)
18 | 
19 | 
def test_loose_varargs_function():
    """
    ``call`` on a function taking ``*args``.

    The helper sums v1, v2 and the first two extra arguments, falling back to
    3 and 4 when they are missing — the same contract as the varargs class
    test. The original one-line return was parsed as
    ``(v1 + v2 + args[0]) if args else (...)`` because a conditional
    expression binds looser than ``+``, so the fourth term was never added.
    """
    def func(v1, v2, *args):
        v3 = args[0] if len(args) > 0 else 3
        v4 = args[1] if len(args) > 1 else 4
        return v1 + v2 + v3 + v4

    assert call(func, 1, 2) == func(1, 2)
    assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5)
    # func ignores anything past the second extra argument.
    assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4)
27 | 
28 | 
def test_loose_kwargs_function():
    """``call`` on a function taking ``**kwargs``, invoked with keyword arguments only."""
    def func(v1, v2, **kwargs):
        v3 = kwargs.get('v3', 3)
        v4 = kwargs.get('v4', 4)
        return v1 + v2 + v3 + v4

    keyword_sets = (
        {'v1': 1, 'v2': 2},
        {'v1': 1, 'v2': 2, 'v3': 3, 'v4': 5},
    )
    for keywords in keyword_sets:
        assert call(func, **keywords) == func(**keywords)
35 | 
36 | 
def test_loose_class():
    """``call`` on a class whose constructor has defaults: extra arguments are not passed on."""
    class Dummy:
        def __init__(self, v1, v2, v3=3, v4=4):
            self.v1, self.v2 = v1, v2
            self.v3, self.v4 = v3, v4

        def call(self):
            return sum((self.v1, self.v2, self.v3, self.v4))

    # (positional args, keyword args given to call, instance built directly)
    cases = (
        ((1, 2), {}, Dummy(1, 2)),
        ((1, 2, 3, 5), {}, Dummy(1, 2, 3, 5)),
        ((1, 2), {'v3': 4, 'v4': 5}, Dummy(1, 2, v3=4, v4=5)),
        ((1, 2, 3, 4, 5), {}, Dummy(1, 2, 3, 4)),
        ((1, 2, 3, 4), {'more': 5}, Dummy(1, 2, 3, 4)),
    )
    for args, kwargs, reference in cases:
        assert call(Dummy, *args, **kwargs).call() == reference.call()
53 | 
54 | 
def test_loose_varargs_class():
    """``call`` on a class whose constructor takes ``*args``."""
    class Dummy:
        def __init__(self, v1, v2, *args):
            self.v1 = v1
            self.v2 = v2
            # First two extra arguments, defaulting to 3 and 4 when absent.
            self.v3 = args[0] if len(args) > 0 else 3
            self.v4 = args[1] if len(args) > 1 else 4

        def call(self):
            return sum((self.v1, self.v2, self.v3, self.v4))

    # (args given to call, instance built directly)
    cases = (
        ((1, 2), Dummy(1, 2)),
        ((1, 2, 3, 5), Dummy(1, 2, 3, 5)),
        ((1, 2, 3, 4, 5), Dummy(1, 2, 3, 4)),
    )
    for args, reference in cases:
        assert call(Dummy, *args).call() == reference.call()
69 | 
70 | 
def test_loose_kwargs_class():
    """``call`` on a class whose constructor takes ``**kwargs``, invoked with keywords only."""
    class Dummy:
        def __init__(self, v1, v2, **kwargs):
            self.v1 = v1
            self.v2 = v2
            self.v3 = kwargs.get('v3', 3)
            self.v4 = kwargs.get('v4', 4)

        def call(self):
            return sum((self.v1, self.v2, self.v3, self.v4))

    keyword_sets = (
        {'v1': 1, 'v2': 2},
        {'v1': 1, 'v2': 2, 'v3': 3, 'v4': 5},
    )
    for keywords in keyword_sets:
        assert call(Dummy, **keywords).call() == Dummy(**keywords).call()
84 | 


--------------------------------------------------------------------------------
/rebulk/test/test_match.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # pylint: disable=pointless-statement, missing-docstring, unneeded-not, len-as-condition
  4 | 
  5 | import pytest
  6 | 
  7 | from ..match import Match, Matches
  8 | from ..pattern import StringPattern, RePattern
  9 | from ..formatters import formatters
 10 | 
 11 | 
 12 | class TestMatchClass:
 13 |     def test_repr(self):
 14 |         match1 = Match(1, 3, value="es")
 15 | 
 16 |         assert repr(match1) == '<es:(1, 3)>'
 17 | 
 18 |         match2 = Match(0, 4, value="test", private=True, name="abc", tags=['one', 'two'])
 19 | 
 20 |         assert repr(match2) == '<test:(0, 4)+private+name=abc+tags=[\'one\', \'two\']>'
 21 | 
 22 |     def test_names(self):
 23 |         parent = Match(0, 10, name="test")
 24 |         parent.children.append(Match(0, 10, name="child1", parent=parent))
 25 |         parent.children.append(Match(0, 10, name="child2", parent=parent))
 26 | 
 27 |         assert set(parent.names) == set(["child1", "child2"])
 28 | 
 29 |     def test_equality(self):
 30 |         match1 = Match(1, 3, value="es")
 31 |         match2 = Match(1, 3, value="es")
 32 | 
 33 |         other = object()
 34 | 
 35 |         assert hash(match1) == hash(match2)
 36 |         assert hash(match1) != hash(other)
 37 | 
 38 |         assert match1 == match2
 39 |         assert not match1 == other
 40 | 
 41 |     def test_inequality(self):
 42 |         match1 = Match(0, 2, value="te")
 43 |         match2 = Match(2, 4, value="st")
 44 |         match3 = Match(0, 2, value="other")
 45 | 
 46 |         other = object()
 47 | 
 48 |         assert hash(match1) != hash(match2)
 49 |         assert hash(match1) != hash(match3)
 50 | 
 51 |         assert match1 != other
 52 |         assert match1 != match2
 53 |         assert match1 != match3
 54 | 
 55 |     def test_length(self):
 56 |         match1 = Match(0, 4, value="test")
 57 |         match2 = Match(0, 2, value="spanIsUsed")
 58 | 
 59 |         assert len(match1) == 4
 60 |         assert len(match2) == 2
 61 | 
 62 |     def test_compare(self):
 63 |         match1 = Match(0, 2, value="te")
 64 |         match2 = Match(2, 4, value="st")
 65 | 
 66 |         other = object()
 67 | 
 68 |         assert match1 < match2
 69 |         assert match1 <= match2
 70 | 
 71 |         assert match2 > match1
 72 |         assert match2 >= match1
 73 | 
 74 |         with pytest.raises(TypeError):
 75 |             match1 < other
 76 | 
 77 |         with pytest.raises(TypeError):
 78 |             match1 <= other
 79 | 
 80 |         with pytest.raises(TypeError):
 81 |             match1 > other
 82 | 
 83 |         with pytest.raises(TypeError):
 84 |             match1 >= other
 85 | 
 86 | 
 87 |     def test_value(self):
 88 |         match1 = Match(1, 3)
 89 |         match1.value = "test"
 90 | 
 91 |         assert match1.value == "test"
 92 | 
 93 | 
 94 | class TestMatchesClass:
 95 |     match1 = Match(0, 2, value="te", name="start")
 96 |     match2 = Match(2, 3, value="s", tags="tag1")
 97 |     match3 = Match(3, 4, value="t", tags=["tag1", "tag2"])
 98 |     match4 = Match(2, 4, value="st", name="end")
 99 | 
100 |     def test_tag(self):
101 |         matches = Matches()
102 |         matches.append(self.match1)
103 |         matches.append(self.match2)
104 |         matches.append(self.match3)
105 |         matches.append(self.match4)
106 | 
107 |         assert "start" in matches.names
108 |         assert "end" in matches.names
109 | 
110 |         assert "tag1" in matches.tags
111 |         assert "tag2" in matches.tags
112 | 
113 |         assert self.match3.tagged("tag1")
114 |         assert not self.match3.tagged("start")
115 | 
116 |         tag1 = matches.tagged("tag1")
117 |         assert len(tag1) == 2
118 |         assert tag1[0] == self.match2
119 |         assert tag1[1] == self.match3
120 | 
121 |         tag2 = matches.tagged("tag2")
122 |         assert len(tag2) == 1
123 |         assert tag2[0] == self.match3
124 | 
125 |         start = matches.named("start")
126 |         assert len(start) == 1
127 |         assert start[0] == self.match1
128 | 
129 |         end = matches.named("end")
130 |         assert len(end) == 1
131 |         assert end[0] == self.match4
132 | 
133 |     def test_base(self):
134 |         matches = Matches()
135 |         matches.append(self.match1)
136 | 
137 |         assert len(matches) == 1
138 |         assert repr(matches) == repr([self.match1])
139 |         assert list(matches.starting(0)) == [self.match1]
140 |         assert list(matches.ending(2)) == [self.match1]
141 | 
142 |         matches.append(self.match2)
143 |         matches.append(self.match3)
144 |         matches.append(self.match4)
145 | 
146 |         assert len(matches) == 4
147 |         assert list(matches.starting(2)) == [self.match2, self.match4]
148 |         assert list(matches.starting(3)) == [self.match3]
149 |         assert list(matches.ending(3)) == [self.match2]
150 |         assert list(matches.ending(4)) == [self.match3, self.match4]
151 |         assert list(matches.range()) == [self.match1, self.match2, self.match4, self.match3]
152 |         assert list(matches.range(0)) == [self.match1, self.match2, self.match4, self.match3]
153 |         assert list(matches.range(0, 3)) == [self.match1, self.match2, self.match4]
154 |         assert list(matches.range(2, 3)) == [self.match2, self.match4]
155 |         assert list(matches.range(3, 4)) == [self.match4, self.match3]
156 | 
157 |         matches.remove(self.match1)
158 |         assert len(matches) == 3
159 |         assert len(matches.starting(0)) == 0
160 |         assert len(matches.ending(2)) == 0
161 | 
162 |         matches.clear()
163 | 
164 |         assert len(matches) == 0
165 |         assert len(matches.starting(0)) == 0
166 |         assert len(matches.starting(2)) == 0
167 |         assert len(matches.starting(3)) == 0
168 |         assert len(matches.ending(2)) == 0
169 |         assert len(matches.ending(3)) == 0
170 |         assert len(matches.ending(4)) == 0
171 | 
172 |     def test_get_slices(self):
173 |         matches = Matches()
174 |         matches.append(self.match1)
175 |         matches.append(self.match2)
176 |         matches.append(self.match3)
177 |         matches.append(self.match4)
178 | 
179 |         slice_matches = matches[1:3]
180 | 
181 |         assert isinstance(slice_matches, Matches)
182 | 
183 |         assert len(slice_matches) == 2
184 |         assert slice_matches[0] == self.match2
185 |         assert slice_matches[1] == self.match3
186 | 
187 |     def test_remove_slices(self):
188 |         matches = Matches()
189 |         matches.append(self.match1)
190 |         matches.append(self.match2)
191 |         matches.append(self.match3)
192 |         matches.append(self.match4)
193 | 
194 |         del matches[1:3]
195 | 
196 |         assert len(matches) == 2
197 |         assert matches[0] == self.match1
198 |         assert matches[1] == self.match4
199 | 
200 |     def test_set_slices(self):
201 |         matches = Matches()
202 |         matches.append(self.match1)
203 |         matches.append(self.match2)
204 |         matches.append(self.match3)
205 |         matches.append(self.match4)
206 | 
207 |         matches[1:3] = self.match1, self.match4
208 | 
209 |         assert len(matches) == 4
210 |         assert matches[0] == self.match1
211 |         assert matches[1] == self.match1
212 |         assert matches[2] == self.match4
213 |         assert matches[3] == self.match4
214 | 
215 |     def test_set_index(self):
216 |         matches = Matches()
217 |         matches.append(self.match1)
218 |         matches.append(self.match2)
219 |         matches.append(self.match3)
220 | 
221 |         matches[1] = self.match4
222 | 
223 |         assert len(matches) == 3
224 |         assert matches[0] == self.match1
225 |         assert matches[1] == self.match4
226 |         assert matches[2] == self.match3
227 | 
228 |     def test_constructor(self):
229 |         matches = Matches([self.match1, self.match2, self.match3, self.match4])
230 | 
231 |         assert len(matches) == 4
232 |         assert list(matches.starting(0)) == [self.match1]
233 |         assert list(matches.ending(2)) == [self.match1]
234 |         assert list(matches.starting(2)) == [self.match2, self.match4]
235 |         assert list(matches.starting(3)) == [self.match3]
236 |         assert list(matches.ending(3)) == [self.match2]
237 |         assert list(matches.ending(4)) == [self.match3, self.match4]
238 | 
239 |     def test_constructor_kwargs(self):
240 |         matches = Matches([self.match1, self.match2, self.match3, self.match4], input_string="test")
241 | 
242 |         assert len(matches) == 4
243 |         assert matches.input_string == "test"
244 |         assert list(matches.starting(0)) == [self.match1]
245 |         assert list(matches.ending(2)) == [self.match1]
246 |         assert list(matches.starting(2)) == [self.match2, self.match4]
247 |         assert list(matches.starting(3)) == [self.match3]
248 |         assert list(matches.ending(3)) == [self.match2]
249 |         assert list(matches.ending(4)) == [self.match3, self.match4]
250 | 
251 |     def test_crop(self):
252 |         input_string = "abcdefghijklmnopqrstuvwxyz"
253 | 
254 |         match1 = Match(1, 10, input_string=input_string)
255 |         match2 = Match(0, 2, input_string=input_string)
256 |         match3 = Match(8, 15, input_string=input_string)
257 | 
258 |         ret = match1.crop([match2, match3.span])
259 | 
260 |         assert len(ret) == 1
261 | 
262 |         assert ret[0].span == (2, 8)
263 |         assert ret[0].value == "cdefgh"
264 | 
265 |         ret = match1.crop((1, 10))
266 |         assert len(ret) == 0
267 | 
268 |         ret = match1.crop((1, 3))
269 |         assert len(ret) == 1
270 |         assert ret[0].span == (3, 10)
271 | 
272 |         ret = match1.crop((7, 10))
273 |         assert len(ret) == 1
274 |         assert ret[0].span == (1, 7)
275 | 
276 |         ret = match1.crop((0, 12))
277 |         assert len(ret) == 0
278 | 
279 |         ret = match1.crop((4, 6))
280 |         assert len(ret) == 2
281 | 
282 |         assert ret[0].span == (1, 4)
283 |         assert ret[1].span == (6, 10)
284 | 
285 |         ret = match1.crop([(3, 5), (7, 9)])
286 |         assert len(ret) == 3
287 | 
288 |         assert ret[0].span == (1, 3)
289 |         assert ret[1].span == (5, 7)
290 |         assert ret[2].span == (9, 10)
291 | 
292 |     def test_split(self):
293 |         input_string = "123 +word1  -  word2  + word3  456"
294 |         match = Match(3, len(input_string) - 3, input_string=input_string)
295 |         splitted = match.split(" -+")
296 | 
297 |         assert len(splitted) == 3
298 |         assert [split.value for split in splitted] == ["word1", "word2", "word3"]
299 | 
300 | 
class TestMaches:
    """Checks for ``Matches`` filled from real patterns: filters, raw, dict, chains, holes."""

    @staticmethod
    def _numbered_matches(input_string):
        """Return ``Matches`` for "One"/"Two"/"Three", each as a string and a regex pattern."""
        pattern_specs = (
            (StringPattern, "One", "1-str", ["One", "str"]),
            (RePattern, "One", "1-re", ["One", "re"]),
            (StringPattern, "Two", "2-str", ["Two", "str"]),
            (RePattern, "Two", "2-re", ["Two", "re"]),
            (StringPattern, "Three", "3-str", ["Three", "str"]),
            (RePattern, "Three", "3-re", ["Three", "re"]),
        )
        collected = Matches()
        for pattern_class, expression, name, tags in pattern_specs:
            collected.extend(pattern_class(expression, name=name, tags=tags).matches(input_string))
        return collected

    def test_names(self):
        """``names`` gathers the name of every pattern that produced a match."""
        matches = self._numbered_matches("One Two Three")

        assert set(matches.names) == {"1-str", "1-re", "2-str", "2-re", "3-str", "3-re"}

    def test_filters(self):
        """Selection methods honour predicates and indexes, positional or keyword."""
        matches = self._numbered_matches("One Two Three")

        def has_str(m):
            return "str" in m.tags

        def has_re(m):
            return "re" in m.tags

        def pattern_names(found):
            return [item.pattern.name for item in found]

        at_start = matches.starting(0)
        assert len(at_start) == 2

        at_start = matches.starting(0, has_str)
        assert pattern_names(at_start) == ["1-str"]

        at_end = matches.ending(7, predicate=has_str)
        assert pattern_names(at_end) == ["2-str"]

        before = matches.previous(matches.named("2-str")[0])
        assert pattern_names(before) == ["1-str", "1-re"]

        before = matches.previous(matches.named("2-str", 0), has_str)
        assert pattern_names(before) == ["1-str"]

        after = matches.next(matches.named("2-str", 0))
        assert pattern_names(after) == ["3-str", "3-re"]

        # With an index the call yields a single match instead of a list.
        single = matches.next(matches.named("2-str", 0), index=0, predicate=has_re)
        assert single is not None
        assert single.pattern.name == "3-re"

        after = matches.next(matches.named("2-str", index=0), has_re)
        assert pattern_names(after) == ["3-re"]

        by_name = matches.named("2-str", has_re)
        assert len(by_name) == 0

        single = matches.named("2-re", has_re, 0)
        assert single is not None
        assert single.name == "2-re"  # pylint:disable=no-member

        by_name = matches.named("2-re", has_re)
        assert len(by_name) == 1
        assert by_name[0].name == "2-re"

        # An index past the end yields None.
        single = matches.named("2-re", has_re, index=1000)
        assert single is None

    def test_raw(self):
        """``raw`` is the unformatted text; ``raw_start``/``raw_end`` narrow it."""
        input_string = "0123456789"

        match = Match(0, 10, input_string=input_string, formatter=lambda s: s*2)

        def check(expected_raw):
            assert match.value == match.raw * 2
            assert match.raw == expected_raw

        check(input_string)

        match.raw_end = 9
        match.raw_start = 1
        check(input_string[1:9])

        # Resetting both bounds restores the full raw text.
        match.raw_end = None
        match.raw_start = None
        check(input_string)

    def test_formatter_chain(self):
        """``formatters`` applies its callables in the order they are given."""
        input_string = "100"

        chained = formatters(int, lambda s: s*2, lambda  s: s+10)
        match = Match(0, 3, input_string=input_string, formatter=chained)

        assert match.raw == input_string
        assert match.value == 100 * 2 + 10

    def test_to_dict(self):
        """``to_dict`` output with ``first_value``, ``enforce_list`` and ``details``."""
        input_string = "One Two Two Three"

        pattern_specs = (
            (StringPattern, "One", dict(name="1", tags=["One", "str"])),
            (RePattern, "One", dict(name="1", tags=["One", "re"])),
            (StringPattern, "Two", dict(name="2", tags=["Two", "str"])),
            (RePattern, "Two", dict(name="2", tags=["Two", "re"])),
            (RePattern, "Two", dict(name="2", tags=["Two", "reBis"])),
            (StringPattern, "Three", dict(name="3", tags=["Three", "str"])),
            (RePattern, "Three", dict(name="3bis", tags=["Three", "re"])),
            (RePattern, r"(\w+)", dict(name="words")),
        )
        matches = Matches()
        for pattern_class, expression, options in pattern_specs:
            matches.extend(pattern_class(expression, **options).matches(input_string))

        def values_of(found):
            return [item.value for item in found]

        first_values = matches.to_dict(first_value=True)
        assert first_values == {"1": "One",
                                "2": "Two",
                                "3": "Three",
                                "3bis": "Three",
                                "words": "One"}
        assert first_values.values_list["words"] == ["One", "Two", "Three"]

        as_lists = matches.to_dict(enforce_list=True)
        assert as_lists["words"] == ["One", "Two", "Three"]

        # With details, entries are match objects looked up directly by key.
        detailed = matches.to_dict(details=True)
        assert detailed["1"].value == "One"

        assert values_of(detailed["2"]) == ["Two", "Two"]

        assert detailed["3"].value == "Three"
        assert detailed["3bis"].value == "Three"

        assert values_of(detailed["words"]) == ["One", "Two", "Two", "Three"]

        # Same call again, this time reading multi-valued keys via ``values_list``.
        detailed = matches.to_dict(details=True)
        assert detailed["1"].value == "One"

        assert values_of(detailed.values_list["2"]) == ["Two", "Two"]

        assert detailed["3"].value == "Three"
        assert detailed["3bis"].value == "Three"

        assert values_of(detailed.values_list["words"]) == ["One", "Two", "Two", "Three"]

    def test_chains(self):
        """``chain_before``/``chain_after`` collect matches separated only by the given seps."""
        input_string = "wordX 10 20 30 40 wordA, wordB, wordC 70 80 wordX"

        matches = Matches(input_string=input_string)

        matches.extend(RePattern(r"\d+", name="digit").matches(input_string))
        matches.extend(RePattern("[a-zA-Z]+", name="word").matches(input_string))

        assert len(matches) == 11

        def is_word(match):
            return match.name == "word"

        def is_digit(match):
            return match.name == "digit"

        def values_of(found):
            return [item.value for item in found]

        a_start = input_string.find('wordA')

        b_start = input_string.find('wordB')
        b_end = b_start + len('wordB')

        c_end = input_string.find('wordC') + len('wordC')

        # The position may be given as an integer or as a match.
        backward = matches.chain_before(b_start, " ,", predicate=is_word)
        assert values_of(backward) == ['wordA']

        backward = matches.chain_before(Match(b_start, b_start), " ,", predicate=is_word)
        assert values_of(backward) == ['wordA']

        backward = matches.chain_before(b_start, " ,", predicate=is_digit)
        assert len(backward) == 0

        # Backward chains are listed from the nearest match to the farthest.
        backward = matches.chain_before(a_start, " ,", predicate=is_digit)
        assert len(backward) == 4
        assert values_of(backward) == ["40", "30", "20", "10"]

        forward = matches.chain_after(b_end, " ,", predicate=is_word)
        assert values_of(forward) == ['wordC']

        forward = matches.chain_after(Match(b_end, b_end), " ,", predicate=is_word)
        assert values_of(forward) == ['wordC']

        forward = matches.chain_after(b_end, " ,", predicate=is_digit)
        assert len(forward) == 0

        forward = matches.chain_after(c_end, " ,", predicate=is_digit)
        assert len(forward) == 2
        assert values_of(forward) == ["70", "80"]

        # An ``end`` beyond the input length gives the same result.
        forward = matches.chain_after(c_end, " ,", end=10000, predicate=is_digit)
        assert len(forward) == 2
        assert values_of(forward) == ["70", "80"]

    def test_holes(self):
        """``holes`` reports the unmatched spans, optionally bounded, formatted or filtered."""
        input_string = '1'*10+'2'*10+'3'*10+'4'*10+'5'*10+'6'*10+'7'*10

        covered = [
            Match(0, 10, input_string=input_string),
            Match(20, 30, input_string=input_string),
            Match(30, 40, input_string=input_string),
            Match(60, 70, input_string=input_string),
        ]

        # The first two go through the constructor, the other two through append.
        matches = Matches(covered[:2], input_string=input_string)
        matches.append(covered[2])
        matches.append(covered[3])

        gaps = list(matches.holes())
        assert len(gaps) == 2
        assert gaps[0].span == (10, 20)
        assert gaps[0].value == '2'*10
        assert gaps[1].span == (40, 60)
        assert gaps[1].value == '5' * 10 + '6' * 10

        gaps = list(matches.holes(5, 15))
        assert len(gaps) == 1
        assert gaps[0].span == (10, 15)
        assert gaps[0].value == '2'*5

        gaps = list(matches.holes(5, 15, formatter=lambda value: "formatted"))
        assert len(gaps) == 1
        assert gaps[0].span == (10, 15)
        assert gaps[0].value == "formatted"

        gaps = list(matches.holes(5, 15, predicate=lambda hole: False))
        assert len(gaps) == 0

    def test_holes_empty(self):
        """With no match at all, the whole input string is one hole."""
        input_string = "Test hole on empty matches"
        gaps = Matches(input_string=input_string).holes()
        assert len(gaps) == 1
        assert gaps[0].value == input_string

    def test_holes_seps(self):
        """Passing ``seps`` further splits the holes on separator characters."""
        input_string = "Test hole - with many separators + included"
        found = StringPattern("many").matches(input_string)

        matches = Matches(found, input_string)

        assert len(matches.holes()) == 2

        gaps = matches.holes(seps="-+")

        assert len(gaps) == 4
        assert [gap.value for gap in gaps] == ["Test hole ", " with ", " separators ", " included"]
566 | 


--------------------------------------------------------------------------------
/rebulk/test/test_processors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition
  4 | 
  5 | from ..pattern import StringPattern, RePattern
  6 | from ..processors import ConflictSolver
  7 | from ..rules import execute_rule
  8 | from ..match import Matches
  9 | 
 10 | 
 11 | def test_conflict_1():
 12 |     input_string = "abcdefghijklmnopqrstuvwxyz"
 13 | 
 14 |     pattern = StringPattern("ijklmn", "kl", "abcdef", "ab", "ef", "yz")
 15 |     matches = Matches(pattern.matches(input_string))
 16 | 
 17 |     execute_rule(ConflictSolver(), matches, None)
 18 | 
 19 |     values = [x.value for x in matches]
 20 | 
 21 |     assert values == ["ijklmn", "abcdef", "yz"]
 22 | 
 23 | 
 24 | def test_conflict_2():
 25 |     input_string = "abcdefghijklmnopqrstuvwxyz"
 26 | 
 27 |     pattern = StringPattern("ijklmn", "jklmnopqrst")
 28 |     matches = Matches(pattern.matches(input_string))
 29 | 
 30 |     execute_rule(ConflictSolver(), matches, None)
 31 | 
 32 |     values = [x.value for x in matches]
 33 | 
 34 |     assert values == ["jklmnopqrst"]
 35 | 
 36 | 
 37 | def test_conflict_3():
 38 |     input_string = "abcdefghijklmnopqrstuvwxyz"
 39 | 
 40 |     pattern = StringPattern("ijklmnopqrst", "jklmnopqrst")
 41 |     matches = Matches(pattern.matches(input_string))
 42 | 
 43 |     execute_rule(ConflictSolver(), matches, None)
 44 | 
 45 |     values = [x.value for x in matches]
 46 | 
 47 |     assert values == ["ijklmnopqrst"]
 48 | 
 49 | 
 50 | def test_conflict_4():
 51 |     input_string = "123456789"
 52 | 
 53 |     pattern = StringPattern("123", "456789")
 54 |     matches = Matches(pattern.matches(input_string))
 55 | 
 56 |     execute_rule(ConflictSolver(), matches, None)
 57 | 
 58 |     values = [x.value for x in matches]
 59 |     assert values == ["123", "456789"]
 60 | 
 61 | 
 62 | def test_conflict_5():
 63 |     input_string = "123456789"
 64 | 
 65 |     pattern = StringPattern("123456", "789")
 66 |     matches = Matches(pattern.matches(input_string))
 67 | 
 68 |     execute_rule(ConflictSolver(), matches, None)
 69 | 
 70 |     values = [x.value for x in matches]
 71 |     assert values == ["123456", "789"]
 72 | 
 73 | 
 74 | def test_prefer_longer_parent():
 75 |     input_string = "xxx.1x02.xxx"
 76 | 
 77 |     re1 = RePattern("([0-9]+)x([0-9]+)", name='prefer', children=True, formatter=int)
 78 |     re2 = RePattern("x([0-9]+)", name='skip', children=True)
 79 | 
 80 |     matches = Matches(re1.matches(input_string))
 81 |     matches.extend(re2.matches(input_string))
 82 | 
 83 |     execute_rule(ConflictSolver(), matches, None)
 84 |     assert len(matches) == 2
 85 |     assert matches[0].value == 1
 86 |     assert matches[1].value == 2
 87 | 
 88 | 
 89 | def test_conflict_solver_1():
 90 |     input_string = "123456789"
 91 | 
 92 |     re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__')
 93 |     re2 = StringPattern("34567")
 94 | 
 95 |     matches = Matches(re1.matches(input_string))
 96 |     matches.extend(re2.matches(input_string))
 97 | 
 98 |     execute_rule(ConflictSolver(), matches, None)
 99 |     assert len(matches) == 1
100 |     assert matches[0].value == "2345678"
101 | 
102 | 
def test_conflict_solver_2():
    """The shorter pattern's solver returns ``conflicting``; the shorter match is expected to stay.

    The longer pattern's solver returns ``'__default__'``.
    """
    text = "123456789"

    longer = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__')
    shorter = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting)

    found = Matches(longer.matches(text))
    found.extend(shorter.matches(text))

    execute_rule(ConflictSolver(), found, None)
    assert [item.value for item in found] == ["34567"]
115 | 
116 | 
def test_conflict_solver_3():
    """The longer pattern's solver returns its own match; the shorter match is expected to stay."""
    text = "123456789"

    longer = StringPattern("2345678", conflict_solver=lambda match, conflicting: match)
    shorter = StringPattern("34567")

    found = Matches(longer.matches(text))
    found.extend(shorter.matches(text))

    execute_rule(ConflictSolver(), found, None)
    assert [item.value for item in found] == ["34567"]
129 | 
130 | 
def test_conflict_solver_4():
    """Only the shorter pattern has a solver, returning ``conflicting``; it is expected to stay."""
    text = "123456789"

    longer = StringPattern("2345678")
    shorter = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting)

    found = Matches(longer.matches(text))
    found.extend(shorter.matches(text))

    execute_rule(ConflictSolver(), found, None)
    assert [item.value for item in found] == ["34567"]
143 | 
144 | 
def test_conflict_solver_5():
    """The longer pattern's solver returns ``conflicting``; the longer match is expected to stay."""
    text = "123456789"

    longer = StringPattern("2345678", conflict_solver=lambda match, conflicting: conflicting)
    shorter = StringPattern("34567")

    found = Matches(longer.matches(text))
    found.extend(shorter.matches(text))

    execute_rule(ConflictSolver(), found, None)
    assert [item.value for item in found] == ["2345678"]
157 | 
158 | 
def test_conflict_solver_6():
    """Shorter pattern's solver returns ``conflicting``; the shorter match is expected to stay.

    NOTE(review): same scenario as ``test_conflict_solver_4`` - confirm whether
    a different setup was intended here.
    """
    text = "123456789"

    longer = StringPattern("2345678")
    shorter = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting)

    found = Matches(longer.matches(text))
    found.extend(shorter.matches(text))

    execute_rule(ConflictSolver(), found, None)
    assert [item.value for item in found] == ["34567"]
171 | 
172 | 
def test_conflict_solver_7():
    """The longer match is expected to win even when the shorter one was added first."""
    text = "102"

    longer = StringPattern("102")
    shorter = StringPattern("02")

    # The shorter pattern's matches go in first on purpose.
    found = Matches(shorter.matches(text))
    found.extend(longer.matches(text))

    execute_rule(ConflictSolver(), found, None)
    assert [item.value for item in found] == ["102"]
185 | 
186 | 
def test_unresolved():
    """Both matches are expected to survive when the conflict is left unresolved."""
    text = "123456789"

    scenarios = (
        # Overlapping matches of equal length, no custom solver.
        (StringPattern("23456"),
         StringPattern("34567")),
        # The longer pattern's solver returns None.
        (StringPattern("34567"),
         StringPattern("2345678", conflict_solver=lambda match, conflicting: None)),
        # The shorter pattern's solver returns None.
        (StringPattern("34567", conflict_solver=lambda match, conflicting: None),
         StringPattern("2345678")),
    )
    for first, second in scenarios:
        found = Matches(first.matches(text))
        found.extend(second.matches(text))

        execute_rule(ConflictSolver(), found, None)
        assert len(found) == 2
216 | 


--------------------------------------------------------------------------------
/rebulk/test/test_rebulk.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition
  4 | 
  5 | from ..rebulk import Rebulk
  6 | from ..rules import Rule
  7 | from . import rebulk_rules_module as rm
  8 | 
  9 | 
 10 | def test_rebulk_simple():
 11 |     rebulk = Rebulk()
 12 | 
 13 |     rebulk.string("quick")
 14 |     rebulk.regex("f.x")
 15 | 
 16 |     def func(input_string):
 17 |         i = input_string.find("over")
 18 |         if i > -1:
 19 |             return i, i + len("over")
 20 | 
 21 |     rebulk.functional(func)
 22 | 
 23 |     input_string = "The quick brown fox jumps over the lazy dog"
 24 | 
 25 |     matches = rebulk.matches(input_string)
 26 |     assert len(matches) == 3
 27 | 
 28 |     assert matches[0].value == "quick"
 29 |     assert matches[1].value == "fox"
 30 |     assert matches[2].value == "over"
 31 | 
 32 | 
 33 | def test_rebulk_composition():
 34 |     rebulk = Rebulk()
 35 | 
 36 |     rebulk.string("quick")
 37 |     rebulk.rebulk(Rebulk().regex("f.x"))
 38 | 
 39 |     rebulk.rebulk(Rebulk(disabled=lambda context: True).functional(lambda string: None))
 40 | 
 41 |     input_string = "The quick brown fox jumps over the lazy dog"
 42 | 
 43 |     matches = rebulk.matches(input_string)
 44 |     assert len(matches) == 2
 45 | 
 46 |     assert matches[0].value == "quick"
 47 |     assert matches[1].value == "fox"
 48 | 
 49 | 
 50 | def test_rebulk_context():
 51 |     rebulk = Rebulk()
 52 | 
 53 |     context = {'nostring': True, 'word': 'lazy'}
 54 | 
 55 |     rebulk.string("quick", disabled=lambda context: context.get('nostring', False))
 56 |     rebulk.regex("f.x", disabled=lambda context: context.get('noregex', False))
 57 | 
 58 |     def func(input_string, context):
 59 |         word = context.get('word', 'over')
 60 |         i = input_string.find(word)
 61 |         if i > -1:
 62 |             return i, i + len(word)
 63 | 
 64 |     rebulk.functional(func)
 65 | 
 66 |     input_string = "The quick brown fox jumps over the lazy dog"
 67 | 
 68 |     matches = rebulk.matches(input_string, context)
 69 |     assert len(matches) == 2
 70 | 
 71 |     assert matches[0].value == "fox"
 72 |     assert matches[1].value == "lazy"
 73 | 
 74 | 
 75 | def test_rebulk_prefer_longer():
 76 |     input_string = "The quick brown fox jumps over the lazy dog"
 77 | 
 78 |     matches = Rebulk().string("quick").string("own").regex("br.{2}n").matches(input_string)
 79 | 
 80 |     assert len(matches) == 2
 81 | 
 82 |     assert matches[0].value == "quick"
 83 |     assert matches[1].value == "brown"
 84 | 
 85 | 
 86 | def test_rebulk_defaults():
 87 |     input_string = "The quick brown fox jumps over the lazy dog"
 88 | 
 89 |     def func(input_string):
 90 |         i = input_string.find("fox")
 91 |         if i > -1:
 92 |             return i, i + len("fox")
 93 | 
 94 |     matches = Rebulk()\
 95 |         .string_defaults(name="string", tags=["a", "b"])\
 96 |         .regex_defaults(name="regex") \
 97 |         .functional_defaults(name="functional") \
 98 |         .string("quick", tags=["c"])\
 99 |         .functional(func)\
100 |         .regex("br.{2}n") \
101 |         .matches(input_string)
102 |     assert matches[0].name == "string"
103 |     assert matches[0].tags == ["a", "b", "c"]
104 |     assert matches[1].name == "functional"
105 |     assert matches[2].name == "regex"
106 | 
107 |     matches = Rebulk() \
108 |         .defaults(name="default", tags=["0"])\
109 |         .string_defaults(name="string", tags=["a", "b"]) \
110 |         .functional_defaults(name="functional", tags=["1"]) \
111 |         .string("quick", tags=["c"]) \
112 |         .functional(func) \
113 |         .regex("br.{2}n") \
114 |         .matches(input_string)
115 |     assert matches[0].name == "string"
116 |     assert matches[0].tags == ["0", "a", "b", "c"]
117 |     assert matches[1].name == "functional"
118 |     assert matches[1].tags == ["0", "1"]
119 |     assert matches[2].name == "default"
120 |     assert matches[2].tags == ["0"]
121 | 
122 | 
def test_rebulk_defaults_overrides():
    """`overrides=["tags"]` replaces default tags instead of extending them."""
    text = "The quick brown fox jumps over the lazy dog"

    def func(input_string):
        # Functional pattern: span of "fox", or None when absent.
        start = input_string.find("fox")
        if start < 0:
            return None
        return start, start + len("fox")

    # The string pattern asks to override the "tags" default.
    overriding = (
        Rebulk()
        .string_defaults(name="string", tags=["a", "b"])
        .regex_defaults(name="regex", tags=["d"])
        .functional_defaults(name="functional")
        .string("quick", tags=["c"], overrides=["tags"])
        .functional(func)
        .regex("br.{2}n")
    )
    found = overriding.matches(text)
    assert found[0].name == "string"
    assert found[0].tags == ["c"]
    assert found[1].name == "functional"
    assert found[2].name == "regex"
    assert found[2].tags == ["d"]

    # Without `overrides`, tags from the defaults and the pattern are combined.
    layered = (
        Rebulk()
        .defaults(name="default", tags=["0"])
        .string_defaults(name="string", tags=["a", "b"])
        .functional_defaults(name="functional", tags=["1"])
        .string("quick", tags=["c"])
        .functional(func)
        .regex("br.{2}n")
    )
    found = layered.matches(text)
    assert found[0].name == "string"
    assert found[0].tags == ["0", "a", "b", "c"]
    assert found[1].name == "functional"
    assert found[1].tags == ["0", "1"]
    assert found[2].name == "default"
    assert found[2].tags == ["0"]
159 | 
160 | 
def test_rebulk_rebulk():
    """Patterns from a child Rebulk take part in the parent's matching."""
    text = "The quick brown fox jumps over the lazy dog"

    child = Rebulk().string("own").regex("br.{2}n")
    base = Rebulk().string("quick")
    combined = base.rebulk(child)

    found = combined.matches(text)

    assert len(found) == 2
    assert found[0].value == "quick"
    assert found[1].value == "brown"
173 | 
174 | 
def test_rebulk_no_default():
    """With default_rules=False the overlapping "own" and "brown" are both kept."""
    text = "The quick brown fox jumps over the lazy dog"

    rebulk = (
        Rebulk(default_rules=False)
        .string("quick")
        .string("own")
        .regex("br.{2}n")
    )
    found = rebulk.matches(text)

    assert len(found) == 3
    assert found[0].value == "quick"
    assert found[1].value == "own"
    assert found[2].value == "brown"
185 | 
186 | 
def test_rebulk_empty_match():
    """A children-only regex whose group captures nothing adds no match."""
    text = "The quick brown fox jumps over the lazy dog"

    # In "brown" the lazy group of "br(.*?)own" captures the empty string.
    rebulk = (
        Rebulk(default_rules=False)
        .string("quick")
        .string("own")
        .regex("br(.*?)own", children=True)
    )
    found = rebulk.matches(text)

    assert len(found) == 2
    assert found[0].value == "quick"
    assert found[1].value == "own"
197 | 
198 | 
def test_rebulk_tags_names():
    """Matches can be looked up by name and by tag, whatever their origin."""
    def func(input_string):
        # Tuple form: (start, end, extra keyword arguments).
        start = input_string.find("over")
        if start < 0:
            return None
        return start, start + len("over"), {'tags': ['custom']}

    def func2(input_string):
        # Dict form: start, end and tags all supplied as keys.
        start = input_string.find("lazy")
        if start < 0:
            return None
        return {'start': start, 'end': start + len("lazy"), 'tags': ['custom']}

    rebulk = Rebulk()
    rebulk.string("quick", name="str", tags=["first", "other"])
    rebulk.regex("f.x", tags="other")
    rebulk.functional(func, name="fn")
    rebulk.functional(func2, name="fn")

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")
    assert len(found) == 4

    # Lookups by name.
    assert len(found.named("str")) == 1
    assert len(found.named("fn")) == 2
    assert len(found.named("false")) == 0

    # Lookups by tag.
    assert len(found.tagged("false")) == 0
    assert len(found.tagged("first")) == 1
    assert len(found.tagged("other")) == 2
    assert len(found.tagged("custom")) == 2
231 | 
232 | 
def test_rebulk_rules_1():
    """RemoveAllButLastYear leaves only the final "year" match."""
    rebulk = Rebulk()
    rebulk.regex(r'\d{4}', name="year")
    rebulk.rules(rm.RemoveAllButLastYear)

    found = rebulk.matches("1984 keep only last 1968 entry 1982 case")

    assert len(found) == 1
    assert found[0].value == "1982"
242 | 
243 | 
def test_rebulk_rules_2():
    """PrefixedSuffixedYear keeps two of the three years in the sample text."""
    text = "Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982"

    rebulk = Rebulk()
    rebulk.regex(r'\d{4}', name="year")
    # Private helper patterns made available to the rule.
    rebulk.string(r'year', name="yearPrefix", private=True)
    rebulk.string(r'keep', name="yearSuffix", private=True)
    rebulk.rules(rm.PrefixedSuffixedYear)

    found = rebulk.matches(text)

    assert len(found) == 2
    assert found[0].value == "1984"
    assert found[1].value == "1968"
256 | 
257 | 
def test_rebulk_rules_3():
    """PrefixedSuffixedYearNoLambda keeps two of the three years in the sample text."""
    text = "Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982"

    rebulk = Rebulk()
    rebulk.regex(r'\d{4}', name="year")
    # Private helper patterns made available to the rule.
    rebulk.string(r'year', name="yearPrefix", private=True)
    rebulk.string(r'keep', name="yearSuffix", private=True)
    rebulk.rules(rm.PrefixedSuffixedYearNoLambda)

    found = rebulk.matches(text)

    assert len(found) == 2
    assert found[0].value == "1984"
    assert found[1].value == "1968"
270 | 
271 | 
def test_rebulk_rules_4():
    """A locally defined rule removes "grabbed" when something precedes it."""
    class FirstOnlyRule(Rule):
        # Selects the first "grabbed" match when matches.previous() finds something before it.
        def when(self, matches, context):
            candidate = matches.named("grabbed", 0)
            if not candidate:
                return None
            if not matches.previous(candidate):
                return None
            return candidate

        # Removes whatever when() selected.
        def then(self, matches, when_response, context):
            matches.remove(when_response)

    rebulk = Rebulk()
    rebulk.regex("This match (.*?)grabbed", name="grabbed")
    rebulk.regex("if it's (.*?)first match", private=True)
    rebulk.rules(FirstOnlyRule)

    found = rebulk.matches("This match is grabbed only if it's the first match")
    assert len(found) == 1
    assert found[0].value == "This match is grabbed"

    found = rebulk.matches("if it's NOT the first match, This match is NOT grabbed")
    assert len(found) == 0
295 | 
296 | 
class TestMarkers:
    """Rules that query or modify `matches.markers`."""

    def test_one_marker(self):
        """"word" is kept only when a "mark1" marker is found at the match."""
        class MarkerRule(Rule):
            # Selects the "word" match for removal when no "mark1" marker is found at it.
            def when(self, matches, context):
                word_match = matches.named("word", 0)
                enclosing = matches.markers.at_match(word_match, lambda marker: marker.name == "mark1", 0)
                return None if enclosing else word_match

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex(r'\[.*?\]', marker=True, name="mark2")
            .string("word", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("grab (word) only if it's in parenthesis")
        assert len(found) == 1
        assert found[0].value == "word"

        found = rebulk.matches("don't grab [word] if it's in braket")
        assert len(found) == 0

        found = rebulk.matches("don't grab word at all")
        assert len(found) == 0

    def test_multiple_marker(self):
        """"word" is kept only when at least two mark1/mark2 markers are found at it."""
        class MarkerRule(Rule):
            # Selects the "word" match for removal when fewer than two markers are found.
            def when(self, matches, context):
                word_match = matches.named("word", 0)
                found_markers = matches.markers.at_match(
                    word_match,
                    lambda marker: marker.name in ["mark1", "mark2"])
                if len(found_markers) >= 2:
                    return None
                return word_match

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex(r'\[.*?\]', marker=True, name="mark2")
            .regex("w.*?d", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("[grab (word) only] if it's in parenthesis and brakets")
        assert len(found) == 1
        assert found[0].value == "word"

        found = rebulk.matches("[don't grab](word)[if brakets are outside]")
        assert len(found) == 0

        found = rebulk.matches("(grab w[or)d even] if it's partially in parenthesis and brakets")
        assert len(found) == 1
        assert found[0].value == "w[or)d"

    def test_at_index_marker(self):
        """"word" is kept only when a "mark1" marker is found at its start index."""
        class MarkerRule(Rule):
            # Selects the "word" match for removal when no "mark1" marker is at its start.
            def when(self, matches, context):
                word_match = matches.named("word", 0)
                at_start = matches.markers.at_index(
                    word_match.start,
                    lambda marker: marker.name == "mark1", 0)
                return None if at_start else word_match

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex("w.*?d", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("gr(ab wo)rd only if starting of match is inside parenthesis")
        assert len(found) == 1
        assert found[0].value == "wo)rd"

        found = rebulk.matches("don't grab wo(rd if starting of match is not inside parenthesis")
        assert len(found) == 0

    def test_remove_marker(self):
        """A rule may remove a marker; the regular match is left alone."""
        class MarkerRule(Rule):
            # Selects the first "mark1" marker, if any.
            def when(self, matches, context):
                first_marker = matches.markers.named("mark1", 0)
                return first_marker if first_marker else None

            # Removes the selected marker from the markers collection.
            def then(self, matches, when_response, context):
                matches.markers.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex("w.*?d", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("grab word event (if it's not) inside parenthesis")
        assert len(found) == 1
        assert found[0].value == "word"

        assert not found.markers
399 | 
400 | 
class TestUnicode:
    """Pattern matching over non-ASCII text."""

    def test_rebulk_simple(self):
        """String, regex and functional patterns each match one CJK character."""
        def func(input_string):
            # Functional pattern: span of "的", or None when absent.
            start = input_string.find("的")
            if start < 0:
                return None
            return start, start + len("的")

        rebulk = Rebulk()
        rebulk.string("敏")
        rebulk.regex("捷")
        rebulk.functional(func)

        found = rebulk.matches("敏捷的棕色狐狸跳過懶狗")

        assert len(found) == 3
        assert found[0].value == "敏"
        assert found[1].value == "捷"
        assert found[2].value == "的"
423 | 
424 | 
class TestImmutable:
    """Editing the result of a lookup must not alter the source Matches."""

    def test_starting(self):
        """Emptying every `starting(i)` result leaves all three matches in place."""
        text = "The quick brown fox jumps over the lazy dog"
        found = Rebulk().string("quick").string("over").string("fox").matches(text)

        for index in range(len(text)):
            starting_here = found.starting(index)
            # Iterate over a snapshot because the result is emptied while looping.
            for item in list(starting_here):
                starting_here.remove(item)

        assert len(found) == 3

    def test_ending(self):
        """Emptying every `ending(i)` result leaves all three matches in place."""
        text = "The quick brown fox jumps over the lazy dog"
        found = Rebulk().string("quick").string("over").string("fox").matches(text)

        for index in range(len(text)):
            ending_here = found.ending(index)
            # Iterate over a snapshot because the result is emptied while looping.
            for item in list(ending_here):
                ending_here.remove(item)

        assert len(found) == 3

    def test_named(self):
        """Emptying the `named('test')` result leaves all three matches in place."""
        text = "The quick brown fox jumps over the lazy dog"
        found = Rebulk().defaults(name='test').string("quick").string("over").string("fox").matches(text)

        by_name = found.named('test')
        for item in list(by_name):
            by_name.remove(item)

        assert len(by_name) == 0
        assert len(found) == 3
458 | 


--------------------------------------------------------------------------------
/rebulk/test/test_rules.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name, no-member, len-as-condition
  4 | import pytest
  5 | from rebulk.test.default_rules_module import RuleRemove0, RuleAppend0, RuleRename0, RuleAppend1, RuleRemove1, \
  6 |     RuleRename1, RuleAppend2, RuleRename2, RuleAppend3, RuleRename3, RuleAppendTags0, RuleRemoveTags0, \
  7 |     RuleAppendTags1, RuleRemoveTags1
  8 | 
  9 | from ..rules import Rules
 10 | from ..match import Matches, Match
 11 | 
 12 | from .rules_module import Rule1, Rule2, Rule3, Rule0, Rule1Disabled
 13 | from . import rules_module as rm
 14 | 
 15 | 
 16 | def test_rule_priority():
 17 |     matches = Matches([Match(1, 2)])
 18 | 
 19 |     rules = Rules(Rule1, Rule2())
 20 | 
 21 |     rules.execute_all_rules(matches, {})
 22 |     assert len(matches) == 0
 23 |     matches = Matches([Match(1, 2)])
 24 | 
 25 |     rules = Rules(Rule1(), Rule0)
 26 | 
 27 |     rules.execute_all_rules(matches, {})
 28 |     assert len(matches) == 1
 29 |     assert matches[0] == Match(3, 4)
 30 | 
 31 | 
 32 | def test_rules_duplicates():
 33 |     matches = Matches([Match(1, 2)])
 34 | 
 35 |     rules = Rules(Rule1, Rule1)
 36 | 
 37 |     with pytest.raises(ValueError):
 38 |         rules.execute_all_rules(matches, {})
 39 | 
 40 | 
 41 | def test_rule_disabled():
 42 |     matches = Matches([Match(1, 2)])
 43 | 
 44 |     rules = Rules(Rule1Disabled(), Rule2())
 45 | 
 46 |     rules.execute_all_rules(matches, {})
 47 |     assert len(matches) == 2
 48 |     assert matches[0] == Match(1, 2)
 49 |     assert matches[1] == Match(3, 4)
 50 | 
 51 | 
 52 | def test_rule_when():
 53 |     matches = Matches([Match(1, 2)])
 54 | 
 55 |     rules = Rules(Rule3())
 56 | 
 57 |     rules.execute_all_rules(matches, {'when': False})
 58 |     assert len(matches) == 1
 59 |     assert matches[0] == Match(1, 2)
 60 | 
 61 |     matches = Matches([Match(1, 2)])
 62 | 
 63 |     rules.execute_all_rules(matches, {'when': True})
 64 |     assert len(matches) == 2
 65 |     assert matches[0] == Match(1, 2)
 66 |     assert matches[1] == Match(3, 4)
 67 | 
 68 | 
 69 | class TestDefaultRules:
 70 |     def test_remove(self):
 71 |         rules = Rules(RuleRemove0)
 72 | 
 73 |         matches = Matches([Match(1, 2)])
 74 |         rules.execute_all_rules(matches, {})
 75 | 
 76 |         assert len(matches) == 0
 77 | 
 78 |         rules = Rules(RuleRemove1)
 79 | 
 80 |         matches = Matches([Match(1, 2)])
 81 |         rules.execute_all_rules(matches, {})
 82 | 
 83 |         assert len(matches) == 0
 84 | 
 85 |     def test_append(self):
 86 |         rules = Rules(RuleAppend0)
 87 | 
 88 |         matches = Matches([Match(1, 2)])
 89 |         rules.execute_all_rules(matches, {})
 90 | 
 91 |         assert len(matches) == 2
 92 | 
 93 |         rules = Rules(RuleAppend1)
 94 | 
 95 |         matches = Matches([Match(1, 2)])
 96 |         rules.execute_all_rules(matches, {})
 97 | 
 98 |         assert len(matches) == 2
 99 | 
100 |         rules = Rules(RuleAppend2)
101 | 
102 |         matches = Matches([Match(1, 2)])
103 |         rules.execute_all_rules(matches, {})
104 | 
105 |         assert len(matches) == 2
106 |         assert len(matches.named('renamed')) == 1
107 | 
108 |         rules = Rules(RuleAppend3)
109 | 
110 |         matches = Matches([Match(1, 2)])
111 |         rules.execute_all_rules(matches, {})
112 | 
113 |         assert len(matches) == 2
114 |         assert len(matches.named('renamed')) == 1
115 | 
116 |     def test_rename(self):
117 |         rules = Rules(RuleRename0)
118 | 
119 |         matches = Matches([Match(1, 2, name='original')])
120 |         rules.execute_all_rules(matches, {})
121 | 
122 |         assert len(matches.named('original')) == 1
123 |         assert len(matches.named('renamed')) == 0
124 | 
125 |         rules = Rules(RuleRename1)
126 | 
127 |         matches = Matches([Match(5, 10, name='original')])
128 |         rules.execute_all_rules(matches, {})
129 | 
130 |         assert len(matches.named('original')) == 0
131 |         assert len(matches.named('renamed')) == 1
132 | 
133 |         rules = Rules(RuleRename2)
134 | 
135 |         matches = Matches([Match(5, 10, name='original')])
136 |         rules.execute_all_rules(matches, {})
137 | 
138 |         assert len(matches.named('original')) == 0
139 |         assert len(matches.named('renamed')) == 1
140 | 
141 |         rules = Rules(RuleRename3)
142 | 
143 |         matches = Matches([Match(5, 10, name='original')])
144 |         rules.execute_all_rules(matches, {})
145 | 
146 |         assert len(matches.named('original')) == 0
147 |         assert len(matches.named('renamed')) == 1
148 | 
149 |     def test_append_tags(self):
150 |         rules = Rules(RuleAppendTags0)
151 | 
152 |         matches = Matches([Match(1, 2, name='tags', tags=['other'])])
153 |         rules.execute_all_rules(matches, {})
154 | 
155 |         assert len(matches.named('tags')) == 1
156 |         assert matches.named('tags', index=0).tags == ['other', 'new-tag']
157 | 
158 |         rules = Rules(RuleAppendTags1)
159 | 
160 |         matches = Matches([Match(1, 2, name='tags', tags=['other'])])
161 |         rules.execute_all_rules(matches, {})
162 | 
163 |         assert len(matches.named('tags')) == 1
164 |         assert matches.named('tags', index=0).tags == ['other', 'new-tag']
165 | 
166 |     def test_remove_tags(self):
167 |         rules = Rules(RuleRemoveTags0)
168 | 
169 |         matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])])
170 |         rules.execute_all_rules(matches, {})
171 | 
172 |         assert len(matches.named('tags')) == 1
173 |         assert matches.named('tags', index=0).tags == ['other']
174 | 
175 |         rules = Rules(RuleRemoveTags1)
176 | 
177 |         matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])])
178 |         rules.execute_all_rules(matches, {})
179 | 
180 |         assert len(matches.named('tags')) == 1
181 |         assert matches.named('tags', index=0).tags == ['other']
182 | 
183 | 
def test_rule_module():
    """A whole module can be handed to Rules; one match remains afterwards."""
    module_rules = Rules(rm)
    found = Matches([Match(1, 2)])

    module_rules.execute_all_rules(found, {})

    assert len(found) == 1
191 | 
192 | 
def test_rule_repr():
    """str() of a rule instance gives the expected bracketed text."""
    expectations = (
        (Rule0, "<Rule0>"),
        (Rule1, "<Rule1>"),
        (Rule2, "<Rule2>"),
        (Rule1Disabled, "<Disabled Rule1>"),
    )
    for rule_class, expected in expectations:
        assert str(rule_class()) == expected
198 | 


--------------------------------------------------------------------------------
/rebulk/test/test_toposort.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # Copyright 2014 True Blade Systems, Inc.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | # http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Original:
 12 | #   - https://bitbucket.org/ericvsmith/toposort (1.4)
 13 | # Modifications:
 14 | #   - port to pytest
 15 | # pylint: skip-file
 16 | 
 17 | import pytest
 18 | from ..toposort import toposort, toposort_flatten, CyclicDependency
 19 | 
 20 | 
 21 | class TestCase:
 22 |     def test_simple(self):
 23 |         results = list(toposort({2: set([11]), 9: set([11, 8]), 10: set([11, 3]), 11: set([7, 5]), 8: set([7, 3])}))
 24 |         expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])]
 25 |         assert results == expected
 26 | 
 27 |         # make sure self dependencies are ignored
 28 |         results = list(toposort({2: set([2, 11]), 9: set([11, 8]), 10: set([10, 11, 3]), 11: set([7, 5]), 8: set([7, 3])}))
 29 |         expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])]
 30 |         assert results == expected
 31 | 
 32 |         assert list(toposort({1: set()})) == [set([1])]
 33 |         assert list(toposort({1: set([1])})) == [set([1])]
 34 | 
 35 |     def test_no_dependencies(self):
 36 |         assert list(toposort({1: set([2]), 3: set([4]), 5: set([6])})) == [set([2, 4, 6]), set([1, 3, 5])]
 37 |         assert list(toposort({1: set(), 3: set(), 5: set()})) == [set([1, 3, 5])]
 38 | 
 39 |     def test_empty(self):
 40 |         assert list(toposort({})) == []
 41 | 
 42 |     def test_strings(self):
 43 |         results = list(toposort({'2': set(['11']), '9': set(['11', '8']), '10': set(['11', '3']), '11': set(['7', '5']), '8': set(['7', '3'])}))
 44 |         expected = [set(['3', '5', '7']), set(['8', '11']), set(['2', '9', '10'])]
 45 |         assert results == expected
 46 | 
 47 |     def test_objects(self):
 48 |         o2 = object()
 49 |         o3 = object()
 50 |         o5 = object()
 51 |         o7 = object()
 52 |         o8 = object()
 53 |         o9 = object()
 54 |         o10 = object()
 55 |         o11 = object()
 56 |         results = list(toposort({o2: set([o11]), o9: set([o11, o8]), o10: set([o11, o3]), o11: set([o7, o5]), o8: set([o7, o3, o8])}))
 57 |         expected = [set([o3, o5, o7]), set([o8, o11]), set([o2, o9, o10])]
 58 |         assert results == expected
 59 | 
 60 |     def test_cycle(self):
 61 |         # a simple, 2 element cycle
 62 |         with pytest.raises(CyclicDependency):
 63 |             list(toposort({1: set([2]), 2: set([1])}))
 64 | 
 65 |         # an indirect cycle
 66 |         with pytest.raises(CyclicDependency):
 67 |             list(toposort({1: set([2]), 2: set([3]), 3: set([1])}))
 68 | 
 69 |     def test_input_not_modified(self):
 70 |         data = {2: set([11]),
 71 |                 9: set([11, 8]),
 72 |                 10: set([11, 3]),
 73 |                 11: set([7, 5]),
 74 |                 8: set([7, 3, 8]),  # includes something self-referential
 75 |                 }
 76 |         orig = data.copy()
 77 |         results = list(toposort(data))
 78 |         assert data == orig
 79 | 
 80 |     def test_input_not_modified_when_cycle_error(self):
 81 |         data = {1: set([2]),
 82 |                 2: set([1]),
 83 |                 3: set([4]),
 84 |                 }
 85 |         orig = data.copy()
 86 |         with pytest.raises(CyclicDependency):
 87 |             list(toposort(data))
 88 |         assert data == orig
 89 | 
 90 | 
 91 | class TestCaseAll:
 92 |     def test_sort_flatten(self):
 93 |         data = {2: set([11]),
 94 |                 9: set([11, 8]),
 95 |                 10: set([11, 3]),
 96 |                 11: set([7, 5]),
 97 |                 8: set([7, 3, 8]),  # includes something self-referential
 98 |                 }
 99 |         expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])]
100 |         assert list(toposort(data)) == expected
101 | 
102 |         # now check the sorted results
103 |         results = []
104 |         for item in expected:
105 |             results.extend(sorted(item))
106 |         assert toposort_flatten(data) == results
107 | 
108 |         # and the unsorted results. break the results up into groups to compare them
109 |         actual = toposort_flatten(data, False)
110 |         results = [set([i for i in actual[0:3]]), set([i for i in actual[3:5]]), set([i for i in actual[5:8]])]
111 |         assert results == expected
112 | 


--------------------------------------------------------------------------------
/rebulk/test/test_validators.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # pylint: disable=pointless-statement, missing-docstring, invalid-name,len-as-condition
 4 | 
 5 | from functools import partial
 6 | 
 7 | from rebulk.pattern import StringPattern
 8 | 
 9 | from ..validators import chars_before, chars_after, chars_surround, validators
10 | 
# Characters bound as the first argument of each validator below.
chars = ' _.'
# chars_before / chars_after / chars_surround with `chars` pre-bound through
# functools.partial, used as `validator=` callables in the tests of this module.
left = partial(chars_before, chars)
right = partial(chars_after, chars)
surrounding = partial(chars_surround, chars)
15 | 
16 | 
def test_left_chars():
    """The `left` validator looks at what precedes the match."""
    def count(text):
        # A fresh pattern is built for every input, as the checks are independent.
        return len(list(StringPattern("word", validator=left).matches(text)))

    assert count("xxxwordxxx") == 0
    assert count("xxx_wordxxx") == 1
    assert count("wordxxx") == 1
26 | 
27 | 
def test_right_chars():
    """The `right` validator looks at what follows the match."""
    def count(text):
        # A fresh pattern is built for every input, as the checks are independent.
        return len(list(StringPattern("word", validator=right).matches(text)))

    assert count("xxxwordxxx") == 0
    assert count("xxxword.xxx") == 1
    assert count("xxxword") == 1
37 | 
38 | 
def test_surrounding_chars():
    """The `surrounding` validator requires both sides of the match to pass."""
    def count(text):
        # A fresh pattern is built for every input, as the checks are independent.
        return len(list(StringPattern("word", validator=surrounding).matches(text)))

    assert count("xxxword xxx") == 0
    assert count("xxx.wordxxx") == 0
    assert count("xxx word_xxx") == 1
    assert count("word") == 1
51 | 
52 | 
def test_chain():
    """validators(left, right) gives the same results as `surrounding` on these inputs."""
    def count(text):
        # A fresh pattern and a fresh validators(...) chain for every input.
        pattern = StringPattern("word", validator=validators(left, right))
        return len(list(pattern.matches(text)))

    assert count("xxxword xxx") == 0
    assert count("xxx.wordxxx") == 0
    assert count("xxx word_xxx") == 1
    assert count("word") == 1
65 | 


--------------------------------------------------------------------------------
/rebulk/toposort.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # Copyright 2014 True Blade Systems, Inc.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Original:
12 | #   - https://bitbucket.org/ericvsmith/toposort (1.4)
13 | # Modifications:
14 | #   - merged Pull request #2 for CyclicDependency error
15 | #   - import reduce as original name
16 | #   - support python 2.6 dict comprehension
17 | 
18 | # pylint: skip-file
19 | from functools import reduce
20 | 
21 | 
class CyclicDependency(ValueError):
    """
    ValueError raised when items depend on each other in a cycle.

    The offending mapping is kept on the ``cyclic`` attribute and its
    ``(item, dependencies)`` pairs are listed in the error message.
    """

    def __init__(self, cyclic):
        listing = ', '.join(map(repr, cyclic.items()))
        message = 'Cyclic dependencies exist among these items: {0}'.format(listing)
        super().__init__(message)
        self.cyclic = cyclic
27 | 
28 | 
def toposort(data):
    """
    Dependencies are expressed as a dictionary whose keys are items
    and whose values are a set of dependent items. Output is a generator of
    sets in topological order. The first set consists of items with no
    dependencies, each subsequent set consists of items that depend upon
    items in the preceding sets.

    The input mapping and the sets it holds are left untouched.

    :param data: mapping of item -> iterable of the items it depends on
    :type data: dict
    :return: generator yielding one set of items per dependency level
    :rtype: generator
    :raises CyclicDependency: when the remaining items depend on each other
    """

    # Special case empty input.
    if len(data) == 0:
        return

    # Work on fresh sets so the caller's sets are never mutated. A shallow
    # dict copy is not enough: discarding self dependencies in place would
    # alter the caller's own sets. Self dependencies are dropped here.
    data = {item: set(dep) - {item} for item, dep in data.items()}

    # Find all items that don't depend on anything.
    extra_items_in_deps = reduce(set.union, data.values()) - set(data.keys())
    # Add empty dependencies where needed.
    data.update({item: set() for item in extra_items_in_deps})
    while True:
        # Items whose dependencies have all been emitted already.
        ordered = set(item for item, dep in data.items() if len(dep) == 0)
        if not ordered:
            break
        yield ordered
        data = {item: (dep - ordered)
                for item, dep in data.items()
                if item not in ordered}
    # Anything left over could not be ordered: it takes part in a cycle.
    if len(data) != 0:
        raise CyclicDependency(data)
66 | 
67 | 
def toposort_flatten(data, sort=True):
    """
    Flatten the groups produced by toposort() into a single list.

    When ``sort`` is true, the items of each group are sorted before being
    appended, which makes the result deterministic; otherwise they are
    appended in the group's own iteration order.

    :param data: dependency mapping, passed as is to toposort()
    :type data:
    :param sort: whether to sort the items of each group
    :type sort: bool
    :return: Single list of dependencies.
    :rtype: list
    """

    arrange = sorted if sort else list
    return [item for group in toposort(data) for item in arrange(group)]
85 | 


--------------------------------------------------------------------------------
/rebulk/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Various utilities functions
  5 | """
  6 | from collections.abc import MutableSet
  7 | 
  8 | from types import GeneratorType
  9 | 
 10 | 
 11 | def find_all(string, sub, start=None, end=None, ignore_case=False, **kwargs):
 12 |     """
 13 |     Return all indices in string s where substring sub is
 14 |     found, such that sub is contained in the slice s[start:end].
 15 | 
 16 |     >>> list(find_all('The quick brown fox jumps over the lazy dog', 'fox'))
 17 |     [16]
 18 | 
 19 |     >>> list(find_all('The quick brown fox jumps over the lazy dog', 'mountain'))
 20 |     []
 21 | 
 22 |     >>> list(find_all('The quick brown fox jumps over the lazy dog', 'The'))
 23 |     [0]
 24 | 
 25 |     >>> list(find_all(
 26 |     ... 'Carved symbols in a mountain hollow on the bank of an inlet irritated an eccentric person',
 27 |     ... 'an'))
 28 |     [44, 51, 70]
 29 | 
 30 |     >>> list(find_all(
 31 |     ... 'Carved symbols in a mountain hollow on the bank of an inlet irritated an eccentric person',
 32 |     ... 'an',
 33 |     ... 50,
 34 |     ... 60))
 35 |     [51]
 36 | 
 37 |     :param string: the input string
 38 |     :type string: str
 39 |     :param sub: the substring
 40 |     :type sub: str
 41 |     :return: all indices in the input string
 42 |     :rtype: __generator[str]
 43 |     """
 44 |     #pylint: disable=unused-argument
 45 |     if ignore_case:
 46 |         sub = sub.lower()
 47 |         string = string.lower()
 48 |     while True:
 49 |         start = string.find(sub, start, end)
 50 |         if start == -1:
 51 |             return
 52 |         yield start
 53 |         start += len(sub)
 54 | 
 55 | 
 56 | def get_first_defined(data, keys, default_value=None):
 57 |     """
 58 |     Get the first defined key in data.
 59 |     :param data:
 60 |     :type data:
 61 |     :param keys:
 62 |     :type keys:
 63 |     :param default_value:
 64 |     :type default_value:
 65 |     :return:
 66 |     :rtype:
 67 |     """
 68 |     for key in keys:
 69 |         if key in data:
 70 |             return data[key]
 71 |     return default_value
 72 | 
 73 | 
 74 | def is_iterable(obj):
 75 |     """
 76 |     Are we being asked to look up a list of things, instead of a single thing?
 77 |     We check for the `__iter__` attribute so that this can cover types that
 78 |     don't have to be known by this module, such as NumPy arrays.
 79 | 
 80 |     Strings, however, should be considered as atomic values to look up, not
 81 |     iterables.
 82 | 
 83 |     We don't need to check for the Python 2 `unicode` type, because it doesn't
 84 |     have an `__iter__` attribute anyway.
 85 |     """
 86 |     # pylint: disable=consider-using-ternary
 87 |     return hasattr(obj, '__iter__') and not isinstance(obj, str) or isinstance(obj, GeneratorType)
 88 | 
 89 | 
 90 | def extend_safe(target, source):
 91 |     """
 92 |     Extends source list to target list only if elements doesn't exists in target list.
 93 |     :param target:
 94 |     :type target: list
 95 |     :param source:
 96 |     :type source: list
 97 |     """
 98 |     for elt in source:
 99 |         if elt not in target:
100 |             target.append(elt)
101 | 
102 | 
103 | class _Ref:
104 |     """
105 |     Reference for IdentitySet
106 |     """
107 |     def __init__(self, value):
108 |         self.value = value
109 | 
110 |     def __eq__(self, other):
111 |         return self.value is other.value
112 | 
113 |     def __hash__(self):
114 |         return id(self.value)
115 | 
116 | 
class IdentitySet(MutableSet):  # pragma: no cover
    """
    Mutable set whose membership is decided by object identity instead of
    equality. Elements are stored wrapped in _Ref objects.
    """
    def __init__(self, items=None):  # pylint: disable=super-init-not-called
        initial = [] if items is None else items
        self.refs = {_Ref(item) for item in initial}

    def __len__(self):
        return len(self.refs)

    def __iter__(self):
        return (wrapped.value for wrapped in self.refs)

    def __contains__(self, elem):
        return _Ref(elem) in self.refs

    def add(self, value):
        self.refs.add(_Ref(value))

    def discard(self, value):
        self.refs.discard(_Ref(value))

    def update(self, iterable):
        """
        Add every element of iterable to the set.
        :param iterable: elements to add
        :type iterable:
        :return:
        :rtype:
        """
        for item in iterable:
            self.add(item)

    def __repr__(self):  # pragma: no cover
        return f"{type(self).__name__}({list(self)})"
154 | 


--------------------------------------------------------------------------------
/rebulk/validators.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Validator functions to use in patterns.
 5 | 
 6 | All those function have last argument as match, so it's possible to use functools.partial to bind previous arguments.
 7 | """
 8 | 
 9 | 
def chars_before(chars, match):
    """
    Validate the match when the character just before it belongs to ``chars``.
    A match with nothing before it (start at 0 or below) is always valid.

    :param chars: accepted characters
    :type chars:
    :param match: object exposing ``start`` and ``input_string``
    :type match:
    :return: True if the match is valid
    :rtype: bool
    """
    nothing_on_the_left = match.start <= 0
    return nothing_on_the_left or match.input_string[match.start - 1] in chars
24 | 
25 | 
def chars_after(chars, match):
    """
    Validate the match when the character just after it belongs to ``chars``.
    A match reaching the end of the input string is always valid.

    :param chars: accepted characters
    :type chars:
    :param match: object exposing ``end`` and ``input_string``
    :type match:
    :return: True if the match is valid
    :rtype: bool
    """
    text = match.input_string
    nothing_on_the_right = match.end >= len(text)
    return nothing_on_the_right or text[match.end] in chars
40 | 
41 | 
def chars_surround(chars, match):
    """
    Validate the match when both chars_before and chars_after accept it for
    the given characters.

    :param chars: accepted characters
    :type chars:
    :param match: match to validate
    :type match:
    :return: True if the match is valid
    :rtype: bool
    """
    # The right side is only examined once the left side is accepted.
    if not chars_before(chars, match):
        return False
    return chars_after(chars, match)
54 | 
55 | 
def validators(*chained_validators):
    """
    Combine several validator functions into a single one.

    The returned validator calls each function in order with the match and
    stops at the first one returning a falsy value.

    :param chained_validators: validator functions taking a match
    :type chained_validators:
    :return: validator returning True only if every chained validator accepts the match
    :rtype: callable
    """

    def validator_chain(match):  # pylint:disable=missing-docstring
        return all(check(match) for check in chained_validators)

    return validator_chain
73 | 
74 | 
def allways_true(match):  # pylint:disable=unused-argument
    """
    Validator accepting every match.

    :param match: ignored
    :return: always True
    :rtype: bool
    """
    return True
82 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # see https://caremad.io/blog/setup-vs-requirement/
2 | -e .
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import io
 5 | import re
 6 | 
 7 | from setuptools import setup, find_packages
 8 | 
# The long description published with the package is the README followed by
# the changelog; both are read as UTF-8.
with io.open('CHANGELOG.md', encoding='utf-8') as f:
    changelog = f.read()

with io.open('README.md', 'r', encoding='utf-8') as f:
    readme = f.read()

# Runtime dependency: setuptools, only on Python 3.12 and later.
# NOTE(review): the reason for this marker is not visible here - confirm.
install_requires = ['setuptools;python_version>="3.12"']

# Dependencies of the optional 'native' extra.
native_requires = ['regex']

# Dependencies of the optional 'dev' extra.
dev_require = ['pytest', 'pylint', 'tox', 'python-semantic-release', 'twine']

# Dependencies of the 'test' extra, also passed as tests_require.
tests_require = ['pytest', 'pylint']

# Extract the version string from the `__version__ = '...'` assignment without
# importing the package. The file is opened with the platform default encoding.
with io.open('rebulk/__version__.py', 'r') as f:
    version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]$', f.read(), re.MULTILINE).group(1)

# Keyword arguments handed to setuptools.setup() below.
args = dict(name='rebulk',
            version=version,
            description='Rebulk - Define simple search patterns in bulk to perform advanced matching on any string.',
            long_description=readme + '\n\n' + changelog,
            long_description_content_type='text/markdown',
            # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
            classifiers=['Development Status :: 5 - Production/Stable',
                         'License :: OSI Approved :: MIT License',
                         'Operating System :: OS Independent',
                         'Intended Audience :: Developers',
                         'Programming Language :: Python :: 3',
                         'Programming Language :: Python :: 3.7',
                         'Programming Language :: Python :: 3.8',
                         'Programming Language :: Python :: 3.9',
                         'Programming Language :: Python :: 3.10',
                         'Programming Language :: Python :: 3.11',
                         'Programming Language :: Python :: 3.12',
                         'Topic :: Software Development :: Libraries :: Python Modules'
                         ],
            keywords='re regexp regular expression search pattern string match',
            author='Rémi Alvergnat',
            author_email='toilal.dev@gmail.com',
            url='https://github.com/Toilal/rebulk/',
            # The source archive URL embeds the version extracted above.
            download_url='https://pypi.python.org/packages/source/r/rebulk/rebulk-%s.tar.gz' % version,
            license='MIT',
            packages=find_packages(),
            include_package_data=True,
            install_requires=install_requires,
            tests_require=tests_require,
            test_suite='rebulk.test',
            zip_safe=True,
            # Optional dependency groups, installable as rebulk[test], rebulk[dev], rebulk[native].
            extras_require={
                'test': tests_require,
                'dev': dev_require,
                'native': native_requires
            }
            )

setup(**args)
65 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py37,py38,py39,py310,py311,py312,pypy3.8,pypy3.9,pypy3.10
3 | 
4 | [testenv]
5 | commands =
6 |     {envbindir}/pip install -e .[dev]
7 |     {envpython} setup.py test
8 | 


--------------------------------------------------------------------------------