├── .gitignore ├── .travis.yml ├── CHANGELOG.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── dev-requirements.txt ├── hypothesis_regex.py ├── setup.cfg ├── setup.py ├── tests └── test_regex.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | /build/ 4 | /dist/ 5 | /.tox/ 6 | /.eggs/ 7 | /.cache/ 8 | /.hypothesis/ 9 | /hypothesis_regex.egg-info/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | # http://blog.travis-ci.com/2014-12-17-faster-builds-with-container-based-infrastructure/ 5 | sudo: false 6 | python: 7 | - "3.6" 8 | - "3.5" 9 | - "3.4" 10 | - "3.3" 11 | - "2.7" 12 | - "pypy" 13 | 14 | before_install: 15 | - pip install -U pip 16 | 17 | install: 18 | - pip install -U .[reco] 19 | - pip install -U -r dev-requirements.txt 20 | 21 | script: python setup.py test 22 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | --------- 3 | 4 | 0.1 (2017-05-15) 5 | ++++++++++++++++ 6 | 7 | * Initial release. 
8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Maxim Kulkin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst LICENSE 2 | recursive-include tests * 3 | recursive-exclude tests *.pyc 4 | recursive-exclude tests *.pyo 5 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | hypothesis-regex 3 | **************** 4 | 5 | .. 
image:: https://img.shields.io/pypi/l/hypothesis-regex.svg 6 | :target: https://github.com/maximkulkin/hypothesis-regex/blob/master/LICENSE 7 | :alt: License: MIT 8 | 9 | .. image:: https://img.shields.io/travis/maximkulkin/hypothesis-regex.svg 10 | :target: https://travis-ci.org/maximkulkin/hypothesis-regex 11 | :alt: Build Status 12 | 13 | .. image:: https://img.shields.io/pypi/v/hypothesis-regex.svg 14 | :target: https://pypi.python.org/pypi/hypothesis-regex 15 | :alt: PyPI 16 | 17 | `Hypothesis `_ extension 18 | for generating strings that match a given regular expression. Useful when you 19 | have a schema (e.g. JSON Schema) that already validates data with regexes. 20 | 21 | Example 22 | ======= 23 | 24 | .. code:: python 25 | 26 | from hypothesis import given 27 | from hypothesis_regex import regex 28 | import requests 29 | 30 | EMAIL_REGEX = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]{2,}\.[a-zA-Z0-9-.]{2,}$" 31 | 32 | @given(regex(EMAIL_REGEX)) 33 | def test_registering_user(email): 34 | response = requests.post('/signup', json={'email': email}) 35 | assert response.status_code == 201 36 | 37 | Features 38 | ======== 39 | 40 | The regex strategy returns strings that always match the given regex (this is 41 | enforced by a filter) and tries to generate them efficiently, so that fewer 42 | generated examples are filtered out. However, some regex constructs may decrease 43 | strategy efficiency and should be used with caution: 44 | 45 | * "^" and "$" in the middle of a string do not do anything. 46 | * "\\b" and "\\B" (word boundary and non-word-boundary) do not do anything and 47 | instead rely on the top-level regex match filter to filter out non-matching 48 | examples. 49 | * positive lookaheads and lookbehinds just generate the data they should match (as 50 | if it were part of the preceding/following parts). 
51 | * negative lookaheads and lookbehinds do not do anything, so the strategy relies 52 | on the preceding/following parts to generate correct strings (otherwise the 53 | example will be filtered out). 54 | * "(?(id)yes-pattern|no-pattern)" does not check whether the group with the given 55 | id was actually used and instead just generates either the yes- or no-pattern. 56 | 57 | The regex strategy deliberately generates unusual data (e.g. "$" at the end of a 58 | string either generates nothing or generates a newline). The idea is not to 59 | produce nice-looking strings but any crazy, unexpected combination that still 60 | matches your given regex, so you can prepare for such inputs and handle them in 61 | the most appropriate way. 62 | 63 | You can use regex flags to get more control over the strategy: 64 | 65 | * re.IGNORECASE - literals and literal ranges generate both lowercase and uppercase 66 | letters. E.g. `r'a'` will generate both `"a"` and `"A"`, and `'[a-z]'` will 67 | generate both lowercase and uppercase English characters. 68 | * re.DOTALL - the "." char will be able to generate newlines 69 | * re.UNICODE - character categories 70 | ("\\w", "\\d" or "\\s" and their negations) will generate unicode characters. 71 | This is the default on Python 3; see re.ASCII to reverse it. 72 | 73 | There are two ways to pass regex flags: 74 | 75 | 1. By passing a compiled regex with those flags: `regex(re.compile('abc', re.IGNORECASE))` 76 | 2. By using inline flags syntax: `regex('(?i)abc')` 77 | 78 | Installation 79 | ============ 80 | :: 81 | 82 | $ pip install hypothesis-regex 83 | 84 | Requirements 85 | ============ 86 | 87 | - Python >= 2.7 and <= 3.6 88 | - `hypothesis `__ >= 3.8 89 | 90 | Project Links 91 | ============= 92 | 93 | - PyPI: https://pypi.python.org/pypi/hypothesis-regex 94 | - Issues: https://github.com/maximkulkin/hypothesis-regex/issues 95 | 96 | License 97 | ======= 98 | 99 | MIT licensed. See the bundled `LICENSE `_ file for more details. 
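The two ways of passing regex flags described under "Features" are interchangeable because the inline `(?i)` syntax sets the same flag bits on the compiled pattern that `re.compile()` accepts as an argument. A stdlib-only sketch (the `explicit` and `inline` names are invented for this example; hypothesis-regex itself is not required to see the equivalence):

```python
import re

# Two equivalent ways to request case-insensitive matching: pass the flag
# to re.compile(), or embed it in the pattern with the inline (?i) syntax.
explicit = re.compile('abc', re.IGNORECASE)  # flag passed to re.compile()
inline = re.compile('(?i)abc')               # flag embedded in the pattern

# Both compiled patterns carry the IGNORECASE flag bit, which is what
# regex() reads to decide how to generate literals.
assert explicit.flags & re.IGNORECASE
assert inline.flags & re.IGNORECASE

# And both match regardless of case.
assert explicit.match('ABC')
assert inline.match('aBc')
```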
100 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=2.9 2 | tox>=1.5 3 | -------------------------------------------------------------------------------- /hypothesis_regex.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import re 3 | import six 4 | import six.moves 5 | import string 6 | import sre_parse as sre 7 | import sys 8 | import hypothesis.errors as he 9 | import hypothesis.strategies as hs 10 | 11 | __all__ = ['regex'] 12 | 13 | 14 | HAS_SUBPATTERN_FLAGS = sys.version_info[:2] >= (3, 6) 15 | 16 | 17 | UNICODE_CATEGORIES = set([ 18 | 'Cf', 'Cn', 'Co', 'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 19 | 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 20 | 'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 21 | 'Zp', 'Zs', 22 | ]) 23 | 24 | 25 | SPACE_CHARS = u' \t\n\r\f\v' 26 | UNICODE_SPACE_CHARS = SPACE_CHARS + u'\x1c\x1d\x1e\x1f\x85' 27 | UNICODE_DIGIT_CATEGORIES = set(['Nd']) 28 | UNICODE_SPACE_CATEGORIES = set(['Zs', 'Zl', 'Zp']) 29 | UNICODE_LETTER_CATEGORIES = set(['LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu']) 30 | UNICODE_WORD_CATEGORIES = UNICODE_LETTER_CATEGORIES | set(['Nd', 'Nl', 'No']) 31 | 32 | HAS_WEIRD_WORD_CHARS = (2, 7) <= sys.version_info[:2] < (3, 4) 33 | UNICODE_WEIRD_NONWORD_CHARS = u'\U00012432\U00012433\U00012456\U00012457' 34 | 35 | 36 | class Context(object): 37 | __slots__ = ['groups', 'flags'] 38 | 39 | def __init__(self, groups=None, flags=0): 40 | self.groups = groups or {} 41 | self.flags = flags 42 | 43 | 44 | class CharactersBuilder(object): 45 | ''' 46 | Helper object for configuring a `characters()` strategy with various 47 | Unicode categories and characters. Also supports negating the configured set. 
48 | 49 | :param negate: If True, configure `characters()` to match anything other than 50 | configured character set 51 | :param flags: Regex flags. They affect how and which characters are matched 52 | ''' 53 | def __init__(self, negate=False, flags=0): 54 | self._categories = set() 55 | self._whitelist_chars = set() 56 | self._blacklist_chars = set() 57 | self._negate = negate 58 | self._ignorecase = flags & re.IGNORECASE 59 | self._unicode = (not flags & re.ASCII) \ 60 | if six.PY3 else bool(flags & re.UNICODE) 61 | 62 | @property 63 | def strategy(self): 64 | 'Returns resulting strategy that generates configured char set' 65 | max_codepoint = None if self._unicode else 127 66 | 67 | strategies = [] 68 | if self._negate: 69 | if self._categories or self._whitelist_chars: 70 | strategies.append( 71 | hs.characters( 72 | blacklist_categories=self._categories | set(['Cc', 'Cs']), 73 | blacklist_characters=self._whitelist_chars, 74 | max_codepoint=max_codepoint, 75 | ) 76 | ) 77 | if self._blacklist_chars: 78 | strategies.append( 79 | hs.sampled_from( 80 | list(self._blacklist_chars - self._whitelist_chars) 81 | ) 82 | ) 83 | else: 84 | if self._categories or self._blacklist_chars: 85 | strategies.append( 86 | hs.characters( 87 | whitelist_categories=self._categories, 88 | blacklist_characters=self._blacklist_chars, 89 | max_codepoint=max_codepoint, 90 | ) 91 | ) 92 | if self._whitelist_chars: 93 | strategies.append( 94 | hs.sampled_from( 95 | list(self._whitelist_chars - self._blacklist_chars) 96 | ) 97 | ) 98 | 99 | return hs.one_of(*strategies) if strategies else hs.just(u'') 100 | 101 | def add_category(self, category): 102 | ''' 103 | Add unicode category to set 104 | 105 | Unicode categories are strings like 'Ll', 'Lu', 'Nd', etc. 
106 | See `unicodedata.category()` 107 | ''' 108 | if category == sre.CATEGORY_DIGIT: 109 | self._categories |= UNICODE_DIGIT_CATEGORIES 110 | elif category == sre.CATEGORY_NOT_DIGIT: 111 | self._categories |= UNICODE_CATEGORIES - UNICODE_DIGIT_CATEGORIES 112 | elif category == sre.CATEGORY_SPACE: 113 | self._categories |= UNICODE_SPACE_CATEGORIES 114 | for c in (UNICODE_SPACE_CHARS if self._unicode else SPACE_CHARS): 115 | self._whitelist_chars.add(c) 116 | elif category == sre.CATEGORY_NOT_SPACE: 117 | self._categories |= UNICODE_CATEGORIES - UNICODE_SPACE_CATEGORIES 118 | for c in (UNICODE_SPACE_CHARS if self._unicode else SPACE_CHARS): 119 | self._blacklist_chars.add(c) 120 | elif category == sre.CATEGORY_WORD: 121 | self._categories |= UNICODE_WORD_CATEGORIES 122 | self._whitelist_chars.add(u'_') 123 | if HAS_WEIRD_WORD_CHARS and self._unicode: 124 | for c in UNICODE_WEIRD_NONWORD_CHARS: 125 | self._blacklist_chars.add(c) 126 | elif category == sre.CATEGORY_NOT_WORD: 127 | self._categories |= UNICODE_CATEGORIES - UNICODE_WORD_CATEGORIES 128 | self._blacklist_chars.add(u'_') 129 | if HAS_WEIRD_WORD_CHARS and self._unicode: 130 | for c in UNICODE_WEIRD_NONWORD_CHARS: 131 | self._whitelist_chars.add(c) 132 | 133 | def add_chars(self, chars): 134 | 'Add given chars to char set' 135 | for c in chars: 136 | if self._ignorecase: 137 | self._whitelist_chars.add(c.lower()) 138 | self._whitelist_chars.add(c.upper()) 139 | else: 140 | self._whitelist_chars.add(c) 141 | 142 | 143 | @hs.defines_strategy 144 | def regex(regex): 145 | """Return strategy that generates strings that match given regex. 146 | 147 | Regex can be either a string or compiled regex (through `re.compile()`). 148 | 149 | You can use regex flags (such as `re.IGNORECASE`, `re.DOTALL` or `re.UNICODE`) 150 | to control generation. Flags can be passed either in compiled regex (specify 151 | flags in call to `re.compile()`) or inside pattern with (?iLmsux) group. 
152 | 153 | Some tricky regular expressions are partly supported or not supported at all. 154 | "^" and "$" do not affect generation. Positive lookahead/lookbehind groups 155 | are treated as normal groups. Negative lookahead/lookbehind groups do not do 156 | anything. Ternary regex groups ('(?(name)yes-pattern|no-pattern)') generate 157 | either pattern without checking whether the referenced group was used. 158 | """ 159 | if not hasattr(regex, 'pattern'): 160 | regex = re.compile(regex) 161 | 162 | pattern = regex.pattern 163 | flags = regex.flags 164 | 165 | codes = sre.parse(pattern) 166 | 167 | return _strategy(codes, Context(flags=flags)).filter(regex.match) 168 | 169 | 170 | def _strategy(codes, context): 171 | """ 172 | Convert an SRE regex parse tree to a strategy that generates strings matching 173 | the regex represented by that parse tree. 174 | 175 | `codes` is either a list of SRE regex element representations or a single 176 | element representation. Each element is a tuple of an element code (as string) 177 | and parameters. E.g. the regex 'ab[0-9]+' compiles to the following elements: 178 | 179 | [ 180 | ('literal', 97), 181 | ('literal', 98), 182 | ('max_repeat', (1, 4294967295, [ 183 | ('in', [ 184 | ('range', (48, 57)) 185 | ]) 186 | ])) 187 | ] 188 | 189 | The function recursively traverses the regex element tree and converts each 190 | element to a strategy that generates strings matching that element. 191 | 192 | Context stores 193 | 1. List of groups (for backreferences) 194 | 2. Active regex flags (e.g. 
IGNORECASE, DOTALL, UNICODE, they affect behavior 195 | of various inner strategies) 196 | """ 197 | if not isinstance(codes, tuple): 198 | # List of codes 199 | strategies = [] 200 | 201 | i = 0 202 | while i < len(codes): 203 | if codes[i][0] == sre.LITERAL and not (context.flags & re.IGNORECASE): 204 | # Merge subsequent "literals" into one `just()` strategy 205 | # that generates corresponding text if no IGNORECASE 206 | j = i + 1 207 | while j < len(codes) and codes[j][0] == sre.LITERAL: 208 | j += 1 209 | 210 | strategies.append(hs.just( 211 | u''.join([six.unichr(charcode) for (_, charcode) in codes[i:j]]) 212 | )) 213 | 214 | i = j 215 | else: 216 | strategies.append(_strategy(codes[i], context)) 217 | i += 1 218 | 219 | return hs.tuples(*strategies).map(u''.join) 220 | else: 221 | # Single code 222 | code, value = codes 223 | if code == sre.LITERAL: 224 | # Regex 'a' (single char) 225 | c = six.unichr(value) 226 | if context.flags & re.IGNORECASE: 227 | return hs.sampled_from([c.lower(), c.upper()]) 228 | else: 229 | return hs.just(c) 230 | 231 | elif code == sre.NOT_LITERAL: 232 | # Regex '[^a]' (negation of a single char) 233 | c = six.unichr(value) 234 | blacklist = set([c.lower(), c.upper()]) \ 235 | if context.flags & re.IGNORECASE else [c] 236 | return hs.characters(blacklist_characters=blacklist) 237 | 238 | elif code == sre.IN: 239 | # Regex '[abc0-9]' (set of characters) 240 | charsets = value 241 | 242 | builder = CharactersBuilder(negate=charsets[0][0] == sre.NEGATE, 243 | flags=context.flags) 244 | 245 | for charset_code, charset_value in charsets: 246 | if charset_code == sre.NEGATE: 247 | # Regex '[^...]' (negation) 248 | pass 249 | elif charset_code == sre.LITERAL: 250 | # Regex '[a]' (single char) 251 | builder.add_chars(six.unichr(charset_value)) 252 | elif charset_code == sre.RANGE: 253 | # Regex '[a-z]' (char range) 254 | low, high = charset_value 255 | for x in six.moves.range(low, high+1): 256 | builder.add_chars(six.unichr(x)) 257 | 
elif charset_code == sre.CATEGORY: 258 | # Regex '[\w]' (char category) 259 | builder.add_category(charset_value) 260 | else: 261 | raise he.InvalidArgument( 262 | 'Unknown charset code: %s' % charset_code 263 | ) 264 | 265 | return builder.strategy 266 | 267 | elif code == sre.ANY: 268 | # Regex '.' (any char) 269 | if context.flags & re.DOTALL: 270 | return hs.characters() 271 | else: 272 | return hs.characters(blacklist_characters="\n") 273 | 274 | elif code == sre.AT: 275 | # Regexes like '^...', '...$', '\bfoo', '\Bfoo' 276 | if value == sre.AT_END: 277 | return hs.one_of(hs.just(u''), hs.just(u'\n')) 278 | return hs.just('') 279 | 280 | elif code == sre.SUBPATTERN: 281 | # Various groups: '(...)', '(?:...)' or '(?P<name>...)' 282 | old_flags = context.flags 283 | if HAS_SUBPATTERN_FLAGS: 284 | context.flags = (context.flags | value[1]) & ~value[2] 285 | 286 | strat = _strategy(value[-1], context) 287 | 288 | context.flags = old_flags 289 | 290 | if value[0]: 291 | context.groups[value[0]] = strat 292 | strat = hs.shared(strat, key=value[0]) 293 | 294 | return strat 295 | 296 | elif code == sre.GROUPREF: 297 | # Regex '\\1' or '(?P=name)' (group reference) 298 | return hs.shared(context.groups[value], key=value) 299 | 300 | elif code == sre.ASSERT: 301 | # Regex '(?=...)' or '(?<=...)' (positive lookahead/lookbehind) 302 | return _strategy(value[1], context) 303 | 304 | elif code == sre.ASSERT_NOT: 305 | # Regex '(?!...)' or '(?=3.8', 24 | 'six>=1.10', 25 | ], 26 | setup_requires=['pytest-runner'], 27 | tests_require=['pytest'], 28 | classifiers=[ 29 | 'Development Status :: 4 - Beta', 30 | 'Intended Audience :: Developers', 31 | 'License :: OSI Approved :: MIT License', 32 | 'Programming Language :: Python :: 2', 33 | 'Programming Language :: Python :: 2.7', 34 | 'Programming Language :: Python :: 3', 35 | 'Programming Language :: Python :: 3.3', 36 | 'Programming Language :: Python :: 3.4', 37 | 'Programming Language :: Python :: 3.5', 38 | 'Programming Language :: 
Python :: 3.6', 39 | 'Programming Language :: Python :: Implementation :: CPython', 40 | 'Programming Language :: Python :: Implementation :: PyPy', 41 | ], 42 | ) 43 | -------------------------------------------------------------------------------- /tests/test_regex.py: -------------------------------------------------------------------------------- 1 | import hypothesis as h 2 | import hypothesis.errors as he 3 | 4 | from hypothesis_regex import regex, UNICODE_CATEGORIES, UNICODE_DIGIT_CATEGORIES, \ 5 | UNICODE_SPACE_CATEGORIES, UNICODE_WORD_CATEGORIES, UNICODE_WEIRD_NONWORD_CHARS, \ 6 | SPACE_CHARS, UNICODE_SPACE_CHARS, HAS_WEIRD_WORD_CHARS 7 | import pytest 8 | import re 9 | import six 10 | import six.moves 11 | import sys 12 | import unicodedata 13 | 14 | 15 | def is_ascii(s): 16 | return all(ord(c) < 128 for c in s) 17 | 18 | 19 | def is_digit(s): 20 | return all(unicodedata.category(c) in UNICODE_DIGIT_CATEGORIES for c in s) 21 | 22 | 23 | def is_space(s): 24 | return all(c in SPACE_CHARS for c in s) 25 | 26 | 27 | def is_unicode_space(s): 28 | return all( 29 | unicodedata.category(c) in UNICODE_SPACE_CATEGORIES or \ 30 | c in UNICODE_SPACE_CHARS 31 | for c in s 32 | ) 33 | 34 | 35 | def is_word(s): 36 | return all( 37 | c == '_' or ( 38 | (not HAS_WEIRD_WORD_CHARS or c not in UNICODE_WEIRD_NONWORD_CHARS) and 39 | unicodedata.category(c) in UNICODE_WORD_CATEGORIES 40 | ) 41 | for c in s 42 | ) 43 | 44 | 45 | def ascii_regex(pattern): 46 | flags = re.ASCII if six.PY3 else 0 47 | return re.compile(pattern, flags) 48 | 49 | 50 | def unicode_regex(pattern): 51 | return re.compile(pattern, re.UNICODE) 52 | 53 | 54 | class TestRegexUnicodeMatching: 55 | def _test_matching_pattern(self, pattern, isvalidchar, unicode=False): 56 | r = unicode_regex(pattern) if unicode else ascii_regex(pattern) 57 | 58 | codepoints = six.moves.range(0, sys.maxunicode+1) \ 59 | if unicode else six.moves.range(1, 128) 60 | for c in [six.unichr(x) for x in codepoints]: 61 | if 
isvalidchar(c): 62 | assert r.match(c), ( 63 | '"%s" is supposed to match "%s" (%r, category "%s"), ' 64 | "but it doesn't" % (pattern, c, c, unicodedata.category(c)) 65 | ) 66 | else: 67 | assert not r.match(c), ( 68 | '"%s" is supposed not to match "%s" (%r, category "%s"), ' 69 | 'but it does' % (pattern, c, c, unicodedata.category(c)) 70 | ) 71 | 72 | def test_matching_ascii_word_chars(self): 73 | self._test_matching_pattern(r'\w', is_word) 74 | 75 | def test_matching_unicode_word_chars(self): 76 | self._test_matching_pattern(r'\w', is_word, unicode=True) 77 | 78 | def test_matching_ascii_non_word_chars(self): 79 | self._test_matching_pattern(r'\W', lambda s: not is_word(s)) 80 | 81 | def test_matching_unicode_non_word_chars(self): 82 | self._test_matching_pattern(r'\W', lambda s: not is_word(s), unicode=True) 83 | 84 | def test_matching_ascii_digits(self): 85 | self._test_matching_pattern(r'\d', is_digit) 86 | 87 | def test_matching_unicode_digits(self): 88 | self._test_matching_pattern(r'\d', is_digit, unicode=True) 89 | 90 | def test_matching_ascii_non_digits(self): 91 | self._test_matching_pattern(r'\D', lambda s: not is_digit(s)) 92 | 93 | def test_matching_unicode_non_digits(self): 94 | self._test_matching_pattern(r'\D', lambda s: not is_digit(s), unicode=True) 95 | 96 | def test_matching_ascii_spaces(self): 97 | self._test_matching_pattern(r'\s', is_space) 98 | 99 | def test_matching_unicode_spaces(self): 100 | self._test_matching_pattern(r'\s', is_unicode_space, unicode=True) 101 | 102 | def test_matching_ascii_non_spaces(self): 103 | self._test_matching_pattern(r'\S', lambda s: not is_space(s)) 104 | 105 | def test_matching_unicode_non_spaces(self): 106 | self._test_matching_pattern(r'\S', lambda s: not is_unicode_space(s), 107 | unicode=True) 108 | 109 | 110 | def assert_all_examples(strategy, predicate): 111 | ''' 112 | Checks that no example generated by the given strategy 113 | fails the predicate. 
114 | 115 | :param strategy: Hypothesis strategy to check 116 | :param predicate: (callable) Predicate that takes string example and returns bool 117 | ''' 118 | @h.settings(max_examples=1000, max_iterations=5000) 119 | @h.given(strategy) 120 | def assert_examples(s): 121 | assert predicate(s),'Found %r using strategy %s which does not match' % ( 122 | s, strategy, 123 | ) 124 | 125 | assert_examples() 126 | 127 | 128 | def assert_can_generate(pattern): 129 | ''' 130 | Checks that regex strategy for given pattern generates examples 131 | that match that regex pattern 132 | ''' 133 | compiled_pattern = re.compile(pattern) 134 | strategy = regex(pattern) 135 | 136 | assert_all_examples(strategy, compiled_pattern.match) 137 | 138 | 139 | class TestRegexStrategy: 140 | @pytest.mark.parametrize('pattern', ['abc', '[a][b][c]']) 141 | def test_literals(self, pattern): 142 | assert_can_generate(pattern) 143 | 144 | @pytest.mark.parametrize('pattern', [re.compile('a', re.IGNORECASE), '(?i)a']) 145 | def test_literals_with_ignorecase(self, pattern): 146 | strategy = regex(pattern) 147 | 148 | h.find(strategy, lambda s: s == 'a') 149 | h.find(strategy, lambda s: s == 'A') 150 | 151 | def test_not_literal(self): 152 | assert_can_generate('[^a][^b][^c]') 153 | 154 | @pytest.mark.parametrize('pattern', [ 155 | re.compile('[^a][^b]', re.IGNORECASE), 156 | '(?i)[^a][^b]' 157 | ]) 158 | def test_not_literal_with_ignorecase(self, pattern): 159 | assert_all_examples( 160 | regex(pattern), 161 | lambda s: s[0] not in ('a', 'A') and s[1] not in ('b', 'B') 162 | ) 163 | 164 | def test_any(self): 165 | assert_can_generate('.') 166 | 167 | def test_any_doesnt_generate_newline(self): 168 | assert_all_examples(regex('.'), lambda s: s != '\n') 169 | 170 | @pytest.mark.parametrize('pattern', [re.compile('.', re.DOTALL), '(?s).']) 171 | def test_any_with_dotall_generate_newline(self, pattern): 172 | h.find(regex(pattern), lambda s: s == '\n') 173 | 174 | def test_range(self): 175 | 
assert_can_generate('[a-z0-9_]') 176 | 177 | def test_negative_range(self): 178 | assert_can_generate('[^a-z0-9_]') 179 | 180 | @pytest.mark.parametrize('pattern', [r'\d', '[\d]', '[^\D]']) 181 | def test_ascii_digits(self, pattern): 182 | strategy = regex(ascii_regex(pattern)) 183 | 184 | assert_all_examples(strategy, lambda s: is_digit(s) and is_ascii(s)) 185 | 186 | @pytest.mark.parametrize('pattern', [r'\d', '[\d]', '[^\D]']) 187 | def test_unicode_digits(self, pattern): 188 | strategy = regex(unicode_regex(pattern)) 189 | 190 | h.find(strategy, lambda s: is_digit(s) and is_ascii(s)) 191 | h.find(strategy, lambda s: is_digit(s) and not is_ascii(s)) 192 | 193 | assert_all_examples(strategy, is_digit) 194 | 195 | @pytest.mark.parametrize('pattern', [r'\D', '[\D]', '[^\d]']) 196 | def test_ascii_non_digits(self, pattern): 197 | strategy = regex(ascii_regex(pattern)) 198 | 199 | assert_all_examples(strategy, lambda s: not is_digit(s) and is_ascii(s)) 200 | 201 | @pytest.mark.parametrize('pattern', [r'\D', '[\D]', '[^\d]']) 202 | def test_unicode_non_digits(self, pattern): 203 | strategy = regex(unicode_regex(pattern)) 204 | 205 | h.find(strategy, lambda s: not is_digit(s) and is_ascii(s)) 206 | h.find(strategy, lambda s: not is_digit(s) and not is_ascii(s)) 207 | 208 | assert_all_examples(strategy, lambda s: not is_digit(s)) 209 | 210 | @pytest.mark.parametrize('pattern', [r'\s', '[\s]', '[^\S]']) 211 | def test_ascii_whitespace(self, pattern): 212 | strategy = regex(ascii_regex(pattern)) 213 | 214 | assert_all_examples(strategy, lambda s: is_space(s) and is_ascii(s)) 215 | 216 | @pytest.mark.parametrize('pattern', [r'\s', '[\s]', '[^\S]']) 217 | def test_unicode_whitespace(self, pattern): 218 | strategy = regex(unicode_regex(pattern)) 219 | 220 | h.find(strategy, lambda s: is_unicode_space(s) and is_ascii(s)) 221 | h.find(strategy, lambda s: is_unicode_space(s) and not is_ascii(s)) 222 | 223 | assert_all_examples(strategy, is_unicode_space) 224 | 225 | 
@pytest.mark.parametrize('pattern', [r'\S', '[\S]', '[^\s]']) 226 | def test_ascii_non_whitespace(self, pattern): 227 | strategy = regex(ascii_regex(pattern)) 228 | 229 | assert_all_examples(strategy, lambda s: not is_space(s) and is_ascii(s)) 230 | 231 | @pytest.mark.parametrize('pattern', [r'\S', '[\S]', '[^\s]']) 232 | def test_unicode_non_whitespace(self, pattern): 233 | strategy = regex(unicode_regex(pattern)) 234 | 235 | h.find(strategy, lambda s: not is_unicode_space(s) and is_ascii(s)) 236 | h.find(strategy, lambda s: not is_unicode_space(s) and not is_ascii(s)) 237 | 238 | assert_all_examples(strategy, lambda s: not is_unicode_space(s)) 239 | 240 | @pytest.mark.parametrize('pattern', [r'\w', '[\w]', '[^\W]']) 241 | def test_ascii_word(self, pattern): 242 | strategy = regex(ascii_regex(pattern)) 243 | 244 | assert_all_examples(strategy, lambda s: is_word(s) and is_ascii(s)) 245 | 246 | @pytest.mark.parametrize('pattern', [r'\w', '[\w]', '[^\W]']) 247 | def test_unicode_word(self, pattern): 248 | strategy = regex(unicode_regex(pattern)) 249 | 250 | h.find(strategy, lambda s: is_word(s) and is_ascii(s)) 251 | h.find(strategy, lambda s: is_word(s) and not is_ascii(s)) 252 | 253 | assert_all_examples(strategy, is_word) 254 | 255 | @pytest.mark.parametrize('pattern', [r'\W', '[\W]', '[^\w]']) 256 | def test_ascii_non_word(self, pattern): 257 | strategy = regex(ascii_regex(pattern)) 258 | 259 | assert_all_examples(strategy, lambda s: not is_word(s) and is_ascii(s)) 260 | 261 | @pytest.mark.parametrize('pattern', [r'\W', '[\W]', '[^\w]']) 262 | def test_unicode_non_word(self, pattern): 263 | strategy = regex(unicode_regex(pattern)) 264 | 265 | h.find(strategy, lambda s: not is_word(s) and is_ascii(s)) 266 | h.find(strategy, lambda s: not is_word(s) and not is_ascii(s)) 267 | 268 | assert_all_examples(strategy, lambda s: not is_word(s)) 269 | 270 | def test_question_mark_quantifier(self): 271 | assert_can_generate('ab?') 272 | 273 | def 
test_asterisk_quantifier(self): 274 | assert_can_generate('ab*') 275 | 276 | def test_plus_quantifier(self): 277 | assert_can_generate('ab+') 278 | 279 | def test_repeater(self): 280 | assert_can_generate('ab{5}') 281 | assert_can_generate('ab{5,10}') 282 | assert_can_generate('ab{,10}') 283 | assert_can_generate('ab{5,}') 284 | 285 | def test_branch(self): 286 | assert_can_generate('ab|cd|ef') 287 | 288 | def test_group(self): 289 | assert_can_generate('(foo)+') 290 | 291 | def test_group_backreference(self): 292 | assert_can_generate('([\'"])[a-z]+\\1') 293 | 294 | def test_non_capturing_group(self): 295 | assert_can_generate('(?:[a-z])([\'"])[a-z]+\\1') 296 | 297 | def test_named_groups(self): 298 | assert_can_generate('(?P<foo>[\'"])[a-z]+(?P=foo)') 299 | 300 | def test_beginning(self): 301 | assert_can_generate('^abc') 302 | 303 | def test_caret_in_the_middle_does_not_generate_anything(self): 304 | r = re.compile('a^b') 305 | 306 | with pytest.raises(he.NoSuchExample): 307 | h.find(regex(r), r.match) 308 | 309 | def test_end(self): 310 | strategy = regex('abc$') 311 | 312 | h.find(strategy, lambda s: s == 'abc') 313 | h.find(strategy, lambda s: s == 'abc\n') 314 | 315 | def test_groupref_exists(self): 316 | assert_all_examples( 317 | regex('^(<)?a(?(1)>)$'), 318 | lambda s: s in ('a', 'a\n', '<a>', '<a>\n') 319 | ) 320 | assert_all_examples( 321 | regex('^(a)?(?(1)b|c)$'), 322 | lambda s: s in ('ab', 'ab\n', 'c', 'c\n') 323 | ) 324 | 325 | @pytest.mark.skipif(sys.version_info[:2] < (3, 6), reason='requires Python 3.6') 326 | def test_subpattern_flags(self): 327 | strategy = regex('(?i)a(?-i:b)') 328 | 329 | # "a" is case insensitive 330 | h.find(strategy, lambda s: s[0] == 'a') 331 | h.find(strategy, lambda s: s[0] == 'A') 332 | # "b" is case sensitive 333 | h.find(strategy, lambda s: s[1] == 'b') 334 | 335 | with pytest.raises(he.NoSuchExample): 336 | h.find(strategy, lambda s: s[1] == 'B') 337 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py27,py33,py34,py35,py36,pypy 3 | [testenv] 4 | deps= 5 | -rdev-requirements.txt 6 | commands= 7 | python setup.py test 8 | --------------------------------------------------------------------------------