├── .coveragerc
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
│   └── os_urlpattern
│       ├── VERSION
│       ├── __init__.py
│       ├── cmdline.py
│       ├── compat.py
│       ├── config
│       │   ├── __init__.py
│       │   └── default_config.cfg
│       ├── definition.py
│       ├── exceptions.py
│       ├── formatter.py
│       ├── parse_utils.py
│       ├── parsed_piece_view.py
│       ├── parser.py
│       ├── pattern.py
│       ├── pattern_cluster.py
│       ├── pattern_maker.py
│       ├── pattern_matcher.py
│       ├── piece_pattern_node.py
│       └── utils.py
├── tests
│   ├── __init__.py
│   ├── data
│   │   └── urls_example.txt
│   ├── test_cmdline.py
│   ├── test_config.py
│   ├── test_formatter.py
│   ├── test_parse_utils.py
│   ├── test_parsed_piece_view.py
│   ├── test_pattern.py
│   ├── test_pattern_maker.py
│   ├── test_pattern_matcher.py
│   └── test_piece_pattern_node.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | parallel = true
3 | branch = true
4 | source = os_urlpattern
5 |
6 | [paths]
7 | source =
8 | src/os_urlpattern
9 | .tox/*/lib/python*/site-packages/os_urlpattern
10 | .tox/*/site-packages/os_urlpattern
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # vscode
104 | .vscode/
105 |
106 | # pytest
107 | .pytest_cache/
108 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | sudo: false
3 | matrix:
4 | include:
5 | - python: 2.7
6 | env: TOXENV=py27,codecov
7 | - python: 3.6
8 | env: TOXENV=py36,codecov
9 | - python: pypy
10 | env: TOXENV=pypy
11 | - python: pypy3
12 | env: TOXENV=pypy3
13 | install:
14 | - pip install -U pip tox
15 | script:
16 | - tox
17 | deploy:
18 | provider: pypi
19 | user: cfhamlet
20 | password:
21 | secure: eGq3kLUT6D3grZ2ZlCaJ5e/9Ma3HkOLZQDDcMsWUs/zUqpngI/9ibplgbOcxpRxKCgFKn5GFDV9ZsKk00fEfYWpe4WZW2vG6mu3k63oB4FMkUQ4GGoQKcXdR27aNtNhvTzU3VPDgyEpNI5QJmTLJp3Y3fbzcjL3a87kschf6B46MP4Nu3NqWuXZDYIZN6GY8HwD6J3Ii15nl4rCS6phdYdKckyVX8coNQVWkljx+ZtfGMkClsui9BynKBNVwufm3/F1zwWI1UXCrU3v4FxqiCmK2CYSX7tdFcGHaVTf0NqscbPxZgPvM+1tUBbW1M5N5GlUf5f7CxwtFWEqFTlz926gzYrHUaewmjILWDm6OxWAKjuks8lgywQq2twYpd8UVlRywvjfaobGpptoBevuxgr/uzipeckWR0X1SiqUaFnKzuLOnVeZ9I1ixA5zcIR74xnjEOvBnMpeawzZsIidoQcn4PRzbyaR4uDxnYyWB5yW/Q9d1UbAYOe0QyQY6NnZzvkRovkge3H/Wlk+K2P0qSUmmznWSDekdBcm4yr3bZsujgWOKS3c9L/OHH+P3YVAC1x0304xGveWt0cU/sfTPpEi99N+0QOxPQX3CnutFkXZIgR4nsGWnZYnMngrr8eHIfav+Ms20UTYwjsn79vfXc10kkesQtW863GdFXBYfw3c=
22 | on:
23 | tags: true
24 | condition: ${TRAVIS_PYTHON_VERSION} == 2.7
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Ozzy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | include MANIFEST.in
4 | graft src
5 | graft tests
6 | global-exclude __pycache__
7 | global-exclude *.py[co]
8 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =============
2 | os-urlpattern
3 | =============
4 |
5 | .. image:: https://travis-ci.org/cfhamlet/os-urlpattern.svg?branch=master
6 | :target: https://travis-ci.org/cfhamlet/os-urlpattern
7 |
8 | .. image:: https://codecov.io/gh/cfhamlet/os-urlpattern/branch/master/graph/badge.svg
9 | :target: https://codecov.io/gh/cfhamlet/os-urlpattern
10 |
11 | .. image:: https://img.shields.io/pypi/pyversions/os-urlpattern.svg
12 | :alt: PyPI - Python Version
13 | :target: https://pypi.python.org/pypi/os-urlpattern
14 |
15 | .. image:: https://img.shields.io/pypi/v/os-urlpattern.svg
16 | :alt: PyPI
17 | :target: https://pypi.python.org/pypi/os-urlpattern
18 |
19 |
20 | This package is used for unsupervised URL clustering. Furthermore, it generates URL patterns (RegEx)
21 | from the clusters for matching purposes. It is a pure Python package tested under Python 2.7 and 3.6;
22 | `pypy <https://pypy.org>`_ can also be used for performance (4x-8x speedup). Command line tools are
23 | provided for standalone clustering and matching, and the APIs are also convenient. Several extra
24 | packages can be installed for additional features. Under CPython on one CPU, clustering 100 thousand
25 | URLs takes about 1min and 200M memory. The built-in matching strategy is efficient enough for most
26 | use cases (4k/s, depending on pattern complexity).
27 |
28 | .. code:: console
29 |
30 | $ pip install -U os-urlpattern
31 | $ wget -qO- 'https://git.io/f4QlP' | pattern-make
32 | /[0-9]{2}[\.]html
33 | http://example.com/01.html
34 | http://example.com/02.html
35 | http://example.com/03.html
36 | /[0-9]{3}/test[0-9]{2}[\.]html
37 | http://example.com/123/test01.html
38 | http://example.com/456/test02.html
39 | http://example.com/789/test03.html
40 |
41 |
42 | ===============
43 | Acknowledgement
44 | ===============
45 |
46 | Similar URLs
47 | =============
48 |
49 | * URLs with the same **URL structure**.
50 |
51 | * Components of the parsed URLs at the same position are in the same **character space**.
52 |
53 | * Different types of characters appear in the same order in most cases.
54 |
55 |
56 | URL structure
57 | ==============
58 |
59 | Typically, a URL can be parsed into 6 components:
60 |
61 | ``<scheme>://<netloc>/<path>;<params>?<query>#<fragment>``
62 |
63 | Because different sites may have similar URL structures and <params> is rare,
64 | <scheme>, <netloc> and <params> are ignored; <path>, <query> and <fragment> are used to define the URL structure.
65 |
66 | If URLs have the same path levels, the same query keys (in the same order) and the
67 | same fragment existence, their URL structure is considered the same.
68 |
69 | ::
70 |
71 | http://example.com/p1/p2?k1=v1&k2=v2#pos
72 |
73 | URL structure:
74 | path levels: 2
75 | query keys: k1, k2
76 | have fragment: True
77 |
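 | The structure meta can be inspected with the package's own parse utilities. A
 | minimal sketch (``analyze_url`` and ``URLMeta`` as defined in ``parse_utils.py``;
 | the exact normalized form of the query keys is an implementation detail):
 |
 | .. code:: python
 |
 |     from os_urlpattern.parse_utils import analyze_url
 |
 |     url_meta, pieces = analyze_url('http://example.com/p1/p2?k1=v1&k2=v2#pos')
 |     print(url_meta.path_depth)    # 2
 |     print(url_meta.query_keys)    # normalized query keys, e.g. ('k1=', 'k2=')
 |     print(url_meta.has_fragment)  # True
 |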
78 | Character space
79 | ===============
80 |
81 | According to `RFC 3986 (Section 2: Characters) <https://tools.ietf.org/html/rfc3986#section-2>`_,
82 | a URL with the following characters would be legal:
83 |
84 | ``ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%<>\"{}^|``
85 |
86 | There are three major character spaces: lower-case letters (a-z), upper-case letters (A-Z)
87 | and digits (0-9). Other symbols are each in their own character space.
88 |
89 | ::
90 |
91 | HeLlOwoRd666!
92 |
93 | character space: a-z A-Z 0-9 !
94 |
95 | Order consideration
96 | =====================
97 |
98 | Split a string by character space; consecutive characters in the same character space are joined.
99 | In most cases, the order is a distinguishing feature.
100 |
101 | ::
102 |
103 | HELLOword666!
104 |
105 | split into: HELLO word 666 !
106 |
107 | character space order: A-Z a-z 0-9 !
108 |
109 |
110 | Mix
111 | =====================
112 | Consecutive fragments of the major character spaces can be mixed; then the order is less important.
113 |
114 | ::
115 |
116 | HellWorld666!
117 |
118 | split into: H ell W orld 666 !
119 |
120 | major join: HellWorld666 !
121 |
122 | character space order: A-Za-z0-9 !
123 |
124 | Because of URL quoting, '%' can be mixed with the major character spaces.
125 |
126 | ::
127 |
128 | %E4%BD%A0%E5%A5%BD!
129 |
130 | split into: % E 4 % BD % A 0 % E 5 % A 5 % BD !
131 |
132 | major join: %E4%BD%A0%E5%A5%BD !
133 |
134 | character space order: A-Z0-9% !
135 |
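 | The splitting, joining and mixing described above can be observed directly with
 | the parser from ``parse_utils.py`` (a minimal sketch; outputs shown as comments):
 |
 | .. code:: python
 |
 |     from os_urlpattern.parse_utils import PieceParser
 |
 |     parser = PieceParser()
 |     parsed = parser.parse('HELLOword666')
 |     print(parsed.pieces)      # ('HELLO', 'word', '666')
 |     print(parsed.rules)       # ('A-Z', 'a-z', '0-9')
 |     print(parsed.fuzzy_rule)  # '0-9A-Za-z', the sorted deduplicated rule set
 |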
136 |
137 | URL pattern
138 | ============
139 |
140 | A URL pattern is used to express each cluster. It is a normal regex string. Each URL in
141 | the same cluster matches the pattern.
142 |
143 | ::
144 |
145 | pattern examples:
146 |
147 | /news/[0-9]{8}/[a-z]+[\\.]html
148 | /newsShow[\\.]asp[\\?]dataID=[0-9]+
149 | /thread[\\-][0-9]+[\\-][0-9][\\-]1[\\.]html
150 |
151 | The built-in matching strategy is strict; it doesn't tolerate incomplete matching.
152 |
153 | ::
154 |
155 | string: helloword
156 |
157 | pattern01: [a-z0-9]+ # no match: the string contains no digit
158 | pattern02: [a-z]+ # match
159 |
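 | The strictness can be checked through the matcher API described below (a sketch;
 | the pattern strings and the expected outcome follow the example above):
 |
 | .. code:: python
 |
 |     from os_urlpattern.pattern_matcher import PatternMatcher
 |
 |     matcher = PatternMatcher()
 |     matcher.load('/[a-z0-9]+', meta='pattern01')
 |     matcher.load('/[a-z]+', meta='pattern02')
 |
 |     # expect only pattern02: 'helloword' contains no digit
 |     print([m.meta for m in matcher.match('http://example.com/helloword')])
 |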
160 |
161 | ========
162 | Install
163 | ========
164 |
165 | Install with pip
166 |
167 | ``$ pip install os-urlpattern``
168 |
169 | Install extra packages
170 |
171 | .. list-table::
172 | :header-rows: 1
173 |
174 | * - subpackage
175 | - install command
176 | - enables
177 | * - memory
178 | - ``pip install os-urlpattern[memory]``
179 | - Show memory usage
180 | * - ete-tree
181 | - ``pip install os-urlpattern[ete-tree]``
182 | - Enable `ete <http://etetoolkit.org>`_ pattern tree formatter
183 |
184 | ========
185 | Usage
186 | ========
187 |
188 | Command line
189 | =============
190 |
191 | * **pattern-make**
192 |
193 | Load urls, cluster and dump patterns.
194 |
195 | .. code:: console
196 |
197 | $ pattern-make -h
198 | usage: pattern-make [-h] [-v] [-i INPUTS [INPUTS ...]]
199 | [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] [-c CONFIG]
200 | [-f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}]
201 |
202 | optional arguments:
203 | -h, --help show this help message and exit
204 | -v, --version show program's version number and exit
205 | -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]
206 | input files to be processed (default: stdin)
207 | -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}
208 | log level (default: NOTSET)
209 | -c CONFIG, --config CONFIG
210 | config file
211 | -f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}, --formatter {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}
212 | output formatter (default: CLUSTER)
213 |
214 | Dump clustered URLs with patterns:
215 |
216 | .. code:: console
217 |
218 | $ cat urls.txt | pattern-make -l debug > clustered.txt
219 |
220 | Only generate URL patterns:
221 |
222 | .. code:: console
223 |
224 | $ cat urls.txt | pattern-make -l debug -f pattern > patterns.txt
225 |
226 | Generate pattern tree from URLs (`ete <http://etetoolkit.org>`_ installed):
227 |
228 | .. code:: console
229 |
230 | $ cat urls.txt | pattern-make -l debug -f ete
231 |
232 | * **pattern-match**
233 |
234 | Load patterns, dump URLs matched results.
235 |
236 | .. code:: console
237 |
238 | $ pattern-match -h
239 | usage: pattern-match [-h] [-v] [-i INPUTS [INPUTS ...]]
240 | [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] -p
241 | PATTERN_FILES [PATTERN_FILES ...] [-a]
242 |
243 | optional arguments:
244 | -h, --help show this help message and exit
245 | -v, --version show program's version number and exit
246 | -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]
247 | input files to be processed (default: stdin)
248 | -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}
249 | log level (default: NOTSET)
250 | -p PATTERN_FILES [PATTERN_FILES ...], --pattern-files PATTERN_FILES [PATTERN_FILES ...]
251 | pattern files to be loaded
252 | -a, --all-matched all matched patterns
253 |
254 |
255 | Match URLs:
256 |
257 | .. code:: console
258 |
259 | $ cat urls.txt | pattern-match -l debug -p patterns.txt
260 |
261 | APIs
262 | =====
263 |
264 | * Cluster and generate URL patterns:
265 |
266 | .. code:: python
267 |
268 | from os_urlpattern.formatter import pformat
269 | from os_urlpattern.pattern_maker import PatternMaker
270 |
271 | pattern_maker = PatternMaker()
272 |
273 | # load URLs(unicode)
274 | for url in urls:
275 | pattern_maker.load(url)
276 |
277 | # cluster and print pattern
278 | for url_meta, clustered in pattern_maker.make():
279 | for pattern in pformat('pattern', url_meta, clustered):
280 | # do whatever you want
281 | pass
282 |
283 |
284 | * Match URLs:
285 |
286 | .. code:: python
287 |
288 | from os_urlpattern.pattern_matcher import PatternMatcher
289 |
290 | pattern_matcher = PatternMatcher()
291 |
292 | # load url_pattern(unicode)
293 | for url_pattern in url_patterns:
294 | # meta will bind to matched result
295 | pattern_matcher.load(url_pattern, meta=url_pattern)
296 |
297 | # match URL(unicode)
298 | for url in urls:
299 | matched_results = pattern_matcher.match(url)
300 | # the best matched result:
301 | # sorted(matched_results, reverse=True)[0]
302 | patterns = [n.meta for n in matched_results]
303 |
304 |
305 | * Low-level APIs:
306 |
307 | It is necessary to use the low-level APIs to customize the processing procedure,
308 | especially for parallel computing or working on a distributed cluster (Hadoop).
309 |
310 | **Key point: the same fuzzy digest goes to the same maker and the same matcher.**
311 |
312 | Use ``os_urlpattern.parser.fuzzy_digest`` to get the fuzzy digest from a URL,
313 | a URL pattern, or a URLMeta object with parsed pieces/patterns.
314 |
315 | A brief All-In-One example:
316 |
317 | .. code:: python
318 |
319 | from __future__ import print_function, unicode_literals
320 | from os_urlpattern.formatter import pformat
321 | from os_urlpattern.parser import fuzzy_digest, parse
322 | from os_urlpattern.pattern_maker import Maker
323 | from os_urlpattern.pattern_matcher import Matcher
324 |
325 | urls = ['http://t.com/%02d.html' % i for i in range(0, 10)]
326 | makers = {}
327 | matchers = {}
328 |
329 | # Init makers from URLs(unicode).
330 | for url in urls:
331 | url_meta, parsed_pieces = parse(url)
332 |
333 | # same digest same maker
334 | digest = fuzzy_digest(url_meta, parsed_pieces)
335 | if digest not in makers:
336 | makers[digest] = Maker(url_meta)
337 | makers[digest].load(parsed_pieces)
338 |
339 | # Iterate makers, do clustering, generate URL pattern and init matchers.
340 | for maker in makers.values():
341 | for clustered in maker.make():
342 | for pattern in pformat('pattern', maker.url_meta, clustered):
343 | # init matchers
344 | url_meta, parsed_patterns = parse(pattern)
345 | digest = fuzzy_digest(url_meta, parsed_patterns)
346 | if digest not in matchers:
347 | matchers[digest] = Matcher(url_meta)
348 | matchers[digest].load(parsed_patterns, pattern)
349 |
350 | # Match URLs(unicode).
351 | for url in urls:
352 | url_meta, parsed_pieces = parse(url)
353 |
354 | # same digest same matcher
355 | digest = fuzzy_digest(url_meta, parsed_pieces)
356 | if digest in matchers:
357 | matched = [n.meta for n in matchers[digest].match(parsed_pieces)]
358 | print(url, *matched, sep="\t")
359 | else:  # no match at all
360 | pass
361 |
362 |
363 |
364 | ============
365 | Unit Tests
366 | ============
367 |
368 | ``$ tox``
369 |
370 | ============
371 | License
372 | ============
373 |
374 | MIT licensed.
375 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -s --fulltrace -v
3 | env =
4 | COVERAGE = true
5 |
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 |
4 | [metadata]
5 | description-file = README.rst
6 | license_file = LICENSE
7 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 |
4 | def read(*filenames, **kwargs):
5 | import io
6 | from os.path import join, dirname
7 | encoding = kwargs.get('encoding', 'utf-8')
8 | sep = kwargs.get('sep', '\n')
9 | buf = []
10 | for filename in filenames:
11 | with io.open(join(dirname(__file__), filename), encoding=encoding) as f:
12 | buf.append(f.read())
13 | return sep.join(buf)
14 |
15 |
16 | setup(
17 | name='os-urlpattern',
18 | version=read('src/os_urlpattern/VERSION'),
19 | packages=find_packages(where='src'),
20 | package_dir={'': 'src'},
21 | include_package_data=True,
22 | license='MIT License',
23 | description='Cluster url pattern automatically.',
24 | long_description=open('README.rst').read(),
25 | author='Ozzy',
26 | author_email='cfhamlet@gmail.com',
27 | url='https://github.com/cfhamlet/os-urlpattern',
28 | zip_safe=False,
29 | entry_points={
30 | 'console_scripts': [
31 | 'pattern-make = os_urlpattern.cmdline:make',
32 | 'pattern-match = os_urlpattern.cmdline:match',
33 | ]
34 | },
35 | extras_require={
36 | 'memory': ['psutil'],
37 | 'ete-tree': ['six', 'ete3']
38 | },
39 | classifiers=[
40 | 'Development Status :: 2 - Pre-Alpha',
41 | 'Intended Audience :: Developers',
42 | 'License :: OSI Approved :: MIT License',
43 | 'Natural Language :: English',
44 | 'Programming Language :: Python :: 2',
45 | 'Programming Language :: Python :: 2.7',
46 | 'Programming Language :: Python :: 3',
47 | 'Programming Language :: Python :: 3.6',
48 | 'Programming Language :: Python :: Implementation :: CPython',
49 | 'Programming Language :: Python :: Implementation :: PyPy',
50 | ])
51 |
--------------------------------------------------------------------------------
/src/os_urlpattern/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.11
2 |
--------------------------------------------------------------------------------
/src/os_urlpattern/__init__.py:
--------------------------------------------------------------------------------
1 | """os-urlpattern.
2 |
3 | Unsupervised URLs clustering, generate and match URL pattern.
4 | """
5 | import sys
6 | __all__ = ['__version__', 'version_info']
7 |
8 | import pkgutil
9 | __version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
10 | version_info = tuple(int(v) if v.isdigit() else v
11 | for v in __version__.split('.'))
12 |
13 | if sys.version_info < (2, 7):
14 | sys.exit("os-urlpattern %s requires Python 2.7+" % __version__)
15 |
16 | del pkgutil
17 | del sys
18 |
--------------------------------------------------------------------------------
/src/os_urlpattern/cmdline.py:
--------------------------------------------------------------------------------
1 | """Command line tools.
2 |
3 | pattern-make:
4 | Load URLs, cluster then generate URL pattern.
5 |
6 | pattern-match:
7 | Load patterns, match URLs and get the matched results.
8 |
9 | """
10 | from __future__ import print_function, unicode_literals
11 |
12 | import argparse
13 | import logging.config
14 | import sys
15 | import time
16 | from collections import Counter
17 | from itertools import chain
18 |
19 | from . import __version__
20 | from .compat import binary_stdin, binary_stdout
21 | from .config import get_default_config
22 | from .definition import DEFAULT_ENCODING
23 | from .exceptions import (InvalidCharException, InvalidPatternException,
24 | IrregularURLException)
25 | from .formatter import FORMATTERS, pformat
26 | from .pattern_maker import PatternMaker
27 | from .pattern_matcher import PatternMatcher
28 | from .utils import LogSpeedAdapter, MemoryUsageFormatter, pretty_counter
29 |
30 | _DEFAULT_LOGGING = {
31 | 'version': 1,
32 | 'disable_existing_loggers': True,
33 | 'incremental': True,
34 | }
35 |
36 |
37 | def _config_logging(log_level):
38 | logging.config.dictConfig(_DEFAULT_LOGGING)
39 | if log_level == 'NOTSET':
40 | handler = logging.NullHandler()
41 | else:
42 | handler = logging.StreamHandler()
43 | formatter = MemoryUsageFormatter(
44 | fmt='[%(asctime)s] [%(name)s] [%(levelname)s] [%(memory)s] %(message)s',
45 | datefmt='%Y-%m-%d %H:%M:%S',
46 | )
47 | logging.root.setLevel(logging.NOTSET)
48 | handler.setFormatter(formatter)
49 | handler.setLevel(log_level)
50 | logging.root.addHandler(handler)
51 |
52 |
53 | class Command(object):
54 | def __init__(self, config=None):
55 | self._config = config
56 | self._logger = logging.getLogger(self.__class__.__name__)
57 |
58 | def add_argument(self, parser):
59 |
60 | parser.add_argument('-v', '--version',
61 | action='version',
62 | version='%(prog)s {version}'.format(
63 | version=__version__)
64 | )
65 |
66 | parser.add_argument('-i', '--inputs',
67 | help='input files to be processed (default: stdin)',
68 | nargs='+',
69 | type=argparse.FileType('rb'),
70 | default=[binary_stdin],
71 | dest='inputs')
72 |
73 | parser.add_argument('-l', '--loglevel',
74 | help='log level (default: NOTSET)',
75 | default='NOTSET',
76 | action='store',
77 | dest='log_level',
78 | choices=['NOTSET', 'DEBUG', 'INFO',
79 | 'WARN', 'ERROR', 'FATAL'],
80 | type=lambda s: s.upper())
81 |
82 | def process_args(self, args):
83 | _config_logging(args.log_level)
84 |
85 | def run(self, args):
86 | raise NotImplementedError
87 |
88 |
89 | class MakePatternCommand(Command):
90 |
91 | def process_args(self, args):
92 | super(MakePatternCommand, self).process_args(args)
93 | if args.config:
94 | self._config.readfp(args.config[0])
95 |
96 | def add_argument(self, parser):
97 | super(MakePatternCommand, self).add_argument(parser)
98 | parser.add_argument('-c', '--config',
99 | help='config file',
100 | nargs=1,
101 | type=argparse.FileType('r'),
102 | dest='config')
103 |
104 | parser.add_argument('-f', '--formatter',
105 | help='output formatter (default: CLUSTER)',
106 | default='CLUSTER',
107 | action='store',
108 | dest='format_type',
109 | choices=FORMATTERS.keys(),
110 | type=lambda s: s.upper())
111 |
112 | def _load(self, pattern_maker, args):
113 | load_url = args.format_type in ('CLUSTER', 'INLINE')
114 | stats = Counter()
115 | with LogSpeedAdapter(self._logger, 5000) as speed_logger:
116 | load = pattern_maker.load
117 | for line in chain.from_iterable(args.inputs):
118 | speed_logger.debug('[LOADING]')
119 | stats['ALL'] += 1
120 | line = line.strip()
121 | if not line:
122 | stats['EMPTY'] += 1
123 | continue
124 | try:
125 | url = line.decode(DEFAULT_ENCODING)
126 | _, is_new = load(url, meta=url if load_url else None)
127 | if is_new:
128 | stats['UNIQ'] += 1
129 | stats['VALID'] += 1
130 | except (InvalidPatternException,
131 | IrregularURLException,
132 | InvalidCharException,
133 | UnicodeDecodeError,
134 | ValueError) as e:
135 | self._logger.warn('%s, %r', str(e), line)
136 | stats['INVALID'] += 1
137 | continue
138 | except Exception as e:
139 | self._logger.error('%s, %r', str(e), line)
140 | stats['INVALID'] += 1
141 | continue
142 | self._logger.debug('[LOADED] %s', pretty_counter(stats))
143 |
144 | def _process(self, pattern_maker, args):
145 | combine = args.format_type == 'ETE'
146 | s = time.time()
147 | for maker in pattern_maker.makers:
148 | for root in maker.make(combine):
149 | e = time.time()
150 | self._logger.debug('[CLUSTER] %d %.2fs', root.count, e - s)
151 | for record in pformat(args.format_type, maker.url_meta, root):
152 | print(record)
153 | s = time.time()
154 |
155 | def run(self, args):
156 | pattern_maker = PatternMaker(self._config)
157 | self._load(pattern_maker, args)
158 | self._process(pattern_maker, args)
159 |
160 |
161 | class MatchPatternCommand(Command):
162 | def __init__(self):
163 | super(MatchPatternCommand, self).__init__()
164 |
165 | def add_argument(self, parser):
166 | super(MatchPatternCommand, self).add_argument(parser)
167 | parser.add_argument('-p', '--pattern-files',
168 | help='pattern files to be loaded',
169 | nargs='+',
170 | type=argparse.FileType('rb'),
171 | required=True,
172 | dest='pattern_files')
173 |
174 | parser.add_argument('-a', '--all-matched',
175 | help='all matched patterns',
176 | default=False,
177 | action='store_true',
178 | dest='all_matched')
179 |
180 | def _load(self, pattern_matcher, args):
181 | stats = Counter()
182 | p_inputs = args.pattern_files
183 | self._logger.debug('[LOAD] %d pattern file%s: %s',
184 | len(p_inputs),
185 | 's' if len(p_inputs) > 1 else '',
186 | ', '.join([p.name for p in p_inputs]))
187 | with LogSpeedAdapter(self._logger, 1000) as speed_logger:
188 | load = pattern_matcher.load
189 | for line in chain.from_iterable(p_inputs):
190 | speed_logger.debug('[LOADING]')
191 | stats['ALL'] += 1
192 | line = line.rstrip()
193 | if not line.startswith(b'/'):
194 | stats['UNKNOWN'] += 1
195 | continue
196 | try:
197 | pattern = line.decode(DEFAULT_ENCODING)
198 | load(pattern, meta=pattern)
199 | stats['VALID'] += 1
200 | except Exception as e:
201 | self._logger.warn("%s, %r", str(e), line)
202 | stats['INVALID'] += 1
203 | self._logger.debug('[LOAD] Finished %s', pretty_counter(stats))
204 |
205 | def _match_result(self, pattern_matcher, raw_url, args):
206 | result = None
207 | try:
208 | url = raw_url.decode(DEFAULT_ENCODING)
209 | result = pattern_matcher.match(url)
210 | if not args.all_matched:
211 | result = sorted(result, reverse=True)
212 | result = result[:1]
213 | result = '\t'.join([r.meta for r in result]
214 | ).encode(DEFAULT_ENCODING)
215 | except (InvalidPatternException,
216 | IrregularURLException,
217 | InvalidCharException,
218 | UnicodeDecodeError,
219 | ValueError) as e:
220 | result = b'E'
221 | self._logger.warn("%s, %r", str(e), raw_url)
222 | except Exception as e:
223 | result = b'E'
224 | self._logger.error("%s, %r", str(e), raw_url)
225 | return result
226 |
227 | def _match(self, pattern_matcher, args):
228 | speed_logger = LogSpeedAdapter(self._logger, 5000)
229 | write = binary_stdout.write
230 | for line in chain.from_iterable(args.inputs):
231 | speed_logger.debug('[MATCHING]')
232 | line = line.strip()
233 | result = self._match_result(pattern_matcher, line, args)
234 | if not result:
235 | result = b'N'
236 | write(result)
237 | write(b'\t')
238 | write(line)
239 | write(b'\n')
240 |
241 | def run(self, args):
242 | pattern_matcher = PatternMatcher()
243 | self._load(pattern_matcher, args)
244 | self._match(pattern_matcher, args)
245 |
246 |
247 | def _execute(command, argv=None):
248 | argv = argv or sys.argv
249 | parser = argparse.ArgumentParser()
250 | command.add_argument(parser)
251 | args = parser.parse_args(argv[1:])
252 | command.process_args(args)
253 | command.run(args)
254 |
255 |
256 | def make(argv=None):
257 | _execute(MakePatternCommand(get_default_config()), argv)
258 |
259 |
260 | def match(argv=None):
261 | _execute(MatchPatternCommand(), argv)
262 |
--------------------------------------------------------------------------------
/src/os_urlpattern/compat.py:
--------------------------------------------------------------------------------
1 | """Compatible import.
2 | """
3 |
4 | from __future__ import unicode_literals
5 | import operator
6 | import string
7 | import sys
8 |
9 | _PY3 = sys.version_info[0] >= 3
10 |
11 | if _PY3:
12 | from io import StringIO
13 | iteritems = operator.methodcaller("items")
14 | itervalues = operator.methodcaller("values")
15 | from urllib.parse import urlparse, ParseResult
16 | from configparser import ConfigParser
17 | binary_stdin = sys.stdin.buffer
18 | binary_stdout = sys.stdout.buffer
19 | else:
20 | try:
21 | from cStringIO import StringIO # safe, only process ascii
22 | except ImportError:
23 | from StringIO import StringIO
24 | iteritems = operator.methodcaller("iteritems")
25 | itervalues = operator.methodcaller("itervalues")
26 | from urlparse import urlparse, ParseResult
27 | from ConfigParser import ConfigParser
28 | binary_stdin = sys.stdin
29 | binary_stdout = sys.stdout
30 |
--------------------------------------------------------------------------------
/src/os_urlpattern/config/__init__.py:
--------------------------------------------------------------------------------
1 | """Configure.
2 | """
3 | from ..compat import ConfigParser
4 |
5 | def get_default_config():
6 | """Get default configure instance.
7 |
8 | Returns:
9 | ConfigParser -- default configuration instance
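 |
 | Example (value as defined in default_config.cfg):
 |
 | >>> from os_urlpattern.config import get_default_config
 | >>> cfg = get_default_config()
 | >>> cfg.get('make', 'min_cluster_num')
 | '3'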
10 | """
11 | import os
12 | path = os.path.dirname(__file__)
13 | cfg = ConfigParser()
14 | cfg.read(os.path.join(path, 'default_config.cfg'))
15 | return cfg
16 |
--------------------------------------------------------------------------------
/src/os_urlpattern/config/default_config.cfg:
--------------------------------------------------------------------------------
1 | [make]
2 | min_cluster_num = 3
--------------------------------------------------------------------------------
/src/os_urlpattern/definition.py:
--------------------------------------------------------------------------------
1 | """Definition of global constant varialbles.
2 | """
3 |
4 | from __future__ import unicode_literals
5 |
6 | import hashlib
7 | import string
8 |
9 | from .pattern import Pattern
10 |
11 | DEFAULT_ENCODING = 'UTF-8'
12 |
13 |
14 | class Symbols(object):
15 | PLUS = '+'
16 | EMPTY = ''
17 | SLASH = '/'
18 | EQUALS = '='
19 | NUMBER = '#'
20 | PERCENT = '%'
21 | QUESTION = '?'
22 | BRACES_L = '{'
23 | BRACES_R = '}'
24 | AMPERSAND = '&'
25 | BACKSLASH = '\\'
26 | BRACKETS_L = '['
27 | BRACKETS_R = ']'
28 |
29 |
30 | class BasePatternRule(object):
31 | DIGIT = '0-9'
32 | BASE_ASCII_LOWER = 'a-z'
33 | BASE_ASCII_UPPER = 'A-Z'
34 | BASE_ASCII = 'A-Za-z'
35 | BASE_DIGIT_AND_ASCII_LOWER = '0-9a-z'
36 | BASE_DIGIT_AND_ASCII_UPPER = '0-9A-Z'
37 | BASE_DIGIT_AND_ASCII = '0-9A-Za-z'
38 | SINGLE_DIGIT = '[0-9]'
39 | SINGLE_ASCII_LOWER = '[a-z]'
40 | SINGLE_ASCII_UPPER = '[A-Z]'
41 | MULTI_DIGIT = '[0-9]+'
42 | MULTI_ASCII_LOWER = '[a-z]+'
43 | MULTI_ASCII_UPPER = '[A-Z]+'
44 | MULTI_ASCII = '[A-Za-z]+'
45 | MULTI_DIGIT_AND_ASCII_LOWER = '[0-9a-z]+'
46 | MULTI_DIGIT_AND_ASCII_UPPER = '[0-9A-Z]+'
47 | MULTI_DIGIT_AND_ASCII = '[0-9A-Za-z]+'
48 | DOT = '\\.'
49 | EMPTY = ''
50 | SINGLE_QUESTION = '[\\?]'
51 |
52 |
53 | ZERO_DIGEST = hashlib.md5(b'0').hexdigest().upper()
54 | QUERY_PART_RESERVED_CHARS = frozenset([Symbols.EQUALS])
55 | EMPTY_TUPLE = ()
56 | BLANK_TUPLE = (BasePatternRule.EMPTY,)
57 |
58 | # 26 letters rules
59 | CHAR_AND_RULE_LIST = []
60 | ASCII_AND_RULE_LIST = []
61 | ASCII_AND_RULE_LIST.extend([(i, BasePatternRule.BASE_ASCII_LOWER)
62 | for i in string.ascii_lowercase])
63 | ASCII_AND_RULE_LIST.extend([(i, BasePatternRule.BASE_ASCII_UPPER)
64 | for i in string.ascii_uppercase])
65 | CHAR_AND_RULE_LIST.extend(ASCII_AND_RULE_LIST)
66 |
67 | # digit rules
68 | DIGIT_AND_RULE_LIST = [(i, BasePatternRule.DIGIT)
69 | for i in string.digits]
70 | CHAR_AND_RULE_LIST.extend(DIGIT_AND_RULE_LIST)
71 |
72 | # digit and 26 letters set
73 | DIGIT_SET = frozenset([i for i in string.digits])
74 | ASCII_LOWER_SET = frozenset([i for i in string.ascii_lowercase])
75 | ASCII_UPPER_SET = frozenset([i for i in string.ascii_uppercase])
76 | ASCII_DIGIT_SET = frozenset([c for c, _ in CHAR_AND_RULE_LIST])
77 |
78 | # symbol rules that do not need to be escaped
79 | SYMBOL = '%&_@#;:,=<>~/'
80 | SYMBOL_SET = frozenset([i for i in SYMBOL])
81 | SYMBOL_AND_RULE_LIST = [(i, i) for i in SYMBOL_SET]
82 | CHAR_AND_RULE_LIST.extend(SYMBOL_AND_RULE_LIST)
83 |
84 | # symbol rules that must be escaped
85 | ESCAPE = '.+\\"\'()[]{}*$^?|!-'
86 | ESCAPE_SET = frozenset([i for i in ESCAPE])
87 | ESCAPE_AND_RULE_LIST = [(i, '\\%s' % i) for i in ESCAPE_SET]
88 | CHAR_AND_RULE_LIST.extend(ESCAPE_AND_RULE_LIST)
89 |
90 | # all char and rule mapping
91 | CHAR_RULE_DICT = dict(CHAR_AND_RULE_LIST)
92 | RULE_SET = frozenset([r for _, r in CHAR_AND_RULE_LIST])
93 |
94 | # ==
95 | RULE_SIGN_DICT = dict(
96 | [(v, k) for k, v in SYMBOL_AND_RULE_LIST + ESCAPE_AND_RULE_LIST])
97 | SIGN_RULE_SET = frozenset(RULE_SIGN_DICT.keys())
98 |
99 | # ==
100 | DIGIT_AND_ASCII_LOWER_RULE_LIST = [BasePatternRule.DIGIT,
101 | BasePatternRule.BASE_ASCII_LOWER]
102 | DIGIT_AND_ASCII_UPPER_RULE_LIST = [BasePatternRule.DIGIT,
103 | BasePatternRule.BASE_ASCII_UPPER]
104 | DIGIT_AND_ASCII_RULE_LIST = [BasePatternRule.DIGIT,
105 | BasePatternRule.BASE_ASCII_LOWER,
106 | BasePatternRule.BASE_ASCII_UPPER,
107 | BasePatternRule.BASE_ASCII]
108 |
109 | DIGIT_AND_ASCII_UPPER_RULE_SET = frozenset(DIGIT_AND_ASCII_UPPER_RULE_LIST)
110 | DIGIT_AND_ASCII_LOWER_RULE_SET = frozenset(DIGIT_AND_ASCII_LOWER_RULE_LIST)
111 | DIGIT_AND_ASCII_RULE_SET = frozenset(DIGIT_AND_ASCII_RULE_LIST)
112 |
113 | # ==
114 | BASE_ASCII_RULE_SET = frozenset([BasePatternRule.BASE_ASCII,
115 | BasePatternRule.BASE_ASCII_LOWER,
116 | BasePatternRule.BASE_ASCII_UPPER])
117 |
118 | MULTI_ASCII_RULE_SET = frozenset([BasePatternRule.MULTI_ASCII,
119 | BasePatternRule.MULTI_ASCII_LOWER,
120 | BasePatternRule.MULTI_ASCII_UPPER])
121 |
122 | MIXED_RULE_SET = DIGIT_AND_ASCII_RULE_SET.union([Symbols.PERCENT])
123 |
124 |
125 | class BasePattern(object):
126 | SINGLE_DIGIT = Pattern(BasePatternRule.SINGLE_DIGIT)
127 | SINGLE_ASCII_LOWER = Pattern(BasePatternRule.SINGLE_ASCII_LOWER)
128 | SINGLE_ASCII_UPPER = Pattern(BasePatternRule.SINGLE_ASCII_UPPER)
129 | MULTI_DIGIT = Pattern(BasePatternRule.MULTI_DIGIT)
130 | MULTI_ASCII_LOWER = Pattern(BasePatternRule.MULTI_ASCII_LOWER)
131 | MULTI_ASCII_UPPER = Pattern(BasePatternRule.MULTI_ASCII_UPPER)
132 | MULTI_DIGIT_AND_ASCII_LOWER = Pattern(
133 | BasePatternRule.MULTI_DIGIT_AND_ASCII_LOWER)
134 | MULTI_DIGIT_AND_ASCII_UPPER = Pattern(
135 | BasePatternRule.MULTI_DIGIT_AND_ASCII_UPPER)
136 | MULTI_DIGIT_AND_ASCII = Pattern(BasePatternRule.MULTI_DIGIT_AND_ASCII)
137 | DOT = Pattern(BasePatternRule.DOT)
138 | EMPTY = Pattern(BasePatternRule.EMPTY)
139 |
--------------------------------------------------------------------------------
/src/os_urlpattern/exceptions.py:
--------------------------------------------------------------------------------
1 | """Custom Exceptions.
2 | """
3 |
4 |
5 | class IrregularURLException(Exception):
6 | pass
7 |
8 |
9 | class InvalidPatternException(Exception):
10 | pass
11 |
12 |
13 | class InvalidCharException(Exception):
14 | pass
15 |
--------------------------------------------------------------------------------
/src/os_urlpattern/formatter.py:
--------------------------------------------------------------------------------
1 | """Clustered record formatter.
2 | """
3 | from __future__ import unicode_literals
4 |
5 | import json
6 | import sys
7 |
8 | from .definition import BasePatternRule, Symbols
9 | from .parse_utils import pack
10 | from .utils import dump_tree, get_classes
11 |
12 |
13 | class Formatter(object):
14 | """Base class for format clustered data.
15 |
16 | The subclass must define format method which yield formatted strings.
17 | """
18 |
19 | def format(self, url_meta, root, **kwargs):
20 | """Format the clustered tree.
21 |
22 | Args:
23 | url_meta (URLMeta): The url_meta.
24 | root (TreeNode): Root node of the clustered tree.
25 | **kwargs: Arbitrary keyword arguments.
26 |
27 | Yields:
28 | str: the formatted string.
29 |
30 | """
31 | return  # empty generator: yields nothing
32 | yield
33 |
34 |
35 | class PatternFormatter(Formatter):
36 | """Pattern only formatter."""
37 |
38 | def format(self, url_meta, root, **kwargs):
39 | """Yield URL pattern string.
40 |
41 | Args:
42 | url_meta (URLMeta): The URLMeta object.
43 | root (TreeNode): Root of a clustered piece tree.
44 | **kwargs: Arbitrary keyword arguments.
45 |
46 | Yields:
47 | str: URL pattern string.
48 |
49 | """
50 | for nodes in dump_tree(root):
51 | yield pack(url_meta, [p.pattern for p in nodes[1:]])
52 | break
53 |
54 |
55 | class ClusterFormatter(PatternFormatter):
56 | """URL pattern and meta data formatter.
57 |
58 | Yield URL pattern string first, then all meta data strings.
59 | """
60 |
61 | def format(self, url_meta, root, **kwargs):
62 | """Yield URL pattern and all bound meta data strings.
63 |
64 | Args:
65 | url_meta (URLMeta): The URLMeta object.
66 | root (TreeNode): Root of a clustered piece tree.
67 | **kwargs: Arbitrary keyword arguments.
68 |
69 | Yields:
70 | object: URL pattern string first, then all meta
71 | data strings prefixed with '\t'.
72 |
73 | """
74 | for r in super(ClusterFormatter, self).format(url_meta, root, **kwargs):
75 | yield r
76 |
77 | for nodes in dump_tree(root):
78 | if nodes[-1].meta is None:
79 | continue
80 | for obj in nodes[-1].meta:
81 | yield '\t'.join(('', str(obj)))
82 |
83 |
84 | class InlineFormatter(PatternFormatter):
85 | """URL pattern and meta data formatter.
86 |
87 | URL pattern and meta data string in one line.
88 | """
89 |
90 | def format(self, url_meta, root, **kwargs):
91 | """Yield URL pattern with each bound meta data string in on line.
92 |
93 | Args:
94 | url_meta (URLMeta): The URLMeta object.
95 | root (TreeNode): Root of a clustered piece tree.
96 | **kwargs: Arbitrary keyword arguments.
97 |
98 | Yields:
99 | object: URL pattern string + '\t' + str(meta)
100 |
101 | """
102 | url_pattern_string = None
103 | for r in super(InlineFormatter, self).format(url_meta, root, **kwargs):
104 | url_pattern_string = r
105 |
106 | for nodes in dump_tree(root):
107 | if nodes[-1].meta is None:
108 | continue
109 | for obj in nodes[-1].meta:
110 | yield '\t'.join((url_pattern_string, str(obj)))
111 |
112 |
113 | class JsonFormatter(Formatter):
114 | """Json formatter.
115 |
116 | Yield JSON string: {"ptn": url_pattern, "cnt": count}
117 | ptn: URL pattern string.
118 | cnt: Number of unique paths in the cluster.
119 | """
120 |
121 | def format(self, url_meta, root, **kwargs):
122 | """Yield json format string.
123 |
124 | Args:
125 | url_meta (URLMeta): The URLMeta object.
126 | root (TreeNode): Root of a clustered piece tree.
127 | **kwargs: Arbitrary keyword arguments.
128 |
129 | Yields:
130 | str: JSON string, key-values:
131 | ptn: URL pattern string.
132 | cnt: Number of unique paths in the cluster.
133 | """
134 | for nodes in dump_tree(root):
135 | p = pack(url_meta, [p.pattern for p in nodes[1:]])
136 | yield json.dumps({'ptn': p, 'cnt': root.count})
137 | break
138 |
139 |
140 | class ETEFormatter(Formatter):
141 | """Ete tree formatter."""
142 |
143 | def __init__(self):
144 | import ete3  # fail fast at init time if ete3 is not installed
145 |
146 | def format(self, url_meta, root, **kwargs):
147 | """Yield ete tree string.
148 |
149 | Args:
150 | url_meta (URLMeta): The URLMeta object.
151 | root (TreeNode): Root of a pattern tree.
152 | **kwargs: Arbitrary keyword arguments.
153 |
154 | Yields:
155 | str: An ete tree string.
156 | """
157 | def f(pattern_node):
158 | sep = Symbols.EMPTY
159 | query_key = Symbols.EMPTY
160 | path_depth = url_meta.path_depth
161 | query_depth = len(url_meta.query_keys)
162 | current_level = pattern_node.level
163 | if path_depth < current_level \
164 | and current_level <= (path_depth + query_depth):
165 | sep = Symbols.AMPERSAND
166 | if current_level == path_depth + 1:
167 | sep = BasePatternRule.SINGLE_QUESTION
168 | query_key = url_meta.query_keys[current_level - path_depth - 1]
169 | elif current_level == path_depth + query_depth + 1:
170 | sep = Symbols.NUMBER
171 | return ' {sep}{query_key}{pattern_string}({count}) '.format(
172 | count=pattern_node.count,
173 | pattern_string=pattern_node.value,
174 | query_key=query_key,
175 | sep=sep)
176 |
177 | if root.count <= 0:
178 | return
179 |
180 | ete_tree = get_ete_tree(root, format=f)
181 | yield ete_tree.get_ascii(show_internal=True)
182 |
183 |
184 | def get_ete_tree(root_node, format=str):
185 | """Transfor a tree-like object into ete tree.
186 |
187 | Args:
188 | root_node (TreeNode): The root of the tree.
189 | format (callable, optional): Defaults to str.
190 | A callable object to format the ete tree node.
191 |
192 | Returns:
193 | ete3.Tree: The ete tree.
194 | """
195 | from ete3 import Tree
196 |
197 | def add_children(node, ete_node):
198 | for child in node.children:
199 | ete_child = ete_node.add_child(name=format(child))
200 | add_children(child, ete_child)
201 |
202 | ete_root_node = Tree(name=format(root_node))
203 | add_children(root_node, ete_root_node)
204 | return ete_root_node
205 |
206 |
207 | def pformat(name, url_meta, root, **kwargs):
208 | """Shortcut for formatting.
209 |
210 | Args:
211 | name (str): Format type.
212 | url_meta (URLMeta): The URLMeta object.
213 | root (TreeNode): Root of a clustered tree.
214 | **kwargs: Arbitrary keyword arguments.
215 |
216 | Returns:
217 | Iterator: Iterates over the formatted strings.
218 | """
219 | return FORMATTERS[name.upper()].format(url_meta, root, **kwargs)
220 |
221 |
222 | # Auto discover Formatter classes and init FORMATTERS.
223 | FORMATTERS = {}
224 | for c_cls in get_classes(sys.modules[__name__], Formatter):
225 | c_name = c_cls.__name__
226 | t = c_name.rfind('Formatter')
227 | if t < 0:
228 | raise ImportError('Invalid formatter name: %s' % c_name)
229 | name = c_name[0:t].upper() if c_name[0:t] else 'NULL'
230 | try:
231 | FORMATTERS[name] = c_cls()
232 | except Exception:  # skip formatters whose dependencies are unavailable
233 | pass
234 |
--------------------------------------------------------------------------------
/src/os_urlpattern/parse_utils.py:
--------------------------------------------------------------------------------
1 | """Utilitis for parsing URL and pattern.
2 | """
3 |
4 | from __future__ import unicode_literals
5 |
6 | import hashlib
7 | from collections import namedtuple
8 |
9 | from .compat import ParseResult, StringIO, urlparse
10 | from .definition import (ASCII_DIGIT_SET, BLANK_TUPLE, CHAR_RULE_DICT,
11 | DEFAULT_ENCODING, DIGIT_AND_ASCII_RULE_SET,
12 | EMPTY_TUPLE, MIXED_RULE_SET,
13 | QUERY_PART_RESERVED_CHARS, RULE_SET, SIGN_RULE_SET,
14 | BasePatternRule, Symbols)
15 | from .exceptions import (InvalidCharException, InvalidPatternException,
16 | IrregularURLException)
17 |
18 | URLPatternParseResult = namedtuple(
19 | 'URLPatternParseResult', 'path query fragment')
20 |
21 |
22 | class URLMeta(namedtuple('URLMeta', 'path_depth query_keys has_fragment')):
23 | """The URL structure meta.
24 |
25 | Attributes:
26 | path_depth (int): The number of URL path levels.
27 | query_keys (:obj:`tuple` of :obj:`str`): Query keys.
28 | has_fragment (bool): Whether the URL has a fragment component.
29 |
30 | """
31 | __slots__ = ()
32 |
33 | @property
34 | def depth(self):
35 | return self.path_depth + len(self.query_keys) + (1 if self.has_fragment else 0)
36 |
37 |
38 | def specify_rule(rule, num):
39 | """Specify the format of the rule.
40 |
41 | num == 1 will return [rule], single
42 | num > 1 will return [rule]{num}, with number
43 | num < 0 will return [rule]+, wildcard
44 | num == 0 will raise ValueError
45 |
46 | Args:
47 | rule (str): The raw rule string to be specified.
48 | num (int): The num of the rule. Can't be 0.
49 |
50 | Raises:
51 | ValueError: If the num == 0.
52 |
53 | Returns:
54 | str: The specified format of the rule.
55 |
56 | Examples:
57 |
58 | >>> from os_urlpattern.parse_utils import specify_rule
59 | >>> specify_rule('a-z', 1)
60 | [a-z]
61 | >>> specify_rule('a-z', 2)
62 | [a-z]{2}
63 | >>> specify_rule('a-z', -1)
64 | [a-z]+
65 |
66 | """
67 |
68 | if num == 1:
69 | return '[%s]' % rule
70 | elif num < 0:
71 | return '[%s]+' % rule
72 | elif num > 1:
73 | return '[%s]{%d}' % (rule, num)
74 | else:
75 | raise ValueError('Invalid num %s' % str(num))
76 |
77 |
78 | def wildcard_rule(rule):
79 | """The wildcard format of the rule.
80 |
81 | Shortcut of specify_rule(rule, -1).
82 |
83 | Args:
84 | rule (str): The raw rule string to be specified.
85 |
86 | Returns:
87 | str: The wildcard format of the rule.
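 |
 | Examples:
 |
 | >>> from os_urlpattern.parse_utils import wildcard_rule
 | >>> wildcard_rule('a-z')
 | '[a-z]+'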
88 | """
89 | return specify_rule(rule, -1)
90 |
91 |
92 | def normalize(raw_string, reserved_chars=None):
93 | """Normalize a string.
94 |
95 | Transform runs of the same sign in the string into the format
96 | [sign_rule]{num}, if the sign is not in the reserved_chars.
97 |
98 | Args:
99 | raw_string (str): The string to be normalized.
100 | reserved_chars (iterable, optional): Defaults to None. Reserved chars
101 | which will not be normalized.
102 |
103 | Returns:
104 | str: The normalized string.
105 |
106 | Examples:
107 |
108 | >>> from os_urlpattern.parse_utils import normalize
109 | >>> normalize('abc==123---')
110 | u'abc[=]{2}123[\\-]{3}'
111 |
112 | """
113 | normalized = StringIO()
114 | frag = StringIO()
115 | last_c = None
116 | for c in raw_string:
117 | if c in ASCII_DIGIT_SET:
118 | if last_c and last_c not in ASCII_DIGIT_SET:
119 | frag.seek(0)
120 | w = frag.read()
121 | l = len(w)
122 | if l > 0:
123 | if not reserved_chars or w[0] not in reserved_chars:
124 | r = CHAR_RULE_DICT.get(w[0])
125 | w = specify_rule(r, l)
126 | normalized.write(w)
127 | frag = StringIO()
128 | else:
129 | if last_c != c:
130 | frag.seek(0)
131 | w = frag.read()
132 | l = len(w)
133 | if l > 0 and w[0] not in ASCII_DIGIT_SET and \
134 | (not reserved_chars or w[0] not in reserved_chars):
135 | r = CHAR_RULE_DICT.get(w[0])
136 | w = specify_rule(r, l)
137 | normalized.write(w)
138 | frag = StringIO()
139 | frag.write(c)
140 | last_c = c
141 |
142 | frag.seek(0)
143 | w = frag.read()
144 | l = len(w)
145 | if last_c and last_c not in ASCII_DIGIT_SET and \
146 | (not reserved_chars or w[0] not in reserved_chars):
147 | r = CHAR_RULE_DICT.get(w[0])
148 | w = specify_rule(r, l)
149 | normalized.write(w)
150 | normalized.seek(0)
151 | return normalized.read()
152 |
153 |
154 | def parse_url(url):
155 | """Parse a URL into 6 components.
156 |
157 | <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
158 |
159 | Like the built-in urlparse method, but handles some unusual situations.
160 |
161 | Args:
162 | url (str): The URL to be parsed.
163 |
164 | Returns:
165 | ParseResult: A 6-tuple, (scheme, netloc, path, params, query, fragment).
166 | """
167 | scheme, netloc, path, params, query, fragment = urlparse(url)
168 | if not fragment:
169 | if url[-1] != Symbols.NUMBER:
170 | fragment = None
171 | if not query and url[-1] != Symbols.QUESTION:
172 | query = None
173 | elif not query and url[-2] != Symbols.QUESTION:
174 | query = None
175 | elif not query:
176 | if url[len(url) - len(fragment) - 2] != Symbols.QUESTION:
177 | query = None
178 | return ParseResult(scheme, netloc, path, params, query, fragment)
179 |
180 |
181 | def filter_useless(objs):
182 | """Filter the useless objects.
183 |
184 | Objects with bool(object) == False are useless, except the last one.
185 |
186 | Args:
187 | objs (sequence): The objects will be filtered.
188 |
189 | Returns:
190 | iterable: The filtered objects.
191 |
192 | Examples:
193 |
194 | >>> from os_urlpattern.parse_utils import filter_useless
195 | >>> filter_useless([0,1,0,0])
196 | [1, 0]
197 |
198 | """
199 | keep = {'c': 0, 'l': len(objs)}
200 |
201 | def _filterd(x):
202 | keep['c'] += 1
203 | if not x:
204 | if keep['c'] == keep['l']:
205 | return True
206 | return False
207 | else:
208 | return True
209 |
210 | return objs.__class__(filter(_filterd, objs))
211 |
212 |
213 | def parse_query_string(query_string):
214 | """Parse query string into keys and values.
215 |
216 | Args:
217 | query_string (str): The string to be parsed.
218 |
219 | Raises:
220 | IrregularURLException: Invalid query string.
221 |
222 | Returns:
223 | tuple: A 2-tuple, (keys and values).
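 |
 | Examples (Python 3 reprs shown; note each parsed key keeps its trailing '='):
 |
 | >>> from os_urlpattern.parse_utils import parse_query_string
 | >>> parse_query_string('k1=v1&k2=v2')
 | (('k1=', 'k2='), ('v1', 'v2'))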
224 | """
225 | if query_string is None:
226 | return EMPTY_TUPLE, EMPTY_TUPLE
227 | elif query_string == Symbols.EMPTY:
228 | return BLANK_TUPLE, BLANK_TUPLE
229 | elif query_string.endswith(Symbols.AMPERSAND):
230 | raise IrregularURLException("Invalid '&' pos")
231 | kv_type = True # query_key True, query_value False
232 | last_c = None
233 | kv_buf = {True: StringIO(), False: StringIO()}
234 | kv_list = {True: [], False: []}
235 | for i in query_string:
236 | if i == Symbols.EQUALS and kv_type:
237 | s = kv_buf[kv_type]
238 | s.write(i)
239 | s.seek(0)
240 | kv_list[kv_type].append(s.read())
241 | kv_buf[kv_type] = StringIO()
242 | kv_type = not kv_type
243 | elif i == Symbols.AMPERSAND:
244 | if last_c is None or last_c == Symbols.AMPERSAND:
245 | raise IrregularURLException("Invalid '&' pos")
246 | s = kv_buf[kv_type]
247 | s.seek(0)
248 | kv_list[kv_type].append(s.read())
249 | kv_buf[kv_type] = StringIO()
250 | if kv_type:
251 | kv_list[False].append(Symbols.EMPTY) # treat as value-less
252 | else:
253 | kv_type = not kv_type
254 | else:
255 | s = kv_buf[kv_type]
256 | s.write(i)
257 | last_c = i
258 |
259 | s = kv_buf[kv_type]
260 | s.seek(0)
261 | kv_list[kv_type].append(s.read())
262 | if kv_type: # treat as value-less
263 | kv_list[False].append(Symbols.EMPTY)
264 |
265 | # Only one query without value, treat as key-less.
266 | if len(kv_list[True]) == 1 and not kv_list[True][0].endswith(Symbols.EQUALS):
267 | kv_list[False][0], kv_list[True][0] = kv_list[True][0], kv_list[False][0]
268 | return tuple(kv_list[True]), tuple(kv_list[False])
269 |
270 |
271 | def mix(pieces, rules):
272 | """Combine the sub-pieces and sub-rules.
273 |
274 | If the sub-pieces contain consecutive letter, digit and percent-sign fragments,
275 | they will be combined into one piece, as will the corresponding rules.
276 |
277 | Args:
278 | pieces (sequence): The raw pieces.
279 | rules (sequence): The rules.
280 |
281 | Returns:
282 | tuple: A 2-tuple, (mixed_pieces, mixed_rules)
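 |
 | Examples (Python 3 reprs shown):
 |
 | >>> from os_urlpattern.parse_utils import mix
 | >>> mix(('abc', '%', '123'), ('a-z', '%', '0-9'))
 | (('abc%123',), ('%0-9a-z',))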
283 | """
284 | mixed_pieces = []
285 | mixed_rules = []
286 |
287 | t_pieces = []
288 | t_rules = []
289 | t_mix = False
290 | for piece, rule in zip(pieces, rules):
291 | if rule in MIXED_RULE_SET:
292 | if t_rules and not t_mix:
293 | mixed_pieces.extend(t_pieces)
294 | mixed_rules.extend(t_rules)
295 | t_pieces = []
296 | t_rules = []
297 | t_mix = True
298 | else:
299 | if t_rules and t_mix:
300 | mixed_pieces.append(''.join(t_pieces))
301 | mixed_rules.append(''.join(sorted(set(t_rules))))
302 | t_pieces = []
303 | t_rules = []
304 | t_mix = False
305 | t_pieces.append(piece)
306 | t_rules.append(rule)
307 | if t_mix:
308 | mixed_pieces.append(''.join(t_pieces))
309 | mixed_rules.append(''.join(sorted(set(t_rules))))
310 | else:
311 | mixed_pieces.extend(t_pieces)
312 | mixed_rules.extend(t_rules)
313 | return pieces.__class__(mixed_pieces), rules.__class__(mixed_rules)
314 |
315 |
316 | def unpack(result, normalize_key=True):
317 | """Split the ParseResult object into URLMeta and pieces.
318 |
319 | Args:
320 | result (ParseResult): The ParseResult object.
321 | normalize_key (bool, optional): Defaults to True.
322 | Whether normalize the query keys.
323 |
324 | Raises:
325 | IrregularURLException: Invalid URL.
326 |
327 | Returns:
328 | tuple: A 2-tuple, (url_meta, pieces).
329 | """
330 | pieces = filter_useless(result.path.split(Symbols.SLASH)[1:])
331 | path_depth = len(pieces)
332 | if path_depth <= 0:
333 | raise IrregularURLException('Invalid url depth')
334 |
335 | keys, values = parse_query_string(result.query)
336 | if normalize_key:
337 | keys = tuple([normalize(key, QUERY_PART_RESERVED_CHARS)
338 | for key in keys])
339 | has_fragment = False if result.fragment is None else True
340 |
341 | url_meta = URLMeta(path_depth, keys, has_fragment)
342 | pieces.extend(values)
343 | if has_fragment:
344 | pieces.append(result.fragment)
345 | return url_meta, tuple(pieces)
346 |
347 |
348 | def pack(url_meta, objs):
349 | """Pack into URL-like string.
350 |
351 | Args:
352 | url_meta (URLMeta): The URLMeta object.
353 | objs (sequence): The objects to be packed.
354 |
355 | Returns:
356 | str: The packed URL-like string.
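 |
 | Examples (Python 3 reprs shown):
 |
 | >>> from os_urlpattern.parse_utils import URLMeta, pack
 | >>> pack(URLMeta(2, ('k1=', 'k2='), True), ['p1', 'p2', 'v1', 'v2', 'frag'])
 | '/p1/p2[\\?]k1=v1&k2=v2#frag'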
357 | """
358 | s = StringIO()
359 | s.write(Symbols.SLASH)
360 | query_depth = len(url_meta.query_keys)
361 | idx = url_meta.path_depth + query_depth
362 | p = Symbols.SLASH.join([str(p) for p in objs[0:url_meta.path_depth]])
363 | s.write(p)
364 | if query_depth > 0:
365 | s.write(BasePatternRule.SINGLE_QUESTION)
366 | kv = zip(url_meta.query_keys,
367 | [str(p) for p in objs[url_meta.path_depth:idx]])
368 | s.write(Symbols.AMPERSAND.join(
369 | [''.join((str(k), str(v))) for k, v in kv]))
370 |
371 | if url_meta.has_fragment:
372 | s.write(Symbols.NUMBER)
373 | s.write(''.join([str(p) for p in objs[idx:]]))
374 | s.seek(0)
375 | return s.read()
376 |
377 |
378 | def analyze_url(url):
379 | """Parse a URL to URLMeta object and raw pieces.
380 |
381 | Args:
382 | url (str): The URL to be parsed.
383 |
384 | Returns:
385 | tuple: A 2-tuple, (url_meta, pieces).
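 |
 | Examples (Python 3 reprs shown):
 |
 | >>> from os_urlpattern.parse_utils import analyze_url
 | >>> url_meta, pieces = analyze_url('http://example.com/a/b?k=v#f')
 | >>> url_meta.path_depth, url_meta.query_keys, url_meta.has_fragment
 | (2, ('k=',), True)
 | >>> pieces
 | ('a', 'b', 'v', 'f')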
386 | """
387 |
388 | result = parse_url(url)
389 | return unpack(result, True)
390 |
391 |
392 | def fuzzy_join(objs, sep='/'):
393 | """Join the fuzzy_rule of the objects into one string.
394 |
395 | Args:
396 | objs (sequence): The objects each of which have fuzzy_rule property.
397 | sep (str): Defaults to '/'. Seperator for joining.
398 |
399 | Returns:
400 | str: The joined fuzzy_rule string.
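 |
 | Examples:
 |
 | >>> from os_urlpattern.parse_utils import PieceParser, fuzzy_join
 | >>> parser = PieceParser()
 | >>> fuzzy_join([parser.parse('abc'), parser.parse('123')])
 | 'a-z/0-9'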
401 | """
402 | return sep.join([p.fuzzy_rule for p in objs])
403 |
404 |
405 | class ParsedPiece(object):
406 | """The parsed piece object.
407 |
408 | It contains the sub-pieces of a piece and the corresponding sub-rules.
409 | With it, you can get the fuzzy rule and the length of the entire piece.
410 | It can be used as a map key.
411 |
412 | """
413 | __slots__ = ('pieces', 'rules', '_piece', '_piece_length', '_fuzzy_rule')
414 |
415 | def __init__(self, pieces, rules):
416 | """Init the ParsedPiece object.
417 |
418 | Args:
419 | pieces (tuple): The tuple of parsed pieces.
420 | rules (tuple): The tuple of rules, one for each parsed piece.
421 | """
422 | self.pieces = pieces
423 | self.rules = rules
424 | self._piece_length = -1
425 | self._piece = pieces[0] if len(pieces) == 1 else None
426 | self._fuzzy_rule = rules[0] if len(rules) == 1 else None
427 |
428 | @property
429 | def fuzzy_rule(self):
430 | if not self._fuzzy_rule:
431 | self._fuzzy_rule = ''.join(sorted(set(self.rules)))
432 | return self._fuzzy_rule
433 |
434 | @property
435 | def piece_length(self):
436 | """Get the literal length of the piece.
437 |
438 | Not the number of the characters of the piece.
439 |
440 | Note:
441 |
442 | '[%]{2}' has 6 characters, but its literal length is 2.
443 |
444 | Returns:
445 | int: The literal length of the piece.
446 | """
447 | if self._piece_length < 0:
448 | piece = self.piece
449 | length_base = length = len(piece)
450 | idx = 0
451 | while idx < length_base:
452 | c = piece[idx]
453 | if c == Symbols.BRACKETS_L or c == Symbols.BRACKETS_R:
454 | if idx == 0 or piece[idx - 1] != Symbols.BACKSLASH:
455 | length += -1
456 | elif c == Symbols.BACKSLASH:
457 | if piece[idx + 1] != Symbols.BACKSLASH:
458 | length += -1
459 | elif c == Symbols.BRACES_L:
460 | if piece[idx - 1] == Symbols.BRACKETS_R:
461 | e = piece.index(Symbols.BRACES_R, idx)
462 | length += int(piece[idx + 1:e]) - 1 - (e - idx + 1)
463 | idx = e
464 | idx += 1
465 |
466 | self._piece_length = length
467 | return self._piece_length
468 |
469 | def __eq__(self, o):
470 | if not isinstance(o, ParsedPiece):
471 | return False
472 | return self.piece == o.piece
473 |
474 | def __hash__(self):
475 | return hash(self.piece)
476 |
477 | @property
478 | def piece(self):
479 | if self._piece is None:
480 | self._piece = ''.join(self.pieces)
481 | return self._piece
482 |
483 | def __str__(self):
484 | return str(zip(self.pieces, self.rules))
485 |
486 | __repr__ = __str__
487 |
488 |
489 | EMPTY_PARSED_PIECE = ParsedPiece(EMPTY_TUPLE, EMPTY_TUPLE)
490 |
491 |
492 | class PieceParser(object):
493 |     """Parser for URL pieces.
494 | 
495 |     Use it to generate a ParsedPiece object from a piece of a URL.
496 |     Not thread-safe.
497 | """
498 | __slots__ = ('_rules', '_pieces')
499 |
500 | def __init__(self):
501 | self._reset()
502 |
503 | def _reset(self):
504 | self._rules = []
505 | self._pieces = []
506 |
507 | def parse(self, piece):
508 | """Parse a string into small sub-pieces with rules.
509 |
510 |         Consecutive characters in the same character space
511 |         are joined into one sub-piece, and the corresponding
512 |         rule (character space) is recorded alongside it.
513 |
514 | Args:
515 | piece (str): A string to be parsed.
516 |
517 | Returns:
518 |             ParsedPiece: The parsed result with sub-pieces and rules.
519 | """
520 |
521 | self._reset()
522 | self._preprocess(piece)
523 | return self._create_parsed_piece()
524 |
525 | def _preprocess(self, piece):
526 | for c in piece:
527 | self._define(c)
528 | for idx, buf in enumerate(self._pieces):
529 | buf.seek(0)
530 | letter = buf.read()
531 | self._pieces[idx] = self._normalize(
532 | letter, self._rules[idx])
533 |
534 | def _define(self, char):
535 | last_rule = self._rules[-1] if self._rules else None
536 | if char not in CHAR_RULE_DICT:
537 | raise InvalidCharException("Invalid char %r" % char)
538 | rule = CHAR_RULE_DICT[char]
539 |
540 | if last_rule != rule:
541 | self._pieces.append(StringIO())
542 | self._rules.append(rule)
543 | self._pieces[-1].write(char)
544 |
545 | def _normalize(self, letter, rule):
546 | if rule in SIGN_RULE_SET:
547 | return specify_rule(rule, len(letter))
548 | return letter
549 |
550 | def _create_parsed_piece(self):
551 | return ParsedPiece(tuple(self._pieces), tuple(self._rules))
552 |
553 |
554 | def fuzzy_digest(url_meta, objs):
555 | """Generate hex digest string from URLMeta and objects' fuzzy_rules.
556 |
557 | Args:
558 | url_meta (URLMeta): The URLMeta object.
559 |         objs (sequence): Objects, each having a fuzzy_rule property.
560 |
561 | Returns:
562 | str: Digest value as a string of hexadecimal digits.
563 | """
564 | return digest(url_meta, [obj.fuzzy_rule for obj in objs])
565 |
566 |
567 | def digest(url_meta, objs):
568 | """Generate hex digest string from URLMeta and objects.
569 |
570 | Args:
571 | url_meta (URLMeta): The URLMeta object.
572 | objs (sequence): The sequence of objects.
573 |
574 | Returns:
575 | str: Digest value as a string of hexadecimal digits.
576 | """
577 | return hashlib.md5(pack(url_meta, objs).encode(DEFAULT_ENCODING)).hexdigest()
578 |
579 |
580 | def parse_url_pattern_string(url_pattern_string):
581 | """Parse a URL pattern string into 3 components.
582 |
583 |     Format: path[\\?]query#fragment
584 |
585 | Args:
586 | url_pattern_string (str): The url pattern string to be parsed.
587 |
588 | Returns:
589 | URLPatternParseResult: A 3-tuple, (path, query, fragment).
590 | """
591 | idx_p = 0
592 | idx_q = url_pattern_string.find(BasePatternRule.SINGLE_QUESTION)
593 | idx_f = url_pattern_string.find(Symbols.NUMBER)
594 | path = query = fragment = None
595 | if idx_q < 0 and idx_f < 0:
596 | path = url_pattern_string[idx_p:]
597 | elif idx_q > 0 and idx_f > 0:
598 | if idx_f > idx_q:
599 | path = url_pattern_string[idx_p:idx_q]
600 | query = url_pattern_string[idx_q + 4:idx_f]
601 | else:
602 | path = url_pattern_string[idx_p:idx_f]
603 | fragment = url_pattern_string[idx_f + 1:]
604 | elif idx_q < 0 and idx_f > 0:
605 | path = url_pattern_string[idx_p:idx_f]
606 | fragment = url_pattern_string[idx_f + 1:]
607 | elif idx_q > 0 and idx_f < 0:
608 | path = url_pattern_string[idx_p:idx_q]
609 | query = url_pattern_string[idx_q + 4:]
610 |
611 | return URLPatternParseResult(path, query, fragment)
612 |
613 |
614 | def analyze_url_pattern_string(url_pattern_string):
615 | """Parse a URL pattern string into URLMeta object and pattern string pieces.
616 |
617 | Args:
618 | url_pattern_string (str): The URL pattern string to be parsed.
619 |
620 | Returns:
621 | tuple: A 2-tuple, (url_meta, pattern_strings).
622 | """
623 | result = parse_url_pattern_string(url_pattern_string)
624 | return unpack(result, False)
625 |
626 |
627 | def parse_pattern_string(pattern_string):
628 | """Parse a pattern string into pattern unit strings.
629 |
630 | Args:
631 | pattern_string (str): The pattern string to be parsed.
632 |
633 | Returns:
634 | tuple: Pattern unit strings.
635 | """
636 | if pattern_string == Symbols.EMPTY:
637 | return BLANK_TUPLE
638 | pattern_unit_strings = []
639 | l = len(pattern_string)
640 | s = StringIO()
641 | idx = 0
642 | last_rule = None
643 | while idx < l:
644 | c = pattern_string[idx]
645 | if c == Symbols.BRACKETS_L:
646 | if last_rule is not None:
647 | s.seek(0)
648 | pattern_unit_strings.append(s.read())
649 | s = StringIO()
650 | last_rule = None
651 |
652 | idx_s = idx
653 | while True:
654 | idx = pattern_string.find(Symbols.BRACKETS_R, idx + 1)
655 | if idx < 0:
656 | raise InvalidPatternException(
657 | "Missing '%s'" % Symbols.BRACKETS_R)
658 | elif pattern_string[idx - 1] == Symbols.BACKSLASH:
659 | continue
660 | break
661 | if idx + 1 < l:
662 | if pattern_string[idx + 1] == Symbols.BRACES_L:
663 | old_idx = idx + 2
664 | idx = pattern_string.find(Symbols.BRACES_R, idx + 1)
665 | if idx < 0:
666 | raise InvalidPatternException(
667 | "Missing '%s'" % Symbols.BRACES_R)
668 | num_str = pattern_string[old_idx:idx]
669 | if not num_str.isdigit():
670 | raise InvalidPatternException(
671 | "Invalid num %r" % num_str)
672 |
673 | elif pattern_string[idx + 1] == Symbols.PLUS:
674 | idx += 1
675 | idx += 1
676 | pattern_unit_strings.append(pattern_string[idx_s:idx])
677 | else:
678 | if c not in CHAR_RULE_DICT:
679 |                 raise InvalidPatternException("Invalid char %r" % c)
680 | rule = CHAR_RULE_DICT[c]
681 | if rule not in DIGIT_AND_ASCII_RULE_SET:
682 | raise InvalidPatternException(
683 | 'Invalid pattern')
684 | if last_rule is None:
685 | s.write(c)
686 | else:
687 | if rule == last_rule:
688 | s.write(c)
689 | else:
690 | s.seek(0)
691 | pattern_unit_strings.append(s.read())
692 | s = StringIO()
693 | s.write(c)
694 | last_rule = rule
695 | idx += 1
696 | if last_rule is not None:
697 | s.seek(0)
698 | pattern_unit_strings.append(s.read())
699 |
700 | return tuple(pattern_unit_strings)
701 |
702 |
703 | def parse_pattern_unit_string(pattern_unit_string):
704 |     """Parse a pattern unit string into its rules and literal num.
705 |
706 | Args:
707 | pattern_unit_string (str): The pattern unit string to be parsed.
708 |
709 | Returns:
710 | tuple: A 2-tuple, (rules, num).
711 | """
712 | rules = set()
713 | num = 1
714 | if pattern_unit_string == Symbols.EMPTY:
715 | rules.add(Symbols.EMPTY)
716 | elif pattern_unit_string[0] != Symbols.BRACKETS_L:
717 | rules.add(CHAR_RULE_DICT[pattern_unit_string[0]])
718 | num = len(pattern_unit_string)
719 | else:
720 | if pattern_unit_string[-1] == Symbols.BRACKETS_R:
721 | num = 1
722 | elif pattern_unit_string[-1] == Symbols.BRACES_R:
723 | t = pattern_unit_string.rfind(Symbols.BRACES_L)
724 | num_str = pattern_unit_string[t + 1:-1]
725 | if not num_str.isdigit():
726 | raise InvalidPatternException("Invalid num %r" % num_str)
727 | num = int(num_str)
728 | elif pattern_unit_string[-1] == Symbols.PLUS:
729 | num = -1
730 | t = pattern_unit_string.rfind(Symbols.BRACKETS_R)
731 | p_str = pattern_unit_string[1:t]
732 | l = len(p_str)
733 | idx = 0
734 | while idx < l:
735 | c = p_str[idx]
736 | n = 3
737 | if c in ASCII_DIGIT_SET:
738 | pass
739 | elif c == Symbols.BACKSLASH:
740 | n = 2
741 | else:
742 | n = 1
743 | rule = p_str[idx:idx + n]
744 | if rule not in RULE_SET:
745 | raise InvalidPatternException("Invalid rule %r" % rule)
746 | rules.add(rule)
747 | idx += n
748 | if (num > 0 and len(rules) > num) or num == 0:
749 | raise InvalidPatternException('Insufficient number')
750 | return rules, num
751 |
--------------------------------------------------------------------------------
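Usage sketch (illustrative, not part of the source tree; the URL is made
up): analyze_url splits a URL into a URLMeta and raw pieces, and
PieceParser turns each raw piece into a ParsedPiece.

    # Illustrative only: exercises analyze_url and PieceParser from above.
    from os_urlpattern.parse_utils import PieceParser, analyze_url

    url_meta, pieces = analyze_url('http://example.com/a/b12/c.html?id=1')
    parser = PieceParser()
    for piece in pieces:
        parsed = parser.parse(piece)  # returns a ParsedPiece
        # pieces/rules hold the sub-pieces and their character-space
        # rules; fuzzy_rule joins the sorted, deduplicated rules.
        print(parsed.piece, parsed.pieces, parsed.rules, parsed.fuzzy_rule)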
/src/os_urlpattern/parsed_piece_view.py:
--------------------------------------------------------------------------------
1 | """ParsedPieceView and subclass implementation.
2 | """
3 | from __future__ import unicode_literals
4 |
5 | from .definition import DIGIT_AND_ASCII_RULE_SET, BasePatternRule
6 | from .parse_utils import ParsedPiece, fuzzy_join, mix
7 | from .utils import pick
8 |
9 |
10 | class ParsedPieceView(object):
11 | """The base class of parsed piece view.
12 |
13 |     A view object is a wrapper around a parsed piece. Its view,
14 |     parsed_piece and parsed_pieces properties are all derived from
15 |     the raw parsed piece.
16 |
17 | """
18 | __slots__ = ('parsed_piece', '_parsed_pieces', '_view')
19 |
20 | def __init__(self, parsed_piece):
21 | self.parsed_piece = parsed_piece
22 | self._parsed_pieces = None
23 | self._view = None
24 |
25 | def __eq__(self, o):
26 | if not isinstance(o, ParsedPieceView):
27 | return False
28 | return self.view == o.view
29 |
30 | def __hash__(self):
31 | return hash(self.view)
32 |
33 | @property
34 | def view(self):
35 | if self._view is None:
36 | self._view = fuzzy_join(self.parsed_pieces)
37 | return self._view
38 |
39 | @property
40 | def parsed_pieces(self):
41 | if self._parsed_pieces:
42 | return self._parsed_pieces
43 |
44 | self._parsed_pieces = [ParsedPiece((piece,), (rule,)) for piece, rule in zip(
45 | self.parsed_piece.pieces, self.parsed_piece.rules)]
46 | return self._parsed_pieces
47 |
48 |
49 | class PieceView(ParsedPieceView):
50 |
51 | def __init__(self, parsed_piece):
52 | super(PieceView, self).__init__(parsed_piece)
53 | self._view = self.parsed_piece.piece
54 |
55 |
56 | class LengthView(ParsedPieceView):
57 |
58 | def __init__(self, parsed_piece):
59 | super(LengthView, self).__init__(parsed_piece)
60 | self._view = self.parsed_piece.piece_length
61 |
62 |
63 | class MultiView(ParsedPieceView):
64 | pass
65 |
66 |
67 | class MixedView(ParsedPieceView):
68 |
69 | @property
70 | def parsed_pieces(self):
71 | if self._parsed_pieces:
72 | return self._parsed_pieces
73 |
74 | if len(self.parsed_piece.rules) <= 1:
75 | self._parsed_pieces = [self.parsed_piece]
76 | else:
77 | mixed_pieces, mixed_rules = mix(
78 | self.parsed_piece.pieces, self.parsed_piece.rules)
79 |
80 | self._parsed_pieces = [ParsedPiece(
81 | (piece,), (rule,)) for piece, rule in zip(mixed_pieces, mixed_rules)]
82 | return self._parsed_pieces
83 |
84 |
85 | class LastDotSplitFuzzyView(ParsedPieceView):
86 |
87 | @property
88 | def parsed_pieces(self):
89 | if self._parsed_pieces:
90 | return self._parsed_pieces
91 | rules = self.parsed_piece.rules
92 | dot_idx = None
93 | part_num = len(rules)
94 | for idx, rule in enumerate(reversed(rules)):
95 | if idx > 2:
96 | break
97 | if rule == BasePatternRule.DOT:
98 | dot_idx = part_num - idx - 1
99 | break
100 | self._parsed_pieces = [ParsedPiece((self.parsed_piece.piece,),
101 | (self.parsed_piece.fuzzy_rule,))]
102 | if dot_idx is not None:
103 | skip = False
104 | for rule in self.parsed_piece.rules[dot_idx + 1:]:
105 | if rule not in DIGIT_AND_ASCII_RULE_SET:
106 | skip = True
107 | break
108 | if not skip:
109 | pieces = []
110 | rules = []
111 | pieces.append(''.join(self.parsed_piece.pieces[0:dot_idx]))
112 | pieces.append(self.parsed_piece.pieces[dot_idx])
113 | rules.append(
114 | ''.join(sorted(set(self.parsed_piece.rules[0:dot_idx]))))
115 | rules.append(self.parsed_piece.rules[dot_idx])
116 | mixed_pieces, mixed_rules = mix(
117 | self.parsed_piece.pieces[dot_idx + 1:],
118 | self.parsed_piece.rules[dot_idx + 1:])
119 | pieces.extend(mixed_pieces)
120 | rules.extend(mixed_rules)
121 | self._parsed_pieces = [ParsedPiece(
122 | (piece,), (rule,)) for piece, rule in zip(pieces, rules)]
123 | return self._parsed_pieces
124 |
125 |
126 | class FuzzyView(ParsedPieceView):
127 |
128 | def __init__(self, parsed_piece):
129 | super(FuzzyView, self).__init__(parsed_piece)
130 | self._view = self.parsed_piece.fuzzy_rule
131 |
132 | @property
133 | def parsed_pieces(self):
134 | if self._parsed_pieces:
135 | return self._parsed_pieces
136 | self._parsed_pieces = [ParsedPiece((self.parsed_piece.piece,),
137 | (self.parsed_piece.fuzzy_rule,))]
138 | return self._parsed_pieces
139 |
140 |
141 | def view_cls_from_pattern(pattern, is_last_path=False):
142 | """Get ParsedPieceView class from pattern.
143 |
144 | ParsedPieceView type can be deduced from the pattern.
145 |
146 | Args:
147 | pattern (Pattern): The Pattern object.
148 | is_last_path (bool, optional): Defaults to False. Whether the pattern
149 | is at the last path position.
150 |
151 | Returns:
152 | class: The class of ParsedPieceView.
153 | """
154 | view_cls = PieceView
155 | pattern_units = pattern.pattern_units
156 | if len(pattern_units) == 1:
157 | pattern_unit = pattern_units[0]
158 | if not pattern_unit.is_literal():
159 | if pattern_unit.num < 0:
160 | view_cls = FuzzyView
161 | else:
162 | view_cls = LengthView
163 | else:
164 | for pattern_unit in pattern_units:
165 | if not pattern_unit.is_literal():
166 | if len(pattern_unit.rules) > 1:
167 | view_cls = MixedView
168 | break
169 | else:
170 | view_cls = MultiView
171 | if is_last_path \
172 | and len(pattern_units) == 3 \
173 | and view_cls != PieceView \
174 | and len(pattern_units[1].rules) == 1 \
175 | and pick(pattern_units[1].rules) == BasePatternRule.DOT \
176 | and not (set(pattern_units[-1].rules) - DIGIT_AND_ASCII_RULE_SET):
177 | view_cls = LastDotSplitFuzzyView
178 |
179 | return view_cls
180 |
--------------------------------------------------------------------------------
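Usage sketch (illustrative): each view class wraps the same ParsedPiece
but exposes a different clustering key through its view property.

    # Illustrative only: compare the view keys of three view classes.
    from os_urlpattern.parse_utils import PieceParser
    from os_urlpattern.parsed_piece_view import FuzzyView, LengthView, PieceView

    parsed = PieceParser().parse('abc123')
    for view_cls in (PieceView, LengthView, FuzzyView):
        view = view_cls(parsed)
        print(view_cls.__name__, view.view)
    # PieceView keys on the literal piece, LengthView on its literal
    # length, FuzzyView on the joined fuzzy rule.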
/src/os_urlpattern/parser.py:
--------------------------------------------------------------------------------
1 | """High-level APIs for parsing.
2 | """
3 |
4 | from __future__ import unicode_literals
5 |
6 | from .parse_utils import fuzzy_digest as _fuzzy_digest
7 | from .parse_utils import PieceParser, analyze_url, analyze_url_pattern_string
8 |
9 |
10 | def parse(url_or_pattern):
11 | """Parse URL or URL pattern string.
12 |
13 | Args:
14 | url_or_pattern (str): URL or URL pattern.
15 |
16 | Returns:
17 |         tuple: A 2-tuple, (url_meta, parsed_pieces).
18 | """
19 | url_meta = None
20 | parsed_pieces = None
21 | if url_or_pattern.startswith('/'): # URL pattern
22 | from .pattern_matcher import MatchPattern
23 | url_meta, pattern_strings = analyze_url_pattern_string(url_or_pattern)
24 | parsed_pieces = tuple([MatchPattern(p, i == url_meta.path_depth)
25 | for i, p in enumerate(pattern_strings, 1)])
26 | else: # URL
27 | parser = PieceParser()
28 | url_meta, pieces = analyze_url(url_or_pattern)
29 | parsed_pieces = tuple([parser.parse(piece) for piece in pieces])
30 |
31 | return url_meta, parsed_pieces
32 |
33 |
34 | def fuzzy_digest(*args):
35 | """Generate hex fuzzy digest string from URL or URL pattern.
36 |
37 | Args:
38 |         *args: Either a single URL or URL pattern string, or two
39 |             arguments, a URLMeta and a sequence of objects.
40 |
41 | Returns:
42 | str: Digest value as a string of hexadecimal digits.
43 | """
44 | l = len(args)
45 | url_meta = None
46 | objs = None
47 | if l == 2:
48 | url_meta, objs = args
49 | elif l == 1:
50 | url_meta, objs = parse(args[0])
51 | else:
52 | raise ValueError('Not digestable')
53 | return _fuzzy_digest(url_meta, objs)
54 |
--------------------------------------------------------------------------------
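Usage sketch (illustrative inputs): parse() treats strings starting with
'/' as URL patterns and anything else as URLs; both forms feed
fuzzy_digest, and the one-argument and two-argument calls agree.

    # Illustrative only: the two dispatch branches of parse().
    from os_urlpattern.parser import fuzzy_digest, parse

    url_meta, parsed_pieces = parse('http://example.com/foo/123')  # URL
    _, parsed_patterns = parse('/foo/[0-9]{3}')                    # pattern

    assert fuzzy_digest('http://example.com/foo/123') == \
        fuzzy_digest(url_meta, parsed_pieces)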
/src/os_urlpattern/pattern.py:
--------------------------------------------------------------------------------
1 | """Pattern class.
2 | """
3 | from __future__ import unicode_literals
4 |
5 | import re
6 |
7 | from .utils import pick
8 |
9 |
10 | class PatternUnit(object):
11 | """Sub-piece of pattern."""
12 |
13 | __slots__ = ('pattern_unit_string', 'rules', 'num', '_fuzzy_rule')
14 |
15 | def __init__(self, pattern_unit_string):
16 | self.pattern_unit_string = pattern_unit_string
17 | from .parse_utils import parse_pattern_unit_string
18 | self.rules, self.num = parse_pattern_unit_string(pattern_unit_string)
19 | self._fuzzy_rule = None
20 |
21 | def is_literal(self):
22 | """Whether this unit string is literal or not.
23 |
24 | Note:
25 |             In this char representation, a fixed-length single
26 |             sign is also literal, e.g. [\\.]{2} or [\\-]
27 |
28 | Returns:
29 | bool: Whether it is literal.
30 | """
31 |
32 | from .definition import DIGIT_AND_ASCII_RULE_SET, Symbols
33 | r = False
34 | if not self.pattern_unit_string.startswith(Symbols.BRACKETS_L):
35 | r = True
36 | elif len(self.rules) == 1:
37 | if self.num > 0:
38 | rule = pick(self.rules)
39 | if rule not in DIGIT_AND_ASCII_RULE_SET:
40 | r = True
41 | return r
42 |
43 | @property
44 | def fuzzy_rule(self):
45 | if self._fuzzy_rule is None:
46 | self._fuzzy_rule = ''.join(sorted(self.rules))
47 | return self._fuzzy_rule
48 |
49 | def __str__(self):
50 | return ' '.join((self.pattern_unit_string, self.fuzzy_rule, str(self.num)))
51 |
52 | __repr__ = __str__
53 |
54 |
55 | class Pattern(object):
56 |     """Pattern for handling a pattern string."""
57 |
58 | __slots__ = ('pattern_string', '_pattern_regex',
59 | '_pattern_units', '_fuzzy_rule')
60 |
61 | def __init__(self, pattern_string):
62 | self.pattern_string = pattern_string
63 | self._pattern_regex = None
64 | self._pattern_units = None
65 | self._fuzzy_rule = None
66 |
67 | @property
68 | def pattern_units(self):
69 | """tuple: Pattern units."""
70 |
71 | from .parse_utils import parse_pattern_string
72 | if self._pattern_units is None:
73 | self._pattern_units = tuple([PatternUnit(
74 | u) for u in parse_pattern_string(self.pattern_string)])
75 | return self._pattern_units
76 |
77 | def __str__(self):
78 | return self.pattern_string
79 |
80 | __repr__ = __str__
81 |
82 | def __hash__(self):
83 | return hash(self.pattern_string)
84 |
85 | def __eq__(self, o):
86 | return self.pattern_string == o.pattern_string
87 |
88 | def match(self, piece):
89 | if not self._pattern_regex:
90 | self._pattern_regex = re.compile(
91 | ''.join(('^', self.pattern_string, '$')))
92 |         return self._pattern_regex.match(piece) is not None
93 |
94 | @property
95 | def fuzzy_rule(self):
96 | """str: All rules of the pattern join into a string."""
97 | if self._fuzzy_rule is None:
98 | self._fuzzy_rule = ''.join(sorted(set.union(
99 | *[u.rules for u in self.pattern_units])))
100 | return self._fuzzy_rule
101 |
--------------------------------------------------------------------------------
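Usage sketch (the pattern string is an example): match() anchors the
pattern string as a regex, and pattern_units lazily splits it into
PatternUnit objects.

    # Illustrative only: Pattern.match and PatternUnit inspection.
    from os_urlpattern.pattern import Pattern

    p = Pattern('item[0-9]+')
    print(p.match('item42'))   # True: literal prefix plus digits
    print(p.match('item'))     # False: the digit unit is required
    for unit in p.pattern_units:
        # 'item' is a literal unit; '[0-9]+' is a variable-length
        # digit unit (num == -1).
        print(unit.pattern_unit_string, sorted(unit.rules), unit.num)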
/src/os_urlpattern/pattern_cluster.py:
--------------------------------------------------------------------------------
1 | """Cluster algorithm.
2 | """
3 | from __future__ import unicode_literals
4 |
5 | from collections import Counter, OrderedDict, namedtuple
6 |
7 | from .compat import itervalues
8 | from .parse_utils import (EMPTY_PARSED_PIECE, URLMeta, specify_rule,
9 | wildcard_rule)
10 | from .parsed_piece_view import LastDotSplitFuzzyView, MixedView, MultiView
11 | from .pattern import Pattern
12 | from .piece_pattern_node import (PiecePatternNode, build_from_parsed_pieces,
13 | build_from_piece_pattern_nodes)
14 | from .utils import Bag, cached_property, dump_tree, pick
15 |
16 |
17 | class TBag(Bag):
18 | __slots__ = ('stats',)
19 |
20 | def __init__(self):
21 | super(TBag, self).__init__()
22 | self.stats = Counter()
23 |
24 | @property
25 | def count(self):
26 | return self.stats['count']
27 |
28 | def add(self, obj):
29 | super(TBag, self).add(obj)
30 | self.stats['count'] += obj.count
31 |
32 | def set_pattern(self, pattern):
33 | for obj in self:
34 | obj.set_pattern(pattern)
35 |
36 |
37 | class TBucket(TBag):
38 |
39 | def __init__(self):
40 | super(TBucket, self).__init__()
41 | self._objs = {}
42 |
43 | def __getitem__(self, key):
44 | return self._objs[key]
45 |
46 | def __contains__(self, key):
47 | return key in self._objs
48 |
49 | def __iter__(self):
50 | return iter(itervalues(self._objs))
51 |
52 | def add(self, obj):
53 | raise NotImplementedError
54 |
55 |
56 | class PieceBag(TBag):
57 |     """A bag containing all of the nodes with the same piece.
58 | 
59 |     The nodes should be in the same tree at the same level.
60 | """
61 |
62 | __slots__ = ('_p_nodes',)
63 |
64 | def __init__(self):
65 | super(PieceBag, self).__init__()
66 | self._p_nodes = set()
67 |
68 | def add(self, piece_pattern_node):
69 | super(PieceBag, self).add(piece_pattern_node)
70 | self._p_nodes.add(piece_pattern_node.parrent)
71 | self.stats['p_nodes_count'] += piece_pattern_node.parrent.count \
72 | if piece_pattern_node.parrent is not None \
73 | else piece_pattern_node.count
74 |
75 | @property
76 | def p_nodes(self):
77 | return self._p_nodes
78 |
79 |
80 | class PieceBagBucket(TBucket):
81 | __slots__ = ('_p_nodes',)
82 |
83 | def __init__(self):
84 | super(PieceBagBucket, self).__init__()
85 | self._p_nodes = set()
86 |
87 | def add(self, obj):
88 | if isinstance(obj, PiecePatternNode):
89 | piece = obj.piece
90 | if piece not in self._objs:
91 | self._objs[piece] = PieceBag()
92 | self._objs[piece].add(obj)
93 | elif isinstance(obj, PieceBag):
94 | piece = obj.pick().piece
95 | if piece in self._objs:
96 | raise ValueError('duplicated')
97 | self._objs[piece] = obj
98 | else:
99 |             raise ValueError('neither PiecePatternNode nor PieceBag')
100 |
101 | self.stats['count'] += obj.count
102 |
103 | @property
104 | def p_nodes(self):
105 | if not self._p_nodes:
106 | for piece_bag in self:
107 | self._p_nodes.update(piece_bag.p_nodes)
108 | return self._p_nodes
109 |
110 |
111 | class ViewPieceBag(namedtuple('ViewPieceBag', ['view', 'piece_bag'])):
112 | __slots__ = ()
113 |
114 | def set_pattern(self, pattern):
115 | return self.piece_bag.set_pattern(pattern)
116 |
117 |
118 | class ViewPieceBagBucket(PieceBagBucket):
119 | __slots__ = ('_url_meta', '_root')
120 |
121 | def __init__(self, url_meta):
122 | super(ViewPieceBagBucket, self).__init__()
123 | self._url_meta = url_meta
124 | self._root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
125 |
126 | def add(self, view_piece_bag, build_tree=True):
127 | piece_bag = view_piece_bag.piece_bag
128 | self._objs[piece_bag.pick().piece] = view_piece_bag
129 | self.stats['count'] += piece_bag.count
130 |
131 | if not build_tree:
132 | return
133 | view = view_piece_bag.view
134 |
135 | build_from_parsed_pieces(
136 | self._root, view.parsed_pieces, count=piece_bag.count, uniq=False)
137 |
138 | def cluster(self, config, **kwargs):
139 | for clustered in cluster(config, self._url_meta, self._root, **kwargs):
140 | yield self._transfer(clustered)
141 |
142 | def _transfer(self, root):
143 | pattern = None
144 | bucket = ViewPieceBagBucket(self._url_meta)
145 | for nodes in dump_tree(root):
146 | piece = ''.join([p.piece for p in nodes[1:]])
147 | view_piece_bag = self[piece]
148 | bucket.add(view_piece_bag, False)
149 | if pattern is None:
150 | pattern = Pattern(
151 | ''.join([str(p.pattern) for p in nodes[1:]]))
152 | return bucket, pattern
153 |
154 |
155 | def confused(total, max_part, threshold):
156 | """Determine whether it is too complex to become a cluster.
157 |
158 |     If a data set has several (<threshold) parts and the biggest
159 |     part does not take a large enough proportion, it is confused.
160 | 
161 |     Args:
162 |         total (int): The total count of the data set.
163 |         max_part (int): The count of the biggest part of the data set.
164 |         threshold (int): The cluster threshold.
165 | 
166 |     Returns:
167 |         bool: Whether it is confused.
168 |     """
169 |     if total < threshold:
170 |         return False
171 | 
172 |     o_part = total - max_part
173 |     if max_part >= threshold and o_part >= threshold:
174 | return True
175 | return abs(max_part - o_part) < threshold - 1
176 |
177 |
178 | class SeekResult(object):
179 | FOUND = 1
180 | IMPOSSIBLE = 2
181 | UNKNOW = 3
182 | BACKWARD = 4
183 |
184 |
185 | class PatternCluster(object):
186 | """Base class of cluster."""
187 |
188 | def __init__(self, processor):
189 | self._processor = processor
190 | self._min_cluster_num = processor.config.getint(
191 | 'make', 'min_cluster_num')
192 | self._patterns = set()
193 |
194 | @property
195 | def pre_level_processor(self):
196 | return self._processor.pre_level_processor
197 |
198 | def cluster(self):
199 | pass
200 |
201 | def add(self, obj):
202 | pass
203 |
204 | @property
205 | def pattern_num(self):
206 | return len(self._patterns)
207 |
208 | def seek_cluster(self, package):
209 | return SeekResult.UNKNOW
210 |
211 |
212 | class PiecePatternCluster(PatternCluster):
213 |
214 | def __init__(self, processor):
215 | super(PiecePatternCluster, self).__init__(processor)
216 | self._bucket = PieceBagBucket()
217 |
218 | def seek_cluster(self, package):
219 | p_nodes_count = sum([p.count for p in package.p_nodes])
220 | if p_nodes_count - package.count >= self._min_cluster_num:
221 | return SeekResult.IMPOSSIBLE
222 |
223 | return SeekResult.UNKNOW
224 |
225 | def iter_nodes(self):
226 | return self._bucket.iter_all()
227 |
228 | def add(self, piece_pattern_node):
229 | self._bucket.add(piece_pattern_node)
230 |
231 | def _set_pattern(self, piece_bag, update_patterns=False):
232 | pattern = Pattern(piece_bag.pick().piece)
233 | piece_bag.set_pattern(pattern)
234 | if update_patterns:
235 | self._patterns.add(pattern)
236 |
237 | def cluster(self):
238 | if not self._bucket:
239 | return
240 |         processor = self._processor
241 |         if processor.is_last_level() \
242 |                 and 'last_path_as_pattern' in processor.kwargs \
243 |                 and processor.kwargs['last_path_as_pattern']:
244 | for piece_bag in self._bucket:
245 | self._set_pattern(piece_bag, True)
246 | return
247 |
248 | mcn = self._min_cluster_num
249 | if len(self._bucket) < mcn:
250 | max_count = max(self._bucket, key=lambda x: x.count).count
251 | if not confused(self._bucket.count, max_count, mcn):
252 | for piece_bag in self._bucket:
253 | self._set_pattern(piece_bag, True)
254 | return
255 |
256 | for piece_bag in self._bucket:
257 | stats = piece_bag.stats
258 | count = piece_bag.count
259 | if count < mcn \
260 | or stats['p_nodes_count'] - count >= mcn \
261 | or not self.pre_level_processor.seek_cluster(piece_bag):
262 | self._set_pattern(piece_bag)
263 | self._add_to_forward_cluster(piece_bag)
264 | else:
265 | self._set_pattern(piece_bag, True)
266 |
267 | def _add_to_forward_cluster(self, piece_bag):
268 | parsed_piece = piece_bag.pick().parsed_piece
269 | if len(parsed_piece.pieces) == 1:
270 | self._processor.get_cluster(LengthPatternCluster).add(piece_bag)
271 | return
272 |
273 | view = MultiView(parsed_piece)
274 | p_cls = BasePatternCluster
275 | vl = len(view.parsed_pieces)
276 |
277 | if vl == 3 and self._processor.is_last_path():
278 | ldsf_view = LastDotSplitFuzzyView(parsed_piece)
279 | if view == ldsf_view:
280 | view = ldsf_view
281 | p_cls = LastDotSplitFuzzyPatternCluster
282 | elif vl > 3:
283 | mixed_view = MixedView(parsed_piece)
284 | mvl = len(mixed_view.parsed_pieces)
285 | if mvl == 1:
286 | self._processor.get_cluster(
287 | LengthPatternCluster).add(piece_bag)
288 | return
289 | elif vl - mvl >= self._min_cluster_num:
290 | if mvl == 3 and self._processor.is_last_path():
291 | ldsf_view = LastDotSplitFuzzyView(parsed_piece)
292 | if mixed_view == ldsf_view:
293 | view = ldsf_view
294 | p_cls = LastDotSplitFuzzyPatternCluster
295 | else:
296 | view = mixed_view
297 | p_cls = MixedPatternCluster
298 | else:
299 | view = mixed_view
300 | p_cls = MixedPatternCluster
301 |
302 | self._processor.get_cluster(p_cls).add(
303 | ViewPieceBag(view, piece_bag))
304 |
305 |
306 | class LengthPatternCluster(PatternCluster):
307 | def __init__(self, processor):
308 | super(LengthPatternCluster, self).__init__(processor)
309 | self._length_buckets = {}
310 |
311 | def add(self, piece_bag):
312 | piece_length = piece_bag.pick().parsed_piece.piece_length
313 | if piece_length not in self._length_buckets:
314 | self._length_buckets[piece_length] = PieceBagBucket()
315 | self._length_buckets[piece_length].add(piece_bag)
316 |
317 | def _length_as_cluster(self, length_bucket):
318 | if len(length_bucket) < self._min_cluster_num:
319 | if length_bucket.count < self._min_cluster_num:
320 | return False
321 | max_count = max(length_bucket, key=lambda x: x.count).count
322 | if not confused(length_bucket.count, max_count, self._min_cluster_num):
323 | return False
324 |
325 | return True
326 |
327 | def _update_patterns(self, bucket):
328 | for piece_bag in bucket:
329 | self._patterns.add(piece_bag.pick().pattern)
330 |
331 | def cluster(self):
332 | if not self._length_buckets:
333 | return
334 | mcn = self._min_cluster_num
335 | if len(self._length_buckets) < mcn:
336 | total = sum([c.count for c in itervalues(self._length_buckets)])
337 | max_bucket = max(itervalues(self._length_buckets),
338 | key=lambda x: x.count)
339 | if not confused(total, max_bucket.count, mcn):
340 | for bucket in itervalues(self._length_buckets):
341 | if self._length_as_cluster(bucket):
342 | self._set_pattern(bucket, True)
343 | else:
344 | self._update_patterns(bucket)
345 | return
346 |
347 | forward_cluster = self._processor.get_cluster(FuzzyPatternCluster)
348 | for length_bucket in itervalues(self._length_buckets):
349 | if self._length_as_cluster(length_bucket):
350 | if self.pre_level_processor.seek_cluster(length_bucket):
351 | self._set_pattern(length_bucket, True)
352 | continue
353 | self._set_pattern(length_bucket)
354 |
355 | forward_cluster.add(length_bucket)
356 |
357 | def _set_pattern(self, length_bucket, update_patterns=False):
358 | parsed_piece = length_bucket.pick().parsed_piece
359 | length = parsed_piece.piece_length
360 | pattern = Pattern(specify_rule(parsed_piece.fuzzy_rule, length))
361 | length_bucket.set_pattern(pattern)
362 | if update_patterns:
363 | self._patterns.add(pattern)
364 |
365 |
366 | class MultiPatternCluster(PatternCluster):
367 | def __init__(self, processor):
368 | super(MultiPatternCluster, self).__init__(processor)
369 | self._buckets = {}
370 |
371 | def cluster(self):
372 | for bucket in itervalues(self._buckets):
373 | if bucket.count < self._min_cluster_num:
374 | self._to_forward_cluster(bucket)
375 | continue
376 | for b, pattern in self._cluster(bucket):
377 | if self._as_cluster(b, pattern):
378 | self._set_pattern(b, pattern)
379 | else:
380 | self._to_forward_cluster(b)
381 |
382 | def _cluster(self, bucket):
383 | for b, pattern in bucket.cluster(self._processor.config):
384 | yield b, pattern
385 |
386 | def _to_forward_cluster(self, bucket):
387 | for view_piece_bag in bucket:
388 | self._add_to_forward_cluster(view_piece_bag)
389 |
390 | def _add_to_forward_cluster(self, view_piece_bag):
391 | pass
392 |
393 | def _as_cluster(self, bucket, pattern):
394 | if bucket.count < self._min_cluster_num:
395 | return False
396 | return True
397 |
398 | def _set_pattern(self, bucket, pattern):
399 | bucket.set_pattern(pattern)
400 | self._patterns.add(pattern)
401 |
402 | def add(self, view_piece_bag):
403 | view = view_piece_bag.view
404 | if view not in self._buckets:
405 | url_meta = URLMeta(len(view.parsed_pieces), [], False)
406 | self._buckets[view] = ViewPieceBagBucket(url_meta)
407 | self._buckets[view].add(view_piece_bag)
408 |
409 |
410 | class BasePatternCluster(MultiPatternCluster):
411 |
412 | def _add_to_forward_cluster(self, view_piece_bag):
413 | view = view_piece_bag.view
414 | piece_bag = view_piece_bag.piece_bag
415 | parsed_piece = piece_bag.pick().parsed_piece
416 |
417 | mixed_view = MixedView(parsed_piece)
418 | mvl = len(mixed_view.parsed_pieces)
419 |
420 | p_cls = MixedPatternCluster
421 |
422 | if view == mixed_view:
423 | if self._processor.is_last_path():
424 | ldsf_view = LastDotSplitFuzzyView(parsed_piece)
425 | if len(ldsf_view.parsed_pieces) == 1:
426 | self._processor.get_cluster(
427 | LengthPatternCluster).add(piece_bag)
428 | return
429 | else:
430 | view = ldsf_view
431 | p_cls = LastDotSplitFuzzyPatternCluster
432 | else:
433 | self._processor.get_cluster(
434 | LengthPatternCluster).add(piece_bag)
435 | return
436 | else:
437 | view = mixed_view
438 | if mvl == 1:
439 | self._processor.get_cluster(
440 | LengthPatternCluster).add(piece_bag)
441 | return
442 | elif mvl == 3 and self._processor.is_last_path():
443 | ldsf_view = LastDotSplitFuzzyView(parsed_piece)
444 | if mixed_view == ldsf_view:
445 | view = ldsf_view
446 | p_cls = LastDotSplitFuzzyPatternCluster
447 |
448 | self._processor.get_cluster(p_cls).add(
449 | ViewPieceBag(view, piece_bag))
450 |
451 |
452 | class MixedPatternCluster(MultiPatternCluster):
453 |
454 | def _add_to_forward_cluster(self, view_piece_bag):
455 | view = view_piece_bag.view
456 | piece_bag = view_piece_bag.piece_bag
457 | parsed_piece = piece_bag.pick().parsed_piece
458 |
459 | if self._processor.is_last_path():
460 | ldsf_view = LastDotSplitFuzzyView(parsed_piece)
461 | if len(ldsf_view.parsed_pieces) == 1:
462 | self._processor.get_cluster(
463 | LengthPatternCluster).add(piece_bag)
464 | return
465 | else:
466 | view = ldsf_view
467 | p_cls = LastDotSplitFuzzyPatternCluster
468 | else:
469 | self._processor.get_cluster(
470 | LengthPatternCluster).add(piece_bag)
471 | return
472 |
473 | self._processor.get_cluster(p_cls).add(
474 | ViewPieceBag(view, piece_bag))
475 |
476 |
477 | class LastDotSplitFuzzyPatternCluster(MultiPatternCluster):
478 |
479 | def _cluster(self, bucket):
480 | for b, pattern in bucket.cluster(self._processor.config,
481 | last_path_as_pattern=True):
482 | yield b, pattern
483 |
484 | def _add_to_forward_cluster(self, view_piece_bag):
485 | self._processor.get_cluster(LengthPatternCluster).add(
486 | view_piece_bag.piece_bag)
487 |
488 |
489 | class FuzzyPatternCluster(PatternCluster):
490 | def __init__(self, processor):
491 | super(FuzzyPatternCluster, self).__init__(processor)
492 | self._cached = TBag()
493 | self._force_pattern = False
494 | self._fuzzy_pattern = None
495 |
496 | def add(self, bucket):
497 | if self._force_pattern:
498 | self._set_pattern(bucket)
499 | else:
500 | self._cached.add(bucket)
501 | if len(self._cached) >= self._min_cluster_num:
502 | self._force_pattern = True
503 |
504 | def _update_patterns(self):
505 | for bucket in self._cached:
506 | for piece_bag in bucket:
507 | self._patterns.add(piece_bag.pick().pattern)
508 |
509 | def cluster(self):
510 | if self._force_pattern:
511 | self._set_pattern(self._cached)
512 | else:
513 | if self._cached.count < self._min_cluster_num:
514 | self._update_patterns()
515 | return
516 | max_count = max(self._cached, key=lambda x: x.count).count
517 | if confused(self._cached.count, max_count, self._min_cluster_num):
518 | self._set_pattern(self._cached)
519 | else:
520 | self._update_patterns()
521 |
522 | def _set_pattern(self, package):
523 | if self._fuzzy_pattern is None:
524 | self._fuzzy_pattern = Pattern(
525 | wildcard_rule(package.pick().parsed_piece.fuzzy_rule))
526 | self._patterns.add(self._fuzzy_pattern)
527 | package.set_pattern(self._fuzzy_pattern)
528 |
529 |
530 | CLUSTER_CLASSES = [PiecePatternCluster,
531 | BasePatternCluster,
532 | MixedPatternCluster,
533 | LastDotSplitFuzzyPatternCluster,
534 | LengthPatternCluster,
535 | FuzzyPatternCluster]
536 |
537 |
538 | class ClusterProcessor(object):
539 | def __init__(self, config, url_meta, pre_level_processor, **kwargs):
540 | self._config = config
541 | self._url_meta = url_meta
542 | self._pattern_clusters = OrderedDict(
543 | [(c.__name__, c(self)) for c in CLUSTER_CLASSES])
544 | self._pre_level_processor = pre_level_processor
545 | self._next_level_processors = {}
546 | self._kwargs = kwargs
547 |
548 | @cached_property
549 | def level(self):
550 | l = 0
551 | n = self.pre_level_processor
552 | while n is not None:
553 | l += 1
554 | n = n.pre_level_processor
555 | return l
556 |
557 | def is_last_level(self):
558 | return self._url_meta.depth == self.level
559 |
560 | def is_last_path(self):
561 | return self._url_meta.path_depth == self.level
562 |
563 | @property
564 | def kwargs(self):
565 | return self._kwargs
566 |
567 | @property
568 | def next_level_processors(self):
569 | return self._next_level_processors.values()
570 |
571 | def _backward_package(self, package):
572 | bucket = PieceBagBucket()
573 | for p_node in package.p_nodes:
574 | if p_node.piece in bucket:
575 | continue
576 | bucket.add(p_node)
577 | return bucket
578 |
579 | def seek_cluster(self, package):
580 | if self._pre_level_processor is None:
581 | return False
582 | for c in itervalues(self._pattern_clusters):
583 | res = c.seek_cluster(package)
584 | if res == SeekResult.FOUND:
585 | return True
586 | elif res == SeekResult.IMPOSSIBLE:
587 | break
588 | elif res == SeekResult.BACKWARD:
589 | pack = self._backward_package(package)
590 | return self._pre_level_processor.seek_cluster(pack)
591 | elif res == SeekResult.UNKNOW:
592 | continue
593 | else:
594 | raise ValueError('invalid seek result')
595 |
596 | return False
597 |
598 | def get_cluster(self, cluster_cls):
599 | return self._pattern_clusters[cluster_cls.__name__]
600 |
601 | @property
602 | def config(self):
603 | return self._config
604 |
605 | @property
606 | def pre_level_processor(self):
607 | return self._pre_level_processor
608 |
609 | def _process(self):
610 | for c in itervalues(self._pattern_clusters):
611 | c.cluster()
612 |
613 | def add(self, node, add_children=False):
614 | c = self.get_cluster(PiecePatternCluster)
615 | if add_children:
616 | for child in node.children:
617 | c.add(child)
618 | else:
619 | c.add(node)
620 |
621 | @property
622 | def pattern_num(self):
623 | return sum([c.pattern_num for c in itervalues(self._pattern_clusters)])
624 |
625 | def process(self):
626 | self._process()
627 | if self.is_last_level():
628 | return
629 |
630 | self._create_next_level_processors()
631 |
632 | for processor in itervalues(self._next_level_processors):
633 | processor.process()
634 |
635 | def _create_next_level_processors(self):
636 |
637 | pp_cluster = self.get_cluster(PiecePatternCluster)
638 | processors = self._next_level_processors
639 |
640 | for node in pp_cluster.iter_nodes():
641 | pattern = node.pattern
642 | if pattern not in processors:
643 | processors[pattern] = ClusterProcessor(
644 | self._config,
645 | self._url_meta,
646 | self, **self.kwargs)
647 | processor = processors[pattern]
648 | processor.add(node, add_children=True)
649 |
650 |
651 | def split_by_pattern(root):
652 | """Split the piece pattern tree by pattern path.
653 |
654 | Args:
655 | root (PiecePatternNode): The root of piece pattern tree.
656 |
657 | Returns:
658 | iterator: Iterator of sub-trees.
659 | """
660 | tree_roots = {}
661 | for nodes in dump_tree(root):
662 | pid = hash("/".join([str(p.pattern) for p in nodes]))
663 | if pid not in tree_roots:
664 | tree_roots[pid] = PiecePatternNode((EMPTY_PARSED_PIECE, None))
665 | sub_root = tree_roots[pid]
666 | build_from_piece_pattern_nodes(sub_root, nodes[1:])
667 |
668 | return itervalues(tree_roots)
669 |
670 |
671 | def _can_be_splited(processor):
672 |     """Check whether the processor tree can be split.
673 |
674 | Args:
675 | processor (ClusterProcessor): The root node of cluster processor.
676 |
677 | Returns:
678 |         bool: Whether the processor tree can be split.
679 | """
680 | while True:
681 | pattern_num = processor.pattern_num
682 | if pattern_num > 1:
683 | return True
684 | l = len(processor.next_level_processors)
685 | if l <= 0:
686 | break
687 | elif l > 1:
688 | return True
689 | processor = pick(processor.next_level_processors)
690 |
691 | return False
692 |
693 |
694 | def process(config, url_meta, root, **kwargs):
695 | """Start clustering.
696 |
697 | Args:
698 | config (Config): The configure object.
699 | url_meta (URLMeta): The URLMeta object.
700 | root (PiecePatternNode): The root of the piece pattern tree.
701 | **kwargs: Keyword arguments.
702 |
703 | Returns:
704 | bool: Whether the clustered tree can be split.
705 | """
706 | processor = ClusterProcessor(config, url_meta, None, **kwargs)
707 | processor.add(root)
708 | processor.process()
709 | return _can_be_splited(processor)
710 |
711 |
712 | def cluster(config, url_meta, root, **kwargs):
713 |     """Entry point of the cluster workflow.
714 |
715 | Args:
716 | config (Config): The configure object.
717 | url_meta (URLMeta): The URLMeta object.
718 | root (PiecePatternNode): The root of the piece pattern tree.
719 | **kwargs: Keyword arguments.
720 |
721 | Yields:
722 | PiecePatternNode: The clustered sub piece pattern tree root.
723 |
724 | """
725 | if root.count <= 0:
726 | return
727 | if not process(config, url_meta, root, **kwargs):
728 | yield root
729 | return
730 | for sub_root in split_by_pattern(root):
731 | for clustered in cluster(config, url_meta, sub_root, **kwargs):
732 | yield clustered
733 |
--------------------------------------------------------------------------------
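Usage sketch of the low-level workflow (URLs are examples; PatternMaker
in pattern_maker.py wraps these steps): build a raw piece pattern tree,
then iterate the clustered sub-trees yielded by cluster().

    # Illustrative only: driving cluster() directly.
    from os_urlpattern.config import get_default_config
    from os_urlpattern.parse_utils import EMPTY_PARSED_PIECE
    from os_urlpattern.parser import parse
    from os_urlpattern.pattern_cluster import cluster
    from os_urlpattern.piece_pattern_node import (PiecePatternNode,
                                                  build_from_parsed_pieces)
    from os_urlpattern.utils import dump_tree

    config = get_default_config()
    root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
    url_meta = None
    # All loaded URLs must share the same fuzzy digest (same structure).
    for url in ('http://example.com/a/001', 'http://example.com/a/002'):
        url_meta, parsed_pieces = parse(url)
        build_from_parsed_pieces(root, parsed_pieces)

    for clustered in cluster(config, url_meta, root):
        for nodes in dump_tree(clustered):
            print('/'.join(str(n.pattern) for n in nodes[1:]))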
/src/os_urlpattern/pattern_maker.py:
--------------------------------------------------------------------------------
1 | """Pattern clustering procedure APIs.
2 | """
3 | from .compat import itervalues
4 | from .config import get_default_config
5 | from .definition import BasePattern
6 | from .parse_utils import EMPTY_PARSED_PIECE, ParsedPiece
7 | from .parser import fuzzy_digest, parse
8 | from .pattern_cluster import cluster
9 | from .piece_pattern_node import PiecePatternNode, build_from_parsed_pieces
10 | from .utils import TreeNode, build_tree, dump_tree, pick
11 |
12 |
13 | class PatternMaker(object):
14 | """Scaffold for simplifying clustering.
15 |
16 |     After loading URLs, iterate all sub makers to cluster each
17 |     individually, or cluster them all by calling the make method.
18 | """
19 |
20 | def __init__(self, config=None):
21 | self._config = get_default_config() if config is None else config
22 | self._makers = {}
23 |
24 | @property
25 | def makers(self):
26 | """iterable: For iterating all sub makers."""
27 | return itervalues(self._makers)
28 |
29 | def load(self, url, meta=None):
30 | """Load url and meta.
31 |
32 | Args:
33 | url (str): The URL to be loaded.
34 | meta (object, optional): Defaults to None. Meta data will be
35 | merged at each cluster and can be accessed by clustered
36 | node's meta property.
37 |
38 | Returns:
39 |             tuple: A 2-tuple, (node, is_new).
40 | """
41 | url_meta, parsed_pieces = parse(url)
42 | if not isinstance(parsed_pieces[0], ParsedPiece):
43 | raise ValueError('Invalid URL')
44 | sid = fuzzy_digest(url_meta, parsed_pieces)
45 | if sid not in self._makers:
46 | self._makers[sid] = Maker(url_meta, self._config)
47 | return self._makers[sid].load(parsed_pieces, meta=meta)
48 |
49 | def make(self, combine=False):
50 | """Iterate all sub makers, start clustering and yield clustered.
51 |
52 | Args:
53 | combine (bool, optional): Defaults to False. Combine the
54 |                 same url_meta clusters into a pattern tree.
55 |
56 | Yields:
57 | tuple: 2-tuple, (url_meta, clustered). The clustered is the
58 | root of a clustered tree.
59 | """
60 | for maker in self.makers:
61 | for clustered in maker.make(combine):
62 | yield maker.url_meta, clustered
63 |
64 |
65 | class Maker(object):
66 | """Low-level APIs for clustering.
67 |
68 |     It is assumed this is only used for same-fuzzy-digest clustering.
69 | """
70 |
71 | def __init__(self, url_meta, config=None):
72 | self._url_meta = url_meta
73 | self._config = get_default_config() if config is None else config
74 | self._root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
75 |
76 | @property
77 | def url_meta(self):
78 | """URLMeta: The URLMeta object."""
79 | return self._url_meta
80 |
81 | def load(self, parsed_pieces, meta=None):
82 | """Load parsed pieces and meta.
83 |
84 | Args:
85 | parsed_pieces (list): The parsed pieces to be loaded.
86 | meta (object, optional): Defaults to None. Meta data will be
87 | merged at each cluster and can be accessed by clustered
88 | node's meta property.
89 |
90 | Returns:
91 |             tuple: A 2-tuple, (node, is_new).
92 | """
93 | return build_from_parsed_pieces(self._root,
94 | parsed_pieces,
95 | meta=meta)
96 |
97 | def _cluster(self):
98 | for clustered in cluster(self._config,
99 | self._url_meta,
100 | self._root):
101 | yield clustered
102 |
103 | def _combine_clusters(self):
104 | root = TreeNode(BasePattern.EMPTY)
105 | for clustered in self._cluster():
106 | nodes = pick(dump_tree(clustered))
107 | build_tree(root, [(n.pattern, n.pattern)
108 | for n in nodes[1:]], nodes[0].count)
109 |
110 | yield root
111 |
112 | def make(self, combine=False):
113 | """Start clustering and yield clustered.
114 |
115 | Args:
116 | combine (bool, optional): Defaults to False. Combine the
117 |                 clusters into a pattern tree.
118 |
119 | Yields:
120 |             TreeNode: Root of the clustered tree. If combine=False, yield
121 |                 all clustered parsed piece trees; otherwise yield a
122 |                 combined pattern tree.
123 | """
124 | if combine:
125 | return self._combine_clusters()
126 | return self._cluster()
127 |
--------------------------------------------------------------------------------
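Usage sketch of the high-level API (URLs are illustrative; whether the
digit pieces collapse into one pattern depends on the configured
min_cluster_num): load URLs, then make() yields (url_meta, clustered)
pairs.

    # Illustrative only: end-to-end pattern making.
    from os_urlpattern.pattern_maker import PatternMaker
    from os_urlpattern.utils import dump_tree

    maker = PatternMaker()
    for url in ('http://example.com/x/100',
                'http://example.com/x/101',
                'http://example.com/x/102'):
        maker.load(url)

    for url_meta, clustered in maker.make(combine=True):
        for nodes in dump_tree(clustered):
            # Each dumped path is one pattern path of the combined tree.
            print('/'.join(str(n.value) for n in nodes[1:]))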
/src/os_urlpattern/pattern_matcher.py:
--------------------------------------------------------------------------------
1 | """Pattern matching APIs.
2 | """
3 | from __future__ import unicode_literals
4 |
5 | from functools import total_ordering
6 |
7 | from .definition import BasePatternRule
8 | from .parse_utils import MIXED_RULE_SET, PieceParser, fuzzy_join
9 | from .parsed_piece_view import (FuzzyView, LastDotSplitFuzzyView, LengthView,
10 | MixedView, MultiView, PieceView,
11 | view_cls_from_pattern)
12 | from .parser import fuzzy_digest, parse
13 | from .pattern import Pattern
14 | from .utils import TreeNode, build_tree
15 |
16 |
17 | @total_ordering
18 | class MatchPattern(Pattern):
19 | """Pattern used for matching.
20 |
21 | It is comparable and has a view_cls property to
22 | identify the pattern type.
23 | """
24 | __slots__ = ('view_cls', '_cmp_key')
25 |
26 | def __init__(self, pattern_string, is_last_path=False):
27 | super(MatchPattern, self).__init__(pattern_string)
28 | self.view_cls = view_cls_from_pattern(self, is_last_path)
29 | self._cmp_key = None
30 |
31 | @property
32 | def cmp_key(self):
33 |         """str: Used for sorting."""
34 |
35 | if self._cmp_key is None:
36 | l = [MatchPattern(u.pattern_unit_string)
37 | for u in reversed(self.pattern_units)]
38 | self._cmp_key = ''.join([str(VIEW_ORDER[p.view_cls]) for p in l])
39 | return self._cmp_key
40 |
41 | def __ne__(self, other):
42 | return self.pattern_string != other.pattern_string
43 |
44 | def __lt__(self, other):
45 | if self.view_cls == other.view_cls:
46 | return self.cmp_key > other.cmp_key
47 | return VIEW_ORDER[self.view_cls] > VIEW_ORDER[other.view_cls]
48 |
49 |
50 | EMPTY_MATCH_PATTERN = MatchPattern(BasePatternRule.EMPTY)
51 |
52 |
53 | class ViewMatcher(object):
54 |     """Base class for the different types of view matchers.
55 | 
56 |     Init with a specified ParsedPieceView class.
57 |     Fill it with match nodes of the same view type,
58 |     then use it to get all matched nodes.
59 | """
60 | __slots__ = ('view_cls', '_matchers')
61 |
62 | def __init__(self, view_cls):
63 | self.view_cls = view_cls
64 | self._matchers = {}
65 |
66 | def add_match_node(self, match_node):
67 | pass
68 |
69 | def match(self, parsed_piece):
70 | view = self.view_cls(parsed_piece)
71 | if view.view not in self._matchers:
72 | return []
73 | parsed_pieces = view.parsed_pieces
74 | matched_result = []
75 | self._matchers[view.view].match(
76 | parsed_pieces, 0, matched_result)
77 | return [n.meta for n in matched_result]
78 |
79 |
80 | class PiecePatternViewMatcher(ViewMatcher):
81 |
82 | def add_match_node(self, match_node):
83 | if match_node.pattern.pattern_string not in self._matchers:
84 | self._matchers[match_node.pattern.pattern_string] = [match_node]
85 |
86 | def match(self, parsed_piece):
87 | return [] if parsed_piece.piece not in self._matchers \
88 | else self._matchers[parsed_piece.piece]
89 |
90 |
91 | class LengthPatternViewMatcher(ViewMatcher):
92 |
93 | def add_match_node(self, match_node):
94 | length = match_node.pattern.pattern_units[0].num
95 | self._matchers[length] = [match_node]
96 |
97 | def match(self, parsed_piece):
98 | return [] if parsed_piece.piece_length not in self._matchers \
99 | else self._matchers[parsed_piece.piece_length]
100 |
101 |
102 | class MultiPatternViewMatcher(ViewMatcher):
103 |
104 | def add_match_node(self, match_node):
105 | pattern = match_node.pattern
106 | r = fuzzy_join(pattern.pattern_units)
107 | if r not in self._matchers:
108 | self._matchers[r] = PatternMatchNode(EMPTY_MATCH_PATTERN)
109 | patterns = [MatchPattern(p.pattern_unit_string)
110 | for p in pattern.pattern_units]
111 | matcher = self._matchers[r]
112 | build_tree(matcher, patterns, meta=match_node)
113 |
114 |
115 | class MixedPatternViewMatcher(MultiPatternViewMatcher):
116 |
117 | def _pattern(self, pattern_units):
118 | return MatchPattern(''.join([p.pattern_unit_string for p in pattern_units]))
119 |
120 | def add_match_node(self, match_node):
121 | patterns = []
122 | t = []
123 | for pattern_unit in match_node.pattern.pattern_units:
124 | if not pattern_unit.is_literal() \
125 | or pattern_unit.fuzzy_rule not in MIXED_RULE_SET:
126 | if t:
127 | patterns.append(self._pattern(t))
128 | t = []
129 | patterns.append(self._pattern([pattern_unit]))
130 | else:
131 | t.append(pattern_unit)
132 |
133 | if t:
134 | patterns.append(self._pattern(t))
135 |
136 | r = fuzzy_join(patterns)
137 | if r not in self._matchers:
138 | self._matchers[r] = PatternMatchNode(EMPTY_MATCH_PATTERN)
139 | matcher = self._matchers[r]
140 | build_tree(matcher, patterns, meta=match_node)
141 |
142 |
143 | class FuzzyPatternViewMatcher(ViewMatcher):
144 |
145 | def __init__(self, view_cls):
146 | super(FuzzyPatternViewMatcher, self).__init__(view_cls)
147 | self._matchers = []
148 |
149 | def add_match_node(self, match_node):
150 | self._matchers.append(match_node)
151 |
152 | def match(self, parsed_piece):
153 | return self._matchers
154 |
155 |
156 | VIEW_MATCHERS = [
157 | (PieceView, PiecePatternViewMatcher),
158 | (MultiView, MultiPatternViewMatcher),
159 | (MixedView, MultiPatternViewMatcher),
160 | (LastDotSplitFuzzyView, MultiPatternViewMatcher),
161 | (LengthView, LengthPatternViewMatcher),
162 | (FuzzyView, FuzzyPatternViewMatcher),
163 | ]
164 |
165 | VIEW_ORDER = dict([(item[0], _idx) for _idx, item in enumerate(VIEW_MATCHERS)])
166 |
167 |
168 | def get_view_matcher_cls(view_cls):
169 | """Get specified ViewMatcher class from ParsedPieceView class.
170 |
171 | Args:
172 | view_cls (ParsedPieceView): Class of a specified ParsedPieceView.
173 |
174 | Returns:
175 |         class(ViewMatcher): The corresponding ViewMatcher class.
176 | """
177 | idx = VIEW_ORDER[view_cls]
178 | return VIEW_MATCHERS[idx][1]
179 |
180 |
181 | @total_ordering
182 | class PatternMatchNode(TreeNode):
183 | """Node for building a match tree."""
184 |
185 | __slots__ = ('_view_matchers',)
186 |
187 | def __init__(self, value):
188 | super(PatternMatchNode, self).__init__(value)
189 | self._view_matchers = []
190 |
191 | @property
192 | def view_cls(self):
193 | return self.pattern.view_cls
194 |
195 | def match(self, parsed_pieces, idx, matched_nodes):
196 |         """Depth-first search for all matched nodes.
197 | 
198 |         If a path from root to leaf matches all the corresponding pieces,
199 |         the leaf node is called a matched node. This method should be
200 |         called on the root node, with idx=0 and a list which will be
201 |         filled with all matched nodes.
202 |
203 | Args:
204 | parsed_pieces (sequence): All of the parsed pieces to be matched.
205 |             idx (int): Indicates which piece of the whole parsed pieces
206 |                 should try to match this node.
207 | matched_nodes (list of PatternMatchNode): Filled with all of the
208 | matched leaf nodes.
209 | """
210 | parsed_piece = parsed_pieces[idx]
211 | for matcher in self._view_matchers:
212 | nodes = matcher.match(parsed_piece)
213 | if not nodes:
214 | continue
215 | if nodes[0].leaf():
216 | matched_nodes.extend(nodes)
217 | continue
218 | self._deep_match(nodes, parsed_pieces, idx + 1,
219 | matched_nodes)
220 |
221 | def _deep_match(self, nodes, parsed_pieces, idx, matched_nodes):
222 | for node in nodes:
223 | node.match(parsed_pieces, idx, matched_nodes)
224 |
225 | def _get_matcher(self, view_cls):
226 | s = 0
227 | e = len(self._view_matchers)
228 | while e > s:
229 | t = (e - s) // 2 + s
230 | matcher = self._view_matchers[t]
231 | if matcher.view_cls == view_cls:
232 | return matcher
233 | tid = VIEW_ORDER[matcher.view_cls]
234 | vid = VIEW_ORDER[view_cls]
235 | if tid < vid:
236 | s = t + 1
237 | else:
238 | e = t
239 |
240 | matcher = get_view_matcher_cls(view_cls)(view_cls)
241 | self._view_matchers.insert(e, matcher)
242 | return matcher
243 |
244 | @property
245 | def pattern(self):
246 | return self.value
247 |
248 | def add_child(self, pattern):
249 | child, is_new = super(PatternMatchNode, self).add_child(
250 | (pattern, pattern))
251 | if is_new:
252 | matcher = self._get_matcher(child.view_cls)
253 | matcher.add_match_node(child)
254 | return child, is_new
255 |
256 | def __lt__(self, other):
257 | if id(self) == id(other) or self.parrent is None:
258 | return False
259 | if self.pattern == other.pattern:
260 | return self.parrent < other.parrent
261 | return self.pattern < other.pattern
262 |
263 |
264 | class PatternMatcher(object):
265 | """Offer match processing APIs.
266 |
267 | Common procedure:
268 | 1. Init a PatternMatcher.
269 |         2. Load pattern strings.
270 |         3. Match URLs.
271 | """
272 |
273 | def __init__(self):
274 | self._parser = PieceParser()
275 | self._matchers = {}
276 |
277 | def load(self, url_pattern_string, meta=None):
278 | """Load URL pattern string.
279 |
280 | Args:
281 | url_pattern_string (str): URL pattern string.
282 | meta (any, optional): Defaults to None. It will bind to
283 | matched result's meta property.
284 |
285 | Returns:
286 |             tuple: A 2-tuple, (node, is_new).
287 | """
288 | url_meta, parsed_patterns = parse(url_pattern_string)
289 | if not isinstance(parsed_patterns[0], MatchPattern):
290 | raise ValueError('Invalid URL pattern')
291 | sid = fuzzy_digest(url_meta, parsed_patterns)
292 | if sid not in self._matchers:
293 | self._matchers[sid] = Matcher(url_meta)
294 | matcher = self._matchers[sid]
295 | return matcher.load(parsed_patterns, meta=meta)
296 |
297 | def match(self, url):
298 | """Match url, get the matched results.
299 |
300 | Args:
301 | url (str): The URL to be matched.
302 |
303 | Returns:
304 | list: List of matched pattern nodes, if no match return [].
305 | Bound meta data can be accessed with node.meta.
306 | """
307 | url_meta, parsed_pieces = parse(url)
308 | sid = fuzzy_digest(url_meta, parsed_pieces)
309 | if sid in self._matchers:
310 | return self._matchers[sid].match(parsed_pieces)
311 | return []
312 |
313 |
314 | class Matcher(object):
315 | """Low-level APIs for matching.
316 |
317 |     It is assumed this is only used for same-fuzzy-digest matching.
318 | """
319 |
320 | def __init__(self, url_meta):
321 | self._url_meta = url_meta
322 | self._root = PatternMatchNode(EMPTY_MATCH_PATTERN)
323 |
324 | @property
325 | def url_meta(self):
326 | """URLMeta: The URLMeta object."""
327 | return self._url_meta
328 |
329 | def match(self, parsed_pieces):
330 |         """Match URL parsed pieces.
331 |
332 | Args:
333 | parsed_pieces (sequence): URL parsed pieces.
334 |
335 | Returns:
336 | list: List of matched pattern nodes, if no match return [].
337 | Bound meta data can be accessed with node.meta.
338 | """
339 |
340 | matched_nodes = []
341 | self._root.match(parsed_pieces, 0, matched_nodes)
342 | return matched_nodes
343 |
344 | def load(self, parsed_patterns, meta=None):
345 | """Load from parsed URL pattern.
346 |
347 | Args:
348 |             parsed_patterns (sequence): MatchPattern objects.
349 | meta (any, optional): Defaults to None. It will bind to
350 | matched result's meta property.
351 |
352 | Returns:
353 | tuple: 2-tules, (node, is_new).
354 |             tuple: A 2-tuple, (node, is_new).
355 | return build_tree(self._root, parsed_patterns, meta=meta)
356 |
--------------------------------------------------------------------------------
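Usage sketch of the load-then-match procedure (pattern, URL and meta
value are examples): meta bound at load time comes back on the matched
nodes.

    # Illustrative only: matching a URL against a loaded pattern.
    from os_urlpattern.pattern_matcher import PatternMatcher

    matcher = PatternMatcher()
    matcher.load('/x/[0-9]{3}', meta='three-digit-page')
    for matched in matcher.match('http://example.com/x/123'):
        print(matched.meta)  # the meta bound at load time
    # A URL with a different fuzzy digest simply yields no matches:
    print(matcher.match('http://example.com/x/abc'))  # []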
/src/os_urlpattern/piece_pattern_node.py:
--------------------------------------------------------------------------------
1 | """Raw parsed piece tree.
2 |
3 | Build a tree from the parsed URL pieces.
4 | """
5 | from __future__ import unicode_literals
6 |
7 | from .compat import itervalues
8 | from .parse_utils import EMPTY_PARSED_PIECE
9 | from .pattern import Pattern
10 | from .utils import TreeNode, build_tree
11 |
12 |
13 | class PiecePatternNode(TreeNode):
14 | """Node for building raw piece tree."""
15 |
16 | __slots__ = ('_pattern',)
17 |
18 | def __init__(self, parsed_piece_and_pattern):
19 | parsed_piece, self._pattern = parsed_piece_and_pattern
20 | super(PiecePatternNode, self).__init__(parsed_piece)
21 |
22 | def set_pattern(self, pattern):
23 | self._pattern = pattern
24 |
25 | @property
26 | def pattern(self):
27 | if self._pattern is None:
28 | self._pattern = Pattern(self.piece)
29 | return self._pattern
30 |
31 | @property
32 | def piece(self):
33 | return self.parsed_piece.piece
34 |
35 | @property
36 | def parsed_piece(self):
37 | return self.value
38 |
39 | @property
40 | def children_num(self):
41 | return len(self._children)
42 |
43 | def incr_count(self, count, recur=False):
44 | self.count += count
45 | node = self.parrent if recur else None
46 | while node:
47 | node.incr_count(count)
48 | node = node.parrent
49 |
50 | def __str__(self):
51 | return ' '.join((self.piece, str(self.pattern)))
52 |
53 | def add_meta(self, data):
54 | if data is None:
55 | return
56 | if self.meta is None:
57 | self.meta = set()
58 | self.meta.add(data)
59 |
60 | def update_meta(self, data):
61 | if not data:
62 | return
63 | if self.meta is None:
64 | self.meta = set()
65 | self.meta.update(data)
66 |
67 |
68 | def build_from_parsed_pieces(root, parsed_pieces, count=1, meta=None, uniq=True):
69 | """Build piece pattern tree from parsed pieces.
70 |
71 | Args:
72 |         root (PiecePatternNode): The root node of a tree.
73 |         parsed_pieces (sequence): The parsed pieces.
74 |         count (int, optional): Defaults to 1.
75 |         meta (any, optional): Defaults to None. The meta data to bind to the leaf node.
76 |         uniq (bool, optional): Defaults to True. If True, loading a duplicated path does not increase node counts.
77 |
78 | Returns:
79 | tuple: 2-tuple, (leaf_node, is_new)
80 | """
81 | node, is_new = build_tree(root, [(parsed_piece.piece, (parsed_piece, None))
82 | for parsed_piece in parsed_pieces], count)
83 | if uniq and not is_new:
84 | node.incr_count(0 - count, True)
85 | node.add_meta(meta)
86 | return node, is_new
87 |
88 |
89 | def build_from_piece_pattern_nodes(root, piece_pattern_nodes):
90 |     """Build a piece pattern tree from a root-to-leaf path of piece pattern nodes.
91 | 
92 |     Args:
93 |         root (PiecePatternNode): The root node of a tree.
94 |         piece_pattern_nodes (sequence): A root-to-leaf path of piece pattern nodes.
95 |
96 | Returns:
97 | tuple: 2-tuple, (leaf_node, is_new)
98 | """
99 | last = piece_pattern_nodes[-1]
100 | node, is_new = build_tree(root, [(p.piece, (p.parsed_piece, p.pattern))
101 | for p in piece_pattern_nodes], last.count)
102 | node.update_meta(last.meta)
103 | return node, is_new
104 |
--------------------------------------------------------------------------------
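A short sketch of driving this module end to end, mirroring tests/test_piece_pattern_node.py: URLs are analyzed into pieces, parsed, and accumulated into a counted tree.

    from os_urlpattern.parse_utils import (EMPTY_PARSED_PIECE, PieceParser,
                                           analyze_url)
    from os_urlpattern.piece_pattern_node import (PiecePatternNode,
                                                  build_from_parsed_pieces)
    from os_urlpattern.utils import dump_tree

    parser = PieceParser()
    root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
    for url in ('http://example.com/abc/1', 'http://example.com/abc/2'):
        _, pieces = analyze_url(url)
        build_from_parsed_pieces(root, [parser.parse(p) for p in pieces])

    # Each dumped path runs from the (empty) root down to one leaf.
    for nodes in dump_tree(root):
        print('/'.join(node.piece for node in nodes[1:]))

--------------------------------------------------------------------------------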
/src/os_urlpattern/utils.py:
--------------------------------------------------------------------------------
1 | """Utilities.
2 | """
3 | import inspect
4 | import logging
5 | import math
6 | import os
7 | import time
8 | from functools import partial
9 |
10 | from .compat import iteritems, itervalues
11 |
12 |
13 | def pretty_counter(counter):
14 |     """Format a dict-like object.
15 | 
16 |     Args:
17 |         counter (dict): The dict-like object to be formatted.
18 |
19 | Returns:
20 | str: Formatted string.
21 | """
22 |
23 | return ", ".join(['{0}:{1}'.format(k, v) for k, v in iteritems(counter)])
24 |
25 |
26 | def pick(iterable):
27 |     """Get the first object from an iterable."""
28 |
29 | for obj in iterable:
30 | return obj
31 |
32 |
33 | class Bag(object):
34 |     """Container of unique objects.
35 | 
36 |     The objects in the bag can themselves be Bag instances.
37 |     Use the pick method to get one innermost object.
38 |     Use the iter_all method to iterate over the objects of all inner bags.
39 | """
40 |
41 | __slots__ = ('_objs',)
42 |
43 | def __init__(self):
44 | self._objs = set()
45 |
46 | def add(self, obj):
47 | self._objs.add(obj)
48 |
49 | def __len__(self):
50 | return len(self._objs)
51 |
52 | def pick(self):
53 | obj = pick(self)
54 | while isinstance(obj, Bag):
55 | obj = pick(obj)
56 | return obj
57 |
58 | def __iter__(self):
59 | return iter(self._objs)
60 |
61 | def iter_all(self):
62 | for obj in self:
63 | if isinstance(obj, Bag):
64 | for o in obj.iter_all():
65 | yield o
66 | else:
67 | yield obj
68 |
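# A short illustrative sketch of Bag nesting (kept as comments so the module
# body is unchanged): pick() drills into nested bags, iter_all() flattens them.
#
#     inner = Bag()
#     inner.add('a')
#     outer = Bag()
#     outer.add(inner)
#     outer.add('b')
#     outer.pick()                # 'a' or 'b', depending on set order
#     sorted(outer.iter_all())    # ['a', 'b']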
69 |
70 | class TreeNode(object):
71 | """Node of a tree."""
72 |
73 | __slots__ = ('parrent', '_children', 'count',
74 | 'value', 'meta', '_level')
75 |
76 | def __init__(self, value):
77 | self.parrent = None
78 | self.count = 0
79 | self.value = value
80 | self.meta = None
81 | self._level = None
82 | self._children = None
83 |
84 | def leaf(self):
85 | return not self._children
86 |
87 | @property
88 | def level(self):
89 | """int: The level from root."""
90 | if self._level is None:
91 | l = 0
92 | n = self.parrent
93 | while n is not None:
94 | l += 1
95 | n = n.parrent
96 | self._level = l
97 | return self._level
98 |
99 | @property
100 | def children(self):
101 | return itervalues(self._children if self._children is not None else {})
102 |
103 | def add_child(self, kv):
104 | """Add a node to the children data set.
105 |
106 | Args:
107 |             kv (pair): Key-value object; the key identifies a unique
108 |                 node, the value is the node's data.
109 |
110 | Returns:
111 | tuple: 2-tuple, (node, is_new).
112 | """
113 |
114 | if self._children is None:
115 | self._children = {}
116 | k, v = kv
117 | is_new = False
118 | if k not in self._children:
119 | self._children[k] = self.__class__(v)
120 | self._children[k].parrent = self
121 | is_new = True
122 | child = self._children[k]
123 | return child, is_new
124 |
125 |
126 | def build_tree(root, kv_sequence, count=1, meta=None):
127 |     """Build a tree.
128 | 
129 |     This function calls each node's add_child(kv) to build the tree.
130 | 
131 |     Args:
132 |         root (TreeNode): Root node of a tree.
133 |         kv_sequence (sequence): Key-value objects used to build the tree.
134 |         count (int, optional): Defaults to 1. Will increase each node's count.
135 |         meta (any, optional): Defaults to None. Will bind to the leaf node.
136 |
137 | Returns:
138 | tuple: 2-tuple, (node, is_new)
139 | """
140 |     node, is_new = root, False  # is_new stays False if kv_sequence is empty
141 | node.count += count
142 | for kv in kv_sequence:
143 | node, is_new = node.add_child(kv)
144 | node.count += count
145 | if meta is not None:
146 | node.meta = meta
147 |
148 | return node, is_new
149 |
150 |
151 | def dump_tree(root):
152 | """Dump each path of a tree.
153 |
154 | Args:
155 | root (TreeNode): The root node of a tree.
156 |
157 | Yields:
158 |         list: Nodes from root to leaf, forming one path.
159 | """
160 | olist = []
161 |
162 | def _dump(node, _nodes):
163 | _nodes.append(node)
164 | if node.leaf():
165 | yield _nodes
166 | return
167 | for child in node.children:
168 | for nodes in _dump(child, _nodes):
169 | yield nodes
170 | _nodes.pop(-1)
171 |
172 | for nodes in _dump(root, olist):
173 | yield nodes
174 |
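# Worked example for build_tree/dump_tree (as comments; each key-value pair
# uses the key for child identity and the value as node data):
#
#     root = TreeNode('root')
#     build_tree(root, [('a', 'a'), ('b', 'b')])
#     build_tree(root, [('a', 'a'), ('c', 'c')])
#     [[n.value for n in p] for p in dump_tree(root)]
#     # [['root', 'a', 'b'], ['root', 'a', 'c']]  (child order follows dict order)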
175 |
176 | class LogSpeedAdapter(logging.LoggerAdapter):
177 | """Logger adapter for speed logging.
178 |
179 |     Emit a record only once every interval calls, annotated with
180 |     the total count and the average speed. Use it as a context
181 |     manager ('with' statement) when logging huge loop processing.
182 |
183 | """
184 |
185 | def __init__(self, logger, interval):
186 | super(LogSpeedAdapter, self).__init__(logger, {})
187 | self._count = 0
188 |         assert interval > 0
189 | self._interval = interval
190 | self._start_time = time.time()
191 | self._replace()
192 |
193 | def _replace(self):
194 | for name in ['debug', 'info', 'warning', 'error', 'exception', 'critical']:
195 | setattr(self, name, partial(self._log, name))
196 | self.log = self._log
197 |
198 | def _log(self, name, msg, *args, **kwargs):
199 | self._count += 1
200 |
201 | if self._count % self._interval == 0:
202 | speed = self._speed()
203 | extra_msg = '{count} {speed:.1f}/s'.format(
204 | count=self._count, speed=speed)
205 | msg = ' '.join((msg, extra_msg))
206 | if isinstance(name, int):
207 | name = logging.getLevelName(name)
208 | getattr(self.logger, name)(msg, *args, **kwargs)
209 |
210 | def _speed(self):
211 | return self._count / (time.time() - self._start_time)
212 |
213 | def __enter__(self):
214 | self._start_time = time.time()
215 | return self
216 |
217 | def __exit__(self, exc_type, exc_value, exc_tb):
218 | pass
219 |
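# Usage sketch (the iterable `records` is hypothetical): only every
# `interval`-th call actually emits, annotated with count and speed.
#
#     speed_logger = LogSpeedAdapter(logging.getLogger(__name__), 10000)
#     with speed_logger:
#         for record in records:
#             speed_logger.debug('processing')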
220 |
221 | def used_memory():
222 |     """Human-readable memory usage of the current process.
223 |
224 | Returns:
225 | str: Memory usage.
226 | """
227 |
228 | try:
229 | import psutil
230 |     except ImportError:
231 | return '-'
232 | p = psutil.Process(os.getpid())
233 | memory = p.memory_info().rss
234 | return format_byte(memory)
235 |
236 |
237 | # global variables for format_byte
238 | _UNIT_SUFFIXES = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
239 | _LOG_1024 = math.log(1024)
240 | _SUFFIXES_LENGTH = len(_UNIT_SUFFIXES)
241 |
242 |
243 | def format_byte(value, precision=2):
244 |     """Format a byte size into human-readable form.
245 |
246 | Args:
247 | value (int): The byte size.
248 | precision (int, optional): Defaults to 2. Precision.
249 |
250 | Returns:
251 | str: Human readable format.
252 | """
253 |
254 | factor = float(10 ** precision)
255 |     suffix = min(int(math.log(value) / _LOG_1024), _SUFFIXES_LENGTH - 1)  # clamp to the largest suffix
256 | num = math.ceil(value / (1024.0 ** suffix) * factor) / factor
257 | return ''.join((str(num), _UNIT_SUFFIXES[suffix]))
258 |
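# Worked example: format_byte(123456) -> int(log(123456) / log(1024)) == 1,
# so the suffix is 'K'; ceil(123456 / 1024.0 * 100) / 100 == 120.57,
# giving '120.57K'.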
259 |
260 | class MemoryUsageFormatter(logging.Formatter):
261 |     """Formatter supporting the %(memory)s keyword."""
262 |
263 | def __init__(self, fmt=None, datefmt=None):
264 | super(MemoryUsageFormatter, self).__init__(fmt, datefmt)
265 | self._log_memory = True
266 | if fmt and '%(memory)s' not in fmt:
267 | self._log_memory = False
268 |
269 | def format(self, record):
270 | if self._log_memory and 'memory' not in record.__dict__:
271 | record.__dict__['memory'] = used_memory()
272 | return super(MemoryUsageFormatter, self).format(record)
273 |
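# Sketch: add %(memory)s to a format string to get the current RSS
# (via psutil when available, '-' otherwise) on every record.
#
#     handler = logging.StreamHandler()
#     handler.setFormatter(
#         MemoryUsageFormatter('%(asctime)s %(memory)s %(message)s'))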
274 |
275 | class cached_property(object):
276 |     """Decorator that caches a computed property on the instance."""
277 |
278 | def __init__(self, func):
279 | self.__doc__ = getattr(func, "__doc__")
280 | self.func = func
281 |
282 | def __get__(self, obj, cls):
283 | if obj is None:
284 | return self
285 |
286 | value = obj.__dict__[self.func.__name__] = self.func(obj)
287 | return value
288 |
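# Usage sketch (expensive_sum is hypothetical): the first access runs the
# function and stores the result in the instance __dict__, which shadows
# this non-data descriptor on later lookups.
#
#     class Report(object):
#         @cached_property
#         def total(self):
#             return expensive_sum()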
289 |
290 | def get_classes(module, base_cls, include_base_cls=True):
291 |     """Get specified classes from a module.
292 |
293 | Args:
294 | module (module): Where to find classes.
295 | base_cls (type): The base class.
296 | include_base_cls (bool, optional): Defaults to True.
297 | Whether include base class.
298 |
299 | Returns:
300 | list: The specified classes.
301 | """
302 | def is_class(c):
303 | return inspect.isclass(c) \
304 | and issubclass(c, base_cls) \
305 | and (include_base_cls or c != base_cls)
306 | return [c for _, c in inspect.getmembers(module, is_class)]
307 |
308 |
309 | def with_metaclass(meta, *bases):
310 | """Create a base class with a metaclass.
311 |
312 | From six.
313 | """
314 | # This requires a bit of explanation: the basic idea is to make a dummy
315 | # metaclass for one level of class instantiation that replaces itself with
316 | # the actual metaclass.
317 | class metaclass(type):
318 |
319 | def __new__(cls, name, this_bases, d):
320 | return meta(name, bases, d)
321 |
322 | @classmethod
323 | def __prepare__(cls, name, this_bases):
324 | return meta.__prepare__(name, bases)
325 | return type.__new__(metaclass, 'temporary_class', (), {})
326 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cfhamlet/os-urlpattern/9311aff896ad591b2a9123d256f629f5d142dfc6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/urls_example.txt:
--------------------------------------------------------------------------------
1 | http://example.com/01.html
2 | http://example.com/123/test01.html
3 | http://example.com/02.html
4 | http://example.com/456/test02.html
5 | http://example.com/03.html
6 | http://example.com/789/test03.html
7 |
--------------------------------------------------------------------------------
/tests/test_cmdline.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import shlex
4 | import subprocess
5 | import sys
6 |
7 | import pytest
8 |
9 | from os_urlpattern.cmdline import make, match
10 |
11 |
12 | def call(cmdline, env=None, **kwargs):
13 | if env is None:
14 | env = os.environ.copy()
15 | if env.get('COVERAGE', None) is not None:
16 | env['COVERAGE_PROCESS_START'] = os.path.abspath('.coveragerc')
17 |
18 | cmd = 'python -u %s %s' % (os.path.abspath(__file__), cmdline)
19 | proc = subprocess.Popen(shlex.split(cmd),
20 | stdout=subprocess.PIPE,
21 | stderr=subprocess.PIPE,
22 | cwd=os.getcwd(),
23 | env=env,
24 | **kwargs)
25 | stdout, stderr = proc.communicate()
26 | return stdout, stderr
27 |
28 |
29 | def test_make(tmpdir):
30 | num = 9
31 | urls = ['http://example.com/abc%02d?id=%02d#abc' %
32 | (i, i) for i in range(0, num)]
33 | data = "\n".join(urls)
34 | f = tmpdir.join('urls.txt')
35 | f.write(data)
36 | cmdline = 'make -i %s' % f.strpath
37 | stdout, _ = call(cmdline)
38 | assert b'/abc[0-9]{2}' in stdout
39 | assert urls[0].encode() in stdout
40 |
41 | cmdline = 'make -i %s -f pattern' % f.strpath
42 | stdout, _ = call(cmdline)
43 | assert b'/abc[0-9]{2}' in stdout
44 | assert urls[0].encode() not in stdout
45 |
46 | cmdline = 'make -i %s -f ete' % f.strpath
47 | stdout, _ = call(cmdline)
48 |     assert b' abc[0-9]{2}(%d) ' % num in stdout
49 |     assert b' [\\?]id=[0-9]{2}(%d) ' % num in stdout
50 |     assert b' - #abc(%d)' % num in stdout
51 |
52 |
53 | def test_make_digest_type_urls(tmpdir):
54 | urls = ['http://example.com/%s.html' % j for j in
55 | [hashlib.md5(str(i).encode()).hexdigest() for i in range(0, 9)]]
56 |
57 | data = "\n".join(urls)
58 | f = tmpdir.join('urls.txt')
59 | f.write(data)
60 | cmdline = 'make -i %s -f pattern ' % f.strpath
61 | stdout, _ = call(cmdline)
62 | assert b'[0-9a-z]{32}[\\.]html' in stdout
63 |
64 |
65 | def test_make_noise(tmpdir):
66 | urls = ['http://example.com/abc%02d?id=%02d#abc' %
67 | (i, i) for i in range(0, 8)]
68 | urls.append('http://example.com/abc009?id=09#abc')
69 |
70 | data = "\n".join(urls)
71 | f = tmpdir.join('urls.txt')
72 | f.write(data)
73 | cmdline = 'make -i %s -f pattern ' % f.strpath
74 | stdout, _ = call(cmdline)
75 | assert b'/abc[0-9]{2}' in stdout
76 | assert b'/abc009' in stdout
77 |
78 |
79 | def test_make_fuzzy(tmpdir):
80 | urls = [
81 | 'sdjfpewiefh',
82 | 'dfsdksd',
83 | 'dffalldsfisslkfdksd',
84 | 'didif',
85 | 'dif',
86 | ]
87 | urls = ['http://example.com/abc/' + i for i in urls]
88 | data = "\n".join(urls)
89 | f = tmpdir.join('urls01.txt')
90 | f.write(data)
91 | cmdline = 'make -i %s -f pattern ' % f.strpath
92 | stdout, _ = call(cmdline)
93 | assert b'/abc/[a-z]+' in stdout
94 |
95 | urls = [i + '.html' for i in urls]
96 | data = "\n".join(urls)
97 | f = tmpdir.join('urls02.txt')
98 | f.write(data)
99 | cmdline = 'make -i %s -f pattern ' % f.strpath
100 | stdout, _ = call(cmdline)
101 | assert b'/abc/[a-z]+[\\.]html' in stdout
102 |
103 |
104 | def test_match(tmpdir):
105 | pattern = b'/abc[0-9]{2}'
106 | fp = tmpdir.join('patterns.txt')
107 | fp.write(pattern)
108 |
109 | urls = ['http://example.com/abc%02d' % i for i in range(1, 10)]
110 | data = "\n".join(urls)
111 | fu = tmpdir.join('urls.txt')
112 | fu.write(data)
113 |
114 | cmdline = 'match -i %s -p %s' % (fu.strpath, fp.strpath)
115 | stdout, _ = call(cmdline)
116 |
117 | assert pattern in stdout
118 |
119 |
120 | if __name__ == "__main__":
121 | sys.path.insert(0, os.getcwd())
122 | if os.getenv('COVERAGE_PROCESS_START'):
123 | import coverage
124 | coverage.process_startup()
125 | cmds = {'make': make, 'match': match}
126 | cmds[sys.argv.pop(1)]()
127 |
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from os_urlpattern.config import get_default_config
4 |
5 |
6 | def test_get_default_config():
7 | config = get_default_config()
8 | assert config.getint('make', 'min_cluster_num') == 3
9 |
--------------------------------------------------------------------------------
/tests/test_formatter.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | import json
4 |
5 | import pytest
6 |
7 | from os_urlpattern.formatter import pformat
8 | from os_urlpattern.pattern_maker import PatternMaker
9 |
10 |
11 | @pytest.fixture(scope='function')
12 | def p_maker():
13 | p_maker = PatternMaker()
14 | for url in ['http://www.example.com/abc/%02d.html' % i for i in range(0, 10)]:
15 | p_maker.load(url, meta=url)
16 |
17 | return p_maker
18 |
19 |
20 | def test_inline(p_maker):
21 | for url_meta, clustered in p_maker.make():
22 | for o in pformat('inline', url_meta, clustered):
23 | assert '/abc/[0-9]{2}[\\.]html\thttp' in o
24 |
25 |
26 | def test_json(p_maker):
27 | for url_meta, clustered in p_maker.make():
28 | for o in pformat('json', url_meta, clustered):
29 | d = json.loads(o)
30 | assert d['ptn'] == '/abc/[0-9]{2}[\\.]html'
31 | assert d['cnt'] == 10
32 |
--------------------------------------------------------------------------------
/tests/test_parse_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from os_urlpattern.exceptions import (InvalidCharException,
4 | InvalidPatternException,
5 | IrregularURLException)
6 | from os_urlpattern.parse_utils import (PieceParser, URLMeta, analyze_url,
7 | analyze_url_pattern_string, digest,
8 | filter_useless, fuzzy_digest, normalize,
9 | pack, parse_pattern_string,
10 | parse_pattern_unit_string,
11 | parse_query_string, parse_url)
12 | from os_urlpattern.pattern import Pattern
13 |
14 |
15 | def test_normalize_str():
16 | data = [
17 | ('a', 'a'),
18 | ('ab=', 'ab[=]'),
19 | ('ab1=a', 'ab1[=]a'),
20 | ('ab==a', 'ab[=]{2}a'),
21 | ('ab=={a', 'ab[=]{2}[\\{]a'),
22 | ('=', '[=]'),
23 | ('==', '[=]{2}'),
24 | ('==+a', '[=]{2}[\\+]a'),
25 | ('\\', '[\\\\]'),
26 | ]
27 | for i, j in data:
28 | assert normalize(i) == j
29 |
30 |
31 | def test_parse_url():
32 | data = [
33 | ('http://www.test.com/', ('',), [('depth', 1)]),
34 | ('http://www.test.com/?', ('', ''), [('depth', 2)]),
35 | ('http://www.test.com/abc/def?k=v#xxx', ('abc', 'def', 'v', 'xxx'),
36 | [('depth', 4), ('has_fragment', True)]),
37 | ]
38 | for url, p, m in data:
39 | url_meta, parts = analyze_url(url)
40 | assert parts == p
41 | for k, v in m:
42 | assert getattr(url_meta, k) == v
43 | with pytest.raises(IrregularURLException):
44 | analyze_url('http://www.g.com')
45 |
46 |
47 | def test_parse_query_string():
48 | data = [
49 | ('a', ('',), ('a',)),
50 | ('a=', ('a=',), ('',)),
51 | ('a&b', ('a', 'b'), ('', '')),
52 | ('a=1', ('a=',), ('1',)),
53 | ('a=1&b=2', ('a=', 'b='), ('1', '2')),
54 | ]
55 | for q, k, v in data:
56 | assert parse_query_string(q) == (k, v)
57 |
58 | data = ['a&', 'a&&b', 'a=1&']
59 |
60 | for i in data:
61 | with pytest.raises(IrregularURLException):
62 | parse_query_string(i)
63 |
64 |
65 | def test_analyze_url():
66 | data = [
67 | ['http://www.g.com/test', ('path', '/test'),
68 | ('query', None), ('fragment', None)],
69 | ['http://www.g.com/test?',
70 | ('query', ''), ('fragment', None)],
71 | ['http://www.g.com/test?#',
72 | ('query', ''), ('fragment', '')],
73 | ['http://www.g.com/test?#abc',
74 | ('query', ''), ('fragment', 'abc')],
75 | ['http://www.g.com/test#abc',
76 | ('query', None), ('fragment', 'abc')],
77 | ['http://www.g.com/test?a#',
78 | ('query', 'a'), ('fragment', '')],
79 | ['http://www.g.com/test?a##',
80 | ('query', 'a'), ('fragment', '#')],
81 | ['http://www.g.com/test#?',
82 | ('query', None), ('fragment', '?')],
83 | ]
84 | for check in data:
85 | url = check[0]
86 | r = parse_url(url)
87 | for attr, expect in check[1:]:
88 | assert getattr(r, attr) == expect
89 |
90 |
91 | def test_filter_useless_part():
92 | data = [
93 | ('/', ['']),
94 | ('//', ['']),
95 | ('', ['']),
96 | ('/a/b', ['a', 'b']),
97 | ('/a/b/', ['a', 'b', '']),
98 | ('/a/b//', ['a', 'b', '']),
99 | ('/a/b///c', ['a', 'b', 'c']),
100 | ('a/b///c', ['a', 'b', 'c']),
101 | ]
102 | for s, expect in data:
103 | assert filter_useless(s.split('/')) == expect
104 |
105 |
106 | def test_piece_parser():
107 | parser = PieceParser()
108 | data = [
109 | ('abc', ('abc', ), ('a-z', )),
110 | ('abc.exe', ('abc', '[\\.]', 'exe'), ('a-z', '\\.', 'a-z')),
111 | ('%' * 10, ('[%]{10}', ), ('%', )),
112 | ('abc1D..exe', ('abc', '1', 'D',
113 | '[\\.]{2}', 'exe'), ('a-z', '0-9', 'A-Z', '\\.', 'a-z')),
114 | ('@<>..', ('[@]', '[<]', '[>]', '[\\.]{2}'), ('@', '<', '>', '\\.')),
115 | ]
116 | for piece, expected_pieces, expected_rules in data:
117 | parsed = parser.parse(piece)
118 | assert parsed.rules == expected_rules
119 | assert parsed.pieces == expected_pieces
120 | assert parsed.piece_length == len(piece)
121 | with pytest.raises(InvalidCharException):
122 | parser.parse(' a')
123 |
124 |
125 | def test_unpack_pack():
126 | data = [
127 | ('http://www.g.com/', '/'),
128 | ('http://www.g.com/abc', '/abc'),
129 | ('http://www.g.com/abc?a=1#c', '/abc[\\?]a=1#c'),
130 | ('http://www.g.com/abc???a=1#c', '/abc[\\?][\\?]{2}a=1#c'),
131 | ('http://www.g.com/abc?=1#c', '/abc[\\?]=1#c'),
132 | ('http://www.g.com/abc?a=1#', '/abc[\\?]a=1#'),
133 | ('http://www.g.com/abc?a=1&b=2#', '/abc[\\?]a=1&b=2#'),
134 | ]
135 | for url, expected in data:
136 | assert pack(*analyze_url(url)) == expected
137 |
138 |
139 | def test_url_meta():
140 | url_meta1 = URLMeta(1, ['key1', 'key2'], False)
141 | assert url_meta1.depth == 3
142 | url_meta2 = URLMeta(1, ['key1', 'key2'], True)
143 | assert url_meta2.depth == 4
144 |     url_meta3 = URLMeta(1, ['key1', 'key2'], False)
145 |     assert url_meta1 == url_meta3
146 |
147 | def test_parse_url_pattern():
148 | data = [
149 | 'http://www.g.com/',
150 | 'http://www.g.com/abc',
151 | 'http://www.g.com/abc?a=1#c',
152 | 'http://www.g.com/abc???a=1#c',
153 | 'http://www.g.com/abc?=1#c',
154 | 'http://www.g.com/abc?a=1#',
155 | 'http://www.g.com/abc?a=1&b=2#',
156 | ]
157 | for url in data:
158 | meta1, parts1 = analyze_url(url)
159 | pattern_string = pack(meta1, parts1)
160 | meta2, parts2 = analyze_url_pattern_string(pattern_string)
161 | assert meta1 == meta2
162 | assert len(parts1) == len(parts2)
163 |
164 |
165 | def test_parse_pattern_string():
166 | data = [
167 | ('abc', 1),
168 | ('[0-9]{2}abc', 2),
169 | ('abc[0-9]+', 2),
170 | ('abc[\\[\\?][a-z]', 3),
171 | ('', 1),
172 | ('abcAbc', 3),
173 | ]
174 | for p_str, num in data:
175 | ps = parse_pattern_string(p_str)
176 | assert ''.join([str(u) for u in ps]) == p_str
177 | assert len(ps) == num
178 |
179 | invalid_data = [
180 | '[a-z',
181 | 'a-z]',
182 | '[a-z]{-}',
183 | '[a-z]{-2}',
184 | '?',
185 | '[a-z]++',
186 | ]
187 |
188 | for data in invalid_data:
189 | with pytest.raises(InvalidPatternException):
190 | parse_pattern_string(data)
191 |
192 |
193 | def test_parse_pattern_unit_string():
194 | data = [
195 | ('[a-z]', set(['a-z']), 1),
196 | ('[a-z]+', set(['a-z']), -1),
197 | ('', set(['']), 1),
198 | ('[%\\+]{12}', set(['%', '\\+']), 12),
199 | ]
200 | for p_str, e_rules, e_num in data:
201 | rules, num = parse_pattern_unit_string(p_str)
202 | assert num == e_num
203 | assert rules == e_rules
204 |
205 | invalid_data = [
206 | '[z-a]',
207 | '[z-a]{abc}',
208 | '[z-a]{-1}',
209 | '[\\._]',
210 | '[0-9a-z]',
211 | ]
212 | for data in invalid_data:
213 | with pytest.raises(InvalidPatternException):
214 | parse_pattern_unit_string(data)
215 |
216 |
217 | def test_parse_url_pattern_string():
218 | patterns = [
219 | ('/AaBb/123456.shtml', '/[A-Za-z]+/[0-9]{6}[\\.]shtml'),
220 | ('/abc/123/index.html', '/abc/123/index[\\.]html'),
221 | ('/12345678/index.asp?id=123',
222 | '/[0-9]{8}/[a-z]+[\\.]asp[\\?]id=[0-9]+'),
223 | ('/newsShow.asp?dataID=1', '/newsShow[\\.]asp[\\?]dataID=[0-9]+'),
224 | ]
225 |
226 | for url, pattern in patterns:
227 | url = 'http://example.com' + url
228 | um1, pieces = analyze_url(url)
229 | um2, pattern_strings = analyze_url_pattern_string(pattern)
230 | assert um1 == um2
231 | for p, s in zip(pattern_strings, pieces):
232 | assert Pattern(p).match(s)
233 |
234 |
235 | def test_digest():
236 | parser = PieceParser()
237 | data = [
238 | ('/abc/', '/abcdef/'),
239 | ('/abc/index.html?k1=v1&k2=v2', '/abc/html.htm?k1=c01&k2=2m'),
240 | ('/abc/index.html?k1=v1#abc', '/abc/html.htm?k1=c01#def'),
241 | ]
242 |
243 | for urls in data:
244 | urls = ['http://example.com' + u for u in urls]
245 | digests = set()
246 | for url in urls:
247 | url_meta, pieces = analyze_url(url)
248 | parsed_pieces = [parser.parse(piece) for piece in pieces]
249 | sid = digest(url_meta, [p.fuzzy_rule for p in parsed_pieces])
250 | assert fuzzy_digest(url_meta, parsed_pieces) == sid
251 | digests.add(sid)
252 | assert len(digests) == 1
253 |
--------------------------------------------------------------------------------
/tests/test_parsed_piece_view.py:
--------------------------------------------------------------------------------
1 | from os_urlpattern.parsed_piece_view import (FuzzyView, LastDotSplitFuzzyView,
2 | LengthView, MixedView, MultiView,
3 | PieceView, view_cls_from_pattern)
4 | from os_urlpattern.pattern import Pattern
5 |
6 |
7 | def test_view_cls_from_pattern():
8 | data = [
9 | ('abc', PieceView, False),
10 | ('[a-z]{2}', LengthView, False),
11 | ('[a-z]+', FuzzyView, False),
12 | ('abc[A-Z]{2}', MultiView, False),
13 | ('[A-Za-z]{3}123', MixedView, False),
14 | ('[A-Za-z]+[\\.]html', LastDotSplitFuzzyView, True),
15 |         ('id[_][0-9A-Za-z]+[\\.][a-z]+', MixedView, True),
16 | ]
17 |
18 | for p_str, view_cls, is_last_path in data:
19 | assert view_cls_from_pattern(Pattern(p_str), is_last_path) == view_cls
20 |
--------------------------------------------------------------------------------
/tests/test_pattern.py:
--------------------------------------------------------------------------------
1 | from os_urlpattern.parse_utils import specify_rule, wildcard_rule
2 | from os_urlpattern.pattern import Pattern, PatternUnit
3 |
4 |
5 | def test_equal():
6 | p1 = Pattern('[a-z]+')
7 | p2 = Pattern('[a-z]+')
8 | p3 = Pattern('[a-z]')
9 | assert p1 == p2
10 | assert p1 != p3
11 |
12 |
13 | def test_fuzzy_rule():
14 | data = [
15 | ('123', '0-9'),
16 | ('abc', 'a-z'),
17 | ('a1b2c3', '0-9a-z'),
18 | ('a1b2c3D4', '0-9A-Za-z'),
19 |         ('a1[\\-]b2[\\-]c3[_]D4', '0-9A-Z\\-_a-z'),
20 | ('[a-z]+', 'a-z'),
21 | ]
22 |
23 | for s, r in data:
24 | p = Pattern(s)
25 | assert p.fuzzy_rule == r
26 | pw = Pattern(wildcard_rule(p.fuzzy_rule))
27 | assert pw.fuzzy_rule == r
28 | pn = Pattern(specify_rule(p.fuzzy_rule, 10))
29 | assert pn.fuzzy_rule == r
30 |
31 |
32 | def test_pattern_unit():
33 | data = [
34 | ('[a-z]+', 'a-z', -1, False),
35 | ('[a-z]{3}', 'a-z', 3, False),
36 | ('abc', 'a-z', 3, True),
37 | ('[0-9]', '0-9', 1, False),
38 | ('[\\.]{2}', '\\.', 2, True),
39 | ('[\\.]', '\\.', 1, True),
40 | ('[\\._]{2}', '\\._', 2, False),
41 | ]
42 |
43 | for s, fuzzy_rule, num, literal in data:
44 | pu = PatternUnit(s)
45 | assert pu.fuzzy_rule == fuzzy_rule
46 | assert pu.num == num
47 | assert pu.is_literal() == literal
48 |
--------------------------------------------------------------------------------
/tests/test_pattern_maker.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from os_urlpattern.config import get_default_config
4 | from os_urlpattern.parse_utils import pack
5 | from os_urlpattern.pattern_maker import PatternMaker
6 | from os_urlpattern.utils import dump_tree
7 |
8 |
9 | @pytest.fixture(scope='function')
10 | def config():
11 | return get_default_config()
12 |
13 |
14 | @pytest.fixture(scope='function')
15 | def pattern_maker(config):
16 | return PatternMaker(config)
17 |
18 |
19 | def test_load(config):
20 | pm = PatternMaker(config)
21 | urls = ['http://example.com' + u for u in ['/a', '/a/b', '/a/b/c']]
22 | for url in urls:
23 | pm.load(url, meta=url)
24 | assert len(list(pm.makers)) == len(urls)
25 | for _, clustered in pm.make():
26 | for nodes in dump_tree(clustered):
27 | assert len(nodes[-1].meta) == 1
28 |
29 | config.set('make', 'drop_url', 'true')
30 | pm = PatternMaker(config)
31 | urls = ['http://example.com' + u for u in ['/a', '/b', '/c']]
32 | for url in urls:
33 | pm.load(url)
34 | assert len(list(pm.makers)) == 1
35 | for _, clustered in pm.make():
36 | for nodes in dump_tree(clustered):
37 | assert nodes[-1].meta is None
38 |
39 |
40 | def cluster_and_test(urls, pattern_string):
41 | pm = PatternMaker(get_default_config())
42 | for url in urls:
43 | pm.load(url)
44 |
45 | for url_meta, clustered in pm.make(combine=True):
46 | for nodes in dump_tree(clustered):
47 | assert pack(
48 | url_meta, [n.value for n in nodes[1:]]) == pattern_string
49 |
50 |
51 | def test_make():
52 | urls = ['http://example.com' + u for u in ['/a01', '/b02', '/c03']]
53 | cluster_and_test(urls, '/[a-z][0-9]{2}')
54 | urls = ['http://example.com' + u for u in ['/3h4hd9s9w9d9',
55 | '/9s2m1m3j2d10', '/i2i2g4g23j0m']]
56 | cluster_and_test(urls, '/[0-9a-z]{12}')
57 | urls = [u + '.html' for u in urls]
58 | cluster_and_test(urls, '/[0-9a-z]{12}[\\.]html')
59 | urls = [u + '?id=%02d' % i for i, u in enumerate(urls, 1)]
60 | cluster_and_test(urls, '/[0-9a-z]{12}[\\.]html[\\?]id=[0-9]{2}')
61 |
62 |     urls = ['http://example.com' + u for u in ['/3h4hd9s9w9ddsadf9',
63 |                                                '/9s2m1m3j2d10',
64 |                                                '/i2i2g4g23j0dsdm']]
65 | cluster_and_test(urls, '/[0-9a-z]+')
66 |
--------------------------------------------------------------------------------
/tests/test_pattern_matcher.py:
--------------------------------------------------------------------------------
1 | from os_urlpattern.pattern_matcher import PatternMatcher
2 |
3 |
4 | def match(patterns, urls, num, most_match=None):
5 | pm = PatternMatcher()
6 | for pattern in patterns:
7 | pm.load(pattern)
8 | for url in urls:
9 | matched = pm.match(url)
10 |         assert len(matched) > num
11 |         if most_match:
12 |             matched.sort()
13 |             assert matched[-1].meta == most_match
14 |
15 |
16 | def test_match():
17 | urls = ['http://example.com/abc%02d' % i for i in range(1, 10)]
18 | patterns = [
19 | '/abc[0-9]{2}',
20 | '/abc[0-9]+',
21 | '/[a-z]+[0-9]{2}',
22 | '/[a-z]{3}[0-9]{2}',
23 | '/[0-9a-z]+',
24 | '/[0-9a-z]{5}',
25 | ]
26 | for pattern in patterns:
27 | match([pattern], urls, 0)
28 | match(patterns, urls, 3, '/abc[0-9]{2}')
29 |
--------------------------------------------------------------------------------
/tests/test_piece_pattern_node.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | from os_urlpattern.parse_utils import (EMPTY_PARSED_PIECE, PieceParser,
4 | analyze_url)
5 | from os_urlpattern.piece_pattern_node import (PiecePatternNode,
6 | build_from_parsed_pieces,
7 | build_from_piece_pattern_nodes)
8 | from os_urlpattern.utils import dump_tree, pick
9 |
10 |
11 | def test_count():
12 | num = 100
13 | urls = ['http://test.com/abc/%d' % i for i in range(num)]
14 | parser = PieceParser()
15 | root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
16 | for url in urls:
17 | _, pieces = analyze_url(url)
18 | parsed_pieces = [parser.parse(piece) for piece in pieces]
19 | build_from_parsed_pieces(root, parsed_pieces)
20 | assert root.count == num
21 | for url in urls:
22 | _, pieces = analyze_url(url)
23 | parsed_pieces = [parser.parse(piece) for piece in pieces]
24 | build_from_parsed_pieces(root, parsed_pieces)
25 | assert root.count == num
26 | root01 = PiecePatternNode((EMPTY_PARSED_PIECE, None))
27 | for nodes in dump_tree(root):
28 | build_from_piece_pattern_nodes(root01, nodes[1:])
29 | assert root01.count == num
30 |
31 | nodes = pick(dump_tree(root))
32 | assert nodes[-1].parrent.children_num == num
33 | assert str(nodes[-1].parrent.pattern) == "abc"
34 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (https://tox.readthedocs.io/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 |
6 | [tox]
7 | envlist = py{27,36,py,py3}, coverage-report
8 |
9 | [base]
10 | deps =
11 | pytest > 2.10
12 | coverage
13 | pytest-env
14 |
15 | [testenv]
16 | commands =
17 | coverage run -m pytest {posargs}
18 |
19 | deps =
20 | {[base]deps}
21 | six
22 | ete3
23 |
24 | [testenv:coverage-report]
25 | deps = coverage
26 | skip_install = true
27 | commands =
28 | coverage combine
29 | coverage report
30 |
31 | [testenv:codecov]
32 | passenv = CI TRAVIS TRAVIS_* APPVEYOR APPVEYOR_*
33 | deps = codecov
34 | skip_install = true
35 | commands =
36 | coverage combine
37 | coverage report
38 | codecov
39 |
40 |
--------------------------------------------------------------------------------