├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── pytest.ini ├── setup.cfg ├── setup.py ├── src └── os_urlpattern │ ├── VERSION │ ├── __init__.py │ ├── cmdline.py │ ├── compat.py │ ├── config │ ├── __init__.py │ └── default_config.cfg │ ├── definition.py │ ├── exceptions.py │ ├── formatter.py │ ├── parse_utils.py │ ├── parsed_piece_view.py │ ├── parser.py │ ├── pattern.py │ ├── pattern_cluster.py │ ├── pattern_maker.py │ ├── pattern_matcher.py │ ├── piece_pattern_node.py │ └── utils.py ├── tests ├── __init__.py ├── data │ └── urls_example.txt ├── test_cmdline.py ├── test_config.py ├── test_formatter.py ├── test_parse_utils.py ├── test_parsed_piece_view.py ├── test_pattern.py ├── test_pattern_maker.py ├── test_pattern_matcher.py └── test_piece_pattern_node.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | parallel = true 3 | branch = true 4 | source = os_urlpattern 5 | 6 | [paths] 7 | source = 8 | src/os_urlpattern 9 | .tox/*/lib/python*/site-packages/os_urlpattern 10 | .tox/*/site-packages/os_urlpattern 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # vscode 104 | .vscode/ 105 | 106 | # pytest 107 | .pytest_cache/ 108 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | matrix: 4 | include: 5 | - python: 2.7 6 | env: TOXENV=py27,codecov 7 | - python: 3.6 8 | env: TOXENV=py36,codecov 9 | - python: pypy 10 | env: TOXENV=pypy 11 | - python: pypy3 12 | env: TOXENV=pypy3 13 | install: 14 | - pip install -U pip tox 15 | script: 16 | - tox 17 | deploy: 18 | provider: pypi 19 | user: cfhamlet 20 | password: 21 | secure: eGq3kLUT6D3grZ2ZlCaJ5e/9Ma3HkOLZQDDcMsWUs/zUqpngI/9ibplgbOcxpRxKCgFKn5GFDV9ZsKk00fEfYWpe4WZW2vG6mu3k63oB4FMkUQ4GGoQKcXdR27aNtNhvTzU3VPDgyEpNI5QJmTLJp3Y3fbzcjL3a87kschf6B46MP4Nu3NqWuXZDYIZN6GY8HwD6J3Ii15nl4rCS6phdYdKckyVX8coNQVWkljx+ZtfGMkClsui9BynKBNVwufm3/F1zwWI1UXCrU3v4FxqiCmK2CYSX7tdFcGHaVTf0NqscbPxZgPvM+1tUBbW1M5N5GlUf5f7CxwtFWEqFTlz926gzYrHUaewmjILWDm6OxWAKjuks8lgywQq2twYpd8UVlRywvjfaobGpptoBevuxgr/uzipeckWR0X1SiqUaFnKzuLOnVeZ9I1ixA5zcIR74xnjEOvBnMpeawzZsIidoQcn4PRzbyaR4uDxnYyWB5yW/Q9d1UbAYOe0QyQY6NnZzvkRovkge3H/Wlk+K2P0qSUmmznWSDekdBcm4yr3bZsujgWOKS3c9L/OHH+P3YVAC1x0304xGveWt0cU/sfTPpEi99N+0QOxPQX3CnutFkXZIgR4nsGWnZYnMngrr8eHIfav+Ms20UTYwjsn79vfXc10kkesQtW863GdFXBYfw3c= 22 | on: 23 | tags: true 24 | condition: ${TRAVIS_PYTHON_VERSION} == 2.7 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ozzy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | include MANIFEST.in
4 | graft src
5 | graft tests
6 | global-exclude __pycache__
7 | global-exclude *.py[co]
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =============
2 | os-urlpattern
3 | =============
4 |
5 | .. image:: https://travis-ci.org/cfhamlet/os-urlpattern.svg?branch=master
6 |    :target: https://travis-ci.org/cfhamlet/os-urlpattern
7 |
8 | .. image:: https://codecov.io/gh/cfhamlet/os-urlpattern/branch/master/graph/badge.svg
9 |    :target: https://codecov.io/gh/cfhamlet/os-urlpattern
10 |
11 | .. image:: https://img.shields.io/pypi/pyversions/os-urlpattern.svg
12 |    :alt: PyPI - Python Version
13 |    :target: https://pypi.python.org/pypi/os-urlpattern
14 |
15 | .. image:: https://img.shields.io/pypi/v/os-urlpattern.svg
16 |    :alt: PyPI
17 |    :target: https://pypi.python.org/pypi/os-urlpattern
18 |
19 |
20 | This package is used for unsupervised URL clustering. Furthermore, it generates URL patterns (regexes)
21 | from the clusters for matching purposes. It is a pure-Python package tested under Python 2.7 and 3.6;
22 | `pypy <https://pypy.org>`_ can also be used for performance (4x-8x). Command line tools are provided
23 | for standalone clustering and matching, and the APIs are also convenient. Several extra packages can be
24 | installed for additional features. On CPython with a single CPU, clustering 100 thousand URLs takes
25 | about 1 minute and 200MB of memory. The built-in matching strategy is efficient enough for most use
26 | cases (about 4k/s, depending on pattern complexity).
27 |
28 | .. code:: console
29 |
30 |     $ pip install -U os-urlpattern
31 |     $ wget -qO- 'https://git.io/f4QlP' | pattern-make
32 |     /[0-9]{2}[\.]html
33 |         http://example.com/01.html
34 |         http://example.com/02.html
35 |         http://example.com/03.html
36 |     /[0-9]{3}/test[0-9]{2}[\.]html
37 |         http://example.com/123/test01.html
38 |         http://example.com/456/test02.html
39 |         http://example.com/789/test03.html
40 |
41 |
42 | ===============
43 | Acknowledgement
44 | ===============
45 |
46 | Similar URLs
47 | =============
48 |
49 | * URLs with the same **URL structure**.
50 |
51 | * Components of the parsed URLs at the same position are in the same **character space**.
52 |
53 | * Different types of characters may be in the same order in most cases.
54 |
55 |
56 | URL structure
57 | ==============
58 |
59 | Typically, a URL can be parsed into 6 components:
60 |
61 | ``<scheme>://<netloc>/<path>;<params>?<query>#<fragment>``
62 |
63 | Because different sites may have similar URL structures and ``<params>`` is rare,
64 | ``<scheme>``, ``<netloc>`` and ``<params>`` are ignored; ``<path>``, ``<query>`` and
65 | ``<fragment>`` are used to define the URL structure.
66 |
67 | If URLs have the same number of path levels, the same query keys (in the same order) and the same
fragment existence, their URL structure is considered the same.
68 |
69 | ::
70 |
71 |     http://example.com/p1/p2?k1=v1&k2=v2#pos
72 |
73 |     URL structure:
74 |       path levels: 2
75 |       query keys: k1, k2
76 |       have fragment: True
77 |
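The same structure can be inspected programmatically. A minimal sketch using this
package's parse utilities (``analyze_url`` and ``URLMeta`` live in
``os_urlpattern.parse_utils``; note that extracted query keys keep their trailing ``=``):

.. code:: python

    from os_urlpattern.parse_utils import analyze_url

    url_meta, pieces = analyze_url('http://example.com/p1/p2?k1=v1&k2=v2#pos')
    print(url_meta.path_depth)    # 2
    print(url_meta.query_keys)    # ('k1=', 'k2=')  -- keys keep the '='
    print(url_meta.has_fragment)  # True
    print(pieces)                 # ('p1', 'p2', 'v1', 'v2', 'pos')
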
78 | Character space
79 | ===============
80 |
81 | Consider `RFC 3986 (Section 2: Characters) <https://tools.ietf.org/html/rfc3986#section-2>`_,
82 | a URL with the following characters is legal:
83 |
84 | ``ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%<>\"{}^|``
85 |
86 | There are three major character spaces: lower-case letters (a-z), upper-case letters (A-Z)
87 | and digits (0-9). Other symbols are each in their own character space.
88 |
89 | ::
90 |
91 |     HeLlOwoRd666!
92 |
93 |     character space: a-z A-Z 0-9 !
94 |
95 | Order consideration
96 | =====================
97 |
98 | Split a string by character space; consecutive characters of the same space are joined. In most
99 | cases, the order is a distinguishing feature.
100 |
101 | ::
102 |
103 |     HELLOword666!
104 |
105 |     split into: HELLO word 666 !
106 |
107 |     character space order: A-Z a-z 0-9 !
108 |
109 |
110 | Mix
111 | =====================
112 | Consecutive fragments of the major character spaces can be mixed; in that case, order is less important.
113 |
114 | ::
115 |
116 |     HellWorld666!
117 |
118 |     split into: H ell W orld 666 !
119 |
120 |     major join: HellWorld666 !
121 |
122 |     character space order: A-Za-z0-9 !
123 |
124 | Because of URL quoting, '%' can be mixed with the major character spaces.
125 |
126 | ::
127 |
128 |     %E4%BD%A0%E5%A5%BD!
129 |
130 |     split into: % E 4 % BD % A 0 % E 5 % A 5 % BD !
131 |
132 |     major join: %E4%BD%A0%E5%A5%BD !
133 |
134 |     character space order: A-Z0-9% !
135 |
136 |
137 | URL pattern
138 | ============
139 |
140 | A URL pattern is used to express each cluster. It is a normal regex string. Each URL in
141 | the same cluster can be matched by the pattern.
142 |
143 | ::
144 |
145 |     pattern examples:
146 |
147 |     /news/[0-9]{8}/[a-z]+[\.]html
148 |     /newsShow[\.]asp[\?]dataID=[0-9]+
149 |     /thread[\-][0-9]+[\-][0-9][\-]1[\.]html
150 |
151 | The built-in matching strategy is strict; it doesn't tolerate incomplete matching.
152 |
153 | ::
154 |
155 |     string: helloword
156 |
157 |     pattern01: [a-z0-9]+  # no match, because there is no digit in the string
158 |     pattern02: [a-z]+     # match
159 |
160 |
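Because generated patterns are plain regexes, they can be checked with the standard
library alone. An illustrative sketch (``$`` is appended here only to imitate the
full-match behavior of the built-in strict strategy):

.. code:: python

    import re

    pattern = r'/news/[0-9]{8}/[a-z]+[\.]html$'
    print(bool(re.match(pattern, '/news/20180102/index.html')))  # True
    print(bool(re.match(pattern, '/news/2018/index.html')))      # False: needs 8 digits
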
161 | ========
162 | Install
163 | ========
164 |
165 | Install with pip
166 |
167 | ``$ pip install os-urlpattern``
168 |
169 | Install extra packages
170 |
171 | .. list-table::
172 |    :header-rows: 1
173 |
174 |    * - subpackage
175 |      - install command
176 |      - enables
177 |    * - memory
178 |      - ``pip install os-urlpattern[memory]``
179 |      - Show memory usage
180 |    * - ete-tree
181 |      - ``pip install os-urlpattern[ete-tree]``
182 |      - Enable `ete <http://etetoolkit.org>`_ pattern tree formatter
183 |
184 | ========
185 | Usage
186 | ========
187 |
188 | Command line
189 | =============
190 |
191 | * **pattern-make**
192 |
193 |   Load URLs, cluster and dump patterns.
194 |
195 |   .. code:: console
196 |
197 |     $ pattern-make -h
198 |     usage: pattern-make [-h] [-v] [-i INPUTS [INPUTS ...]]
199 |                         [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] [-c CONFIG]
200 |                         [-f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}]
201 |
202 |     optional arguments:
203 |       -h, --help            show this help message and exit
204 |       -v, --version         show program's version number and exit
205 |       -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]
206 |                             input files to be processed (default: stdin)
207 |       -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}
208 |                             log level (default: NOTSET)
209 |       -c CONFIG, --config CONFIG
210 |                             config file
211 |       -f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}, --formatter {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}
212 |                             output formatter (default: CLUSTER)
213 |
214 |   Dump clustered URLs with patterns:
215 |
216 |   .. code:: console
217 |
218 |     $ cat urls.txt | pattern-make -l debug > clustered.txt
219 |
220 |   Only generate URL patterns:
221 |
222 |   .. code:: console
223 |
224 |     $ cat urls.txt | pattern-make -l debug -f pattern > patterns.txt
225 |
226 |   Generate a pattern tree from URLs (requires `ete <http://etetoolkit.org>`_):
227 |
228 |   .. code:: console
229 |
230 |     $ cat urls.txt | pattern-make -l debug -f ete
231 |
232 | * **pattern-match**
233 |
234 |   Load patterns, match URLs and dump the matched results.
235 |
236 |   .. code:: console
237 |
238 |     $ pattern-match -h
239 |     usage: pattern-match [-h] [-v] [-i INPUTS [INPUTS ...]]
240 |                          [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] -p
241 |                          PATTERN_FILES [PATTERN_FILES ...] [-a]
242 |
243 |     optional arguments:
244 |       -h, --help            show this help message and exit
245 |       -v, --version         show program's version number and exit
246 |       -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]
247 |                             input files to be processed (default: stdin)
248 |       -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}
249 |                             log level (default: NOTSET)
250 |       -p PATTERN_FILES [PATTERN_FILES ...], --pattern-files PATTERN_FILES [PATTERN_FILES ...]
251 |                             pattern files to be loaded
252 |       -a, --all-matched     all matched patterns
253 |
254 |
255 |   Match URLs:
256 |
257 |   .. code:: console
258 |
259 |     $ cat urls.txt | pattern-match -l debug -p patterns.txt
260 |
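  Each output line is the matched result, a tab, then the URL; ``N`` marks no match and
  ``E`` a parse error (see ``_match`` in ``cmdline.py``). An illustrative session, assuming
  ``patterns.txt`` contains the pattern shown:

  .. code:: console

    $ echo 'http://example.com/123/test01.html' | pattern-match -p patterns.txt
    /[0-9]{3}/test[0-9]{2}[\.]html	http://example.com/123/test01.html
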
261 | APIs
262 | =====
263 |
264 | * Cluster and generate URL patterns:
265 |
266 |   .. code:: python
267 |
268 |     from os_urlpattern.formatter import pformat
269 |     from os_urlpattern.pattern_maker import PatternMaker
270 |
271 |     pattern_maker = PatternMaker()
272 |
273 |     # load URLs (unicode)
274 |     for url in urls:
275 |         pattern_maker.load(url)
276 |
277 |     # cluster and print patterns
278 |     for url_meta, clustered in pattern_maker.make():
279 |         for pattern in pformat('pattern', url_meta, clustered):
280 |             # do whatever you want
281 |             pass
282 |
283 |
284 | * Match URLs:
285 |
286 |   .. code:: python
287 |
288 |     from os_urlpattern.pattern_matcher import PatternMatcher
289 |
290 |     pattern_matcher = PatternMatcher()
291 |
292 |     # load url_pattern (unicode)
293 |     for url_pattern in url_patterns:
294 |         # meta will be bound to the matched result
295 |         pattern_matcher.load(url_pattern, meta=url_pattern)
296 |
297 |     # match URL (unicode)
298 |     for url in urls:
299 |         matched_results = pattern_matcher.match(url)
300 |         # the best matched result:
301 |         # sorted(matched_results, reverse=True)[0]
302 |         patterns = [n.meta for n in matched_results]
303 |
304 |
305 | * Low-level APIs:
306 |
307 |   It is necessary to use the low-level APIs for customizing the processing procedure,
308 |   especially for parallel computing or working on a distributed cluster (Hadoop).
309 |
310 |   **Key point: same fuzzy digest, same maker and same matcher.**
311 |
312 |   Use ``os_urlpattern.parser.fuzzy_digest`` to get the fuzzy digest from a URL,
313 |   a URL pattern, or a URLMeta with parsed pieces/patterns.
314 |
315 |   A brief all-in-one example:
316 |
317 |   .. code:: python
318 |
319 |     from __future__ import print_function, unicode_literals
320 |     from os_urlpattern.formatter import pformat
321 |     from os_urlpattern.parser import fuzzy_digest, parse
322 |     from os_urlpattern.pattern_maker import Maker
323 |     from os_urlpattern.pattern_matcher import Matcher
324 |
325 |     urls = ['http://t.com/%02d.html' % i for i in range(0, 10)]
326 |     makers = {}
327 |     matchers = {}
328 |
329 |     # Init makers from URLs (unicode).
330 |     for url in urls:
331 |         url_meta, parsed_pieces = parse(url)
332 |
333 |         # same digest, same maker
334 |         digest = fuzzy_digest(url_meta, parsed_pieces)
335 |         if digest not in makers:
336 |             makers[digest] = Maker(url_meta)
337 |         makers[digest].load(parsed_pieces)
338 |
339 |     # Iterate makers, do clustering, generate URL patterns and init matchers.
340 |     for maker in makers.values():
341 |         for clustered in maker.make():
342 |             for pattern in pformat('pattern', maker.url_meta, clustered):
343 |                 # init matchers
344 |                 url_meta, parsed_patterns = parse(pattern)
345 |                 digest = fuzzy_digest(url_meta, parsed_patterns)
346 |                 if digest not in matchers:
347 |                     matchers[digest] = Matcher(url_meta)
348 |                 matchers[digest].load(parsed_patterns, pattern)
349 |
350 |     # Match URLs (unicode).
351 |     for url in urls:
352 |         url_meta, parsed_pieces = parse(url)
353 |
354 |         # same digest, same matcher
355 |         digest = fuzzy_digest(url_meta, parsed_pieces)
356 |         if digest in matchers:
357 |             matched = [n.meta for n in matchers[digest].match(parsed_pieces)]
358 |             print(url, *matched, sep="\t")
359 |         else:  # no match at all
360 |             pass
361 |
362 |
363 |
364 | ============
365 | Unit Tests
366 | ============
367 |
368 | ``$ tox``
369 |
370 | ============
371 | License
372 | ============
373 |
374 | MIT licensed.
375 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -s --fulltrace -v
3 | env =
4 |     COVERAGE = true
5 |
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 |
4 | [metadata]
5 | description-file = README.rst
6 | license_file = LICENSE
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 |
4 | def read(*filenames, **kwargs):
5 |     import io
6 |     from os.path import join, dirname
7 |     encoding = kwargs.get('encoding', 'utf-8')
8 |     sep = kwargs.get('sep', '\n')
9 |     buf = []
10 |     for filename in filenames:
11 |         with io.open(join(dirname(__file__), filename), encoding=encoding) as f:
12 |             buf.append(f.read())
13 |     return sep.join(buf)
14 |
15 |
16 | setup(
17 |     name='os-urlpattern',
18 |     version=read('src/os_urlpattern/VERSION'),
19 |     packages=find_packages(where='src'),
20 |     package_dir={'': 'src'},
21 |     include_package_data=True,
22 |     license='MIT License',
23 |     description='Cluster URL patterns automatically.',
24 |     long_description=open('README.rst').read(),
25 |     author='Ozzy',
26 |     author_email='cfhamlet@gmail.com',
27 |     url='https://github.com/cfhamlet/os-urlpattern',
28 |     zip_safe=False,
29 |     entry_points={
30 |         'console_scripts': [
31 |             'pattern-make = os_urlpattern.cmdline:make',
32 |             'pattern-match = os_urlpattern.cmdline:match',
33 |         ]
34 |     },
35 |     extras_require={
36 |         'memory': ['psutil'],
37 |         'ete-tree': ['six', 'ete3']
38 |     },
39 |     classifiers=[
40 |         'Development Status :: 2 - Pre-Alpha',
41 |         'Intended Audience :: Developers',
42 |         'License :: OSI Approved :: MIT License',
43 |         'Natural Language :: English',
44 |         'Programming Language :: Python :: 2',
45 |         'Programming Language :: Python :: 2.7',
46 |         'Programming Language :: Python :: 3',
47 |         'Programming Language :: Python :: 3.6',
48 |         'Programming Language :: Python :: Implementation :: CPython',
49 |         'Programming Language :: Python :: Implementation :: PyPy',
50 |     ])
51 |
--------------------------------------------------------------------------------
/src/os_urlpattern/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.11
2 |
--------------------------------------------------------------------------------
/src/os_urlpattern/__init__.py:
--------------------------------------------------------------------------------
1 | """os-urlpattern.
2 |
3 | Unsupervised URL clustering; generate and match URL patterns.
4 | """
5 | import sys
6 | __all__ = ['__version__', 'version_info']
7 |
8 | import pkgutil
9 | __version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
10 | version_info = tuple(int(v) if v.isdigit() else v
11 |                      for v in __version__.split('.'))
12 |
13 | if sys.version_info < (2, 7):
14 |     sys.exit("os-urlpattern %s requires Python 2.7 or later" % __version__)
15 |
16 | del pkgutil
17 | del sys
--------------------------------------------------------------------------------
/src/os_urlpattern/cmdline.py:
--------------------------------------------------------------------------------
1 | """Command line tools.
2 |
3 | pattern-make:
4 |     Load URLs, cluster them, then generate URL patterns.
5 |
6 | pattern-match:
7 |     Load patterns, match URLs and get the matched results.
8 |
9 | """
10 | from __future__ import print_function, unicode_literals
11 |
12 | import argparse
13 | import logging.config
14 | import sys
15 | import time
16 | from collections import Counter
17 | from itertools import chain
18 |
19 | from . import __version__
20 | from .compat import binary_stdin, binary_stdout
21 | from .config import get_default_config
22 | from .definition import DEFAULT_ENCODING
23 | from .exceptions import (InvalidCharException, InvalidPatternException,
24 |                          IrregularURLException)
25 | from .formatter import FORMATTERS, pformat
26 | from .pattern_maker import PatternMaker
27 | from .pattern_matcher import PatternMatcher
28 | from .utils import LogSpeedAdapter, MemoryUsageFormatter, pretty_counter
29 |
30 | _DEFAULT_LOGGING = {
31 |     'version': 1,
32 |     'disable_existing_loggers': True,
33 |     'incremental': True,
34 | }
35 |
36 |
37 | def _config_logging(log_level):
38 |     logging.config.dictConfig(_DEFAULT_LOGGING)
39 |     if log_level == 'NOTSET':
40 |         handler = logging.NullHandler()
41 |     else:
42 |         handler = logging.StreamHandler()
43 |         formatter = MemoryUsageFormatter(
44 |             fmt='[%(asctime)s] [%(name)s] [%(levelname)s] [%(memory)s] %(message)s',
45 |             datefmt='%Y-%m-%d %H:%M:%S',
46 |         )
47 |         logging.root.setLevel(logging.NOTSET)
48 |         handler.setFormatter(formatter)
49 |     handler.setLevel(log_level)
50 |     logging.root.addHandler(handler)
51 |
52 |
53 | class Command(object):
54 |     def __init__(self, config=None):
55 |         self._config = config
56 |         self._logger = logging.getLogger(self.__class__.__name__)
57 |
58 |     def add_argument(self, parser):
59 |
60 |         parser.add_argument('-v', '--version',
61 |                             action='version',
62 |                             version='%(prog)s {version}'.format(
63 |                                 version=__version__)
64 |                             )
65 |
66 |         parser.add_argument('-i', '--inputs',
67 |                             help='input files to be processed (default: stdin)',
68 |                             nargs='+',
69 |                             type=argparse.FileType('rb'),
70 |                             default=[binary_stdin],
71 |                             dest='inputs')
72 |
73 |         parser.add_argument('-l', '--loglevel',
74 |                             help='log level (default: NOTSET)',
75 |                             default='NOTSET',
76 |                             action='store',
77 |                             dest='log_level',
78 |                             choices=['NOTSET', 'DEBUG', 'INFO',
79 |                                      'WARN', 'ERROR', 'FATAL'],
80 |                             type=lambda s: s.upper())
81 |
82 |     def process_args(self, args):
83 |         _config_logging(args.log_level)
84 |
85 |     def run(self, args):
86 |         raise NotImplementedError
87 |
88 |
89 | class MakePatternCommand(Command):
90 |
91 |     def process_args(self, args):
92 |         super(MakePatternCommand, self).process_args(args)
93 |         if args.config:
94 |             self._config.readfp(args.config[0])
95 |
96 |     def add_argument(self, parser):
97 |         super(MakePatternCommand, self).add_argument(parser)
98 |         parser.add_argument('-c', '--config',
99 |                             help='config file',
100 |                             nargs=1,
101 |                             type=argparse.FileType('r'),
102 |                             dest='config')
103 |
104 |         parser.add_argument('-f', '--formatter',
105 |                             help='output formatter (default: CLUSTER)',
106 |                             default='CLUSTER',
107 |                             action='store',
108 |                             dest='format_type',
109 |                             choices=FORMATTERS.keys(),
110 |                             type=lambda s: s.upper())
111 |
112 |     def _load(self, pattern_maker, args):
113 |         load_url = args.format_type in ('CLUSTER', 'INLINE')
114 |         stats = Counter()
115 |         with LogSpeedAdapter(self._logger, 5000) as speed_logger:
116 |             load = pattern_maker.load
117 |             for line in chain.from_iterable(args.inputs):
118 |                 speed_logger.debug('[LOADING]')
119 |                 stats['ALL'] += 1
120 |                 line = line.strip()
121 |                 if not line:
122 |                     stats['EMPTY'] += 1
123 |                     continue
124 |                 try:
125 |                     url = line.decode(DEFAULT_ENCODING)
126 |                     _, is_new = load(url, meta=url if load_url else None)
127 |                     if is_new:
128 |                         stats['UNIQ'] += 1
129 |                     stats['VALID'] += 1
130 |                 except (InvalidPatternException,
131 |                         IrregularURLException,
132 |                         InvalidCharException,
133 |                         UnicodeDecodeError,
134 |                         ValueError) as e:
135 |                     self._logger.warn('%s, %r', str(e), line)
136 |                     stats['INVALID'] += 1
137 |                     continue
138 |                 except Exception as e:
139 |                     self._logger.error('%s, %r', str(e), line)
140 |                     stats['INVALID'] += 1
141 |                     continue
142 |         self._logger.debug('[LOADED] %s', pretty_counter(stats))
143 |
144 |     def _process(self, pattern_maker, args):
145 |         combine = args.format_type == 'ETE'
146 |         s = time.time()
147 |         for maker in pattern_maker.makers:
148 |             for root in maker.make(combine):
149 |                 e = time.time()
150 |                 self._logger.debug('[CLUSTER] %d %.2fs', root.count, e - s)
151 |                 for record in pformat(args.format_type, maker.url_meta, root):
152 |                     print(record)
153 |                 s = time.time()
154 |
155 |     def run(self, args):
156 |         pattern_maker = PatternMaker(self._config)
157 |         self._load(pattern_maker, args)
158 |         self._process(pattern_maker, args)
159 |
160 |
161 | class MatchPatternCommand(Command):
162 |     def __init__(self):
163 |         super(MatchPatternCommand, self).__init__()
164 |
165 |     def add_argument(self, parser):
166 |         super(MatchPatternCommand, self).add_argument(parser)
167 |         parser.add_argument('-p', '--pattern-files',
168 |                             help='pattern files to be loaded',
169 |                             nargs='+',
170 |                             type=argparse.FileType('rb'),
171 |                             required=True,
172 |                             dest='pattern_files')
173 |
174 |         parser.add_argument('-a', '--all-matched',
175 |                             help='all matched patterns',
176 |                             default=False,
177 |                             action='store_true',
178 |                             dest='all_matched')
179 |
180 |     def _load(self, pattern_matcher, args):
181 |         stats = Counter()
182 |         p_inputs = args.pattern_files
183 |         self._logger.debug('[LOAD] %d pattern file%s: %s',
184 |                            len(p_inputs),
185 |                            's' if len(p_inputs) > 1 else '',
186 |                            ', '.join([p.name for p in p_inputs]))
187 |         with LogSpeedAdapter(self._logger, 1000) as speed_logger:
188 |             load = pattern_matcher.load
189 |             for line in chain.from_iterable(p_inputs):
190 |                 speed_logger.debug('[LOADING]')
191 |                 stats['ALL'] += 1
192 |                 line = line.rstrip()
193 |                 if not line.startswith(b'/'):
194 |                     stats['UNKNOWN'] += 1
195 |                     continue
196 |                 try:
197 |                     pattern = line.decode(DEFAULT_ENCODING)
198 |                     load(pattern, meta=pattern)
199 |                     stats['VALID'] += 1
200 |                 except Exception as e:
201 |                     self._logger.warn("%s, %r", str(e), line)
202 |                     stats['INVALID'] += 1
203 |         self._logger.debug('[LOAD] Finished %s', pretty_counter(stats))
204 |
205 |     def _match_result(self, pattern_matcher, raw_url, args):
206 |         result = None
207 |         try:
208 |             url = raw_url.decode(DEFAULT_ENCODING)
209 |             result = pattern_matcher.match(url)
210 |             if not args.all_matched:
211 |                 result = sorted(result, reverse=True)
212 |                 result = result[:1]
213 |             result = '\t'.join([r.meta for r in result]
214 |                                ).encode(DEFAULT_ENCODING)
215 |         except (InvalidPatternException,
216 |                 IrregularURLException,
217 |                 InvalidCharException,
218 |                 UnicodeDecodeError,
219 |                 ValueError) as e:
220 |             result = b'E'
221 |             self._logger.warn("%s, %r", str(e), raw_url)
222 |         except Exception as e:
223 |             result = b'E'
224 |             self._logger.error("%s, %r", str(e), raw_url)
225 |         return result
226 |
227 |     def _match(self, pattern_matcher, args):
228 |         speed_logger = LogSpeedAdapter(self._logger, 5000)
229 |         write = binary_stdout.write
230 |         for line in chain.from_iterable(args.inputs):
231 |             speed_logger.debug('[MATCHING]')
232 |             line = line.strip()
233 |             result = self._match_result(pattern_matcher, line, args)
234 |             if not result:
235 |                 result = b'N'
236 |             write(result)
237 |             write(b'\t')
238 |             write(line)
239 |             write(b'\n')
240 |
241 |     def run(self, args):
242 |         pattern_matcher = PatternMatcher()
243 |         self._load(pattern_matcher, args)
244 |         self._match(pattern_matcher, args)
245 |
246 |
247 | def _execute(command, argv=None):
248 |     argv = argv or sys.argv
249 |     parser = argparse.ArgumentParser()
250 |     command.add_argument(parser)
251 |     args = parser.parse_args(argv[1:])
252 |     command.process_args(args)
253 |     command.run(args)
254 |
255 |
256 | def make(argv=None):
257 |     _execute(MakePatternCommand(get_default_config()), argv)
258 |
259 |
260 | def match(argv=None):
261 |     _execute(MatchPatternCommand(), argv)
262 |
--------------------------------------------------------------------------------
/src/os_urlpattern/compat.py:
--------------------------------------------------------------------------------
1 | """Compatibility imports.
2 | """
3 |
4 | from __future__ import unicode_literals
5 | import operator
6 | import string
7 | import sys
8 |
9 | _PY3 = sys.version_info[0] >= 3
10 |
11 | if _PY3:
12 |     from io import StringIO
13 |     iteritems = operator.methodcaller("items")
14 |     itervalues = operator.methodcaller("values")
15 |     from urllib.parse import urlparse, ParseResult
16 |     from configparser import ConfigParser
17 |     binary_stdin = sys.stdin.buffer
18 |     binary_stdout = sys.stdout.buffer
19 | else:
20 |     try:
21 |         from cStringIO import StringIO  # safe, only processes ascii
22 |     except ImportError:
23 |         from StringIO import StringIO
24 |     iteritems = operator.methodcaller("iteritems")
25 |     itervalues = operator.methodcaller("itervalues")
26 |     from urlparse import urlparse, ParseResult
27 |     from ConfigParser import ConfigParser
28 |     binary_stdin = sys.stdin
29 |     binary_stdout = sys.stdout
--------------------------------------------------------------------------------
/src/os_urlpattern/config/__init__.py:
--------------------------------------------------------------------------------
1 | """Configuration.
2 | """
3 | from ..compat import ConfigParser
4 |
5 | def get_default_config():
6 |     """Get the default configuration instance.
7 |
8 |     Returns:
9 |         ConfigParser -- the default configuration instance
10 |     """
11 |     import os
12 |     path = os.path.dirname(__file__)
13 |     cfg = ConfigParser()
14 |     cfg.read(os.path.join(path, 'default_config.cfg'))
15 |     return cfg
16 |
--------------------------------------------------------------------------------
/src/os_urlpattern/config/default_config.cfg:
--------------------------------------------------------------------------------
1 | [make]
2 | min_cluster_num = 3
--------------------------------------------------------------------------------
/src/os_urlpattern/definition.py:
--------------------------------------------------------------------------------
1 | """Definitions of global constant variables.
2 | """
3 |
4 | from __future__ import unicode_literals
5 |
6 | import hashlib
7 | import string
8 |
9 | from .pattern import Pattern
10 |
11 | DEFAULT_ENCODING = 'UTF-8'
12 |
13 |
14 | class Symbols(object):
15 |     PLUS = '+'
16 |     EMPTY = ''
17 |     SLASH = '/'
18 |     EQUALS = '='
19 |     NUMBER = '#'
20 |     PERCENT = '%'
21 |     QUESTION = '?'
22 |     BRACES_L = '{'
23 |     BRACES_R = '}'
24 |     AMPERSAND = '&'
25 |     BACKSLASH = '\\'
26 |     BRACKETS_L = '['
27 |     BRACKETS_R = ']'
28 |
29 |
30 | class BasePatternRule(object):
31 |     DIGIT = '0-9'
32 |     BASE_ASCII_LOWER = 'a-z'
33 |     BASE_ASCII_UPPER = 'A-Z'
34 |     BASE_ASCII = 'A-Za-z'
35 |     BASE_DIGIT_AND_ASCII_LOWER = '0-9a-z'
36 |     BASE_DIGIT_AND_ASCII_UPPER = '0-9A-Z'
37 |     BASE_DIGIT_AND_ASCII = '0-9A-Za-z'
38 |     SINGLE_DIGIT = '[0-9]'
39 |     SINGLE_ASCII_LOWER = '[a-z]'
40 |     SINGLE_ASCII_UPPER = '[A-Z]'
41 |     MULTI_DIGIT = '[0-9]+'
42 |     MULTI_ASCII_LOWER = '[a-z]+'
43 |     MULTI_ASCII_UPPER = '[A-Z]+'
44 |     MULTI_ASCII = '[A-Za-z]+'
45 |     MULTI_DIGIT_AND_ASCII_LOWER = '[0-9a-z]+'
46 |     MULTI_DIGIT_AND_ASCII_UPPER = '[0-9A-Z]+'
47 |     MULTI_DIGIT_AND_ASCII = '[0-9A-Za-z]+'
48 |     DOT = '\\.'
49 |     EMPTY = ''
50 |     SINGLE_QUESTION = '[\\?]'
51 |
52 |
53 | ZERO_DIGEST = hashlib.md5(b'0').hexdigest().upper()
54 | QUERY_PART_RESERVED_CHARS = frozenset([Symbols.EQUALS])
55 | EMPTY_TUPLE = ()
56 | BLANK_TUPLE = (BasePatternRule.EMPTY,)
57 |
58 | # rules for the 26 letters
59 | CHAR_AND_RULE_LIST = []
60 | ASCII_AND_RULE_LIST = []
61 | ASCII_AND_RULE_LIST.extend([(i, BasePatternRule.BASE_ASCII_LOWER)
62 |                             for i in string.ascii_lowercase])
63 | ASCII_AND_RULE_LIST.extend([(i, BasePatternRule.BASE_ASCII_UPPER)
64 |                             for i in string.ascii_uppercase])
65 | CHAR_AND_RULE_LIST.extend(ASCII_AND_RULE_LIST)
66 |
67 | # digit rules
68 | DIGIT_AND_RULE_LIST = [(i, BasePatternRule.DIGIT)
69 |                        for i in string.digits]
70 | CHAR_AND_RULE_LIST.extend(DIGIT_AND_RULE_LIST)
71 |
72 | # digit and letter sets
73 | DIGIT_SET = frozenset([i for i in string.digits])
74 | ASCII_LOWER_SET = frozenset([i for i in string.ascii_lowercase])
75 | ASCII_UPPER_SET = frozenset([i for i in string.ascii_uppercase])
76 | ASCII_DIGIT_SET = frozenset([c for c, _ in CHAR_AND_RULE_LIST])
77 |
78 | # rules for symbols that are not escaped
79 | SYMBOL = '%&_@#;:,=<>~/'
80 | SYMBOL_SET = frozenset([i for i in SYMBOL])
81 | SYMBOL_AND_RULE_LIST = [(i, i) for i in SYMBOL_SET]
82 | CHAR_AND_RULE_LIST.extend(SYMBOL_AND_RULE_LIST)
83 |
84 | # rules for escaped symbols
85 | ESCAPE = '.+\\"\'()[]{}*$^?|!-'
86 | ESCAPE_SET = frozenset([i for i in ESCAPE])
87 | ESCAPE_AND_RULE_LIST = [(i, '\\%s' % i) for i in ESCAPE_SET]
88 | CHAR_AND_RULE_LIST.extend(ESCAPE_AND_RULE_LIST)
89 |
90 | # all char and rule mappings
91 | CHAR_RULE_DICT = dict(CHAR_AND_RULE_LIST)
92 | RULE_SET = frozenset([r for _, r in CHAR_AND_RULE_LIST])
93 |
94 | # ==
95 | RULE_SIGN_DICT = dict(
96 |     [(v, k) for k, v in SYMBOL_AND_RULE_LIST + ESCAPE_AND_RULE_LIST])
97 | SIGN_RULE_SET = frozenset(RULE_SIGN_DICT.keys())
98 |
99 | # ==
100 | DIGIT_AND_ASCII_LOWER_RULE_LIST = [BasePatternRule.DIGIT,
101 |                                    BasePatternRule.BASE_ASCII_LOWER]
102 | DIGIT_AND_ASCII_UPPER_RULE_LIST = [BasePatternRule.DIGIT,
103 |                                    BasePatternRule.BASE_ASCII_UPPER]
104 | DIGIT_AND_ASCII_RULE_LIST = [BasePatternRule.DIGIT,
105 |                              BasePatternRule.BASE_ASCII_LOWER,
106 |                              BasePatternRule.BASE_ASCII_UPPER,
107 |                              BasePatternRule.BASE_ASCII]
108 |
109 | DIGIT_AND_ASCII_UPPER_RULE_SET = frozenset(DIGIT_AND_ASCII_UPPER_RULE_LIST)
110 | DIGIT_AND_ASCII_LOWER_RULE_SET = frozenset(DIGIT_AND_ASCII_LOWER_RULE_LIST)
111 | DIGIT_AND_ASCII_RULE_SET = frozenset(DIGIT_AND_ASCII_RULE_LIST)
112 |
113 | # ==
114 | BASE_ASCII_RULE_SET = frozenset([BasePatternRule.BASE_ASCII,
115 |                                  BasePatternRule.BASE_ASCII_LOWER,
116 |                                  BasePatternRule.BASE_ASCII_UPPER])
117 |
118 | MULTI_ASCII_RULE_SET = frozenset([BasePatternRule.MULTI_ASCII,
119 |                                   BasePatternRule.MULTI_ASCII_LOWER,
120 |                                   BasePatternRule.MULTI_ASCII_UPPER])
121 |
122 | MIXED_RULE_SET = DIGIT_AND_ASCII_RULE_SET.union([Symbols.PERCENT])
123 |
124 |
125 | class BasePattern(object):
126 |     SINGLE_DIGIT = Pattern(BasePatternRule.SINGLE_DIGIT)
127 |     SINGLE_ASCII_LOWER = Pattern(BasePatternRule.SINGLE_ASCII_LOWER)
128 |     SINGLE_ASCII_UPPER = Pattern(BasePatternRule.SINGLE_ASCII_UPPER)
129 |     MULTI_DIGIT = Pattern(BasePatternRule.MULTI_DIGIT)
130 |     MULTI_ASCII_LOWER = Pattern(BasePatternRule.MULTI_ASCII_LOWER)
131 |     MULTI_ASCII_UPPER = Pattern(BasePatternRule.MULTI_ASCII_UPPER)
132 |     MULTI_DIGIT_AND_ASCII_LOWER = Pattern(
133 |         BasePatternRule.MULTI_DIGIT_AND_ASCII_LOWER)
134 |     MULTI_DIGIT_AND_ASCII_UPPER = Pattern(
135 |         BasePatternRule.MULTI_DIGIT_AND_ASCII_UPPER)
136 |     MULTI_DIGIT_AND_ASCII = Pattern(BasePatternRule.MULTI_DIGIT_AND_ASCII)
137 |     DOT = Pattern(BasePatternRule.DOT)
138 |     EMPTY = Pattern(BasePatternRule.EMPTY)
--------------------------------------------------------------------------------
/src/os_urlpattern/exceptions.py:
--------------------------------------------------------------------------------
1 | """Custom Exceptions.
2 | """
3 |
4 |
5 | class IrregularURLException(Exception):
6 |     pass
7 |
8 |
9 | class InvalidPatternException(Exception):
10 |     pass
11 |
12 |
13 | class InvalidCharException(Exception):
14 |     pass
--------------------------------------------------------------------------------
/src/os_urlpattern/formatter.py:
--------------------------------------------------------------------------------
1 | """Clustered record formatters.
2 | """
3 | from __future__ import unicode_literals
4 |
5 | import json
6 | import sys
7 |
8 | from .definition import BasePatternRule, Symbols
9 | from .parse_utils import pack
10 | from .utils import dump_tree, get_classes
11 |
12 |
13 | class Formatter(object):
14 |     """Base class for formatting clustered data.
15 |
16 |     Subclasses must define a format method which yields formatted strings.
17 |     """
18 |
19 |     def format(self, url_meta, root, **kwargs):
20 |         """Format the clustered tree.
21 |
22 |         Args:
23 |             url_meta (URLMeta): The url_meta.
24 |             root (TreeNode): Root node of the clustered tree.
25 |             **kwargs: Arbitrary keyword arguments.
26 |
27 |         Yields:
28 |             str: The formatted string.
29 |
30 |         """
31 |         return  # the trailing yield makes this an empty generator
32 |         yield
33 |
34 |
35 | class PatternFormatter(Formatter):
36 |     """Pattern only formatter."""
37 |
38 |     def format(self, url_meta, root, **kwargs):
39 |         """Yield the URL pattern string.
40 |
41 |         Args:
42 |             url_meta (URLMeta): The URLMeta object.
43 |             root (TreeNode): Root of a clustered piece tree.
44 |             **kwargs: Arbitrary keyword arguments.
45 |
46 |         Yields:
47 |             str: URL pattern string.
48 |
49 |         """
50 |         for nodes in dump_tree(root):
51 |             yield pack(url_meta, [p.pattern for p in nodes[1:]])
52 |             break  # one dumped path is enough for the pattern string
53 |
54 |
55 | class ClusterFormatter(PatternFormatter):
56 |     """URL pattern and meta data formatter.
57 |
58 |     Yield the URL pattern string first, then all meta data strings.
59 |     """
60 |
61 |     def format(self, url_meta, root, **kwargs):
62 |         """Yield the URL pattern and all bound meta data strings.
63 |
64 |         Args:
65 |             url_meta (URLMeta): The URLMeta object.
66 |             root (TreeNode): Root of a clustered piece tree.
67 |             **kwargs: Arbitrary keyword arguments.
68 |
69 |         Yields:
70 |             object: URL pattern string first, then all meta
71 |                 data strings prefixed with '\t'.
72 |
73 |         """
74 |         for r in super(ClusterFormatter, self).format(url_meta, root, **kwargs):
75 |             yield r
76 |
77 |         for nodes in dump_tree(root):
78 |             if nodes[-1].meta is None:
79 |                 continue
80 |             for obj in nodes[-1].meta:
81 |                 yield '\t'.join(('', str(obj)))
82 |
83 |
84 | class InlineFormatter(PatternFormatter):
85 |     """URL pattern and meta data formatter.
86 |
87 |     URL pattern and meta data string in one line.
88 |     """
89 |
90 |     def format(self, url_meta, root, **kwargs):
91 |         """Yield the URL pattern with each bound meta data string in one line.
92 |
93 |         Args:
94 |             url_meta (URLMeta): The URLMeta object.
95 |             root (TreeNode): Root of a clustered piece tree.
96 |             **kwargs: Arbitrary keyword arguments.
97 |
98 |         Yields:
99 |             object: URL pattern string + '\t' + str(meta)
100 |
101 |         """
102 |         url_pattern_string = None
103 |         for r in super(InlineFormatter, self).format(url_meta, root, **kwargs):
104 |             url_pattern_string = r
105 |
106 |         for nodes in dump_tree(root):
107 |             if nodes[-1].meta is None:
108 |                 continue
109 |             for obj in nodes[-1].meta:
110 |                 yield '\t'.join((url_pattern_string, str(obj)))
111 |
112 |
113 | class JsonFormatter(Formatter):
114 |     """JSON formatter.
115 |
116 |     Yield a JSON string: {"ptn": url_pattern, "cnt": count}
117 |         ptn: URL pattern string.
118 |         cnt: number of unique paths in the cluster.
119 |     """
120 |
121 |     def format(self, url_meta, root, **kwargs):
122 |         """Yield a JSON format string.
123 |
124 |         Args:
125 |             url_meta (URLMeta): The URLMeta object.
126 |             root (TreeNode): Root of a clustered piece tree.
127 |             **kwargs: Arbitrary keyword arguments.
128 |
129 |         Yields:
130 |             str: JSON string, key-value:
131 |                 ptn: URL pattern string.
132 |                 cnt: number of unique paths in the cluster.
133 |         """
134 |         for nodes in dump_tree(root):
135 |             p = pack(url_meta, [p.pattern for p in nodes[1:]])
136 |             yield json.dumps({'ptn': p, 'cnt': root.count})
137 |             break
138 |
139 |
140 | class ETEFormatter(Formatter):
141 |     """Ete tree formatter."""
142 |
143 |     def __init__(self):
144 |         import ete3  # fail fast if the optional ete3 package is missing
145 |
146 |     def format(self, url_meta, root, **kwargs):
147 |         """Yield an ete tree string.
148 |
149 |         Args:
150 |             url_meta (URLMeta): The URLMeta object.
151 |             root (TreeNode): Root of a pattern tree.
152 |             **kwargs: Arbitrary keyword arguments.
153 |
154 |         Yields:
155 |             str: An ete tree string.
156 |         """
157 |         def f(pattern_node):
158 |             sep = Symbols.EMPTY
159 |             query_key = Symbols.EMPTY
160 |             path_depth = url_meta.path_depth
161 |             query_depth = len(url_meta.query_keys)
162 |             current_level = pattern_node.level
163 |             if path_depth < current_level \
164 |                     and current_level <= (path_depth + query_depth):
165 |                 sep = Symbols.AMPERSAND
166 |                 if current_level == path_depth + 1:
167 |                     sep = BasePatternRule.SINGLE_QUESTION
168 |                 query_key = url_meta.query_keys[current_level - path_depth - 1]
169 |             elif current_level == path_depth + query_depth + 1:
170 |                 sep = Symbols.NUMBER
171 |             return ' {sep}{query_key}{pattern_string}({count}) '.format(
172 |                 count=pattern_node.count,
173 |                 pattern_string=pattern_node.value,
174 |                 query_key=query_key,
175 |                 sep=sep)
176 |
177 |         if root.count <= 0:
178 |             return
179 |
180 |         ete_tree = get_ete_tree(root, format=f)
181 |         yield ete_tree.get_ascii(show_internal=True)
182 |
183 |
184 | def get_ete_tree(root_node, format=str):
185 |     """Transform a tree-like object into an ete tree.
186 |
187 |     Args:
188 |         root_node (TreeNode): The root of the tree.
189 |         format (callable, optional): Defaults to str.
190 |             A callable object to format the ete tree node.
191 |
192 |     Returns:
193 |         ete3.Tree: The ete tree.
194 |     """
195 |     from ete3 import Tree
196 |
197 |     def add_children(node, ete_node):
198 |         for child in node.children:
199 |             ete_child = ete_node.add_child(name=format(child))
200 |             add_children(child, ete_child)
201 |
202 |     ete_root_node = Tree(name=format(root_node))
203 |     add_children(root_node, ete_root_node)
204 |     return ete_root_node
205 |
206 |
207 | def pformat(name, url_meta, root, **kwargs):
208 |     """Shortcut for formatting.
209 |
210 |     Args:
211 |         name (str): Format type.
212 |         url_meta (URLMeta): The URLMeta object.
213 |         root (TreeNode): Root of a clustered tree.
214 |         **kwargs: Arbitrary keyword arguments.
215 |
216 |     Returns:
217 |         Iterator: Iterates over the formatted strings.
218 |     """
219 |     return FORMATTERS[name.upper()].format(url_meta, root, **kwargs)
220 |
221 |
222 | # Auto discover Formatter classes and init FORMATTERS.
223 | FORMATTERS = {}
224 | for c_cls in get_classes(sys.modules[__name__], Formatter):
225 |     c_name = c_cls.__name__
226 |     t = c_name.rfind('Formatter')
227 |     if t < 0:
228 |         raise ImportError('Invalid formatter name: %s' % c_name)
229 |     name = c_name[0:t].upper() if c_name[0:t] else 'NULL'
230 |     try:
231 |         FORMATTERS[name] = c_cls()
232 |     except Exception:  # skip formatters whose optional deps are missing
233 |         pass
--------------------------------------------------------------------------------
/src/os_urlpattern/parse_utils.py:
--------------------------------------------------------------------------------
1 | """Utilities for parsing URLs and patterns.
2 | """
3 |
4 | from __future__ import unicode_literals
5 |
6 | import hashlib
7 | from collections import namedtuple
8 |
9 | from .compat import ParseResult, StringIO, urlparse
10 | from .definition import (ASCII_DIGIT_SET, BLANK_TUPLE, CHAR_RULE_DICT,
11 |                          DEFAULT_ENCODING, DIGIT_AND_ASCII_RULE_SET,
12 |                          EMPTY_TUPLE, MIXED_RULE_SET,
13 |                          QUERY_PART_RESERVED_CHARS, RULE_SET, SIGN_RULE_SET,
14 |                          BasePatternRule, Symbols)
15 | from .exceptions import (InvalidCharException, InvalidPatternException,
16 |                          IrregularURLException)
17 |
18 | URLPatternParseResult = namedtuple(
19 |     'URLPatternParseResult', 'path query fragment')
20 |
21 |
22 | class URLMeta(namedtuple('URLMeta', 'path_depth query_keys has_fragment')):
23 |     """The URL structure meta.
24 |
25 |     Attributes:
26 |         path_depth (int): The number of URL path levels.
27 |         query_keys (:obj:`tuple` of :obj:`str`): Query keys.
28 |         has_fragment (bool): Whether the URL has a fragment component.
29 |
30 |     """
31 |     __slots__ = ()
32 |
33 |     @property
34 |     def depth(self):
35 |         return self.path_depth + len(self.query_keys) + (1 if self.has_fragment else 0)
36 |
37 |
38 | def specify_rule(rule, num):
39 |     """Specify the format of the rule.
40 |
41 |     num == 1 will return [rule], single
42 |     num > 1 will return [rule]{num}, with number
43 |     num < 0 will return [rule]+, wildcard
44 |     num == 0 will raise ValueError
45 |
46 |     Args:
47 |         rule (str): The raw rule string to be specified.
48 |         num (int): The num of the rule. Can't be 0.
49 |
50 |     Raises:
51 |         ValueError: If the num == 0.
52 |
53 |     Returns:
54 |         str: The specified format of the rule.
55 |
56 |     Examples:
57 |
58 |         >>> from os_urlpattern.parse_utils import specify_rule
59 |         >>> specify_rule('a-z', 1)
60 |         u'[a-z]'
61 |         >>> specify_rule('a-z', 2)
62 |         u'[a-z]{2}'
63 |         >>> specify_rule('a-z', -1)
64 |         u'[a-z]+'
65 |
66 |     """
67 |
68 |     if num == 1:
69 |         return '[%s]' % rule
70 |     elif num < 0:
71 |         return '[%s]+' % rule
72 |     elif num > 1:
73 |         return '[%s]{%d}' % (rule, num)
74 |     else:
75 |         raise ValueError('Invalid num %s' % str(num))
76 |
77 |
78 | def wildcard_rule(rule):
79 |     """The wildcard format of the rule.
80 |
81 |     Shortcut of specify_rule(rule, -1).
82 |
83 |     Args:
84 |         rule (str): The raw rule string to be specified.
85 |
86 |     Returns:
87 |         str: The wildcard format of the rule.
88 |     """
89 |     return specify_rule(rule, -1)
90 |
91 |
92 | def normalize(raw_string, reserved_chars=None):
93 |     """Normalize a string.
94 |
95 |     Transform each run of identical signs in the string into the format
96 |     [sign_rule]{num}, unless the sign is in the reserved_chars.
97 |
98 |     Args:
99 |         raw_string (str): The string to be normalized.
100 |         reserved_chars (iterable, optional): Defaults to None. Reserved chars
101 |             which are not to be normalized.
102 |
103 |     Returns:
104 |         str: The normalized string.
105 |
106 |     Examples:
107 |
108 |         >>> from os_urlpattern.parse_utils import normalize
109 |         >>> normalize('abc==123---')
110 |         u'abc[=]{2}123[\\-]{3}'
111 |
112 |     """
113 |     normalized = StringIO()
114 |     frag = StringIO()
115 |     last_c = None
116 |     for c in raw_string:
117 |         if c in ASCII_DIGIT_SET:
118 |             if last_c and last_c not in ASCII_DIGIT_SET:
119 |                 frag.seek(0)
120 |                 w = frag.read()
121 |                 l = len(w)
122 |                 if l > 0:
123 |                     if not reserved_chars or w[0] not in reserved_chars:
124 |                         r = CHAR_RULE_DICT.get(w[0])
125 |                         w = specify_rule(r, l)
126 |                     normalized.write(w)
127 |                     frag = StringIO()
128 |         else:
129 |             if last_c != c:
130 |                 frag.seek(0)
131 |                 w = frag.read()
132 |                 l = len(w)
133 |                 if l > 0 and w[0] not in ASCII_DIGIT_SET and \
134 |                         (not reserved_chars or w[0] not in reserved_chars):
135 |                     r = CHAR_RULE_DICT.get(w[0])
136 |                     w = specify_rule(r, l)
137 |                 normalized.write(w)
138 |                 frag = StringIO()
139 |         frag.write(c)
140 |         last_c = c
141 |
142 |     frag.seek(0)
143 |     w = frag.read()
144 |     l = len(w)
145 |     if last_c and last_c not in ASCII_DIGIT_SET and \
146 |             (not reserved_chars or w[0] not in reserved_chars):
147 |         r = CHAR_RULE_DICT.get(w[0])
148 |         w = specify_rule(r, l)
149 |     normalized.write(w)
150 |     normalized.seek(0)
151 |     return normalized.read()
152 |
153 |
154 | def parse_url(url):
155 |     """Parse a URL into 6 components.
156 |
157 |     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
158 |
159 |     Like the built-in urlparse function, but handles some unusual situations.
160 |
161 |     Args:
162 |         url (str): The URL to be parsed.
163 |
164 |     Returns:
165 |         ParseResult: A 6-tuple, (scheme, netloc, path, params, query, fragment).
166 |     """
167 |     scheme, netloc, path, params, query, fragment = urlparse(url)
168 |     if not fragment:
169 |         if url[-1] != Symbols.NUMBER:
170 |             fragment = None
171 |             if not query and url[-1] != Symbols.QUESTION:
172 |                 query = None
173 |         elif not query and url[-2] != Symbols.QUESTION:
174 |             query = None
175 |     elif not query:
176 |         if url[len(url) - len(fragment) - 2] != Symbols.QUESTION:
177 |             query = None
178 |     return ParseResult(scheme, netloc, path, params, query, fragment)
179 |
180 |
181 | def filter_useless(objs):
182 |     """Filter out the useless objects.
183 |
184 |     If bool(object) == False the object is useless, except for the last one.
185 |
186 |     Args:
187 |         objs (sequence): The objects to be filtered.
188 |
189 |     Returns:
190 |         iterable: The filtered objs.
191 |
192 |     Examples:
193 |
194 |         >>> from os_urlpattern.parse_utils import filter_useless
195 |         >>> filter_useless([0,1,0,0])
196 |         [1, 0]
197 |
198 |     """
199 |     keep = {'c': 0, 'l': len(objs)}
200 |
201 |     def _filterd(x):
202 |         keep['c'] += 1
203 |         if not x:
204 |             if keep['c'] == keep['l']:
205 |                 return True
206 |             return False
207 |         else:
208 |             return True
209 |
210 |     return objs.__class__(filter(_filterd, objs))
211 |
212 |
213 | def parse_query_string(query_string):
214 |     """Parse a query string into keys and values.
215 |
216 |     Args:
217 |         query_string (str): The string to be parsed.
218 |
219 |     Raises:
220 |         IrregularURLException: Invalid query string.
221 |
222 |     Returns:
223 |         tuple: A 2-tuple, (keys, values).
224 |     """
225 |     if query_string is None:
226 |         return EMPTY_TUPLE, EMPTY_TUPLE
227 |     elif query_string == Symbols.EMPTY:
228 |         return BLANK_TUPLE, BLANK_TUPLE
229 |     elif query_string.endswith(Symbols.AMPERSAND):
230 |         raise IrregularURLException("Invalid '&' pos")
231 |     kv_type = True  # query_key True, query_value False
232 |     last_c = None
233 |     kv_buf = {True: StringIO(), False: StringIO()}
234 |     kv_list = {True: [], False: []}
235 |     for i in query_string:
236 |         if i == Symbols.EQUALS and kv_type:
237 |             s = kv_buf[kv_type]
238 |             s.write(i)
239 |             s.seek(0)
240 |             kv_list[kv_type].append(s.read())
241 |             kv_buf[kv_type] = StringIO()
242 |             kv_type = not kv_type
243 |         elif i == Symbols.AMPERSAND:
244 |             if last_c is None or last_c == Symbols.AMPERSAND:
245 |                 raise IrregularURLException("Invalid '&' pos")
246 |             s = kv_buf[kv_type]
247 |             s.seek(0)
248 |             kv_list[kv_type].append(s.read())
249 |             kv_buf[kv_type] = StringIO()
250 |             if kv_type:
251 |                 kv_list[False].append(Symbols.EMPTY)  # treat as value-less
252 |             else:
253 |                 kv_type = not kv_type
254 |         else:
255 |             s = kv_buf[kv_type]
256 |             s.write(i)
257 |         last_c = i
258 |
259 |     s = kv_buf[kv_type]
260 |     s.seek(0)
261 |     kv_list[kv_type].append(s.read())
262 |     if kv_type:  # treat as value-less
263 |         kv_list[False].append(Symbols.EMPTY)
264 |
265 |     # Only one query without value, treat as key-less.
266 |     if len(kv_list[True]) == 1 and not kv_list[True][0].endswith(Symbols.EQUALS):
267 |         kv_list[False][0], kv_list[True][0] = kv_list[True][0], kv_list[False][0]
268 |     return tuple(kv_list[True]), tuple(kv_list[False])
269 |
270 |
271 | def mix(pieces, rules):
272 |     """Combine the sub-pieces and sub-rules.
273 |
274 |     Consecutive sub-pieces whose rules are letter, digit or percent-sign
275 |     rules are combined into one piece, and their rules are combined as well.
276 |
277 |     Args:
278 |         pieces (sequence): The raw pieces.
279 |         rules (sequence): The rules.
280 |
281 |     Returns:
282 |         tuple: A 2-tuple, (mixed_pieces, mixed_rules)
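
    Examples (an illustrative doctest; sub-pieces and sub-rules are in the
    form produced by PieceParser, and output is shown in the Python 2 repr
    style used by the other doctests in this module):

        >>> from os_urlpattern.parse_utils import mix
        >>> mix(('abc', '123', '[\\.]'), ('a-z', '0-9', '\\.'))
        ((u'abc123', u'[\\.]'), (u'0-9a-z', u'\\.'))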
283 |     """
284 |     mixed_pieces = []
285 |     mixed_rules = []
286 |
287 |     t_pieces = []
288 |     t_rules = []
289 |     t_mix = False
290 |     for piece, rule in zip(pieces, rules):
291 |         if rule in MIXED_RULE_SET:
292 |             if t_rules and not t_mix:
293 |                 mixed_pieces.extend(t_pieces)
294 |                 mixed_rules.extend(t_rules)
295 |                 t_pieces = []
296 |                 t_rules = []
297 |             t_mix = True
298 |         else:
299 |             if t_rules and t_mix:
300 |                 mixed_pieces.append(''.join(t_pieces))
301 |                 mixed_rules.append(''.join(sorted(set(t_rules))))
302 |                 t_pieces = []
303 |                 t_rules = []
304 |             t_mix = False
305 |         t_pieces.append(piece)
306 |         t_rules.append(rule)
307 |     if t_mix:
308 |         mixed_pieces.append(''.join(t_pieces))
309 |         mixed_rules.append(''.join(sorted(set(t_rules))))
310 |     else:
311 |         mixed_pieces.extend(t_pieces)
312 |         mixed_rules.extend(t_rules)
313 |     return pieces.__class__(mixed_pieces), rules.__class__(mixed_rules)
314 |
315 |
316 | def unpack(result, normalize_key=True):
317 |     """Split the ParseResult object into URLMeta and pieces.
318 |
319 |     Args:
320 |         result (ParseResult): The ParseResult object.
321 |         normalize_key (bool, optional): Defaults to True.
322 |             Whether to normalize the query keys.
323 |
324 |     Raises:
325 |         IrregularURLException: Invalid URL.
326 |
327 |     Returns:
328 |         tuple: A 2-tuple, (url_meta, pieces).
329 |     """
330 |     pieces = filter_useless(result.path.split(Symbols.SLASH)[1:])
331 |     path_depth = len(pieces)
332 |     if path_depth <= 0:
333 |         raise IrregularURLException('Invalid url depth')
334 |
335 |     keys, values = parse_query_string(result.query)
336 |     if normalize_key:
337 |         keys = tuple([normalize(key, QUERY_PART_RESERVED_CHARS)
338 |                       for key in keys])
339 |     has_fragment = False if result.fragment is None else True
340 |
341 |     url_meta = URLMeta(path_depth, keys, has_fragment)
342 |     pieces.extend(values)
343 |     if has_fragment:
344 |         pieces.append(result.fragment)
345 |     return url_meta, tuple(pieces)
346 |
347 |
348 | def pack(url_meta, objs):
349 |     """Pack into a URL-like string.
350 |
351 |     Args:
352 |         url_meta (URLMeta): The URLMeta object.
353 |         objs (sequence): The objects to be packed.
354 |
355 |     Returns:
356 |         str: The packed URL-like string.
357 |     """
358 |     s = StringIO()
359 |     s.write(Symbols.SLASH)
360 |     query_depth = len(url_meta.query_keys)
361 |     idx = url_meta.path_depth + query_depth
362 |     p = Symbols.SLASH.join([str(p) for p in objs[0:url_meta.path_depth]])
363 |     s.write(p)
364 |     if query_depth > 0:
365 |         s.write(BasePatternRule.SINGLE_QUESTION)
366 |         kv = zip(url_meta.query_keys,
367 |                  [str(p) for p in objs[url_meta.path_depth:idx]])
368 |         s.write(Symbols.AMPERSAND.join(
369 |             [''.join((str(k), str(v))) for k, v in kv]))
370 |
371 |     if url_meta.has_fragment:
372 |         s.write(Symbols.NUMBER)
373 |         s.write(''.join([str(p) for p in objs[idx:]]))
374 |     s.seek(0)
375 |     return s.read()
376 |
377 |
378 | def analyze_url(url):
379 |     """Parse a URL into a URLMeta object and raw pieces.
380 |
381 |     Args:
382 |         url (str): The URL to be parsed.
383 |
384 |     Returns:
385 |         tuple: A 2-tuple, (url_meta, pieces).
386 |     """
387 |
388 |     result = parse_url(url)
389 |     return unpack(result, True)
390 |
391 |
392 | def fuzzy_join(objs, sep='/'):
393 |     """Join the fuzzy_rule of the objects into one string.
394 |
395 |     Args:
396 |         objs (sequence): The objects, each of which has a fuzzy_rule property.
397 |         sep (str): Defaults to '/'. Separator for joining.
398 |
399 |     Returns:
400 |         str: The joined fuzzy_rule string.
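
    Examples (an illustrative doctest, using ParsedPiece from this module):

        >>> from os_urlpattern.parse_utils import ParsedPiece, fuzzy_join
        >>> fuzzy_join([ParsedPiece(('abc',), ('a-z',)),
        ...             ParsedPiece(('123',), ('0-9',))])
        u'a-z/0-9'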
401 |     """
402 |     return sep.join([p.fuzzy_rule for p in objs])
403 |
404 |
405 | class ParsedPiece(object):
406 |     """The parsed piece object.
407 |
408 |     It contains the sub-pieces of a piece and the corresponding sub-rules.
409 |     With it, you can get the fuzzy rule and the length of the entire piece.
410 |     It can be used as a dict key.
411 |
412 |     """
413 |     __slots__ = ('pieces', 'rules', '_piece', '_piece_length', '_fuzzy_rule')
414 |
415 |     def __init__(self, pieces, rules):
416 |         """Init the ParsedPiece object.
417 |
418 |         Args:
419 |             pieces (tuple): The tuple of parsed pieces.
420 |             rules (tuple): The tuple of the rules of each parsed piece.
421 |         """
422 |         self.pieces = pieces
423 |         self.rules = rules
424 |         self._piece_length = -1
425 |         self._piece = pieces[0] if len(pieces) == 1 else None
426 |         self._fuzzy_rule = rules[0] if len(rules) == 1 else None
427 |
428 |     @property
429 |     def fuzzy_rule(self):
430 |         if not self._fuzzy_rule:
431 |             self._fuzzy_rule = ''.join(sorted(set(self.rules)))
432 |         return self._fuzzy_rule
433 |
434 |     @property
435 |     def piece_length(self):
436 |         """Get the literal length of the piece.
437 |
438 |         This is not simply the number of characters of the piece string.
439 |
440 |         Note:
441 |
442 |             '[%]{2}' has 6 characters, but its literal length is 2.
443 |
444 |         Returns:
445 |             int: The literal length of the piece.
446 |         """
447 |         if self._piece_length < 0:
448 |             piece = self.piece
449 |             length_base = length = len(piece)
450 |             idx = 0
451 |             while idx < length_base:
452 |                 c = piece[idx]
453 |                 if c == Symbols.BRACKETS_L or c == Symbols.BRACKETS_R:
454 |                     if idx == 0 or piece[idx - 1] != Symbols.BACKSLASH:
455 |                         length += -1
456 |                 elif c == Symbols.BACKSLASH:
457 |                     if piece[idx + 1] != Symbols.BACKSLASH:
458 |                         length += -1
459 |                 elif c == Symbols.BRACES_L:
460 |                     if piece[idx - 1] == Symbols.BRACKETS_R:
461 |                         e = piece.index(Symbols.BRACES_R, idx)
462 |                         length += int(piece[idx + 1:e]) - 1 - (e - idx + 1)
463 |                         idx = e
464 |                 idx += 1
465 |
466 |             self._piece_length = length
467 |         return self._piece_length
468 |
469 |     def __eq__(self, o):
470 |         if not isinstance(o, ParsedPiece):
471 |             return False
472 |         return self.piece == o.piece
473 |
474 |     def __hash__(self):
475 |         return hash(self.piece)
476 |
477 |     @property
478 |     def piece(self):
479 |         if self._piece is None:
480 |             self._piece = ''.join(self.pieces)
481 |         return self._piece
482 |
483 |     def __str__(self):
484 |         return str(zip(self.pieces, self.rules))
485 |
486 |     __repr__ = __str__
487 |
488 |
489 | EMPTY_PARSED_PIECE = ParsedPiece(EMPTY_TUPLE, EMPTY_TUPLE)
490 |
491 |
492 | class PieceParser(object):
493 |     """Parser for URL pieces.
494 |
495 |     Use it to generate ParsedPiece objects from the pieces of a URL.
496 |     Not thread safe.
497 |     """
498 |     __slots__ = ('_rules', '_pieces')
499 |
500 |     def __init__(self):
501 |         self._reset()
502 |
503 |     def _reset(self):
504 |         self._rules = []
505 |         self._pieces = []
506 |
507 |     def parse(self, piece):
508 |         """Parse a string into small sub-pieces with rules.
509 |
510 |         Consecutive characters in the same character space
511 |         are joined into one sub-piece, and the corresponding
512 |         rule (character space) is recorded alongside.
513 |
514 |         Args:
515 |             piece (str): A string to be parsed.
516 |
517 |         Returns:
518 |             ParsedPiece: The parsed piece, carrying sub-pieces and rules.
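
        Examples (an illustrative doctest, in the repr style of this module):

            >>> parser = PieceParser()
            >>> parsed = parser.parse('abc123.html')
            >>> parsed.pieces
            (u'abc', u'123', u'[\\.]', u'html')
            >>> parsed.rules
            (u'a-z', u'0-9', u'\\.', u'a-z')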
519 |         """
520 | 
521 |         self._reset()
522 |         self._preprocess(piece)
523 |         return self._create_parsed_piece()
524 | 
525 |     def _preprocess(self, piece):
526 |         for c in piece:
527 |             self._define(c)
528 |         for idx, buf in enumerate(self._pieces):
529 |             buf.seek(0)
530 |             letter = buf.read()
531 |             self._pieces[idx] = self._normalize(
532 |                 letter, self._rules[idx])
533 | 
534 |     def _define(self, char):
535 |         last_rule = self._rules[-1] if self._rules else None
536 |         if char not in CHAR_RULE_DICT:
537 |             raise InvalidCharException("Invalid char %r" % char)
538 |         rule = CHAR_RULE_DICT[char]
539 | 
540 |         if last_rule != rule:
541 |             self._pieces.append(StringIO())
542 |             self._rules.append(rule)
543 |         self._pieces[-1].write(char)
544 | 
545 |     def _normalize(self, letter, rule):
546 |         if rule in SIGN_RULE_SET:
547 |             return specify_rule(rule, len(letter))
548 |         return letter
549 | 
550 |     def _create_parsed_piece(self):
551 |         return ParsedPiece(tuple(self._pieces), tuple(self._rules))
552 | 
553 | 
554 | def fuzzy_digest(url_meta, objs):
555 |     """Generate hex digest string from URLMeta and objects' fuzzy_rules.
556 | 
557 |     Args:
558 |         url_meta (URLMeta): The URLMeta object.
559 |         objs (sequence): Each object has a fuzzy_rule property.
560 | 
561 |     Returns:
562 |         str: Digest value as a string of hexadecimal digits.
563 |     """
564 |     return digest(url_meta, [obj.fuzzy_rule for obj in objs])
565 | 
566 | 
567 | def digest(url_meta, objs):
568 |     """Generate hex digest string from URLMeta and objects.
569 | 
570 |     Args:
571 |         url_meta (URLMeta): The URLMeta object.
572 |         objs (sequence): The sequence of objects.
573 | 
574 |     Returns:
575 |         str: Digest value as a string of hexadecimal digits.
576 |     """
577 |     return hashlib.md5(pack(url_meta, objs).encode(DEFAULT_ENCODING)).hexdigest()
578 | 
579 | 
580 | def parse_url_pattern_string(url_pattern_string):
581 |     """Parse a URL pattern string into 3 components.
582 | 
583 |     Expected form: path[\\?]query#fragment -- query and fragment are optional.
584 | 
585 |     Args:
586 |         url_pattern_string (str): The url pattern string to be parsed.
587 | 
588 |     Returns:
589 |         URLPatternParseResult: A 3-tuple, (path, query, fragment).
590 |     """
591 |     idx_p = 0
592 |     idx_q = url_pattern_string.find(BasePatternRule.SINGLE_QUESTION)
593 |     idx_f = url_pattern_string.find(Symbols.NUMBER)
594 |     path = query = fragment = None
595 |     if idx_q < 0 and idx_f < 0:
596 |         path = url_pattern_string[idx_p:]
597 |     elif idx_q > 0 and idx_f > 0:
598 |         if idx_f > idx_q:
599 |             path = url_pattern_string[idx_p:idx_q]
600 |             query = url_pattern_string[idx_q + 4:idx_f]
601 |         else:
602 |             path = url_pattern_string[idx_p:idx_f]
603 |             fragment = url_pattern_string[idx_f + 1:]
604 |     elif idx_q < 0 and idx_f > 0:
605 |         path = url_pattern_string[idx_p:idx_f]
606 |         fragment = url_pattern_string[idx_f + 1:]
607 |     elif idx_q > 0 and idx_f < 0:
608 |         path = url_pattern_string[idx_p:idx_q]
609 |         query = url_pattern_string[idx_q + 4:]
610 | 
611 |     return URLPatternParseResult(path, query, fragment)
612 | 
613 | 
614 | def analyze_url_pattern_string(url_pattern_string):
615 |     """Parse a URL pattern string into URLMeta object and pattern string pieces.
616 | 
617 |     Args:
618 |         url_pattern_string (str): The URL pattern string to be parsed.
619 | 
620 |     Returns:
621 |         tuple: A 2-tuple, (url_meta, pattern_strings).
622 |     """
623 |     result = parse_url_pattern_string(url_pattern_string)
624 |     return unpack(result, False)
625 | 
626 | 
627 | def parse_pattern_string(pattern_string):
628 |     """Parse a pattern string into pattern unit strings.
629 | 
630 |     Args:
631 |         pattern_string (str): The pattern string to be parsed.
632 | 
633 |     Returns:
634 |         tuple: Pattern unit strings.
635 |     """
636 |     if pattern_string == Symbols.EMPTY:
637 |         return BLANK_TUPLE
638 |     pattern_unit_strings = []
639 |     l = len(pattern_string)
640 |     s = StringIO()
641 |     idx = 0
642 |     last_rule = None
643 |     while idx < l:
644 |         c = pattern_string[idx]
645 |         if c == Symbols.BRACKETS_L:
646 |             if last_rule is not None:
647 |                 s.seek(0)
648 |                 pattern_unit_strings.append(s.read())
649 |                 s = StringIO()
650 |                 last_rule = None
651 | 
652 |             idx_s = idx
653 |             while True:
654 |                 idx = pattern_string.find(Symbols.BRACKETS_R, idx + 1)
655 |                 if idx < 0:
656 |                     raise InvalidPatternException(
657 |                         "Missing '%s'" % Symbols.BRACKETS_R)
658 |                 elif pattern_string[idx - 1] == Symbols.BACKSLASH:
659 |                     continue
660 |                 break
661 |             if idx + 1 < l:
662 |                 if pattern_string[idx + 1] == Symbols.BRACES_L:
663 |                     old_idx = idx + 2
664 |                     idx = pattern_string.find(Symbols.BRACES_R, idx + 1)
665 |                     if idx < 0:
666 |                         raise InvalidPatternException(
667 |                             "Missing '%s'" % Symbols.BRACES_R)
668 |                     num_str = pattern_string[old_idx:idx]
669 |                     if not num_str.isdigit():
670 |                         raise InvalidPatternException(
671 |                             "Invalid num %r" % num_str)
672 | 
673 |                 elif pattern_string[idx + 1] == Symbols.PLUS:
674 |                     idx += 1
675 |             idx += 1
676 |             pattern_unit_strings.append(pattern_string[idx_s:idx])
677 |         else:
678 |             if c not in CHAR_RULE_DICT:
679 |                 raise InvalidPatternException("Invalid char %r" % c)
680 |             rule = CHAR_RULE_DICT[c]
681 |             if rule not in DIGIT_AND_ASCII_RULE_SET:
682 |                 raise InvalidPatternException(
683 |                     'Invalid pattern')
684 |             if last_rule is None:
685 |                 s.write(c)
686 |             else:
687 |                 if rule == last_rule:
688 |                     s.write(c)
689 |                 else:
690 |                     s.seek(0)
691 |                     pattern_unit_strings.append(s.read())
692 |                     s = StringIO()
693 |                     s.write(c)
694 |             last_rule = rule
695 |         idx += 1
696 |     if last_rule is not None:
697 |         s.seek(0)
698 |         pattern_unit_strings.append(s.read())
699 | 
700 |     return tuple(pattern_unit_strings)
701 | 
702 | 
703 | def parse_pattern_unit_string(pattern_unit_string):
704 |     """Parse pattern unit string into rules and literal num.
705 | 
706 |     Args:
707 |         pattern_unit_string (str): The pattern unit string to be parsed.
708 | 
709 |     Returns:
710 |         tuple: A 2-tuple, (rules, num).
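
    Example (illustrative; the ordering inside the returned set is not
    significant):

        >>> parse_pattern_unit_string('abc')
        ({'a-z'}, 3)
        >>> parse_pattern_unit_string('[0-9a-z]{3}')
        ({'0-9', 'a-z'}, 3)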
711 |     """
712 |     rules = set()
713 |     num = 1
714 |     if pattern_unit_string == Symbols.EMPTY:
715 |         rules.add(Symbols.EMPTY)
716 |     elif pattern_unit_string[0] != Symbols.BRACKETS_L:
717 |         rules.add(CHAR_RULE_DICT[pattern_unit_string[0]])
718 |         num = len(pattern_unit_string)
719 |     else:
720 |         if pattern_unit_string[-1] == Symbols.BRACKETS_R:
721 |             num = 1
722 |         elif pattern_unit_string[-1] == Symbols.BRACES_R:
723 |             t = pattern_unit_string.rfind(Symbols.BRACES_L)
724 |             num_str = pattern_unit_string[t + 1:-1]
725 |             if not num_str.isdigit():
726 |                 raise InvalidPatternException("Invalid num %r" % num_str)
727 |             num = int(num_str)
728 |         elif pattern_unit_string[-1] == Symbols.PLUS:
729 |             num = -1
730 |         t = pattern_unit_string.rfind(Symbols.BRACKETS_R)
731 |         p_str = pattern_unit_string[1:t]
732 |         l = len(p_str)
733 |         idx = 0
734 |         while idx < l:
735 |             c = p_str[idx]
736 |             n = 3
737 |             if c in ASCII_DIGIT_SET:
738 |                 pass
739 |             elif c == Symbols.BACKSLASH:
740 |                 n = 2
741 |             else:
742 |                 n = 1
743 |             rule = p_str[idx:idx + n]
744 |             if rule not in RULE_SET:
745 |                 raise InvalidPatternException("Invalid rule %r" % rule)
746 |             rules.add(rule)
747 |             idx += n
748 |     if (num > 0 and len(rules) > num) or num == 0:
749 |         raise InvalidPatternException('Insufficient number')
750 |     return rules, num
751 | 
-------------------------------------------------------------------------------- /src/os_urlpattern/parsed_piece_view.py: --------------------------------------------------------------------------------
1 | """ParsedPieceView and subclass implementation.
2 | """
3 | from __future__ import unicode_literals
4 | 
5 | from .definition import DIGIT_AND_ASCII_RULE_SET, BasePatternRule
6 | from .parse_utils import ParsedPiece, fuzzy_join, mix
7 | from .utils import pick
8 | 
9 | 
10 | class ParsedPieceView(object):
11 |     """The base class of parsed piece views.
12 | 
13 |     A view object wraps a parsed piece. Its view, parsed_piece and
14 |     parsed_pieces properties are all derived from the raw parsed
15 |     piece.
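
    For example (illustrative), for a parsed piece of 'abc123' a
    FuzzyView's view is its fuzzy_rule ('0-9a-z'), while a LengthView's
    view is its piece_length (6).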
16 | 17 | """ 18 | __slots__ = ('parsed_piece', '_parsed_pieces', '_view') 19 | 20 | def __init__(self, parsed_piece): 21 | self.parsed_piece = parsed_piece 22 | self._parsed_pieces = None 23 | self._view = None 24 | 25 | def __eq__(self, o): 26 | if not isinstance(o, ParsedPieceView): 27 | return False 28 | return self.view == o.view 29 | 30 | def __hash__(self): 31 | return hash(self.view) 32 | 33 | @property 34 | def view(self): 35 | if self._view is None: 36 | self._view = fuzzy_join(self.parsed_pieces) 37 | return self._view 38 | 39 | @property 40 | def parsed_pieces(self): 41 | if self._parsed_pieces: 42 | return self._parsed_pieces 43 | 44 | self._parsed_pieces = [ParsedPiece((piece,), (rule,)) for piece, rule in zip( 45 | self.parsed_piece.pieces, self.parsed_piece.rules)] 46 | return self._parsed_pieces 47 | 48 | 49 | class PieceView(ParsedPieceView): 50 | 51 | def __init__(self, parsed_piece): 52 | super(PieceView, self).__init__(parsed_piece) 53 | self._view = self.parsed_piece.piece 54 | 55 | 56 | class LengthView(ParsedPieceView): 57 | 58 | def __init__(self, parsed_piece): 59 | super(LengthView, self).__init__(parsed_piece) 60 | self._view = self.parsed_piece.piece_length 61 | 62 | 63 | class MultiView(ParsedPieceView): 64 | pass 65 | 66 | 67 | class MixedView(ParsedPieceView): 68 | 69 | @property 70 | def parsed_pieces(self): 71 | if self._parsed_pieces: 72 | return self._parsed_pieces 73 | 74 | if len(self.parsed_piece.rules) <= 1: 75 | self._parsed_pieces = [self.parsed_piece] 76 | else: 77 | mixed_pieces, mixed_rules = mix( 78 | self.parsed_piece.pieces, self.parsed_piece.rules) 79 | 80 | self._parsed_pieces = [ParsedPiece( 81 | (piece,), (rule,)) for piece, rule in zip(mixed_pieces, mixed_rules)] 82 | return self._parsed_pieces 83 | 84 | 85 | class LastDotSplitFuzzyView(ParsedPieceView): 86 | 87 | @property 88 | def parsed_pieces(self): 89 | if self._parsed_pieces: 90 | return self._parsed_pieces 91 | rules = self.parsed_piece.rules 92 | dot_idx = None 93 | part_num = len(rules) 94 | for idx, rule in enumerate(reversed(rules)): 95 | if idx > 2: 96 | break 97 | if rule == BasePatternRule.DOT: 98 | dot_idx = part_num - idx - 1 99 | break 100 | self._parsed_pieces = [ParsedPiece((self.parsed_piece.piece,), 101 | (self.parsed_piece.fuzzy_rule,))] 102 | if dot_idx is not None: 103 | skip = False 104 | for rule in self.parsed_piece.rules[dot_idx + 1:]: 105 | if rule not in DIGIT_AND_ASCII_RULE_SET: 106 | skip = True 107 | break 108 | if not skip: 109 | pieces = [] 110 | rules = [] 111 | pieces.append(''.join(self.parsed_piece.pieces[0:dot_idx])) 112 | pieces.append(self.parsed_piece.pieces[dot_idx]) 113 | rules.append( 114 | ''.join(sorted(set(self.parsed_piece.rules[0:dot_idx])))) 115 | rules.append(self.parsed_piece.rules[dot_idx]) 116 | mixed_pieces, mixed_rules = mix( 117 | self.parsed_piece.pieces[dot_idx + 1:], 118 | self.parsed_piece.rules[dot_idx + 1:]) 119 | pieces.extend(mixed_pieces) 120 | rules.extend(mixed_rules) 121 | self._parsed_pieces = [ParsedPiece( 122 | (piece,), (rule,)) for piece, rule in zip(pieces, rules)] 123 | return self._parsed_pieces 124 | 125 | 126 | class FuzzyView(ParsedPieceView): 127 | 128 | def __init__(self, parsed_piece): 129 | super(FuzzyView, self).__init__(parsed_piece) 130 | self._view = self.parsed_piece.fuzzy_rule 131 | 132 | @property 133 | def parsed_pieces(self): 134 | if self._parsed_pieces: 135 | return self._parsed_pieces 136 | self._parsed_pieces = [ParsedPiece((self.parsed_piece.piece,), 137 | (self.parsed_piece.fuzzy_rule,))] 
138 | return self._parsed_pieces 139 | 140 | 141 | def view_cls_from_pattern(pattern, is_last_path=False): 142 | """Get ParsedPieceView class from pattern. 143 | 144 | ParsedPieceView type can be deduced from the pattern. 145 | 146 | Args: 147 | pattern (Pattern): The Pattern object. 148 | is_last_path (bool, optional): Defaults to False. Whether the pattern 149 | is at the last path position. 150 | 151 | Returns: 152 | class: The class of ParsedPieceView. 153 | """ 154 | view_cls = PieceView 155 | pattern_units = pattern.pattern_units 156 | if len(pattern_units) == 1: 157 | pattern_unit = pattern_units[0] 158 | if not pattern_unit.is_literal(): 159 | if pattern_unit.num < 0: 160 | view_cls = FuzzyView 161 | else: 162 | view_cls = LengthView 163 | else: 164 | for pattern_unit in pattern_units: 165 | if not pattern_unit.is_literal(): 166 | if len(pattern_unit.rules) > 1: 167 | view_cls = MixedView 168 | break 169 | else: 170 | view_cls = MultiView 171 | if is_last_path \ 172 | and len(pattern_units) == 3 \ 173 | and view_cls != PieceView \ 174 | and len(pattern_units[1].rules) == 1 \ 175 | and pick(pattern_units[1].rules) == BasePatternRule.DOT \ 176 | and not (set(pattern_units[-1].rules) - DIGIT_AND_ASCII_RULE_SET): 177 | view_cls = LastDotSplitFuzzyView 178 | 179 | return view_cls 180 | -------------------------------------------------------------------------------- /src/os_urlpattern/parser.py: -------------------------------------------------------------------------------- 1 | """High-level APIs for parsing. 2 | """ 3 | 4 | from __future__ import unicode_literals 5 | 6 | from .parse_utils import fuzzy_digest as _fuzzy_digest 7 | from .parse_utils import PieceParser, analyze_url, analyze_url_pattern_string 8 | 9 | 10 | def parse(url_or_pattern): 11 | """Parse URL or URL pattern string. 12 | 13 | Args: 14 | url_or_pattern (str): URL or URL pattern. 15 | 16 | Returns: 17 | tuple: 2-tuples, (url_meta, parsed_pieces) 18 | """ 19 | url_meta = None 20 | parsed_pieces = None 21 | if url_or_pattern.startswith('/'): # URL pattern 22 | from .pattern_matcher import MatchPattern 23 | url_meta, pattern_strings = analyze_url_pattern_string(url_or_pattern) 24 | parsed_pieces = tuple([MatchPattern(p, i == url_meta.path_depth) 25 | for i, p in enumerate(pattern_strings, 1)]) 26 | else: # URL 27 | parser = PieceParser() 28 | url_meta, pieces = analyze_url(url_or_pattern) 29 | parsed_pieces = tuple([parser.parse(piece) for piece in pieces]) 30 | 31 | return url_meta, parsed_pieces 32 | 33 | 34 | def fuzzy_digest(*args): 35 | """Generate hex fuzzy digest string from URL or URL pattern. 36 | 37 | Args: 38 | *args: Can be a single argument string, or 2 arguments 39 | URLMeta and objects. 40 | 41 | Returns: 42 | str: Digest value as a string of hexadecimal digits. 43 | """ 44 | l = len(args) 45 | url_meta = None 46 | objs = None 47 | if l == 2: 48 | url_meta, objs = args 49 | elif l == 1: 50 | url_meta, objs = parse(args[0]) 51 | else: 52 | raise ValueError('Not digestable') 53 | return _fuzzy_digest(url_meta, objs) 54 | -------------------------------------------------------------------------------- /src/os_urlpattern/pattern.py: -------------------------------------------------------------------------------- 1 | """Pattern class. 
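
A Pattern wraps a pattern string, exposing its pattern units and a
regex-based match. A minimal illustration (assuming the default rule
set): Pattern('[0-9]+').match('2018') is True.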
2 | """ 3 | from __future__ import unicode_literals 4 | 5 | import re 6 | 7 | from .utils import pick 8 | 9 | 10 | class PatternUnit(object): 11 | """Sub-piece of pattern.""" 12 | 13 | __slots__ = ('pattern_unit_string', 'rules', 'num', '_fuzzy_rule') 14 | 15 | def __init__(self, pattern_unit_string): 16 | self.pattern_unit_string = pattern_unit_string 17 | from .parse_utils import parse_pattern_unit_string 18 | self.rules, self.num = parse_pattern_unit_string(pattern_unit_string) 19 | self._fuzzy_rule = None 20 | 21 | def is_literal(self): 22 | """Whether this unit string is literal or not. 23 | 24 | Note: 25 | According to the char representation, fixed-length 26 | single sign is literal, like: [\\.]{2} [\\-] 27 | 28 | Returns: 29 | bool: Whether it is literal. 30 | """ 31 | 32 | from .definition import DIGIT_AND_ASCII_RULE_SET, Symbols 33 | r = False 34 | if not self.pattern_unit_string.startswith(Symbols.BRACKETS_L): 35 | r = True 36 | elif len(self.rules) == 1: 37 | if self.num > 0: 38 | rule = pick(self.rules) 39 | if rule not in DIGIT_AND_ASCII_RULE_SET: 40 | r = True 41 | return r 42 | 43 | @property 44 | def fuzzy_rule(self): 45 | if self._fuzzy_rule is None: 46 | self._fuzzy_rule = ''.join(sorted(self.rules)) 47 | return self._fuzzy_rule 48 | 49 | def __str__(self): 50 | return ' '.join((self.pattern_unit_string, self.fuzzy_rule, str(self.num))) 51 | 52 | __repr__ = __str__ 53 | 54 | 55 | class Pattern(object): 56 | """Pattern for handle pattern string. """ 57 | 58 | __slots__ = ('pattern_string', '_pattern_regex', 59 | '_pattern_units', '_fuzzy_rule') 60 | 61 | def __init__(self, pattern_string): 62 | self.pattern_string = pattern_string 63 | self._pattern_regex = None 64 | self._pattern_units = None 65 | self._fuzzy_rule = None 66 | 67 | @property 68 | def pattern_units(self): 69 | """tuple: Pattern units.""" 70 | 71 | from .parse_utils import parse_pattern_string 72 | if self._pattern_units is None: 73 | self._pattern_units = tuple([PatternUnit( 74 | u) for u in parse_pattern_string(self.pattern_string)]) 75 | return self._pattern_units 76 | 77 | def __str__(self): 78 | return self.pattern_string 79 | 80 | __repr__ = __str__ 81 | 82 | def __hash__(self): 83 | return hash(self.pattern_string) 84 | 85 | def __eq__(self, o): 86 | return self.pattern_string == o.pattern_string 87 | 88 | def match(self, piece): 89 | if not self._pattern_regex: 90 | self._pattern_regex = re.compile( 91 | ''.join(('^', self.pattern_string, '$'))) 92 | return True if re.match(self._pattern_regex, piece) else False 93 | 94 | @property 95 | def fuzzy_rule(self): 96 | """str: All rules of the pattern join into a string.""" 97 | if self._fuzzy_rule is None: 98 | self._fuzzy_rule = ''.join(sorted(set.union( 99 | *[u.rules for u in self.pattern_units]))) 100 | return self._fuzzy_rule 101 | -------------------------------------------------------------------------------- /src/os_urlpattern/pattern_cluster.py: -------------------------------------------------------------------------------- 1 | """Cluster algorithm. 
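
Typical use (an illustrative sketch)::

    for clustered_root in cluster(config, url_meta, root):
        ...  # each yielded root is a clustered sub piece pattern tree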
2 | """ 3 | from __future__ import unicode_literals 4 | 5 | from collections import Counter, OrderedDict, namedtuple 6 | 7 | from .compat import itervalues 8 | from .parse_utils import (EMPTY_PARSED_PIECE, URLMeta, specify_rule, 9 | wildcard_rule) 10 | from .parsed_piece_view import LastDotSplitFuzzyView, MixedView, MultiView 11 | from .pattern import Pattern 12 | from .piece_pattern_node import (PiecePatternNode, build_from_parsed_pieces, 13 | build_from_piece_pattern_nodes) 14 | from .utils import Bag, cached_property, dump_tree, pick 15 | 16 | 17 | class TBag(Bag): 18 | __slots__ = ('stats',) 19 | 20 | def __init__(self): 21 | super(TBag, self).__init__() 22 | self.stats = Counter() 23 | 24 | @property 25 | def count(self): 26 | return self.stats['count'] 27 | 28 | def add(self, obj): 29 | super(TBag, self).add(obj) 30 | self.stats['count'] += obj.count 31 | 32 | def set_pattern(self, pattern): 33 | for obj in self: 34 | obj.set_pattern(pattern) 35 | 36 | 37 | class TBucket(TBag): 38 | 39 | def __init__(self): 40 | super(TBucket, self).__init__() 41 | self._objs = {} 42 | 43 | def __getitem__(self, key): 44 | return self._objs[key] 45 | 46 | def __contains__(self, key): 47 | return key in self._objs 48 | 49 | def __iter__(self): 50 | return iter(itervalues(self._objs)) 51 | 52 | def add(self, obj): 53 | raise NotImplementedError 54 | 55 | 56 | class PieceBag(TBag): 57 | """A bag contain all of the nodes with same piece. 58 | 59 | The nodes should on the same branch of a tree at the same level. 60 | """ 61 | 62 | __slots__ = ('_p_nodes',) 63 | 64 | def __init__(self): 65 | super(PieceBag, self).__init__() 66 | self._p_nodes = set() 67 | 68 | def add(self, piece_pattern_node): 69 | super(PieceBag, self).add(piece_pattern_node) 70 | self._p_nodes.add(piece_pattern_node.parrent) 71 | self.stats['p_nodes_count'] += piece_pattern_node.parrent.count \ 72 | if piece_pattern_node.parrent is not None \ 73 | else piece_pattern_node.count 74 | 75 | @property 76 | def p_nodes(self): 77 | return self._p_nodes 78 | 79 | 80 | class PieceBagBucket(TBucket): 81 | __slots__ = ('_p_nodes',) 82 | 83 | def __init__(self): 84 | super(PieceBagBucket, self).__init__() 85 | self._p_nodes = set() 86 | 87 | def add(self, obj): 88 | if isinstance(obj, PiecePatternNode): 89 | piece = obj.piece 90 | if piece not in self._objs: 91 | self._objs[piece] = PieceBag() 92 | self._objs[piece].add(obj) 93 | elif isinstance(obj, PieceBag): 94 | piece = obj.pick().piece 95 | if piece in self._objs: 96 | raise ValueError('duplicated') 97 | self._objs[piece] = obj 98 | else: 99 | raise ValueError('not PiecePatternNode nor PieceBag') 100 | 101 | self.stats['count'] += obj.count 102 | 103 | @property 104 | def p_nodes(self): 105 | if not self._p_nodes: 106 | for piece_bag in self: 107 | self._p_nodes.update(piece_bag.p_nodes) 108 | return self._p_nodes 109 | 110 | 111 | class ViewPieceBag(namedtuple('ViewPieceBag', ['view', 'piece_bag'])): 112 | __slots__ = () 113 | 114 | def set_pattern(self, pattern): 115 | return self.piece_bag.set_pattern(pattern) 116 | 117 | 118 | class ViewPieceBagBucket(PieceBagBucket): 119 | __slots__ = ('_url_meta', '_root') 120 | 121 | def __init__(self, url_meta): 122 | super(ViewPieceBagBucket, self).__init__() 123 | self._url_meta = url_meta 124 | self._root = PiecePatternNode((EMPTY_PARSED_PIECE, None)) 125 | 126 | def add(self, view_piece_bag, build_tree=True): 127 | piece_bag = view_piece_bag.piece_bag 128 | self._objs[piece_bag.pick().piece] = view_piece_bag 129 | self.stats['count'] += 
piece_bag.count
130 | 
131 |         if not build_tree:
132 |             return
133 |         view = view_piece_bag.view
134 | 
135 |         build_from_parsed_pieces(
136 |             self._root, view.parsed_pieces, count=piece_bag.count, uniq=False)
137 | 
138 |     def cluster(self, config, **kwargs):
139 |         for clustered in cluster(config, self._url_meta, self._root, **kwargs):
140 |             yield self._transfer(clustered)
141 | 
142 |     def _transfer(self, root):
143 |         pattern = None
144 |         bucket = ViewPieceBagBucket(self._url_meta)
145 |         for nodes in dump_tree(root):
146 |             piece = ''.join([p.piece for p in nodes[1:]])
147 |             view_piece_bag = self[piece]
148 |             bucket.add(view_piece_bag, False)
149 |             if pattern is None:
150 |                 pattern = Pattern(
151 |                     ''.join([str(p.pattern) for p in nodes[1:]]))
152 |         return bucket, pattern
153 | 
154 | 
155 | def confused(total, max_part, threshold):
156 |     """Determine whether it is too complex to become a cluster.
157 | 
158 |     If a data set has only a few (fewer than threshold) parts and the
159 |     largest part takes up most of the total, it is not confused;
160 |     otherwise it is confused.
161 | 
162 |     Args:
163 |         total (int): The total count of the data set.
164 |         max_part (int): The count of the largest part.
165 |         threshold (int): The threshold number.
166 | 
167 |     Returns:
168 |         bool: Whether the data set is confused.
169 |     """
170 |     if total < threshold:
171 |         return False
172 |     o_part = total - max_part
173 |     if max_part >= threshold and o_part >= threshold:
174 |         return True
175 |     return abs(max_part - o_part) < threshold - 1
176 | 
177 | 
178 | class SeekResult(object):
179 |     FOUND = 1
180 |     IMPOSSIBLE = 2
181 |     UNKNOW = 3
182 |     BACKWARD = 4
183 | 
184 | 
185 | class PatternCluster(object):
186 |     """Base class of cluster."""
187 | 
188 |     def __init__(self, processor):
189 |         self._processor = processor
190 |         self._min_cluster_num = processor.config.getint(
191 |             'make', 'min_cluster_num')
192 |         self._patterns = set()
193 | 
194 |     @property
195 |     def pre_level_processor(self):
196 |         return self._processor.pre_level_processor
197 | 
198 |     def cluster(self):
199 |         pass
200 | 
201 |     def add(self, obj):
202 |         pass
203 | 
204 |     @property
205 |     def pattern_num(self):
206 |         return len(self._patterns)
207 | 
208 |     def seek_cluster(self, package):
209 |         return SeekResult.UNKNOW
210 | 
211 | 
212 | class PiecePatternCluster(PatternCluster):
213 | 
214 |     def __init__(self, processor):
215 |         super(PiecePatternCluster, self).__init__(processor)
216 |         self._bucket = PieceBagBucket()
217 | 
218 |     def seek_cluster(self, package):
219 |         p_nodes_count = sum([p.count for p in package.p_nodes])
220 |         if p_nodes_count - package.count >= self._min_cluster_num:
221 |             return SeekResult.IMPOSSIBLE
222 | 
223 |         return SeekResult.UNKNOW
224 | 
225 |     def iter_nodes(self):
226 |         return self._bucket.iter_all()
227 | 
228 |     def add(self, piece_pattern_node):
229 |         self._bucket.add(piece_pattern_node)
230 | 
231 |     def _set_pattern(self, piece_bag, update_patterns=False):
232 |         pattern = Pattern(piece_bag.pick().piece)
233 |         piece_bag.set_pattern(pattern)
234 |         if update_patterns:
235 |             self._patterns.add(pattern)
236 | 
237 |     def cluster(self):
238 |         if not self._bucket:
239 |             return
240 |         processor = self._processor
241 |         if processor.is_last_level() \
242 |                 and 'last_path_as_pattern' in processor.kwargs \
243 |                 and processor.kwargs['last_path_as_pattern']:
244 |             for piece_bag in self._bucket:
245 |                 self._set_pattern(piece_bag, True)
246 |             return
247 | 
248 |         mcn = self._min_cluster_num
249 |         if len(self._bucket) < mcn:
250 |             max_count = max(self._bucket, key=lambda x: x.count).count
251 |             if not confused(self._bucket.count, max_count, mcn):
252 |                 for piece_bag in self._bucket:
253 |                     self._set_pattern(piece_bag, True)
254 |                 return
255 | 
256 |         for piece_bag in self._bucket:
257 |             stats = piece_bag.stats
258 |             count = piece_bag.count
259 |             if count < mcn \
260 |                     or stats['p_nodes_count'] - count >= mcn \
261 |                     or not self.pre_level_processor.seek_cluster(piece_bag):
262 |                 self._set_pattern(piece_bag)
263 | 
self._add_to_forward_cluster(piece_bag) 264 | else: 265 | self._set_pattern(piece_bag, True) 266 | 267 | def _add_to_forward_cluster(self, piece_bag): 268 | parsed_piece = piece_bag.pick().parsed_piece 269 | if len(parsed_piece.pieces) == 1: 270 | self._processor.get_cluster(LengthPatternCluster).add(piece_bag) 271 | return 272 | 273 | view = MultiView(parsed_piece) 274 | p_cls = BasePatternCluster 275 | vl = len(view.parsed_pieces) 276 | 277 | if vl == 3 and self._processor.is_last_path(): 278 | ldsf_view = LastDotSplitFuzzyView(parsed_piece) 279 | if view == ldsf_view: 280 | view = ldsf_view 281 | p_cls = LastDotSplitFuzzyPatternCluster 282 | elif vl > 3: 283 | mixed_view = MixedView(parsed_piece) 284 | mvl = len(mixed_view.parsed_pieces) 285 | if mvl == 1: 286 | self._processor.get_cluster( 287 | LengthPatternCluster).add(piece_bag) 288 | return 289 | elif vl - mvl >= self._min_cluster_num: 290 | if mvl == 3 and self._processor.is_last_path(): 291 | ldsf_view = LastDotSplitFuzzyView(parsed_piece) 292 | if mixed_view == ldsf_view: 293 | view = ldsf_view 294 | p_cls = LastDotSplitFuzzyPatternCluster 295 | else: 296 | view = mixed_view 297 | p_cls = MixedPatternCluster 298 | else: 299 | view = mixed_view 300 | p_cls = MixedPatternCluster 301 | 302 | self._processor.get_cluster(p_cls).add( 303 | ViewPieceBag(view, piece_bag)) 304 | 305 | 306 | class LengthPatternCluster(PatternCluster): 307 | def __init__(self, processor): 308 | super(LengthPatternCluster, self).__init__(processor) 309 | self._length_buckets = {} 310 | 311 | def add(self, piece_bag): 312 | piece_length = piece_bag.pick().parsed_piece.piece_length 313 | if piece_length not in self._length_buckets: 314 | self._length_buckets[piece_length] = PieceBagBucket() 315 | self._length_buckets[piece_length].add(piece_bag) 316 | 317 | def _length_as_cluster(self, length_bucket): 318 | if len(length_bucket) < self._min_cluster_num: 319 | if length_bucket.count < self._min_cluster_num: 320 | return False 321 | max_count = max(length_bucket, key=lambda x: x.count).count 322 | if not confused(length_bucket.count, max_count, self._min_cluster_num): 323 | return False 324 | 325 | return True 326 | 327 | def _update_patterns(self, bucket): 328 | for piece_bag in bucket: 329 | self._patterns.add(piece_bag.pick().pattern) 330 | 331 | def cluster(self): 332 | if not self._length_buckets: 333 | return 334 | mcn = self._min_cluster_num 335 | if len(self._length_buckets) < mcn: 336 | total = sum([c.count for c in itervalues(self._length_buckets)]) 337 | max_bucket = max(itervalues(self._length_buckets), 338 | key=lambda x: x.count) 339 | if not confused(total, max_bucket.count, mcn): 340 | for bucket in itervalues(self._length_buckets): 341 | if self._length_as_cluster(bucket): 342 | self._set_pattern(bucket, True) 343 | else: 344 | self._update_patterns(bucket) 345 | return 346 | 347 | forward_cluster = self._processor.get_cluster(FuzzyPatternCluster) 348 | for length_bucket in itervalues(self._length_buckets): 349 | if self._length_as_cluster(length_bucket): 350 | if self.pre_level_processor.seek_cluster(length_bucket): 351 | self._set_pattern(length_bucket, True) 352 | continue 353 | self._set_pattern(length_bucket) 354 | 355 | forward_cluster.add(length_bucket) 356 | 357 | def _set_pattern(self, length_bucket, update_patterns=False): 358 | parsed_piece = length_bucket.pick().parsed_piece 359 | length = parsed_piece.piece_length 360 | pattern = Pattern(specify_rule(parsed_piece.fuzzy_rule, length)) 361 | length_bucket.set_pattern(pattern) 362 | 
if update_patterns: 363 | self._patterns.add(pattern) 364 | 365 | 366 | class MultiPatternCluster(PatternCluster): 367 | def __init__(self, processor): 368 | super(MultiPatternCluster, self).__init__(processor) 369 | self._buckets = {} 370 | 371 | def cluster(self): 372 | for bucket in itervalues(self._buckets): 373 | if bucket.count < self._min_cluster_num: 374 | self._to_forward_cluster(bucket) 375 | continue 376 | for b, pattern in self._cluster(bucket): 377 | if self._as_cluster(b, pattern): 378 | self._set_pattern(b, pattern) 379 | else: 380 | self._to_forward_cluster(b) 381 | 382 | def _cluster(self, bucket): 383 | for b, pattern in bucket.cluster(self._processor.config): 384 | yield b, pattern 385 | 386 | def _to_forward_cluster(self, bucket): 387 | for view_piece_bag in bucket: 388 | self._add_to_forward_cluster(view_piece_bag) 389 | 390 | def _add_to_forward_cluster(self, view_piece_bag): 391 | pass 392 | 393 | def _as_cluster(self, bucket, pattern): 394 | if bucket.count < self._min_cluster_num: 395 | return False 396 | return True 397 | 398 | def _set_pattern(self, bucket, pattern): 399 | bucket.set_pattern(pattern) 400 | self._patterns.add(pattern) 401 | 402 | def add(self, view_piece_bag): 403 | view = view_piece_bag.view 404 | if view not in self._buckets: 405 | url_meta = URLMeta(len(view.parsed_pieces), [], False) 406 | self._buckets[view] = ViewPieceBagBucket(url_meta) 407 | self._buckets[view].add(view_piece_bag) 408 | 409 | 410 | class BasePatternCluster(MultiPatternCluster): 411 | 412 | def _add_to_forward_cluster(self, view_piece_bag): 413 | view = view_piece_bag.view 414 | piece_bag = view_piece_bag.piece_bag 415 | parsed_piece = piece_bag.pick().parsed_piece 416 | 417 | mixed_view = MixedView(parsed_piece) 418 | mvl = len(mixed_view.parsed_pieces) 419 | 420 | p_cls = MixedPatternCluster 421 | 422 | if view == mixed_view: 423 | if self._processor.is_last_path(): 424 | ldsf_view = LastDotSplitFuzzyView(parsed_piece) 425 | if len(ldsf_view.parsed_pieces) == 1: 426 | self._processor.get_cluster( 427 | LengthPatternCluster).add(piece_bag) 428 | return 429 | else: 430 | view = ldsf_view 431 | p_cls = LastDotSplitFuzzyPatternCluster 432 | else: 433 | self._processor.get_cluster( 434 | LengthPatternCluster).add(piece_bag) 435 | return 436 | else: 437 | view = mixed_view 438 | if mvl == 1: 439 | self._processor.get_cluster( 440 | LengthPatternCluster).add(piece_bag) 441 | return 442 | elif mvl == 3 and self._processor.is_last_path(): 443 | ldsf_view = LastDotSplitFuzzyView(parsed_piece) 444 | if mixed_view == ldsf_view: 445 | view = ldsf_view 446 | p_cls = LastDotSplitFuzzyPatternCluster 447 | 448 | self._processor.get_cluster(p_cls).add( 449 | ViewPieceBag(view, piece_bag)) 450 | 451 | 452 | class MixedPatternCluster(MultiPatternCluster): 453 | 454 | def _add_to_forward_cluster(self, view_piece_bag): 455 | view = view_piece_bag.view 456 | piece_bag = view_piece_bag.piece_bag 457 | parsed_piece = piece_bag.pick().parsed_piece 458 | 459 | if self._processor.is_last_path(): 460 | ldsf_view = LastDotSplitFuzzyView(parsed_piece) 461 | if len(ldsf_view.parsed_pieces) == 1: 462 | self._processor.get_cluster( 463 | LengthPatternCluster).add(piece_bag) 464 | return 465 | else: 466 | view = ldsf_view 467 | p_cls = LastDotSplitFuzzyPatternCluster 468 | else: 469 | self._processor.get_cluster( 470 | LengthPatternCluster).add(piece_bag) 471 | return 472 | 473 | self._processor.get_cluster(p_cls).add( 474 | ViewPieceBag(view, piece_bag)) 475 | 476 | 477 | class 
LastDotSplitFuzzyPatternCluster(MultiPatternCluster): 478 | 479 | def _cluster(self, bucket): 480 | for b, pattern in bucket.cluster(self._processor.config, 481 | last_path_as_pattern=True): 482 | yield b, pattern 483 | 484 | def _add_to_forward_cluster(self, view_piece_bag): 485 | self._processor.get_cluster(LengthPatternCluster).add( 486 | view_piece_bag.piece_bag) 487 | 488 | 489 | class FuzzyPatternCluster(PatternCluster): 490 | def __init__(self, processor): 491 | super(FuzzyPatternCluster, self).__init__(processor) 492 | self._cached = TBag() 493 | self._force_pattern = False 494 | self._fuzzy_pattern = None 495 | 496 | def add(self, bucket): 497 | if self._force_pattern: 498 | self._set_pattern(bucket) 499 | else: 500 | self._cached.add(bucket) 501 | if len(self._cached) >= self._min_cluster_num: 502 | self._force_pattern = True 503 | 504 | def _update_patterns(self): 505 | for bucket in self._cached: 506 | for piece_bag in bucket: 507 | self._patterns.add(piece_bag.pick().pattern) 508 | 509 | def cluster(self): 510 | if self._force_pattern: 511 | self._set_pattern(self._cached) 512 | else: 513 | if self._cached.count < self._min_cluster_num: 514 | self._update_patterns() 515 | return 516 | max_count = max(self._cached, key=lambda x: x.count).count 517 | if confused(self._cached.count, max_count, self._min_cluster_num): 518 | self._set_pattern(self._cached) 519 | else: 520 | self._update_patterns() 521 | 522 | def _set_pattern(self, package): 523 | if self._fuzzy_pattern is None: 524 | self._fuzzy_pattern = Pattern( 525 | wildcard_rule(package.pick().parsed_piece.fuzzy_rule)) 526 | self._patterns.add(self._fuzzy_pattern) 527 | package.set_pattern(self._fuzzy_pattern) 528 | 529 | 530 | CLUSTER_CLASSES = [PiecePatternCluster, 531 | BasePatternCluster, 532 | MixedPatternCluster, 533 | LastDotSplitFuzzyPatternCluster, 534 | LengthPatternCluster, 535 | FuzzyPatternCluster] 536 | 537 | 538 | class ClusterProcessor(object): 539 | def __init__(self, config, url_meta, pre_level_processor, **kwargs): 540 | self._config = config 541 | self._url_meta = url_meta 542 | self._pattern_clusters = OrderedDict( 543 | [(c.__name__, c(self)) for c in CLUSTER_CLASSES]) 544 | self._pre_level_processor = pre_level_processor 545 | self._next_level_processors = {} 546 | self._kwargs = kwargs 547 | 548 | @cached_property 549 | def level(self): 550 | l = 0 551 | n = self.pre_level_processor 552 | while n is not None: 553 | l += 1 554 | n = n.pre_level_processor 555 | return l 556 | 557 | def is_last_level(self): 558 | return self._url_meta.depth == self.level 559 | 560 | def is_last_path(self): 561 | return self._url_meta.path_depth == self.level 562 | 563 | @property 564 | def kwargs(self): 565 | return self._kwargs 566 | 567 | @property 568 | def next_level_processors(self): 569 | return self._next_level_processors.values() 570 | 571 | def _backward_package(self, package): 572 | bucket = PieceBagBucket() 573 | for p_node in package.p_nodes: 574 | if p_node.piece in bucket: 575 | continue 576 | bucket.add(p_node) 577 | return bucket 578 | 579 | def seek_cluster(self, package): 580 | if self._pre_level_processor is None: 581 | return False 582 | for c in itervalues(self._pattern_clusters): 583 | res = c.seek_cluster(package) 584 | if res == SeekResult.FOUND: 585 | return True 586 | elif res == SeekResult.IMPOSSIBLE: 587 | break 588 | elif res == SeekResult.BACKWARD: 589 | pack = self._backward_package(package) 590 | return self._pre_level_processor.seek_cluster(pack) 591 | elif res == SeekResult.UNKNOW: 592 | 
continue 593 | else: 594 | raise ValueError('invalid seek result') 595 | 596 | return False 597 | 598 | def get_cluster(self, cluster_cls): 599 | return self._pattern_clusters[cluster_cls.__name__] 600 | 601 | @property 602 | def config(self): 603 | return self._config 604 | 605 | @property 606 | def pre_level_processor(self): 607 | return self._pre_level_processor 608 | 609 | def _process(self): 610 | for c in itervalues(self._pattern_clusters): 611 | c.cluster() 612 | 613 | def add(self, node, add_children=False): 614 | c = self.get_cluster(PiecePatternCluster) 615 | if add_children: 616 | for child in node.children: 617 | c.add(child) 618 | else: 619 | c.add(node) 620 | 621 | @property 622 | def pattern_num(self): 623 | return sum([c.pattern_num for c in itervalues(self._pattern_clusters)]) 624 | 625 | def process(self): 626 | self._process() 627 | if self.is_last_level(): 628 | return 629 | 630 | self._create_next_level_processors() 631 | 632 | for processor in itervalues(self._next_level_processors): 633 | processor.process() 634 | 635 | def _create_next_level_processors(self): 636 | 637 | pp_cluster = self.get_cluster(PiecePatternCluster) 638 | processors = self._next_level_processors 639 | 640 | for node in pp_cluster.iter_nodes(): 641 | pattern = node.pattern 642 | if pattern not in processors: 643 | processors[pattern] = ClusterProcessor( 644 | self._config, 645 | self._url_meta, 646 | self, **self.kwargs) 647 | processor = processors[pattern] 648 | processor.add(node, add_children=True) 649 | 650 | 651 | def split_by_pattern(root): 652 | """Split the piece pattern tree by pattern path. 653 | 654 | Args: 655 | root (PiecePatternNode): The root of piece pattern tree. 656 | 657 | Returns: 658 | iterator: Iterator of sub-trees. 659 | """ 660 | tree_roots = {} 661 | for nodes in dump_tree(root): 662 | pid = hash("/".join([str(p.pattern) for p in nodes])) 663 | if pid not in tree_roots: 664 | tree_roots[pid] = PiecePatternNode((EMPTY_PARSED_PIECE, None)) 665 | sub_root = tree_roots[pid] 666 | build_from_piece_pattern_nodes(sub_root, nodes[1:]) 667 | 668 | return itervalues(tree_roots) 669 | 670 | 671 | def _can_be_splited(processor): 672 | """Check whether the processor tree can be splited. 673 | 674 | Args: 675 | processor (ClusterProcessor): The root node of cluster processor. 676 | 677 | Returns: 678 | bool: Whether the processor tree can be splited. 679 | """ 680 | while True: 681 | pattern_num = processor.pattern_num 682 | if pattern_num > 1: 683 | return True 684 | l = len(processor.next_level_processors) 685 | if l <= 0: 686 | break 687 | elif l > 1: 688 | return True 689 | processor = pick(processor.next_level_processors) 690 | 691 | return False 692 | 693 | 694 | def process(config, url_meta, root, **kwargs): 695 | """Start clustering. 696 | 697 | Args: 698 | config (Config): The configure object. 699 | url_meta (URLMeta): The URLMeta object. 700 | root (PiecePatternNode): The root of the piece pattern tree. 701 | **kwargs: Keyword arguments. 702 | 703 | Returns: 704 | bool: Whether the clustered tree can be split. 705 | """ 706 | processor = ClusterProcessor(config, url_meta, None, **kwargs) 707 | processor.add(root) 708 | processor.process() 709 | return _can_be_splited(processor) 710 | 711 | 712 | def cluster(config, url_meta, root, **kwargs): 713 | """Entrance of the cluster workflow. 714 | 715 | Args: 716 | config (Config): The configure object. 717 | url_meta (URLMeta): The URLMeta object. 718 | root (PiecePatternNode): The root of the piece pattern tree. 
719 |         **kwargs: Keyword arguments.
720 | 
721 |     Yields:
722 |         PiecePatternNode: The clustered sub piece pattern tree root.
723 | 
724 |     """
725 |     if root.count <= 0:
726 |         return
727 |     if not process(config, url_meta, root, **kwargs):
728 |         yield root
729 |         return
730 |     for sub_root in split_by_pattern(root):
731 |         for clustered in cluster(config, url_meta, sub_root, **kwargs):
732 |             yield clustered
733 | 
-------------------------------------------------------------------------------- /src/os_urlpattern/pattern_maker.py: --------------------------------------------------------------------------------
1 | """Pattern clustering procedure APIs.
2 | """
3 | from .compat import itervalues
4 | from .config import get_default_config
5 | from .definition import BasePattern
6 | from .parse_utils import EMPTY_PARSED_PIECE, ParsedPiece
7 | from .parser import fuzzy_digest, parse
8 | from .pattern_cluster import cluster
9 | from .piece_pattern_node import PiecePatternNode, build_from_parsed_pieces
10 | from .utils import TreeNode, build_tree, dump_tree, pick
11 | 
12 | 
13 | class PatternMaker(object):
14 |     """Scaffold that simplifies clustering.
15 | 
16 |     After loading URLs, iterate over all sub makers to cluster each
17 |     one individually, or cluster them all by calling the make method.
18 |     """
19 | 
20 |     def __init__(self, config=None):
21 |         self._config = get_default_config() if config is None else config
22 |         self._makers = {}
23 | 
24 |     @property
25 |     def makers(self):
26 |         """iterable: For iterating all sub makers."""
27 |         return itervalues(self._makers)
28 | 
29 |     def load(self, url, meta=None):
30 |         """Load url and meta.
31 | 
32 |         Args:
33 |             url (str): The URL to be loaded.
34 |             meta (object, optional): Defaults to None. Meta data will be
35 |                 merged at each cluster and can be accessed by clustered
36 |                 node's meta property.
37 | 
38 |         Returns:
39 |             tuple: 2-tuple, (node, is_new).
40 |         """
41 |         url_meta, parsed_pieces = parse(url)
42 |         if not isinstance(parsed_pieces[0], ParsedPiece):
43 |             raise ValueError('Invalid URL')
44 |         sid = fuzzy_digest(url_meta, parsed_pieces)
45 |         if sid not in self._makers:
46 |             self._makers[sid] = Maker(url_meta, self._config)
47 |         return self._makers[sid].load(parsed_pieces, meta=meta)
48 | 
49 |     def make(self, combine=False):
50 |         """Iterate all sub makers, start clustering and yield clustered.
51 | 
52 |         Args:
53 |             combine (bool, optional): Defaults to False. Combine the
54 |                 same url_meta clusters into a pattern tree.
55 | 
56 |         Yields:
57 |             tuple: 2-tuple, (url_meta, clustered). The clustered is the
58 |                 root of a clustered tree.
59 |         """
60 |         for maker in self.makers:
61 |             for clustered in maker.make(combine):
62 |                 yield maker.url_meta, clustered
63 | 
64 | 
65 | class Maker(object):
66 |     """Low-level APIs for clustering.
67 | 
68 |     Suppose this will only be used for same fuzzy-digest clustering.
69 |     """
70 | 
71 |     def __init__(self, url_meta, config=None):
72 |         self._url_meta = url_meta
73 |         self._config = get_default_config() if config is None else config
74 |         self._root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
75 | 
76 |     @property
77 |     def url_meta(self):
78 |         """URLMeta: The URLMeta object."""
79 |         return self._url_meta
80 | 
81 |     def load(self, parsed_pieces, meta=None):
82 |         """Load parsed pieces and meta.
83 | 
84 |         Args:
85 |             parsed_pieces (list): The parsed pieces to be loaded.
86 |             meta (object, optional): Defaults to None. Meta data will be
87 |                 merged at each cluster and can be accessed by clustered
88 |                 node's meta property.
89 | 
90 |         Returns:
91 |             tuple: 2-tuple, (node, is_new).
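
        Example (illustrative, via the high-level PatternMaker, which
        parses URLs and routes them to the right Maker):

            >>> pm = PatternMaker()
            >>> node, is_new = pm.load('http://example.com/abc/123')
            >>> for url_meta, clustered in pm.make(combine=True):
            ...     pass  # walk the combined pattern tree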
92 | """ 93 | return build_from_parsed_pieces(self._root, 94 | parsed_pieces, 95 | meta=meta) 96 | 97 | def _cluster(self): 98 | for clustered in cluster(self._config, 99 | self._url_meta, 100 | self._root): 101 | yield clustered 102 | 103 | def _combine_clusters(self): 104 | root = TreeNode(BasePattern.EMPTY) 105 | for clustered in self._cluster(): 106 | nodes = pick(dump_tree(clustered)) 107 | build_tree(root, [(n.pattern, n.pattern) 108 | for n in nodes[1:]], nodes[0].count) 109 | 110 | yield root 111 | 112 | def make(self, combine=False): 113 | """Start clustering and yield clustered. 114 | 115 | Args: 116 | combine (bool, optional): Defaults to False. Combine the 117 | clusters into a patten tree. 118 | 119 | Yields: 120 | TreeNode: Root of the clustered tree. If combine=False yield 121 | all clustered parsed piece trees otherwise yield a 122 | combined pattern tree. 123 | """ 124 | if combine: 125 | return self._combine_clusters() 126 | return self._cluster() 127 | -------------------------------------------------------------------------------- /src/os_urlpattern/pattern_matcher.py: -------------------------------------------------------------------------------- 1 | """Pattern matching APIs. 2 | """ 3 | from __future__ import unicode_literals 4 | 5 | from functools import total_ordering 6 | 7 | from .definition import BasePatternRule 8 | from .parse_utils import MIXED_RULE_SET, PieceParser, fuzzy_join 9 | from .parsed_piece_view import (FuzzyView, LastDotSplitFuzzyView, LengthView, 10 | MixedView, MultiView, PieceView, 11 | view_cls_from_pattern) 12 | from .parser import fuzzy_digest, parse 13 | from .pattern import Pattern 14 | from .utils import TreeNode, build_tree 15 | 16 | 17 | @total_ordering 18 | class MatchPattern(Pattern): 19 | """Pattern used for matching. 20 | 21 | It is comparable and has a view_cls property to 22 | identify the pattern type. 23 | """ 24 | __slots__ = ('view_cls', '_cmp_key') 25 | 26 | def __init__(self, pattern_string, is_last_path=False): 27 | super(MatchPattern, self).__init__(pattern_string) 28 | self.view_cls = view_cls_from_pattern(self, is_last_path) 29 | self._cmp_key = None 30 | 31 | @property 32 | def cmp_key(self): 33 | """str: Used for sort.""" 34 | 35 | if self._cmp_key is None: 36 | l = [MatchPattern(u.pattern_unit_string) 37 | for u in reversed(self.pattern_units)] 38 | self._cmp_key = ''.join([str(VIEW_ORDER[p.view_cls]) for p in l]) 39 | return self._cmp_key 40 | 41 | def __ne__(self, other): 42 | return self.pattern_string != other.pattern_string 43 | 44 | def __lt__(self, other): 45 | if self.view_cls == other.view_cls: 46 | return self.cmp_key > other.cmp_key 47 | return VIEW_ORDER[self.view_cls] > VIEW_ORDER[other.view_cls] 48 | 49 | 50 | EMPTY_MATCH_PATTERN = MatchPattern(BasePatternRule.EMPTY) 51 | 52 | 53 | class ViewMatcher(object): 54 | """Base class for different type of view matcher. 55 | 56 | Init with a specified ParsedPieceView class. 57 | Filled with same view-type match node. 58 | Get all matched nodes. 
59 | """ 60 | __slots__ = ('view_cls', '_matchers') 61 | 62 | def __init__(self, view_cls): 63 | self.view_cls = view_cls 64 | self._matchers = {} 65 | 66 | def add_match_node(self, match_node): 67 | pass 68 | 69 | def match(self, parsed_piece): 70 | view = self.view_cls(parsed_piece) 71 | if view.view not in self._matchers: 72 | return [] 73 | parsed_pieces = view.parsed_pieces 74 | matched_result = [] 75 | self._matchers[view.view].match( 76 | parsed_pieces, 0, matched_result) 77 | return [n.meta for n in matched_result] 78 | 79 | 80 | class PiecePatternViewMatcher(ViewMatcher): 81 | 82 | def add_match_node(self, match_node): 83 | if match_node.pattern.pattern_string not in self._matchers: 84 | self._matchers[match_node.pattern.pattern_string] = [match_node] 85 | 86 | def match(self, parsed_piece): 87 | return [] if parsed_piece.piece not in self._matchers \ 88 | else self._matchers[parsed_piece.piece] 89 | 90 | 91 | class LengthPatternViewMatcher(ViewMatcher): 92 | 93 | def add_match_node(self, match_node): 94 | length = match_node.pattern.pattern_units[0].num 95 | self._matchers[length] = [match_node] 96 | 97 | def match(self, parsed_piece): 98 | return [] if parsed_piece.piece_length not in self._matchers \ 99 | else self._matchers[parsed_piece.piece_length] 100 | 101 | 102 | class MultiPatternViewMatcher(ViewMatcher): 103 | 104 | def add_match_node(self, match_node): 105 | pattern = match_node.pattern 106 | r = fuzzy_join(pattern.pattern_units) 107 | if r not in self._matchers: 108 | self._matchers[r] = PatternMatchNode(EMPTY_MATCH_PATTERN) 109 | patterns = [MatchPattern(p.pattern_unit_string) 110 | for p in pattern.pattern_units] 111 | matcher = self._matchers[r] 112 | build_tree(matcher, patterns, meta=match_node) 113 | 114 | 115 | class MixedPatternViewMatcher(MultiPatternViewMatcher): 116 | 117 | def _pattern(self, pattern_units): 118 | return MatchPattern(''.join([p.pattern_unit_string for p in pattern_units])) 119 | 120 | def add_match_node(self, match_node): 121 | patterns = [] 122 | t = [] 123 | for pattern_unit in match_node.pattern.pattern_units: 124 | if not pattern_unit.is_literal() \ 125 | or pattern_unit.fuzzy_rule not in MIXED_RULE_SET: 126 | if t: 127 | patterns.append(self._pattern(t)) 128 | t = [] 129 | patterns.append(self._pattern([pattern_unit])) 130 | else: 131 | t.append(pattern_unit) 132 | 133 | if t: 134 | patterns.append(self._pattern(t)) 135 | 136 | r = fuzzy_join(patterns) 137 | if r not in self._matchers: 138 | self._matchers[r] = PatternMatchNode(EMPTY_MATCH_PATTERN) 139 | matcher = self._matchers[r] 140 | build_tree(matcher, patterns, meta=match_node) 141 | 142 | 143 | class FuzzyPatternViewMatcher(ViewMatcher): 144 | 145 | def __init__(self, view_cls): 146 | super(FuzzyPatternViewMatcher, self).__init__(view_cls) 147 | self._matchers = [] 148 | 149 | def add_match_node(self, match_node): 150 | self._matchers.append(match_node) 151 | 152 | def match(self, parsed_piece): 153 | return self._matchers 154 | 155 | 156 | VIEW_MATCHERS = [ 157 | (PieceView, PiecePatternViewMatcher), 158 | (MultiView, MultiPatternViewMatcher), 159 | (MixedView, MultiPatternViewMatcher), 160 | (LastDotSplitFuzzyView, MultiPatternViewMatcher), 161 | (LengthView, LengthPatternViewMatcher), 162 | (FuzzyView, FuzzyPatternViewMatcher), 163 | ] 164 | 165 | VIEW_ORDER = dict([(item[0], _idx) for _idx, item in enumerate(VIEW_MATCHERS)]) 166 | 167 | 168 | def get_view_matcher_cls(view_cls): 169 | """Get specified ViewMatcher class from ParsedPieceView class. 
170 | 
171 |     Args:
172 |         view_cls (ParsedPieceView): Class of a specified ParsedPieceView.
173 | 
174 |     Returns:
175 |         class(ViewMatcher): The corresponding ViewMatcher class.
176 |     """
177 |     idx = VIEW_ORDER[view_cls]
178 |     return VIEW_MATCHERS[idx][1]
179 | 
180 | 
181 | @total_ordering
182 | class PatternMatchNode(TreeNode):
183 |     """Node for building a match tree."""
184 | 
185 |     __slots__ = ('_view_matchers',)
186 | 
187 |     def __init__(self, value):
188 |         super(PatternMatchNode, self).__init__(value)
189 |         self._view_matchers = []
190 | 
191 |     @property
192 |     def view_cls(self):
193 |         return self.pattern.view_cls
194 | 
195 |     def match(self, parsed_pieces, idx, matched_nodes):
196 |         """Depth-first search for all matched nodes.
197 | 
198 |         If a path from root to leaf matches all the corresponding pieces,
199 |         the leaf node is called a matched node. This method should be
200 |         called on the root node, with idx=0 and a list which will be
201 |         filled with all matched nodes.
202 | 
203 |         Args:
204 |             parsed_pieces (sequence): All of the parsed pieces to be matched.
205 |             idx (int): Index of the piece, within the whole parsed pieces,
206 |                 that should try to match this node.
207 |             matched_nodes (list of PatternMatchNode): Filled with all of the
208 |                 matched leaf nodes.
209 |         """
210 |         parsed_piece = parsed_pieces[idx]
211 |         for matcher in self._view_matchers:
212 |             nodes = matcher.match(parsed_piece)
213 |             if not nodes:
214 |                 continue
215 |             if nodes[0].leaf():
216 |                 matched_nodes.extend(nodes)
217 |                 continue
218 |             self._deep_match(nodes, parsed_pieces, idx + 1,
219 |                              matched_nodes)
220 | 
221 |     def _deep_match(self, nodes, parsed_pieces, idx, matched_nodes):
222 |         for node in nodes:
223 |             node.match(parsed_pieces, idx, matched_nodes)
224 | 
225 |     def _get_matcher(self, view_cls):
226 |         s = 0
227 |         e = len(self._view_matchers)
228 |         while e > s:
229 |             t = (e - s) // 2 + s
230 |             matcher = self._view_matchers[t]
231 |             if matcher.view_cls == view_cls:
232 |                 return matcher
233 |             tid = VIEW_ORDER[matcher.view_cls]
234 |             vid = VIEW_ORDER[view_cls]
235 |             if tid < vid:
236 |                 s = t + 1
237 |             else:
238 |                 e = t
239 | 
240 |         matcher = get_view_matcher_cls(view_cls)(view_cls)
241 |         self._view_matchers.insert(e, matcher)
242 |         return matcher
243 | 
244 |     @property
245 |     def pattern(self):
246 |         return self.value
247 | 
248 |     def add_child(self, pattern):
249 |         child, is_new = super(PatternMatchNode, self).add_child(
250 |             (pattern, pattern))
251 |         if is_new:
252 |             matcher = self._get_matcher(child.view_cls)
253 |             matcher.add_match_node(child)
254 |         return child, is_new
255 | 
256 |     def __lt__(self, other):
257 |         if id(self) == id(other) or self.parrent is None:
258 |             return False
259 |         if self.pattern == other.pattern:
260 |             return self.parrent < other.parrent
261 |         return self.pattern < other.pattern
262 | 
263 | 
264 | class PatternMatcher(object):
265 |     """Offer match processing APIs.
266 | 
267 |     Common procedure:
268 |         1. Init a PatternMatcher.
269 |         2. Load pattern string.
270 |         3. Match url.
271 |     """
272 | 
273 |     def __init__(self):
274 |         self._parser = PieceParser()
275 |         self._matchers = {}
276 | 
277 |     def load(self, url_pattern_string, meta=None):
278 |         """Load URL pattern string.
279 | 
280 |         Args:
281 |             url_pattern_string (str): URL pattern string.
282 |             meta (any, optional): Defaults to None. It will bind to
283 |                 matched result's meta property.
284 | 
285 |         Returns:
286 |             tuple: 2-tuple, (node, is_new).
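
        Example (illustrative):

            >>> pattern_matcher = PatternMatcher()
            >>> node, is_new = pattern_matcher.load('/foo/[0-9]+', meta='m1')
            >>> [n.meta for n in pattern_matcher.match('http://example.com/foo/123')]
            ['m1']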
287 |         """
288 |         url_meta, parsed_patterns = parse(url_pattern_string)
289 |         if not isinstance(parsed_patterns[0], MatchPattern):
290 |             raise ValueError('Invalid URL pattern')
291 |         sid = fuzzy_digest(url_meta, parsed_patterns)
292 |         if sid not in self._matchers:
293 |             self._matchers[sid] = Matcher(url_meta)
294 |         matcher = self._matchers[sid]
295 |         return matcher.load(parsed_patterns, meta=meta)
296 | 
297 |     def match(self, url):
298 |         """Match url, get the matched results.
299 | 
300 |         Args:
301 |             url (str): The URL to be matched.
302 | 
303 |         Returns:
304 |             list: List of matched pattern nodes, if no match return [].
305 |                 Bound meta data can be accessed with node.meta.
306 |         """
307 |         url_meta, parsed_pieces = parse(url)
308 |         sid = fuzzy_digest(url_meta, parsed_pieces)
309 |         if sid in self._matchers:
310 |             return self._matchers[sid].match(parsed_pieces)
311 |         return []
312 | 
313 | 
314 | class Matcher(object):
315 |     """Low-level APIs for matching.
316 | 
317 |     Suppose this will only be used for same fuzzy-digest matching.
318 |     """
319 | 
320 |     def __init__(self, url_meta):
321 |         self._url_meta = url_meta
322 |         self._root = PatternMatchNode(EMPTY_MATCH_PATTERN)
323 | 
324 |     @property
325 |     def url_meta(self):
326 |         """URLMeta: The URLMeta object."""
327 |         return self._url_meta
328 | 
329 |     def match(self, parsed_pieces):
330 |         """Match URL parsed pieces.
331 | 
332 |         Args:
333 |             parsed_pieces (sequence): URL parsed pieces.
334 | 
335 |         Returns:
336 |             list: List of matched pattern nodes, if no match return [].
337 |                 Bound meta data can be accessed with node.meta.
338 |         """
339 | 
340 |         matched_nodes = []
341 |         self._root.match(parsed_pieces, 0, matched_nodes)
342 |         return matched_nodes
343 | 
344 |     def load(self, parsed_patterns, meta=None):
345 |         """Load from parsed URL pattern.
346 | 
347 |         Args:
348 |             parsed_patterns (sequence): MatchNodes.
349 |             meta (any, optional): Defaults to None. It will bind to
350 |                 matched result's meta property.
351 | 
352 |         Returns:
353 |             tuple: 2-tuple, (node, is_new).
354 |         """
355 |         return build_tree(self._root, parsed_patterns, meta=meta)
356 | 
-------------------------------------------------------------------------------- /src/os_urlpattern/piece_pattern_node.py: --------------------------------------------------------------------------------
1 | """Raw parsed piece tree.
2 | 
3 | Build a tree from the parsed URL pieces.
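
A minimal sketch of how such a tree is built (illustrative)::

    root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
    build_from_parsed_pieces(root, parsed_pieces)  # pieces of one URL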
4 | """ 5 | from __future__ import unicode_literals 6 | 7 | from .compat import itervalues 8 | from .parse_utils import EMPTY_PARSED_PIECE 9 | from .pattern import Pattern 10 | from .utils import TreeNode, build_tree 11 | 12 | 13 | class PiecePatternNode(TreeNode): 14 | """Node for building raw piece tree.""" 15 | 16 | __slots__ = ('_pattern',) 17 | 18 | def __init__(self, parsed_piece_and_pattern): 19 | parsed_piece, self._pattern = parsed_piece_and_pattern 20 | super(PiecePatternNode, self).__init__(parsed_piece) 21 | 22 | def set_pattern(self, pattern): 23 | self._pattern = pattern 24 | 25 | @property 26 | def pattern(self): 27 | if self._pattern is None: 28 | self._pattern = Pattern(self.piece) 29 | return self._pattern 30 | 31 | @property 32 | def piece(self): 33 | return self.parsed_piece.piece 34 | 35 | @property 36 | def parsed_piece(self): 37 | return self.value 38 | 39 | @property 40 | def children_num(self): 41 | return len(self._children) 42 | 43 | def incr_count(self, count, recur=False): 44 | self.count += count 45 | node = self.parrent if recur else None 46 | while node: 47 | node.incr_count(count) 48 | node = node.parrent 49 | 50 | def __str__(self): 51 | return ' '.join((self.piece, str(self.pattern))) 52 | 53 | def add_meta(self, data): 54 | if data is None: 55 | return 56 | if self.meta is None: 57 | self.meta = set() 58 | self.meta.add(data) 59 | 60 | def update_meta(self, data): 61 | if not data: 62 | return 63 | if self.meta is None: 64 | self.meta = set() 65 | self.meta.update(data) 66 | 67 | 68 | def build_from_parsed_pieces(root, parsed_pieces, count=1, meta=None, uniq=True): 69 | """Build piece pattern tree from parsed pieces. 70 | 71 | Args: 72 | root (PiecePatternNode): The root node of the a tree. 73 | parsed_pieces (sequence): The parsed pieces. 74 | count (int, optional): Defaults to 1. 75 | meta ([type], optional): Defaults to None. The meta data will bind to the leaf node. 76 | uniq (bool, optional): Defaults to True. The duplicated node edge will not add. 77 | 78 | Returns: 79 | tuple: 2-tuple, (leaf_node, is_new) 80 | """ 81 | node, is_new = build_tree(root, [(parsed_piece.piece, (parsed_piece, None)) 82 | for parsed_piece in parsed_pieces], count) 83 | if uniq and not is_new: 84 | node.incr_count(0 - count, True) 85 | node.add_meta(meta) 86 | return node, is_new 87 | 88 | 89 | def build_from_piece_pattern_nodes(root, piece_pattern_nodes): 90 | """Build piece pattern tree from piece pattern tree edge. 91 | 92 | Args: 93 | root (PiecePatternNode): The root node of the a tree. 94 | piece_pattern_nodes (sequence): piece pattern tree edge. 95 | 96 | Returns: 97 | tuple: 2-tuple, (leaf_node, is_new) 98 | """ 99 | last = piece_pattern_nodes[-1] 100 | node, is_new = build_tree(root, [(p.piece, (p.parsed_piece, p.pattern)) 101 | for p in piece_pattern_nodes], last.count) 102 | node.update_meta(last.meta) 103 | return node, is_new 104 | -------------------------------------------------------------------------------- /src/os_urlpattern/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities. 2 | """ 3 | import inspect 4 | import logging 5 | import math 6 | import os 7 | import time 8 | from functools import partial 9 | 10 | from .compat import iteritems, itervalues 11 | 12 | 13 | def pretty_counter(counter): 14 | """Format a dict like object. 15 | 16 | Args: 17 | counter (dict): The dict like object to be formatted. 18 | 19 | Returns: 20 | str: Formatted string. 
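
    Example (illustrative):

        >>> pretty_counter({'count': 3, 'p_nodes_count': 5})
        'count:3, p_nodes_count:5'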
21 |     """
22 | 
23 |     return ", ".join(['{0}:{1}'.format(k, v) for k, v in iteritems(counter)])
24 | 
25 | 
26 | def pick(iterable):
27 |     """Get the first object from an iterable, or None if it is empty."""
28 | 
29 |     for obj in iterable:
30 |         return obj
31 | 
32 | 
33 | class Bag(object):
34 |     """Container of unique objects.
35 | 
36 |     The objects in the bag can themselves be Bag instances.
37 |     Use the pick method to get an innermost object.
38 |     Use the iter_all method to iterate over the objects inside all nested bags.
39 |     """
40 | 
41 |     __slots__ = ('_objs',)
42 | 
43 |     def __init__(self):
44 |         self._objs = set()
45 | 
46 |     def add(self, obj):
47 |         self._objs.add(obj)
48 | 
49 |     def __len__(self):
50 |         return len(self._objs)
51 | 
52 |     def pick(self):
53 |         obj = pick(self)
54 |         while isinstance(obj, Bag):
55 |             obj = pick(obj)
56 |         return obj
57 | 
58 |     def __iter__(self):
59 |         return iter(self._objs)
60 | 
61 |     def iter_all(self):
62 |         for obj in self:
63 |             if isinstance(obj, Bag):
64 |                 for o in obj.iter_all():
65 |                     yield o
66 |             else:
67 |                 yield obj
68 | 
69 | 
70 | class TreeNode(object):
71 |     """Node of a tree."""
72 | 
73 |     __slots__ = ('parrent', '_children', 'count',
74 |                  'value', 'meta', '_level')
75 | 
76 |     def __init__(self, value):
77 |         self.parrent = None
78 |         self.count = 0
79 |         self.value = value
80 |         self.meta = None
81 |         self._level = None
82 |         self._children = None
83 | 
84 |     def leaf(self):
85 |         return not self._children
86 | 
87 |     @property
88 |     def level(self):
89 |         """int: The level from root."""
90 |         if self._level is None:
91 |             l = 0
92 |             n = self.parrent
93 |             while n is not None:
94 |                 l += 1
95 |                 n = n.parrent
96 |             self._level = l
97 |         return self._level
98 | 
99 |     @property
100 |     def children(self):
101 |         return itervalues(self._children if self._children is not None else {})
102 | 
103 |     def add_child(self, kv):
104 |         """Add a node to the children data set.
105 | 
106 |         Args:
107 |             kv (pair): Key-value object, the key is used to identify
108 |                 a unique node, the value is the node's data.
109 | 
110 |         Returns:
111 |             tuple: 2-tuple, (node, is_new).
112 |         """
113 | 
114 |         if self._children is None:
115 |             self._children = {}
116 |         k, v = kv
117 |         is_new = False
118 |         if k not in self._children:
119 |             self._children[k] = self.__class__(v)
120 |             self._children[k].parrent = self
121 |             is_new = True
122 |         child = self._children[k]
123 |         return child, is_new
124 | 
125 | 
126 | def build_tree(root, kv_sequence, count=1, meta=None):
127 |     """Build a tree.
128 | 
129 |     This method calls each node's add_child(kv) to build the tree.
130 | 
131 |     Args:
132 |         root (TreeNode): Root node of a tree.
133 |         kv_sequence (sequence): Objects used to build the tree.
134 |         count (int, optional): Defaults to 1. Each visited node's count is increased by this amount.
135 |         meta (any, optional): Defaults to None. Will bind to the leaf node.
136 | 
137 |     Returns:
138 |         tuple: 2-tuple, (node, is_new)
139 |     """
140 |     node, is_new = root, False
141 |     node.count += count
142 |     for kv in kv_sequence:
143 |         node, is_new = node.add_child(kv)
144 |         node.count += count
145 |     if meta is not None:
146 |         node.meta = meta
147 | 
148 |     return node, is_new
149 | 
150 | 
151 | def dump_tree(root):
152 |     """Dump each path of a tree.
153 | 
154 |     Args:
155 |         root (TreeNode): The root node of a tree.
156 | 
157 |     Yields:
158 |         list: Nodes from root to leaf, forming one path.
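
    Example (a sketch; assumes a tree already built with build_tree or the
    piece pattern helpers):
        for nodes in dump_tree(root):
            path = [str(n.value) for n in nodes[1:]]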
159 |     """
160 |     olist = []
161 | 
162 |     def _dump(node, _nodes):
163 |         _nodes.append(node)
164 |         if node.leaf():
165 |             yield _nodes
166 |             return
167 |         for child in node.children:
168 |             for nodes in _dump(child, _nodes):
169 |                 yield nodes
170 |             _nodes.pop(-1)
171 | 
172 |     for nodes in _dump(root, olist):
173 |         yield nodes
174 | 
175 | 
176 | class LogSpeedAdapter(logging.LoggerAdapter):
177 |     """Logger adapter for speed logging.
178 | 
179 |     Log only once every `interval` calls, including the total
180 |     count and the average speed.
181 |     Can be used as a context manager when logging inside huge loops.
182 | 
183 |     """
184 | 
185 |     def __init__(self, logger, interval):
186 |         super(LogSpeedAdapter, self).__init__(logger, {})
187 |         self._count = 0
188 |         assert interval > 0
189 |         self._interval = interval
190 |         self._start_time = time.time()
191 |         self._replace()
192 | 
193 |     def _replace(self):
194 |         for name in ['debug', 'info', 'warning', 'error', 'exception', 'critical']:
195 |             setattr(self, name, partial(self._log, name))
196 |         self.log = self._log
197 | 
198 |     def _log(self, name, msg, *args, **kwargs):
199 |         self._count += 1
200 | 
201 |         if self._count % self._interval == 0:
202 |             speed = self._speed()
203 |             extra_msg = '{count} {speed:.1f}/s'.format(
204 |                 count=self._count, speed=speed)
205 |             msg = ' '.join((msg, extra_msg))
206 |             if isinstance(name, int):
207 |                 name = logging.getLevelName(name).lower()
208 |             getattr(self.logger, name)(msg, *args, **kwargs)
209 | 
210 |     def _speed(self):
211 |         return self._count / (time.time() - self._start_time)
212 | 
213 |     def __enter__(self):
214 |         self._start_time = time.time()
215 |         return self
216 | 
217 |     def __exit__(self, exc_type, exc_value, exc_tb):
218 |         pass
219 | 
220 | 
221 | def used_memory():
222 |     """Human-readable memory usage, in bytes.
223 | 
224 |     Returns:
225 |         str: Memory usage.
226 |     """
227 | 
228 |     try:
229 |         import psutil
230 |     except ImportError:
231 |         return '-'
232 |     p = psutil.Process(os.getpid())
233 |     memory = p.memory_info().rss
234 |     return format_byte(memory)
235 | 
236 | 
237 | # global variables for format_byte
238 | _UNIT_SUFFIXES = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
239 | _LOG_1024 = math.log(1024)
240 | _SUFFIXES_LENGTH = len(_UNIT_SUFFIXES)
241 | 
242 | 
243 | def format_byte(value, precision=2):
244 |     """Format a byte size into human-readable form.
245 | 
246 |     Args:
247 |         value (int): The byte size.
248 |         precision (int, optional): Defaults to 2. Number of decimal digits.
249 | 
250 |     Returns:
251 |         str: The human-readable string.
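
    Example (illustrative):
        >>> format_byte(1536)
        '1.5K'
        >>> format_byte(3 * 1024 ** 2)
        '3.0M'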
252 |     """
253 | 
254 |     factor = float(10 ** precision)
255 |     suffix = min(int(math.log(value) / _LOG_1024), _SUFFIXES_LENGTH - 1)
256 |     num = math.ceil(value / (1024.0 ** suffix) * factor) / factor
257 |     return ''.join((str(num), _UNIT_SUFFIXES[suffix]))
258 | 
259 | 
260 | class MemoryUsageFormatter(logging.Formatter):
261 |     """Formatter supporting the %(memory)s keyword."""
262 | 
263 |     def __init__(self, fmt=None, datefmt=None):
264 |         super(MemoryUsageFormatter, self).__init__(fmt, datefmt)
265 |         self._log_memory = True
266 |         if fmt and '%(memory)s' not in fmt:
267 |             self._log_memory = False
268 | 
269 |     def format(self, record):
270 |         if self._log_memory and 'memory' not in record.__dict__:
271 |             record.__dict__['memory'] = used_memory()
272 |         return super(MemoryUsageFormatter, self).format(record)
273 | 
274 | 
275 | class cached_property(object):
276 |     """Decorator that caches a computed property on the instance."""
277 | 
278 |     def __init__(self, func):
279 |         self.__doc__ = getattr(func, "__doc__")
280 |         self.func = func
281 | 
282 |     def __get__(self, obj, cls):
283 |         if obj is None:
284 |             return self
285 | 
286 |         value = obj.__dict__[self.func.__name__] = self.func(obj)
287 |         return value
288 | 
289 | 
290 | def get_classes(module, base_cls, include_base_cls=True):
291 |     """Get specified classes from a module.
292 | 
293 |     Args:
294 |         module (module): Where to find classes.
295 |         base_cls (type): The base class.
296 |         include_base_cls (bool, optional): Defaults to True.
297 |             Whether to include the base class itself.
298 | 
299 |     Returns:
300 |         list: The specified classes.
301 |     """
302 |     def is_class(c):
303 |         return inspect.isclass(c) \
304 |             and issubclass(c, base_cls) \
305 |             and (include_base_cls or c != base_cls)
306 |     return [c for _, c in inspect.getmembers(module, is_class)]
307 | 
308 | 
309 | def with_metaclass(meta, *bases):
310 |     """Create a base class with a metaclass.
311 | 
312 |     From six.
313 |     """
314 |     # This requires a bit of explanation: the basic idea is to make a dummy
315 |     # metaclass for one level of class instantiation that replaces itself with
316 |     # the actual metaclass.
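    #
    # Usage sketch (illustrative):
    #
    #     class Base(with_metaclass(SomeMeta, object)):
    #         pass
    #
    # Subclassing the returned temporary class triggers metaclass.__new__,
    # which creates the real class with the intended metaclass and the
    # original bases.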
317 |     class metaclass(type):
318 | 
319 |         def __new__(cls, name, this_bases, d):
320 |             return meta(name, bases, d)
321 | 
322 |         @classmethod
323 |         def __prepare__(cls, name, this_bases):
324 |             return meta.__prepare__(name, bases)
325 |     return type.__new__(metaclass, 'temporary_class', (), {})
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cfhamlet/os-urlpattern/9311aff896ad591b2a9123d256f629f5d142dfc6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/urls_example.txt:
--------------------------------------------------------------------------------
1 | http://example.com/01.html
2 | http://example.com/123/test01.html
3 | http://example.com/02.html
4 | http://example.com/456/test02.html
5 | http://example.com/03.html
6 | http://example.com/789/test03.html
--------------------------------------------------------------------------------
/tests/test_cmdline.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import shlex
4 | import subprocess
5 | import sys
6 | 
7 | import pytest
8 | 
9 | from os_urlpattern.cmdline import make, match
10 | 
11 | 
12 | def call(cmdline, env=None, **kwargs):
13 |     if env is None:
14 |         env = os.environ.copy()
15 |     if env.get('COVERAGE', None) is not None:
16 |         env['COVERAGE_PROCESS_START'] = os.path.abspath('.coveragerc')
17 | 
18 |     cmd = 'python -u %s %s' % (os.path.abspath(__file__), cmdline)
19 |     proc = subprocess.Popen(shlex.split(cmd),
20 |                             stdout=subprocess.PIPE,
21 |                             stderr=subprocess.PIPE,
22 |                             cwd=os.getcwd(),
23 |                             env=env,
24 |                             **kwargs)
25 |     stdout, stderr = proc.communicate()
26 |     return stdout, stderr
27 | 
28 | 
29 | def test_make(tmpdir):
30 |     num = 9
31 |     urls = ['http://example.com/abc%02d?id=%02d#abc' %
32 |             (i, i) for i in range(0, num)]
33 |     data = "\n".join(urls)
34 |     f = tmpdir.join('urls.txt')
35 |     f.write(data)
36 |     cmdline = 'make -i %s' % f.strpath
37 |     stdout, _ = call(cmdline)
38 |     assert b'/abc[0-9]{2}' in stdout
39 |     assert urls[0].encode() in stdout
40 | 
41 |     cmdline = 'make -i %s -f pattern' % f.strpath
42 |     stdout, _ = call(cmdline)
43 |     assert b'/abc[0-9]{2}' in stdout
44 |     assert urls[0].encode() not in stdout
45 | 
46 |     cmdline = 'make -i %s -f ete' % f.strpath
47 |     stdout, _ = call(cmdline)
48 |     assert b' abc[0-9]{2}(%d) ' % num in stdout
49 |     assert b' [\\?]id=[0-9]{2}(%d) ' % num in stdout
50 |     assert b' - #abc(%d)' % num in stdout
51 | 
52 | 
53 | def test_make_digest_type_urls(tmpdir):
54 |     urls = ['http://example.com/%s.html' % j for j in
55 |             [hashlib.md5(str(i).encode()).hexdigest() for i in range(0, 9)]]
56 | 
57 |     data = "\n".join(urls)
58 |     f = tmpdir.join('urls.txt')
59 |     f.write(data)
60 |     cmdline = 'make -i %s -f pattern ' % f.strpath
61 |     stdout, _ = call(cmdline)
62 |     assert b'[0-9a-z]{32}[\\.]html' in stdout
63 | 
64 | 
65 | def test_make_noise(tmpdir):
66 |     urls = ['http://example.com/abc%02d?id=%02d#abc' %
67 |             (i, i) for i in range(0, 8)]
68 |     urls.append('http://example.com/abc009?id=09#abc')
69 | 
70 |     data = "\n".join(urls)
71 |     f = tmpdir.join('urls.txt')
72 |     f.write(data)
73 |     cmdline = 'make -i %s -f pattern ' % f.strpath
74 |     stdout, _ = call(cmdline)
75 |     assert b'/abc[0-9]{2}' in stdout
76 |     assert b'/abc009' in stdout
77 | 
78 | 
79 | def test_make_fuzzy(tmpdir):
80 |     urls = [
81 |         'sdjfpewiefh',
82 |         'dfsdksd',
83 |         'dffalldsfisslkfdksd',
84 |         'didif',
85 |         'dif',
86 |     ]
87 |     urls = ['http://example.com/abc/' + i for i in urls]
88 |     data = "\n".join(urls)
89 |     f = tmpdir.join('urls01.txt')
90 |     f.write(data)
91 |     cmdline = 'make -i %s -f pattern ' % f.strpath
92 |     stdout, _ = call(cmdline)
93 |     assert b'/abc/[a-z]+' in stdout
94 | 
95 |     urls = [i + '.html' for i in urls]
96 |     data = "\n".join(urls)
97 |     f = tmpdir.join('urls02.txt')
98 |     f.write(data)
99 |     cmdline = 'make -i %s -f pattern ' % f.strpath
100 |     stdout, _ = call(cmdline)
101 |     assert b'/abc/[a-z]+[\\.]html' in stdout
102 | 
103 | 
104 | def test_match(tmpdir):
105 |     pattern = b'/abc[0-9]{2}'
106 |     fp = tmpdir.join('patterns.txt')
107 |     fp.write(pattern)
108 | 
109 |     urls = ['http://example.com/abc%02d' % i for i in range(1, 10)]
110 |     data = "\n".join(urls)
111 |     fu = tmpdir.join('urls.txt')
112 |     fu.write(data)
113 | 
114 |     cmdline = 'match -i %s -p %s' % (fu.strpath, fp.strpath)
115 |     stdout, _ = call(cmdline)
116 | 
117 |     assert pattern in stdout
118 | 
119 | 
120 | if __name__ == "__main__":
121 |     sys.path.insert(0, os.getcwd())
122 |     if os.getenv('COVERAGE_PROCESS_START'):
123 |         import coverage
124 |         coverage.process_startup()
125 |     cmds = {'make': make, 'match': match}
126 |     cmds[sys.argv.pop(1)]()
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from os_urlpattern.config import get_default_config
4 | 
5 | 
6 | def test_get_default_config():
7 |     config = get_default_config()
8 |     assert config.getint('make', 'min_cluster_num') == 3
--------------------------------------------------------------------------------
/tests/test_formatter.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | 
3 | import json
4 | 
5 | import pytest
6 | 
7 | from os_urlpattern.formatter import pformat
8 | from os_urlpattern.pattern_maker import PatternMaker
9 | 
10 | 
11 | @pytest.fixture(scope='function')
12 | def p_maker():
13 |     p_maker = PatternMaker()
14 |     for url in ['http://www.example.com/abc/%02d.html' % i for i in range(0, 10)]:
15 |         p_maker.load(url, meta=url)
16 | 
17 |     return p_maker
18 | 
19 | 
20 | def test_inline(p_maker):
21 |     for url_meta, clustered in p_maker.make():
22 |         for o in pformat('inline', url_meta, clustered):
23 |             assert '/abc/[0-9]{2}[\\.]html\thttp' in o
24 | 
25 | 
26 | def test_json(p_maker):
27 |     for url_meta, clustered in p_maker.make():
28 |         for o in pformat('json', url_meta, clustered):
29 |             d = json.loads(o)
30 |             assert d['ptn'] == '/abc/[0-9]{2}[\\.]html'
31 |             assert d['cnt'] == 10
--------------------------------------------------------------------------------
/tests/test_parse_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from os_urlpattern.exceptions import (InvalidCharException,
4 |                                       InvalidPatternException,
5 |                                       IrregularURLException)
6 | from os_urlpattern.parse_utils import (PieceParser, URLMeta, analyze_url,
7 |                                        analyze_url_pattern_string, digest,
8 |                                        filter_useless, fuzzy_digest, normalize,
9 |                                        pack, parse_pattern_string,
10 |                                        parse_pattern_unit_string,
11 |                                        parse_query_string, parse_url)
12 | from os_urlpattern.pattern import Pattern
13 | 
14 | 
15 | def test_normalize_str():
16 |     data = [
17 |         ('a', 'a'),
18 |         ('ab=', 'ab[=]'),
19 |         ('ab1=a', 'ab1[=]a'),
20 |         ('ab==a', 'ab[=]{2}a'),
21 |         ('ab=={a', 'ab[=]{2}[\\{]a'),
22 |         ('=', '[=]'),
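        # The remaining pairs exercise repeated and escaped characters: a run
        # of the same special character collapses into a single bracketed
        # rule with a {count} quantifier.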
23 |         ('==', '[=]{2}'),
24 |         ('==+a', '[=]{2}[\\+]a'),
25 |         ('\\', '[\\\\]'),
26 |     ]
27 |     for i, j in data:
28 |         assert normalize(i) == j
29 | 
30 | 
31 | def test_parse_url():
32 |     data = [
33 |         ('http://www.test.com/', ('',), [('depth', 1)]),
34 |         ('http://www.test.com/?', ('', ''), [('depth', 2)]),
35 |         ('http://www.test.com/abc/def?k=v#xxx', ('abc', 'def', 'v', 'xxx'),
36 |          [('depth', 4), ('has_fragment', True)]),
37 |     ]
38 |     for url, p, m in data:
39 |         url_meta, parts = analyze_url(url)
40 |         assert parts == p
41 |         for k, v in m:
42 |             assert getattr(url_meta, k) == v
43 |     with pytest.raises(IrregularURLException):
44 |         analyze_url('http://www.g.com')
45 | 
46 | 
47 | def test_parse_query_string():
48 |     data = [
49 |         ('a', ('',), ('a',)),
50 |         ('a=', ('a=',), ('',)),
51 |         ('a&b', ('a', 'b'), ('', '')),
52 |         ('a=1', ('a=',), ('1',)),
53 |         ('a=1&b=2', ('a=', 'b='), ('1', '2')),
54 |     ]
55 |     for q, k, v in data:
56 |         assert parse_query_string(q) == (k, v)
57 | 
58 |     data = ['a&', 'a&&b', 'a=1&']
59 | 
60 |     for i in data:
61 |         with pytest.raises(IrregularURLException):
62 |             parse_query_string(i)
63 | 
64 | 
65 | def test_analyze_url():
66 |     data = [
67 |         ['http://www.g.com/test', ('path', '/test'),
68 |          ('query', None), ('fragment', None)],
69 |         ['http://www.g.com/test?',
70 |          ('query', ''), ('fragment', None)],
71 |         ['http://www.g.com/test?#',
72 |          ('query', ''), ('fragment', '')],
73 |         ['http://www.g.com/test?#abc',
74 |          ('query', ''), ('fragment', 'abc')],
75 |         ['http://www.g.com/test#abc',
76 |          ('query', None), ('fragment', 'abc')],
77 |         ['http://www.g.com/test?a#',
78 |          ('query', 'a'), ('fragment', '')],
79 |         ['http://www.g.com/test?a##',
80 |          ('query', 'a'), ('fragment', '#')],
81 |         ['http://www.g.com/test#?',
82 |          ('query', None), ('fragment', '?')],
83 |     ]
84 |     for check in data:
85 |         url = check[0]
86 |         r = parse_url(url)
87 |         for attr, expect in check[1:]:
88 |             assert getattr(r, attr) == expect
89 | 
90 | 
91 | def test_filter_useless_part():
92 |     data = [
93 |         ('/', ['']),
94 |         ('//', ['']),
95 |         ('', ['']),
96 |         ('/a/b', ['a', 'b']),
97 |         ('/a/b/', ['a', 'b', '']),
98 |         ('/a/b//', ['a', 'b', '']),
99 |         ('/a/b///c', ['a', 'b', 'c']),
100 |         ('a/b///c', ['a', 'b', 'c']),
101 |     ]
102 |     for s, expect in data:
103 |         assert filter_useless(s.split('/')) == expect
104 | 
105 | 
106 | def test_piece_parser():
107 |     parser = PieceParser()
108 |     data = [
109 |         ('abc', ('abc', ), ('a-z', )),
110 |         ('abc.exe', ('abc', '[\\.]', 'exe'), ('a-z', '\\.', 'a-z')),
111 |         ('%' * 10, ('[%]{10}', ), ('%', )),
112 |         ('abc1D..exe', ('abc', '1', 'D',
113 |                         '[\\.]{2}', 'exe'), ('a-z', '0-9', 'A-Z', '\\.', 'a-z')),
114 |         ('@<>..', ('[@]', '[<]', '[>]', '[\\.]{2}'), ('@', '<', '>', '\\.')),
115 |     ]
116 |     for piece, expected_pieces, expected_rules in data:
117 |         parsed = parser.parse(piece)
118 |         assert parsed.rules == expected_rules
119 |         assert parsed.pieces == expected_pieces
120 |         assert parsed.piece_length == len(piece)
121 |     with pytest.raises(InvalidCharException):
122 |         parser.parse(' a')
123 | 
124 | 
125 | def test_unpack_pack():
126 |     data = [
127 |         ('http://www.g.com/', '/'),
128 |         ('http://www.g.com/abc', '/abc'),
129 |         ('http://www.g.com/abc?a=1#c', '/abc[\\?]a=1#c'),
130 |         ('http://www.g.com/abc???a=1#c', '/abc[\\?][\\?]{2}a=1#c'),
131 |         ('http://www.g.com/abc?=1#c', '/abc[\\?]=1#c'),
132 |         ('http://www.g.com/abc?a=1#', '/abc[\\?]a=1#'),
133 |         ('http://www.g.com/abc?a=1&b=2#', '/abc[\\?]a=1&b=2#'),
134 |     ]
135 |     for url, expected in data:
136 |         assert pack(*analyze_url(url)) == expected
137 | 
138 | 
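# URLMeta's depth is the path depth plus one per query key, plus one more
# when the URL has a fragment (see the assertions in test_url_meta below).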
139 | def test_url_meta():
140 |     url_meta1 = URLMeta(1, ['key1', 'key2'], False)
141 |     assert url_meta1.depth == 3
142 |     url_meta2 = URLMeta(1, ['key1', 'key2'], True)
143 |     assert url_meta2.depth == 4
144 |     url_meta3 = URLMeta(1, ['key1', 'key2'], False)
145 |     assert url_meta1 == url_meta3
146 | 
147 | def test_parse_url_pattern():
148 |     data = [
149 |         'http://www.g.com/',
150 |         'http://www.g.com/abc',
151 |         'http://www.g.com/abc?a=1#c',
152 |         'http://www.g.com/abc???a=1#c',
153 |         'http://www.g.com/abc?=1#c',
154 |         'http://www.g.com/abc?a=1#',
155 |         'http://www.g.com/abc?a=1&b=2#',
156 |     ]
157 |     for url in data:
158 |         meta1, parts1 = analyze_url(url)
159 |         pattern_string = pack(meta1, parts1)
160 |         meta2, parts2 = analyze_url_pattern_string(pattern_string)
161 |         assert meta1 == meta2
162 |         assert len(parts1) == len(parts2)
163 | 
164 | 
165 | def test_parse_pattern_string():
166 |     data = [
167 |         ('abc', 1),
168 |         ('[0-9]{2}abc', 2),
169 |         ('abc[0-9]+', 2),
170 |         ('abc[\\[\\?][a-z]', 3),
171 |         ('', 1),
172 |         ('abcAbc', 3),
173 |     ]
174 |     for p_str, num in data:
175 |         ps = parse_pattern_string(p_str)
176 |         assert ''.join([str(u) for u in ps]) == p_str
177 |         assert len(ps) == num
178 | 
179 |     invalid_data = [
180 |         '[a-z',
181 |         'a-z]',
182 |         '[a-z]{-}',
183 |         '[a-z]{-2}',
184 |         '?',
185 |         '[a-z]++',
186 |     ]
187 | 
188 |     for data in invalid_data:
189 |         with pytest.raises(InvalidPatternException):
190 |             parse_pattern_string(data)
191 | 
192 | 
193 | def test_parse_pattern_unit_string():
194 |     data = [
195 |         ('[a-z]', set(['a-z']), 1),
196 |         ('[a-z]+', set(['a-z']), -1),
197 |         ('', set(['']), 1),
198 |         ('[%\\+]{12}', set(['%', '\\+']), 12),
199 |     ]
200 |     for p_str, e_rules, e_num in data:
201 |         rules, num = parse_pattern_unit_string(p_str)
202 |         assert num == e_num
203 |         assert rules == e_rules
204 | 
205 |     invalid_data = [
206 |         '[z-a]',
207 |         '[z-a]{abc}',
208 |         '[z-a]{-1}',
209 |         '[\\._]',
210 |         '[0-9a-z]',
211 |     ]
212 |     for data in invalid_data:
213 |         with pytest.raises(InvalidPatternException):
214 |             parse_pattern_unit_string(data)
215 | 
216 | 
217 | def test_parse_url_pattern_string():
218 |     patterns = [
219 |         ('/AaBb/123456.shtml', '/[A-Za-z]+/[0-9]{6}[\\.]shtml'),
220 |         ('/abc/123/index.html', '/abc/123/index[\\.]html'),
221 |         ('/12345678/index.asp?id=123',
222 |          '/[0-9]{8}/[a-z]+[\\.]asp[\\?]id=[0-9]+'),
223 |         ('/newsShow.asp?dataID=1', '/newsShow[\\.]asp[\\?]dataID=[0-9]+'),
224 |     ]
225 | 
226 |     for url, pattern in patterns:
227 |         url = 'http://example.com' + url
228 |         um1, pieces = analyze_url(url)
229 |         um2, pattern_strings = analyze_url_pattern_string(pattern)
230 |         assert um1 == um2
231 |         for p, s in zip(pattern_strings, pieces):
232 |             assert Pattern(p).match(s)
233 | 
234 | 
235 | def test_digest():
236 |     parser = PieceParser()
237 |     data = [
238 |         ('/abc/', '/abcdef/'),
239 |         ('/abc/index.html?k1=v1&k2=v2', '/abc/html.htm?k1=c01&k2=2m'),
240 |         ('/abc/index.html?k1=v1#abc', '/abc/html.htm?k1=c01#def'),
241 |     ]
242 | 
243 |     for urls in data:
244 |         urls = ['http://example.com' + u for u in urls]
245 |         digests = set()
246 |         for url in urls:
247 |             url_meta, pieces = analyze_url(url)
248 |             parsed_pieces = [parser.parse(piece) for piece in pieces]
249 |             sid = digest(url_meta, [p.fuzzy_rule for p in parsed_pieces])
250 |             assert fuzzy_digest(url_meta, parsed_pieces) == sid
251 |             digests.add(sid)
252 |         assert len(digests) == 1
253 | 
--------------------------------------------------------------------------------
/tests/test_parsed_piece_view.py:
--------------------------------------------------------------------------------
1 | from os_urlpattern.parsed_piece_view import (FuzzyView, LastDotSplitFuzzyView,
2 |                                              LengthView, MixedView, MultiView,
3 |                                              PieceView, view_cls_from_pattern)
4 | from os_urlpattern.pattern import Pattern
5 | 
6 | 
7 | def test_view_cls_from_pattern():
8 |     data = [
9 |         ('abc', PieceView, False),
10 |         ('[a-z]{2}', LengthView, False),
11 |         ('[a-z]+', FuzzyView, False),
12 |         ('abc[A-Z]{2}', MultiView, False),
13 |         ('[A-Za-z]{3}123', MixedView, False),
14 |         ('[A-Za-z]+[\\.]html', LastDotSplitFuzzyView, True),
15 |         ('id[_][0-9A-Za-z]+[\\.][a-z]+', MixedView, True),
16 |     ]
17 | 
18 |     for p_str, view_cls, is_last_path in data:
19 |         assert view_cls_from_pattern(Pattern(p_str), is_last_path) == view_cls
--------------------------------------------------------------------------------
/tests/test_pattern.py:
--------------------------------------------------------------------------------
1 | from os_urlpattern.parse_utils import specify_rule, wildcard_rule
2 | from os_urlpattern.pattern import Pattern, PatternUnit
3 | 
4 | 
5 | def test_equal():
6 |     p1 = Pattern('[a-z]+')
7 |     p2 = Pattern('[a-z]+')
8 |     p3 = Pattern('[a-z]')
9 |     assert p1 == p2
10 |     assert p1 != p3
11 | 
12 | 
13 | def test_fuzzy_rule():
14 |     data = [
15 |         ('123', '0-9'),
16 |         ('abc', 'a-z'),
17 |         ('a1b2c3', '0-9a-z'),
18 |         ('a1b2c3D4', '0-9A-Za-z'),
19 |         ('a1[\\-]b2[\\-]c3[_]D4', '0-9A-Z\\-_a-z'),
20 |         ('[a-z]+', 'a-z'),
21 |     ]
22 | 
23 |     for s, r in data:
24 |         p = Pattern(s)
25 |         assert p.fuzzy_rule == r
26 |         pw = Pattern(wildcard_rule(p.fuzzy_rule))
27 |         assert pw.fuzzy_rule == r
28 |         pn = Pattern(specify_rule(p.fuzzy_rule, 10))
29 |         assert pn.fuzzy_rule == r
30 | 
31 | 
32 | def test_pattern_unit():
33 |     data = [
34 |         ('[a-z]+', 'a-z', -1, False),
35 |         ('[a-z]{3}', 'a-z', 3, False),
36 |         ('abc', 'a-z', 3, True),
37 |         ('[0-9]', '0-9', 1, False),
38 |         ('[\\.]{2}', '\\.', 2, True),
39 |         ('[\\.]', '\\.', 1, True),
40 |         ('[\\._]{2}', '\\._', 2, False),
41 |     ]
42 | 
43 |     for s, fuzzy_rule, num, literal in data:
44 |         pu = PatternUnit(s)
45 |         assert pu.fuzzy_rule == fuzzy_rule
46 |         assert pu.num == num
47 |         assert pu.is_literal() == literal
--------------------------------------------------------------------------------
/tests/test_pattern_maker.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from os_urlpattern.config import get_default_config
4 | from os_urlpattern.parse_utils import pack
5 | from os_urlpattern.pattern_maker import PatternMaker
6 | from os_urlpattern.utils import dump_tree
7 | 
8 | 
9 | @pytest.fixture(scope='function')
10 | def config():
11 |     return get_default_config()
12 | 
13 | 
14 | @pytest.fixture(scope='function')
15 | def pattern_maker(config):
16 |     return PatternMaker(config)
17 | 
18 | 
19 | def test_load(config):
20 |     pm = PatternMaker(config)
21 |     urls = ['http://example.com' + u for u in ['/a', '/a/b', '/a/b/c']]
22 |     for url in urls:
23 |         pm.load(url, meta=url)
24 |     assert len(list(pm.makers)) == len(urls)
25 |     for _, clustered in pm.make():
26 |         for nodes in dump_tree(clustered):
27 |             assert len(nodes[-1].meta) == 1
28 | 
29 |     config.set('make', 'drop_url', 'true')
30 |     pm = PatternMaker(config)
31 |     urls = ['http://example.com' + u for u in ['/a', '/b', '/c']]
32 |     for url in urls:
33 |         pm.load(url)
34 |     assert len(list(pm.makers)) == 1
35 |     for _, clustered in pm.make():
36 |         for nodes in dump_tree(clustered):
37 |             assert nodes[-1].meta is None
38 | 
39 | 
40 | def cluster_and_test(urls, pattern_string):
41 |     pm = PatternMaker(get_default_config())
42 |     for url in urls:
43 |         pm.load(url)
44 | 
45 |     for url_meta, clustered in pm.make(combine=True):
46 |         for nodes in dump_tree(clustered):
47 |             assert pack(
48 |                 url_meta, [n.value for n in nodes[1:]]) == pattern_string
49 | 
50 | 
51 | def test_make():
52 |     urls = ['http://example.com' + u for u in ['/a01', '/b02', '/c03']]
53 |     cluster_and_test(urls, '/[a-z][0-9]{2}')
54 |     urls = ['http://example.com' + u for u in ['/3h4hd9s9w9d9',
55 |                                                '/9s2m1m3j2d10', '/i2i2g4g23j0m']]
56 |     cluster_and_test(urls, '/[0-9a-z]{12}')
57 |     urls = [u + '.html' for u in urls]
58 |     cluster_and_test(urls, '/[0-9a-z]{12}[\\.]html')
59 |     urls = [u + '?id=%02d' % i for i, u in enumerate(urls, 1)]
60 |     cluster_and_test(urls, '/[0-9a-z]{12}[\\.]html[\\?]id=[0-9]{2}')
61 | 
62 |     urls = ['http://example.com' + u for u in ['/3h4hd9s9w9ddsadf9',
63 |                                                '/9s2m1m3j2d10', '/i2i2g4g23j0dsdm']]
64 |     cluster_and_test(urls, '/[0-9a-z]+')
--------------------------------------------------------------------------------
/tests/test_pattern_matcher.py:
--------------------------------------------------------------------------------
1 | from os_urlpattern.pattern_matcher import PatternMatcher
2 | 
3 | 
4 | def match(patterns, urls, num, most_match=None):
5 |     pm = PatternMatcher()
6 |     for pattern in patterns:
7 |         pm.load(pattern)
8 |     for url in urls:
9 |         matched = pm.match(url)
10 |         assert len(matched) > num
11 |         if most_match:
12 |             matched = sorted(matched)
13 |             assert matched[-1].meta == most_match
14 | 
15 | 
16 | def test_match():
17 |     urls = ['http://example.com/abc%02d' % i for i in range(1, 10)]
18 |     patterns = [
19 |         '/abc[0-9]{2}',
20 |         '/abc[0-9]+',
21 |         '/[a-z]+[0-9]{2}',
22 |         '/[a-z]{3}[0-9]{2}',
23 |         '/[0-9a-z]+',
24 |         '/[0-9a-z]{5}',
25 |     ]
26 |     for pattern in patterns:
27 |         match([pattern], urls, 0)
28 |     match(patterns, urls, 3, '/abc[0-9]{2}')
--------------------------------------------------------------------------------
/tests/test_piece_pattern_node.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | 
3 | from os_urlpattern.parse_utils import (EMPTY_PARSED_PIECE, PieceParser,
4 |                                        analyze_url)
5 | from os_urlpattern.piece_pattern_node import (PiecePatternNode,
6 |                                               build_from_parsed_pieces,
7 |                                               build_from_piece_pattern_nodes)
8 | from os_urlpattern.utils import dump_tree, pick
9 | 
10 | 
11 | def test_count():
12 |     num = 100
13 |     urls = ['http://test.com/abc/%d' % i for i in range(num)]
14 |     parser = PieceParser()
15 |     root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
16 |     for url in urls:
17 |         _, pieces = analyze_url(url)
18 |         parsed_pieces = [parser.parse(piece) for piece in pieces]
19 |         build_from_parsed_pieces(root, parsed_pieces)
20 |     assert root.count == num
21 |     for url in urls:
22 |         _, pieces = analyze_url(url)
23 |         parsed_pieces = [parser.parse(piece) for piece in pieces]
24 |         build_from_parsed_pieces(root, parsed_pieces)
25 |     assert root.count == num
26 |     root01 = PiecePatternNode((EMPTY_PARSED_PIECE, None))
27 |     for nodes in dump_tree(root):
28 |         build_from_piece_pattern_nodes(root01, nodes[1:])
29 |     assert root01.count == num
30 | 
31 |     nodes = pick(dump_tree(root))
32 |     assert nodes[-1].parrent.children_num == num
33 |     assert str(nodes[-1].parrent.pattern) == "abc"
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (https://tox.readthedocs.io/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 | 
6 | [tox]
7 | envlist = py{27,36,py,py3}, coverage-report
8 | 
9 | [base]
10 | deps =
11 |     pytest > 2.10
12 |     coverage
13 |     pytest-env
14 | 
15 | [testenv]
16 | commands =
17 |     coverage run -m pytest {posargs}
18 | 
19 | deps =
20 |     {[base]deps}
21 |     six
22 |     ete3
23 | 
24 | [testenv:coverage-report]
25 | deps = coverage
26 | skip_install = true
27 | commands =
28 |     coverage combine
29 |     coverage report
30 | 
31 | [testenv:codecov]
32 | passenv = CI TRAVIS TRAVIS_* APPVEYOR APPVEYOR_*
33 | deps = codecov
34 | skip_install = true
35 | commands =
36 |     coverage combine
37 |     coverage report
38 |     codecov
39 | 
40 | 
--------------------------------------------------------------------------------
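
A minimal end-to-end sketch of the public API exercised by the test suite above
(illustrative; it mirrors tests/test_formatter.py and tests/test_pattern_matcher.py):

    import json

    from os_urlpattern.formatter import pformat
    from os_urlpattern.pattern_maker import PatternMaker
    from os_urlpattern.pattern_matcher import PatternMatcher

    # Cluster a batch of URLs into patterns.
    maker = PatternMaker()
    for url in ['http://www.example.com/abc/%02d.html' % i for i in range(0, 10)]:
        maker.load(url, meta=url)
    patterns = []
    for url_meta, clustered in maker.make():
        for line in pformat('json', url_meta, clustered):
            patterns.append(json.loads(line)['ptn'])  # e.g. '/abc/[0-9]{2}[\\.]html'

    # Match new URLs against the loaded patterns.
    matcher = PatternMatcher()
    for pattern in patterns:
        matcher.load(pattern)
    matched = matcher.match('http://www.example.com/abc/42.html')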