├── .coveragerc
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
    └── os_urlpattern
    │   ├── VERSION
    │   ├── __init__.py
    │   ├── cmdline.py
    │   ├── compat.py
    │   ├── config
    │       ├── __init__.py
    │       └── default_config.cfg
    │   ├── definition.py
    │   ├── exceptions.py
    │   ├── formatter.py
    │   ├── parse_utils.py
    │   ├── parsed_piece_view.py
    │   ├── parser.py
    │   ├── pattern.py
    │   ├── pattern_cluster.py
    │   ├── pattern_maker.py
    │   ├── pattern_matcher.py
    │   ├── piece_pattern_node.py
    │   └── utils.py
├── tests
    ├── __init__.py
    ├── data
    │   └── urls_example.txt
    ├── test_cmdline.py
    ├── test_config.py
    ├── test_formatter.py
    ├── test_parse_utils.py
    ├── test_parsed_piece_view.py
    ├── test_pattern.py
    ├── test_pattern_maker.py
    ├── test_pattern_matcher.py
    └── test_piece_pattern_node.py
└── tox.ini


/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | parallel = true
 3 | branch = true
 4 | source = os_urlpattern
 5 | 
 6 | [paths]
 7 | source =
 8 |     src/os_urlpattern
 9 |     .tox/*/lib/python*/site-packages/os_urlpattern
10 |     .tox/*/site-packages/os_urlpattern
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | # vscode
104 | .vscode/
105 | 
106 | # pytest
107 | .pytest_cache/
108 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | sudo: false
 3 | matrix:
 4 |   include:
 5 |     - python: 2.7
 6 |       env: TOXENV=py27,codecov
 7 |     - python: 3.6
 8 |       env: TOXENV=py36,codecov
 9 |     - python: pypy
10 |       env: TOXENV=pypy
11 |     - python: pypy3
12 |       env: TOXENV=pypy3
13 | install:
14 | - pip install -U pip tox
15 | script:
16 | - tox
17 | deploy:
18 |   provider: pypi
19 |   user: cfhamlet
20 |   password:
21 |     secure: eGq3kLUT6D3grZ2ZlCaJ5e/9Ma3HkOLZQDDcMsWUs/zUqpngI/9ibplgbOcxpRxKCgFKn5GFDV9ZsKk00fEfYWpe4WZW2vG6mu3k63oB4FMkUQ4GGoQKcXdR27aNtNhvTzU3VPDgyEpNI5QJmTLJp3Y3fbzcjL3a87kschf6B46MP4Nu3NqWuXZDYIZN6GY8HwD6J3Ii15nl4rCS6phdYdKckyVX8coNQVWkljx+ZtfGMkClsui9BynKBNVwufm3/F1zwWI1UXCrU3v4FxqiCmK2CYSX7tdFcGHaVTf0NqscbPxZgPvM+1tUBbW1M5N5GlUf5f7CxwtFWEqFTlz926gzYrHUaewmjILWDm6OxWAKjuks8lgywQq2twYpd8UVlRywvjfaobGpptoBevuxgr/uzipeckWR0X1SiqUaFnKzuLOnVeZ9I1ixA5zcIR74xnjEOvBnMpeawzZsIidoQcn4PRzbyaR4uDxnYyWB5yW/Q9d1UbAYOe0QyQY6NnZzvkRovkge3H/Wlk+K2P0qSUmmznWSDekdBcm4yr3bZsujgWOKS3c9L/OHH+P3YVAC1x0304xGveWt0cU/sfTPpEi99N+0QOxPQX3CnutFkXZIgR4nsGWnZYnMngrr8eHIfav+Ms20UTYwjsn79vfXc10kkesQtW863GdFXBYfw3c=
22 |   on:
23 |     tags: true
24 |     condition: ${TRAVIS_PYTHON_VERSION} == 2.7
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Ozzy
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | include MANIFEST.in
4 | graft src
5 | graft tests
6 | global-exclude __pycache__
7 | global-exclude *.py[co]
8 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | =============
  2 | os-urlpattern
  3 | =============
  4 | 
  5 | .. image:: https://travis-ci.org/cfhamlet/os-urlpattern.svg?branch=master
  6 |    :target: https://travis-ci.org/cfhamlet/os-urlpattern
  7 | 
  8 | .. image:: https://codecov.io/gh/cfhamlet/os-urlpattern/branch/master/graph/badge.svg
  9 |    :target: https://codecov.io/gh/cfhamlet/os-urlpattern
 10 | 
 11 | .. image:: https://img.shields.io/pypi/pyversions/os-urlpattern.svg
 12 |    :alt: PyPI - Python Version
 13 |    :target: https://pypi.python.org/pypi/os-urlpattern
 14 |   
 15 | .. image:: https://img.shields.io/pypi/v/os-urlpattern.svg
 16 |    :alt: PyPI
 17 |    :target: https://pypi.python.org/pypi/os-urlpattern
 18 | 
 19 | 
 20 | This package is used for unsupervised URLs clustering. Furthermore, it generate URL patterns(RegEx) 
 21 | from clusters for matching purpose. It is a pure python package tested under python2.7 python3.6, 
 22 | `pypy <http://pypy.org/>`_ can also be used for performance(4x-8x). Command line tools are provided 
 23 | for standalone clustering and matching, APIs are also convenient. Several extra packages can be 
 24 | installed for additional features. Under CPython 1cpu, 100 thousand URLs clustering cost almost 1min 
 25 | and 200M memory. Built-in matching strategy is efficient enough in most use cases(4k/s, depend on 
 26 | patterns complexity).
 27 | 
 28 | .. code:: console
 29 | 
 30 |   $ pip install -U os-urlpattern
 31 |   $ wget -qO- 'https://git.io/f4QlP' | pattern-make
 32 |   /[0-9]{2}[\.]html
 33 |         http://example.com/01.html
 34 |         http://example.com/02.html
 35 |         http://example.com/03.html
 36 |   /[0-9]{3}/test[0-9]{2}[\.]html
 37 |         http://example.com/123/test01.html
 38 |         http://example.com/456/test02.html
 39 |         http://example.com/789/test03.html
 40 | 
 41 | 
 42 | ==============
 43 | Aknowledgement
 44 | ==============
 45 | 
 46 | Similar URLs
 47 | =============
 48 |   
 49 | * URLs with the same **URL structure**.
 50 | 
 51 | * Components of the parsed URLs at the same position are in the same **character space**.
 52 | 
 53 | * Different types of charactors may be in the same order in most cases.
 54 | 
 55 | 
 56 | URL structure
 57 | ==============
 58 | 
 59 | Typically, URL can be parsed into 6 components:
 60 | 
 61 | ``<scheme>://<netloc>/<path>;<params>?<query>#<fragment>``
 62 | 
 63 | Because different sites may have similar URLs structure and <params> is rare, so <schema> 
 64 | <netloc> and <params> are ignored, <path> <query> <fragment> are used to define URL structure.
 65 | 
 66 | If the URLs have the same path levels, same query keys(also keys order) and with the same 
 67 | fragment existence, their URL structure should be the same. 
 68 | 
 69 | ::
 70 |     
 71 |   http://example.com/p1/p2?k1=v1&k2=v2#pos
 72 | 
 73 |   URL structure:
 74 |   path levels: 2
 75 |   query keys: k1, k2
 76 |   have fragment: True
 77 | 
 78 | Character space
 79 | ===============
 80 | 
 81 | Consider `RFC 3986 (Section 2: Characters) <https://tools.ietf.org/html/rfc3986#section-2>`_,
 82 | URL with the following characters would be legal:
 83 | 
 84 | ``ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%<>\"{}^|``
 85 | 
 86 | There are three major character space: lower-case letters(a-z), upper-case letters(A-Z), 
 87 | number letters(0-9). Other symbols are in their own character space.
 88 |   
 89 | ::
 90 | 
 91 |   HeLlOwoRd666!
 92 | 
 93 |   character space: a-z A-Z 0-9 !
 94 |       
 95 | Order consideration
 96 | =====================
 97 | 
 98 | Split a string by character, consecutive character space can be joined. In most cases, order is a 
 99 | distinguished feature.
100 | 
101 | ::
102 | 
103 |   HELLOword666!
104 | 
105 |   split into: HELLO word 666 !
106 | 
107 |   character space order: A-Z a-z 0-9 !
108 | 
109 | 
110 | Mix
111 | =====================
112 | Complex consecutive major character space can be mixed, order is less important.
113 | 
114 | ::
115 | 
116 |   HellWorld666!
117 | 
118 |   split into: H ell W orld 666 !
119 | 
120 |   major join: HellWorld666 !
121 | 
122 |   character space order: A-Za-z0-9 !
123 | 
124 | Because of URL quote, '%' can be mixed with major character space.
125 | 
126 | ::
127 | 
128 |   %E4%BD%A0%E5%A5%BD!
129 | 
130 |   split into: % E 4 % BD % A 0 % E 5 % A 5 % BD !
131 | 
132 |   major join: %E4%BD%A0%E5%A5%BD !
133 | 
134 |   character space order: A-Z0-9% !
135 | 
136 | 
137 | URL pattern
138 | ============
139 | 
140 | URL pattern is used to express each cluster. It is normal regex string. Each URL in 
141 | the same cluster can be matched with the pattern.
142 | 
143 | ::
144 | 
145 |   pattern examples:
146 | 
147 |   /news/[0-9]{8}/[a-z]+[\\.]html
148 |   /newsShow[\\.]asp[\\?]dataID=[0-9]+
149 |   /thread[\\-][0-9]+[\\-][0-9][\\-]1[\\.]html
150 | 
151 | The built-in matching strategy is strict, it can't tolerate incomplet matching.
152 |   
153 | ::
154 | 
155 |   letter: helloword
156 | 
157 |   pattern01: [a-z0-9]+  # not match, because no number in the letter
158 |   pattern02: [a-z]+ # match
159 | 
160 | 
161 | ========
162 | Install
163 | ========
164 | 
165 | Install with pip
166 | 
167 | ``$ pip install os-urlpattern``
168 | 
169 | Install extra packages
170 | 
171 | .. list-table::
172 |   :header-rows: 1
173 |     
174 |   * - subpackage 
175 |     - install command
176 |     - enables
177 |   * - memory
178 |     - ``pip install os-urlpattern[memroy]``
179 |     - Show memory useage
180 |   * - ete-tree
181 |     - ``pip install os-urlpattern[ete-tree]``
182 |     - Enable `ete <https://github.com/etetoolkit/ete>`_ pattern tree formatter
183 | 
184 | ========
185 | Usage
186 | ========
187 | 
188 | Command line
189 | =============
190 | 
191 | * **pattern-make**
192 |     
193 |   Load urls, cluster and dump patterns.
194 | 
195 |   .. code:: console
196 |     
197 |     $ pattern-make -h
198 |     usage: pattern-make [-h] [-v] [-i INPUTS [INPUTS ...]]
199 |                         [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] [-c CONFIG]
200 |                         [-f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}]
201 | 
202 |     optional arguments:
203 |       -h, --help            show this help message and exit
204 |       -v, --version         show program's version number and exit
205 |       -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]
206 |                             input files to be processed (default: stdin)
207 |       -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}
208 |                             log level (default: NOTSET)
209 |       -c CONFIG, --config CONFIG
210 |                             config file
211 |       -f {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}, --formatter {PATTERN,CLUSTER,JSON,ETE,INLINE,NULL}
212 |                             output formatter (default: CLUSTER)
213 |   
214 |   Dump clustered URLs with patterns:
215 | 
216 |   .. code:: console
217 |   
218 |     $ cat urls.txt | pattern-make -L debug > clustered.txt
219 | 
220 |   Only generate URL patterns:
221 | 
222 |   .. code:: console
223 |   
224 |     $ cat urls.txt | pattern-make -L debug -F pattern > patterns.txt
225 |   
226 |   Generate pattern tree from URLs(`ete <https://github.com/etetoolkit/ete>`_ installed):
227 | 
228 |   .. code:: console
229 |     
230 |     $ cat urls.txt | pattern-make -L debug -F ete
231 | 
232 | * **pattern-match**
233 | 
234 |   Load patterns, dump URLs matched results.
235 | 
236 |   .. code:: console
237 |     
238 |     $ pattern-match -h
239 |     usage: pattern-match [-h] [-v] [-i INPUTS [INPUTS ...]]
240 |                          [-l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}] -p
241 |                          PATTERN_FILES [PATTERN_FILES ...] [-a]
242 | 
243 |     optional arguments:
244 |       -h, --help            show this help message and exit
245 |       -v, --version         show program's version number and exit
246 |       -i INPUTS [INPUTS ...], --inputs INPUTS [INPUTS ...]
247 |                             input files to be processed (default: stdin)
248 |       -l {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}, --loglevel {NOTSET,DEBUG,INFO,WARN,ERROR,FATAL}
249 |                             log level (default: NOTSET)
250 |       -p PATTERN_FILES [PATTERN_FILES ...], --pattern-files PATTERN_FILES [PATTERN_FILES ...]
251 |                             pattern files to be loaded
252 |       -a, --all-matched     all matched patterns
253 | 
254 | 
255 |   Match URLs:
256 | 
257 |   .. code:: console
258 |   
259 |     $ cat urls.txt | pattern-match -L debug -p patterns.txt
260 | 
261 | APIs
262 | =====
263 | 
264 | * Cluster and generate URL patterns:
265 | 
266 |   .. code:: python 
267 | 
268 |     from os_urlpattern.formatter import pformat
269 |     from os_urlpattern.pattern_maker import PatternMaker
270 | 
271 |     pattern_maker = PatternMaker()
272 | 
273 |     # load URLs(unicode)
274 |     for url in urls:
275 |         pattern_maker.load(url)
276 | 
277 |     # cluster and print pattern
278 |     for url_meta, clustered in pattern_maker.make():
279 |         for pattern in pformat('pattern', url_meta, clustered):
280 |             # do whatever you want
281 |             pass
282 | 
283 | 
284 | * Match URLs:
285 | 
286 |   .. code:: python 
287 |   
288 |     from os_urlpattern.pattern_matcher import PatternMatcher
289 | 
290 |     pattern_matcher = PatternMatcher()
291 | 
292 |     # load url_pattern(unicode)
293 |     for url_pattern in url_patterns:
294 |         # meta will bind to matched result
295 |         pattern_matcher.load(url_pattern, meta=url_pattern)
296 | 
297 |     # match URL(unicode)
298 |     for url in urls:
299 |         matched_results = patterm_matcher.match(url)
300 |         # the best matched result:
301 |         # sorted(matched_results, reverse=True)[0]
302 |         patterns = [n.meta for n in matched_results]
303 | 
304 | 
305 | * Low-level APIs:
306 | 
307 |   It is necessary to use low-level APIs for customizing processing procdure,
308 |   especially for parallel computing or working on an distributed cluster(hadoop).
309 | 
310 |   **Key points: same fuzzy-digest same maker and same matcher.**
311 | 
312 |   Use ``os_urlpattern.parser.fuzzy_digest`` to get fuzzy digest from URL,
313 |   URL pattern or URLMeta and parsed pieces/patterns.
314 | 
315 |   A brief All-In-One example:
316 | 
317 |   .. code:: python 
318 |   
319 |     from __future__ import print_function, unicode_literals
320 |     from os_urlpattern.formatter import pformat
321 |     from os_urlpattern.parser import fuzzy_digest, parse
322 |     from os_urlpattern.pattern_maker import Maker
323 |     from os_urlpattern.pattern_matcher import Matcher
324 | 
325 |     urls = ['http://t.com/%02d.html' % i for i in xrange(0,10)]
326 |     makers = {}
327 |     matchers = {}
328 | 
329 |     # Init makers from URLs(unicode).
330 |     for url in urls:
331 |         url_meta, parsed_pieces = parse(url)
332 |         
333 |         # same digest same maker
334 |         digest = fuzzy_digest(url_meta, parsed_pieces)
335 |         if digest not in makers:
336 |             makers[digest] = Maker(url_meta)
337 |         makers[digest].load(parsed_pieces)
338 | 
339 |     # Iterate makers, do clustering, generate URL pattern and init matchers.
340 |     for maker in makers.values():
341 |         for clustered in maker.make():
342 |             for pattern in pformat('pattern', maker.url_meta, clustered):
343 |                 # init matchers
344 |                 url_meta, parsed_patterns = parse(pattern)
345 |                 digest = fuzzy_digest(url_meta, parsed_patterns)
346 |                 if digest not in matchers:
347 |                     matchers[digest] = Matcher(url_meta)
348 |                 matchers[digest].load(parsed_patterns, pattern)
349 |     
350 |     # Match URLs(unicode).
351 |     for url in urls:
352 |         url_meta, parsed_pieces = parse(url)
353 | 
354 |         # same digest same matcher
355 |         digest = fuzzy_digest(url_meta, parsed_pieces)
356 |         if digest in matchers:
357 |             matched = [n.meta for n in matchers[digest].match(parsed_pieces)]
358 |             print(url, *matched, sep="\t")        
359 |         else: # no matched at all
360 |             pass
361 | 
362 | 
363 | 
364 | ============
365 | Unit Tests
366 | ============
367 | 
368 | ``$ tox``
369 | 
370 | ============
371 | License
372 | ============
373 | 
374 | MIT licensed.
375 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -s --fulltrace -v
3 | env = 
4 |     COVERAGE = true
5 |     
6 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 | 
4 | [metadata]
5 | description-file = README.rst
6 | license_file = LICENSE
7 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 | 
 3 | 
 4 | def read(*filenames, **kwargs):
 5 |     import io
 6 |     from os.path import join, dirname
 7 |     encoding = kwargs.get('encoding', 'utf-8')
 8 |     sep = kwargs.get('sep', '\n')
 9 |     buf = []
10 |     for filename in filenames:
11 |         with io.open(join(dirname(__file__), filename), encoding=encoding) as f:
12 |             buf.append(f.read())
13 |     return sep.join(buf)
14 | 
15 | 
16 | setup(
17 |     name='os-urlpattern',
18 |     version=read('src/os_urlpattern/VERSION'),
19 |     packages=find_packages(where='src'),
20 |     package_dir={'': 'src'},
21 |     include_package_data=True,
22 |     license='MIT License',
23 |     description='Cluster url pattern automatically.',
24 |     long_description=open('README.rst').read(),
25 |     author='Ozzy',
26 |     author_email='cfhamlet@gmail.com',
27 |     url='https://github.com/cfhamlet/os-urlpattern',
28 |     zip_safe=False,
29 |     entry_points={
30 |         'console_scripts': [
31 |             'pattern-make = os_urlpattern.cmdline:make',
32 |             'pattern-match = os_urlpattern.cmdline:match',
33 |         ]
34 |     },
35 |     extras_require={
36 |         'memory': ['psutil'],
37 |         'ete-tree': ['six', 'ete3']
38 |     },
39 |     classifiers=[
40 |         'Development Status :: 2 - Pre-Alpha',
41 |         'Intended Audience :: Developers',
42 |         'License :: OSI Approved :: MIT License',
43 |         'Natural Language :: English',
44 |         'Programming Language :: Python :: 2',
45 |         'Programming Language :: Python :: 2.7',
46 |         'Programming Language :: Python :: 3',
47 |         'Programming Language :: Python :: 3.6',
48 |         'Programming Language :: Python :: Implementation :: CPython',
49 |         'Programming Language :: Python :: Implementation :: PyPy',
50 |     ])
51 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.11
2 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/__init__.py:
--------------------------------------------------------------------------------
 1 | """os-urlpattern.
 2 | 
 3 | Unsupervised URLs clustering, generate and match URL pattern.
 4 | """
 5 | import sys
 6 | __all__ = ['__version__', 'version_info']
 7 | 
 8 | import pkgutil
 9 | __version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
10 | version_info = tuple(int(v) if v.isdigit() else v
11 |                      for v in __version__.split('.'))
12 | 
13 | if sys.version_info < (2, 7):
14 |     sys.exit("os-urlpattern %s requires Python 2.7" % __version__)
15 | 
16 | del pkgutil
17 | del sys
18 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/cmdline.py:
--------------------------------------------------------------------------------
  1 | """Command line tools.
  2 | 
  3 | pattern-make:
  4 |     Load URLs, cluster then generate URL pattern.
  5 | 
  6 | pattern-matcher:
  7 |     Load pattern, match URL and get matched results.
  8 | 
  9 | """
 10 | from __future__ import print_function, unicode_literals
 11 | 
 12 | import argparse
 13 | import logging.config
 14 | import sys
 15 | import time
 16 | from collections import Counter
 17 | from itertools import chain
 18 | 
 19 | from . import __version__
 20 | from .compat import binary_stdin, binary_stdout
 21 | from .config import get_default_config
 22 | from .definition import DEFAULT_ENCODING
 23 | from .exceptions import (InvalidCharException, InvalidPatternException,
 24 |                          IrregularURLException)
 25 | from .formatter import FORMATTERS, pformat
 26 | from .pattern_maker import PatternMaker
 27 | from .pattern_matcher import PatternMatcher
 28 | from .utils import LogSpeedAdapter, MemoryUsageFormatter, pretty_counter
 29 | 
 30 | _DEFAULT_LOGGING = {
 31 |     'version': 1,
 32 |     'disable_existing_loggers': True,
 33 |     'incremental': True,
 34 | }
 35 | 
 36 | 
 37 | def _config_logging(log_level):
 38 |     logging.config.dictConfig(_DEFAULT_LOGGING)
 39 |     if log_level == 'NOTSET':
 40 |         handler = logging.NullHandler()
 41 |     else:
 42 |         handler = logging.StreamHandler()
 43 |     formatter = MemoryUsageFormatter(
 44 |         fmt='[%(asctime)s] [%(name)s] [%(levelname)s] [%(memory)s] %(message)s',
 45 |         datefmt='%Y-%m-%d %H:%M:%S',
 46 |     )
 47 |     logging.root.setLevel(logging.NOTSET)
 48 |     handler.setFormatter(formatter)
 49 |     handler.setLevel(log_level)
 50 |     logging.root.addHandler(handler)
 51 | 
 52 | 
 53 | class Command(object):
 54 |     def __init__(self, config=None):
 55 |         self._config = config
 56 |         self._logger = logging.getLogger(self.__class__.__name__)
 57 | 
 58 |     def add_argument(self, parser):
 59 | 
 60 |         parser.add_argument('-v', '--version',
 61 |                             action='version',
 62 |                             version='%(prog)s {version}'.format(
 63 |                                 version=__version__)
 64 |                             )
 65 | 
 66 |         parser.add_argument('-i', '--inputs',
 67 |                             help='input files to be processed (default: stdin)',
 68 |                             nargs='+',
 69 |                             type=argparse.FileType('rb'),
 70 |                             default=[binary_stdin],
 71 |                             dest='inputs')
 72 | 
 73 |         parser.add_argument('-l', '--loglevel',
 74 |                             help='log level (default: NOTSET)',
 75 |                             default='NOTSET',
 76 |                             action='store',
 77 |                             dest='log_level',
 78 |                             choices=['NOTSET', 'DEBUG', 'INFO',
 79 |                                      'WARN', 'ERROR', 'FATAL'],
 80 |                             type=lambda s: s.upper())
 81 | 
 82 |     def process_args(self, args):
 83 |         _config_logging(args.log_level)
 84 | 
 85 |     def run(self, args):
 86 |         raise NotImplementedError
 87 | 
 88 | 
 89 | class MakePatternCommand(Command):
 90 | 
 91 |     def process_args(self, args):
 92 |         super(MakePatternCommand, self).process_args(args)
 93 |         if args.config:
 94 |             self._config.readfp(args.config[0])
 95 | 
 96 |     def add_argument(self, parser):
 97 |         super(MakePatternCommand, self).add_argument(parser)
 98 |         parser.add_argument('-c', '--config',
 99 |                             help='config file',
100 |                             nargs=1,
101 |                             type=argparse.FileType('r'),
102 |                             dest='config')
103 | 
104 |         parser.add_argument('-f', '--formatter',
105 |                             help='output formatter (default: CLUSTER)',
106 |                             default='CLUSTER',
107 |                             action='store',
108 |                             dest='format_type',
109 |                             choices=FORMATTERS.keys(),
110 |                             type=lambda s: s.upper())
111 | 
112 |     def _load(self, pattern_maker, args):
113 |         load_url = args.format_type in ('CLUSTER', 'INLINE')
114 |         stats = Counter()
115 |         with LogSpeedAdapter(self._logger, 5000) as speed_logger:
116 |             load = pattern_maker.load
117 |             for line in chain.from_iterable(args.inputs):
118 |                 speed_logger.debug('[LOADING]')
119 |                 stats['ALL'] += 1
120 |                 line = line.strip()
121 |                 if not line:
122 |                     stats['EMPTY'] += 1
123 |                     continue
124 |                 try:
125 |                     url = line.decode(DEFAULT_ENCODING)
126 |                     _, is_new = load(url, meta=url if load_url else None)
127 |                     if is_new:
128 |                         stats['UNIQ'] += 1
129 |                     stats['VALID'] += 1
130 |                 except (InvalidPatternException,
131 |                         IrregularURLException,
132 |                         InvalidCharException,
133 |                         UnicodeDecodeError,
134 |                         ValueError) as e:
135 |                     self._logger.warn('%s, %r', str(e), line)
136 |                     stats['INVALID'] += 1
137 |                     continue
138 |                 except Exception as e:
139 |                     self._logger.error('%s, %r', str(e), line)
140 |                     stats['INVALID'] += 1
141 |                     continue
142 |         self._logger.debug('[LOADED] %s', pretty_counter(stats))
143 | 
144 |     def _process(self, pattern_maker, args):
145 |         combine = args.format_type == 'ETE'
146 |         s = time.time()
147 |         for maker in pattern_maker.makers:
148 |             for root in maker.make(combine):
149 |                 e = time.time()
150 |                 self._logger.debug('[CLUSTER] %d %.2fs', root.count, e - s)
151 |                 for record in pformat(args.format_type, maker.url_meta, root):
152 |                     print(record)
153 |                 s = time.time()
154 | 
155 |     def run(self, args):
156 |         pattern_maker = PatternMaker(self._config)
157 |         self._load(pattern_maker, args)
158 |         self._process(pattern_maker, args)
159 | 
160 | 
161 | class MatchPatternCommand(Command):
162 |     def __init__(self):
163 |         super(MatchPatternCommand, self).__init__()
164 | 
165 |     def add_argument(self, parser):
166 |         super(MatchPatternCommand, self).add_argument(parser)
167 |         parser.add_argument('-p', '--pattern-files',
168 |                             help='pattern files to be loaded',
169 |                             nargs='+',
170 |                             type=argparse.FileType('rb'),
171 |                             required=True,
172 |                             dest='pattern_files')
173 | 
174 |         parser.add_argument('-a', '--all-matched',
175 |                             help='all matched patterns',
176 |                             default=False,
177 |                             action='store_true',
178 |                             dest='all_matched')
179 | 
180 |     def _load(self, pattern_matcher, args):
181 |         stats = Counter()
182 |         p_inputs = args.pattern_files
183 |         self._logger.debug('[LOAD] %d pattern file%s: %s',
184 |                            len(p_inputs),
185 |                            's' if len(p_inputs) > 1 else '',
186 |                            ', '.join([p.name for p in p_inputs]))
187 |         with LogSpeedAdapter(self._logger, 1000) as speed_logger:
188 |             load = pattern_matcher.load
189 |             for line in chain.from_iterable(p_inputs):
190 |                 speed_logger.debug('[LOADING]')
191 |                 stats['ALL'] += 1
192 |                 line = line.rstrip()
193 |                 if not line.startswith(b'/'):
194 |                     stats['UNKNOW'] += 1
195 |                     continue
196 |                 try:
197 |                     pattern = line.decode(DEFAULT_ENCODING)
198 |                     load(pattern, meta=pattern)
199 |                     stats['VALID'] += 1
200 |                 except Exception as e:
201 |                     self._logger.warn("%s, %r", str(e), line)
202 |                     stats['INVALID'] += 1
203 |         self._logger.debug('[LOAD] Finished %s', pretty_counter(stats))
204 | 
205 |     def _match_result(self, pattern_matcher, raw_url, args):
206 |         result = None
207 |         try:
208 |             url = raw_url.decode(DEFAULT_ENCODING)
209 |             result = pattern_matcher.match(url)
210 |             if not args.all_matched:
211 |                 result = sorted(result, reverse=True)
212 |                 result = result[:1]
213 |             result = '\t'.join([r.meta for r in result]
214 |                                ).encode(DEFAULT_ENCODING)
215 |         except (InvalidPatternException,
216 |                 IrregularURLException,
217 |                 InvalidCharException,
218 |                 UnicodeDecodeError,
219 |                 ValueError) as e:
220 |             result = b'E'
221 |             self._logger.warn("%s, %r", str(e), raw_url)
222 |         except Exception as e:
223 |             result = b'E'
224 |             self._logger.error("%s, %r", str(e), raw_url)
225 |         return result
226 | 
227 |     def _match(self, pattern_matcher, args):
228 |         speed_logger = LogSpeedAdapter(self._logger, 5000)
229 |         write = binary_stdout.write
230 |         for line in chain.from_iterable(args.inputs):
231 |             speed_logger.debug('[MATCHING]')
232 |             line = line.strip()
233 |             result = self._match_result(pattern_matcher, line, args)
234 |             if not result:
235 |                 result = b'N'
236 |             write(result)
237 |             write(b'\t')
238 |             write(line)
239 |             write(b'\n')
240 | 
241 |     def run(self, args):
242 |         pattern_matcher = PatternMatcher()
243 |         self._load(pattern_matcher, args)
244 |         self._match(pattern_matcher, args)
245 | 
246 | 
247 | def _execute(command, argv=None):
248 |     argv = argv or sys.argv
249 |     parser = argparse.ArgumentParser()
250 |     command.add_argument(parser)
251 |     args = parser.parse_args(argv[1:])
252 |     command.process_args(args)
253 |     command.run(args)
254 | 
255 | 
256 | def make(argv=None):
257 |     _execute(MakePatternCommand(get_default_config()), argv)
258 | 
259 | 
260 | def match(argv=None):
261 |     _execute(MatchPatternCommand(), argv)
262 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/compat.py:
--------------------------------------------------------------------------------
 1 | """Compatible import.
 2 | """
 3 | 
 4 | from __future__ import unicode_literals
 5 | import operator
 6 | import string
 7 | import sys
 8 | 
 9 | _PY3 = sys.version_info[0] >= 3
10 | 
11 | if _PY3:
12 |     from io import StringIO
13 |     iteritems = operator.methodcaller("items")
14 |     itervalues = operator.methodcaller("values")
15 |     from urllib.parse import urlparse, ParseResult
16 |     from configparser import ConfigParser
17 |     binary_stdin = sys.stdin.buffer
18 |     binary_stdout = sys.stdout.buffer
19 | else:
20 |     try:
21 |         from cStringIO import StringIO  # safe, only process ascii
22 |     except ImportError:
23 |         from StringIO import StringIO
24 |     iteritems = operator.methodcaller("iteritems")
25 |     itervalues = operator.methodcaller("itervalues")
26 |     from urlparse import urlparse, ParseResult
27 |     from ConfigParser import ConfigParser
28 |     binary_stdin = sys.stdin
29 |     binary_stdout = sys.stdout
30 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/config/__init__.py:
--------------------------------------------------------------------------------
 1 | """Configure.
 2 | """
 3 | from  ..compat import ConfigParser
 4 | 
 5 | def get_default_config():
 6 |     """Get default configure instance.
 7 | 
 8 |     Returns:
 9 |         Config -- default confiure instance
10 |     """
11 |     import os
12 |     path = os.path.dirname(__file__)
13 |     cfg = ConfigParser()
14 |     cfg.read(os.path.join(path, 'default_config.cfg'))
15 |     return cfg
16 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/config/default_config.cfg:
--------------------------------------------------------------------------------
1 | [make]
2 | min_cluster_num = 3


--------------------------------------------------------------------------------
/src/os_urlpattern/definition.py:
--------------------------------------------------------------------------------
  1 | """Definition of global constant varialbles.
  2 | """
  3 | 
  4 | from __future__ import unicode_literals
  5 | 
  6 | import hashlib
  7 | import string
  8 | 
  9 | from .pattern import Pattern
 10 | 
 11 | DEFAULT_ENCODING = 'UTF-8'
 12 | 
 13 | 
 14 | class Symbols(object):
 15 |     PLUS = '+'
 16 |     EMPTY = ''
 17 |     SLASH = '/'
 18 |     EQUALS = '='
 19 |     NUMBER = '#'
 20 |     PERCENT = '%'
 21 |     QUESTION = '?'
 22 |     BRACES_L = '{'
 23 |     BRACES_R = '}'
 24 |     AMPERSAND = '&'
 25 |     BACKSLASH = '\\'
 26 |     BRACKETS_L = '['
 27 |     BRACKETS_R = ']'
 28 | 
 29 | 
 30 | class BasePatternRule(object):
 31 |     DIGIT = '0-9'
 32 |     BASE_ASCII_LOWER = 'a-z'
 33 |     BASE_ASCII_UPPER = 'A-Z'
 34 |     BASE_ASCII = 'A-Za-z'
 35 |     BASE_DIGIT_AND_ASCII_LOWER = '0-9a-z'
 36 |     BASE_DIGIT_AND_ASCII_UPPER = '0-9A-Z'
 37 |     BASE_DIGIT_AND_ASCII = '0-9A-Za-z'
 38 |     SINGLE_DIGIT = '[0-9]'
 39 |     SINGLE_ASCII_LOWER = '[a-z]'
 40 |     SINGLE_ASCII_UPPER = '[A-Z]'
 41 |     MULTI_DIGIT = '[0-9]+'
 42 |     MULTI_ASCII_LOWER = '[a-z]+'
 43 |     MULTI_ASCII_UPPER = '[A-Z]+'
 44 |     MULTI_ASCII = '[A-Za-z]+'
 45 |     MULTI_DIGIT_AND_ASCII_LOWER = '[0-9a-z]+'
 46 |     MULTI_DIGIT_AND_ASCII_UPPER = '[0-9A-Z]+'
 47 |     MULTI_DIGIT_AND_ASCII = '[0-9A-Za-z]+'
 48 |     DOT = '\\.'
 49 |     EMPTY = ''
 50 |     SINGLE_QUESTION = '[\\?]'
 51 | 
 52 | 
 53 | ZERO_DIGEST = hashlib.md5(b'0').hexdigest().upper()
 54 | QUERY_PART_RESERVED_CHARS = frozenset([Symbols.EQUALS])
 55 | EMPTY_TUPLE = ()
 56 | BLANK_TUPLE = (BasePatternRule.EMPTY,)
 57 | 
 58 | # 26 letters rules
 59 | CHAR_AND_RULE_LIST = []
 60 | ASCII_AND_RULE_LIST = []
 61 | ASCII_AND_RULE_LIST.extend([(i, BasePatternRule.BASE_ASCII_LOWER)
 62 |                             for i in string.ascii_lowercase])
 63 | ASCII_AND_RULE_LIST.extend([(i, BasePatternRule.BASE_ASCII_UPPER)
 64 |                             for i in string.ascii_uppercase])
 65 | CHAR_AND_RULE_LIST.extend(ASCII_AND_RULE_LIST)
 66 | 
 67 | # digit rules
 68 | DIGIT_AND_RULE_LIST = [(i, BasePatternRule.DIGIT)
 69 |                        for i in string.digits]
 70 | CHAR_AND_RULE_LIST.extend(DIGIT_AND_RULE_LIST)
 71 | 
 72 | # digit and 26 letters set
 73 | DIGIT_SET = frozenset([i for i in string.digits])
 74 | ASCII_LOWER_SET = frozenset([i for i in string.ascii_lowercase])
 75 | ASCII_UPPER_SET = frozenset([i for i in string.ascii_uppercase])
 76 | ASCII_DIGIT_SET = frozenset([c for c, _ in CHAR_AND_RULE_LIST])
 77 | 
 78 | # do not escaped symbol rules
 79 | SYMBOL = '%&_@#;:,=<>~/'
 80 | SYMBOL_SET = frozenset([i for i in SYMBOL])
 81 | SYMBOL_AND_RULE_LIST = [(i, i) for i in SYMBOL_SET]
 82 | CHAR_AND_RULE_LIST.extend(SYMBOL_AND_RULE_LIST)
 83 | 
 84 | # escaped symbol rules
 85 | ESCAPE = '.+\\"\'()[]{}*$^?|!-'
 86 | ESCAPE_SET = frozenset([i for i in ESCAPE])
 87 | ESCAPE_AND_RULE_LIST = [(i, '\\%s' % i) for i in ESCAPE_SET]
 88 | CHAR_AND_RULE_LIST.extend(ESCAPE_AND_RULE_LIST)
 89 | 
 90 | # all char and rule mapping
 91 | CHAR_RULE_DICT = dict(CHAR_AND_RULE_LIST)
 92 | RULE_SET = frozenset([r for _, r in CHAR_AND_RULE_LIST])
 93 | 
 94 | # ==
 95 | RULE_SIGN_DICT = dict(
 96 |     [(v, k) for k, v in SYMBOL_AND_RULE_LIST + ESCAPE_AND_RULE_LIST])
 97 | SIGN_RULE_SET = frozenset(RULE_SIGN_DICT.keys())
 98 | 
 99 | # ==
100 | DIGIT_AND_ASCII_LOWER_RULE_LIST = [BasePatternRule.DIGIT,
101 |                                    BasePatternRule.BASE_ASCII_LOWER]
102 | DIGIT_AND_ASCII_UPPER_RULE_LIST = [BasePatternRule.DIGIT,
103 |                                    BasePatternRule.BASE_ASCII_UPPER]
104 | DIGIT_AND_ASCII_RULE_LIST = [BasePatternRule.DIGIT,
105 |                              BasePatternRule.BASE_ASCII_LOWER,
106 |                              BasePatternRule.BASE_ASCII_UPPER,
107 |                              BasePatternRule.BASE_ASCII]
108 | 
109 | DIGIT_AND_ASCII_UPPER_RULE_SET = frozenset(DIGIT_AND_ASCII_UPPER_RULE_LIST)
110 | DIGIT_AND_ASCII_LOWER_RULE_SET = frozenset(DIGIT_AND_ASCII_LOWER_RULE_LIST)
111 | DIGIT_AND_ASCII_RULE_SET = frozenset(DIGIT_AND_ASCII_RULE_LIST)
112 | 
113 | # ==
114 | BASE_ASCII_RULE_SET = frozenset([BasePatternRule.BASE_ASCII,
115 |                                  BasePatternRule.BASE_ASCII_LOWER,
116 |                                  BasePatternRule.BASE_ASCII_UPPER])
117 | 
118 | MULTI_ASCII_RULE_SET = frozenset([BasePatternRule.MULTI_ASCII,
119 |                                   BasePatternRule.MULTI_ASCII_LOWER,
120 |                                   BasePatternRule.MULTI_ASCII_UPPER])
121 | 
122 | MIXED_RULE_SET = DIGIT_AND_ASCII_RULE_SET.union([Symbols.PERCENT])
123 | 
124 | 
125 | class BasePattern(object):
126 |     SINGLE_DIGIT = Pattern(BasePatternRule.SINGLE_DIGIT)
127 |     SINGLE_ASCII_LOWER = Pattern(BasePatternRule.SINGLE_ASCII_LOWER)
128 |     SINGLE_ASCII_UPPER = Pattern(BasePatternRule.SINGLE_ASCII_UPPER)
129 |     MULTI_DIGIT = Pattern(BasePatternRule.MULTI_DIGIT)
130 |     MULTI_ASCII_LOWER = Pattern(BasePatternRule.MULTI_ASCII_LOWER)
131 |     MULTI_ASCII_UPPER = Pattern(BasePatternRule.MULTI_ASCII_UPPER)
132 |     MULTI_DIGIT_AND_ASCII_LOWER = Pattern(
133 |         BasePatternRule.MULTI_DIGIT_AND_ASCII_LOWER)
134 |     MULTI_DIGIT_AND_ASCII_UPPER = Pattern(
135 |         BasePatternRule.MULTI_DIGIT_AND_ASCII_UPPER)
136 |     MULTI_DIGIT_AND_ASCII = Pattern(BasePatternRule.MULTI_DIGIT_AND_ASCII)
137 |     DOT = Pattern(BasePatternRule.DOT)
138 |     EMPTY = Pattern(BasePatternRule.EMPTY)
139 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/exceptions.py:
--------------------------------------------------------------------------------
 1 | """Custom Exceptions.
 2 | """
 3 | 
 4 | 
 5 | class IrregularURLException(Exception):
 6 |     pass
 7 | 
 8 | 
 9 | class InvalidPatternException(Exception):
10 |     pass
11 | 
12 | 
13 | class InvalidCharException(Exception):
14 |     pass
15 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/formatter.py:
--------------------------------------------------------------------------------
  1 | """Clustered record formatter.
  2 | """
  3 | from __future__ import unicode_literals
  4 | 
  5 | import json
  6 | import sys
  7 | 
  8 | from .definition import BasePatternRule, Symbols
  9 | from .parse_utils import pack
 10 | from .utils import dump_tree, get_classes
 11 | 
 12 | 
 13 | class Formatter(object):
 14 |     """Base class for format clustered data.
 15 | 
 16 |     The subclass must define format method which yield formatted strings.
 17 |     """
 18 | 
 19 |     def format(self, url_meta, root, **kwargs):
 20 |         """Format the clustered tree.
 21 | 
 22 |         Args:
 23 |             url_meta (URLMeta): The url_meta.
 24 |             root (TreeNode): Root node of the clustered tree.
 25 |             **kwargs: Arbitray keyword arguments.
 26 | 
 27 |         Yields:
 28 |             str: the formatted string.
 29 | 
 30 |         """
 31 |         return
 32 |         yield
 33 | 
 34 | 
 35 | class PatternFormatter(Formatter):
 36 |     """Pattern only formatter."""
 37 | 
 38 |     def format(self, url_meta, root, **kwargs):
 39 |         """Yield URL pattern string.
 40 | 
 41 |         Args:
 42 |             url_meta (URLMeta): The URLMeta object.
 43 |             root (TreeNode): Root of a clustered piece tree.
 44 |             **kwargs: Arbitray keyword arguments.
 45 | 
 46 |         Yields:
 47 |             str: URL pattern string.
 48 | 
 49 |         """
 50 |         for nodes in dump_tree(root):
 51 |             yield pack(url_meta, [p.pattern for p in nodes[1:]])
 52 |             break
 53 | 
 54 | 
 55 | class ClusterFormatter(PatternFormatter):
 56 |     """URL pattern and meta data formatter.
 57 | 
 58 |     Yield URL pattern string first, then all meta data strings.
 59 |     """
 60 | 
 61 |     def format(self, url_meta, root, **kwargs):
 62 |         """Yield URL pattern and all bound meta data strings.
 63 | 
 64 |         Args:
 65 |             url_meta (URLMeta): The URLMeta object.
 66 |             root (TreeNode): Root of a clustered piece tree.
 67 |             **kwargs: Arbitray keyword arguments.
 68 | 
 69 |         Yields:
 70 |             object: URL pattern string first, then all meta
 71 |                 data string prefixed with '\t'.
 72 | 
 73 |         """
 74 |         for r in super(ClusterFormatter, self).format(url_meta, root, **kwargs):
 75 |             yield r
 76 | 
 77 |         for nodes in dump_tree(root):
 78 |             if nodes[-1].meta is None:
 79 |                 continue
 80 |             for obj in nodes[-1].meta:
 81 |                 yield '\t'.join(('', str(obj)))
 82 | 
 83 | 
 84 | class InlineFormatter(PatternFormatter):
 85 |     """URL pattern and meta data formatter.
 86 | 
 87 |     URL pattern and meta data string in one line.
 88 |     """
 89 | 
 90 |     def format(self, url_meta, root, **kwargs):
 91 |         """Yield URL pattern with each bound meta data string in on line.
 92 | 
 93 |         Args:
 94 |             url_meta (URLMeta): The URLMeta object.
 95 |             root (TreeNode): Root of a clustered piece tree.
 96 |             **kwargs: Arbitray keyword arguments.
 97 | 
 98 |         Yields:
 99 |             object: URL pattern string + '\t' + str(meta)
100 | 
101 |         """
102 |         url_pattern_string = None
103 |         for r in super(InlineFormatter, self).format(url_meta, root, **kwargs):
104 |             url_pattern_string = r
105 | 
106 |         for nodes in dump_tree(root):
107 |             if nodes[-1].meta is None:
108 |                 continue
109 |             for obj in nodes[-1].meta:
110 |                 yield '\t'.join((url_pattern_string, str(obj)))
111 | 
112 | 
113 | class JsonFormatter(Formatter):
114 |     """Json formatter.
115 | 
116 |     Yiled Json string, {"ptn":url_pattern, "cnt":count}
117 |         ptn: URL pattern string.
118 |         cnt: Number of uniq path in the cluster.
119 |     """
120 | 
121 |     def format(self, url_meta, root, **kwargs):
122 |         """Yield json format string.
123 | 
124 |         Args:
125 |             url_meta (URLMeta): The URLMeta object.
126 |             root (TreeNode): Root of a clustered piece tree.
127 |             **kwargs: Arbitray keyword arguments.
128 | 
129 |         Yields:
130 |             str: Json string, key-value:
131 |                 ptn: URL pattern string.
132 |                 cnt: Number of uniq path in the cluster.
133 |         """
134 |         for nodes in dump_tree(root):
135 |             p = pack(url_meta, [p.pattern for p in nodes[1:]])
136 |             yield json.dumps({'ptn': p, 'cnt': root.count})
137 |             break
138 | 
139 | 
140 | class ETEFormatter(Formatter):
141 |     """Ete tree formatter."""
142 | 
143 |     def __init__(self):
144 |         import ete3
145 | 
146 |     def format(self, url_meta, root, **kwargs):
147 |         """Yield ete tree string.
148 | 
149 |         Args:
150 |             url_meta (URLMeta): The URLMeta object.
151 |             root (TreeNode): Root of a pattern tree.
152 |             **kwargs: Arbitray keyword arguments.
153 | 
154 |         Yields:
155 |             str: An ete tree string.
156 |         """
157 |         def f(pattern_node):
158 |             sep = Symbols.EMPTY
159 |             query_key = Symbols.EMPTY
160 |             path_depth = url_meta.path_depth
161 |             query_depth = len(url_meta.query_keys)
162 |             current_level = pattern_node.level
163 |             if path_depth < current_level \
164 |                     and current_level <= (path_depth + query_depth):
165 |                 sep = Symbols.AMPERSAND
166 |                 if current_level == path_depth + 1:
167 |                     sep = BasePatternRule.SINGLE_QUESTION
168 |                 query_key = url_meta.query_keys[current_level - path_depth - 1]
169 |             elif current_level == path_depth + query_depth + 1:
170 |                 sep = Symbols.NUMBER
171 |             return ' {sep}{query_key}{pattern_string}({count}) '.format(
172 |                 count=pattern_node.count,
173 |                 pattern_string=pattern_node.value,
174 |                 query_key=query_key,
175 |                 sep=sep)
176 | 
177 |         if root.count <= 0:
178 |             return
179 | 
180 |         ete_tree = get_ete_tree(root, format=f)
181 |         yield ete_tree.get_ascii(show_internal=True)
182 | 
183 | 
184 | def get_ete_tree(root_node, format=str):
185 |     """Transfor a tree-like object into ete tree.
186 | 
187 |     Args:
188 |         root_node (TreeNode): The root of the tree.
189 |         format (callable, optional): Defaults to str.
190 |             A callable object to format the ete tree node.
191 | 
192 |     Returns:
193 |         ete3.Tree: The ete tree.
194 |     """
195 |     from ete3 import Tree
196 | 
197 |     def add_children(node, ete_node):
198 |         for child in node.children:
199 |             ete_child = ete_node.add_child(name=format(child))
200 |             add_children(child, ete_child)
201 | 
202 |     ete_root_node = Tree(name=format(root_node))
203 |     add_children(root_node, ete_root_node)
204 |     return ete_root_node
205 | 
206 | 
207 | def pformat(name, url_meta, root, **kwargs):
208 |     """Shortcut for formatting.
209 | 
210 |     Args:
211 |         name (str): Format type.
212 |         url_meta (URLMeta): The URLMeta object.
213 |         root (TreeNode): Root of a clustered tree.
214 |         **kwargs: Arbitray keyword arguments.
215 | 
216 |     Returns:
217 |         Iterator: For iterate formatted strings.
218 |     """
219 |     return FORMATTERS[name.upper()].format(url_meta, root, **kwargs)
220 | 
221 | 
222 | # Auto discover Formatter classes and init FORMATTERS.
223 | FORMATTERS = {}
224 | for c_cls in get_classes(sys.modules[__name__], Formatter):
225 |     c_name = c_cls.__name__
226 |     t = c_name.rfind('Formatter')
227 |     if t < 0:
228 |         raise ImportError('Invalid formatter name: %s' % c_name)
229 |     name = c_name[0:t].upper() if c_name[0:t] else 'NULL'
230 |     try:
231 |         FORMATTERS[name] = c_cls()
232 |     except:
233 |         pass
234 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/parse_utils.py:
--------------------------------------------------------------------------------
  1 | """Utilitis for parsing URL and pattern.
  2 | """
  3 | 
  4 | from __future__ import unicode_literals
  5 | 
  6 | import hashlib
  7 | from collections import namedtuple
  8 | 
  9 | from .compat import ParseResult, StringIO, urlparse
 10 | from .definition import (ASCII_DIGIT_SET, BLANK_TUPLE, CHAR_RULE_DICT,
 11 |                          DEFAULT_ENCODING, DIGIT_AND_ASCII_RULE_SET,
 12 |                          EMPTY_TUPLE, MIXED_RULE_SET,
 13 |                          QUERY_PART_RESERVED_CHARS, RULE_SET, SIGN_RULE_SET,
 14 |                          BasePatternRule, Symbols)
 15 | from .exceptions import (InvalidCharException, InvalidPatternException,
 16 |                          IrregularURLException)
 17 | 
 18 | URLPatternParseResult = namedtuple(
 19 |     'URLPatternParseResult', 'path query fragment')
 20 | 
 21 | 
 22 | class URLMeta(namedtuple('URLMeta', 'path_depth query_keys has_fragment')):
 23 |     """The URL structure meta.
 24 | 
 25 |     Attributes:
 26 |         path_depth (int): The num of URL path levels.
 27 |         querys_keys (:obj:`tuple` of :obj:`str`): Query keys.
 28 |         has_fragment (bool): Whether the URL have fragmemt component.
 29 | 
 30 |     """
 31 |     __slots__ = ()
 32 | 
 33 |     @property
 34 |     def depth(self):
 35 |         return self.path_depth + len(self.query_keys) + (1 if self.has_fragment else 0)
 36 | 
 37 | 
 38 | def specify_rule(rule, num):
 39 |     """Specify the format of the rule.
 40 | 
 41 |     num == 1 will return [rule], single
 42 |     num > 1  will return [rule]{num}, with number
 43 |     num < 0  will return [rule]+, wildcard
 44 |     num == 0 will raise ValueError
 45 | 
 46 |     Args:
 47 |         rule (str): The raw rule string to be secified.
 48 |         num (int): The num of the rule. Can't be 0.
 49 | 
 50 |     Raises:
 51 |         ValueError: If the num == 0.
 52 | 
 53 |     Returns:
 54 |         str: The specified format of the rule.
 55 | 
 56 |     Examples:
 57 | 
 58 |         >>> from os_urlpattern.parse_utils import specify_rule
 59 |         >>> specify_rule('a-z', 1)
 60 |         [a-z]
 61 |         >>> specify_rule('a-z', 2)
 62 |         [a-z]{2}
 63 |         >>> specify_rule('a-z', -1)
 64 |         [a-z]+
 65 | 
 66 |     """
 67 | 
 68 |     if num == 1:
 69 |         return '[%s]' % rule
 70 |     elif num < 0:
 71 |         return '[%s]+' % rule
 72 |     elif num > 1:
 73 |         return '[%s]{%d}' % (rule, num)
 74 |     else:
 75 |         raise ValueError('Invalid num %s' % str(num))
 76 | 
 77 | 
 78 | def wildcard_rule(rule):
 79 |     """The wildcard format of the rule.
 80 | 
 81 |     Shortcut of specify_rule(rule, -1).
 82 | 
 83 |     Args:
 84 |         rule (str): The raw rule string to be secified.
 85 | 
 86 |     Returns:
 87 |         str: The wildcard format of the rule.
 88 |     """
 89 |     return specify_rule(rule, -1)
 90 | 
 91 | 
 92 | def normalize(raw_string, reserved_chars=None):
 93 |     """Normalize a string.
 94 | 
 95 |     Transfor the continuous same signs in the string to the format of
 96 |     [sign_rule]{num}, if the sign is not in zhe reserved_chars.
 97 | 
 98 |     Args:
 99 |         raw_string (str): The string to be normalized.
100 |         reserved_chars ([type], optional): Defaults to None. Reserved chars
101 |             which are not to be normalized.
102 | 
103 |     Returns:
104 |         str: The normalized string.
105 | 
106 |     Examples:
107 | 
108 |         >>> from os_urlpattern.parse_utils import normalize
109 |         >>> normalize('abc==123---')
110 |         u'abc[=]{2}123[\\-]{3}'
111 | 
112 |     """
113 |     normalized = StringIO()
114 |     frag = StringIO()
115 |     last_c = None
116 |     for c in raw_string:
117 |         if c in ASCII_DIGIT_SET:
118 |             if last_c and last_c not in ASCII_DIGIT_SET:
119 |                 frag.seek(0)
120 |                 w = frag.read()
121 |                 l = len(w)
122 |                 if l > 0:
123 |                     if not reserved_chars or w[0] not in reserved_chars:
124 |                         r = CHAR_RULE_DICT.get(w[0])
125 |                         w = specify_rule(r, l)
126 |                     normalized.write(w)
127 |                     frag = StringIO()
128 |         else:
129 |             if last_c != c:
130 |                 frag.seek(0)
131 |                 w = frag.read()
132 |                 l = len(w)
133 |                 if l > 0 and w[0] not in ASCII_DIGIT_SET and \
134 |                         (not reserved_chars or w[0] not in reserved_chars):
135 |                     r = CHAR_RULE_DICT.get(w[0])
136 |                     w = specify_rule(r, l)
137 |                 normalized.write(w)
138 |                 frag = StringIO()
139 |         frag.write(c)
140 |         last_c = c
141 | 
142 |     frag.seek(0)
143 |     w = frag.read()
144 |     l = len(w)
145 |     if last_c and last_c not in ASCII_DIGIT_SET and \
146 |             (not reserved_chars or w[0] not in reserved_chars):
147 |         r = CHAR_RULE_DICT.get(w[0])
148 |         w = specify_rule(r, l)
149 |     normalized.write(w)
150 |     normalized.seek(0)
151 |     return normalized.read()
152 | 
153 | 
154 | def parse_url(url):
155 |     """Parse a URL into 6 components.
156 | 
157 |     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
158 | 
159 |     Like the built-in urlparse method, but handle some unusual situation.
160 | 
161 |     Args:
162 |         url (str): The URL to be parsed.
163 | 
164 |     Returns:
165 |         ParseResult: A 6-tuple, (scheme, netloc, path, params, query, fragment).
166 |     """
167 |     scheme, netloc, path, params, query, fragment = urlparse(url)
168 |     if not fragment:
169 |         if url[-1] != Symbols.NUMBER:
170 |             fragment = None
171 |             if not query and url[-1] != Symbols.QUESTION:
172 |                 query = None
173 |         elif not query and url[-2] != Symbols.QUESTION:
174 |             query = None
175 |     elif not query:
176 |         if url[len(url) - len(fragment) - 2] != Symbols.QUESTION:
177 |             query = None
178 |     return ParseResult(scheme, netloc, path, params, query, fragment)
179 | 
180 | 
181 | def filter_useless(objs):
182 |     """Filter the useless objects.
183 | 
184 |     If bool(object) == False, the object is useless except the last one.
185 | 
186 |     Args:
187 |         objs (sequence): The objects will be filtered.
188 | 
189 |     Returns:
190 |         iterable: The filterd objs
191 | 
192 |     Examples:
193 | 
194 |         >>> from os_urlpattern.parse_utils import filter_useless
195 |         >>> filter_useless([0,1,0,0])
196 |         [1, 0]
197 | 
198 |     """
199 |     keep = {'c': 0, 'l': len(objs)}
200 | 
201 |     def _filterd(x):
202 |         keep['c'] += 1
203 |         if not x:
204 |             if keep['c'] == keep['l']:
205 |                 return True
206 |             return False
207 |         else:
208 |             return True
209 | 
210 |     return objs.__class__(filter(_filterd, objs))
211 | 
212 | 
213 | def parse_query_string(query_string):
214 |     """Parse query string into keys and values.
215 | 
216 |     Args:
217 |         query_string (str): The string to be parsed.
218 | 
219 |     Raises:
220 |         IrregularURLException: Invalid query string.
221 | 
222 |     Returns:
223 |         tuple: A 2-tuple, (keys and values).
224 |     """
225 |     if query_string is None:
226 |         return EMPTY_TUPLE, EMPTY_TUPLE
227 |     elif query_string == Symbols.EMPTY:
228 |         return BLANK_TUPLE, BLANK_TUPLE
229 |     elif query_string.endswith(Symbols.AMPERSAND):
230 |         raise IrregularURLException("Invalid '&' pos")
231 |     kv_type = True  # query_key True, query_value False
232 |     last_c = None
233 |     kv_buf = {True: StringIO(), False: StringIO()}
234 |     kv_list = {True: [], False: []}
235 |     for i in query_string:
236 |         if i == Symbols.EQUALS and kv_type:
237 |             s = kv_buf[kv_type]
238 |             s.write(i)
239 |             s.seek(0)
240 |             kv_list[kv_type].append(s.read())
241 |             kv_buf[kv_type] = StringIO()
242 |             kv_type = not kv_type
243 |         elif i == Symbols.AMPERSAND:
244 |             if last_c is None or last_c == Symbols.AMPERSAND:
245 |                 raise IrregularURLException("Invalid '&' pos")
246 |             s = kv_buf[kv_type]
247 |             s.seek(0)
248 |             kv_list[kv_type].append(s.read())
249 |             kv_buf[kv_type] = StringIO()
250 |             if kv_type:
251 |                 kv_list[False].append(Symbols.EMPTY)  # treat as value-less
252 |             else:
253 |                 kv_type = not kv_type
254 |         else:
255 |             s = kv_buf[kv_type]
256 |             s.write(i)
257 |         last_c = i
258 | 
259 |     s = kv_buf[kv_type]
260 |     s.seek(0)
261 |     kv_list[kv_type].append(s.read())
262 |     if kv_type:  # treat as value-less
263 |         kv_list[False].append(Symbols.EMPTY)
264 | 
265 |     # Only one query without value, treat as key-less.
266 |     if len(kv_list[True]) == 1 and not kv_list[True][0].endswith(Symbols.EQUALS):
267 |         kv_list[False][0], kv_list[True][0] = kv_list[True][0], kv_list[False][0]
268 |     return tuple(kv_list[True]), tuple(kv_list[False])
269 | 
270 | 
271 | def mix(pieces, rules):
272 |     """Combine the sub-pieces and sub-rules.
273 | 
274 |     If the sub pieces have continuous letter num and percent sign fragments
275 |     will be combine into one piece as well as the rules.
276 | 
277 |     Args:
278 |         pieces (sequence): The raw pieces.
279 |         rules (sequence): The rules.
280 | 
281 |     Returns:
282 |         tuple: A 2-tuple, (mixed_pieces, mixed_rules)
283 |     """
284 |     mixed_pieces = []
285 |     mixed_rules = []
286 | 
287 |     t_pieces = []
288 |     t_rules = []
289 |     t_mix = False
290 |     for piece, rule in zip(pieces, rules):
291 |         if rule in MIXED_RULE_SET:
292 |             if t_rules and not t_mix:
293 |                 mixed_pieces.extend(t_pieces)
294 |                 mixed_rules.extend(t_rules)
295 |                 t_pieces = []
296 |                 t_rules = []
297 |             t_mix = True
298 |         else:
299 |             if t_rules and t_mix:
300 |                 mixed_pieces.append(''.join(t_pieces))
301 |                 mixed_rules.append(''.join(sorted(set(t_rules))))
302 |                 t_pieces = []
303 |                 t_rules = []
304 |             t_mix = False
305 |         t_pieces.append(piece)
306 |         t_rules.append(rule)
307 |     if t_mix:
308 |         mixed_pieces.append(''.join(t_pieces))
309 |         mixed_rules.append(''.join(sorted(set(t_rules))))
310 |     else:
311 |         mixed_pieces.extend(t_pieces)
312 |         mixed_rules.extend(t_rules)
313 |     return pieces.__class__(mixed_pieces), rules.__class__(mixed_rules)
314 | 
315 | 
316 | def unpack(result, normalize_key=True):
317 |     """Split the ParseResult object into URLMeta and pieces.
318 | 
319 |     Args:
320 |         result ([type]): The ParseResult object.
321 |         normalize_key (bool, optional): Defaults to True.
322 |             Whether normalize the query keys.
323 | 
324 |     Raises:
325 |         IrregularURLException: Invalid URL.
326 | 
327 |     Returns:
328 |         tuple: A 2-tuple, (url_meta, pieces).
329 |     """
330 |     pieces = filter_useless(result.path.split(Symbols.SLASH)[1:])
331 |     path_depth = len(pieces)
332 |     if path_depth <= 0:
333 |         raise IrregularURLException('Invalid url depth')
334 | 
335 |     keys, values = parse_query_string(result.query)
336 |     if normalize_key:
337 |         keys = tuple([normalize(key, QUERY_PART_RESERVED_CHARS)
338 |                       for key in keys])
339 |     has_fragment = False if result.fragment is None else True
340 | 
341 |     url_meta = URLMeta(path_depth, keys, has_fragment)
342 |     pieces.extend(values)
343 |     if has_fragment:
344 |         pieces.append(result.fragment)
345 |     return url_meta, tuple(pieces)
346 | 
347 | 
348 | def pack(url_meta, objs):
349 |     """Pack into URL-like string.
350 | 
351 |     Args:
352 |         url_meta (URLMeta): The URLMeta object.
353 |         objs (sequence): The objects to be packed.
354 | 
355 |     Returns:
356 |         str: The packed URL-like string.
357 |     """
358 |     s = StringIO()
359 |     s.write(Symbols.SLASH)
360 |     query_depth = len(url_meta.query_keys)
361 |     idx = url_meta.path_depth + query_depth
362 |     p = Symbols.SLASH.join([str(p) for p in objs[0:url_meta.path_depth]])
363 |     s.write(p)
364 |     if query_depth > 0:
365 |         s.write(BasePatternRule.SINGLE_QUESTION)
366 |         kv = zip(url_meta.query_keys,
367 |                  [str(p) for p in objs[url_meta.path_depth:idx]])
368 |         s.write(Symbols.AMPERSAND.join(
369 |             [''.join((str(k), str(v))) for k, v in kv]))
370 | 
371 |     if url_meta.has_fragment:
372 |         s.write(Symbols.NUMBER)
373 |         s.write(''.join([str(p) for p in objs[idx:]]))
374 |     s.seek(0)
375 |     return s.read()
376 | 
377 | 
378 | def analyze_url(url):
379 |     """Parse a URL to URLMeta object and raw pieces.
380 | 
381 |     Args:
382 |         url (str): The URL to be parsed.
383 | 
384 |     Returns:
385 |         tuple: A 2-tuple, (url_meta, pieces).
386 |     """
387 | 
388 |     result = parse_url(url)
389 |     return unpack(result, True)
390 | 
391 | 
392 | def fuzzy_join(objs, sep='/'):
393 |     """Join the fuzzy_rule of the objects into one string.
394 | 
395 |     Args:
396 |         objs (sequence): The objects each of which have fuzzy_rule property.
397 |         sep (str): Defaults to '/'. Seperator for joining.
398 | 
399 |     Returns:
400 |         str: The joined fuzzy_rule string.
401 |     """
402 |     return sep.join([p.fuzzy_rule for p in objs])
403 | 
404 | 
405 | class ParsedPiece(object):
406 |     """The parsed piece object.
407 | 
408 |     It contains the sub-pieces of a piece and the corresponding sub-rules.
409 |     With it, you can get fuzzy rule and the length of the entire piece.
410 |     It is can be used as map key.
411 | 
412 |     """
413 |     __slots__ = ('pieces', 'rules', '_piece', '_piece_length', '_fuzzy_rule')
414 | 
415 |     def __init__(self, pieces, rules):
416 |         """Init the ParsedPiece object.
417 | 
418 |         Args:
419 |             pieces (tuple): The tuple of parsed pieces.
420 |             rules (tuple): The tuple of the rules of each parsed pieces.
421 |         """
422 |         self.pieces = pieces
423 |         self.rules = rules
424 |         self._piece_length = -1
425 |         self._piece = pieces[0] if len(pieces) == 1 else None
426 |         self._fuzzy_rule = rules[0] if len(rules) == 1 else None
427 | 
428 |     @property
429 |     def fuzzy_rule(self):
430 |         if not self._fuzzy_rule:
431 |             self._fuzzy_rule = ''.join(sorted(set(self.rules)))
432 |         return self._fuzzy_rule
433 | 
434 |     @property
435 |     def piece_length(self):
436 |         """Get the literal length of the piece.
437 | 
438 |         Not the number of the characters of the piece.
439 | 
440 |         Note:
441 | 
442 |             '[%]{2}' have 6 characters, but literal length is 2.
443 | 
444 |         Returns:
445 |             int: The literal length of the piece.
446 |         """
447 |         if self._piece_length < 0:
448 |             piece = self.piece
449 |             length_base = length = len(piece)
450 |             idx = 0
451 |             while idx < length_base:
452 |                 c = piece[idx]
453 |                 if c == Symbols.BRACKETS_L or c == Symbols.BRACKETS_R:
454 |                     if idx == 0 or piece[idx - 1] != Symbols.BACKSLASH:
455 |                         length += -1
456 |                 elif c == Symbols.BACKSLASH:
457 |                     if piece[idx + 1] != Symbols.BACKSLASH:
458 |                         length += -1
459 |                 elif c == Symbols.BRACES_L:
460 |                     if piece[idx - 1] == Symbols.BRACKETS_R:
461 |                         e = piece.index(Symbols.BRACES_R, idx)
462 |                         length += int(piece[idx + 1:e]) - 1 - (e - idx + 1)
463 |                         idx = e
464 |                 idx += 1
465 | 
466 |             self._piece_length = length
467 |         return self._piece_length
468 | 
469 |     def __eq__(self, o):
470 |         if not isinstance(o, ParsedPiece):
471 |             return False
472 |         return self.piece == o.piece
473 | 
474 |     def __hash__(self):
475 |         return hash(self.piece)
476 | 
477 |     @property
478 |     def piece(self):
479 |         if self._piece is None:
480 |             self._piece = ''.join(self.pieces)
481 |         return self._piece
482 | 
483 |     def __str__(self):
484 |         return str(zip(self.pieces, self.rules))
485 | 
486 |     __repr__ = __str__
487 | 
488 | 
489 | EMPTY_PARSED_PIECE = ParsedPiece(EMPTY_TUPLE, EMPTY_TUPLE)
490 | 
491 | 
492 | class PieceParser(object):
493 |     """Parser to parse the URL piece.
494 | 
495 |     Used it to generate ParsedPiece object from the piece of URL.
496 |     Not thread safe.
497 |     """
498 |     __slots__ = ('_rules', '_pieces')
499 | 
500 |     def __init__(self):
501 |         self._reset()
502 | 
503 |     def _reset(self):
504 |         self._rules = []
505 |         self._pieces = []
506 | 
507 |     def parse(self, piece):
508 |         """Parse a string into small sub-pieces with rules.
509 | 
510 |         The consecutive charactors in the same charactor space
511 |         will be joined into one sub-piece, the corresponding
512 |         rule(charactor space) can also be got.
513 | 
514 |         Args:
515 |             piece (str): A string to be parsed.
516 | 
517 |         Returns:
518 |             tuple: 2-tuple, (pieces, rules).
519 |         """
520 | 
521 |         self._reset()
522 |         self._preprocess(piece)
523 |         return self._create_parsed_piece()
524 | 
525 |     def _preprocess(self, piece):
526 |         for c in piece:
527 |             self._define(c)
528 |         for idx, buf in enumerate(self._pieces):
529 |             buf.seek(0)
530 |             letter = buf.read()
531 |             self._pieces[idx] = self._normalize(
532 |                 letter, self._rules[idx])
533 | 
534 |     def _define(self, char):
535 |         last_rule = self._rules[-1] if self._rules else None
536 |         if char not in CHAR_RULE_DICT:
537 |             raise InvalidCharException("Invalid char %r" % char)
538 |         rule = CHAR_RULE_DICT[char]
539 | 
540 |         if last_rule != rule:
541 |             self._pieces.append(StringIO())
542 |             self._rules.append(rule)
543 |         self._pieces[-1].write(char)
544 | 
545 |     def _normalize(self, letter, rule):
546 |         if rule in SIGN_RULE_SET:
547 |             return specify_rule(rule, len(letter))
548 |         return letter
549 | 
550 |     def _create_parsed_piece(self):
551 |         return ParsedPiece(tuple(self._pieces), tuple(self._rules))
552 | 
553 | 
554 | def fuzzy_digest(url_meta, objs):
555 |     """Generate hex digest string from URLMeta and objects' fuzzy_rules.
556 | 
557 |     Args:
558 |         url_meta (URLMeta): The URLMeta object.
559 |         objs (sequence): Each object hava fuzzy_rule property.
560 | 
561 |     Returns:
562 |         str: Digest value as a string of hexadecimal digits.
563 |     """
564 |     return digest(url_meta, [obj.fuzzy_rule for obj in objs])
565 | 
566 | 
567 | def digest(url_meta, objs):
568 |     """Generate hex digest string from URLMeta and objects.
569 | 
570 |     Args:
571 |         url_meta (URLMeta): The URLMeta object.
572 |         objs (sequence): The sequence of objects.
573 | 
574 |     Returns:
575 |         str: Digest value as a string of hexadecimal digits.
576 |     """
577 |     return hashlib.md5(pack(url_meta, objs).encode(DEFAULT_ENCODING)).hexdigest()
578 | 
579 | 
580 | def parse_url_pattern_string(url_pattern_string):
581 |     """Parse a URL pattern string into 3 components.
582 | 
583 |     <path>[\\?]<query>#<fragment>
584 | 
585 |     Args:
586 |         url_pattern_string (str): The url pattern string to be parsed.
587 | 
588 |     Returns:
589 |         URLPatternParseResult: A 3-tuple, (path, query, fragment).
590 |     """
591 |     idx_p = 0
592 |     idx_q = url_pattern_string.find(BasePatternRule.SINGLE_QUESTION)
593 |     idx_f = url_pattern_string.find(Symbols.NUMBER)
594 |     path = query = fragment = None
595 |     if idx_q < 0 and idx_f < 0:
596 |         path = url_pattern_string[idx_p:]
597 |     elif idx_q > 0 and idx_f > 0:
598 |         if idx_f > idx_q:
599 |             path = url_pattern_string[idx_p:idx_q]
600 |             query = url_pattern_string[idx_q + 4:idx_f]
601 |         else:
602 |             path = url_pattern_string[idx_p:idx_f]
603 |         fragment = url_pattern_string[idx_f + 1:]
604 |     elif idx_q < 0 and idx_f > 0:
605 |         path = url_pattern_string[idx_p:idx_f]
606 |         fragment = url_pattern_string[idx_f + 1:]
607 |     elif idx_q > 0 and idx_f < 0:
608 |         path = url_pattern_string[idx_p:idx_q]
609 |         query = url_pattern_string[idx_q + 4:]
610 | 
611 |     return URLPatternParseResult(path, query, fragment)
612 | 
613 | 
614 | def analyze_url_pattern_string(url_pattern_string):
615 |     """Parse a URL pattern string into URLMeta object and pattern string pieces.
616 | 
617 |     Args:
618 |         url_pattern_string (str): The URL pattern string to be parsed.
619 | 
620 |     Returns:
621 |         tuple: A 2-tuple, (url_meta, pattern_strings).
622 |     """
623 |     result = parse_url_pattern_string(url_pattern_string)
624 |     return unpack(result, False)
625 | 
626 | 
627 | def parse_pattern_string(pattern_string):
628 |     """Parse a pattern string into pattern unit strings.
629 | 
630 |     Args:
631 |         pattern_string (str): The pattern string to be parsed.
632 | 
633 |     Returns:
634 |         tuple: Pattern unit strings.
635 |     """
636 |     if pattern_string == Symbols.EMPTY:
637 |         return BLANK_TUPLE
638 |     pattern_unit_strings = []
639 |     l = len(pattern_string)
640 |     s = StringIO()
641 |     idx = 0
642 |     last_rule = None
643 |     while idx < l:
644 |         c = pattern_string[idx]
645 |         if c == Symbols.BRACKETS_L:
646 |             if last_rule is not None:
647 |                 s.seek(0)
648 |                 pattern_unit_strings.append(s.read())
649 |                 s = StringIO()
650 |                 last_rule = None
651 | 
652 |             idx_s = idx
653 |             while True:
654 |                 idx = pattern_string.find(Symbols.BRACKETS_R, idx + 1)
655 |                 if idx < 0:
656 |                     raise InvalidPatternException(
657 |                         "Missing '%s'" % Symbols.BRACKETS_R)
658 |                 elif pattern_string[idx - 1] == Symbols.BACKSLASH:
659 |                     continue
660 |                 break
661 |             if idx + 1 < l:
662 |                 if pattern_string[idx + 1] == Symbols.BRACES_L:
663 |                     old_idx = idx + 2
664 |                     idx = pattern_string.find(Symbols.BRACES_R, idx + 1)
665 |                     if idx < 0:
666 |                         raise InvalidPatternException(
667 |                             "Missing '%s'" % Symbols.BRACES_R)
668 |                     num_str = pattern_string[old_idx:idx]
669 |                     if not num_str.isdigit():
670 |                         raise InvalidPatternException(
671 |                             "Invalid num %r" % num_str)
672 | 
673 |                 elif pattern_string[idx + 1] == Symbols.PLUS:
674 |                     idx += 1
675 |             idx += 1
676 |             pattern_unit_strings.append(pattern_string[idx_s:idx])
677 |         else:
678 |             if c not in CHAR_RULE_DICT:
679 |                 raise InvalidPatternException("Invaid char %r" % c)
680 |             rule = CHAR_RULE_DICT[c]
681 |             if rule not in DIGIT_AND_ASCII_RULE_SET:
682 |                 raise InvalidPatternException(
683 |                     'Invalid pattern')
684 |             if last_rule is None:
685 |                 s.write(c)
686 |             else:
687 |                 if rule == last_rule:
688 |                     s.write(c)
689 |                 else:
690 |                     s.seek(0)
691 |                     pattern_unit_strings.append(s.read())
692 |                     s = StringIO()
693 |                     s.write(c)
694 |             last_rule = rule
695 |             idx += 1
696 |     if last_rule is not None:
697 |         s.seek(0)
698 |         pattern_unit_strings.append(s.read())
699 | 
700 |     return tuple(pattern_unit_strings)
701 | 
702 | 
703 | def parse_pattern_unit_string(pattern_unit_string):
704 |     """Parse pattern unit string into rules and literal num.
705 | 
706 |     Args:
707 |         pattern_unit_string (str): The pattern unit string to be parsed.
708 | 
709 |     Returns:
710 |         tuple: A 2-tuple, (rules, num).
711 |     """
712 |     rules = set()
713 |     num = 1
714 |     if pattern_unit_string == Symbols.EMPTY:
715 |         rules.add(Symbols.EMPTY)
716 |     elif pattern_unit_string[0] != Symbols.BRACKETS_L:
717 |         rules.add(CHAR_RULE_DICT[pattern_unit_string[0]])
718 |         num = len(pattern_unit_string)
719 |     else:
720 |         if pattern_unit_string[-1] == Symbols.BRACKETS_R:
721 |             num = 1
722 |         elif pattern_unit_string[-1] == Symbols.BRACES_R:
723 |             t = pattern_unit_string.rfind(Symbols.BRACES_L)
724 |             num_str = pattern_unit_string[t + 1:-1]
725 |             if not num_str.isdigit():
726 |                 raise InvalidPatternException("Invalid num %r" % num_str)
727 |             num = int(num_str)
728 |         elif pattern_unit_string[-1] == Symbols.PLUS:
729 |             num = -1
730 |         t = pattern_unit_string.rfind(Symbols.BRACKETS_R)
731 |         p_str = pattern_unit_string[1:t]
732 |         l = len(p_str)
733 |         idx = 0
734 |         while idx < l:
735 |             c = p_str[idx]
736 |             n = 3
737 |             if c in ASCII_DIGIT_SET:
738 |                 pass
739 |             elif c == Symbols.BACKSLASH:
740 |                 n = 2
741 |             else:
742 |                 n = 1
743 |             rule = p_str[idx:idx + n]
744 |             if rule not in RULE_SET:
745 |                 raise InvalidPatternException("Invalid rule %r" % rule)
746 |             rules.add(rule)
747 |             idx += n
748 |     if (num > 0 and len(rules) > num) or num == 0:
749 |         raise InvalidPatternException('Insufficient number')
750 |     return rules, num
751 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/parsed_piece_view.py:
--------------------------------------------------------------------------------
  1 | """ParsedPieceView and subclass implementation.
  2 | """
  3 | from __future__ import unicode_literals
  4 | 
  5 | from .definition import DIGIT_AND_ASCII_RULE_SET, BasePatternRule
  6 | from .parse_utils import ParsedPiece, fuzzy_join, mix
  7 | from .utils import pick
  8 | 
  9 | 
 10 | class ParsedPieceView(object):
 11 |     """The base class of parsed piece view.
 12 | 
 13 |     View object is a wrapper of parsed piece, which have individual
 14 |     view, parsed_piece and parsed_pieces propertys are based on the
 15 |     raw parsed piece.
 16 | 
 17 |     """
 18 |     __slots__ = ('parsed_piece', '_parsed_pieces', '_view')
 19 | 
 20 |     def __init__(self, parsed_piece):
 21 |         self.parsed_piece = parsed_piece
 22 |         self._parsed_pieces = None
 23 |         self._view = None
 24 | 
 25 |     def __eq__(self, o):
 26 |         if not isinstance(o, ParsedPieceView):
 27 |             return False
 28 |         return self.view == o.view
 29 | 
 30 |     def __hash__(self):
 31 |         return hash(self.view)
 32 | 
 33 |     @property
 34 |     def view(self):
 35 |         if self._view is None:
 36 |             self._view = fuzzy_join(self.parsed_pieces)
 37 |         return self._view
 38 | 
 39 |     @property
 40 |     def parsed_pieces(self):
 41 |         if self._parsed_pieces:
 42 |             return self._parsed_pieces
 43 | 
 44 |         self._parsed_pieces = [ParsedPiece((piece,), (rule,)) for piece, rule in zip(
 45 |             self.parsed_piece.pieces, self.parsed_piece.rules)]
 46 |         return self._parsed_pieces
 47 | 
 48 | 
 49 | class PieceView(ParsedPieceView):
 50 | 
 51 |     def __init__(self, parsed_piece):
 52 |         super(PieceView, self).__init__(parsed_piece)
 53 |         self._view = self.parsed_piece.piece
 54 | 
 55 | 
 56 | class LengthView(ParsedPieceView):
 57 | 
 58 |     def __init__(self, parsed_piece):
 59 |         super(LengthView, self).__init__(parsed_piece)
 60 |         self._view = self.parsed_piece.piece_length
 61 | 
 62 | 
 63 | class MultiView(ParsedPieceView):
 64 |     pass
 65 | 
 66 | 
 67 | class MixedView(ParsedPieceView):
 68 | 
 69 |     @property
 70 |     def parsed_pieces(self):
 71 |         if self._parsed_pieces:
 72 |             return self._parsed_pieces
 73 | 
 74 |         if len(self.parsed_piece.rules) <= 1:
 75 |             self._parsed_pieces = [self.parsed_piece]
 76 |         else:
 77 |             mixed_pieces, mixed_rules = mix(
 78 |                 self.parsed_piece.pieces, self.parsed_piece.rules)
 79 | 
 80 |             self._parsed_pieces = [ParsedPiece(
 81 |                 (piece,), (rule,)) for piece, rule in zip(mixed_pieces, mixed_rules)]
 82 |         return self._parsed_pieces
 83 | 
 84 | 
 85 | class LastDotSplitFuzzyView(ParsedPieceView):
 86 | 
 87 |     @property
 88 |     def parsed_pieces(self):
 89 |         if self._parsed_pieces:
 90 |             return self._parsed_pieces
 91 |         rules = self.parsed_piece.rules
 92 |         dot_idx = None
 93 |         part_num = len(rules)
 94 |         for idx, rule in enumerate(reversed(rules)):
 95 |             if idx > 2:
 96 |                 break
 97 |             if rule == BasePatternRule.DOT:
 98 |                 dot_idx = part_num - idx - 1
 99 |                 break
100 |         self._parsed_pieces = [ParsedPiece((self.parsed_piece.piece,),
101 |                                            (self.parsed_piece.fuzzy_rule,))]
102 |         if dot_idx is not None:
103 |             skip = False
104 |             for rule in self.parsed_piece.rules[dot_idx + 1:]:
105 |                 if rule not in DIGIT_AND_ASCII_RULE_SET:
106 |                     skip = True
107 |                     break
108 |             if not skip:
109 |                 pieces = []
110 |                 rules = []
111 |                 pieces.append(''.join(self.parsed_piece.pieces[0:dot_idx]))
112 |                 pieces.append(self.parsed_piece.pieces[dot_idx])
113 |                 rules.append(
114 |                     ''.join(sorted(set(self.parsed_piece.rules[0:dot_idx]))))
115 |                 rules.append(self.parsed_piece.rules[dot_idx])
116 |                 mixed_pieces, mixed_rules = mix(
117 |                     self.parsed_piece.pieces[dot_idx + 1:],
118 |                     self.parsed_piece.rules[dot_idx + 1:])
119 |                 pieces.extend(mixed_pieces)
120 |                 rules.extend(mixed_rules)
121 |                 self._parsed_pieces = [ParsedPiece(
122 |                     (piece,), (rule,)) for piece, rule in zip(pieces, rules)]
123 |         return self._parsed_pieces
124 | 
125 | 
126 | class FuzzyView(ParsedPieceView):
127 | 
128 |     def __init__(self, parsed_piece):
129 |         super(FuzzyView, self).__init__(parsed_piece)
130 |         self._view = self.parsed_piece.fuzzy_rule
131 | 
132 |     @property
133 |     def parsed_pieces(self):
134 |         if self._parsed_pieces:
135 |             return self._parsed_pieces
136 |         self._parsed_pieces = [ParsedPiece((self.parsed_piece.piece,),
137 |                                            (self.parsed_piece.fuzzy_rule,))]
138 |         return self._parsed_pieces
139 | 
140 | 
141 | def view_cls_from_pattern(pattern, is_last_path=False):
142 |     """Get ParsedPieceView class from pattern.
143 | 
144 |     ParsedPieceView type can be deduced from the pattern.
145 | 
146 |     Args:
147 |         pattern (Pattern): The Pattern object.
148 |         is_last_path (bool, optional): Defaults to False. Whether the pattern
149 |             is at the last path position.
150 | 
151 |     Returns:
152 |         class: The class of ParsedPieceView.
153 |     """
154 |     view_cls = PieceView
155 |     pattern_units = pattern.pattern_units
156 |     if len(pattern_units) == 1:
157 |         pattern_unit = pattern_units[0]
158 |         if not pattern_unit.is_literal():
159 |             if pattern_unit.num < 0:
160 |                 view_cls = FuzzyView
161 |             else:
162 |                 view_cls = LengthView
163 |     else:
164 |         for pattern_unit in pattern_units:
165 |             if not pattern_unit.is_literal():
166 |                 if len(pattern_unit.rules) > 1:
167 |                     view_cls = MixedView
168 |                     break
169 |                 else:
170 |                     view_cls = MultiView
171 |         if is_last_path \
172 |                 and len(pattern_units) == 3 \
173 |                 and view_cls != PieceView \
174 |                 and len(pattern_units[1].rules) == 1 \
175 |                 and pick(pattern_units[1].rules) == BasePatternRule.DOT \
176 |                 and not (set(pattern_units[-1].rules) - DIGIT_AND_ASCII_RULE_SET):
177 |             view_cls = LastDotSplitFuzzyView
178 | 
179 |     return view_cls
180 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/parser.py:
--------------------------------------------------------------------------------
 1 | """High-level APIs for parsing.
 2 | """
 3 | 
 4 | from __future__ import unicode_literals
 5 | 
 6 | from .parse_utils import fuzzy_digest as _fuzzy_digest
 7 | from .parse_utils import PieceParser, analyze_url, analyze_url_pattern_string
 8 | 
 9 | 
10 | def parse(url_or_pattern):
11 |     """Parse URL or URL pattern string.
12 | 
13 |     Args:
14 |         url_or_pattern (str): URL or URL pattern.
15 | 
16 |     Returns:
17 |         tuple: 2-tuples, (url_meta, parsed_pieces)
18 |     """
19 |     url_meta = None
20 |     parsed_pieces = None
21 |     if url_or_pattern.startswith('/'):  # URL pattern
22 |         from .pattern_matcher import MatchPattern
23 |         url_meta, pattern_strings = analyze_url_pattern_string(url_or_pattern)
24 |         parsed_pieces = tuple([MatchPattern(p, i == url_meta.path_depth)
25 |                                for i, p in enumerate(pattern_strings, 1)])
26 |     else:  # URL
27 |         parser = PieceParser()
28 |         url_meta, pieces = analyze_url(url_or_pattern)
29 |         parsed_pieces = tuple([parser.parse(piece) for piece in pieces])
30 | 
31 |     return url_meta, parsed_pieces
32 | 
33 | 
34 | def fuzzy_digest(*args):
35 |     """Generate hex fuzzy digest string from URL or URL pattern.
36 | 
37 |     Args:
38 |         *args: Can be a single argument string, or 2 arguments
39 |             URLMeta and objects.
40 | 
41 |     Returns:
42 |         str: Digest value as a string of hexadecimal digits.
43 |     """
44 |     l = len(args)
45 |     url_meta = None
46 |     objs = None
47 |     if l == 2:
48 |         url_meta, objs = args
49 |     elif l == 1:
50 |         url_meta, objs = parse(args[0])
51 |     else:
52 |         raise ValueError('Not digestable')
53 |     return _fuzzy_digest(url_meta, objs)
54 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/pattern.py:
--------------------------------------------------------------------------------
  1 | """Pattern class.
  2 | """
  3 | from __future__ import unicode_literals
  4 | 
  5 | import re
  6 | 
  7 | from .utils import pick
  8 | 
  9 | 
 10 | class PatternUnit(object):
 11 |     """Sub-piece of pattern."""
 12 | 
 13 |     __slots__ = ('pattern_unit_string', 'rules', 'num', '_fuzzy_rule')
 14 | 
 15 |     def __init__(self, pattern_unit_string):
 16 |         self.pattern_unit_string = pattern_unit_string
 17 |         from .parse_utils import parse_pattern_unit_string
 18 |         self.rules, self.num = parse_pattern_unit_string(pattern_unit_string)
 19 |         self._fuzzy_rule = None
 20 | 
 21 |     def is_literal(self):
 22 |         """Whether this unit string is literal or not.
 23 | 
 24 |         Note:
 25 |             According to the char representation, fixed-length
 26 |             single sign is literal, like:  [\\.]{2}  [\\-]
 27 | 
 28 |         Returns:
 29 |             bool: Whether it is literal.
 30 |         """
 31 | 
 32 |         from .definition import DIGIT_AND_ASCII_RULE_SET, Symbols
 33 |         r = False
 34 |         if not self.pattern_unit_string.startswith(Symbols.BRACKETS_L):
 35 |             r = True
 36 |         elif len(self.rules) == 1:
 37 |             if self.num > 0:
 38 |                 rule = pick(self.rules)
 39 |                 if rule not in DIGIT_AND_ASCII_RULE_SET:
 40 |                     r = True
 41 |         return r
 42 | 
 43 |     @property
 44 |     def fuzzy_rule(self):
 45 |         if self._fuzzy_rule is None:
 46 |             self._fuzzy_rule = ''.join(sorted(self.rules))
 47 |         return self._fuzzy_rule
 48 | 
 49 |     def __str__(self):
 50 |         return ' '.join((self.pattern_unit_string, self.fuzzy_rule, str(self.num)))
 51 | 
 52 |     __repr__ = __str__
 53 | 
 54 | 
 55 | class Pattern(object):
 56 |     """Pattern for handle pattern string. """
 57 | 
 58 |     __slots__ = ('pattern_string', '_pattern_regex',
 59 |                  '_pattern_units', '_fuzzy_rule')
 60 | 
 61 |     def __init__(self, pattern_string):
 62 |         self.pattern_string = pattern_string
 63 |         self._pattern_regex = None
 64 |         self._pattern_units = None
 65 |         self._fuzzy_rule = None
 66 | 
 67 |     @property
 68 |     def pattern_units(self):
 69 |         """tuple: Pattern units."""
 70 | 
 71 |         from .parse_utils import parse_pattern_string
 72 |         if self._pattern_units is None:
 73 |             self._pattern_units = tuple([PatternUnit(
 74 |                 u) for u in parse_pattern_string(self.pattern_string)])
 75 |         return self._pattern_units
 76 | 
 77 |     def __str__(self):
 78 |         return self.pattern_string
 79 | 
 80 |     __repr__ = __str__
 81 | 
 82 |     def __hash__(self):
 83 |         return hash(self.pattern_string)
 84 | 
 85 |     def __eq__(self, o):
 86 |         return self.pattern_string == o.pattern_string
 87 | 
 88 |     def match(self, piece):
 89 |         if not self._pattern_regex:
 90 |             self._pattern_regex = re.compile(
 91 |                 ''.join(('^', self.pattern_string, '$')))
 92 |         return True if re.match(self._pattern_regex, piece) else False
 93 | 
 94 |     @property
 95 |     def fuzzy_rule(self):
 96 |         """str: All rules of the pattern join into a string."""
 97 |         if self._fuzzy_rule is None:
 98 |             self._fuzzy_rule = ''.join(sorted(set.union(
 99 |                 *[u.rules for u in self.pattern_units])))
100 |         return self._fuzzy_rule
101 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/pattern_cluster.py:
--------------------------------------------------------------------------------
  1 | """Cluster algorithm.
  2 | """
  3 | from __future__ import unicode_literals
  4 | 
  5 | from collections import Counter, OrderedDict, namedtuple
  6 | 
  7 | from .compat import itervalues
  8 | from .parse_utils import (EMPTY_PARSED_PIECE, URLMeta, specify_rule,
  9 |                           wildcard_rule)
 10 | from .parsed_piece_view import LastDotSplitFuzzyView, MixedView, MultiView
 11 | from .pattern import Pattern
 12 | from .piece_pattern_node import (PiecePatternNode, build_from_parsed_pieces,
 13 |                                  build_from_piece_pattern_nodes)
 14 | from .utils import Bag, cached_property, dump_tree, pick
 15 | 
 16 | 
 17 | class TBag(Bag):
 18 |     __slots__ = ('stats',)
 19 | 
 20 |     def __init__(self):
 21 |         super(TBag, self).__init__()
 22 |         self.stats = Counter()
 23 | 
 24 |     @property
 25 |     def count(self):
 26 |         return self.stats['count']
 27 | 
 28 |     def add(self, obj):
 29 |         super(TBag, self).add(obj)
 30 |         self.stats['count'] += obj.count
 31 | 
 32 |     def set_pattern(self, pattern):
 33 |         for obj in self:
 34 |             obj.set_pattern(pattern)
 35 | 
 36 | 
 37 | class TBucket(TBag):
 38 | 
 39 |     def __init__(self):
 40 |         super(TBucket, self).__init__()
 41 |         self._objs = {}
 42 | 
 43 |     def __getitem__(self, key):
 44 |         return self._objs[key]
 45 | 
 46 |     def __contains__(self, key):
 47 |         return key in self._objs
 48 | 
 49 |     def __iter__(self):
 50 |         return iter(itervalues(self._objs))
 51 | 
 52 |     def add(self, obj):
 53 |         raise NotImplementedError
 54 | 
 55 | 
 56 | class PieceBag(TBag):
 57 |     """A bag contain all of the nodes with same piece.
 58 | 
 59 |     The nodes should on the same branch of a tree at the same level.
 60 |     """
 61 | 
 62 |     __slots__ = ('_p_nodes',)
 63 | 
 64 |     def __init__(self):
 65 |         super(PieceBag, self).__init__()
 66 |         self._p_nodes = set()
 67 | 
 68 |     def add(self, piece_pattern_node):
 69 |         super(PieceBag, self).add(piece_pattern_node)
 70 |         self._p_nodes.add(piece_pattern_node.parrent)
 71 |         self.stats['p_nodes_count'] += piece_pattern_node.parrent.count \
 72 |             if piece_pattern_node.parrent is not None \
 73 |             else piece_pattern_node.count
 74 | 
 75 |     @property
 76 |     def p_nodes(self):
 77 |         return self._p_nodes
 78 | 
 79 | 
 80 | class PieceBagBucket(TBucket):
 81 |     __slots__ = ('_p_nodes',)
 82 | 
 83 |     def __init__(self):
 84 |         super(PieceBagBucket, self).__init__()
 85 |         self._p_nodes = set()
 86 | 
 87 |     def add(self, obj):
 88 |         if isinstance(obj, PiecePatternNode):
 89 |             piece = obj.piece
 90 |             if piece not in self._objs:
 91 |                 self._objs[piece] = PieceBag()
 92 |             self._objs[piece].add(obj)
 93 |         elif isinstance(obj, PieceBag):
 94 |             piece = obj.pick().piece
 95 |             if piece in self._objs:
 96 |                 raise ValueError('duplicated')
 97 |             self._objs[piece] = obj
 98 |         else:
 99 |             raise ValueError('not PiecePatternNode nor PieceBag')
100 | 
101 |         self.stats['count'] += obj.count
102 | 
103 |     @property
104 |     def p_nodes(self):
105 |         if not self._p_nodes:
106 |             for piece_bag in self:
107 |                 self._p_nodes.update(piece_bag.p_nodes)
108 |         return self._p_nodes
109 | 
110 | 
111 | class ViewPieceBag(namedtuple('ViewPieceBag', ['view', 'piece_bag'])):
112 |     __slots__ = ()
113 | 
114 |     def set_pattern(self, pattern):
115 |         return self.piece_bag.set_pattern(pattern)
116 | 
117 | 
118 | class ViewPieceBagBucket(PieceBagBucket):
119 |     __slots__ = ('_url_meta', '_root')
120 | 
121 |     def __init__(self, url_meta):
122 |         super(ViewPieceBagBucket, self).__init__()
123 |         self._url_meta = url_meta
124 |         self._root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
125 | 
126 |     def add(self, view_piece_bag, build_tree=True):
127 |         piece_bag = view_piece_bag.piece_bag
128 |         self._objs[piece_bag.pick().piece] = view_piece_bag
129 |         self.stats['count'] += piece_bag.count
130 | 
131 |         if not build_tree:
132 |             return
133 |         view = view_piece_bag.view
134 | 
135 |         build_from_parsed_pieces(
136 |             self._root, view.parsed_pieces, count=piece_bag.count, uniq=False)
137 | 
138 |     def cluster(self, config, **kwargs):
139 |         for clustered in cluster(config, self._url_meta, self._root, **kwargs):
140 |             yield self._transfer(clustered)
141 | 
142 |     def _transfer(self, root):
143 |         pattern = None
144 |         bucket = ViewPieceBagBucket(self._url_meta)
145 |         for nodes in dump_tree(root):
146 |             piece = ''.join([p.piece for p in nodes[1:]])
147 |             view_piece_bag = self[piece]
148 |             bucket.add(view_piece_bag, False)
149 |             if pattern is None:
150 |                 pattern = Pattern(
151 |                     ''.join([str(p.pattern) for p in nodes[1:]]))
152 |         return bucket, pattern
153 | 
154 | 
155 | def confused(total, max_part, threshold):
156 |     """Determine whether it is too complex to become a cluster.
157 | 
158 |     If a data set have several(<threshold) sub parts, this method use the total
159 |     count of the data set, the count of the max sub set and min cluster threshold
160 |     to determine whether it is too complex to become a cluster.
161 | 
162 |     Args:
163 |         total (int): Total count of a data set.
164 |         max_part (int): The max part of the data set.
165 |         threshold (int): The min threashold of a cluster.
166 | 
167 |     Returns:
168 |         bool: Too complex return True, otherwise False.
169 |     """
170 |     if total < threshold:
171 |         return False
172 |     o_part = total - max_part
173 |     if max_part >= threshold and o_part >= threshold:
174 |         return True
175 |     return abs(max_part - o_part) < threshold - 1
176 | 
177 | 
178 | class SeekResult(object):
179 |     FOUND = 1
180 |     IMPOSSIBLE = 2
181 |     UNKNOW = 3
182 |     BACKWARD = 4
183 | 
184 | 
185 | class PatternCluster(object):
186 |     """Base class of cluster."""
187 | 
188 |     def __init__(self, processor):
189 |         self._processor = processor
190 |         self._min_cluster_num = processor.config.getint(
191 |             'make', 'min_cluster_num')
192 |         self._patterns = set()
193 | 
194 |     @property
195 |     def pre_level_processor(self):
196 |         return self._processor.pre_level_processor
197 | 
198 |     def cluster(self):
199 |         pass
200 | 
201 |     def add(self, obj):
202 |         pass
203 | 
204 |     @property
205 |     def pattern_num(self):
206 |         return len(self._patterns)
207 | 
208 |     def seek_cluster(self, package):
209 |         return SeekResult.UNKNOW
210 | 
211 | 
212 | class PiecePatternCluster(PatternCluster):
213 | 
214 |     def __init__(self, processor):
215 |         super(PiecePatternCluster, self).__init__(processor)
216 |         self._bucket = PieceBagBucket()
217 | 
218 |     def seek_cluster(self, package):
219 |         p_nodes_count = sum([p.count for p in package.p_nodes])
220 |         if p_nodes_count - package.count >= self._min_cluster_num:
221 |             return SeekResult.IMPOSSIBLE
222 | 
223 |         return SeekResult.UNKNOW
224 | 
225 |     def iter_nodes(self):
226 |         return self._bucket.iter_all()
227 | 
228 |     def add(self, piece_pattern_node):
229 |         self._bucket.add(piece_pattern_node)
230 | 
231 |     def _set_pattern(self, piece_bag, update_patterns=False):
232 |         pattern = Pattern(piece_bag.pick().piece)
233 |         piece_bag.set_pattern(pattern)
234 |         if update_patterns:
235 |             self._patterns.add(pattern)
236 | 
237 |     def cluster(self):
238 |         if not self._bucket:
239 |             return
240 |         procesor = self._processor
241 |         if procesor.is_last_level() \
242 |                 and 'last_path_as_pattern' in procesor.kwargs \
243 |                 and procesor.kwargs['last_path_as_pattern']:
244 |             for piece_bag in self._bucket:
245 |                 self._set_pattern(piece_bag, True)
246 |             return
247 | 
248 |         mcn = self._min_cluster_num
249 |         if len(self._bucket) < mcn:
250 |             max_count = max(self._bucket, key=lambda x: x.count).count
251 |             if not confused(self._bucket.count, max_count, mcn):
252 |                 for piece_bag in self._bucket:
253 |                     self._set_pattern(piece_bag, True)
254 |                 return
255 | 
256 |         for piece_bag in self._bucket:
257 |             stats = piece_bag.stats
258 |             count = piece_bag.count
259 |             if count < mcn \
260 |                     or stats['p_nodes_count'] - count >= mcn \
261 |                     or not self.pre_level_processor.seek_cluster(piece_bag):
262 |                 self._set_pattern(piece_bag)
263 |                 self._add_to_forward_cluster(piece_bag)
264 |             else:
265 |                 self._set_pattern(piece_bag, True)
266 | 
267 |     def _add_to_forward_cluster(self, piece_bag):
268 |         parsed_piece = piece_bag.pick().parsed_piece
269 |         if len(parsed_piece.pieces) == 1:
270 |             self._processor.get_cluster(LengthPatternCluster).add(piece_bag)
271 |             return
272 | 
273 |         view = MultiView(parsed_piece)
274 |         p_cls = BasePatternCluster
275 |         vl = len(view.parsed_pieces)
276 | 
277 |         if vl == 3 and self._processor.is_last_path():
278 |             ldsf_view = LastDotSplitFuzzyView(parsed_piece)
279 |             if view == ldsf_view:
280 |                 view = ldsf_view
281 |                 p_cls = LastDotSplitFuzzyPatternCluster
282 |         elif vl > 3:
283 |             mixed_view = MixedView(parsed_piece)
284 |             mvl = len(mixed_view.parsed_pieces)
285 |             if mvl == 1:
286 |                 self._processor.get_cluster(
287 |                     LengthPatternCluster).add(piece_bag)
288 |                 return
289 |             elif vl - mvl >= self._min_cluster_num:
290 |                 if mvl == 3 and self._processor.is_last_path():
291 |                     ldsf_view = LastDotSplitFuzzyView(parsed_piece)
292 |                     if mixed_view == ldsf_view:
293 |                         view = ldsf_view
294 |                         p_cls = LastDotSplitFuzzyPatternCluster
295 |                     else:
296 |                         view = mixed_view
297 |                         p_cls = MixedPatternCluster
298 |                 else:
299 |                     view = mixed_view
300 |                     p_cls = MixedPatternCluster
301 | 
302 |         self._processor.get_cluster(p_cls).add(
303 |             ViewPieceBag(view, piece_bag))
304 | 
305 | 
306 | class LengthPatternCluster(PatternCluster):
307 |     def __init__(self, processor):
308 |         super(LengthPatternCluster, self).__init__(processor)
309 |         self._length_buckets = {}
310 | 
311 |     def add(self, piece_bag):
312 |         piece_length = piece_bag.pick().parsed_piece.piece_length
313 |         if piece_length not in self._length_buckets:
314 |             self._length_buckets[piece_length] = PieceBagBucket()
315 |         self._length_buckets[piece_length].add(piece_bag)
316 | 
317 |     def _length_as_cluster(self, length_bucket):
318 |         if len(length_bucket) < self._min_cluster_num:
319 |             if length_bucket.count < self._min_cluster_num:
320 |                 return False
321 |             max_count = max(length_bucket, key=lambda x: x.count).count
322 |             if not confused(length_bucket.count, max_count, self._min_cluster_num):
323 |                 return False
324 | 
325 |         return True
326 | 
327 |     def _update_patterns(self, bucket):
328 |         for piece_bag in bucket:
329 |             self._patterns.add(piece_bag.pick().pattern)
330 | 
331 |     def cluster(self):
332 |         if not self._length_buckets:
333 |             return
334 |         mcn = self._min_cluster_num
335 |         if len(self._length_buckets) < mcn:
336 |             total = sum([c.count for c in itervalues(self._length_buckets)])
337 |             max_bucket = max(itervalues(self._length_buckets),
338 |                              key=lambda x: x.count)
339 |             if not confused(total, max_bucket.count, mcn):
340 |                 for bucket in itervalues(self._length_buckets):
341 |                     if self._length_as_cluster(bucket):
342 |                         self._set_pattern(bucket, True)
343 |                     else:
344 |                         self._update_patterns(bucket)
345 |                 return
346 | 
347 |         forward_cluster = self._processor.get_cluster(FuzzyPatternCluster)
348 |         for length_bucket in itervalues(self._length_buckets):
349 |             if self._length_as_cluster(length_bucket):
350 |                 if self.pre_level_processor.seek_cluster(length_bucket):
351 |                     self._set_pattern(length_bucket, True)
352 |                     continue
353 |                 self._set_pattern(length_bucket)
354 | 
355 |             forward_cluster.add(length_bucket)
356 | 
357 |     def _set_pattern(self, length_bucket, update_patterns=False):
358 |         parsed_piece = length_bucket.pick().parsed_piece
359 |         length = parsed_piece.piece_length
360 |         pattern = Pattern(specify_rule(parsed_piece.fuzzy_rule, length))
361 |         length_bucket.set_pattern(pattern)
362 |         if update_patterns:
363 |             self._patterns.add(pattern)
364 | 
365 | 
366 | class MultiPatternCluster(PatternCluster):
367 |     def __init__(self, processor):
368 |         super(MultiPatternCluster, self).__init__(processor)
369 |         self._buckets = {}
370 | 
371 |     def cluster(self):
372 |         for bucket in itervalues(self._buckets):
373 |             if bucket.count < self._min_cluster_num:
374 |                 self._to_forward_cluster(bucket)
375 |                 continue
376 |             for b, pattern in self._cluster(bucket):
377 |                 if self._as_cluster(b, pattern):
378 |                     self._set_pattern(b, pattern)
379 |                 else:
380 |                     self._to_forward_cluster(b)
381 | 
382 |     def _cluster(self, bucket):
383 |         for b, pattern in bucket.cluster(self._processor.config):
384 |             yield b, pattern
385 | 
386 |     def _to_forward_cluster(self, bucket):
387 |         for view_piece_bag in bucket:
388 |             self._add_to_forward_cluster(view_piece_bag)
389 | 
390 |     def _add_to_forward_cluster(self, view_piece_bag):
391 |         pass
392 | 
393 |     def _as_cluster(self, bucket, pattern):
394 |         if bucket.count < self._min_cluster_num:
395 |             return False
396 |         return True
397 | 
398 |     def _set_pattern(self, bucket, pattern):
399 |         bucket.set_pattern(pattern)
400 |         self._patterns.add(pattern)
401 | 
402 |     def add(self, view_piece_bag):
403 |         view = view_piece_bag.view
404 |         if view not in self._buckets:
405 |             url_meta = URLMeta(len(view.parsed_pieces), [], False)
406 |             self._buckets[view] = ViewPieceBagBucket(url_meta)
407 |         self._buckets[view].add(view_piece_bag)
408 | 
409 | 
410 | class BasePatternCluster(MultiPatternCluster):
411 | 
412 |     def _add_to_forward_cluster(self, view_piece_bag):
413 |         view = view_piece_bag.view
414 |         piece_bag = view_piece_bag.piece_bag
415 |         parsed_piece = piece_bag.pick().parsed_piece
416 | 
417 |         mixed_view = MixedView(parsed_piece)
418 |         mvl = len(mixed_view.parsed_pieces)
419 | 
420 |         p_cls = MixedPatternCluster
421 | 
422 |         if view == mixed_view:
423 |             if self._processor.is_last_path():
424 |                 ldsf_view = LastDotSplitFuzzyView(parsed_piece)
425 |                 if len(ldsf_view.parsed_pieces) == 1:
426 |                     self._processor.get_cluster(
427 |                         LengthPatternCluster).add(piece_bag)
428 |                     return
429 |                 else:
430 |                     view = ldsf_view
431 |                     p_cls = LastDotSplitFuzzyPatternCluster
432 |             else:
433 |                 self._processor.get_cluster(
434 |                     LengthPatternCluster).add(piece_bag)
435 |                 return
436 |         else:
437 |             view = mixed_view
438 |             if mvl == 1:
439 |                 self._processor.get_cluster(
440 |                     LengthPatternCluster).add(piece_bag)
441 |                 return
442 |             elif mvl == 3 and self._processor.is_last_path():
443 |                 ldsf_view = LastDotSplitFuzzyView(parsed_piece)
444 |                 if mixed_view == ldsf_view:
445 |                     view = ldsf_view
446 |                     p_cls = LastDotSplitFuzzyPatternCluster
447 | 
448 |         self._processor.get_cluster(p_cls).add(
449 |             ViewPieceBag(view, piece_bag))
450 | 
451 | 
452 | class MixedPatternCluster(MultiPatternCluster):
453 | 
454 |     def _add_to_forward_cluster(self, view_piece_bag):
455 |         view = view_piece_bag.view
456 |         piece_bag = view_piece_bag.piece_bag
457 |         parsed_piece = piece_bag.pick().parsed_piece
458 | 
459 |         if self._processor.is_last_path():
460 |             ldsf_view = LastDotSplitFuzzyView(parsed_piece)
461 |             if len(ldsf_view.parsed_pieces) == 1:
462 |                 self._processor.get_cluster(
463 |                     LengthPatternCluster).add(piece_bag)
464 |                 return
465 |             else:
466 |                 view = ldsf_view
467 |                 p_cls = LastDotSplitFuzzyPatternCluster
468 |         else:
469 |             self._processor.get_cluster(
470 |                 LengthPatternCluster).add(piece_bag)
471 |             return
472 | 
473 |         self._processor.get_cluster(p_cls).add(
474 |             ViewPieceBag(view, piece_bag))
475 | 
476 | 
477 | class LastDotSplitFuzzyPatternCluster(MultiPatternCluster):
478 | 
479 |     def _cluster(self, bucket):
480 |         for b, pattern in bucket.cluster(self._processor.config,
481 |                                          last_path_as_pattern=True):
482 |             yield b, pattern
483 | 
484 |     def _add_to_forward_cluster(self, view_piece_bag):
485 |         self._processor.get_cluster(LengthPatternCluster).add(
486 |             view_piece_bag.piece_bag)
487 | 
488 | 
489 | class FuzzyPatternCluster(PatternCluster):
490 |     def __init__(self, processor):
491 |         super(FuzzyPatternCluster, self).__init__(processor)
492 |         self._cached = TBag()
493 |         self._force_pattern = False
494 |         self._fuzzy_pattern = None
495 | 
496 |     def add(self, bucket):
497 |         if self._force_pattern:
498 |             self._set_pattern(bucket)
499 |         else:
500 |             self._cached.add(bucket)
501 |             if len(self._cached) >= self._min_cluster_num:
502 |                 self._force_pattern = True
503 | 
504 |     def _update_patterns(self):
505 |         for bucket in self._cached:
506 |             for piece_bag in bucket:
507 |                 self._patterns.add(piece_bag.pick().pattern)
508 | 
509 |     def cluster(self):
510 |         if self._force_pattern:
511 |             self._set_pattern(self._cached)
512 |         else:
513 |             if self._cached.count < self._min_cluster_num:
514 |                 self._update_patterns()
515 |                 return
516 |             max_count = max(self._cached, key=lambda x: x.count).count
517 |             if confused(self._cached.count, max_count, self._min_cluster_num):
518 |                 self._set_pattern(self._cached)
519 |             else:
520 |                 self._update_patterns()
521 | 
522 |     def _set_pattern(self, package):
523 |         if self._fuzzy_pattern is None:
524 |             self._fuzzy_pattern = Pattern(
525 |                 wildcard_rule(package.pick().parsed_piece.fuzzy_rule))
526 |             self._patterns.add(self._fuzzy_pattern)
527 |         package.set_pattern(self._fuzzy_pattern)
528 | 
529 | 
530 | CLUSTER_CLASSES = [PiecePatternCluster,
531 |                    BasePatternCluster,
532 |                    MixedPatternCluster,
533 |                    LastDotSplitFuzzyPatternCluster,
534 |                    LengthPatternCluster,
535 |                    FuzzyPatternCluster]
536 | 
537 | 
538 | class ClusterProcessor(object):
539 |     def __init__(self, config, url_meta, pre_level_processor, **kwargs):
540 |         self._config = config
541 |         self._url_meta = url_meta
542 |         self._pattern_clusters = OrderedDict(
543 |             [(c.__name__, c(self)) for c in CLUSTER_CLASSES])
544 |         self._pre_level_processor = pre_level_processor
545 |         self._next_level_processors = {}
546 |         self._kwargs = kwargs
547 | 
548 |     @cached_property
549 |     def level(self):
550 |         l = 0
551 |         n = self.pre_level_processor
552 |         while n is not None:
553 |             l += 1
554 |             n = n.pre_level_processor
555 |         return l
556 | 
557 |     def is_last_level(self):
558 |         return self._url_meta.depth == self.level
559 | 
560 |     def is_last_path(self):
561 |         return self._url_meta.path_depth == self.level
562 | 
563 |     @property
564 |     def kwargs(self):
565 |         return self._kwargs
566 | 
567 |     @property
568 |     def next_level_processors(self):
569 |         return self._next_level_processors.values()
570 | 
571 |     def _backward_package(self, package):
572 |         bucket = PieceBagBucket()
573 |         for p_node in package.p_nodes:
574 |             if p_node.piece in bucket:
575 |                 continue
576 |             bucket.add(p_node)
577 |         return bucket
578 | 
579 |     def seek_cluster(self, package):
580 |         if self._pre_level_processor is None:
581 |             return False
582 |         for c in itervalues(self._pattern_clusters):
583 |             res = c.seek_cluster(package)
584 |             if res == SeekResult.FOUND:
585 |                 return True
586 |             elif res == SeekResult.IMPOSSIBLE:
587 |                 break
588 |             elif res == SeekResult.BACKWARD:
589 |                 pack = self._backward_package(package)
590 |                 return self._pre_level_processor.seek_cluster(pack)
591 |             elif res == SeekResult.UNKNOW:
592 |                 continue
593 |             else:
594 |                 raise ValueError('invalid seek result')
595 | 
596 |         return False
597 | 
598 |     def get_cluster(self, cluster_cls):
599 |         return self._pattern_clusters[cluster_cls.__name__]
600 | 
601 |     @property
602 |     def config(self):
603 |         return self._config
604 | 
605 |     @property
606 |     def pre_level_processor(self):
607 |         return self._pre_level_processor
608 | 
609 |     def _process(self):
610 |         for c in itervalues(self._pattern_clusters):
611 |             c.cluster()
612 | 
613 |     def add(self, node, add_children=False):
614 |         c = self.get_cluster(PiecePatternCluster)
615 |         if add_children:
616 |             for child in node.children:
617 |                 c.add(child)
618 |         else:
619 |             c.add(node)
620 | 
621 |     @property
622 |     def pattern_num(self):
623 |         return sum([c.pattern_num for c in itervalues(self._pattern_clusters)])
624 | 
625 |     def process(self):
626 |         self._process()
627 |         if self.is_last_level():
628 |             return
629 | 
630 |         self._create_next_level_processors()
631 | 
632 |         for processor in itervalues(self._next_level_processors):
633 |             processor.process()
634 | 
635 |     def _create_next_level_processors(self):
636 | 
637 |         pp_cluster = self.get_cluster(PiecePatternCluster)
638 |         processors = self._next_level_processors
639 | 
640 |         for node in pp_cluster.iter_nodes():
641 |             pattern = node.pattern
642 |             if pattern not in processors:
643 |                 processors[pattern] = ClusterProcessor(
644 |                     self._config,
645 |                     self._url_meta,
646 |                     self, **self.kwargs)
647 |             processor = processors[pattern]
648 |             processor.add(node, add_children=True)
649 | 
650 | 
651 | def split_by_pattern(root):
652 |     """Split the piece pattern tree by pattern path.
653 | 
654 |     Args:
655 |         root (PiecePatternNode): The root of piece pattern tree.
656 | 
657 |     Returns:
658 |         iterator: Iterator of sub-trees.
659 |     """
660 |     tree_roots = {}
661 |     for nodes in dump_tree(root):
662 |         pid = hash("/".join([str(p.pattern) for p in nodes]))
663 |         if pid not in tree_roots:
664 |             tree_roots[pid] = PiecePatternNode((EMPTY_PARSED_PIECE, None))
665 |         sub_root = tree_roots[pid]
666 |         build_from_piece_pattern_nodes(sub_root, nodes[1:])
667 | 
668 |     return itervalues(tree_roots)
669 | 
670 | 
671 | def _can_be_splited(processor):
672 |     """Check whether the processor tree can be splited.
673 | 
674 |     Args:
675 |         processor (ClusterProcessor): The root node of cluster processor.
676 | 
677 |     Returns:
678 |         bool: Whether the processor tree can be splited.
679 |     """
680 |     while True:
681 |         pattern_num = processor.pattern_num
682 |         if pattern_num > 1:
683 |             return True
684 |         l = len(processor.next_level_processors)
685 |         if l <= 0:
686 |             break
687 |         elif l > 1:
688 |             return True
689 |         processor = pick(processor.next_level_processors)
690 | 
691 |     return False
692 | 
693 | 
694 | def process(config, url_meta, root, **kwargs):
695 |     """Start clustering.
696 | 
697 |     Args:
698 |         config (Config): The configure object.
699 |         url_meta (URLMeta): The URLMeta object.
700 |         root (PiecePatternNode): The root of the piece pattern tree.
701 |         **kwargs: Keyword arguments.
702 | 
703 |     Returns:
704 |         bool: Whether the clustered tree can be split.
705 |     """
706 |     processor = ClusterProcessor(config, url_meta, None, **kwargs)
707 |     processor.add(root)
708 |     processor.process()
709 |     return _can_be_splited(processor)
710 | 
711 | 
712 | def cluster(config, url_meta, root, **kwargs):
713 |     """Entrance of the cluster workflow.
714 | 
715 |     Args:
716 |         config (Config): The configure object.
717 |         url_meta (URLMeta): The URLMeta object.
718 |         root (PiecePatternNode): The root of the piece pattern tree.
719 |         **kwargs: Keyword arguments.
720 | 
721 |     Yields:
722 |         PiecePatternNode: The clustered sub piece pattern tree root.
723 | 
724 |     """
725 |     if root.count <= 0:
726 |         return
727 |     if not process(config, url_meta, root, **kwargs):
728 |         yield root
729 |         return
730 |     for sub_root in split_by_pattern(root):
731 |         for clustered in cluster(config, url_meta, sub_root, **kwargs):
732 |             yield clustered
733 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/pattern_maker.py:
--------------------------------------------------------------------------------
  1 | """Pattern clustering procedure APIs.
  2 | """
  3 | from .compat import itervalues
  4 | from .config import get_default_config
  5 | from .definition import BasePattern
  6 | from .parse_utils import EMPTY_PARSED_PIECE, ParsedPiece
  7 | from .parser import fuzzy_digest, parse
  8 | from .pattern_cluster import cluster
  9 | from .piece_pattern_node import PiecePatternNode, build_from_parsed_pieces
 10 | from .utils import TreeNode, build_tree, dump_tree, pick
 11 | 
 12 | 
 13 | class PatternMaker(object):
 14 |     """Scaffold for simplifying clustering.
 15 | 
 16 |     After load urls, iterate all sub makers make cluster
 17 |     individually or cluster all by calling make method.
 18 |     """
 19 | 
 20 |     def __init__(self, config=None):
 21 |         self._config = get_default_config() if config is None else config
 22 |         self._makers = {}
 23 | 
 24 |     @property
 25 |     def makers(self):
 26 |         """iterable: For iterating all sub makers."""
 27 |         return itervalues(self._makers)
 28 | 
 29 |     def load(self, url, meta=None):
 30 |         """Load url and meta.
 31 | 
 32 |         Args:
 33 |             url (str): The URL to be loaded.
 34 |             meta (object, optional): Defaults to None. Meta data will be
 35 |                 merged at each cluster and can be accessed by clustered
 36 |                 node's meta property.
 37 | 
 38 |         Returns:
 39 |             tuple: 2-tules, (node, is_new).
 40 |         """
 41 |         url_meta, parsed_pieces = parse(url)
 42 |         if not isinstance(parsed_pieces[0], ParsedPiece):
 43 |             raise ValueError('Invalid URL')
 44 |         sid = fuzzy_digest(url_meta, parsed_pieces)
 45 |         if sid not in self._makers:
 46 |             self._makers[sid] = Maker(url_meta, self._config)
 47 |         return self._makers[sid].load(parsed_pieces, meta=meta)
 48 | 
 49 |     def make(self, combine=False):
 50 |         """Iterate all sub makers, start clustering and yield clustered.
 51 | 
 52 |         Args:
 53 |             combine (bool, optional): Defaults to False. Combine the
 54 |                 same url_meta clusters into a patten tree.
 55 | 
 56 |         Yields:
 57 |             tuple: 2-tuple, (url_meta, clustered). The clustered is the
 58 |                 root of a clustered tree.
 59 |         """
 60 |         for maker in self.makers:
 61 |             for clustered in maker.make(combine):
 62 |                 yield maker.url_meta, clustered
 63 | 
 64 | 
 65 | class Maker(object):
 66 |     """Low-level APIs for clustering.
 67 | 
 68 |     Suppose this will only be used for same fuzzy-digest clustering.
 69 |     """
 70 | 
 71 |     def __init__(self, url_meta, config=None):
 72 |         self._url_meta = url_meta
 73 |         self._config = get_default_config() if config is None else config
 74 |         self._root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
 75 | 
 76 |     @property
 77 |     def url_meta(self):
 78 |         """URLMeta: The URLMeta object."""
 79 |         return self._url_meta
 80 | 
 81 |     def load(self, parsed_pieces, meta=None):
 82 |         """Load parsed pieces and meta.
 83 | 
 84 |         Args:
 85 |             parsed_pieces (list): The parsed pieces to be loaded.
 86 |             meta (object, optional): Defaults to None. Meta data will be
 87 |                 merged at each cluster and can be accessed by clustered
 88 |                 node's meta property.
 89 | 
 90 |         Returns:
 91 |             tuple: 2-tules, (node, is_new).
 92 |         """
 93 |         return build_from_parsed_pieces(self._root,
 94 |                                         parsed_pieces,
 95 |                                         meta=meta)
 96 | 
 97 |     def _cluster(self):
 98 |         for clustered in cluster(self._config,
 99 |                                  self._url_meta,
100 |                                  self._root):
101 |             yield clustered
102 | 
103 |     def _combine_clusters(self):
104 |         root = TreeNode(BasePattern.EMPTY)
105 |         for clustered in self._cluster():
106 |             nodes = pick(dump_tree(clustered))
107 |             build_tree(root, [(n.pattern, n.pattern)
108 |                               for n in nodes[1:]], nodes[0].count)
109 | 
110 |         yield root
111 | 
112 |     def make(self, combine=False):
113 |         """Start clustering and yield clustered.
114 | 
115 |         Args:
116 |             combine (bool, optional): Defaults to False. Combine the
117 |                 clusters into a patten tree.
118 | 
119 |         Yields:
120 |             TreeNode: Root of the clustered tree. If combine=False yield
121 |                 all clustered parsed piece trees otherwise yield a
122 |                 combined pattern tree.
123 |         """
124 |         if combine:
125 |             return self._combine_clusters()
126 |         return self._cluster()
127 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/pattern_matcher.py:
--------------------------------------------------------------------------------
  1 | """Pattern matching APIs.
  2 | """
  3 | from __future__ import unicode_literals
  4 | 
  5 | from functools import total_ordering
  6 | 
  7 | from .definition import BasePatternRule
  8 | from .parse_utils import MIXED_RULE_SET, PieceParser, fuzzy_join
  9 | from .parsed_piece_view import (FuzzyView, LastDotSplitFuzzyView, LengthView,
 10 |                                 MixedView, MultiView, PieceView,
 11 |                                 view_cls_from_pattern)
 12 | from .parser import fuzzy_digest, parse
 13 | from .pattern import Pattern
 14 | from .utils import TreeNode, build_tree
 15 | 
 16 | 
 17 | @total_ordering
 18 | class MatchPattern(Pattern):
 19 |     """Pattern used for matching.
 20 | 
 21 |     It is comparable and has a view_cls property to
 22 |     identify the pattern type.
 23 |     """
 24 |     __slots__ = ('view_cls', '_cmp_key')
 25 | 
 26 |     def __init__(self, pattern_string, is_last_path=False):
 27 |         super(MatchPattern, self).__init__(pattern_string)
 28 |         self.view_cls = view_cls_from_pattern(self, is_last_path)
 29 |         self._cmp_key = None
 30 | 
 31 |     @property
 32 |     def cmp_key(self):
 33 |         """str: Used for sort."""
 34 | 
 35 |         if self._cmp_key is None:
 36 |             l = [MatchPattern(u.pattern_unit_string)
 37 |                  for u in reversed(self.pattern_units)]
 38 |             self._cmp_key = ''.join([str(VIEW_ORDER[p.view_cls]) for p in l])
 39 |         return self._cmp_key
 40 | 
 41 |     def __ne__(self, other):
 42 |         return self.pattern_string != other.pattern_string
 43 | 
 44 |     def __lt__(self, other):
 45 |         if self.view_cls == other.view_cls:
 46 |             return self.cmp_key > other.cmp_key
 47 |         return VIEW_ORDER[self.view_cls] > VIEW_ORDER[other.view_cls]
 48 | 
 49 | 
 50 | EMPTY_MATCH_PATTERN = MatchPattern(BasePatternRule.EMPTY)
 51 | 
 52 | 
 53 | class ViewMatcher(object):
 54 |     """Base class for different type of view matcher.
 55 | 
 56 |     Init with a specified ParsedPieceView class.
 57 |     Filled with same view-type match node.
 58 |     Get all matched nodes.    
 59 |     """
 60 |     __slots__ = ('view_cls', '_matchers')
 61 | 
 62 |     def __init__(self, view_cls):
 63 |         self.view_cls = view_cls
 64 |         self._matchers = {}
 65 | 
 66 |     def add_match_node(self, match_node):
 67 |         pass
 68 | 
 69 |     def match(self, parsed_piece):
 70 |         view = self.view_cls(parsed_piece)
 71 |         if view.view not in self._matchers:
 72 |             return []
 73 |         parsed_pieces = view.parsed_pieces
 74 |         matched_result = []
 75 |         self._matchers[view.view].match(
 76 |             parsed_pieces, 0, matched_result)
 77 |         return [n.meta for n in matched_result]
 78 | 
 79 | 
 80 | class PiecePatternViewMatcher(ViewMatcher):
 81 | 
 82 |     def add_match_node(self, match_node):
 83 |         if match_node.pattern.pattern_string not in self._matchers:
 84 |             self._matchers[match_node.pattern.pattern_string] = [match_node]
 85 | 
 86 |     def match(self, parsed_piece):
 87 |         return [] if parsed_piece.piece not in self._matchers \
 88 |             else self._matchers[parsed_piece.piece]
 89 | 
 90 | 
 91 | class LengthPatternViewMatcher(ViewMatcher):
 92 | 
 93 |     def add_match_node(self, match_node):
 94 |         length = match_node.pattern.pattern_units[0].num
 95 |         self._matchers[length] = [match_node]
 96 | 
 97 |     def match(self, parsed_piece):
 98 |         return [] if parsed_piece.piece_length not in self._matchers \
 99 |             else self._matchers[parsed_piece.piece_length]
100 | 
101 | 
102 | class MultiPatternViewMatcher(ViewMatcher):
103 | 
104 |     def add_match_node(self, match_node):
105 |         pattern = match_node.pattern
106 |         r = fuzzy_join(pattern.pattern_units)
107 |         if r not in self._matchers:
108 |             self._matchers[r] = PatternMatchNode(EMPTY_MATCH_PATTERN)
109 |         patterns = [MatchPattern(p.pattern_unit_string)
110 |                     for p in pattern.pattern_units]
111 |         matcher = self._matchers[r]
112 |         build_tree(matcher, patterns, meta=match_node)
113 | 
114 | 
115 | class MixedPatternViewMatcher(MultiPatternViewMatcher):
116 | 
117 |     def _pattern(self, pattern_units):
118 |         return MatchPattern(''.join([p.pattern_unit_string for p in pattern_units]))
119 | 
120 |     def add_match_node(self, match_node):
121 |         patterns = []
122 |         t = []
123 |         for pattern_unit in match_node.pattern.pattern_units:
124 |             if not pattern_unit.is_literal() \
125 |                     or pattern_unit.fuzzy_rule not in MIXED_RULE_SET:
126 |                 if t:
127 |                     patterns.append(self._pattern(t))
128 |                     t = []
129 |                 patterns.append(self._pattern([pattern_unit]))
130 |             else:
131 |                 t.append(pattern_unit)
132 | 
133 |         if t:
134 |             patterns.append(self._pattern(t))
135 | 
136 |         r = fuzzy_join(patterns)
137 |         if r not in self._matchers:
138 |             self._matchers[r] = PatternMatchNode(EMPTY_MATCH_PATTERN)
139 |         matcher = self._matchers[r]
140 |         build_tree(matcher, patterns, meta=match_node)
141 | 
142 | 
143 | class FuzzyPatternViewMatcher(ViewMatcher):
144 | 
145 |     def __init__(self, view_cls):
146 |         super(FuzzyPatternViewMatcher, self).__init__(view_cls)
147 |         self._matchers = []
148 | 
149 |     def add_match_node(self, match_node):
150 |         self._matchers.append(match_node)
151 | 
152 |     def match(self, parsed_piece):
153 |         return self._matchers
154 | 
155 | 
156 | VIEW_MATCHERS = [
157 |     (PieceView, PiecePatternViewMatcher),
158 |     (MultiView, MultiPatternViewMatcher),
159 |     (MixedView, MultiPatternViewMatcher),
160 |     (LastDotSplitFuzzyView, MultiPatternViewMatcher),
161 |     (LengthView, LengthPatternViewMatcher),
162 |     (FuzzyView, FuzzyPatternViewMatcher),
163 | ]
164 | 
165 | VIEW_ORDER = dict([(item[0], _idx) for _idx, item in enumerate(VIEW_MATCHERS)])
166 | 
167 | 
168 | def get_view_matcher_cls(view_cls):
169 |     """Get specified ViewMatcher class from ParsedPieceView class.
170 | 
171 |     Args:
172 |         view_cls (ParsedPieceView): Class of a specified ParsedPieceView.
173 | 
174 |     Returns:
175 |         class(ViewMatcher): The Corresponding ViewMatcher class.
176 |     """
177 |     idx = VIEW_ORDER[view_cls]
178 |     return VIEW_MATCHERS[idx][1]
179 | 
180 | 
181 | @total_ordering
182 | class PatternMatchNode(TreeNode):
183 |     """Node for building a match tree."""
184 | 
185 |     __slots__ = ('_view_matchers',)
186 | 
187 |     def __init__(self, value):
188 |         super(PatternMatchNode, self).__init__(value)
189 |         self._view_matchers = []
190 | 
191 |     @property
192 |     def view_cls(self):
193 |         return self.pattern.view_cls
194 | 
195 |     def match(self, parsed_pieces, idx, matched_nodes):
196 |         """DF find all matched nodes.
197 | 
198 |         If a path from root to leaf match all the corresponding pieces,
199 |         the leaf node is called matched node.This mathed shoud be called
200 |         by the root node, with idx=0 and a list which will be filled with
201 |         all matched nodes.
202 | 
203 |         Args:
204 |             parsed_pieces (sequence): All of the parsed pieces to be matched.
205 |             idx (int): Indecate which piece of the whole parsed pieces should
206 |                 try to match this node.
207 |             matched_nodes (list of PatternMatchNode): Filled with all of the
208 |                 matched leaf nodes.
209 |         """
210 |         parsed_piece = parsed_pieces[idx]
211 |         for matcher in self._view_matchers:
212 |             nodes = matcher.match(parsed_piece)
213 |             if not nodes:
214 |                 continue
215 |             if nodes[0].leaf():
216 |                 matched_nodes.extend(nodes)
217 |                 continue
218 |             self._deep_match(nodes, parsed_pieces, idx + 1,
219 |                              matched_nodes)
220 | 
221 |     def _deep_match(self, nodes, parsed_pieces, idx, matched_nodes):
222 |         for node in nodes:
223 |             node.match(parsed_pieces, idx, matched_nodes)
224 | 
225 |     def _get_matcher(self, view_cls):
226 |         s = 0
227 |         e = len(self._view_matchers)
228 |         while e > s:
229 |             t = (e - s) // 2 + s
230 |             matcher = self._view_matchers[t]
231 |             if matcher.view_cls == view_cls:
232 |                 return matcher
233 |             tid = VIEW_ORDER[matcher.view_cls]
234 |             vid = VIEW_ORDER[view_cls]
235 |             if tid < vid:
236 |                 s = t + 1
237 |             else:
238 |                 e = t
239 | 
240 |         matcher = get_view_matcher_cls(view_cls)(view_cls)
241 |         self._view_matchers.insert(e, matcher)
242 |         return matcher
243 | 
244 |     @property
245 |     def pattern(self):
246 |         return self.value
247 | 
248 |     def add_child(self, pattern):
249 |         child, is_new = super(PatternMatchNode, self).add_child(
250 |             (pattern, pattern))
251 |         if is_new:
252 |             matcher = self._get_matcher(child.view_cls)
253 |             matcher.add_match_node(child)
254 |         return child, is_new
255 | 
256 |     def __lt__(self, other):
257 |         if id(self) == id(other) or self.parrent is None:
258 |             return False
259 |         if self.pattern == other.pattern:
260 |             return self.parrent < other.parrent
261 |         return self.pattern < other.pattern
262 | 
263 | 
264 | class PatternMatcher(object):
265 |     """Offer match processing APIs.
266 | 
267 |     Common procedure:
268 |     1. Init a PatternMatcher.
269 |     2. Load pattern string.
270 |     3. Match url.
271 |     """
272 | 
273 |     def __init__(self):
274 |         self._parser = PieceParser()
275 |         self._matchers = {}
276 | 
277 |     def load(self, url_pattern_string, meta=None):
278 |         """Load URL pattern string.
279 | 
280 |         Args:
281 |             url_pattern_string (str): URL pattern string.
282 |             meta (any, optional): Defaults to None. It will bind to
283 |                 matched result's meta property.
284 | 
285 |         Returns:
286 |             tuple: 2-tules, (node, is_new).
287 |         """
288 |         url_meta, parsed_patterns = parse(url_pattern_string)
289 |         if not isinstance(parsed_patterns[0], MatchPattern):
290 |             raise ValueError('Invalid URL pattern')
291 |         sid = fuzzy_digest(url_meta, parsed_patterns)
292 |         if sid not in self._matchers:
293 |             self._matchers[sid] = Matcher(url_meta)
294 |         matcher = self._matchers[sid]
295 |         return matcher.load(parsed_patterns, meta=meta)
296 | 
297 |     def match(self, url):
298 |         """Match url, get the matched results.
299 | 
300 |         Args:
301 |             url (str): The URL to be matched.
302 | 
303 |         Returns:
304 |             list: List of matched pattern nodes, if no match return [].
305 |               Bound meta data can be accessed with node.meta.
306 |         """
307 |         url_meta, parsed_pieces = parse(url)
308 |         sid = fuzzy_digest(url_meta, parsed_pieces)
309 |         if sid in self._matchers:
310 |             return self._matchers[sid].match(parsed_pieces)
311 |         return []
312 | 
313 | 
314 | class Matcher(object):
315 |     """Low-level APIs for matching.
316 | 
317 |     Suppose this will only be used for same fuzzy-digest matching.
318 |     """
319 | 
320 |     def __init__(self, url_meta):
321 |         self._url_meta = url_meta
322 |         self._root = PatternMatchNode(EMPTY_MATCH_PATTERN)
323 | 
324 |     @property
325 |     def url_meta(self):
326 |         """URLMeta: The URLMeta object."""
327 |         return self._url_meta
328 | 
329 |     def match(self, parsed_pieces):
330 |         """Match URL parsed peices.
331 | 
332 |         Args:
333 |             parsed_pieces (sequence): URL parsed pieces.
334 | 
335 |         Returns:
336 |             list: List of matched pattern nodes, if no match return [].
337 |               Bound meta data can be accessed with node.meta.
338 |         """
339 | 
340 |         matched_nodes = []
341 |         self._root.match(parsed_pieces, 0, matched_nodes)
342 |         return matched_nodes
343 | 
344 |     def load(self, parsed_patterns, meta=None):
345 |         """Load from parsed URL pattern.
346 | 
347 |         Args:
348 |             parsed_patterns (sequence): MatchNodes.
349 |             meta (any, optional): Defaults to None. It will bind to
350 |                 matched result's meta property.
351 | 
352 |         Returns:
353 |             tuple: 2-tules, (node, is_new).
354 |         """
355 |         return build_tree(self._root, parsed_patterns, meta=meta)
356 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/piece_pattern_node.py:
--------------------------------------------------------------------------------
  1 | """Raw parsed piece tree.
  2 | 
  3 | Build a tree from the parsed URL pieces.
  4 | """
  5 | from __future__ import unicode_literals
  6 | 
  7 | from .compat import itervalues
  8 | from .parse_utils import EMPTY_PARSED_PIECE
  9 | from .pattern import Pattern
 10 | from .utils import TreeNode, build_tree
 11 | 
 12 | 
 13 | class PiecePatternNode(TreeNode):
 14 |     """Node for building raw piece tree."""
 15 | 
 16 |     __slots__ = ('_pattern',)
 17 | 
 18 |     def __init__(self, parsed_piece_and_pattern):
 19 |         parsed_piece, self._pattern = parsed_piece_and_pattern
 20 |         super(PiecePatternNode, self).__init__(parsed_piece)
 21 | 
 22 |     def set_pattern(self, pattern):
 23 |         self._pattern = pattern
 24 | 
 25 |     @property
 26 |     def pattern(self):
 27 |         if self._pattern is None:
 28 |             self._pattern = Pattern(self.piece)
 29 |         return self._pattern
 30 | 
 31 |     @property
 32 |     def piece(self):
 33 |         return self.parsed_piece.piece
 34 | 
 35 |     @property
 36 |     def parsed_piece(self):
 37 |         return self.value
 38 | 
 39 |     @property
 40 |     def children_num(self):
 41 |         return len(self._children)
 42 | 
 43 |     def incr_count(self, count, recur=False):
 44 |         self.count += count
 45 |         node = self.parrent if recur else None
 46 |         while node:
 47 |             node.incr_count(count)
 48 |             node = node.parrent
 49 | 
 50 |     def __str__(self):
 51 |         return ' '.join((self.piece, str(self.pattern)))
 52 | 
 53 |     def add_meta(self, data):
 54 |         if data is None:
 55 |             return
 56 |         if self.meta is None:
 57 |             self.meta = set()
 58 |         self.meta.add(data)
 59 | 
 60 |     def update_meta(self, data):
 61 |         if not data:
 62 |             return
 63 |         if self.meta is None:
 64 |             self.meta = set()
 65 |         self.meta.update(data)
 66 | 
 67 | 
 68 | def build_from_parsed_pieces(root, parsed_pieces, count=1, meta=None, uniq=True):
 69 |     """Build piece pattern tree from parsed pieces.
 70 | 
 71 |     Args:
 72 |         root (PiecePatternNode): The root node of the a tree.
 73 |         parsed_pieces (sequence): The parsed pieces.
 74 |         count (int, optional): Defaults to 1. 
 75 |         meta ([type], optional): Defaults to None. The meta data will bind to the leaf node.
 76 |         uniq (bool, optional): Defaults to True. The duplicated node edge will not add.
 77 | 
 78 |     Returns:
 79 |         tuple: 2-tuple, (leaf_node, is_new)
 80 |     """
 81 |     node, is_new = build_tree(root, [(parsed_piece.piece, (parsed_piece, None))
 82 |                                      for parsed_piece in parsed_pieces], count)
 83 |     if uniq and not is_new:
 84 |         node.incr_count(0 - count, True)
 85 |     node.add_meta(meta)
 86 |     return node, is_new
 87 | 
 88 | 
 89 | def build_from_piece_pattern_nodes(root, piece_pattern_nodes):
 90 |     """Build piece pattern tree from piece pattern tree edge.
 91 | 
 92 |     Args:
 93 |         root (PiecePatternNode): The root node of the a tree.
 94 |         piece_pattern_nodes (sequence): piece pattern tree edge.
 95 | 
 96 |     Returns:
 97 |         tuple: 2-tuple, (leaf_node, is_new)
 98 |     """
 99 |     last = piece_pattern_nodes[-1]
100 |     node, is_new = build_tree(root, [(p.piece, (p.parsed_piece, p.pattern))
101 |                                      for p in piece_pattern_nodes], last.count)
102 |     node.update_meta(last.meta)
103 |     return node, is_new
104 | 


--------------------------------------------------------------------------------
/src/os_urlpattern/utils.py:
--------------------------------------------------------------------------------
  1 | """Utilities.
  2 | """
  3 | import inspect
  4 | import logging
  5 | import math
  6 | import os
  7 | import time
  8 | from functools import partial
  9 | 
 10 | from .compat import iteritems, itervalues
 11 | 
 12 | 
 13 | def pretty_counter(counter):
 14 |     """Format a dict like object.
 15 | 
 16 |     Args:
 17 |         counter (dict): The dict like object to be formatted.
 18 | 
 19 |     Returns:
 20 |         str: Formatted string.
 21 |     """
 22 | 
 23 |     return ", ".join(['{0}:{1}'.format(k, v) for k, v in iteritems(counter)])
 24 | 
 25 | 
 26 | def pick(iterable):
 27 |     """Get an obj from iterable object. """
 28 | 
 29 |     for obj in iterable:
 30 |         return obj
 31 | 
 32 | 
 33 | class Bag(object):
 34 |     """Uniq objects container.
 35 | 
 36 |     The objects in the bag can also be Bag instance.
 37 |     Use pick method to get a most inside object.
 38 |     Use iter_all method to iterate objects inside all inner bags.
 39 |     """
 40 | 
 41 |     __slots__ = ('_objs',)
 42 | 
 43 |     def __init__(self):
 44 |         self._objs = set()
 45 | 
 46 |     def add(self, obj):
 47 |         self._objs.add(obj)
 48 | 
 49 |     def __len__(self):
 50 |         return len(self._objs)
 51 | 
 52 |     def pick(self):
 53 |         obj = pick(self)
 54 |         while isinstance(obj, Bag):
 55 |             obj = pick(obj)
 56 |         return obj
 57 | 
 58 |     def __iter__(self):
 59 |         return iter(self._objs)
 60 | 
 61 |     def iter_all(self):
 62 |         for obj in self:
 63 |             if isinstance(obj, Bag):
 64 |                 for o in obj.iter_all():
 65 |                     yield o
 66 |             else:
 67 |                 yield obj
 68 | 
 69 | 
 70 | class TreeNode(object):
 71 |     """Node of a tree."""
 72 | 
 73 |     __slots__ = ('parrent', '_children', 'count',
 74 |                  'value', 'meta', '_level')
 75 | 
 76 |     def __init__(self, value):
 77 |         self.parrent = None
 78 |         self.count = 0
 79 |         self.value = value
 80 |         self.meta = None
 81 |         self._level = None
 82 |         self._children = None
 83 | 
 84 |     def leaf(self):
 85 |         return not self._children
 86 | 
 87 |     @property
 88 |     def level(self):
 89 |         """int: The level from root."""
 90 |         if self._level is None:
 91 |             l = 0
 92 |             n = self.parrent
 93 |             while n is not None:
 94 |                 l += 1
 95 |                 n = n.parrent
 96 |             self._level = l
 97 |         return self._level
 98 | 
 99 |     @property
100 |     def children(self):
101 |         return itervalues(self._children if self._children is not None else {})
102 | 
103 |     def add_child(self, kv):
104 |         """Add a node to the children data set.
105 | 
106 |         Args:
107 |             kv (pair): Key-value object, the key is used to identify
108 |                 a uniq node, the value is the node's data.
109 | 
110 |         Returns:
111 |             tuple: 2-tuple, (node, is_new).
112 |         """
113 | 
114 |         if self._children is None:
115 |             self._children = {}
116 |         k, v = kv
117 |         is_new = False
118 |         if k not in self._children:
119 |             self._children[k] = self.__class__(v)
120 |             self._children[k].parrent = self
121 |             is_new = True
122 |         child = self._children[k]
123 |         return child, is_new
124 | 
125 | 
126 | def build_tree(root, kv_sequence, count=1, meta=None):
127 |     """Build a tee.
128 | 
129 |     This method will call the node's add_child(kv) to build tree.
130 | 
131 |     Args:
132 |         root (TreeNode): Root node of a tree.
133 |         kv_sequence (sequence): Objects will be used to build a tree.
134 |         count (int, optional): Defaults to 1. Will increase the nodes count.
135 |         meta (any, optional): Defaults to None. Will bind to the leaf node.
136 | 
137 |     Returns:
138 |         tuple: 2-tuple, (node, is_new)
139 |     """
140 |     node = root
141 |     node.count += count
142 |     for kv in kv_sequence:
143 |         node, is_new = node.add_child(kv)
144 |         node.count += count
145 |     if meta is not None:
146 |         node.meta = meta
147 | 
148 |     return node, is_new
149 | 
150 | 
151 | def dump_tree(root):
152 |     """Dump each path of a tree.
153 | 
154 |     Args:
155 |         root (TreeNode): The root node of a tree.
156 | 
157 |     Yields:
158 |         list: List contains nodes from root to leaf as one path.
159 |     """
160 |     olist = []
161 | 
162 |     def _dump(node, _nodes):
163 |         _nodes.append(node)
164 |         if node.leaf():
165 |             yield _nodes
166 |             return
167 |         for child in node.children:
168 |             for nodes in _dump(child, _nodes):
169 |                 yield nodes
170 |             _nodes.pop(-1)
171 | 
172 |     for nodes in _dump(root, olist):
173 |         yield nodes
174 | 
175 | 
176 | class LogSpeedAdapter(logging.LoggerAdapter):
177 |     """Logger adapter for speed logging.
178 | 
179 |     Log only once when called every interal times,
180 |     include total count and average speed.
181 |     Used as 'with statement' for logging huge loop processing.
182 | 
183 |     """
184 | 
185 |     def __init__(self, logger, interval):
186 |         super(LogSpeedAdapter, self).__init__(logger, {})
187 |         self._count = 0
188 |         assert(interval) > 0
189 |         self._interval = interval
190 |         self._start_time = time.time()
191 |         self._replace()
192 | 
193 |     def _replace(self):
194 |         for name in ['debug', 'info', 'warning', 'error', 'exception', 'critical']:
195 |             setattr(self, name, partial(self._log, name))
196 |         self.log = self._log
197 | 
198 |     def _log(self, name, msg, *args, **kwargs):
199 |         self._count += 1
200 | 
201 |         if self._count % self._interval == 0:
202 |             speed = self._speed()
203 |             extra_msg = '{count} {speed:.1f}/s'.format(
204 |                 count=self._count, speed=speed)
205 |             msg = ' '.join((msg, extra_msg))
206 |             if isinstance(name, int):
207 |                 name = logging.getLevelName(name)
208 |             getattr(self.logger, name)(msg, *args, **kwargs)
209 | 
210 |     def _speed(self):
211 |         return self._count / (time.time() - self._start_time)
212 | 
213 |     def __enter__(self):
214 |         self._start_time = time.time()
215 |         return self
216 | 
217 |     def __exit__(self, exc_type, exc_value, exc_tb):
218 |         pass
219 | 
220 | 
221 | def used_memory():
222 |     """Human readable memory usage(Byte).
223 | 
224 |     Returns:
225 |         str: Memory usage.
226 |     """
227 | 
228 |     try:
229 |         import psutil
230 |     except:
231 |         return '-'
232 |     p = psutil.Process(os.getpid())
233 |     memory = p.memory_info().rss
234 |     return format_byte(memory)
235 | 
236 | 
237 | # global variables for format_byte
238 | _UNIT_SUFFIXES = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
239 | _LOG_1024 = math.log(1024)
240 | _SUFFIXES_LENGTH = len(_UNIT_SUFFIXES)
241 | 
242 | 
243 | def format_byte(value, precision=2):
244 |     """Format byte size into human readable.
245 | 
246 |     Args:
247 |         value (int): The byte size.
248 |         precision (int, optional): Defaults to 2. Precision.
249 | 
250 |     Returns:
251 |         str: Human readable format.
252 |     """
253 | 
254 |     factor = float(10 ** precision)
255 |     suffix = min(int(math.log(value) / _LOG_1024), _SUFFIXES_LENGTH)
256 |     num = math.ceil(value / (1024.0 ** suffix) * factor) / factor
257 |     return ''.join((str(num), _UNIT_SUFFIXES[suffix]))
258 | 
259 | 
260 | class MemoryUsageFormatter(logging.Formatter):
261 |     """Formatter support memory keyword."""
262 | 
263 |     def __init__(self, fmt=None, datefmt=None):
264 |         super(MemoryUsageFormatter, self).__init__(fmt, datefmt)
265 |         self._log_memory = True
266 |         if fmt and '%(memory)s' not in fmt:
267 |             self._log_memory = False
268 | 
269 |     def format(self, record):
270 |         if self._log_memory and 'memory' not in record.__dict__:
271 |             record.__dict__['memory'] = used_memory()
272 |         return super(MemoryUsageFormatter, self).format(record)
273 | 
274 | 
275 | class cached_property(object):
276 |     """Decrator for cache class property."""
277 | 
278 |     def __init__(self, func):
279 |         self.__doc__ = getattr(func, "__doc__")
280 |         self.func = func
281 | 
282 |     def __get__(self, obj, cls):
283 |         if obj is None:
284 |             return self
285 | 
286 |         value = obj.__dict__[self.func.__name__] = self.func(obj)
287 |         return value
288 | 
289 | 
290 | def get_classes(module, base_cls, include_base_cls=True):
291 |     """Get specified classes form module.
292 | 
293 |     Args:
294 |         module (module): Where to find classes.
295 |         base_cls (type): The base class.
296 |         include_base_cls (bool, optional): Defaults to True.
297 |             Whether include base class.
298 | 
299 |     Returns:
300 |         list: The specified classes.
301 |     """
302 |     def is_class(c):
303 |         return inspect.isclass(c) \
304 |             and issubclass(c, base_cls) \
305 |             and (include_base_cls or c != base_cls)
306 |     return [c for _, c in inspect.getmembers(module, is_class)]
307 | 
308 | 
309 | def with_metaclass(meta, *bases):
310 |     """Create a base class with a metaclass.
311 | 
312 |     From six.
313 |     """
314 |     # This requires a bit of explanation: the basic idea is to make a dummy
315 |     # metaclass for one level of class instantiation that replaces itself with
316 |     # the actual metaclass.
317 |     class metaclass(type):
318 | 
319 |         def __new__(cls, name, this_bases, d):
320 |             return meta(name, bases, d)
321 | 
322 |         @classmethod
323 |         def __prepare__(cls, name, this_bases):
324 |             return meta.__prepare__(name, bases)
325 |     return type.__new__(metaclass, 'temporary_class', (), {})
326 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cfhamlet/os-urlpattern/9311aff896ad591b2a9123d256f629f5d142dfc6/tests/__init__.py


--------------------------------------------------------------------------------
/tests/data/urls_example.txt:
--------------------------------------------------------------------------------
1 | http://example.com/01.html
2 | http://example.com/123/test01.html
3 | http://example.com/02.html
4 | http://example.com/456/test02.html
5 | http://example.com/03.html
6 | http://example.com/789/test03.html
7 | 


--------------------------------------------------------------------------------
/tests/test_cmdline.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import os
  3 | import shlex
  4 | import subprocess
  5 | import sys
  6 | 
  7 | import pytest
  8 | 
  9 | from os_urlpattern.cmdline import make, match
 10 | 
 11 | 
 12 | def call(cmdline, env=None, **kwargs):
 13 |     if env is None:
 14 |         env = os.environ.copy()
 15 |     if env.get('COVERAGE', None) is not None:
 16 |         env['COVERAGE_PROCESS_START'] = os.path.abspath('.coveragerc')
 17 | 
 18 |     cmd = 'python -u %s %s' % (os.path.abspath(__file__), cmdline)
 19 |     proc = subprocess.Popen(shlex.split(cmd),
 20 |                             stdout=subprocess.PIPE,
 21 |                             stderr=subprocess.PIPE,
 22 |                             cwd=os.getcwd(),
 23 |                             env=env,
 24 |                             **kwargs)
 25 |     stdout, stderr = proc.communicate()
 26 |     return stdout, stderr
 27 | 
 28 | 
 29 | def test_make(tmpdir):
 30 |     num = 9
 31 |     urls = ['http://example.com/abc%02d?id=%02d#abc' %
 32 |             (i, i) for i in range(0, num)]
 33 |     data = "\n".join(urls)
 34 |     f = tmpdir.join('urls.txt')
 35 |     f.write(data)
 36 |     cmdline = 'make -i %s' % f.strpath
 37 |     stdout, _ = call(cmdline)
 38 |     assert b'/abc[0-9]{2}' in stdout
 39 |     assert urls[0].encode() in stdout
 40 | 
 41 |     cmdline = 'make -i %s -f pattern' % f.strpath
 42 |     stdout, _ = call(cmdline)
 43 |     assert b'/abc[0-9]{2}' in stdout
 44 |     assert urls[0].encode() not in stdout
 45 | 
 46 |     cmdline = 'make -i %s -f ete' % f.strpath
 47 |     stdout, _ = call(cmdline)
 48 |     assert b' abc[0-9]{2}(%d) ' % num
 49 |     assert b' [\\?]id=[0-9]{2}(%d) ' % num
 50 |     assert b' - #abc(%d)' % num
 51 | 
 52 | 
 53 | def test_make_digest_type_urls(tmpdir):
 54 |     urls = ['http://example.com/%s.html' % j for j in
 55 |             [hashlib.md5(str(i).encode()).hexdigest() for i in range(0, 9)]]
 56 | 
 57 |     data = "\n".join(urls)
 58 |     f = tmpdir.join('urls.txt')
 59 |     f.write(data)
 60 |     cmdline = 'make -i %s -f pattern ' % f.strpath
 61 |     stdout, _ = call(cmdline)
 62 |     assert b'[0-9a-z]{32}[\\.]html' in stdout
 63 | 
 64 | 
 65 | def test_make_noise(tmpdir):
 66 |     urls = ['http://example.com/abc%02d?id=%02d#abc' %
 67 |             (i, i) for i in range(0, 8)]
 68 |     urls.append('http://example.com/abc009?id=09#abc')
 69 | 
 70 |     data = "\n".join(urls)
 71 |     f = tmpdir.join('urls.txt')
 72 |     f.write(data)
 73 |     cmdline = 'make -i %s -f pattern ' % f.strpath
 74 |     stdout, _ = call(cmdline)
 75 |     assert b'/abc[0-9]{2}' in stdout
 76 |     assert b'/abc009' in stdout
 77 | 
 78 | 
 79 | def test_make_fuzzy(tmpdir):
 80 |     urls = [
 81 |         'sdjfpewiefh',
 82 |         'dfsdksd',
 83 |         'dffalldsfisslkfdksd',
 84 |         'didif',
 85 |         'dif',
 86 |     ]
 87 |     urls = ['http://example.com/abc/' + i for i in urls]
 88 |     data = "\n".join(urls)
 89 |     f = tmpdir.join('urls01.txt')
 90 |     f.write(data)
 91 |     cmdline = 'make -i %s -f pattern ' % f.strpath
 92 |     stdout, _ = call(cmdline)
 93 |     assert b'/abc/[a-z]+' in stdout
 94 | 
 95 |     urls = [i + '.html' for i in urls]
 96 |     data = "\n".join(urls)
 97 |     f = tmpdir.join('urls02.txt')
 98 |     f.write(data)
 99 |     cmdline = 'make -i %s -f pattern ' % f.strpath
100 |     stdout, _ = call(cmdline)
101 |     assert b'/abc/[a-z]+[\\.]html' in stdout
102 | 
103 | 
104 | def test_match(tmpdir):
105 |     pattern = b'/abc[0-9]{2}'
106 |     fp = tmpdir.join('patterns.txt')
107 |     fp.write(pattern)
108 | 
109 |     urls = ['http://example.com/abc%02d' % i for i in range(1, 10)]
110 |     data = "\n".join(urls)
111 |     fu = tmpdir.join('urls.txt')
112 |     fu.write(data)
113 | 
114 |     cmdline = 'match -i %s -p %s' % (fu.strpath, fp.strpath)
115 |     stdout, _ = call(cmdline)
116 | 
117 |     assert pattern in stdout
118 | 
119 | 
120 | if __name__ == "__main__":
121 |     sys.path.insert(0, os.getcwd())
122 |     if os.getenv('COVERAGE_PROCESS_START'):
123 |         import coverage
124 |         coverage.process_startup()
125 |     cmds = {'make': make, 'match': match}
126 |     cmds[sys.argv.pop(1)]()
127 | 


--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from os_urlpattern.config import get_default_config
4 | 
5 | 
6 | def test_get_default_config():
7 |     config = get_default_config()
8 |     assert config.getint('make', 'min_cluster_num') == 3
9 | 


--------------------------------------------------------------------------------
/tests/test_formatter.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | import json
 4 | 
 5 | import pytest
 6 | 
 7 | from os_urlpattern.formatter import pformat
 8 | from os_urlpattern.pattern_maker import PatternMaker
 9 | 
10 | 
11 | @pytest.fixture(scope='function')
12 | def p_maker():
13 |     p_maker = PatternMaker()
14 |     for url in ['http://www.example.com/abc/%02d.html' % i for i in range(0, 10)]:
15 |         p_maker.load(url, meta=url)
16 | 
17 |     return p_maker
18 | 
19 | 
20 | def test_inline(p_maker):
21 |     for url_meta, clustered in p_maker.make():
22 |         for o in pformat('inline', url_meta, clustered):
23 |             assert '/abc/[0-9]{2}[\\.]html\thttp' in o
24 | 
25 | 
26 | def test_json(p_maker):
27 |     for url_meta, clustered in p_maker.make():
28 |         for o in pformat('json', url_meta, clustered):
29 |             d = json.loads(o)
30 |             assert d['ptn'] == '/abc/[0-9]{2}[\\.]html'
31 |             assert d['cnt'] == 10
32 | 


--------------------------------------------------------------------------------
/tests/test_parse_utils.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from os_urlpattern.exceptions import (InvalidCharException,
  4 |                                       InvalidPatternException,
  5 |                                       IrregularURLException)
  6 | from os_urlpattern.parse_utils import (PieceParser, URLMeta, analyze_url,
  7 |                                        analyze_url_pattern_string, digest,
  8 |                                        filter_useless, fuzzy_digest, normalize,
  9 |                                        pack, parse_pattern_string,
 10 |                                        parse_pattern_unit_string,
 11 |                                        parse_query_string, parse_url)
 12 | from os_urlpattern.pattern import Pattern
 13 | 
 14 | 
 15 | def test_normalize_str():
 16 |     data = [
 17 |         ('a', 'a'),
 18 |         ('ab=', 'ab[=]'),
 19 |         ('ab1=a', 'ab1[=]a'),
 20 |         ('ab==a', 'ab[=]{2}a'),
 21 |         ('ab=={a', 'ab[=]{2}[\\{]a'),
 22 |         ('=', '[=]'),
 23 |         ('==', '[=]{2}'),
 24 |         ('==+a', '[=]{2}[\\+]a'),
 25 |         ('\\', '[\\\\]'),
 26 |     ]
 27 |     for i, j in data:
 28 |         assert normalize(i) == j
 29 | 
 30 | 
 31 | def test_parse_url():
 32 |     data = [
 33 |         ('http://www.test.com/', ('',), [('depth', 1)]),
 34 |         ('http://www.test.com/?', ('', ''), [('depth', 2)]),
 35 |         ('http://www.test.com/abc/def?k=v#xxx', ('abc', 'def', 'v', 'xxx'),
 36 |          [('depth', 4), ('has_fragment', True)]),
 37 |     ]
 38 |     for url, p, m in data:
 39 |         url_meta, parts = analyze_url(url)
 40 |         assert parts == p
 41 |         for k, v in m:
 42 |             assert getattr(url_meta, k) == v
 43 |     with pytest.raises(IrregularURLException):
 44 |         analyze_url('http://www.g.com')
 45 | 
 46 | 
 47 | def test_parse_query_string():
 48 |     data = [
 49 |         ('a', ('',), ('a',)),
 50 |         ('a=', ('a=',), ('',)),
 51 |         ('a&b', ('a', 'b'), ('', '')),
 52 |         ('a=1', ('a=',), ('1',)),
 53 |         ('a=1&b=2', ('a=', 'b='), ('1', '2')),
 54 |     ]
 55 |     for q, k, v in data:
 56 |         assert parse_query_string(q) == (k, v)
 57 | 
 58 |     data = ['a&', 'a&&b', 'a=1&']
 59 | 
 60 |     for i in data:
 61 |         with pytest.raises(IrregularURLException):
 62 |             parse_query_string(i)
 63 | 
 64 | 
 65 | def test_analyze_url():
 66 |     data = [
 67 |         ['http://www.g.com/test', ('path', '/test'),
 68 |          ('query', None), ('fragment', None)],
 69 |         ['http://www.g.com/test?',
 70 |             ('query', ''), ('fragment', None)],
 71 |         ['http://www.g.com/test?#',
 72 |             ('query', ''), ('fragment', '')],
 73 |         ['http://www.g.com/test?#abc',
 74 |             ('query', ''), ('fragment', 'abc')],
 75 |         ['http://www.g.com/test#abc',
 76 |             ('query', None), ('fragment', 'abc')],
 77 |         ['http://www.g.com/test?a#',
 78 |             ('query', 'a'), ('fragment', '')],
 79 |         ['http://www.g.com/test?a##',
 80 |             ('query', 'a'), ('fragment', '#')],
 81 |         ['http://www.g.com/test#?',
 82 |             ('query', None), ('fragment', '?')],
 83 |     ]
 84 |     for check in data:
 85 |         url = check[0]
 86 |         r = parse_url(url)
 87 |         for attr, expect in check[1:]:
 88 |             assert getattr(r, attr) == expect
 89 | 
 90 | 
 91 | def test_filter_useless_part():
 92 |     data = [
 93 |         ('/', ['']),
 94 |         ('//', ['']),
 95 |         ('', ['']),
 96 |         ('/a/b', ['a', 'b']),
 97 |         ('/a/b/', ['a', 'b', '']),
 98 |         ('/a/b//', ['a', 'b', '']),
 99 |         ('/a/b///c', ['a', 'b', 'c']),
100 |         ('a/b///c', ['a', 'b', 'c']),
101 |     ]
102 |     for s, expect in data:
103 |         assert filter_useless(s.split('/')) == expect
104 | 
105 | 
106 | def test_piece_parser():
107 |     parser = PieceParser()
108 |     data = [
109 |         ('abc', ('abc', ), ('a-z', )),
110 |         ('abc.exe', ('abc', '[\\.]', 'exe'), ('a-z', '\\.', 'a-z')),
111 |         ('%' * 10, ('[%]{10}', ), ('%', )),
112 |         ('abc1D..exe',  ('abc', '1', 'D',
113 |                          '[\\.]{2}', 'exe'), ('a-z', '0-9', 'A-Z', '\\.', 'a-z')),
114 |         ('@<>..', ('[@]', '[<]', '[>]', '[\\.]{2}'), ('@', '<', '>', '\\.')),
115 |     ]
116 |     for piece, expected_pieces, expected_rules in data:
117 |         parsed = parser.parse(piece)
118 |         assert parsed.rules == expected_rules
119 |         assert parsed.pieces == expected_pieces
120 |         assert parsed.piece_length == len(piece)
121 |     with pytest.raises(InvalidCharException):
122 |         parser.parse(' a')
123 | 
124 | 
125 | def test_unpack_pack():
126 |     data = [
127 |         ('http://www.g.com/', '/'),
128 |         ('http://www.g.com/abc', '/abc'),
129 |         ('http://www.g.com/abc?a=1#c', '/abc[\\?]a=1#c'),
130 |         ('http://www.g.com/abc???a=1#c', '/abc[\\?][\\?]{2}a=1#c'),
131 |         ('http://www.g.com/abc?=1#c', '/abc[\\?]=1#c'),
132 |         ('http://www.g.com/abc?a=1#', '/abc[\\?]a=1#'),
133 |         ('http://www.g.com/abc?a=1&b=2#', '/abc[\\?]a=1&b=2#'),
134 |     ]
135 |     for url, expected in data:
136 |         assert pack(*analyze_url(url)) == expected
137 | 
138 | 
139 | def test_url_meta():
140 |     url_meta1 = URLMeta(1, ['key1', 'key2'], False)
141 |     assert url_meta1.depth == 3
142 |     url_meta2 = URLMeta(1, ['key1', 'key2'], True)
143 |     assert url_meta2.depth == 4
144 |     url_meta3 = URLMeta(1, ['key1', 'key2'], False)
145 | 
146 | 
147 | def test_parse_url_pattern():
148 |     data = [
149 |         'http://www.g.com/',
150 |         'http://www.g.com/abc',
151 |         'http://www.g.com/abc?a=1#c',
152 |         'http://www.g.com/abc???a=1#c',
153 |         'http://www.g.com/abc?=1#c',
154 |         'http://www.g.com/abc?a=1#',
155 |         'http://www.g.com/abc?a=1&b=2#',
156 |     ]
157 |     for url in data:
158 |         meta1, parts1 = analyze_url(url)
159 |         pattern_string = pack(meta1, parts1)
160 |         meta2, parts2 = analyze_url_pattern_string(pattern_string)
161 |         assert meta1 == meta2
162 |         assert len(parts1) == len(parts2)
163 | 
164 | 
165 | def test_parse_pattern_string():
166 |     data = [
167 |         ('abc', 1),
168 |         ('[0-9]{2}abc', 2),
169 |         ('abc[0-9]+', 2),
170 |         ('abc[\\[\\?][a-z]', 3),
171 |         ('', 1),
172 |         ('abcAbc', 3),
173 |     ]
174 |     for p_str, num in data:
175 |         ps = parse_pattern_string(p_str)
176 |         assert ''.join([str(u) for u in ps]) == p_str
177 |         assert len(ps) == num
178 | 
179 |     invalid_data = [
180 |         '[a-z',
181 |         'a-z]',
182 |         '[a-z]{-}',
183 |         '[a-z]{-2}',
184 |         '?',
185 |         '[a-z]++',
186 |     ]
187 | 
188 |     for data in invalid_data:
189 |         with pytest.raises(InvalidPatternException):
190 |             parse_pattern_string(data)
191 | 
192 | 
193 | def test_parse_pattern_unit_string():
194 |     data = [
195 |         ('[a-z]', set(['a-z']), 1),
196 |         ('[a-z]+', set(['a-z']), -1),
197 |         ('', set(['']), 1),
198 |         ('[%\\+]{12}', set(['%', '\\+']), 12),
199 |     ]
200 |     for p_str, e_rules, e_num in data:
201 |         rules, num = parse_pattern_unit_string(p_str)
202 |         assert num == e_num
203 |         assert rules == e_rules
204 | 
205 |     invalid_data = [
206 |         '[z-a]',
207 |         '[z-a]{abc}',
208 |         '[z-a]{-1}',
209 |         '[\\._]',
210 |         '[0-9a-z]',
211 |     ]
212 |     for data in invalid_data:
213 |         with pytest.raises(InvalidPatternException):
214 |             parse_pattern_unit_string(data)
215 | 
216 | 
217 | def test_parse_url_pattern_string():
218 |     patterns = [
219 |         ('/AaBb/123456.shtml', '/[A-Za-z]+/[0-9]{6}[\\.]shtml'),
220 |         ('/abc/123/index.html', '/abc/123/index[\\.]html'),
221 |         ('/12345678/index.asp?id=123',
222 |          '/[0-9]{8}/[a-z]+[\\.]asp[\\?]id=[0-9]+'),
223 |         ('/newsShow.asp?dataID=1', '/newsShow[\\.]asp[\\?]dataID=[0-9]+'),
224 |     ]
225 | 
226 |     for url, pattern in patterns:
227 |         url = 'http://example.com' + url
228 |         um1, pieces = analyze_url(url)
229 |         um2, pattern_strings = analyze_url_pattern_string(pattern)
230 |         assert um1 == um2
231 |         for p, s in zip(pattern_strings, pieces):
232 |             assert Pattern(p).match(s)
233 | 
234 | 
235 | def test_digest():
236 |     parser = PieceParser()
237 |     data = [
238 |         ('/abc/', '/abcdef/'),
239 |         ('/abc/index.html?k1=v1&k2=v2', '/abc/html.htm?k1=c01&k2=2m'),
240 |         ('/abc/index.html?k1=v1#abc', '/abc/html.htm?k1=c01#def'),
241 |     ]
242 | 
243 |     for urls in data:
244 |         urls = ['http://example.com' + u for u in urls]
245 |         digests = set()
246 |         for url in urls:
247 |             url_meta, pieces = analyze_url(url)
248 |             parsed_pieces = [parser.parse(piece) for piece in pieces]
249 |             sid = digest(url_meta, [p.fuzzy_rule for p in parsed_pieces])
250 |             assert fuzzy_digest(url_meta, parsed_pieces) == sid
251 |             digests.add(sid)
252 |         assert len(digests) == 1
253 | 


--------------------------------------------------------------------------------
/tests/test_parsed_piece_view.py:
--------------------------------------------------------------------------------
 1 | from os_urlpattern.parsed_piece_view import (FuzzyView, LastDotSplitFuzzyView,
 2 |                                              LengthView, MixedView, MultiView,
 3 |                                              PieceView, view_cls_from_pattern)
 4 | from os_urlpattern.pattern import Pattern
 5 | 
 6 | 
 7 | def test_view_cls_from_pattern():
 8 |     data = [
 9 |         ('abc', PieceView, False),
10 |         ('[a-z]{2}', LengthView, False),
11 |         ('[a-z]+', FuzzyView, False),
12 |         ('abc[A-Z]{2}', MultiView, False),
13 |         ('[A-Za-z]{3}123', MixedView, False),
14 |         ('[A-Za-z]+[\\.]html', LastDotSplitFuzzyView, True),
15 |         ('id[_][0-9A-Za-z]+[\.][a-z]+', MixedView, True),
16 |     ]
17 | 
18 |     for p_str, view_cls, is_last_path in data:
19 |         assert view_cls_from_pattern(Pattern(p_str), is_last_path) == view_cls
20 | 


--------------------------------------------------------------------------------
/tests/test_pattern.py:
--------------------------------------------------------------------------------
 1 | from os_urlpattern.parse_utils import specify_rule, wildcard_rule
 2 | from os_urlpattern.pattern import Pattern, PatternUnit
 3 | 
 4 | 
 5 | def test_equal():
 6 |     p1 = Pattern('[a-z]+')
 7 |     p2 = Pattern('[a-z]+')
 8 |     p3 = Pattern('[a-z]')
 9 |     assert p1 == p2
10 |     assert p1 != p3
11 | 
12 | 
13 | def test_fuzzy_rule():
14 |     data = [
15 |         ('123', '0-9'),
16 |         ('abc', 'a-z'),
17 |         ('a1b2c3', '0-9a-z'),
18 |         ('a1b2c3D4', '0-9A-Za-z'),
19 |         ('a1[\\-]b2[\\-]c3[_]D4', '0-9A-Z\-_a-z'),
20 |         ('[a-z]+', 'a-z'),
21 |     ]
22 | 
23 |     for s, r in data:
24 |         p = Pattern(s)
25 |         assert p.fuzzy_rule == r
26 |         pw = Pattern(wildcard_rule(p.fuzzy_rule))
27 |         assert pw.fuzzy_rule == r
28 |         pn = Pattern(specify_rule(p.fuzzy_rule, 10))
29 |         assert pn.fuzzy_rule == r
30 | 
31 | 
32 | def test_pattern_unit():
33 |     data = [
34 |         ('[a-z]+', 'a-z', -1, False),
35 |         ('[a-z]{3}', 'a-z', 3, False),
36 |         ('abc', 'a-z', 3, True),
37 |         ('[0-9]', '0-9', 1, False),
38 |         ('[\\.]{2}', '\\.', 2, True),
39 |         ('[\\.]', '\\.', 1, True),
40 |         ('[\\._]{2}', '\\._', 2, False),
41 |     ]
42 | 
43 |     for s, fuzzy_rule, num, literal in data:
44 |         pu = PatternUnit(s)
45 |         assert pu.fuzzy_rule == fuzzy_rule
46 |         assert pu.num == num
47 |         assert pu.is_literal() == literal
48 | 


--------------------------------------------------------------------------------
/tests/test_pattern_maker.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from os_urlpattern.config import get_default_config
 4 | from os_urlpattern.parse_utils import pack
 5 | from os_urlpattern.pattern_maker import PatternMaker
 6 | from os_urlpattern.utils import dump_tree
 7 | 
 8 | 
 9 | @pytest.fixture(scope='function')
10 | def config():
11 |     return get_default_config()
12 | 
13 | 
14 | @pytest.fixture(scope='function')
15 | def pattern_maker(config):
16 |     return PatternMaker(config)
17 | 
18 | 
19 | def test_load(config):
20 |     pm = PatternMaker(config)
21 |     urls = ['http://example.com' + u for u in ['/a', '/a/b', '/a/b/c']]
22 |     for url in urls:
23 |         pm.load(url, meta=url)
24 |     assert len(list(pm.makers)) == len(urls)
25 |     for _, clustered in pm.make():
26 |         for nodes in dump_tree(clustered):
27 |             assert len(nodes[-1].meta) == 1
28 | 
29 |     config.set('make', 'drop_url', 'true')
30 |     pm = PatternMaker(config)
31 |     urls = ['http://example.com' + u for u in ['/a', '/b', '/c']]
32 |     for url in urls:
33 |         pm.load(url)
34 |     assert len(list(pm.makers)) == 1
35 |     for _, clustered in pm.make():
36 |         for nodes in dump_tree(clustered):
37 |             assert nodes[-1].meta is None
38 | 
39 | 
40 | def cluster_and_test(urls, pattern_string):
41 |     pm = PatternMaker(get_default_config())
42 |     for url in urls:
43 |         pm.load(url)
44 | 
45 |     for url_meta, clustered in pm.make(combine=True):
46 |         for nodes in dump_tree(clustered):
47 |             assert pack(
48 |                 url_meta, [n.value for n in nodes[1:]]) == pattern_string
49 | 
50 | 
51 | def test_make():
52 |     urls = ['http://example.com' + u for u in ['/a01', '/b02', '/c03']]
53 |     cluster_and_test(urls, '/[a-z][0-9]{2}')
54 |     urls = ['http://example.com' + u for u in ['/3h4hd9s9w9d9',
55 |                                                '/9s2m1m3j2d10', '/i2i2g4g23j0m']]
56 |     cluster_and_test(urls, '/[0-9a-z]{12}')
57 |     urls = [u + '.html' for u in urls]
58 |     cluster_and_test(urls, '/[0-9a-z]{12}[\\.]html')
59 |     urls = [u + '?id=%02d' % i for i, u in enumerate(urls, 1)]
60 |     cluster_and_test(urls, '/[0-9a-z]{12}[\\.]html[\\?]id=[0-9]{2}')
61 | 
62 |     urls = ['http://example.com' + u for u in ['/3h4hd9s9w9ddsadf9',
63 | 
64 |                                                '/9s2m1m3j2d10', '/i2i2g4g23j0dsdm']]
65 |     cluster_and_test(urls, '/[0-9a-z]+')
66 | 


--------------------------------------------------------------------------------
/tests/test_pattern_matcher.py:
--------------------------------------------------------------------------------
 1 | from os_urlpattern.pattern_matcher import PatternMatcher
 2 | 
 3 | 
 4 | def match(patterns, urls, num, most_match=None):
 5 |     pm = PatternMatcher()
 6 |     for pattern in patterns:
 7 |         pm.load(pattern)
 8 |     for url in urls:
 9 |         matched = pm.match(url)
10 |         assert len(matched) > num
11 |         if most_match:
12 |             sorted(matched)
13 |             matched[-1].meta == most_match
14 | 
15 | 
16 | def test_match():
17 |     urls = ['http://example.com/abc%02d' % i for i in range(1, 10)]
18 |     patterns = [
19 |         '/abc[0-9]{2}',
20 |         '/abc[0-9]+',
21 |         '/[a-z]+[0-9]{2}',
22 |         '/[a-z]{3}[0-9]{2}',
23 |         '/[0-9a-z]+',
24 |         '/[0-9a-z]{5}',
25 |     ]
26 |     for pattern in patterns:
27 |         match([pattern], urls, 0)
28 |     match(patterns, urls, 3, '/abc[0-9]{2}')
29 | 


--------------------------------------------------------------------------------
/tests/test_piece_pattern_node.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | from os_urlpattern.parse_utils import (EMPTY_PARSED_PIECE, PieceParser,
 4 |                                        analyze_url)
 5 | from os_urlpattern.piece_pattern_node import (PiecePatternNode,
 6 |                                               build_from_parsed_pieces,
 7 |                                               build_from_piece_pattern_nodes)
 8 | from os_urlpattern.utils import dump_tree, pick
 9 | 
10 | 
11 | def test_count():
12 |     num = 100
13 |     urls = ['http://test.com/abc/%d' % i for i in range(num)]
14 |     parser = PieceParser()
15 |     root = PiecePatternNode((EMPTY_PARSED_PIECE, None))
16 |     for url in urls:
17 |         _, pieces = analyze_url(url)
18 |         parsed_pieces = [parser.parse(piece) for piece in pieces]
19 |         build_from_parsed_pieces(root, parsed_pieces)
20 |     assert root.count == num
21 |     for url in urls:
22 |         _, pieces = analyze_url(url)
23 |         parsed_pieces = [parser.parse(piece) for piece in pieces]
24 |         build_from_parsed_pieces(root, parsed_pieces)
25 |     assert root.count == num
26 |     root01 = PiecePatternNode((EMPTY_PARSED_PIECE, None))
27 |     for nodes in dump_tree(root):
28 |         build_from_piece_pattern_nodes(root01, nodes[1:])
29 |     assert root01.count == num
30 | 
31 |     nodes = pick(dump_tree(root))
32 |     assert nodes[-1].parrent.children_num == num
33 |     assert str(nodes[-1].parrent.pattern) == "abc"
34 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # Tox (https://tox.readthedocs.io/) is a tool for running tests
 2 | # in multiple virtualenvs. This configuration file will run the
 3 | # test suite on all supported python versions. To use it, "pip install tox"
 4 | # and then run "tox" from this directory.
 5 | 
 6 | [tox]
 7 | envlist = py{27,36,py,py3}, coverage-report
 8 | 
 9 | [base]
10 | deps = 
11 |     pytest > 2.10
12 |     coverage
13 |     pytest-env
14 | 
15 | [testenv]
16 | commands = 
17 |     coverage run -m pytest {posargs}
18 | 
19 | deps = 
20 |     {[base]deps}
21 |     six
22 |     ete3
23 | 
24 | [testenv:coverage-report]
25 | deps = coverage
26 | skip_install = true
27 | commands =
28 |     coverage combine
29 |     coverage report
30 | 
31 | [testenv:codecov]
32 | passenv = CI TRAVIS TRAVIS_* APPVEYOR APPVEYOR_*
33 | deps = codecov
34 | skip_install = true
35 | commands =
36 |     coverage combine
37 |     coverage report
38 |     codecov
39 | 
40 | 


--------------------------------------------------------------------------------