├── .gitignore ├── .travis.yml ├── CHANGES.rst ├── LICENSE.txt ├── README.rst ├── setup.py ├── tests ├── __init__.py └── test_url_summary.py ├── tox.ini ├── url-summary-example.png └── url_summary ├── __init__.py └── url_summary.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | dist 4 | *.egg-info 5 | .cache 6 | .coverage 7 | .tox 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | branches: 4 | only: 5 | - master 6 | - /^\d\.\d+$/ 7 | 8 | matrix: 9 | include: 10 | - python: 2.7 11 | env: TOXENV=py27 12 | - python: 3.5 13 | env: TOXENV=py35 14 | - python: 3.6 15 | env: TOXENV=py36 16 | 17 | 18 | install: 19 | - pip install -U pip tox codecov 20 | 21 | script: tox 22 | 23 | after_success: 24 | - codecov 25 | 26 | cache: 27 | directories: 28 | - $HOME/.cache/pip -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | 0.0.4 (2017-05-11) 5 | ------------------ 6 | 7 | - Documentation fixes 8 | 9 | 10 | 0.0.3 (2017-05-11) 11 | ------------------ 12 | 13 | - Add number of unique query key values 14 | - Python 2 support 15 | - Tests and CI on Travis 16 | 17 | 18 | 0.0.2 (2017-05-11) 19 | ------------------ 20 | 21 | - Show query arguments with empty values correctly 22 | 23 | 24 | 0.0.1 (2017-04-13) 25 | ------------------ 26 | 27 | - Initial release 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) url-summary developers. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | url-summary 2 | =========== 3 | 4 | .. image:: https://img.shields.io/pypi/v/url-summary.svg 5 | :target: https://pypi.python.org/pypi/url-summary 6 | :alt: PyPI Version 7 | 8 | .. image:: https://img.shields.io/travis/TeamHG-Memex/url-summary/master.svg 9 | :target: http://travis-ci.org/TeamHG-Memex/url-summary 10 | :alt: Build Status 11 | 12 | .. image:: http://codecov.io/github/TeamHG-Memex/url-summary/coverage.svg?branch=master 13 | :target: http://codecov.io/github/TeamHG-Memex/url-summary?branch=master 14 | :alt: Code Coverage 15 | 16 | Show summary of a large number of URLs in a Jupyter Notebook: analyze domains, paths, query keys and values. 
def read(fname):
    """Return the text content of ``fname``.

    ``fname`` is resolved relative to the directory containing this file,
    so the result does not depend on the current working directory.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path) as f:
        return f.read()
'Programming Language :: Python', 28 | 'Programming Language :: Python :: 2', 29 | 'Programming Language :: Python :: 2.7', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: 3.3', 32 | 'Programming Language :: Python :: 3.4', 33 | 'Programming Language :: Python :: 3.5', 34 | 'Programming Language :: Python :: 3.6', 35 | ], 36 | ) 37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/url-summary/affb4a08d08d1c79d2df40cb318ae40d531e9583/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_url_summary.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import url_summary 4 | 5 | 6 | def test_get_summary(): 7 | urls = [ 8 | 'http://example-one.com', 9 | 'http://example.com', 10 | 'http://example.com/foo', 11 | 'http://example.com/foo/one', 12 | 'http://example.com/foo/two', 13 | 'http://example.com/foo/two?sort=asc', 14 | 'http://example.com/foo/two?sort=asc&page=1', 15 | 'http://example.com/foo/two?sort=asc&page=2', 16 | 'http://example.com/foo/two?sort=asc&page=3', 17 | 'http://example.com/foo/two?sort=desc&page=3', 18 | 'http://example.com/foo/two?page', 19 | 'http://example.com/foo/two?page=', 20 | ] 21 | assert list(url_summary.get_summary(urls, sample=False)) == [ 22 | (('all', ''), 23 | {'len': 12, 24 | 'sample': ['http://example-one.com', 25 | 'http://example.com', 26 | 'http://example.com/foo']}), 27 | (('netloc', 'example.com'), 28 | {'len': 11, 29 | 'sample': ['http://example.com', 30 | 'http://example.com/foo', 31 | 'http://example.com/foo/one']}), 32 | (('path start', '/foo'), 33 | {'len': 10, 34 | 'sample': ['http://example.com/foo', 35 | 'http://example.com/foo/one', 36 | 'http://example.com/foo/two']}), 37 | (('path start', 
'/foo/two'), 38 | {'len': 8, 39 | 'sample': ['http://example.com/foo/two', 40 | 'http://example.com/foo/two?sort=asc', 41 | 'http://example.com/foo/two?sort=asc&page=1']}), 42 | (('query key', '?page'), 43 | {'len': 6, 44 | 'len_v_set': 5, 45 | 'sample': ['http://example.com/foo/two?sort=asc&page=1', 46 | 'http://example.com/foo/two?sort=asc&page=2', 47 | 'http://example.com/foo/two?sort=asc&page=3']}), 48 | (('query key', '?sort'), 49 | {'len': 5, 50 | 'len_v_set': 2, 51 | 'sample': ['http://example.com/foo/two?sort=asc', 52 | 'http://example.com/foo/two?sort=asc&page=1', 53 | 'http://example.com/foo/two?sort=asc&page=2']}), 54 | (('query key=value', '?sort=asc'), 55 | {'len': 4, 56 | 'sample': ['http://example.com/foo/two?sort=asc', 57 | 'http://example.com/foo/two?sort=asc&page=1', 58 | 'http://example.com/foo/two?sort=asc&page=2']}), 59 | (('query key=value', '?page=3'), 60 | {'len': 2, 61 | 'sample': ['http://example.com/foo/two?sort=asc&page=3', 62 | 'http://example.com/foo/two?sort=desc&page=3']}), 63 | (('netloc', 'example-one.com'), 64 | {'len': 1, 'sample': ['http://example-one.com']}), 65 | (('path start', '/foo/one'), 66 | {'len': 1, 'sample': ['http://example.com/foo/one']}), 67 | (('query key=value', '?page='), 68 | {'len': 1, 'sample': ['http://example.com/foo/two?page']}), 69 | (('query key=value', '?page=1'), 70 | {'len': 1, 'sample': ['http://example.com/foo/two?sort=asc&page=1']}), 71 | (('query key=value', '?page=2'), 72 | {'len': 1, 'sample': ['http://example.com/foo/two?sort=asc&page=2']}), 73 | (('query key=value', '?page='), 74 | {'len': 1, 'sample': ['http://example.com/foo/two?page=']}), 75 | (('query key=value', '?sort=desc'), 76 | {'len': 1, 'sample': ['http://example.com/foo/two?sort=desc&page=3']})] 77 | 78 | url_summary.get_summary(urls) 79 | 80 | 81 | def test_render(): 82 | s = url_summary.get_summary(['http://example.com/foo/two?sort=asc']) 83 | 84 | def normalize(html): 85 | html = re.sub(r'id=".*?"', '', html) 86 | html = 
re.sub(r'onclick=".*?"', '', html) 87 | html = re.sub('\s+', ' ', html).strip() 88 | html = re.sub('>\s+', '>', html) 89 | html = re.sub('\s+>', '>', html) 90 | html = re.sub('>', '>\n', html).strip() 91 | return html 92 | 93 | assert (normalize('\n text here') == 94 | '\ntext here') 95 | 96 | print(normalize(s._repr_html_())) 97 | assert normalize(s._repr_html_()) == normalize(''' 98 | 172 | ''') 173 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py35,py36 3 | 4 | [testenv] 5 | deps= 6 | pytest 7 | pytest-cov 8 | 9 | commands= 10 | pip install -e . 11 | py.test --doctest-modules --cov=url_summary {posargs: url_summary tests} 12 | -------------------------------------------------------------------------------- /url-summary-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/url-summary/affb4a08d08d1c79d2df40cb318ae40d531e9583/url-summary-example.png -------------------------------------------------------------------------------- /url_summary/__init__.py: -------------------------------------------------------------------------------- 1 | from .url_summary import get_summary, UrlSummaryResult -------------------------------------------------------------------------------- /url_summary/url_summary.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random 3 | from uuid import uuid4 4 | from six.moves.urllib.parse import ( 5 | urlsplit, parse_qsl, ParseResult, urlunsplit, quote_plus) 6 | from typing import Iterable 7 | 8 | 9 | def get_summary(urls, top_items=20, top_urls=3, sample=True): 10 | # type: (Iterable[str], int, int) -> UrlSummaryResult 11 | """ Return a summary for given list or iterable of ``urls``. 
def _sample(lst, n, seed=42, sample=True):
    """Return at most ``n`` items of ``lst`` as a new list.

    If ``lst`` has ``n`` items or fewer, or ``sample`` is false, the first
    ``n`` items are returned in their original order.  Otherwise ``n`` items
    are drawn with a generator seeded by ``seed``, so repeated calls with
    the same arguments give the same selection.
    """
    if len(lst) <= n or not sample:
        # Slicing always yields a copy, so callers never receive an alias
        # of the list they passed in.
        return lst[:n]
    # A private generator yields the same picks as seeding the module-level
    # one, without resetting the random state of the rest of the program.
    return random.Random(seed).sample(lst, n)
    {}
'.format( 70 | '\n'.join(self._render_sample(field, value, stat) 71 | for (field, value), stat in self)) 72 | 73 | def _render_sample(self, field, value, stat): 74 | el_id = uuid4() 75 | # Using "hidden" class defined by the Jupyter notebook 76 | sample_elements = [self._render_url(url, field, value) for url in stat['sample']] 77 | if stat['len'] > len(sample_elements): 78 | sample_elements.append('…') 79 | return '''\ 80 |
  • 81 | {n:,} {field}: {value}{extra} 88 | 89 |
  • '''.format( 90 | id=el_id, 91 | n=stat['len'], 92 | field=field, 93 | value=value, 94 | extra=(' ({len_v_set:,} unique values)'.format(**stat) 95 | if 'len_v_set' in stat else ''), 96 | sample='\n'.join('
  • {}
  def _highlight(self, url, field, value):
      """Return ``url`` with the part selected by ``field``/``value`` emphasised.

      ``field`` is one of the summary kinds: ``'all'``, ``'netloc'``,
      ``'path start'``, ``'query key'`` or ``'query key=value'``; ``value``
      is the matching summary value (for query fields it starts with ``?``).
      The selected component is passed through the module-level ``_bold``
      helper and the URL is reassembled with ``urlunsplit``.  For the two
      query fields the query string is rebuilt from its parsed pairs, each
      key and value being quoted with ``_quote``.
      """
      if field == 'all':
          # Nothing to emphasise: the URL is shown verbatim.
          return url
      parsed = urlsplit(url)  # type: ParseResult
      scheme, netloc, path, query, fragment = parsed
      if field == 'netloc':
          netloc = _bold(netloc)
      elif field == 'path start':
          cut = len(value)
          # Keep the leading slash explicitly.  urlunsplit only re-inserts a
          # missing one for URLs with a netloc (or a netloc-style scheme), so
          # slicing from index 1 unconditionally dropped the first path
          # character of every other URL.
          lead = '/' if path.startswith('/') else ''
          path = '{}{}{}'.format(
              lead, _bold(path[len(lead):cut]), path[cut:])
      elif field == 'query key':
          wanted_key = value[1:]  # strip the leading '?'
          pairs = []
          for key, val in _parse_qsl(query):
              pairs.append(
                  (_bold(_quote(key), key == wanted_key), _quote(val)))
          query = _urlencode_quoted(pairs)
      elif field == 'query key=value':
          # Split on the first '=' only, so values containing '=' survive.
          wanted_key, wanted_val = value[1:].split('=', 1)
          pairs = []
          for key, val in _parse_qsl(query):
              hit = key == wanted_key and val == wanted_val
              pairs.append(
                  (_bold(_quote(key), hit), _bold(_quote(val), hit)))
          query = _urlencode_quoted(pairs)
      return urlunsplit((scheme, netloc, path, query, fragment))