├── .gitignore
├── .travis.yml
├── CHANGES.rst
├── LICENSE.txt
├── README.rst
├── setup.py
├── tests
    ├── __init__.py
    └── test_url_summary.py
├── tox.ini
├── url-summary-example.png
└── url_summary
    ├── __init__.py
    └── url_summary.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__
3 | dist
4 | *.egg-info
5 | .cache
6 | .coverage
7 | .tox
8 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | sudo: false
 3 | branches:
 4 |     only:
 5 |         - master
 6 |         - /^\d\.\d+$/
 7 | 
 8 | matrix:
 9 |   include:
10 |     - python: 2.7
11 |       env: TOXENV=py27
12 |     - python: 3.5
13 |       env: TOXENV=py35
14 |     - python: 3.6
15 |       env: TOXENV=py36
16 | 
17 | 
18 | install:
19 |     - pip install -U pip tox codecov
20 | 
21 | script: tox
22 | 
23 | after_success:
24 |     - codecov
25 | 
26 | cache:
27 |     directories:
28 |         - $HOME/.cache/pip


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | Changelog
 2 | =========
 3 | 
 4 | 0.0.4 (2017-05-11)
 5 | ------------------
 6 | 
 7 | - Documentation fixes
 8 | 
 9 | 
10 | 0.0.3 (2017-05-11)
11 | ------------------
12 | 
13 | - Add number of unique query key values
14 | - Python 2 support
15 | - Tests and CI on Travis
16 | 
17 | 
18 | 0.0.2 (2017-05-11)
19 | ------------------
20 | 
21 | - Show query arguments with empty values correctly
22 | 
23 | 
24 | 0.0.1 (2017-04-13)
25 | ------------------
26 | 
27 | - Initial release
28 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) url-summary developers.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | url-summary
 2 | ===========
 3 | 
 4 | .. image:: https://img.shields.io/pypi/v/url-summary.svg
 5 |    :target: https://pypi.python.org/pypi/url-summary
 6 |    :alt: PyPI Version
 7 | 
 8 | .. image:: https://img.shields.io/travis/TeamHG-Memex/url-summary/master.svg
 9 |    :target: http://travis-ci.org/TeamHG-Memex/url-summary
10 |    :alt: Build Status
11 | 
12 | .. image:: http://codecov.io/github/TeamHG-Memex/url-summary/coverage.svg?branch=master
13 |    :target: http://codecov.io/github/TeamHG-Memex/url-summary?branch=master
14 |    :alt: Code Coverage
15 | 
16 | Show summary of a large number of URLs in a Jupyter Notebook: analyze domains, paths, query keys and values.
17 | This is useful if you want to have a quick glance at URLs obtained by crawling.
18 | 
19 | .. image:: https://raw.githubusercontent.com/TeamHG-Memex/url-summary/master/url-summary-example.png
20 |    :alt: url-summary example
21 | 
22 | Installation
23 | ------------
24 | 
25 | Install from PyPI::
26 | 
27 |     pip install url-summary
28 | 
29 | 
30 | Usage
31 | -----
32 | 
33 | ::
34 | 
35 |     import url_summary
36 | 
37 |     url_summary.get_summary(urls)
38 | 
 39 | This shows a summary for a given list (or iterable) of urls.
40 | ``top_items`` (20 by default) controls how many top-level items to show,
41 | and ``top_urls`` (3 by default) sets the number of random urls to show
42 | for each top-level item.
43 | 
 44 | The returned object, ``url_summary.UrlSummaryResult``,
45 | is a list subclass with a nice Jupyter Notebook display.
46 | 
47 | 
48 | License
49 | -------
50 | 
51 | License is MIT.
52 | 
53 | ----
54 | 
55 | .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
56 | 	:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=url-summary
57 | 	:alt: define hyperiongray
58 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from setuptools import setup
 3 | 
 4 | 
 5 | def read(fname):
 6 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 7 | 
 8 | 
# Package metadata and distribution settings for url-summary.
setup(
    name='url-summary',
    # NOTE(review): presumably kept in sync with the latest CHANGES.rst entry.
    version='0.0.4',
    author='Konstantin Lopuhin',
    author_email='kostia.lopuhin@gmail.com',
    description='Display a summary of urls in a notebook',
    license='MIT',
    url='https://github.com/TeamHG-Memex/url-summary',
    packages=['url_summary'],
    # Runtime dependencies imported by url_summary/url_summary.py.
    install_requires=[
        'six',     # six.moves.urllib.parse for Python 2/3 compatible imports
        'typing',  # typing.Iterable, referenced from type comments
    ],
    # The README text is used as the long description of the distribution.
    long_description=read('README.rst'),
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)
37 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/url-summary/affb4a08d08d1c79d2df40cb318ae40d531e9583/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_url_summary.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | import url_summary
  4 | 
  5 | 
  6 | def test_get_summary():
  7 |     urls = [
  8 |         'http://example-one.com',
  9 |         'http://example.com',
 10 |         'http://example.com/foo',
 11 |         'http://example.com/foo/one',
 12 |         'http://example.com/foo/two',
 13 |         'http://example.com/foo/two?sort=asc',
 14 |         'http://example.com/foo/two?sort=asc&page=1',
 15 |         'http://example.com/foo/two?sort=asc&page=2',
 16 |         'http://example.com/foo/two?sort=asc&page=3',
 17 |         'http://example.com/foo/two?sort=desc&page=3',
 18 |         'http://example.com/foo/two?page',
 19 |         'http://example.com/foo/two?page=<blink>',
 20 |     ]
 21 |     assert list(url_summary.get_summary(urls, sample=False)) == [
 22 |         (('all', ''),
 23 |          {'len': 12,
 24 |           'sample': ['http://example-one.com',
 25 |                      'http://example.com',
 26 |                      'http://example.com/foo']}),
 27 |         (('netloc', 'example.com'),
 28 |          {'len': 11,
 29 |           'sample': ['http://example.com',
 30 |                      'http://example.com/foo',
 31 |                      'http://example.com/foo/one']}),
 32 |         (('path start', '/foo'),
 33 |          {'len': 10,
 34 |           'sample': ['http://example.com/foo',
 35 |                      'http://example.com/foo/one',
 36 |                      'http://example.com/foo/two']}),
 37 |         (('path start', '/foo/two'),
 38 |          {'len': 8,
 39 |           'sample': ['http://example.com/foo/two',
 40 |                      'http://example.com/foo/two?sort=asc',
 41 |                      'http://example.com/foo/two?sort=asc&page=1']}),
 42 |         (('query key', '?page'),
 43 |          {'len': 6,
 44 |           'len_v_set': 5,
 45 |           'sample': ['http://example.com/foo/two?sort=asc&page=1',
 46 |                      'http://example.com/foo/two?sort=asc&page=2',
 47 |                      'http://example.com/foo/two?sort=asc&page=3']}),
 48 |         (('query key', '?sort'),
 49 |          {'len': 5,
 50 |           'len_v_set': 2,
 51 |           'sample': ['http://example.com/foo/two?sort=asc',
 52 |                      'http://example.com/foo/two?sort=asc&page=1',
 53 |                      'http://example.com/foo/two?sort=asc&page=2']}),
 54 |         (('query key=value', '?sort=asc'),
 55 |          {'len': 4,
 56 |           'sample': ['http://example.com/foo/two?sort=asc',
 57 |                      'http://example.com/foo/two?sort=asc&page=1',
 58 |                      'http://example.com/foo/two?sort=asc&page=2']}),
 59 |         (('query key=value', '?page=3'),
 60 |          {'len': 2,
 61 |           'sample': ['http://example.com/foo/two?sort=asc&page=3',
 62 |                      'http://example.com/foo/two?sort=desc&page=3']}),
 63 |         (('netloc', 'example-one.com'),
 64 |          {'len': 1, 'sample': ['http://example-one.com']}),
 65 |         (('path start', '/foo/one'),
 66 |          {'len': 1, 'sample': ['http://example.com/foo/one']}),
 67 |         (('query key=value', '?page='),
 68 |          {'len': 1, 'sample': ['http://example.com/foo/two?page']}),
 69 |         (('query key=value', '?page=1'),
 70 |          {'len': 1, 'sample': ['http://example.com/foo/two?sort=asc&page=1']}),
 71 |         (('query key=value', '?page=2'),
 72 |          {'len': 1, 'sample': ['http://example.com/foo/two?sort=asc&page=2']}),
 73 |         (('query key=value', '?page=<blink>'),
 74 |          {'len': 1, 'sample': ['http://example.com/foo/two?page=<blink>']}),
 75 |         (('query key=value', '?sort=desc'),
 76 |          {'len': 1, 'sample': ['http://example.com/foo/two?sort=desc&page=3']})]
 77 | 
 78 |     url_summary.get_summary(urls)
 79 | 
 80 | 
 81 | def test_render():
 82 |     s = url_summary.get_summary(['http://example.com/foo/two?sort=asc'])
 83 | 
 84 |     def normalize(html):
 85 |         html = re.sub(r'id=".*?"', '', html)
 86 |         html = re.sub(r'onclick=".*?"', '', html)
 87 |         html = re.sub('\s+', ' ', html).strip()
 88 |         html = re.sub('>\s+', '>', html)
 89 |         html = re.sub('\s+>', '>', html)
 90 |         html = re.sub('>', '>\n', html).strip()
 91 |         return html
 92 | 
 93 |     assert (normalize('<a id="1" foo="bar">\n  text here</a>') ==
 94 |             '<a foo="bar">\ntext here</a>')
 95 | 
 96 |     print(normalize(s._repr_html_()))
 97 |     assert normalize(s._repr_html_()) == normalize('''
 98 |     <ul>
 99 |         <li>
100 |             <span href="#" style="cursor: pointer">
101 |                 1 all: <b></b>
102 |                 <span>&#9658;</span>
103 |             </span>
104 |             <ul class="hidden" style="margin-top: 0">
105 |                 <li><a href="http://example.com/foo/two?sort=asc" target="_blank">
106 |                     http://example.com/foo/two?sort=asc</a>
107 |                 </li>
108 |             </ul>
109 |         </li>
110 |         <li>
111 |             <span href="#" style="cursor: pointer">
112 |                 1 netloc: <b>example.com</b>
113 |                 <span>&#9658;</span>
114 |             </span>
115 |             <ul class="hidden" style="margin-top: 0">
116 |                 <li>
117 |                     <a href="http://example.com/foo/two?sort=asc" target="_blank">
118 |                         http://<b style="color: black">example.com</b>/foo/two?sort=asc</a>
119 |                 </li>
120 |             </ul>
121 |         </li>
122 |         <li>
123 |             <span href="#" style="cursor: pointer">
124 |                 1 path start: <b>/foo</b>
125 |                 <span>&#9658;</span>
126 |             </span>
127 |             <ul class="hidden" style="margin-top: 0">
128 |                 <li>
129 |                     <a href="http://example.com/foo/two?sort=asc" target="_blank">
130 |                         http://example.com/<b style="color: black">foo</b>/two?sort=asc</a>
131 |                 </li>
132 |             </ul>
133 |         </li>
134 |         <li>
135 |             <span href="#" style="cursor: pointer">
136 |                 1 path start: <b>/foo/two</b>
137 |                 <span>&#9658;</span>
138 |             </span>
139 |             <ul class="hidden" style="margin-top: 0">
140 |                 <li>
141 |                     <a href="http://example.com/foo/two?sort=asc" target="_blank">
142 |                         http://example.com/<b style="color: black">foo/two</b>?sort=asc</a>
143 |                 </li>
144 |             </ul>
145 |         </li>
146 |         <li>
147 |             <span href="#" style="cursor: pointer">
148 |                 1 query key: <b>?sort</b> (1 unique values)
149 |                 <span>&#9658;</span>
150 |             </span>
151 |             <ul class="hidden" style="margin-top: 0">
152 |                 <li>
153 |                     <a href="http://example.com/foo/two?sort=asc" target="_blank">
154 |                         http://example.com/foo/two?<b style="color: black">sort</b>=asc</a>
155 |                 </li>
156 |             </ul>
157 |         </li>
158 |         <li>
159 |             <span href="#" style="cursor: pointer">
160 |                 1 query key=value: <b>?sort=asc</b>
161 |                 <span>&#9658;</span>
162 |             </span>
163 |             <ul class="hidden" style="margin-top: 0">
164 |                 <li>
165 |                     <a href="http://example.com/foo/two?sort=asc" target="_blank">
166 |                         http://example.com/foo/two?<b style="color: black">sort</b>=<b style="color: black">asc</b>
167 |                     </a>
168 |                 </li>
169 |             </ul>
170 |         </li>
171 |     </ul>
172 |     ''')
173 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27,py35,py36
 3 | 
 4 | [testenv]
 5 | deps=
 6 |     pytest
 7 |     pytest-cov
 8 | 
 9 | commands=
10 |     pip install -e .
11 |     py.test --doctest-modules --cov=url_summary {posargs: url_summary tests}
12 | 


--------------------------------------------------------------------------------
/url-summary-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/url-summary/affb4a08d08d1c79d2df40cb318ae40d531e9583/url-summary-example.png


--------------------------------------------------------------------------------
/url_summary/__init__.py:
--------------------------------------------------------------------------------
1 | from .url_summary import get_summary, UrlSummaryResult


--------------------------------------------------------------------------------
/url_summary/url_summary.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import random
  3 | from uuid import uuid4
  4 | from six.moves.urllib.parse import (
  5 |     urlsplit, parse_qsl, ParseResult, urlunsplit, quote_plus)
  6 | from typing import Iterable
  7 | 
  8 | 
  9 | def get_summary(urls, top_items=20, top_urls=3, sample=True):
 10 |     # type: (Iterable[str], int, int) -> UrlSummaryResult
 11 |     """ Return a summary for given list or iterable of ``urls``.
 12 |     ``top_items`` (20 by default) controls how many top-level items to show,
 13 |     and ``top_urls`` (3 by default) sets the number of random urls to show
 14 |     for each top-level item.
 15 |     Returns a UrlSummaryResult: a list subclass that has a nice
 16 |     Jupyter Notebook display.
 17 |     """
 18 |     index = defaultdict(list)
 19 |     value_index = defaultdict(set)
 20 |     for url in urls:
 21 |         index['all', ''].append(url)
 22 |         parsed = urlsplit(url)  # type: ParseResult
 23 |         index['netloc', format(parsed.netloc)].append(url)
 24 |         path = parsed.path.rstrip('/').split('/')
 25 |         for i in range(1, len(path)):
 26 |             index['path start', '/'.join(path[: i + 1])].append(url)
 27 |         for k, v in _parse_qsl(parsed.query or ''):
 28 |             index['query key', '?{}'.format(k)].append(url)
 29 |             value_index[k].add(v)
 30 |             index['query key=value', '?{}={}'.format(k, v)].append(url)
 31 |     items = sorted(index.items(), key=lambda x: (-len(x[1]), x[0]))
 32 |     summary = []
 33 |     for k, v in items[:top_items]:
 34 |         stat = {'len': len(v), 'sample': sorted(_sample(v, top_urls, sample=sample))}
 35 |         if k[0] == 'query key':
 36 |             stat['len_v_set'] = len(value_index.get(k[1][1:]))
 37 |         summary.append((k, stat))
 38 |     return UrlSummaryResult(summary)
 39 | 
 40 | 
 41 | def _sample(lst, n, seed=42, sample=True):
 42 |     if len(lst) <= n:
 43 |         return lst
 44 |     elif sample:
 45 |         random.seed(seed)
 46 |         return random.sample(lst, n)
 47 |     else:
 48 |         return lst[:n]
 49 | 
 50 | 
 51 | def _quote(s):
 52 |     return quote_plus(s, safe='/')
 53 | 
 54 | 
 55 | def _parse_qsl(s):
 56 |     return parse_qsl(s, keep_blank_values=True)
 57 | 
 58 | 
 59 | def _bold(x, bold=True):
 60 |     return '<b style="color: black">{}</b>'.format(x) if bold else x
 61 | 
 62 | 
 63 | def _urlencode_quoted(x):
 64 |     return '&'.join('{}={}'.format(k, v) for k, v in x)
 65 | 
 66 | 
 67 | class UrlSummaryResult(list):
 68 |     def _repr_html_(self):
 69 |         return '<ul>{}</ul>'.format(
 70 |             '\n'.join(self._render_sample(field, value, stat)
 71 |                       for (field, value), stat in self))
 72 | 
 73 |     def _render_sample(self, field, value, stat):
 74 |         el_id = uuid4()
 75 |         # Using "hidden" class defined by the Jupyter notebook
 76 |         sample_elements = [self._render_url(url, field, value) for url in stat['sample']]
 77 |         if stat['len'] > len(sample_elements):
 78 |             sample_elements.append('&hellip;')
 79 |         return '''\
 80 |         <li>
 81 |             <span href="#" style="cursor: pointer"
 82 |              onclick="\
 83 |                 var el = document.getElementById('{id}'); \
 84 |                 this.getElementsByTagName('SPAN')[0].textContent = \
 85 |                     el.classList.contains('hidden') ? '&#9660' : '&#9658'; \
 86 |                 el.classList.toggle('hidden')"
 87 |              >{n:,} {field}: <b>{value}</b>{extra} <span>&#9658;</span></span>
 88 |             <ul id="{id}" class="hidden" style="margin-top: 0">{sample}</ul>
 89 |         </li>'''.format(
 90 |             id=el_id,
 91 |             n=stat['len'],
 92 |             field=field,
 93 |             value=value,
 94 |             extra=(' ({len_v_set:,} unique values)'.format(**stat)
 95 |                    if 'len_v_set' in stat else ''),
 96 |             sample='\n'.join('<li>{}</li>'.format(el) for el in sample_elements),
 97 |         )
 98 | 
 99 |     def _render_url(self, url, field, value):
100 |         return '<a href="{href}" target="_blank">{url}</a>'.format(
101 |             href=url, url=self._highlight(url, field, value))
102 | 
103 |     def _highlight(self, url, field, value):
104 |         if field == 'all':
105 |             return url
106 |         parsed = urlsplit(url)  # type: ParseResult
107 |         netloc = parsed.netloc
108 |         path = parsed.path
109 |         query = parsed.query
110 |         if field == 'netloc':
111 |             netloc = _bold(parsed.netloc)
112 |         elif field == 'path start':
113 |             s = len(value)
114 |             path = '{}{}'.format(_bold(parsed.path[1:s]), parsed.path[s:])
115 |         elif field == 'query key':
116 |             key_value = value[1:]
117 |             query = _urlencode_quoted(
118 |                 [(_bold(_quote(k), k == key_value), _quote(v))
119 |                  for k, v in _parse_qsl(query)])
120 |         elif field == 'query key=value':
121 |             key_value, value_value = value[1:].split('=', 1)
122 |             query = _urlencode_quoted(
123 |                 [(_bold(_quote(k), bold), _bold(_quote(v), bold))
124 |                  for bold, k, v in (
125 |                      (k == key_value and v == value_value, k, v)
126 |                      for k, v in _parse_qsl(query))])
127 |         return urlunsplit((parsed.scheme, netloc, path, query, parsed.fragment))
128 | 


--------------------------------------------------------------------------------