├── README.md ├── __init__.py ├── .gitignore ├── routekit.py ├── dockit.py ├── LICENSE ├── cpustats.py ├── netstats.py ├── stringkit.py ├── procstats.py ├── pipekit.py ├── markup.py ├── filekit.py ├── favicon.py ├── core.py ├── datakit.py ├── logkit.py ├── imagekit.py ├── timekit.py ├── taskkit.py ├── decorators.py └── urlkit.py /README.md: -------------------------------------------------------------------------------- 1 | python-utils 2 | ============ 3 | 4 | A set of libraries I constantly re-use for a number of projects, so that I have a canonical, always-updated reference/sub-repo I can refer to. 5 | 6 | Some of these were designed to cope with older Python runtimes or to provide smaller, more manageable dependencies for common tasks. 7 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | 14 | log = logging.getLogger() 15 | 16 | # export commonly-used submodule symbols 17 | from utils.core import Struct, Singleton, get_config, tb 18 | from utils.filekit import path_for, locate 19 | from utils.timekit import time_since -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /routekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2014, Rui Carmo 6 | Description: Bottle-specific utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | import json 14 | 15 | log = logging.getLogger() 16 | 17 | def inspect_routes(app): 18 | for route in app.routes: 19 | if 'mountpoint' in route.config: 20 | prefix = route.config['mountpoint']['prefix'] 21 | subapp = route.config['mountpoint']['target'] 22 | 23 | for prefixes, route in inspect_routes(subapp): 24 | yield [prefix] + prefixes, route 25 | else: 26 | yield [], route 27 | 28 | def dump_routes(app): 29 | for prefixes, route in inspect_routes(app): 30 | abs_prefix = '/'.join(part for p in prefixes for part in p.split('/')) 31 | log.warn("Prefix:'%s' Route:'%s' [%s] %s" % (abs_prefix, route.rule, route.method, route.callback)) 32 | -------------------------------------------------------------------------------- /dockit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Docstring utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import logging 11 | import inspect 12 | from bottle import app 13 | 14 | log = logging.getLogger() 15 | 16 | def docs(): 17 | """Gather all docstrings related to routes and return them grouped by module""" 18 | 19 | routes = [] 20 | modules = {} 21 | for route in app().routes: 22 | doc = inspect.getdoc(route.callback) or inspect.getcomments(route.callback) 23 | if not doc: 24 | doc = '' 25 | module = inspect.getmodule(route.callback).__name__ 26 | item = { 27 | 'method': route.method, 28 | 'route': route.rule, 29 | 'function': route.callback.__name__, 30 | 'module': module, 31 | 'doc': inspect.cleandoc(doc) 32 | } 33 | if not module in modules: 34 | modules[module] = [] 35 | modules[module].append(item) 36 | return modules -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Rui Carmo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /cpustats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions for retrieving CPU statistics 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import logging 11 | import time 12 | 13 | log = logging.getLogger() 14 | 15 | 16 | def stats(): 17 | """Retrieves all CPU counters""" 18 | cpu = open('/proc/stat','r').readlines()[0] 19 | return map(float,cpu.split()[1:5]) 20 | 21 | 22 | def usage(interval=0.1): 23 | """Estimates overall CPU usage during a short time interval""" 24 | t1 = stats() 25 | time.sleep(interval) 26 | t2 = stats() 27 | delta = [t2[i] - t1[i] for i in range(len(t1))] 28 | try: 29 | return 1.0 - (delta[-1:].pop()/(sum(delta)*1.0)) 30 | except: 31 | return 0.0 32 | 33 | 34 | def freqency(cpu='cpu0'): 35 | """Retrieves the current CPU speed in MHz - for a single CPU""" 36 | return float(open('/sys/devices/system/cpu/%s/cpufreq/scaling_cur_freq' % cpu,'r').read().strip())/1000.0 37 | 38 | 39 | def temperature(): 40 | """Retrieves the current CPU core temperature in degrees Celsius - tailored to the Raspberry Pi""" 41 | return float(open('/sys/class/thermal/thermal_zone0/temp','r').read().strip())/1000.0 42 | -------------------------------------------------------------------------------- /netstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2013, Rui Carmo 6 | Description: Network utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import re 11 | import logging 12 | import socket 13 | import struct 14 | 15 | log = logging.getLogger() 16 | 17 | 18 | def valid_mac_address(addr): 19 | """Validate a physical Ethernet address""" 20 | return re.match("[0-9a-f]{2}([-:][0-9a-f]{2}){5}$", addr.lower()) 21 | 22 | 23 | def valid_ip_address(addr): 24 | """Quick and dirty way to validate any kind of IP address""" 25 | try: 26 | socket.inet_aton(addr) 27 | return True 28 | except socket.error: 29 | return False 30 | 31 | 32 | def get_net_bytes(dev='eth0'): 33 | """Read network interface traffic counters""" 34 | return { 35 | 'rx': float(open('/sys/class/net/%s/statistics/rx_bytes' % dev,'r').read().strip()), 36 | 'tx': float(open('/sys/class/net/%s/statistics/tx_bytes' % dev,'r').read().strip()) 37 | } 38 | 39 | 40 | def get_mac_address(dev="eth0"): 41 | """Retrieves the MAC address from the /sys virtual filesystem - will only work on Linux.""" 42 | return open('/sys/class/net/%s/address' % dev,'r').read().strip() 43 | 44 | 45 | def get_ip_address(dev="eth0"): 46 | """Retrieves the IP address via SIOCGIFADDR - only tested on Linux.""" 47 | try: 48 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 49 | return socket.inet_ntoa(fcntl.ioctl(s.fileno(),0x8915,struct.pack('256s', dev[:15]))[20:24]) 50 | except: 51 | return None -------------------------------------------------------------------------------- /stringkit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions for retrieving process information 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import sys 11 | import re 12 | import logging 13 | import htmlentitydefs 14 | import unicodedata 15 | 16 | log = logging.getLogger() 17 | 18 | 19 | def rsplit(s, sep=None, maxsplit=-1): 20 | """Equivalent to str.split, except splitting from the right""" 21 | if sys.version_info < (2, 4, 0): 22 | if sep is not None: 23 | sep = sep[::-1] 24 | L = s[::-1].split(sep, maxsplit) 25 | L.reverse() 26 | return [s[::-1] for s in L] 27 | else: 28 | return s.rsplit(sep, maxsplit) 29 | 30 | 31 | def shrink(line, bound=50, rep='[...]'): 32 | """Shrinks a string, adding an ellipsis to the middle""" 33 | l = len(line) 34 | if l < bound: 35 | return line 36 | if bound <= len(rep): 37 | return rep 38 | k = bound - len(rep) 39 | return line[0:k / 2] + rep + line[-k / 2:] 40 | 41 | 42 | def convert_entity(m): 43 | """Converts entities to codepoints where applicable""" 44 | if m.group(1) == '#': 45 | try: 46 | return unichr(int(m.group(2))) 47 | except ValueError: 48 | return '%s;' % m.group(2) 49 | try: 50 | return unichr(htmlentitydefs.name2codepoint[m.group(2)]) 51 | except KeyError: 52 | return '&%s;' % m.group(2) 53 | 54 | 55 | def convert_html(buffer): 56 | """Replaces all entities with codepoints""" 57 | return re.sub(r'&(#?)(.+?);', convertentity, buffer) 58 | 59 | 60 | def munge_string(buffer): 61 | """Builds anchor IDs""" 62 | return re.sub("[\W+]", "-", buffer.lower()) 63 | 64 | 65 | def remove_diacritics(buffer): 66 | """Remove diactritical marks in Latin characters""" 67 | unicodedata.normalize('NFKD', unicode(buffer)).encode('ASCII', 'ignore') 68 | -------------------------------------------------------------------------------- /procstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions for retrieving process information 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import logging 12 | import platform 13 | import __builtin__ 14 | 15 | log = logging.getLogger() 16 | 17 | # Module globals 18 | openfiles = set() 19 | oldfile = __builtin__.file 20 | oldopen = __builtin__.open 21 | patched = False 22 | 23 | 24 | class _file(oldfile): 25 | """File wrapper""" 26 | def __init__(self, *args): 27 | self.x = args[0] 28 | log.debug("FILE OPEN: %s" % str(self.x)) 29 | oldfile.__init__(self, *args) 30 | openfiles.add(self) 31 | 32 | def close(self): 33 | log.debug("FILE CLOSED: %s" % str(self.x)) 34 | oldfile.close(self) 35 | openfiles.remove(self) 36 | 37 | 38 | def _open(*args): 39 | return newfile(*args) 40 | 41 | 42 | def monkeypatch_files(): 43 | """Wraps builtin file operations to allow us to track open files""" 44 | __builtin__.file = _file 45 | __builtin__.open = _open 46 | patched = True 47 | 48 | 49 | def get_open_fd_count(): 50 | if 'Darwin' in platform.platform(): 51 | pid = os.getpid() 52 | procs = subprocess.check_output([ "lsof", '-w', '-Ff', "-p", str(pid)]) 53 | nprocs = len(filter(lambda s: s and s[0] == 'f' and s[1:].isdigit(),procs.split('\n'))) 54 | return nprocs 55 | # check if we've monkeypatched anything 56 | if patched: 57 | return len(get_open_files()) 58 | else: 59 | # Will only work for Linux 60 | return len(os.listdir('/proc/self/fd')) 61 | 62 | 63 | def get_open_files(): 64 | return [f.x for f in openfiles] 65 | 66 | 67 | def stats(pid): 68 | """Retrieve process kernel counters""" 69 | stats = open('/proc/%d/status' % pid,'r').readlines() 70 | return dict(filter(lambda x: len(x)==2,map(lambda x: x.split()[:2],stats))) 71 | 72 | 73 | def rss(pid): 74 | """Retrieve a process' resident set size""" 75 | try: 76 | return int(stats(pid)['VmRSS:']) 77 | except: 78 | return 0 79 | -------------------------------------------------------------------------------- /pipekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Pipeline patterns, mostly taken from itertools recipes 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import itertools 11 | import collections 12 | 13 | def chunk(chunk_size=32): 14 | """Group chunk_size elements into lists""" 15 | 16 | def chunker(gen): 17 | gen = iter(gen) 18 | chunk = [] 19 | try: 20 | while True: 21 | for _ in xrange(chunk_size): 22 | chunk.append(gen.next()) 23 | yield chunk 24 | chunk = [] 25 | except StopIteration: 26 | if chunk: 27 | yield chunk 28 | 29 | return chunker 30 | 31 | 32 | def flatten(gen): 33 | """Flatten a sequence, but only one level deep.""" 34 | 35 | return itertools.chain.from_iterable(gen) 36 | 37 | 38 | 39 | def sink(iter, steps=None): 40 | """Sink data from an iterator, effecting any results from it being consumed.""" 41 | 42 | if steps is None: 43 | # feed the entire iterator into a zero-length deque 44 | collections.deque(iter, maxlen=0) 45 | else: 46 | # advance to the empty slice starting at position 'steps' 47 | next(itertools.islice(iter, steps, steps), None) 48 | 49 | 50 | 51 | def make_unique(seq, transform=None): 52 | """Remove duplicate items from a sequence""" 53 | 54 | if transform is None: 55 | def transform(x): return x 56 | seen = {} 57 | for item in seq: 58 | marker = transform(item) 59 | if marker not in seen: 60 | seen[marker] = True 61 | yield item 62 | 63 | 64 | def pipeline(source, functions): 65 | """Apply an array of functions to a source iterable""" 66 | 67 | return reduce(lambda x, y: y(x), functions, source) 68 | 69 | 70 | if __name__=='__main__': 71 | 72 | def sum(iter): 73 | for i in iter: 74 | yield i + 1 75 | 76 | steps = [ 77 | sum, 78 | chunk(8), 79 | chunk(4) 80 | ] 81 | p = pipeline(xrange(64), steps) 82 | for i in p: 83 | print i 84 | 85 | -------------------------------------------------------------------------------- /markup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Core classes 5 | 6 | Created by Rui Carmo on 2006-09-10. 7 | Published under the MIT license. 8 | """ 9 | 10 | import logging 11 | 12 | log = logging.getLogger() 13 | 14 | 15 | def sanitize_title(title): 16 | """Generate a usable anchor from a title string""" 17 | 18 | return re.sub("[\W+]","-",title.lower()) 19 | 20 | 21 | def parse_rfc822(buffer, mime_type='text/plain'): 22 | """Helper function for parsing metadata out of a plaintext buffer""" 23 | 24 | headers = {} 25 | markup = '' 26 | if mime_type in ['text/plain', 'text/x-textile', 'text/x-markdown']: 27 | try: 28 | (header_lines,markup) = buffer.split("\n\n", 1) 29 | for header in header_lines.strip().split("\n"): 30 | (name, value) = header.strip().split(":", 1) 31 | headers[name.lower().strip()] = unicode(value.strip()) 32 | if 'content-type' in headers: 33 | mime_type = headers['content-type'] 34 | except: 35 | raise TypeError, "Invalid file format." 36 | return headers, markup, mime_type 37 | 38 | 39 | def render_markup(raw, markup=u'text/html'): 40 | """Turn markup into nice HTML""" 41 | 42 | # Allow module to load regardless of textile or markdown support 43 | try: 44 | import textile 45 | import smartypants 46 | import markdown 47 | except ImportError: 48 | pass 49 | 50 | def _markdown(raw): 51 | log.debug("Rendering Markdown") 52 | return markdown.Markdown(extensions=['extra','toc','smarty','codehilite','meta','sane_lists'], safe_mode=False).convert(raw) 53 | 54 | def _plaintext(raw): 55 | log.debug("Rendering plaintext") 56 | return u'
\n%s' % raw 57 | 58 | def _textile(raw): 59 | log.debug("Rendering Textile") 60 | return smartypants.smartyPants(textile.textile(unicode(raw), head_offset=0, validate=0, sanitize=1, encoding='utf-8', output='utf-8')) 61 | 62 | def _html(raw): 63 | return raw 64 | 65 | return { 66 | u'text/plain' : _plaintext, 67 | u'text/x-web-markdown': _markdown, 68 | u'text/x-markdown' : _markdown, 69 | u'text/markdown' : _markdown, 70 | u'text/textile' : _textile, 71 | u'text/x-textile' : _textile, 72 | u'text/html' : _html}[markup](raw) 73 | 74 | -------------------------------------------------------------------------------- /filekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: File utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | import zipfile 14 | 15 | log = logging.getLogger() 16 | 17 | def path_for(name, script=__file__): 18 | """Build absolute paths to resources based on app path""" 19 | 20 | if 'uwsgi' in sys.argv: 21 | return os.path.join(os.path.abspath(os.path.join(os.path.dirname(script),'..')),name) 22 | return os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]),name)) 23 | 24 | 25 | def locate(pattern, root=os.getcwd()): 26 | """Generator for iterating inside a file tree""" 27 | 28 | for path, dirs, files in os.walk(root): 29 | for filename in [os.path.abspath(os.path.join(path, filename)) for filename in files if fnmatch.fnmatch(filename, pattern)]: 30 | yield filename 31 | 32 | 33 | def walk(top, topdown=True, onerror=None, followlinks=False, ziparchive=None, zipdepth=0): 34 | """Reimplementation of os.walk to traverse ZIP files as well""" 35 | 36 | try: 37 | if (os.path.splitext(top)[1]).lower() == '.zip': 38 | if ziparchive: 39 | # skip nested ZIPs. 40 | yield top, [], [] 41 | else: 42 | ziparchive = zipfile.ZipFile(top) 43 | names = list(set(map(lambda x: [p+'/' for p in x.split('/') if p != ""][zipdepth],ziparchive.namelist()))) 44 | else: 45 | names = os.listdir(top) 46 | except error, err: 47 | if onerror is not None: 48 | onerror(err) 49 | return 50 | 51 | dirs, nondirs = [], [] 52 | if ziparchive: 53 | for name in names: 54 | if name == '__MACOSX/': 55 | continue 56 | if name[-1::] == '/': 57 | dirs.append(name) 58 | else: 59 | nondirs.append(name) 60 | else: 61 | for name in names: 62 | if os.path.isdir(os.path.join(top, name)): 63 | dirs.append(name) 64 | else: 65 | nondirs.append(name) 66 | if topdown: 67 | yield top, dirs, nondirs 68 | for name in dirs: 69 | new_path = os.path.join(top, name) 70 | if ziparchive: 71 | for x in walk(new_path, topdown, onerror, followlinks): 72 | yield x 73 | else: 74 | if followlinks or not islink(new_path): 75 | for x in walk(new_path, topdown, onerror, followlinks): 76 | yield x 77 | if not topdown: 78 | yield top, dirs, nondirs -------------------------------------------------------------------------------- /favicon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Favicon retrieval 5 | 6 | Created by: Rui Carmo 7 | License: MIT (see LICENSE for details) 8 | """ 9 | import logging 10 | 11 | log = logging.getLogger() 12 | 13 | import urlparse 14 | from utils.urlkit import fetch, data_uri 15 | from bs4 import BeautifulSoup 16 | 17 | _default = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAAxlBMVEUAAABOWZ5BTZhCTZhHUpt7g7d5gbZ5grZ5grZ6grZsda9sdq9tdq9tdrBtd7Bye7JxerJye7JzfLN0fbNdaKdeaadfaahfaqhha6ldZ6dfaahfaqhjbat3gLV6grZ6grd8hLh/h7mAh7mFjLxfaahgaqlha6libKpjbapRXKBSXKBSXaFTXqFUX6KNmcKXo8idqcujrs6uuNWzvdi5wtu+x96/x97EzOHJ0eXQ1ufV2+vb4O/g5fHm6vXr7/fx9Pv8/f////8y4F8aAAAALnRSTlMACR0dI1BRUVJSiIiIiIi8vb29vdbW1tbW4uLi4uzs7Ozs7Ozx8fHx8f39/f39FstVagAAALBJREFUGBllwUFOw0AMQNFve6Yhk6RFAhZsev9rwRap6iKZtp4kRrCE9+APAZGuvGX8q3oEhtgwHUexYVP2wNByei025qdx8LaF0U1noGWTdlq2VSmlhwgjNht6jPNLcpgU5HGUSyIn1UNWkEbKKCiDBz+EIOGedKpwSOP2aBixP4Pd9hZZP653ZZkrvzzqrWIE3mfRld4/Zw9BrCv9e3hcl+pbGMTaQvb1fpnXPfjnG2UzUabhPViuAAAAAElFTkSuQmCC" 18 | 19 | def google_fetcher(site): 20 | """Fetch the favicon via Google services""" 21 | endpoint = "http://www.google.com/s2/favicons?domain=%s" % urlparse.urlparse(site).hostname 22 | try: 23 | res = fetch(endpoint) 24 | except Exception, e: 25 | log.error("could not fetch %s: %s" % (endpoint, e)) 26 | return None 27 | return data_uri(res['content-type'], res['data']) 28 | 29 | 30 | def dumb_fetcher(site): 31 | """Fetch the favicon the dumb way""" 32 | endpoint = "http://%s/favicon.ico" % urlparse.urlparse(site).hostname 33 | try: 34 | res = fetch(endpoint) 35 | except Exception, e: 36 | log.error("could not fetch %s: %s" % (endpoint, e)) 37 | return None 38 | return data_uri(res['content-type'], res['data']) 39 | 40 | 41 | def html_fetcher(site): 42 | """Fetch the favicon the hard way""" 43 | endpoint = "http://%s" % urlparse.urlparse(site).hostname 44 | try: 45 | res = fetch(endpoint) 46 | except Exception, e: 47 | log.error("Could not fetch %s: %s" % (endpoint, e)) 48 | return None 49 | 50 | try: 51 | soup = BeautifulSoup(res['data']) 52 | except Exception, e: 53 | log.error("Could not parse %s: %s" % (endpoint, e)) 54 | return None 55 | 56 | link = soup.find("link", rel="shortcut icon") 57 | if not link: 58 | return None 59 | url = link['href'] 60 | try: 61 | res = fetch(url) 62 | except Exception, e: 63 | log.error("could not fetch %s: %s" % (endpoint, e)) 64 | return None 65 | return data_uri(res['content-type'], res['data']) 66 | 67 | 68 | def fetch_anyway(site): 69 | global _default 70 | data = None 71 | for handler in [google_fetcher,dumb_fetcher,html_fetcher]: 72 | data = handler(site) 73 | if data: 74 | return data 75 | return _default 76 | -------------------------------------------------------------------------------- /core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Core utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | import json 14 | 15 | log = logging.getLogger() 16 | 17 | from filekit import path_for 18 | 19 | class Singleton(type): 20 | """An implemetation of the Singleton pattern (use as metaclass)""" 21 | 22 | _instances = {} 23 | 24 | def __call__(cls, *args, **kwargs): 25 | if cls not in cls._instances: 26 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 27 | return cls._instances[cls] 28 | 29 | 30 | class Struct(dict): 31 | """An object that recursively builds itself from a dict and allows easy access to attributes""" 32 | 33 | def __init__(self, obj): 34 | dict.__init__(self, obj) 35 | for k, v in obj.iteritems(): 36 | if isinstance(v, dict): 37 | self.__dict__[k] = Struct(v) 38 | else: 39 | self.__dict__[k] = v 40 | 41 | def __getattr__(self, attr): 42 | try: 43 | return self.__dict__[attr] 44 | except KeyError: 45 | raise AttributeError(attr) 46 | 47 | def __setitem__(self, key, value): 48 | super(Struct, self).__setitem__(key, value) 49 | self.__dict__[key] = value 50 | 51 | def __setattr__(self, attr, value): 52 | self.__setitem__(attr, value) 53 | 54 | 55 | def json_str(item, bind_env=True): 56 | """Helper function to cast JSON unicode data to plain str and bind environment variables""" 57 | 58 | if isinstance(item, dict): 59 | return {json_str(key,bind_env=bind_env): json_str(value,bind_env=bind_env) for key, value in item.iteritems()} 60 | elif isinstance(item, list): 61 | return [json_str(element, bind_env=bind_env) for element in item] 62 | elif isinstance(item, unicode) and bind_env: 63 | env = os.environ 64 | env.update({"APPLICATION_ROOT": path_for('')}) 65 | try: 66 | return item.encode('utf-8') % env 67 | except: 68 | return item.encode('utf-8') 69 | else: 70 | return item 71 | 72 | 73 | def get_config(filename=None): 74 | """Parses a configuration file and returns a Struct for managing the configuration""" 75 | 76 | if not filename: 77 | return Struct({}) 78 | return Struct(json.load(open(filename, 'r'),object_hook=json_str)) 79 | 80 | 81 | def safe_eval(buffer): 82 | """Perform safe evaluation of a (very) small subset of Python functions""" 83 | 84 | if '%' == buffer[0]: 85 | try: 86 | return eval(buffer[1:],{"__builtins__":None},{"environ":os.environ}) 87 | except Exception, e: 88 | log.error('Error %s while doing safe_eval of %s' % (e, buffer)) 89 | return None 90 | return buffer 91 | 92 | 93 | def tb(): 94 | """Return a concise traceback summary""" 95 | 96 | etype, value, tb = sys.exc_info() 97 | return "%s: %s (%s@%s:%d)" % (etype.__name__, value, tb.tb_frame.f_code.co_name, os.path.basename(tb.tb_frame.f_code.co_filename), tb.tb_lineno) 98 | -------------------------------------------------------------------------------- /datakit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2013, Rui Carmo 6 | Description: Clustering and statistics helpers 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import logging 11 | import re 12 | 13 | log = logging.getLogger() 14 | 15 | _stopwords = {"en":"i,a,an,are,as,at,be,by,for,from,how,in,is,it,of,on,or,that,the,this,to,was,what,when,where".split(',')} 16 | 17 | 18 | def strip_stopwords(sentence, lang="en"): 19 | """Removes stopwords and normalizes whitespace - adapted from Django""" 20 | 21 | global _stopwords 22 | words = sentence.split() 23 | sentence = [] 24 | for word in words: 25 | if word.lower() not in _stopwords[lang]: 26 | sentence.append(word) 27 | return u' '.join(sentence) 28 | 29 | 30 | def jaccard_distance(a, b): 31 | """A simple distance function based on string overlap - adapted from sample code by Deepak Thukral""" 32 | #Tokenize string into bag of words 33 | feature1 = set(re.findall('\w+', strip_stopwords(a.lower()))[:100]) 34 | feature2 = set(re.findall('\w+', strip_stopwords(b.lower()))[:100]) 35 | similarity = 1.0 * len(feature1.intersection(feature2)) / len(feature1.union(feature2)) 36 | return 1 - similarity 37 | 38 | 39 | 40 | def levenshtein_distance(a, b, limit=None): 41 | """Returns the Levenshtein edit distance between two strings - adapted from Whoosh""" 42 | 43 | a = ''.join(re.findall('\w+', strip_stopwords(a.lower()))) 44 | b = ''.join(re.findall('\w+', strip_stopwords(b.lower()))) 45 | 46 | prev = None 47 | thisrow = range(1, len(b) + 1) + [0] 48 | for x in xrange(len(a)): 49 | # Python lists wrap around for negative indices, so put the 50 | # leftmost column at the *end* of the list. This matches with 51 | # the zero-indexed strings and saves extra calculation. 52 | prev, thisrow = thisrow, [0] * len(b) + [x + 1] 53 | for y in xrange(len(b)): 54 | delcost = prev[y] + 1 55 | addcost = thisrow[y - 1] + 1 56 | subcost = prev[y - 1] + (a[x] != b[y]) 57 | thisrow[y] = min(delcost, addcost, subcost) 58 | 59 | if limit and x > limit and min(thisrow) > limit: 60 | return limit + 1 61 | 62 | return thisrow[len(b) - 1] 63 | 64 | 65 | def damerau_levenshtein_distance(a, b, limit=None): 66 | """Returns the Damerau-Levenshtein edit distance between two strings - adapted from Whoosh""" 67 | 68 | a = ''.join(re.findall('\w+', strip_stopwords(a.lower()))) 69 | b = ''.join(re.findall('\w+', strip_stopwords(b.lower()))) 70 | 71 | oneago = None 72 | thisrow = list(range(1, len(b) + 1)) + [0] 73 | for x in xrange(len(a)): 74 | # Python lists wrap around for negative indices, so put the 75 | # leftmost column at the *end* of the list. This matches with 76 | # the zero-indexed strings and saves extra calculation. 77 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(b) + [x + 1] 78 | for y in xrange(len(b)): 79 | delcost = oneago[y] + 1 80 | addcost = thisrow[y - 1] + 1 81 | subcost = oneago[y - 1] + (a[x] != b[y]) 82 | thisrow[y] = min(delcost, addcost, subcost) 83 | # This block deals with transpositions 84 | if (x > 0 and y > 0 and a[x] == b[y - 1] 85 | and a[x - 1] == b[y] and a[x] != b[y]): 86 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 87 | 88 | if limit and x > limit and min(thisrow) > limit: 89 | return limit + 1 90 | 91 | return thisrow[len(b) - 1] 92 | -------------------------------------------------------------------------------- /logkit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import json 11 | import logging 12 | import os 13 | 14 | log = logging.getLogger() 15 | 16 | from collections import deque 17 | 18 | from pygments import highlight 19 | from pygments.lexers import get_lexer_by_name, guess_lexer 20 | from pygments.formatters import TerminalFormatter, Terminal256Formatter, NullFormatter 21 | 22 | 23 | class InMemoryHandler(logging.Handler): 24 | """In memory logging handler with a circular buffer""" 25 | 26 | def __init__(self, limit=8192): 27 | # run the regular Handler __init__ 28 | logging.Handler.__init__(self) 29 | # Our custom argument 30 | self.limit = limit 31 | self.flush() 32 | 33 | def emit(self, record): 34 | self.records.append(self.format(record)) 35 | 36 | def flush(self): 37 | self.records = deque([], self.limit) 38 | 39 | def dump(self): 40 | return self.records 41 | 42 | 43 | class ColorFormatter(logging.Formatter) : 44 | """Console logging formatter with coloring""" 45 | _colors = { 46 | "DEBUG" : "\033[22;32m", # green 47 | "INFO" : "\033[01;34m", # violet 48 | "WARNING" : "\033[22;35m", # magenta 49 | "ERROR" : "\033[22;31m", # red 50 | "CRITICAL": "\033[01;31m" # bold red 51 | }; 52 | 53 | def format(self, record): 54 | if 'color' in os.environ.get('TERM', ''): 55 | if(self._colors.has_key(record.levelname)): 56 | record.levelname = "%s%s\033[0;0m" % (self._colors[record.levelname], record.levelname) 57 | record.msg = "\033[37m\033[1m%s\033[0;0m" % record.msg 58 | return logging.Formatter.format(self, record) 59 | 60 | 61 | class PygmentsHandler(logging.StreamHandler): 62 | """Console logging handler with syntax highlighting""" 63 | 64 | def __init__(self, stream=None, syntax="guess", encoding='utf-8', style='default'): 65 | # run the regular Handler __init__ 66 | logging.StreamHandler.__init__(self,stream) 67 | self.pformatter = (Terminal256Formatter(encoding=encoding, style=style) 68 | if '256color' in os.environ.get('TERM', '') 69 | else TerminalFormatter(encoding=encoding,style=style)) 70 | if not stream.isatty(): 71 | self.pformatter = NullFormatter 72 | if syntax == "guess": 73 | self.lexer = guess_lexer 74 | else: 75 | self.lexer = get_lexer_by_name(syntax) 76 | 77 | def emit(self, record): 78 | if self.pformatter == NullFormatter: 79 | return 80 | msg = self.format(record) 81 | # Note that the guessing also applies to any log formatting 82 | if self.lexer == guess_lexer: 83 | lexer = guess_lexer(msg) 84 | self.stream.write(highlight(msg,lexer,self.pformatter)) 85 | return 86 | self.stream.write(highlight(msg,self.lexer,self.pformatter)) 87 | 88 | 89 | def json_ansi(item, stream, sort_keys=True, indent=0, separators=(',', ':'), encoding='utf-8', style='default'): 90 | """Helper function to pretty-print JSON via Pygments""" 91 | 92 | formatter = (Terminal256Formatter(encoding=encoding,style=style) 93 | if '256color' in os.environ.get('TERM', '') 94 | else TerminalFormatter(encoding=encoding,style=style)) 95 | if not stream.isatty(): 96 | formatter = NullFormatter 97 | lexer = get_lexer_by_name('json') 98 | stream.write(highlight(json.dumps(item,sort_keys, indent, separators),lexer,formatter)) 99 | -------------------------------------------------------------------------------- /imagekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Image utilities 5 | 6 | Created by: Rui Carmo 7 | License: MIT (see LICENSE for details) 8 | """ 9 | 10 | from operator import itemgetter 11 | 12 | def linear_partition(seq, k): 13 | if k <= 0: 14 | return [] 15 | n = len(seq) - 1 16 | if k > n: 17 | return map(lambda x: [x], seq) 18 | table, solution = linear_partition_table(seq, k) 19 | k, ans = k-2, [] 20 | while k >= 0: 21 | ans = [[seq[i] for i in xrange(solution[n-1][k]+1, n+1)]] + ans 22 | n, k = solution[n-1][k], k-1 23 | return [[seq[i] for i in xrange(0, n+1)]] + ans 24 | 25 | 26 | def linear_partition_table(seq, k): 27 | n = len(seq) 28 | table = [[0] * k for x in xrange(n)] 29 | solution = [[0] * (k-1) for x in xrange(n-1)] 30 | for i in xrange(n): 31 | table[i][0] = seq[i] + (table[i-1][0] if i else 0) 32 | for j in xrange(k): 33 | table[0][j] = seq[0] 34 | for i in xrange(1, n): 35 | for j in xrange(1, k): 36 | table[i][j], solution[i-1][j-1] = min( 37 | ((max(table[x][j-1], table[i][0]-table[x][0]), x) for x in xrange(i)), 38 | key=itemgetter(0)) 39 | return (table, solution) 40 | 41 | 42 | def get_info(data): 43 | """Parses a small buffer and attempts to return basic image metadata""" 44 | 45 | data = str(data) 46 | size = len(data) 47 | height = -1 48 | width = -1 49 | content_type = '' 50 | 51 | # handle GIFs 52 | if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'): 53 | # Check to see if content_type is correct 54 | content_type = 'image/gif' 55 | w, h = struct.unpack("