├── README.md ├── __init__.py ├── .gitignore ├── routekit.py ├── dockit.py ├── LICENSE ├── cpustats.py ├── netstats.py ├── stringkit.py ├── procstats.py ├── pipekit.py ├── markup.py ├── filekit.py ├── favicon.py ├── core.py ├── datakit.py ├── logkit.py ├── imagekit.py ├── timekit.py ├── taskkit.py ├── decorators.py └── urlkit.py /README.md: -------------------------------------------------------------------------------- 1 | python-utils 2 | ============ 3 | 4 | A set of libraries I constantly re-use for a number of projects, so that I have a canonical, always-updated reference/sub-repo I can refer to. 5 | 6 | Some of these were designed to cope with older Python runtimes or to provide smaller, more manageable dependencies for common tasks. 7 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | 14 | log = logging.getLogger() 15 | 16 | # export commonly-used submodule symbols 17 | from utils.core import Struct, Singleton, get_config, tb 18 | from utils.filekit import path_for, locate 19 | from utils.timekit import time_since -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /routekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2014, Rui Carmo 6 | Description: Bottle-specific utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | import json 14 | 15 | log = logging.getLogger() 16 | 17 | def inspect_routes(app): 18 | for route in app.routes: 19 | if 'mountpoint' in route.config: 20 | prefix = route.config['mountpoint']['prefix'] 21 | subapp = route.config['mountpoint']['target'] 22 | 23 | for prefixes, route in inspect_routes(subapp): 24 | yield [prefix] + prefixes, route 25 | else: 26 | yield [], route 27 | 28 | def dump_routes(app): 29 | for prefixes, route in inspect_routes(app): 30 | abs_prefix = '/'.join(part for p in prefixes for part in p.split('/')) 31 | log.warn("Prefix:'%s' Route:'%s' [%s] %s" % (abs_prefix, route.rule, route.method, route.callback)) 32 | 
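For reference, a minimal usage sketch for the two routekit helpers above (the app and route names are illustrative, and this assumes a Bottle version whose mount() records 'mountpoint' metadata in route.config, which is what inspect_routes() reads):

    from bottle import Bottle
    from routekit import dump_routes

    parent = Bottle()
    api = Bottle()

    @api.route('/status')
    def status():
        return 'ok'

    parent.mount('/api/', api)  # stores {'prefix': ..., 'target': api} in the route config
    dump_routes(parent)         # logs e.g. Prefix:'api' Route:'/status' [GET] <function status ...>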
-------------------------------------------------------------------------------- /dockit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Docstring utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import logging 11 | import inspect 12 | from bottle import app 13 | 14 | log = logging.getLogger() 15 | 16 | def docs(): 17 | """Gather all docstrings related to routes and return them grouped by module""" 18 | 19 | routes = [] 20 | modules = {} 21 | for route in app().routes: 22 | doc = inspect.getdoc(route.callback) or inspect.getcomments(route.callback) 23 | if not doc: 24 | doc = '' 25 | module = inspect.getmodule(route.callback).__name__ 26 | item = { 27 | 'method': route.method, 28 | 'route': route.rule, 29 | 'function': route.callback.__name__, 30 | 'module': module, 31 | 'doc': inspect.cleandoc(doc) 32 | } 33 | if not module in modules: 34 | modules[module] = [] 35 | modules[module].append(item) 36 | return modules -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Rui Carmo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
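As a usage note for dockit.py above, a minimal sketch (the route and docstring are illustrative; it assumes routes were registered on Bottle's default app, which is what bottle.app() returns):

    from bottle import route
    import dockit

    @route('/hello')
    def hello():
        """Say hello"""
        return 'hello'

    print dockit.docs()
    # e.g. {'__main__': [{'method': 'GET', 'route': '/hello', 'function': 'hello',
    #                     'module': '__main__', 'doc': 'Say hello'}]}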
--------------------------------------------------------------------------------
/cpustats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Copyright (c) 2012, Rui Carmo
6 | Description: Utility functions for retrieving CPU statistics
7 | License: MIT (see LICENSE.md for details)
8 | """
9 | 
10 | import logging
11 | import time
12 | 
13 | log = logging.getLogger()
14 | 
15 | 
16 | def stats():
17 |     """Retrieves all CPU counters"""
18 |     cpu = open('/proc/stat','r').readlines()[0]
19 |     return map(float,cpu.split()[1:5])
20 | 
21 | 
22 | def usage(interval=0.1):
23 |     """Estimates overall CPU usage during a short time interval"""
24 |     t1 = stats()
25 |     time.sleep(interval)
26 |     t2 = stats()
27 |     delta = [t2[i] - t1[i] for i in range(len(t1))]
28 |     try:
29 |         return 1.0 - (delta[-1:].pop()/(sum(delta)*1.0))
30 |     except:
31 |         return 0.0
32 | 
33 | 
34 | def frequency(cpu='cpu0'):
35 |     """Retrieves the current CPU speed in MHz - for a single CPU"""
36 |     return float(open('/sys/devices/system/cpu/%s/cpufreq/scaling_cur_freq' % cpu,'r').read().strip())/1000.0
37 | 
38 | 
39 | def temperature():
40 |     """Retrieves the current CPU core temperature in degrees Celsius - tailored to the Raspberry Pi"""
41 |     return float(open('/sys/class/thermal/thermal_zone0/temp','r').read().strip())/1000.0
--------------------------------------------------------------------------------
/netstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Copyright (c) 2013, Rui Carmo
6 | Description: Network utility functions
7 | License: MIT (see LICENSE.md for details)
8 | """
9 | 
10 | import re
11 | import logging
12 | import socket
13 | import struct
14 | import fcntl
15 | 
16 | log = logging.getLogger()
17 | 
18 | 
19 | def valid_mac_address(addr):
20 |     """Validate a physical Ethernet address"""
21 |     return re.match("[0-9a-f]{2}([-:][0-9a-f]{2}){5}$", addr.lower())
22 | 
23 | 
24 | def valid_ip_address(addr):
25 |     """Quick and dirty way to validate any kind of IP address"""
26 |     try:
27 |         socket.inet_aton(addr)
28 |         return True
29 |     except socket.error:
30 |         return False
31 | 
32 | 
33 | def get_net_bytes(dev='eth0'):
34 |     """Read network interface traffic counters"""
35 |     return {
36 |         'rx': float(open('/sys/class/net/%s/statistics/rx_bytes' % dev,'r').read().strip()),
37 |         'tx': float(open('/sys/class/net/%s/statistics/tx_bytes' % dev,'r').read().strip())
38 |     }
39 | 
40 | 
41 | def get_mac_address(dev="eth0"):
42 |     """Retrieves the MAC address from the /sys virtual filesystem - will only work on Linux."""
43 |     return open('/sys/class/net/%s/address' % dev,'r').read().strip()
44 | 
45 | 
46 | def get_ip_address(dev="eth0"):
47 |     """Retrieves the IP address via SIOCGIFADDR - only tested on Linux."""
48 |     try:
49 |         s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
50 |         return socket.inet_ntoa(fcntl.ioctl(s.fileno(),0x8915,struct.pack('256s', dev[:15]))[20:24])
51 |     except:
52 |         return None
--------------------------------------------------------------------------------
/stringkit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Copyright (c) 2012, Rui Carmo
6 | Description: String utility functions
7 | License: MIT (see LICENSE.md for details)
8 | """
9 | 
10 | import sys
11 | import re
12 | import logging
13 | import htmlentitydefs
14 | import unicodedata
15 | 
16 | log = logging.getLogger()
17 | 
18 | 
19 | def rsplit(s, sep=None, maxsplit=-1):
20 |     """Equivalent to str.split, except splitting from the right"""
21 |     if sys.version_info < (2, 4, 0):
22 |         if sep is not None:
23 |             sep = sep[::-1]
24 |         L = s[::-1].split(sep, maxsplit)
25 |         L.reverse()
26 |         return [s[::-1] for s in L]
27 |     else:
28 |         return s.rsplit(sep, maxsplit)
29 | 
30 | 
31 | def shrink(line, bound=50, rep='[...]'):
32 |     """Shrinks a string, adding an ellipsis to the middle"""
33 |     l = len(line)
34 |     if l < bound:
35 |         return line
36 |     if bound <= len(rep):
37 |         return rep
38 |     k = bound - len(rep)
39 |     return line[0:k / 2] + rep + line[-k / 2:]
40 | 
41 | 
42 | def convert_entity(m):
43 |     """Converts entities to codepoints where applicable"""
44 |     if m.group(1) == '#':
45 |         try:
46 |             return unichr(int(m.group(2)))
47 |         except ValueError:
48 |             return '&#%s;' % m.group(2)
49 |     try:
50 |         return unichr(htmlentitydefs.name2codepoint[m.group(2)])
51 |     except KeyError:
52 |         return '&%s;' % m.group(2)
53 | 
54 | 
55 | def convert_html(buffer):
56 |     """Replaces all entities with codepoints"""
57 |     return re.sub(r'&(#?)(.+?);', convert_entity, buffer)
58 | 
59 | 
60 | def munge_string(buffer):
61 |     """Builds anchor IDs"""
62 |     return re.sub("[\W+]", "-", buffer.lower())
63 | 
64 | 
65 | def remove_diacritics(buffer):
66 |     """Remove diacritical marks in Latin characters"""
67 |     return unicodedata.normalize('NFKD', unicode(buffer)).encode('ASCII', 'ignore')
--------------------------------------------------------------------------------
/procstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Copyright (c) 2012, Rui Carmo
6 | Description: Utility functions for retrieving process information
7 | License: MIT (see LICENSE.md for details)
8 | """
9 | 
10 | import os
11 | import logging
12 | import platform
13 | import subprocess
14 | import __builtin__
15 | 
16 | log = logging.getLogger()
17 | 
18 | # Module globals
19 | openfiles = set()
20 | oldfile = __builtin__.file
21 | oldopen = __builtin__.open
22 | patched = False
23 | 
24 | class _file(oldfile):
25 |     """File wrapper"""
26 |     def __init__(self, *args):
27 |         self.x = args[0]
28 |         log.debug("FILE OPEN: %s" % str(self.x))
29 |         oldfile.__init__(self, *args)
30 |         openfiles.add(self)
31 | 
32 |     def close(self):
33 |         log.debug("FILE CLOSED: %s" % str(self.x))
34 |         oldfile.close(self)
35 |         openfiles.remove(self)
36 | 
37 | 
38 | def _open(*args):
39 |     return _file(*args)
40 | 
41 | 
42 | def monkeypatch_files():
43 |     """Wraps builtin file operations to allow us to track open files"""
44 |     global patched
45 |     __builtin__.file = _file
46 |     __builtin__.open = _open
47 |     patched = True
48 | 
49 | def get_open_fd_count():
50 |     if 'Darwin' in platform.platform():
51 |         pid = os.getpid()
52 |         procs = subprocess.check_output(["lsof", '-w', '-Ff', "-p", str(pid)])
53 |         nprocs = len(filter(lambda s: s and s[0] == 'f' and s[1:].isdigit(),procs.split('\n')))
54 |         return nprocs
55 |     # check if we've monkeypatched anything
56 |     if patched:
57 |         return len(get_open_files())
58 |     else:
59 |         # Will only work for Linux
60 |         return len(os.listdir('/proc/self/fd'))
61 | 
62 | 
63 | def get_open_files():
64 |     return [f.x for f in openfiles]
65 | 
66 | 
67 | def stats(pid):
68 |     """Retrieve process kernel counters"""
69 |     stats = open('/proc/%d/status' % pid,'r').readlines()
70 |     return dict(filter(lambda x: len(x)==2,map(lambda x: x.split()[:2],stats)))
71 | 
72 | 
73 | def
rss(pid): 74 | """Retrieve a process' resident set size""" 75 | try: 76 | return int(stats(pid)['VmRSS:']) 77 | except: 78 | return 0 79 | -------------------------------------------------------------------------------- /pipekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Pipeline patterns, mostly taken from itertools recipes 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import itertools 11 | import collections 12 | 13 | def chunk(chunk_size=32): 14 | """Group chunk_size elements into lists""" 15 | 16 | def chunker(gen): 17 | gen = iter(gen) 18 | chunk = [] 19 | try: 20 | while True: 21 | for _ in xrange(chunk_size): 22 | chunk.append(gen.next()) 23 | yield chunk 24 | chunk = [] 25 | except StopIteration: 26 | if chunk: 27 | yield chunk 28 | 29 | return chunker 30 | 31 | 32 | def flatten(gen): 33 | """Flatten a sequence, but only one level deep.""" 34 | 35 | return itertools.chain.from_iterable(gen) 36 | 37 | 38 | 39 | def sink(iter, steps=None): 40 | """Sink data from an iterator, effecting any results from it being consumed.""" 41 | 42 | if steps is None: 43 | # feed the entire iterator into a zero-length deque 44 | collections.deque(iter, maxlen=0) 45 | else: 46 | # advance to the empty slice starting at position 'steps' 47 | next(itertools.islice(iter, steps, steps), None) 48 | 49 | 50 | 51 | def make_unique(seq, transform=None): 52 | """Remove duplicate items from a sequence""" 53 | 54 | if transform is None: 55 | def transform(x): return x 56 | seen = {} 57 | for item in seq: 58 | marker = transform(item) 59 | if marker not in seen: 60 | seen[marker] = True 61 | yield item 62 | 63 | 64 | def pipeline(source, functions): 65 | """Apply an array of functions to a source iterable""" 66 | 67 | return reduce(lambda x, y: y(x), functions, source) 68 | 69 | 70 | if __name__=='__main__': 71 | 72 | def sum(iter): 73 | for i in iter: 74 | yield i + 1 75 | 76 | steps = [ 77 | sum, 78 | chunk(8), 79 | chunk(4) 80 | ] 81 | p = pipeline(xrange(64), steps) 82 | for i in p: 83 | print i 84 | 85 | -------------------------------------------------------------------------------- /markup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Core classes 5 | 6 | Created by Rui Carmo on 2006-09-10. 7 | Published under the MIT license. 8 | """ 9 | 10 | import logging 11 | 12 | log = logging.getLogger() 13 | 14 | 15 | def sanitize_title(title): 16 | """Generate a usable anchor from a title string""" 17 | 18 | return re.sub("[\W+]","-",title.lower()) 19 | 20 | 21 | def parse_rfc822(buffer, mime_type='text/plain'): 22 | """Helper function for parsing metadata out of a plaintext buffer""" 23 | 24 | headers = {} 25 | markup = '' 26 | if mime_type in ['text/plain', 'text/x-textile', 'text/x-markdown']: 27 | try: 28 | (header_lines,markup) = buffer.split("\n\n", 1) 29 | for header in header_lines.strip().split("\n"): 30 | (name, value) = header.strip().split(":", 1) 31 | headers[name.lower().strip()] = unicode(value.strip()) 32 | if 'content-type' in headers: 33 | mime_type = headers['content-type'] 34 | except: 35 | raise TypeError, "Invalid file format." 
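    # (the metadata block is a set of "Name: value" lines separated from
    #  the markup body by a single blank line, RFC-2822 style)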
36 | return headers, markup, mime_type 37 | 38 | 39 | def render_markup(raw, markup=u'text/html'): 40 | """Turn markup into nice HTML""" 41 | 42 | # Allow module to load regardless of textile or markdown support 43 | try: 44 | import textile 45 | import smartypants 46 | import markdown 47 | except ImportError: 48 | pass 49 | 50 | def _markdown(raw): 51 | log.debug("Rendering Markdown") 52 | return markdown.Markdown(extensions=['extra','toc','smarty','codehilite','meta','sane_lists'], safe_mode=False).convert(raw) 53 | 54 | def _plaintext(raw): 55 | log.debug("Rendering plaintext") 56 | return u'
<pre>\n%s</pre>
' % raw 57 | 58 | def _textile(raw): 59 | log.debug("Rendering Textile") 60 | return smartypants.smartyPants(textile.textile(unicode(raw), head_offset=0, validate=0, sanitize=1, encoding='utf-8', output='utf-8')) 61 | 62 | def _html(raw): 63 | return raw 64 | 65 | return { 66 | u'text/plain' : _plaintext, 67 | u'text/x-web-markdown': _markdown, 68 | u'text/x-markdown' : _markdown, 69 | u'text/markdown' : _markdown, 70 | u'text/textile' : _textile, 71 | u'text/x-textile' : _textile, 72 | u'text/html' : _html}[markup](raw) 73 | 74 | -------------------------------------------------------------------------------- /filekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: File utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | import zipfile 14 | 15 | log = logging.getLogger() 16 | 17 | def path_for(name, script=__file__): 18 | """Build absolute paths to resources based on app path""" 19 | 20 | if 'uwsgi' in sys.argv: 21 | return os.path.join(os.path.abspath(os.path.join(os.path.dirname(script),'..')),name) 22 | return os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]),name)) 23 | 24 | 25 | def locate(pattern, root=os.getcwd()): 26 | """Generator for iterating inside a file tree""" 27 | 28 | for path, dirs, files in os.walk(root): 29 | for filename in [os.path.abspath(os.path.join(path, filename)) for filename in files if fnmatch.fnmatch(filename, pattern)]: 30 | yield filename 31 | 32 | 33 | def walk(top, topdown=True, onerror=None, followlinks=False, ziparchive=None, zipdepth=0): 34 | """Reimplementation of os.walk to traverse ZIP files as well""" 35 | 36 | try: 37 | if (os.path.splitext(top)[1]).lower() == '.zip': 38 | if ziparchive: 39 | # skip nested ZIPs. 
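                # (the nested archive is reported as a plain leaf entry
                #  instead of being opened recursively)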
40 | yield top, [], [] 41 | else: 42 | ziparchive = zipfile.ZipFile(top) 43 | names = list(set(map(lambda x: [p+'/' for p in x.split('/') if p != ""][zipdepth],ziparchive.namelist()))) 44 | else: 45 | names = os.listdir(top) 46 | except error, err: 47 | if onerror is not None: 48 | onerror(err) 49 | return 50 | 51 | dirs, nondirs = [], [] 52 | if ziparchive: 53 | for name in names: 54 | if name == '__MACOSX/': 55 | continue 56 | if name[-1::] == '/': 57 | dirs.append(name) 58 | else: 59 | nondirs.append(name) 60 | else: 61 | for name in names: 62 | if os.path.isdir(os.path.join(top, name)): 63 | dirs.append(name) 64 | else: 65 | nondirs.append(name) 66 | if topdown: 67 | yield top, dirs, nondirs 68 | for name in dirs: 69 | new_path = os.path.join(top, name) 70 | if ziparchive: 71 | for x in walk(new_path, topdown, onerror, followlinks): 72 | yield x 73 | else: 74 | if followlinks or not islink(new_path): 75 | for x in walk(new_path, topdown, onerror, followlinks): 76 | yield x 77 | if not topdown: 78 | yield top, dirs, nondirs -------------------------------------------------------------------------------- /favicon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Favicon retrieval 5 | 6 | Created by: Rui Carmo 7 | License: MIT (see LICENSE for details) 8 | """ 9 | import logging 10 | 11 | log = logging.getLogger() 12 | 13 | import urlparse 14 | from utils.urlkit import fetch, data_uri 15 | from bs4 import BeautifulSoup 16 | 17 | _default = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAAxlBMVEUAAABOWZ5BTZhCTZhHUpt7g7d5gbZ5grZ5grZ6grZsda9sdq9tdq9tdrBtd7Bye7JxerJye7JzfLN0fbNdaKdeaadfaahfaqhha6ldZ6dfaahfaqhjbat3gLV6grZ6grd8hLh/h7mAh7mFjLxfaahgaqlha6libKpjbapRXKBSXKBSXaFTXqFUX6KNmcKXo8idqcujrs6uuNWzvdi5wtu+x96/x97EzOHJ0eXQ1ufV2+vb4O/g5fHm6vXr7/fx9Pv8/f////8y4F8aAAAALnRSTlMACR0dI1BRUVJSiIiIiIi8vb29vdbW1tbW4uLi4uzs7Ozs7Ozx8fHx8f39/f39FstVagAAALBJREFUGBllwUFOw0AMQNFve6Yhk6RFAhZsev9rwRap6iKZtp4kRrCE9+APAZGuvGX8q3oEhtgwHUexYVP2wNByei025qdx8LaF0U1noGWTdlq2VSmlhwgjNht6jPNLcpgU5HGUSyIn1UNWkEbKKCiDBz+EIOGedKpwSOP2aBixP4Pd9hZZP653ZZkrvzzqrWIE3mfRld4/Zw9BrCv9e3hcl+pbGMTaQvb1fpnXPfjnG2UzUabhPViuAAAAAElFTkSuQmCC" 18 | 19 | def google_fetcher(site): 20 | """Fetch the favicon via Google services""" 21 | endpoint = "http://www.google.com/s2/favicons?domain=%s" % urlparse.urlparse(site).hostname 22 | try: 23 | res = fetch(endpoint) 24 | except Exception, e: 25 | log.error("could not fetch %s: %s" % (endpoint, e)) 26 | return None 27 | return data_uri(res['content-type'], res['data']) 28 | 29 | 30 | def dumb_fetcher(site): 31 | """Fetch the favicon the dumb way""" 32 | endpoint = "http://%s/favicon.ico" % urlparse.urlparse(site).hostname 33 | try: 34 | res = fetch(endpoint) 35 | except Exception, e: 36 | log.error("could not fetch %s: %s" % (endpoint, e)) 37 | return None 38 | return data_uri(res['content-type'], res['data']) 39 | 40 | 41 | def html_fetcher(site): 42 | """Fetch the favicon the hard way""" 43 | endpoint = "http://%s" % urlparse.urlparse(site).hostname 44 | try: 45 | res = fetch(endpoint) 46 | except Exception, e: 47 | log.error("Could not fetch %s: %s" % (endpoint, e)) 48 | return None 49 | 50 | try: 51 | soup = BeautifulSoup(res['data']) 52 | except Exception, e: 53 | log.error("Could not parse %s: %s" % (endpoint, e)) 54 | return None 55 | 56 | link = soup.find("link", rel="shortcut icon") 57 | if not link: 58 | return None 59 | url = link['href'] 60 | try: 61 | res = 
fetch(url) 62 | except Exception, e: 63 | log.error("could not fetch %s: %s" % (endpoint, e)) 64 | return None 65 | return data_uri(res['content-type'], res['data']) 66 | 67 | 68 | def fetch_anyway(site): 69 | global _default 70 | data = None 71 | for handler in [google_fetcher,dumb_fetcher,html_fetcher]: 72 | data = handler(site) 73 | if data: 74 | return data 75 | return _default 76 | -------------------------------------------------------------------------------- /core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Core utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import os 11 | import sys 12 | import logging 13 | import json 14 | 15 | log = logging.getLogger() 16 | 17 | from filekit import path_for 18 | 19 | class Singleton(type): 20 | """An implemetation of the Singleton pattern (use as metaclass)""" 21 | 22 | _instances = {} 23 | 24 | def __call__(cls, *args, **kwargs): 25 | if cls not in cls._instances: 26 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 27 | return cls._instances[cls] 28 | 29 | 30 | class Struct(dict): 31 | """An object that recursively builds itself from a dict and allows easy access to attributes""" 32 | 33 | def __init__(self, obj): 34 | dict.__init__(self, obj) 35 | for k, v in obj.iteritems(): 36 | if isinstance(v, dict): 37 | self.__dict__[k] = Struct(v) 38 | else: 39 | self.__dict__[k] = v 40 | 41 | def __getattr__(self, attr): 42 | try: 43 | return self.__dict__[attr] 44 | except KeyError: 45 | raise AttributeError(attr) 46 | 47 | def __setitem__(self, key, value): 48 | super(Struct, self).__setitem__(key, value) 49 | self.__dict__[key] = value 50 | 51 | def __setattr__(self, attr, value): 52 | self.__setitem__(attr, value) 53 | 54 | 55 | def json_str(item, bind_env=True): 56 | """Helper function to cast JSON unicode data to plain str and bind environment variables""" 57 | 58 | if isinstance(item, dict): 59 | return {json_str(key,bind_env=bind_env): json_str(value,bind_env=bind_env) for key, value in item.iteritems()} 60 | elif isinstance(item, list): 61 | return [json_str(element, bind_env=bind_env) for element in item] 62 | elif isinstance(item, unicode) and bind_env: 63 | env = os.environ 64 | env.update({"APPLICATION_ROOT": path_for('')}) 65 | try: 66 | return item.encode('utf-8') % env 67 | except: 68 | return item.encode('utf-8') 69 | else: 70 | return item 71 | 72 | 73 | def get_config(filename=None): 74 | """Parses a configuration file and returns a Struct for managing the configuration""" 75 | 76 | if not filename: 77 | return Struct({}) 78 | return Struct(json.load(open(filename, 'r'),object_hook=json_str)) 79 | 80 | 81 | def safe_eval(buffer): 82 | """Perform safe evaluation of a (very) small subset of Python functions""" 83 | 84 | if '%' == buffer[0]: 85 | try: 86 | return eval(buffer[1:],{"__builtins__":None},{"environ":os.environ}) 87 | except Exception, e: 88 | log.error('Error %s while doing safe_eval of %s' % (e, buffer)) 89 | return None 90 | return buffer 91 | 92 | 93 | def tb(): 94 | """Return a concise traceback summary""" 95 | 96 | etype, value, tb = sys.exc_info() 97 | return "%s: %s (%s@%s:%d)" % (etype.__name__, value, tb.tb_frame.f_code.co_name, os.path.basename(tb.tb_frame.f_code.co_filename), tb.tb_lineno) 98 | -------------------------------------------------------------------------------- /datakit.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2013, Rui Carmo 6 | Description: Clustering and statistics helpers 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import logging 11 | import re 12 | 13 | log = logging.getLogger() 14 | 15 | _stopwords = {"en":"i,a,an,are,as,at,be,by,for,from,how,in,is,it,of,on,or,that,the,this,to,was,what,when,where".split(',')} 16 | 17 | 18 | def strip_stopwords(sentence, lang="en"): 19 | """Removes stopwords and normalizes whitespace - adapted from Django""" 20 | 21 | global _stopwords 22 | words = sentence.split() 23 | sentence = [] 24 | for word in words: 25 | if word.lower() not in _stopwords[lang]: 26 | sentence.append(word) 27 | return u' '.join(sentence) 28 | 29 | 30 | def jaccard_distance(a, b): 31 | """A simple distance function based on string overlap - adapted from sample code by Deepak Thukral""" 32 | #Tokenize string into bag of words 33 | feature1 = set(re.findall('\w+', strip_stopwords(a.lower()))[:100]) 34 | feature2 = set(re.findall('\w+', strip_stopwords(b.lower()))[:100]) 35 | similarity = 1.0 * len(feature1.intersection(feature2)) / len(feature1.union(feature2)) 36 | return 1 - similarity 37 | 38 | 39 | 40 | def levenshtein_distance(a, b, limit=None): 41 | """Returns the Levenshtein edit distance between two strings - adapted from Whoosh""" 42 | 43 | a = ''.join(re.findall('\w+', strip_stopwords(a.lower()))) 44 | b = ''.join(re.findall('\w+', strip_stopwords(b.lower()))) 45 | 46 | prev = None 47 | thisrow = range(1, len(b) + 1) + [0] 48 | for x in xrange(len(a)): 49 | # Python lists wrap around for negative indices, so put the 50 | # leftmost column at the *end* of the list. This matches with 51 | # the zero-indexed strings and saves extra calculation. 52 | prev, thisrow = thisrow, [0] * len(b) + [x + 1] 53 | for y in xrange(len(b)): 54 | delcost = prev[y] + 1 55 | addcost = thisrow[y - 1] + 1 56 | subcost = prev[y - 1] + (a[x] != b[y]) 57 | thisrow[y] = min(delcost, addcost, subcost) 58 | 59 | if limit and x > limit and min(thisrow) > limit: 60 | return limit + 1 61 | 62 | return thisrow[len(b) - 1] 63 | 64 | 65 | def damerau_levenshtein_distance(a, b, limit=None): 66 | """Returns the Damerau-Levenshtein edit distance between two strings - adapted from Whoosh""" 67 | 68 | a = ''.join(re.findall('\w+', strip_stopwords(a.lower()))) 69 | b = ''.join(re.findall('\w+', strip_stopwords(b.lower()))) 70 | 71 | oneago = None 72 | thisrow = list(range(1, len(b) + 1)) + [0] 73 | for x in xrange(len(a)): 74 | # Python lists wrap around for negative indices, so put the 75 | # leftmost column at the *end* of the list. This matches with 76 | # the zero-indexed strings and saves extra calculation. 
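        # 'twoago' keeps the row from two iterations back, which the
        # transposition check below needs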
77 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(b) + [x + 1] 78 | for y in xrange(len(b)): 79 | delcost = oneago[y] + 1 80 | addcost = thisrow[y - 1] + 1 81 | subcost = oneago[y - 1] + (a[x] != b[y]) 82 | thisrow[y] = min(delcost, addcost, subcost) 83 | # This block deals with transpositions 84 | if (x > 0 and y > 0 and a[x] == b[y - 1] 85 | and a[x - 1] == b[y] and a[x] != b[y]): 86 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 87 | 88 | if limit and x > limit and min(thisrow) > limit: 89 | return limit + 1 90 | 91 | return thisrow[len(b) - 1] 92 | -------------------------------------------------------------------------------- /logkit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import json 11 | import logging 12 | import os 13 | 14 | log = logging.getLogger() 15 | 16 | from collections import deque 17 | 18 | from pygments import highlight 19 | from pygments.lexers import get_lexer_by_name, guess_lexer 20 | from pygments.formatters import TerminalFormatter, Terminal256Formatter, NullFormatter 21 | 22 | 23 | class InMemoryHandler(logging.Handler): 24 | """In memory logging handler with a circular buffer""" 25 | 26 | def __init__(self, limit=8192): 27 | # run the regular Handler __init__ 28 | logging.Handler.__init__(self) 29 | # Our custom argument 30 | self.limit = limit 31 | self.flush() 32 | 33 | def emit(self, record): 34 | self.records.append(self.format(record)) 35 | 36 | def flush(self): 37 | self.records = deque([], self.limit) 38 | 39 | def dump(self): 40 | return self.records 41 | 42 | 43 | class ColorFormatter(logging.Formatter) : 44 | """Console logging formatter with coloring""" 45 | _colors = { 46 | "DEBUG" : "\033[22;32m", # green 47 | "INFO" : "\033[01;34m", # violet 48 | "WARNING" : "\033[22;35m", # magenta 49 | "ERROR" : "\033[22;31m", # red 50 | "CRITICAL": "\033[01;31m" # bold red 51 | }; 52 | 53 | def format(self, record): 54 | if 'color' in os.environ.get('TERM', ''): 55 | if(self._colors.has_key(record.levelname)): 56 | record.levelname = "%s%s\033[0;0m" % (self._colors[record.levelname], record.levelname) 57 | record.msg = "\033[37m\033[1m%s\033[0;0m" % record.msg 58 | return logging.Formatter.format(self, record) 59 | 60 | 61 | class PygmentsHandler(logging.StreamHandler): 62 | """Console logging handler with syntax highlighting""" 63 | 64 | def __init__(self, stream=None, syntax="guess", encoding='utf-8', style='default'): 65 | # run the regular Handler __init__ 66 | logging.StreamHandler.__init__(self,stream) 67 | self.pformatter = (Terminal256Formatter(encoding=encoding, style=style) 68 | if '256color' in os.environ.get('TERM', '') 69 | else TerminalFormatter(encoding=encoding,style=style)) 70 | if not stream.isatty(): 71 | self.pformatter = NullFormatter 72 | if syntax == "guess": 73 | self.lexer = guess_lexer 74 | else: 75 | self.lexer = get_lexer_by_name(syntax) 76 | 77 | def emit(self, record): 78 | if self.pformatter == NullFormatter: 79 | return 80 | msg = self.format(record) 81 | # Note that the guessing also applies to any log formatting 82 | if self.lexer == guess_lexer: 83 | lexer = guess_lexer(msg) 84 | self.stream.write(highlight(msg,lexer,self.pformatter)) 85 | return 86 | self.stream.write(highlight(msg,self.lexer,self.pformatter)) 87 | 88 | 89 | def json_ansi(item, stream, sort_keys=True, indent=0, 
separators=(',', ':'), encoding='utf-8', style='default'):
90 |     """Helper function to pretty-print JSON via Pygments"""
91 | 
92 |     formatter = (Terminal256Formatter(encoding=encoding,style=style)
93 |                  if '256color' in os.environ.get('TERM', '')
94 |                  else TerminalFormatter(encoding=encoding,style=style))
95 |     if not stream.isatty():
96 |         formatter = NullFormatter
97 |     lexer = get_lexer_by_name('json')
98 |     stream.write(highlight(json.dumps(item, sort_keys=sort_keys, indent=indent, separators=separators),lexer,formatter))
99 | 
--------------------------------------------------------------------------------
/imagekit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Image utilities
5 | 
6 | Created by: Rui Carmo
7 | License: MIT (see LICENSE for details)
8 | """
9 | 
10 | import struct
11 | import StringIO
12 | from operator import itemgetter
13 | 
14 | def linear_partition(seq, k):
15 |     if k <= 0:
16 |         return []
17 |     n = len(seq) - 1
18 |     if k > n:
19 |         return map(lambda x: [x], seq)
20 |     table, solution = linear_partition_table(seq, k)
21 |     k, ans = k-2, []
22 |     while k >= 0:
23 |         ans = [[seq[i] for i in xrange(solution[n-1][k]+1, n+1)]] + ans
24 |         n, k = solution[n-1][k], k-1
25 |     return [[seq[i] for i in xrange(0, n+1)]] + ans
26 | 
27 | def linear_partition_table(seq, k):
28 |     n = len(seq)
29 |     table = [[0] * k for x in xrange(n)]
30 |     solution = [[0] * (k-1) for x in xrange(n-1)]
31 |     for i in xrange(n):
32 |         table[i][0] = seq[i] + (table[i-1][0] if i else 0)
33 |     for j in xrange(k):
34 |         table[0][j] = seq[0]
35 |     for i in xrange(1, n):
36 |         for j in xrange(1, k):
37 |             table[i][j], solution[i-1][j-1] = min(
38 |                 ((max(table[x][j-1], table[i][0]-table[x][0]), x) for x in xrange(i)),
39 |                 key=itemgetter(0))
40 |     return (table, solution)
41 | 
42 | def get_info(data):
43 |     """Parses a small buffer and attempts to return basic image metadata"""
44 | 
45 |     data = str(data)
46 |     size = len(data)
47 |     height = -1
48 |     width = -1
49 |     content_type = ''
50 | 
51 |     # handle GIFs
52 |     if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
53 |         # Check to see if content_type is correct
54 |         content_type = 'image/gif'
55 |         w, h = struct.unpack("<HH", data[6:10])
56 |         width = int(w)
57 |         height = int(h)
58 | 
59 |     # See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)
60 |     # Bytes 0-7 are the signature, then a 4-byte chunk length, 'IHDR',
61 |     # and finally the 4-byte width and height
62 |     elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
63 |           and (data[12:16] == 'IHDR')):
64 |         content_type = 'image/png'
65 |         w, h = struct.unpack(">LL", data[16:24])
66 |         width = int(w)
67 |         height = int(h)
68 | 
69 |     # Maybe this is for an older PNG version.
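    # (some pre-standard encoders wrote the 4-byte width and height
    #  straight after the 8-byte signature, without an IHDR chunk)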
70 | elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'): 71 | # Check to see if we have the right content type 72 | content_type = 'image/png' 73 | w, h = struct.unpack(">LL", data[8:16]) 74 | width = int(w) 75 | height = int(h) 76 | 77 | # Check for a JPEG 78 | elif (size >= 4): 79 | jpeg = StringIO.StringIO(data) 80 | b = jpeg.read(4) 81 | if b.startswith('\xff\xd8\xff\xe0'): 82 | content_type = 'image/jpeg' 83 | bs = jpeg.tell() 84 | b = jpeg.read(2) 85 | bl = (ord(b[0]) << 8) + ord(b[1]) 86 | b = jpeg.read(4) 87 | if b.startswith("JFIF"): 88 | bs += bl 89 | while(bs < len(data)): 90 | jpeg.seek(bs) 91 | b = jpeg.read(4) 92 | bl = (ord(b[2]) << 8) + ord(b[3]) 93 | if bl >= 7 and b[0] == '\xff' and b[1] == '\xc0': 94 | jpeg.read(1) 95 | b = jpeg.read(4) 96 | height = (ord(b[0]) << 8) + ord(b[1]) 97 | width = (ord(b[2]) << 8) + ord(b[3]) 98 | break 99 | bs = bs + bl + 2 100 | return width, height, content_type 101 | -------------------------------------------------------------------------------- /timekit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: Utility functions for handling date and time information 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import time 11 | import math 12 | import re 13 | import logging 14 | import datetime 15 | import gettext 16 | 17 | gettext.textdomain('date') 18 | _ = gettext.gettext 19 | 20 | log = logging.getLogger() 21 | 22 | 23 | # Embrace and extend Mark's feedparser mechanism 24 | 25 | _textmate_date_re = \ 26 | re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})$') 27 | 28 | 29 | def parse_date(date): 30 | """Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone, assume it's always localtime)""" 31 | 32 | m = _textmate_date_re.match(date) 33 | try: 34 | from feedparser import _parse_date 35 | if not m: 36 | return time.mktime(_parse_date(date)) 37 | except: 38 | pass 39 | 40 | return time.mktime(time.localtime(calendar.timegm(time.gmtime(time.mktime(time.strptime(date, 41 | '%Y-%m-%d %H:%M:%S')))))) 42 | 43 | 44 | def iso_time(value=None): 45 | """Format a timestamp in ISO format""" 46 | 47 | if value == None: 48 | value = time.localtime() 49 | tz = time.timezone / 3600 50 | return time.strftime('%Y-%m-%dT%H:%M:%S-', value) + '%(tz)02d:00' \ 51 | % vars() 52 | 53 | 54 | def http_time(value=None): 55 | """Format a timestamp for HTTP headers""" 56 | 57 | if value == None: 58 | value = time.time() 59 | return time.strftime('%a, %d %b %Y %H:%M:%S GMT', 60 | time.gmtime(value)) 61 | 62 | 63 | def plain_date(date, rss=False): 64 | """Format a date consistently""" 65 | 66 | if isinstance(date, float) or isinstance(date, int): 67 | date = time.localtime(date) 68 | 69 | # trickery to replace leading zero in month day 70 | 71 | mday = time.strftime(' %d', date).replace(' 0', ' ').strip() 72 | weekday = _(time.strftime('%A', date)) 73 | month = _(time.strftime('%b', date)) 74 | year = time.strftime('%Y', date) 75 | 76 | # build English ordinal suffixes 77 | 78 | day = int(mday) 79 | if day > 20: 80 | day = int(mday[1]) 81 | try: 82 | suffix = ['th', 'st', 'nd', 'rd'][day] 83 | except: 84 | suffix = 'th' 85 | if rss: 86 | return _('rss_update_date_format') % locals() 87 | else: 88 | return _('journal_date_format') % locals() 89 | 90 | 91 | def fuzzy_time(date=None): 92 | intervals = { 93 | '00:00-00:59': 'latenight', 94 | '01:00-03:59': 'weehours', 95 | '04:00-06:59': 'dawn', 
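        # keys are local-time ranges (HH:MM); values are gettext message ids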
96 | '07:00-08:59': 'breakfast', 97 | '09:00-12:29': 'morning', 98 | '12:30-14:29': 'lunchtime', 99 | '14:30-16:59': 'afternoon', 100 | '17:00-17:29': 'teatime', 101 | '17:30-18:59': 'lateafternoon', 102 | '19:00-20:29': 'evening', 103 | '20:30-21:29': 'dinnertime', 104 | '21:30-22:29': 'night', 105 | '22:30-23:59': 'latenight', 106 | } 107 | if isinstance(date, float) or isinstance(date, int): 108 | date = time.localtime(date) 109 | then = time.strftime('%H:%M', date) 110 | for i in intervals.keys(): 111 | (l, u) = i.split('-') 112 | # cheesy (but perfectly usable) string comparison 113 | if l <= then and then <= u: 114 | return _(intervals[i]) 115 | return None 116 | 117 | 118 | def relative_time(value=None, addtime=False): 119 | """ 120 | A simple time string 121 | """ 122 | 123 | value = float(value) 124 | if addtime: 125 | format = ', %H:%M' 126 | else: 127 | format = '' 128 | if time.localtime(value)[0] != time.localtime()[0]: 129 | 130 | # we have a different year 131 | 132 | format = ' %Y' + format 133 | format = time.strftime('%b', time.localtime(value)) + ' %d' \ 134 | + format 135 | return time.strftime(format, time.localtime(value)).strip() 136 | 137 | 138 | def time_since(older=None, newer=None, detail=2): 139 | """ 140 | Human-readable time strings, based on Natalie Downe's code from 141 | http://blog.natbat.co.uk/archive/2003/Jun/14/time_since 142 | Assumes time parameters are in seconds 143 | """ 144 | 145 | intervals = { # corrected from the initial 31536000 146 | 31556926: 'year', 147 | 2592000: 'month', 148 | 604800: 'week', 149 | 86400: 'day', 150 | 3600: 'hour', 151 | 60: 'minute', 152 | } 153 | chunks = intervals.keys() 154 | 155 | # Reverse sort using a lambda for Python 2.3 compatibility 156 | chunks.sort(lambda x, y: y - x) 157 | 158 | if newer == None: 159 | newer = time.time() 160 | 161 | interval = newer - older 162 | if interval < 0: 163 | return _('some_time') 164 | 165 | # We should ideally do this: 166 | # raise ValueError('Time interval cannot be negative') 167 | # but it makes sense to fail gracefully here 168 | 169 | if interval < 60: 170 | return _('less_1min') 171 | 172 | output = '' 173 | for steps in range(detail): 174 | for seconds in chunks: 175 | count = math.floor(interval / seconds) 176 | unit = intervals[seconds] 177 | if count != 0: 178 | break 179 | if count > 1: 180 | unit = unit + 's' 181 | if count != 0: 182 | output = output + '%d %s, ' % (count, _(unit)) 183 | interval = interval - count * seconds 184 | output = output[:-2] 185 | return output 186 | 187 | 188 | def datetime_to_epoch(dt): 189 | epoch = datetime.datetime.utcfromtimestamp(0) 190 | delta = dt - epoch 191 | return delta.total_seconds() 192 | -------------------------------------------------------------------------------- /taskkit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2012, Rui Carmo 6 | Description: In-process job management 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | from Queue import Empty, Queue, PriorityQueue 11 | from collections import defaultdict 12 | from functools import partial 13 | from signal import signal, SIGINT, SIGTERM, SIGHUP 14 | import sys, logging 15 | from threading import Semaphore, Thread 16 | import time, traceback, ctypes 17 | from uuid import uuid4 18 | from cPickle import dumps, loads 19 | import multiprocessing 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | default_priority = 0 24 | max_workers = 
multiprocessing.cpu_count() * 2 25 | channels = {} 26 | closed = {} 27 | 28 | class Pool: 29 | """Represents a thread pool""" 30 | 31 | def __init__(self, workers = max_workers, rate_limit = 1000): 32 | self.max_workers = workers 33 | self.mutex = Semaphore() 34 | self.results = {} 35 | self.retries = defaultdict(int) 36 | self.queue = PriorityQueue() 37 | self.threads = [] 38 | self.rate_limit = rate_limit 39 | self.running = True 40 | 41 | def _tick(self): 42 | time.sleep(1.0/self.rate_limit) 43 | # clean up finished threads 44 | self.threads = [t for t in self.threads if t.isAlive()] 45 | return (not self.queue.empty()) or (len(self.threads) > 0) 46 | 47 | 48 | def _loop(self): 49 | """Handle task submissions""" 50 | 51 | def run_task(priority, f, uuid, retries, args, kwargs): 52 | """Run a single task""" 53 | try: 54 | t.name = getattr(f, '__name__', None) 55 | result = f(*args, **kwargs) 56 | except Exception as e: 57 | # Retry the task if applicable 58 | if log: 59 | log.error(traceback.format_exc()) 60 | if retries > 0: 61 | with self.mutex: 62 | self.retries[uuid] += 1 63 | # re-queue the task with a lower (i.e., higher-valued) priority 64 | self.queue.put((priority+1, dumps((f, uuid, retries - 1, args, kwargs)))) 65 | self.queue.task_done() 66 | return 67 | result = e 68 | with self.mutex: 69 | self.results[uuid] = dumps(result) 70 | self.retries[uuid] += 1 71 | self.queue.task_done() 72 | 73 | while self._tick(): 74 | # spawn more threads to fill free slots 75 | log.debug("Running %d/%d threads" % (len(self.threads),self.max_workers)) 76 | if self.running and len(self.threads) < self.max_workers: 77 | log.debug("Queue Length: %d" % self.queue.qsize()) 78 | try: 79 | priority, data = self.queue.get(True, 1.0/self.rate_limit) 80 | except Empty: 81 | continue 82 | f, uuid, retries, args, kwargs = loads(data) 83 | log.debug(f) 84 | t = Thread(target=run_task, args=[priority, f, uuid, retries, args, kwargs]) 85 | t.setDaemon(True) 86 | self.threads.append(t) 87 | t.start() 88 | log.debug("Exited loop.") 89 | for t in self.threads: 90 | t.join() 91 | 92 | 93 | def kill_all(self): 94 | """Very hacky way to kill threads by tossing an exception into their state""" 95 | for t in self.threads: 96 | ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(t.ident), ctypes.py_object(SystemExit)) 97 | 98 | 99 | def stop(self): 100 | """Flush the job queue""" 101 | self.running = False 102 | self.queue = PriorityQueue() 103 | 104 | 105 | def start(self, daemonize=False): 106 | """Pool entry point""" 107 | 108 | self.results = {} 109 | self.retries = defaultdict(int) 110 | 111 | if daemonize: 112 | t = Thread(target = self._loop, args=[self]) 113 | t.setDaemon(True) 114 | t.start() 115 | return 116 | else: 117 | self._loop() 118 | 119 | 120 | default_pool = Pool() 121 | 122 | class Deferred(object): 123 | """Allows lookup of task results and status""" 124 | def __init__(self, pool, uuid): 125 | self.uuid = uuid 126 | self.pool = pool 127 | self._result = None 128 | 129 | @property 130 | def result(self): 131 | if self._result is None: 132 | with self.pool.mutex: 133 | if self.uuid in self.pool.results.keys(): 134 | self._result = loads(self.pool.results[self.uuid]) 135 | return self._result 136 | 137 | @property 138 | def retries(self): 139 | return self.pool.retries[self.uuid] 140 | 141 | 142 | def task(func=None, pool=None, max_retries=0, priority=default_priority): 143 | """Task decorator - setus up a .delay() attribute in the task function""" 144 | 145 | if func is None: 146 | return 
partial(task, pool=pool, max_retries=max_retries) 147 | 148 | if pool is None: 149 | pool = default_pool 150 | 151 | def delay(*args, **kwargs): 152 | uuid = str(uuid4()) # one for each task 153 | pool.queue.put((priority,dumps((func, uuid, max_retries, args, kwargs)))) 154 | return Deferred(pool, uuid) 155 | func.delay = delay 156 | func.pool = pool 157 | return func 158 | 159 | 160 | def go(*args, **kwargs): 161 | """Queue up a function, Go-style""" 162 | uuid = str(uuid4()) # one for each task 163 | default_pool.queue.put((default_priority,dumps((args[0], uuid, 0, args[1:], kwargs)))) 164 | return Deferred(default_pool, uuid) 165 | 166 | 167 | class Channel: 168 | """A serializable shim that proxies to a Queue object""" 169 | def __init__(self, size): 170 | self.uuid = str(uuid4()) # one for each task 171 | channels[self.uuid] = Queue(size) 172 | 173 | def recv(self): 174 | return channels[self.uuid].get() 175 | 176 | def send(self, item): 177 | if self.uuid in closed: 178 | raise RuntimeError("Channel is closed.") 179 | channels[self.uuid].put(item) 180 | 181 | def close(self): 182 | closed[self.uuid] = True 183 | 184 | def __iter__(self): 185 | yield self.recv() 186 | while True: 187 | try: 188 | res = channels[self.uuid].get(True, 1.0/default_pool.rate_limit) 189 | yield res 190 | except Empty: 191 | # check channel again and end iteration if closed 192 | if channels[self.uuid].empty() and (self.uuid in closed): 193 | return 194 | 195 | 196 | def chan(size = 0): 197 | """Return a shim that acts like a Go channel""" 198 | return Channel(size) 199 | 200 | 201 | def halt(signal, frame): 202 | default_pool.stop() 203 | default_pool.kill_all() 204 | sys.exit() 205 | 206 | 207 | def start(daemonize = False): 208 | signal(SIGINT, halt) 209 | signal(SIGTERM, halt) 210 | default_pool.start(daemonize = daemonize) 211 | -------------------------------------------------------------------------------- /decorators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Decorator functions 5 | 6 | Created by: Rui Carmo 7 | """ 8 | 9 | from bottle import request, response, route, abort 10 | import time, binascii, hashlib, email.utils, functools, json, cProfile, collections 11 | from datetime import datetime 12 | import logging 13 | from core import tb 14 | 15 | # Allow importing even when Redis bindings aren't present 16 | try: 17 | from redis import StrictRedis as Redis 18 | except ImportError: 19 | pass 20 | 21 | log = logging.getLogger() 22 | 23 | gmt_format_string = "%a, %d %b %Y %H:%M:%S GMT" 24 | 25 | 26 | class CustomEncoder(json.JSONEncoder): 27 | """Custom encoder that serializes datetimes into JS-compliant times""" 28 | 29 | def default(self, obj): 30 | if isinstance(obj, datetime): 31 | epoch = datetime.utcfromtimestamp(0) 32 | delta = obj - epoch 33 | return int(delta.total_seconds()) * 1000 34 | return json.JSONEncoder.default(self, obj) 35 | 36 | 37 | def cache_redis(r, prefix='url', ttl=3600): 38 | """Cache route results in Redis""" 39 | 40 | def decorator(callback): 41 | @functools.wraps(callback) 42 | def wrapper(*args, **kwargs): 43 | try: 44 | item = json.loads(r.get('%s:%s' % (prefix,request.urlparts.path))) 45 | body = item['body'] 46 | for h in item['headers']: 47 | response.set_header(str(h), item['headers'][h]) 48 | response.set_header('X-Source', 'Redis') 49 | except Exception as e: 50 | log.debug("Redis cache miss for %s" % request.urlparts.path) 51 | body = callback(*args, 
**kwargs) 52 | item = { 53 | 'body': body, 54 | 'headers': dict(response.headers), 55 | 'mtime': int(time.time()) 56 | } 57 | k = '%s:%s' % (prefix, request.urlparts.path) 58 | r.set(k, json.dumps(item)) 59 | r.expire(k, ttl) 60 | return body 61 | return wrapper 62 | return decorator 63 | 64 | 65 | def cache_results(timeout=0): 66 | """Cache route results for a given period of time""" 67 | 68 | def decorator(callback): 69 | _cache = {} 70 | _times = {} 71 | 72 | @functools.wraps(callback) 73 | def wrapper(*args, **kwargs): 74 | 75 | def expire(when): 76 | for t in [k for k in _times.keys()]: 77 | if (when - t) > timeout: 78 | del(_cache[_times[t]]) 79 | del(_times[t]) 80 | 81 | now = time.time() 82 | try: 83 | item = _cache[request.urlparts] 84 | if 'If-Modified-Since' in request.headers: 85 | try: 86 | since = time.mktime(email.utils.parsedate(request.headers['If-Modified-Since'])) 87 | except: 88 | since = now 89 | if item['mtime'] >= since: 90 | expire(now) 91 | abort(304,'Not modified') 92 | for h in item['headers']: 93 | response.set_header(str(h), item['headers'][h]) 94 | body = item['body'] 95 | response.set_header('X-Source', 'Worker Cache') 96 | except KeyError: 97 | body = callback(*args, **kwargs) 98 | item = { 99 | 'body': body, 100 | 'headers': response.headers, 101 | 'mtime': int(now) 102 | } 103 | _cache[request.urlparts] = item 104 | _times[now] = request.urlparts 105 | 106 | expire(now) 107 | return body 108 | return wrapper 109 | return decorator 110 | 111 | 112 | def cache_control(seconds = 0): 113 | """Insert HTTP caching headers""" 114 | 115 | def decorator(callback): 116 | @functools.wraps(callback) 117 | def wrapper(*args, **kwargs): 118 | expires = int(time.time() + seconds) 119 | expires = time.strftime(gmt_format_string, time.gmtime(expires)) 120 | response.set_header('Expires', expires) 121 | if seconds: 122 | pragma = 'public' 123 | else: 124 | pragma = 'no-cache, must-revalidate' 125 | response.set_header('Cache-Control', "%s, max-age=%s" % (pragma, seconds)) 126 | response.set_header('Pragma', pragma) 127 | return callback(*args, **kwargs) 128 | return wrapper 129 | return decorator 130 | 131 | 132 | def profile(filename=None): 133 | """Profiling decorator for functions taking one or more arguments""" 134 | 135 | def decorator(callback): 136 | @functools.wraps(callback) 137 | def wrapper(*args, **kwargs): 138 | import cProfile 139 | import logging 140 | log.info('Profiling %s' % (callback.__name__)) 141 | try: 142 | profiler = cProfile.Profile() 143 | res = profiler.runcall(callback, *args, **kwargs) 144 | profiler.dump_stats(filename or '%s_fn.profile' % (callback.__name__)) 145 | except IOError: 146 | log.exception(_("Could not open profile '%(filename)s'") % {"filename": filename}) 147 | return res 148 | return wrapper 149 | return decorator 150 | 151 | 152 | def timed(callback): 153 | """Decorator for timing route processing""" 154 | 155 | @functools.wraps(callback) 156 | def wrapper(*args, **kwargs): 157 | start = time.time() 158 | body = callback(*args, **kwargs) 159 | end = time.time() 160 | response.set_header('X-Processing-Time', str(end - start)) 161 | return body 162 | return wrapper 163 | 164 | 165 | def jsonp(callback): 166 | """Decorator for JSONP handling""" 167 | 168 | @functools.wraps(callback) 169 | def wrapper(*args, **kwargs): 170 | body = callback(*args, **kwargs) 171 | try: 172 | body = json.dumps(body, cls=CustomEncoder) 173 | # Set content type only if serialization successful 174 | response.content_type = 'application/json' 175 | 
except Exception, e: 176 | return body 177 | 178 | callback_function = request.query.get('callback') 179 | if callback_function: 180 | body = ''.join([callback_function, '(', body, ')']) 181 | response.content_type = 'text/javascript' 182 | 183 | response.set_header('Last-Modified', time.strftime(gmt_format_string, time.gmtime())) 184 | response.set_header('ETag', binascii.b2a_base64(hashlib.sha1(body).digest()).strip()) 185 | response.set_header('Content-Length', len(body)) 186 | return body 187 | return wrapper 188 | 189 | 190 | def memoize(f): 191 | """Memoization decorator for functions taking one or more arguments""" 192 | 193 | class memodict(dict): 194 | def __init__(self, f): 195 | self.f = f 196 | 197 | def __call__(self, *args): 198 | return self[args] 199 | 200 | def __missing__(self, key): 201 | res = self[key] = self.f(*key) 202 | return res 203 | 204 | def __repr__(self): 205 | return self.f.__doc__ 206 | 207 | def __get__(self, obj, objtype): 208 | return functools.partial(self.__call__, obj) 209 | return memodict(f) 210 | 211 | 212 | def lru_cache(limit=100): 213 | """Least-recently-used cache decorator""" 214 | 215 | def inner_function(callback): 216 | cache = collections.OrderedDict() 217 | 218 | @functools.wraps(callback) 219 | def wrapper(*args, **kwargs): 220 | key = args 221 | if kwargs: 222 | key += tuple(sorted(kwargs.items())) 223 | try: 224 | result = cache.pop(key) 225 | except KeyError: 226 | result = callback(*args, **kwargs) 227 | if len(cache) >= limit: 228 | cache.popitem(0) 229 | cache[key] = result # refresh position 230 | return result 231 | return wrapper 232 | return inner_function 233 | -------------------------------------------------------------------------------- /urlkit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Copyright (c) 2013, Rui Carmo 6 | Description: Utility functions for retrieving CPU statistics 7 | License: MIT (see LICENSE.md for details) 8 | """ 9 | 10 | import logging 11 | import os 12 | import sys 13 | 14 | log = logging.getLogger() 15 | 16 | import re 17 | import gzip 18 | import base64 19 | import tempfile 20 | import urllib 21 | import urllib2 22 | import urlparse 23 | from StringIO import StringIO 24 | from xml.dom.minidom import parseString 25 | from urllib2 import HTTPCookieProcessor, HTTPRedirectHandler, HTTPDefaultErrorHandler, HTTPError 26 | import cookielib 27 | from collections import defaultdict 28 | from utils.core import tb 29 | from config import settings 30 | from utils.decorators import memoize 31 | from datetime import datetime 32 | 33 | # Initialize debug level upon module load 34 | #httplib.HTTPConnection.debuglevel = settings.httplib.debuglevel 35 | 36 | @memoize 37 | def shorten(url): 38 | """Minimalist URL shortener using SAPO services""" 39 | u = '?'.join(('http://services.sapo.pt/PunyURL/GetCompressedURLByURL', urllib.urlencode({'url':url}))) 40 | try: 41 | x = parseString(fetch(u)['data']) 42 | return x.getElementsByTagName('ascii')[0].firstChild.data 43 | except: 44 | return url 45 | 46 | 47 | @memoize 48 | def agnostic_shortener(url): 49 | """A more flexible URL shortener""" 50 | 51 | services = { 52 | 'tinyurl.com':'/api-create.php?url=', 53 | 'is.gd' :'/api.php?longurl=', 54 | #'api.bit.ly':"http://api.bit.ly/shorten?version=2.0.1&%s&format=text&longUrl=" % BITLY_AUTH, 55 | 'api.tr.im' :'/api/trim_simple?url=' 56 | } 57 | 58 | for shortener in self.services.keys(): 59 | try: 60 | res = 
fetch(self.services[shortener] + urllib.quote(url)) 61 | shorturl = res['data'].strip() 62 | if ("Error" not in shorturl) and ("http://" + urlparse.urlparse(shortener)[1] in shorturl): 63 | return shorturl 64 | else: 65 | continue 66 | except: 67 | log.warn("%s: %s" % (tb(),url)) 68 | pass 69 | return url 70 | 71 | 72 | def expand(url, remove_junk = True, timeout = None): 73 | """Resolve short URLs""" 74 | url = unicode(url) 75 | result = url 76 | 77 | #log.debug(u"%s -> ?" % url) 78 | 79 | (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url) 80 | 81 | if scheme not in ['http','https']: 82 | return result 83 | 84 | # time sinks that aren't worth expanding further 85 | if re.match( "(" + ")|(".join([i.replace('.','\.').replace('*','.+') for i in settings.expander.ignore]) + ")", netloc): 86 | return result 87 | 88 | res = {} 89 | user_agents = defaultdict(lambda: settings.fetcher.user_agent) 90 | user_agents.update(settings.expander.user_agents) 91 | user_agent = user_agents[netloc] 92 | 93 | try: 94 | res = fetch(url, head=True, timeout=timeout, user_agent=user_agent) 95 | except: 96 | #log.debug(u"%s: %s" % (tb(),url)) 97 | pass 98 | 99 | if 'url' in res: 100 | (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(res['url']) 101 | if scheme not in ['http','https']: 102 | return result 103 | else: 104 | result = res['url'] 105 | 106 | if remove_junk: 107 | result = scrub_query(result) 108 | #log.debug(u"%s -> %s" % (url,result)) 109 | if fragment: 110 | return "%s#%s" % (result, fragment) 111 | else: 112 | return result 113 | 114 | 115 | def scrub_query(url): 116 | """Clean query arguments""" 117 | 118 | scrub = ["utm_source","utm_campaign","utm_medium","piwik_campaign","piwik_kwd"] 119 | 120 | url = urlparse.urldefrag(url)[0] 121 | base, sep, query = url.partition('?') 122 | seen = set() 123 | result = [] 124 | for field in query.split('&'): 125 | name, sep, value = field.partition('=') 126 | if name in seen: 127 | continue 128 | elif name in scrub: 129 | continue 130 | else: 131 | result.append(field) 132 | seen.add(name) 133 | result = '?'.join([base, sep.join(result)]) if result else base 134 | # strip dangling '?' 
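    # (a lone '?' survives when the URL had no query fields at all,
    #  since ''.split('&') still yields one empty field)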
135 | if result[-1:] == '?': 136 | result = result[:-1] 137 | return result 138 | 139 | 140 | def data_uri(content_type, data): 141 | """Return data as a data: URI scheme""" 142 | return "data:%s;base64,%s" % (content_type, base64.urlsafe_b64encode(data)) 143 | 144 | 145 | class SmartRedirectHandler(HTTPRedirectHandler): 146 | 147 | def http_error_302(self, req, fp, code, msg, headers): 148 | result = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) 149 | result.status = code 150 | #log.debug("%d %s" % (code, req.get_full_url())) 151 | return result 152 | 153 | http_error_301 = http_error_303 = http_error_307 = http_error_302 154 | 155 | 156 | class DefaultErrorHandler(HTTPDefaultErrorHandler): 157 | 158 | def http_error_default(self, req, fp, code, msg, headers): 159 | result = HTTPError(req.get_full_url(), code, msg, headers, fp) 160 | result.status = code 161 | return result 162 | 163 | 164 | def _open_source(source, head, data = None, etag = None, last_modified = None, timeout = None, user_agent = "Mozilla/5.0"): 165 | """Open anything""" 166 | 167 | if hasattr(source, 'read'): 168 | return source 169 | if source == '-': 170 | return sys.stdin 171 | 172 | if urlparse.urlparse(source)[0][:4] == 'http': 173 | request = urllib2.Request(source, data) 174 | if head and not data: 175 | request.get_method = lambda: 'HEAD' 176 | request.add_header('User-Agent', user_agent) 177 | if etag: 178 | request.add_header('If-None-Match', etag) 179 | if last_modified: 180 | request.add_header('If-Modified-Since', last_modified) 181 | request.add_header('Accept-encoding', 'gzip') 182 | jar = cookielib.MozillaCookieJar() 183 | jar.set_policy(cookielib.DefaultCookiePolicy(rfc2965=True, strict_rfc2965_unverifiable=False)) 184 | opener = urllib2.build_opener(SmartRedirectHandler(), HTTPCookieProcessor(jar), DefaultErrorHandler()) 185 | return opener.open(request, None, timeout) 186 | try: 187 | return open(source) 188 | except(IOError,OSError): 189 | pass 190 | return StringIO(str(source)) 191 | 192 | 193 | def fetch(url, data = None, etag = None, last_modified = None, head = False, timeout = None, user_agent = "Mozilla/5.0"): 194 | """Fetch a URL and return the contents""" 195 | 196 | result = {} 197 | f = _open_source(url, head, data, etag, last_modified, timeout, user_agent) 198 | if not head: 199 | result['data'] = f.read() 200 | if hasattr(f, 'headers'): 201 | result.update({k.lower(): f.headers.get(k) for k in f.headers}) 202 | if f.headers.get('content-encoding', '') == 'gzip' and not head: 203 | result['data'] = gzip.GzipFile(fileobj=StringIO(result['data'])).read() 204 | if hasattr(f.headers, 'last-modified'): 205 | try: 206 | result['modified_parsed'] = datetime.strptime(f.headers['last-modified'], "%a, %d %b %Y %H:%M:%S %Z") 207 | except Exception, e: 208 | log.debug("Could not parse Last-Modified header '%s'" % f.headers['last-modified']) 209 | pass 210 | if hasattr(f, 'url'): 211 | result['url'] = unicode(f.url) 212 | result['status'] = 200 213 | if hasattr(f, 'status'): 214 | result['status'] = f.status 215 | f.close() 216 | return result 217 | 218 | 219 | def download(url, filename=None, suffix='', user_agent = "Mozilla/5.0"): 220 | """Convenience function for downloading a URL directly to the filesystem""" 221 | 222 | opener = urllib.FancyURLopener({}) 223 | opener.version = user_agent 224 | 225 | if not filename: 226 | fd, filename = tempfile.mkstemp(suffix) 227 | os.close(fd) 228 | 229 | try: 230 | opener.retrieve(url, filename) 231 | return filename 232 | except 
Exception as e: 233 | log.error("Could not download %(url)s: %(e)s" % locals()) 234 | return None 235 | 236 | --------------------------------------------------------------------------------
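As a closing usage note for urlkit.py, a minimal sketch of the fetch() helper (the URL is illustrative; expand() is not exercised here because it additionally expects a config module exposing settings.fetcher and settings.expander):

    from utils.urlkit import fetch

    res = fetch('http://example.com/', timeout=5)
    print res['status'], res.get('content-type')
    page = res['data']        # gzip-encoded responses are decompressed transparently

    head = fetch('http://example.com/', head=True)   # HEAD request: headers only, no 'data' key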