├── .gitignore ├── .hgignore ├── .hgtags ├── README.txt ├── datautil ├── __init__.py ├── cache.py ├── cli.py.command ├── clitools.py ├── date.py ├── deliveranceproxy.py ├── id.py ├── misc.py ├── normalization │ ├── __init__.py │ ├── table_based.py │ └── text.py ├── parse │ ├── __init__.py │ └── name.py ├── scrape.py ├── tabular │ ├── __init__.py │ ├── base.py │ ├── gdocs.py │ ├── html.py │ ├── misc.py │ ├── tabular_json.py │ ├── txt.py │ └── xls.py └── tests │ ├── __init__.py │ ├── data │ └── xls_reader_test.xls │ ├── parse │ └── test_name.py │ ├── tabular │ ├── __init__.py │ ├── test_base.py │ ├── test_gdocs.py │ ├── test_json.py │ ├── test_misc.py │ └── test_txt.py │ ├── test_cache.py │ ├── test_date.py │ ├── test_id.py │ ├── test_misc.py │ └── test_xls.py ├── setup.py └── swiss ├── __init__.py ├── cache.py ├── clitools.py ├── date.py ├── deliveranceproxy.py ├── id.py ├── misc.py ├── parse ├── __init__.py └── name.py ├── tabular ├── __init__.py ├── base.py ├── gdocs.py ├── html.py ├── misc.py ├── tabular_json.py ├── txt.py └── xls.py └── tests ├── __init__.py ├── data └── xls_reader_test.xls ├── parse └── test_name.py ├── tabular ├── __init__.py ├── test_base.py ├── test_gdocs.py ├── test_json.py ├── test_misc.py └── test_txt.py ├── test_cache.py ├── test_date.py ├── test_id.py ├── test_misc.py └── test_xls.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .idea/* 3 | *.pyc 4 | docs/build/* -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.egg-info/* 3 | *.pyc 4 | *.swp 5 | *.swo 6 | sandbox/* 7 | 8 | syntax: regexp 9 | ^build$ 10 | ^pyenv$ 11 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 3e61713892d3525675712a96fbcbc439837151d0 
0.1 2 | 5d28eda958146bb213aee67ef89bc04ec5a1e06e 0.2 3 | 99c63b2a432dbfe32f7a9359d3cb8076412aa164 0.3 4 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Swiss Army Knife for Data Work. 2 | 3 | For details read the main package docstring. 4 | 5 | Open source software licensed under the MIT license. 6 | 7 | ## Install 8 | 9 | 1. Install setuptools 10 | 11 | 2. Either install directy from PyPI usinging easy_install: 12 | 13 | $ easy_install datautil 14 | 15 | OR install from the source obtainable from the mercurial repository: 16 | 17 | $ hg clone https://github.com/okfn/datautil 18 | 19 | ## Tests 20 | 21 | 1. Ensure you also have install 'xlrd' and 'gdata' (options mentioned 22 | in setup.py) and nose (for running tests): 23 | 24 | $ easy_install nose xlrd gdata 25 | 26 | 2. Run the tests: 27 | 28 | $ nosetests datautil/tests/ 29 | -------------------------------------------------------------------------------- /datautil/__init__.py: -------------------------------------------------------------------------------- 1 | '''Utilities for Data Work 2 | ======================= 3 | 4 | The datautil package provides various utilities for working with data: 5 | 6 | * cache: Url caching and scraping 7 | * tabular/*: Processing and transforming tabular data to and from various 8 | formats including csv, json, google spreadsheets, xls 9 | * misc, date: Cleaning up and parsing data especially dates. 10 | * id: ID generation and shortenening 11 | * clitools.py: Command line tools such as creating optparse object and usage 12 | from a module of object. 13 | * deliveranceproxy.py: Deliverance proxy helper 14 | 15 | 16 | CHANGELOG 17 | ========= 18 | 19 | v0.5 2011-??-?? 
20 | --------------- 21 | 22 | * Minor improvements to cache 23 | 24 | v0.4 2011-01-05 25 | --------------- 26 | 27 | * Rename swiss to datautil 28 | 29 | v0.3 2010-08-01 30 | --------------- 31 | 32 | * Support for google docs spreadsheets as sources for TabularData 33 | * Improve documentation of date module and add FlexiDate.as_datetime() 34 | * New clitools module incorporating existing cli tools 35 | * deliveranceproxy.py: Deliverance proxy helper for proxying to remote 36 | websites and retheming with deliverance. 37 | * parse/name.py: new (human) name parsing code 38 | 39 | v0.2 2009-10-23 40 | --------------- 41 | 42 | * Extensive refactoring of tabular module/package 43 | * Standardized interface with BaseReader and BaseWriter 44 | * JsonReader and JsonWriter providing json reading and writing 45 | * TxtWriter to support writing to plain text 46 | * Improvements to date parsing (support for circa, 'c.', etc) 47 | * New id module to do 'compression' of uuids using 32 and 64 bit encoding 48 | 49 | 50 | v0.1 2009-06-03 51 | --------------- 52 | 53 | * Bring together existing code (from last 2+ years) into new 'datautil' package 54 | * Url caching and scraping 55 | * Tabular data handling including csv reader/writer, xls reader, latex writer 56 | and associated utilities (such as pivot_table) 57 | * Cleaning and parsing data especially dates (misc and date modules) 58 | ''' 59 | __version__ = '0.4' 60 | 61 | try: 62 | import tabular 63 | except ImportError: 64 | tabular = None 65 | from cache import * 66 | from misc import * 67 | from id import * 68 | -------------------------------------------------------------------------------- /datautil/cache.py: -------------------------------------------------------------------------------- 1 | '''A local file cache with url retrieving builtin. 
2 | 3 | NB: this module has zero dependencies on modules outside of the 4 | standard lib so that it is easily reusable in other libraries and applications 5 | that do not require any other parts of the datautil package. 6 | ''' 7 | import urlparse 8 | import urllib 9 | import os 10 | import sys 11 | 12 | 13 | # have to define before Cache as used in classmethod 14 | class _Progress(object): 15 | def __init__(self): 16 | self.count = -1 17 | 18 | def dl_progress(self, count, block_size, total_size): 19 | if total_size == 0: # total_size is weird so return to avoid errors 20 | return 21 | if self.count == -1: 22 | print 'Total size: %s' % self.format_size(total_size) 23 | last_percent = int(self.count*block_size*100/total_size) 24 | percent = int(count*block_size*100/total_size) 25 | if percent > last_percent: 26 | # TODO: is this acceptable? Do we want to do something nicer? 27 | sys.stdout.write('.') 28 | sys.stdout.flush() 29 | self.count = count 30 | 31 | def format_size(self, bytes): 32 | if bytes > 1000*1000: 33 | return '%.1fMb' % (bytes/1000.0/1000) 34 | elif bytes > 10*1000: 35 | return '%iKb' % (bytes/1000) 36 | elif bytes > 1000: 37 | return '%.1fKb' % (bytes/1000.0) 38 | else: 39 | return '%ibytes' % bytes 40 | 41 | 42 | class Cache(object): 43 | '''A local file cache (and url retriever). 44 | ''' 45 | 46 | def __init__(self, path='.'): 47 | ''' 48 | @param path: path to cache (defaults to current directory) 49 | ''' 50 | self.path = path 51 | if not os.path.exists(self.path): 52 | os.makedirs(path) 53 | 54 | def retrieve(self, url, overwrite=False): 55 | '''Retrieve url into cache and return the local path to it. 56 | 57 | :param url: url to retrieve. 58 | :return: path to file retrieved. 
59 | ''' 60 | dest = self.cache_path(url) 61 | self.download(url, dest, overwrite) 62 | return dest 63 | 64 | def cache_path(self, url): 65 | '''Local path for url within cache.''' 66 | name = self.basename(url) 67 | dest = os.path.join(self.path, name) 68 | return dest 69 | 70 | def filepath(self, url): 71 | '''Deprecated: use cache_path''' 72 | return self.cache_path(url) 73 | 74 | def stream(self, url): 75 | fp = self.cache_path(url) 76 | if not os.path.exists(fp): 77 | return None 78 | else: 79 | return open(fp) 80 | 81 | @classmethod 82 | def basename(self, url): 83 | scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) 84 | result = path.split('/')[-1] 85 | if query: 86 | # escape '/' as otherwise path problems 87 | result += '?' + query.replace('/', '%47') 88 | return result 89 | 90 | @classmethod 91 | def download(self, url, dest, overwrite=False): 92 | '''Download a file from a url. 93 | 94 | :param url: the source url 95 | :param dest: the destination path to save to. 96 | :param overwrite: overwrite destination file if it exists (defaults to 97 | False). 
98 | ''' 99 | url = url.encode('utf-8') 100 | if not os.path.exists(dest) or overwrite: 101 | print 'Retrieving %s' % url 102 | prog = _Progress() 103 | urllib.urlretrieve(url, dest, reporthook=prog.dl_progress) 104 | else: 105 | print 'Skipping download as dest already exists: %s' % url 106 | 107 | # for backwards compatability 108 | @classmethod 109 | def dl(self, url, dest=None): 110 | return self.download(url, dest) 111 | 112 | -------------------------------------------------------------------------------- /datautil/cli.py.command: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import optparse 4 | import logging 5 | from StringIO import StringIO 6 | import traceback 7 | import time 8 | 9 | parser = optparse.OptionParser() 10 | 11 | parser.add_option( 12 | '-v', '--verbose', 13 | dest='verbose', 14 | action='count', 15 | default=0, 16 | help='Give more output') 17 | parser.add_option( 18 | '-q', '--quiet', 19 | dest='quiet', 20 | action='count', 21 | default=0, 22 | help='Give less output') 23 | 24 | class Command(object): 25 | name = None 26 | usage = None 27 | default_parser = None 28 | all_commands = [] 29 | 30 | def __init__(self): 31 | assert self.name 32 | self.parser = optparse.OptionParser( 33 | usage=self.usage, 34 | prog='%s %s' % (sys.argv[0], self.name), 35 | version=parser.version) 36 | for option in self.default_parser.option_list: 37 | if not option.dest: 38 | # -h, --version, etc 39 | continue 40 | self.parser.add_option(option) 41 | Command.all_commands[self.name] = self 42 | 43 | def merge_options(self, initial_options, options): 44 | for attr in ['log']: 45 | setattr(options, attr, getattr(initial_options, attr) or getattr(options, attr)) 46 | options.quiet += initial_options.quiet 47 | options.verbose += initial_options.verbose 48 | 49 | def main(self, complete_args, args, initial_options): 50 | options = initial_options 51 | discarded_options, args = 
self.parser.parse_args(args) 52 | # From pip but not needed by us I think 53 | # self.merge_options(initial_options, options) 54 | self.options = options 55 | self.verbose = options.verbose 56 | 57 | level = 1 58 | level += options.verbose 59 | level -= options.quiet 60 | complete_log = [] 61 | if options.log: 62 | log_fp = open_logfile_append(options.log) 63 | logger.consumers.append((logger.DEBUG, log_fp)) 64 | else: 65 | log_fp = None 66 | 67 | exit = 0 68 | try: 69 | self.run(options, args) 70 | except: 71 | logger.fatal('Exception:\n%s' % format_exc()) 72 | exit = 2 73 | 74 | if log_fp is not None: 75 | log_fp.close() 76 | if exit: 77 | log_fn = 'datapkg-log.txt' 78 | text = '\n'.join(complete_log) 79 | # Not sure we need to tell people ... 80 | # logger.fatal('Storing complete log in %s' % log_fn) 81 | log_fp = open_logfile_append(log_fn) 82 | log_fp.write(text) 83 | log_fp.close() 84 | sys.exit(exit) 85 | -------------------------------------------------------------------------------- /datautil/clitools.py: -------------------------------------------------------------------------------- 1 | '''Expose methods or functions as commands on the command line 2 | 3 | Example usage:: 4 | 5 | # in your code 6 | from datautil.clitools import _main 7 | if __name__ == '__main__': 8 | # expose everything in current module 9 | _main(locals()) 10 | # or if you have an object MyObject with methods you want to expose 11 | _main(MyObject) 12 | ''' 13 | import os 14 | import sys 15 | import optparse 16 | import inspect 17 | 18 | def _object_methods(obj): 19 | methods = inspect.getmembers(obj, inspect.ismethod) 20 | methods = filter(lambda (name,y): not name.startswith('_'), methods) 21 | methods = dict(methods) 22 | return methods 23 | 24 | def _module_functions(functions): 25 | local_functions = dict(functions) 26 | for k,v in local_functions.items(): 27 | if not inspect.isfunction(v) or k.startswith('_'): 28 | del local_functions[k] 29 | return local_functions 30 | 31 | def 
_main(functions_or_object): 32 | isobject = inspect.isclass(functions_or_object) 33 | if isobject: 34 | _methods = _object_methods(functions_or_object) 35 | else: 36 | _methods = _module_functions(functions_or_object) 37 | 38 | usage = '''%prog {action} 39 | 40 | Actions: 41 | ''' 42 | usage += '\n '.join( 43 | [ '%s: %s' % (name, m.__doc__.split('\n')[0] if m.__doc__ else '') for (name,m) 44 | in sorted(_methods.items()) ]) 45 | parser = optparse.OptionParser(usage) 46 | # Optional: for a config file 47 | # parser.add_option('-c', '--config', dest='config', 48 | # help='Config file to use.') 49 | options, args = parser.parse_args() 50 | 51 | if not args or not args[0] in _methods: 52 | parser.print_help() 53 | sys.exit(1) 54 | 55 | method = args[0] 56 | if isobject: 57 | getattr(functions_or_object(), method)(*args[1:]) 58 | else: 59 | _methods[method](*args[1:]) 60 | 61 | __all__ = [ '_main' ] 62 | 63 | if __name__ == '__main__': 64 | _main(locals()) 65 | 66 | -------------------------------------------------------------------------------- /datautil/date.py: -------------------------------------------------------------------------------- 1 | """ 2 | Date parsing and normalization utilities based on FlexiDate. 3 | 4 | To parse dates use parse(), e.g.:: 5 | 6 | from datautil.date import parse 7 | 8 | parse('1890') -> FlexiDate(year=u'1890') 9 | parse('1890?') -> FlexiDate(year=u'1890', qualifier='Uncertainty: 1985?') 10 | 11 | Once you have a FlexiDate you can get access to attributes (strings of course 12 | ...):: 13 | 14 | fd = parse('Jan 1890') 15 | fd.year # u'1890' 16 | fd.month # u'01' 17 | 18 | And convert to other forms:: 19 | 20 | fd.as_float() # 1890 21 | fd.as_datetime() # datetime(1890,01,01) 22 | 23 | Background 24 | ========== 25 | 26 | FlexiDate is focused on supporting: 27 | 28 | 1. Dates outside of Python (or DB) supported period (esp. dates < 0 AD) 29 | 2. Imprecise dates (c.1860, 18??, fl. 1534, etc) 30 | 3. 
import re
import datetime


class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
        * Allow a trailing qualifiers e.g. fl.
        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
          can do 18??

    Restriction on ISO8601:
        * Truncation (e.g. of centuries) is *not* permitted.
        * No week and day representation e.g. 1999-W01
    """

    def __init__(self, year=None, month=None, day=None, qualifier=''):
        # force = month or day or qualifier
        force = False
        self.year = self._cvt(year, rjust=4, force=force)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        '''Normalize one date component to a zero-padded text string
        (e.g. 1 -> '01', -50 -> '-0050' with rjust=4).'''
        if val:
            # text coercion that works on both Python 2 and 3
            # (was unicode(val), which is a NameError on Python 3)
            tmp = (u'%s' % val).strip()
            if tmp.startswith('-'):
                # keep the sign, pad the digits only
                tmp = '-' + tmp[1:].rjust(rjust, '0')
            else:
                tmp = tmp.rjust(rjust, '0')
            return tmp
        elif force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        else:
            return ''

    def __str__(self):
        out = self.isoformat()
        if self.qualifier:
            # leading space is important as ensures when no year sort in right
            # order as ' ' < '1'
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        WARNING: does not replace '?' in dates unless strict=True.
        '''
        out = self.year
        # what do we do when no year ...
        for val in [self.month, self.day]:
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # NOTE: named groups reconstructed (angle brackets were stripped from the
    # original source); names match the out.group(...) calls in from_str.
    our_re_pat = r'''
        (?P<year> -?[\d?]+)
        (?:
                \s* - (?P<month> [\d?]{1,2})
            (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier>[^]]*) \])?
        '''
    our_re = re.compile(our_re_pat, re.VERBOSE)

    @classmethod
    def from_str(cls, instr):
        '''Undo affect of __str__'''
        if not instr:
            return FlexiDate()

        out = cls.our_re.match(instr)
        if out is None:  # no match TODO: raise Exception?
            return None
        else:
            return FlexiDate(
                out.group('year'),
                out.group('month'),
                out.group('day'),
                qualifier=out.group('qualifier')
            )

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0

        @return: float.
        '''
        if not self.year:
            return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
        if self.day:
            out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)


def parse(date, dayfirst=True):
    '''Parse a `date` into a `FlexiDate`.

    @param date: the date to parse - may be a string, datetime.date,
    datetime.datetime or FlexiDate.
    @param dayfirst: prefer day-first reading of ambiguous numeric dates
    such as 01/02/2010 (passed through to dateutil).

    TODO: support for quarters e.g. Q4 1980 or 1954 Q3
    TODO: support latin stuff like M.DCC.LIII
    TODO: convert '-' to '?' when used that way
          e.g. had this date [181-]
    '''
    if not date:
        return None
    if isinstance(date, FlexiDate):
        return date
    if isinstance(date, int):
        return FlexiDate(year=date)
    elif isinstance(date, datetime.date):
        parser = PythonDateParser()
        return parser.parse(date)
    else:  # assuming its a string
        parser = DateutilDateParser()
        out = parser.parse(date, **{'dayfirst': dayfirst})
        if out is not None:
            return out
        # msg = 'Unable to parse %s' % date
        # raise ValueError(date)
        val = 'UNPARSED: %s' % date
        val = val.encode('ascii', 'ignore')
        if not isinstance(val, str):  # Python 3: encode gave bytes, go back to text
            val = val.decode('ascii')
        return FlexiDate(qualifier=val)
class DateParserBase(object):
    '''Interface for date parsers that turn a value into a FlexiDate.'''

    def parse(self, date):
        '''Parse `date` and return a FlexiDate (or None on failure).'''
        raise NotImplementedError

    def norm(self, date):
        '''Parse `date` and return its normalized string form.'''
        return str(self.parse(date))


class PythonDateParser(DateParserBase):
    '''Parser for objects already carrying year/month/day attributes
    (datetime.date / datetime.datetime).

    Now subclasses DateParserBase (was a plain object) so it shares the
    norm() helper and the common interface.
    '''

    def parse(self, date):
        return FlexiDate(date.year, date.month, date.day)


try:
    import dateutil.parser
    dateutil_parser = dateutil.parser.parser()
except Exception:  # dateutil is optional; string parsing degrades gracefully
    dateutil_parser = None


class DateutilDateParser(DateParserBase):
    '''String date parser built on dateutil, with extra handling for BC
    dates, 'circa', uncertainty ('1985?') and 2-digit years.'''

    _numeric = re.compile(r"^[0-9]+$")

    def parse(self, date, **kwargs):
        '''
        :param **kwargs: any kwargs accepted by dateutil.parse function.
        '''
        qualifiers = []
        if dateutil_parser is None:
            return None
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates ('B.C.' has already been normalized to
        # 'BC' above so testing 'BC' covers both spellings - the original
        # also tested 'B.C.' here, which was dead code)
        if date.startswith('-') or 'BC' in date:
            pre0AD = True
        else:
            pre0AD = False
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: expressed as [c|ca|cca|circ|circa] with or without an appended period
        # and with or without a space, followed by a date
        # 'c.1950' or 'c1950' 'ca. 1980' 'circ 198?' 'cca. 1980' 'c 1029' 'circa 1960' etc.
        # see http://en.wikipedia.org/wiki/Circa
        # TODO: dates like 'circa 178?' and 'circa 178-' fail poorly
        # 'UNPARSED: circa 178?' / u"Note 'circa' : circa 178-"

        # note that the match deliberately does not capture the circa text match
        # this is done to remove circa bit below
        # circa_match = re.match('([^a-zA-Z]*)c\.?\s*(\d+.*)', date)

        # use non-matching groups (?:) to avoid refactoring the rest of the parsing
        circa_match = re.match(
            r'([^a-zA-Z]*)(?:circa|circ\.?|cca\.?|ca\.?|c\.?)(?:\s*?)([\d\?-]+\s?\?*)',
            date)

        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            # date = ''.join(circa_match.groups())
            # if an element in circa_match.groups() is None, an exception is thrown
            # so instead join the match groups from circa_match that are not none
            date = ''.join(list(el for el in circa_match.groups() if el))

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records
        p_match = re.match(r"^p(\d+)", date)
        if p_match:
            date = date[1:]

        # Deal with uncertainty: '1985?'
        uncertainty_match = re.match(r'([0-9xX]{4})\?', date)
        if uncertainty_match:
            # remove the ?
            date = date[:-1]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        # Note: Years of less than 3 digits not interpreted by
        # dateutil correctly
        # e.g. 87 -> 1987
        #       4 -> day 4 (no year)
        # Both cases are handled in this routine
        if res.year is None and res.day:
            year = res.day
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff
        if pre0AD:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
20 | # or 21 | # my_deliverance_rules = open('/my/path/to/rules.xml').read() 22 | deliverance_proxy = create_deliverance_proxy(mytheme, dest, 23 | my_deliverance_rules) 24 | 25 | # from in wsgi app 26 | # path on remote destination url you want to proxy to ... 27 | # you can omit this if local path and remote path are the same 28 | environ['PATH_INFO'] = '/my_destination_path' 29 | deliverance_proxy(environ, start_response) 30 | ''' 31 | import logging 32 | 33 | import paste.urlmap 34 | import deliverance.middleware 35 | import paste.proxy 36 | from webob import Request, Response 37 | from deliverance.middleware import DeliveranceMiddleware, SubrequestRuleGetter 38 | from deliverance.log import PrintingLogger 39 | 40 | 41 | default_deliverance_rules = \ 42 | ''' 43 | 44 | 45 | 47 | 48 | 49 | 50 | 51 | 54 | 55 | 56 | ''' 57 | 58 | def create_deliverance_proxy(proxy_base_url, theme_html, rules_xml=None): 59 | '''Proxy to another url with re-theming using deliverance. 60 | 61 | Based on http://rufuspollock.org/code/deliverance 62 | 63 | :param proxy_base_url: base destination url we are proxying to. 64 | :param theme_html: string providing html theme to use for re-themeing. 65 | :param rules_xml: (optional) deliverance rules xml as a string. If not 66 | provided use `default_deliverance_rules`. For info on rulesets see 67 | deliverance docs. We require that ruleset support a single 68 | substitution string '%s' which is used to insert internal mountpoint 69 | for the them ('/_deliverance_theme.html'). 
import base64
import datetime
import struct
import uuid

# Python 2/3 compatibility for string-type checks (was `basestring`)
try:  # Python 2
    string_types = basestring
except NameError:  # Python 3
    string_types = str


def compress_uuid(_uuid):
    '''Provided shortened string representation of UUID via base64 encoding.

    @param _uuid: a uuid.UUID instance or its 36-char string form.
    @return: 22 character base64 encoded version of UUID.
    '''
    if isinstance(_uuid, string_types):
        _uuid = uuid.UUID(_uuid)
    encoded = base64.b64encode(_uuid.bytes, b'_-')
    if not isinstance(encoded, str):  # Python 3 returns bytes
        encoded = encoded.decode('ascii')
    # throw away trailing ==
    return encoded[:22]


def uncompress_uuid(b64_encoded):
    '''Reverse compress_uuid

    @return: 36 char str representation of uuid.
    '''
    b64_encoded = str(b64_encoded)
    if not b64_encoded.endswith('=='):
        b64_encoded += '=='
    out = base64.b64decode(b64_encoded, b'_-')
    _uuid = uuid.UUID(bytes=out)
    return str(_uuid)


def int_to_b32(int_):
    '''Encode a 32-bit int as a 7-char base32 string (trailing '=' dropped).'''
    packed = struct.pack('1i', int_)
    encoded = base64.b32encode(packed)
    if not isinstance(encoded, str):  # Python 3 returns bytes
        encoded = encoded.decode('ascii')
    # throw away trailing '='
    return encoded[:-1]


def b32_to_int(b32):
    '''Reverse int_to_b32.'''
    out = base64.b32decode(b32 + '=', casefold=True)
    return struct.unpack('1i', out)[0]


# TODO: create a strict option where None is returned on failed convert rather
# than original value
placeholders = ['', '-', '#']


def floatify(value):
    '''Convert value to a float if possible.

    @return: Floatified value. If value is blank or placeholder ('-') return
    None. Can deal with ',' in value. Will also floatify dates. If nothing
    works returns original value.
    '''
    if value is None:
        return None
    if isinstance(value, string_types):
        stripped = value.strip()
        if not stripped or stripped in placeholders:
            return None
        else:
            # often numbers have commas in them like 1,030
            v = value.replace(',', '')
            try:
                return float(v)
            except (TypeError, ValueError):
                pass
    # will return original value if fails
    return date_to_float(value)


def floatify_matrix(matrix):
    '''Apply floatify to every cell of a matrix (list of rows).'''
    return [[floatify(col) for col in row] for row in matrix]


# TODO: remove/convert to using date.FlexiDate.as_float()
def date_to_float(date):
    '''Convert a date to float.

    Accepts either a date object or a string parseable to a date object

    @return: converted value or original if conversion fails
    '''
    if isinstance(date, string_types):
        try:  # simple year
            return float(date)
        except (TypeError, ValueError):
            pass
        # dateutil imported lazily so the simple-year and date-object paths
        # work even without dateutil installed (it was imported eagerly
        # before, breaking all calls when absent)
        try:
            import dateutil.parser
            val = dateutil.parser.parse(date, default=datetime.date(1, 1, 1))
        except Exception:
            return date
    else:
        val = date

    if isinstance(val, datetime.date):
        fval = val.year + val.month / 12.0 + val.day / 365.0
        return round(fval, 3)
    else:
        return val


def make_series(matrix, xcol, ycols=None):
    '''Take a matrix and return series (i.e. list of tuples) corresponding to
    specified column indices.

    E.g. if matrix is:
        [ [1,2,3,4]
          [5,6,7,8] ]

    and xcol = 0, ycols=[1,3] then output is:

        [
          [ [1,2], [5,6] ],
          [ [1,4], [5,8] ],
        ]

    If ycols not defined then return all possible series (excluding xcol
    with itself).
    '''
    # materialize: Python 3 zip is lazy and we index/len it below
    cols = list(zip(*matrix))
    if ycols is None:
        # every column except the x column
        ycols = [ii for ii in range(len(cols)) if ii != xcol]
    cols = floatify_matrix(cols)

    def is_good(value):
        # filter out None and placeholder cells
        if value is None:
            return False
        tv = str(value)
        stopchars = ['', '-']
        if tv in stopchars:
            return False
        return True

    def is_good_tuple(pair):
        return is_good(pair[0]) and is_good(pair[1])

    xcoldata = cols[xcol]
    ydata = [cols[ii] for ii in ycols]
    series = [[pair for pair in zip(xcoldata, col) if is_good_tuple(pair)]
              for col in ydata]
    return series
76 | ''' 77 | cols = zip(*matrix) 78 | if ycols is None: 79 | ycols = range(len(cols)) 80 | del ycols[xcol] 81 | cols = floatify_matrix(cols) 82 | def is_good(value): 83 | if value is None: return False 84 | tv = str(value) 85 | stopchars = [ '', '-' ] 86 | if tv in stopchars: 87 | return False 88 | return True 89 | def is_good_tuple(tuple): 90 | return is_good(tuple[0]) and is_good(tuple[1]) 91 | 92 | xcoldata = cols[xcol] 93 | ycols = [ cols[ii] for ii in ycols ] 94 | series = [ filter(is_good_tuple, zip(xcoldata, col)) for col in ycols ] 95 | return series 96 | 97 | -------------------------------------------------------------------------------- /datautil/normalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/normalization/__init__.py -------------------------------------------------------------------------------- /datautil/normalization/table_based.py: -------------------------------------------------------------------------------- 1 | import gdata.spreadsheet.text_db 2 | 3 | def _transform_key(key): 4 | return key.lower().strip() 5 | 6 | class Normalizer(object): 7 | 8 | def __init__(self, username, password, doc_id, sheet, key_row): 9 | self.client = gdata.spreadsheet.text_db.DatabaseClient( 10 | username=username, password=password) 11 | self._get_table(doc_id, sheet) 12 | self.key_row = key_row 13 | self._records = None 14 | 15 | @property 16 | def records(self): 17 | if self._records is None: 18 | self._records = [r.content for r in self.table.FindRecords('')] 19 | return self._records 20 | 21 | def _get_table(self, doc_id, sheet): 22 | db = self.client.GetDatabases(doc_id)[0] 23 | self.table = db.GetTables(name=sheet)[0] 24 | self.table.LookupFields() 25 | 26 | def keys(self): 27 | return set([r.get(self.key_row) for r in self.records \ 28 | if r.get(self.key_row) is not None]) 29 | 30 | def 
__contains__(self, item): 31 | return item in self.keys() 32 | 33 | def get(self, key, source_hint=None): 34 | if key is None: 35 | return {} 36 | record = self.lookup(key) 37 | if record: 38 | return record 39 | return self.add(_transform_key(key), source_hint).content 40 | 41 | def lookup(self, key): 42 | if key is None: 43 | return {} 44 | local_key = _transform_key(unicode(key)) 45 | for record in self.records: 46 | # TODO #1: figure out FindRecords syntax 47 | # TODO #2: fuzzy matching for longer keys 48 | if record.get(self.key_row) == local_key: 49 | return record 50 | 51 | 52 | def add(self, value, source_hint): 53 | fields = self.table.fields 54 | row = dict(zip(fields, [None] * len(fields))) 55 | row[self.key_row] = value 56 | if source_hint is not None: 57 | row['source'] = source_hint 58 | self._records.append(row) 59 | return self.table.AddRecord(row) 60 | 61 | class NormalizerJoin(object): 62 | 63 | def __init__(self, first, second): 64 | self.first = first 65 | self.second = second 66 | 67 | def get(self, key, source_hint=None): 68 | if key in self.second: 69 | return self.second.get(key) 70 | data = self.first.get(key, source_hint=source_hint) 71 | if self.second.key_row in data: 72 | data.update(self.second.get(data.get(self.second.key_row))) 73 | return data 74 | 75 | def Licenses(username, password): 76 | doc_id = 'thlRT-WO0EVweyjiwtYLslA' 77 | first = Normalizer(username, password, doc_id, 'Forms', 'original') 78 | second = Normalizer(username, password, doc_id, 'Licenses', 'code') 79 | return NormalizerJoin(first, second) 80 | 81 | def Formats(username, password): 82 | doc_id = 'tO-VTk7QwloOt0EP3YpCC4A' 83 | first = Normalizer(username, password, doc_id, 'Forms', 'original') 84 | second = Normalizer(username, password, doc_id, 'Formats', 'mimetype') 85 | return NormalizerJoin(first, second) 86 | 87 | 88 | -------------------------------------------------------------------------------- /datautil/normalization/text.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | import string 4 | 5 | KILL_DASHES = re.compile("\\-+") 6 | 7 | def compose(text): 8 | return unicodedata.normalize('NFKC', text) 9 | 10 | def decompose(text): 11 | return unicodedata.normalize('NFKD', text) 12 | 13 | def recompose(text): 14 | return compose(decompose(text)) 15 | 16 | def url_slug(text): 17 | """ Convert arbitrary text to something that can be a url slug. """ 18 | out = [] 19 | for c in decompose(text): 20 | cat = unicodedata.category(c)[0].upper() 21 | if cat == 'Z': 22 | out.append('-') 23 | if c in string.ascii_letters or c in string.digits: 24 | out.append(c) 25 | if c in ['-', '.', '+', '_']: 26 | out.append(c) 27 | text = u"".join(out).lower() 28 | return KILL_DASHES.sub('-', text) 29 | 30 | 31 | -------------------------------------------------------------------------------- /datautil/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/parse/__init__.py -------------------------------------------------------------------------------- /datautil/parse/name.py: -------------------------------------------------------------------------------- 1 | '''Parse names of people into a standard format.''' 2 | 3 | import re 4 | 5 | titles = [ 6 | u'Ayatollah', 7 | u'Baron', 8 | u'Bishop', 9 | u'Dame', 10 | u'Dr', 11 | u'Fr', 12 | u'Graf', 13 | u'King', 14 | u'Lady', 15 | u'Maj', 16 | u'Major', 17 | u'Mrs', 18 | u'Prof', 19 | u'Rev', 20 | u'Sir', 21 | u'St', 22 | ] 23 | 24 | class Name(object): 25 | '''A name of a person or entity. 26 | 27 | Not a domain object but a convenient way to handle/parse names. 
28 | 29 | Attributes: 30 | title 31 | ln: last name 32 | firstnames: first names as list 33 | ''' 34 | def __init__(self, ln='', fns=None, title=''): 35 | self.ln = ln 36 | self.fns = fns 37 | if self.fns is None: self.fns = [] 38 | self.title = title 39 | 40 | def norm(self): 41 | '''Return normalised name string (LastFirst format) 42 | ''' 43 | return name_tostr(self) 44 | 45 | def __str__(self): 46 | '''Display name using normalised format 47 | ''' 48 | return self.norm() 49 | 50 | class NameParserBase(object): 51 | regex_remove_fullstops = re.compile(r'(\w{2,})\.(\W|$)', re.UNICODE) 52 | 53 | def parse(self, fullname): 54 | '''Parse the `fullname` string into a `Name` object. 55 | 56 | @return: `Name` object for `fullname` 57 | ''' 58 | if fullname is None: 59 | return Name() 60 | fullname = unicode(fullname.strip()) 61 | if not fullname: 62 | return Name() 63 | 64 | # remove words ending '.', e.g. 'Bosch.' 65 | fullname = self.regex_remove_fullstops.sub(r'\1\2', fullname) 66 | 67 | # make sure initials are separted by ' ' 68 | # but first deal with special edge case like [Major.] 69 | # fullname = fullname.replace('.]', ']') 70 | fullname = fullname.replace('.', '. ') 71 | name = self._toparts(fullname) 72 | name.ln = self.normcase(name.ln) 73 | name.fns = [ self.normcase(x) for x in name.fns ] 74 | name.title = self.normcase(name.title) 75 | return name 76 | 77 | def _toparts(self, fullname): 78 | '''Implement in inheriting classes, called by parse. 79 | ''' 80 | raise NotImplementedError() 81 | 82 | def tostr(self, name): 83 | '''Convert name object back into a string. 84 | ''' 85 | raise NotImplementedError() 86 | 87 | def normcase(self, name): 88 | # useful to handle none and you often get this from regexes 89 | if name is None: 90 | return '' 91 | name = name.strip() 92 | if name.upper() == name or name.lower() == name: 93 | return name.capitalize() 94 | # avoid issues with e.g. 
McTaggart 95 | else: 96 | return name 97 | 98 | def untitlize(self, _str): 99 | '''Return title contained in _str if a title else return empty string. 100 | ''' 101 | title = _str.strip() 102 | title = _str.strip('()') 103 | if title in titles: 104 | return title 105 | # always assume something in square brackets is a title 106 | elif title.startswith('[') and title.endswith(']'): 107 | return title[1:-1].strip() 108 | else: 109 | return '' 110 | 111 | def titlize(self, _str): 112 | return u'[' + _str + u']' 113 | 114 | def norm(self, date): 115 | return str(self.parse(date)) 116 | 117 | 118 | class LastFirst(NameParserBase): 119 | '''Parse and creates names of form: 120 | 121 | lastname, first-names-in-order [title] 122 | ''' 123 | def _toparts(self, fullname): 124 | if ',' not in fullname and ' ' in fullname: 125 | raise ValueError('Expected "," in name: %s' % fullname) 126 | name = Name() 127 | # NB: if more than 2 commas just ignore stuff after 2nd one 128 | parts = fullname.split(',') 129 | name.ln = parts[0] 130 | name.fns = parts[1].strip().split() 131 | if name.fns: 132 | title = self.untitlize(name.fns[-1]) 133 | if title: 134 | name.title = title 135 | del name.fns[-1] 136 | return name 137 | 138 | def tostr(self, name): 139 | if name.ln or name.fns: 140 | fns = ' '.join(name.fns) 141 | if not fns: 142 | out = name.ln 143 | else: 144 | out = unicode(', '.join((name.ln, ' '.join(name.fns)))) 145 | else: 146 | return '' 147 | if name.title: 148 | out = out + u' [%s]' % name.title 149 | return out 150 | 151 | 152 | class FirstLast(NameParserBase): 153 | '''Parse and create names of form: 154 | 155 | [title] first-names last-name 156 | ''' 157 | def _toparts(self, fullname): 158 | name = Name() 159 | if ',' in fullname: 160 | raise ValueError('Should not have "," in FirstLast type name: %s' % 161 | fullname) 162 | parts = fullname.split() 163 | name.ln = parts[-1] 164 | name.fns = parts[:-1] 165 | if name.fns: 166 | title = self.untitlize(name.fns[0]) 167 | 
if title: 168 | name.title = title 169 | del name.fns[0] 170 | return name 171 | 172 | def tostr(self, name): 173 | if name.fns or name.ln: 174 | out = u' '.join(name.fns) + ' ' + name.ln 175 | else: 176 | return '' 177 | if name.title: 178 | out = u'[%s]' % name.title + out 179 | return out 180 | 181 | 182 | def parse_name(fullname): 183 | if ',' in fullname: 184 | parser = LastFirst() 185 | else: 186 | parser = FirstLast() 187 | return parser.parse(fullname) 188 | 189 | def name_tostr(name, parser_class=LastFirst): 190 | parser = parser_class() 191 | return parser.tostr(name) 192 | 193 | def normalize(name_str, parser_class=LastFirst): 194 | name = parse_name(name_str) 195 | return name_tostr(name, parser_class) 196 | 197 | 198 | -------------------------------------------------------------------------------- /datautil/scrape.py: -------------------------------------------------------------------------------- 1 | # taken from http://effbot.org/zone/re-sub.htm#unescape-html 2 | import re, htmlentitydefs 3 | 4 | ## 5 | # Removes HTML or XML character references and entities from a text string. 6 | # 7 | # @param text The HTML (or XML) source text. 8 | # @return The plain text, as a Unicode string, if necessary. 
9 | 10 | def unescape(text): 11 | def fixup(m): 12 | text = m.group(0) 13 | if text[:2] == "&#": 14 | # character reference 15 | try: 16 | if text[:3] == "&#x": 17 | return unichr(int(text[3:-1], 16)) 18 | else: 19 | return unichr(int(text[2:-1])) 20 | except ValueError: 21 | pass 22 | else: 23 | # named entity 24 | try: 25 | text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) 26 | except KeyError: 27 | pass 28 | return text # leave as is 29 | return re.sub("&#?\w+;", fixup, text) 30 | -------------------------------------------------------------------------------- /datautil/tabular/__init__.py: -------------------------------------------------------------------------------- 1 | from base import * 2 | from misc import * 3 | from xls import XlsReader 4 | from html import * 5 | from tabular_json import JsonReader, JsonWriter 6 | from txt import TxtWriter 7 | 8 | -------------------------------------------------------------------------------- /datautil/tabular/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for dealing with tabular data 3 | """ 4 | 5 | class TabularData(object): 6 | """Holder for tabular data 7 | 8 | NB: 9 | * Assume data organized in rows. 10 | * No type conversion so all data will be as entered. 11 | 12 | Properties: 13 | * data: data itself provided as array of arrays 14 | * header: associated header columns (if they exist) 15 | 16 | TODO: handling of large datasets (iterators?) 17 | """ 18 | 19 | def __init__(self, data=None, header=None): 20 | """ 21 | Initialize object. If data or header not set they are defaulted to 22 | empty list. 23 | 24 | NB: must use None as default value for arguments rather than [] 25 | because [] is mutable and using it will result in subtle bugs. See: 26 | 'Default parameter values are evaluated when the function definition 27 | is executed.' 
[http://www.python.org/doc/current/ref/function.html] 28 | """ 29 | self.data = [] 30 | self.header = [] 31 | if data is not None: 32 | self.data = data 33 | if header is not None: 34 | self.header = header 35 | 36 | def __repr__(self): 37 | out = [] 38 | if self.header: 39 | out.append(self.header) 40 | # limit to 10 items 41 | out += self.data[0:10] 42 | return repr(out) 43 | 44 | def __str__(self): 45 | return repr(self) 46 | 47 | def __iter__(self): 48 | return self.data.__iter__() 49 | 50 | @classmethod 51 | def from_list(self, list_, header=True): 52 | return TabularData(header=list_[0], data=list_[1:]) 53 | 54 | def to_list(self): 55 | if self.header: 56 | return [ self.header ] + self.data 57 | else: 58 | return self.data 59 | 60 | 61 | class ReaderBase(object): 62 | def __init__(self, filepath_or_fileobj=None, encoding='utf8'): 63 | self.filepath = None 64 | self.fileobj = None 65 | self._filepath_or_fileobj(filepath_or_fileobj) 66 | self.encoding = 'utf8' 67 | 68 | def _filepath_or_fileobj(self, filepath_or_fileobj): 69 | if filepath_or_fileobj is None: # do not overwrite any existing value 70 | pass 71 | elif isinstance(filepath_or_fileobj, basestring): 72 | self.filepath = filepath_or_fileobj 73 | self.fileobj = open(self.filepath) 74 | else: 75 | self.filepath = None 76 | self.fileobj = filepath_or_fileobj 77 | 78 | def read(self, filepath_or_fileobj=None): 79 | self._filepath_or_fileobj(filepath_or_fileobj) 80 | 81 | 82 | class WriterBase(object): 83 | ''' 84 | Extra arguments to write methods: 85 | has_row_headings: first col of each row is a heading. 
86 | ''' 87 | def __init__(self, round_ndigits=None, **kwargs): 88 | ''' 89 | @round_ndigits: number of decimal places to use when rounding numerical 90 | values when textifying for output 91 | ''' 92 | self.round_ndigits = round_ndigits 93 | 94 | def write(self, tabular_data, fileobj, *args, **kwargs): 95 | pass 96 | 97 | def write_str(self, tabular_data, *args, **kwargs): 98 | from StringIO import StringIO 99 | holder = StringIO() 100 | self.write(tabular_data, holder, *args, **kwargs) 101 | holder.seek(0) 102 | return holder.read() 103 | 104 | def value_to_str(self, value): 105 | '''Convert value to text (rounding floats/ints as necessary). 106 | ''' 107 | if value is None: 108 | return '' 109 | if self.round_ndigits is not None and \ 110 | (isinstance(value, int) or isinstance(value, float)): 111 | roundedResult = round(value, self.round_ndigits) 112 | if self.round_ndigits <= 0: # o/w will have in .0 at end 113 | roundedResult = int(roundedResult) 114 | roundedResult = str(roundedResult) 115 | # deal with case when rounding has added unnecessary digits 116 | if len(str(value)) < len(roundedResult): 117 | return str(value) 118 | else: 119 | return roundedResult 120 | else: 121 | return unicode(value) 122 | 123 | 124 | import csv 125 | import codecs 126 | class UTF8Recoder: 127 | """ 128 | Iterator that reads an encoded stream and reencodes the input to UTF-8 129 | 130 | From: 131 | """ 132 | def __init__(self, f, encoding=None): 133 | if encoding: 134 | self.reader = codecs.getreader(encoding)(f) 135 | else: # already unicode so just return f 136 | self.reader = f 137 | 138 | def __iter__(self): 139 | return self 140 | 141 | def next(self): 142 | return self.reader.next().encode('utf-8') 143 | 144 | class CsvReader(ReaderBase): 145 | """Read data from a csv file into a TabularData structure 146 | 147 | Note that the csv module does *not* support unicode: 148 | 149 | > This version of the csv module doesn't support Unicode input. 
Also, there 150 | > are currently some issues regarding ASCII NUL characters. Accordingly, 151 | > all input should be UTF-8 or printable ASCII to be safe; see the examples 152 | > in section 9.1.5. These restrictions will be removed in the future. 153 | > 154 | """ 155 | 156 | def read(self, filepath_or_fileobj=None, encoding=None, **kwargs): 157 | """Read in a csv file and return a TabularData object. 158 | 159 | @param fileobj: file like object. 160 | @param encoding: if set use this instead of default encoding set in 161 | __init__ to decode the file like object. NB: will check if fileobj 162 | already in unicode in which case this is ignored. 163 | @param kwargs: all further kwargs are passed to the underlying `csv.reader` function 164 | @return tabular data object (all values encoded as utf-8). 165 | """ 166 | super(CsvReader, self).read(filepath_or_fileobj) 167 | if encoding: 168 | self.encoding = encoding 169 | tabData = TabularData() 170 | 171 | sample = self.fileobj.read() 172 | # first do a simple test -- maybe sample is already unicode 173 | if type(sample) == unicode: 174 | encoded_fo = UTF8Recoder(self.fileobj, None) 175 | else: 176 | sample = sample.decode(self.encoding) 177 | encoded_fo = UTF8Recoder(self.fileobj, self.encoding) 178 | sample = sample.encode('utf-8') 179 | sniffer = csv.Sniffer() 180 | hasHeader = sniffer.has_header(sample) 181 | 182 | self.fileobj.seek(0) 183 | ourkwargs = { 184 | 'skipinitialspace': True 185 | } 186 | if kwargs: 187 | ourkwargs.update(kwargs) 188 | 189 | reader = csv.reader(encoded_fo, **ourkwargs) 190 | if hasHeader: 191 | tabData.header = reader.next() 192 | for row in reader: 193 | tabData.data.append(row) 194 | return tabData 195 | 196 | # for backwards compatibility 197 | ReaderCsv = CsvReader 198 | 199 | class CsvWriter(WriterBase): 200 | # TODO: unicode support a la CsvReader 201 | def write(self, tabular_data, fileobj, encoding='utf-8'): 202 | writer = csv.writer(fileobj) 203 | if tabular_data.header: 204 | 
writer.writerow(tabular_data.header) 205 | for row in tabular_data.data: 206 | writer.writerow(row) 207 | fileobj.flush() 208 | 209 | 210 | ## -------------------------------- 211 | ## Converting to Latex 212 | 213 | class LatexWriter(WriterBase): 214 | 215 | def write(self, tabular_data, fileobj, has_row_headings=False): 216 | self.has_row_headings = has_row_headings 217 | matrix = tabular_data.data 218 | has_header = len(tabular_data.header) > 0 219 | if has_header: 220 | matrix.insert(0, tabular_data.header) 221 | out = self._write(matrix, has_header) 222 | fileobj.write(out) 223 | 224 | def _write(self, matrix, has_header=True): 225 | if len(matrix) == 0: return 226 | # no hline on first row as this seems to mess up latex \input 227 | # http://groups.google.com/group/comp.text.tex/browse_thread/thread/1e1db553a958ebd8/0e590a22cb59f43d 228 | out = '%s' % self.process_row(matrix[0], has_header) 229 | for row in matrix[1:]: 230 | out += self.process_row(row) 231 | return out 232 | 233 | def process_row(self, row, heading=False): 234 | if len(row) == 0: return 235 | out = '%s' % self.process_cell(row[0], heading or self.has_row_headings) 236 | for cell in row[1:]: 237 | out += ' & %s' % self.process_cell(cell, heading) 238 | out += ' \\\\\n\hline\n' 239 | return out 240 | 241 | def process_cell(self, cell, heading=False): 242 | cell_text = self.value_to_str(cell) 243 | cell_text = self.escape(cell_text) 244 | if heading: 245 | return '\\textbf{%s}' % cell_text 246 | else: 247 | return cell_text 248 | 249 | def escape(self, text): 250 | escape_chars = [ '&', '%' ] 251 | out = text 252 | for ch in escape_chars: 253 | out = out.replace(ch, '\\%s' % ch) 254 | return out 255 | 256 | 257 | # TODO: 2009-08-05 deprecate 258 | def table2latex(matrix, has_header=True, has_row_headings=False): 259 | m2l = LatexWriter() 260 | m2l.has_row_headings = has_row_headings 261 | return m2l._write(matrix, has_header) 262 | 263 | 
-------------------------------------------------------------------------------- /datautil/tabular/gdocs.py: -------------------------------------------------------------------------------- 1 | '''TabularData from a Google Docs Spreadsheet. 2 | ''' 3 | from base import ReaderBase, TabularData 4 | import gdata.spreadsheet.service 5 | import gdata.spreadsheet.text_db 6 | 7 | 8 | class GDocsReaderTextDb(ReaderBase): 9 | '''Read a google docs spreadsheet using the gdata.spreadsheet.text_db 10 | library. 11 | 12 | NB: any blank line in spreadsheet will be taken as terminating data. 13 | ''' 14 | def __init__(self, spreadsheet_id, username=None, password=None, 15 | id_is_name=False): 16 | ''' 17 | @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you 18 | must set id_is_name to True. 19 | ''' 20 | # do not pass spreadsheet_id down as it will be url or sheet name 21 | super(GDocsReaderTextDb, self).__init__() 22 | self.source = spreadsheet_id 23 | self.id_is_name = id_is_name 24 | self.gd_client = gdata.spreadsheet.text_db.DatabaseClient( 25 | username=username, 26 | password=password) 27 | 28 | def load_text_db_table(self, sheet_name='Sheet1'): 29 | '''Load text_db Table object corresponding to specified sheet_name. 30 | ''' 31 | super(GDocsReaderTextDb, self).read(None) 32 | if self.id_is_name: 33 | dbs = self.gd_client.GetDatabases(name=self.source) 34 | else: 35 | dbs = self.gd_client.GetDatabases(spreadsheet_key=self.source) 36 | assert len(dbs) >= 1, 'No spreadsheet of that name/id' 37 | db = dbs[0] 38 | table = db.GetTables(name=sheet_name)[0] 39 | return table 40 | 41 | def read(self, sheet_name='Sheet1'): 42 | '''Load the specified google spreadsheet worksheet as a L{TabularData} 43 | object. 44 | 45 | @return L{TabularData} object. 
46 | ''' 47 | text_db_table = self.load_text_db_table(sheet_name) 48 | tdata = TabularData() 49 | text_db_table.LookupFields() 50 | tdata.header = text_db_table.fields 51 | # finds all records it seems 52 | rows = text_db_table.FindRecords('') 53 | for row in rows: 54 | rowdata = [] 55 | for colname in tdata.header: 56 | rowdata.append(row.content[colname]) 57 | tdata.data.append(rowdata) 58 | return tdata 59 | 60 | 61 | # not yet working properly (cannot work out ListFeed yet ...) 62 | # textdb is nicer but Spreadsheet allows one to get all cells using CellsFeed 63 | # (even when blank lines) (this is not true when using ListFeed though ...) 64 | # class GDocsReaderSpreadsheet(ReaderBase): 65 | # ''' 66 | # 67 | # From Docs for the API: 68 | # 69 | # 70 | # > The list feed contains all rows after the first row up to the first blank 71 | # row. The first blank row terminates the data set. If expected data isn't 72 | # appearing in a feed, check the worksheet manually to see whether there's an 73 | # unexpected blank row in the middle of the data. In particular, if the 74 | # second row of the spreadsheet is blank, then the list feed will contain no 75 | # data. 76 | # ''' 77 | # def __init__(self, spreadsheet_id, username=None, password=None, 78 | # id_is_name=False): 79 | # ''' 80 | # @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you 81 | # must set id_is_name to True. 82 | # ''' 83 | # # do not pass spreadsheet_id down as it will be url or sheet name 84 | # super(GDocsReaderSpreadsheet, self).__init__() 85 | # self.source = spreadsheet_id 86 | # self.id_is_name = id_is_name 87 | # self.gd_client = gdata.spreadsheet.service.SpreadsheetsService() 88 | # self.gd_client.email = username 89 | # self.gd_client.password = password 90 | # 91 | # def read(self, sheet_index=0): 92 | # '''Load the specified google spreadsheet worksheet as a L{TabularData} 93 | # object. 94 | # 95 | # @return L{TabularData} object. 
96 | # ''' 97 | # super(GDocsReaderSpreadsheet, self).read(None) 98 | # self.gd_client.source = self.source 99 | # self.gd_client.ProgrammaticLogin() 100 | # if self.id_is_name: 101 | # feed = self.gd_client.GetSpreadsheetsFeed() 102 | # # no len on feed ... 103 | # # assert len(feed) > 0, 'No spreadsheets found for: %s' % self.source 104 | # spreadsheet_id = feed.entry[0].id.text.split('/')[-1] 105 | # else: 106 | # spreadsheet_id = self.source 107 | # sheetfeed = self.gd_client.GetWorksheetsFeed(spreadsheet_id) 108 | # wrksht_id = sheetfeed.entry[sheet_index].id.text.split('/')[-1] 109 | # row_feed = self.gd_client.GetListFeed(spreadsheet_id, wrksht_id) 110 | # 111 | # tdata = TabularData() 112 | # # tdata.header 113 | # # how do we get rows rather than just all the cells? 114 | # for i, entry in enumerate(row_feed.entry): 115 | # print entry.content['col1'] 116 | # print entry.content 117 | # tdata.data.append([entry.content.text]) 118 | # return tdata 119 | 120 | -------------------------------------------------------------------------------- /datautil/tabular/html.py: -------------------------------------------------------------------------------- 1 | import re 2 | from HTMLParser import HTMLParser 3 | 4 | from base import TabularData, ReaderBase, WriterBase 5 | 6 | 7 | class HtmlReader(ReaderBase): 8 | '''Read data from HTML table into L{TabularData}. 9 | 10 | ''' 11 | def read(self, filepath_or_fileobj=None, table_index=0): 12 | '''Read data from fileobj. 13 | 14 | NB: post read all tables extracted are in attribute named 'tables'. 15 | 16 | @arg table_index: if multiple tables in the html return table at this 17 | index. 18 | @return: L{TabularData} object (all content in the data part, i.e. no 19 | header). 
20 | ''' 21 | super(HtmlReader, self).read(filepath_or_fileobj) 22 | parser = _OurTableExtractor() 23 | parser.reset() 24 | parser.feed(self.fileobj.read()) 25 | self.tables = parser.tables 26 | return self.tables[table_index] 27 | 28 | 29 | class _OurTableExtractor(HTMLParser): 30 | ''' 31 | # TODO: tbody, thead etc 32 | # TODO: nested tables 33 | 34 | # TODO: will barf on bad html so may need to run tidy first ... 35 | # tidy -w 0 -b -omit -asxml -ascii 36 | ''' 37 | def reset(self): 38 | HTMLParser.reset(self) 39 | self.tables = [] 40 | self._rows = [] 41 | self._row = [] 42 | self._text = '' 43 | 44 | def handle_starttag(self, tag, attrs): 45 | if tag == 'tr': 46 | self._row = [] 47 | elif tag == 'td' or tag == 'th': 48 | self._text = '' 49 | elif tag == 'br': 50 | self._text += '\n' 51 | 52 | def handle_endtag(self, tag): 53 | if tag == 'tr': 54 | self._rows.append(self._row) 55 | if tag == 'td' or tag == 'th': 56 | self._row.append(self._text) 57 | if tag == 'table': 58 | self.tables.append(TabularData(data=self._rows)) 59 | self._rows = [] 60 | 61 | def handle_data(self, data): 62 | self._text += data.strip() 63 | 64 | 65 | import re 66 | class HtmlWriter(WriterBase): 67 | """ 68 | Write tabular data to xhtml 69 | """ 70 | 71 | def __init__(self, round_ndigits=2, pretty_print=False, table_attributes = {'class': 'data'}): 72 | """ 73 | @pretty_print: whether to pretty print (indent) output 74 | @table_attributes: dictionary of html attribute name/value pairs to be 75 | added to the table element 76 | """ 77 | super(HtmlWriter, self).__init__(round_ndigits) 78 | self.pretty_print = pretty_print 79 | self.table_attributes = table_attributes 80 | 81 | def write(self, tabulardata, fileobj, caption = '', rowHeadings = []): 82 | """ 83 | Write matrix of data to xhtml table. 
84 | Allow for addition of row and column headings 85 | 86 | @return xhtml table containing data 87 | 88 | @param data: table of data that makes up table 89 | @param caption: the caption for the table (if empty no caption created) 90 | @param rowHeadings: additional headings for rows (separate from 91 | tabulardata) 92 | """ 93 | columnHeadings = tabulardata.header 94 | data = tabulardata.data 95 | haveRowHeadings = (len(rowHeadings) > 0) 96 | 97 | htmlTable = ' 0: 110 | if haveRowHeadings and numColHeads == len(data[0]): 111 | # [[TODO: is this dangerous? should i make a copy ...]] 112 | columnHeadings.insert(0, '') 113 | htmlTable += self.writeHeading(columnHeadings) 114 | 115 | htmlTable += '' 116 | if self.pretty_print: 117 | htmlTable += '\n' 118 | 119 | for ii in range(0, len(data)): 120 | # have to add 1 as first row is headings 121 | if haveRowHeadings: 122 | htmlTable += self.writeRow(data[ii], rowHeadings[ii]) 123 | else: 124 | htmlTable += self.writeRow(data[ii]) 125 | 126 | htmlTable += '' 127 | 128 | if self.pretty_print: 129 | fileobj.write(self.prettyPrint(htmlTable)) 130 | else: 131 | fileobj.write(htmlTable) 132 | 133 | def value_to_str(self, value): 134 | import cgi 135 | out = super(HtmlWriter, self).value_to_str(value) 136 | out = cgi.escape(out) 137 | return out 138 | 139 | def writeHeading(self, row): 140 | """ 141 | Write heading for html table () 142 | """ 143 | result = '' 144 | result += self.writeGeneralRow(row, 'th') 145 | result += '' 146 | if self.pretty_print: 147 | result += '\n' 148 | return result 149 | 150 | def writeRow(self, row, rowHeading = ''): 151 | result = '' 152 | if rowHeading != '': 153 | result = '%s' % self.value_to_str(rowHeading) 154 | result += self.writeGeneralRow(row, 'td') 155 | result = '%s' % result 156 | if self.pretty_print: 157 | result += '\n' 158 | return result 159 | 160 | def writeGeneralRow(self, row, tagName): 161 | result = '' 162 | for ii in range(len(row)): 163 | result += '<%s>%s' % (tagName, 
self.value_to_str(row[ii]), tagName) 164 | return result 165 | 166 | def prettyPrint(self, html): 167 | """pretty print html using HTMLTidy""" 168 | # [[TODO: strip out html wrapper stuff that is added (head, body etc) 169 | try: 170 | import mx.Tidy 171 | out = mx.Tidy.tidy(html, None, None, wrap = 0, indent = 'yes')[2] 172 | except: 173 | out = html 174 | return self.tabify(out) 175 | 176 | def tabify(self, instr, tabsize = 2): 177 | """ 178 | tabify text by replacing spaces of size tabSize by tabs 179 | """ 180 | whitespace = tabsize * ' ' 181 | return re.sub(whitespace, '\t', instr) 182 | 183 | 184 | # for backwards compatibility 185 | # 2008-05-30 186 | WriterHtml = HtmlWriter 187 | 188 | 189 | -------------------------------------------------------------------------------- /datautil/tabular/misc.py: -------------------------------------------------------------------------------- 1 | '''General Helper methods for tabular data. 2 | ''' 3 | from base import TabularData 4 | 5 | def transpose(data): 6 | '''Transpose a list of lists. 7 | 8 | Or do it directy: data = zip(*data) 9 | ''' 10 | return zip(*data) 11 | 12 | def select_columns(matrix, cols): 13 | '''Return a matrix with only those column indexes in cols.''' 14 | tsp = transpose(matrix) 15 | out = [] 16 | cols.sort() 17 | for c in cols: 18 | out.append(tsp[c]) 19 | return transpose(out) 20 | 21 | 22 | def pivot(table, left, top, value): 23 | """Unnormalize (pivot) a normalised input set of tabular data. 24 | 25 | @param table: simple list of lists or a L{TabularData} object. 26 | 27 | Eg. 
To transform the tabular data like 28 | 29 | Name, Year, Value 30 | ----------------------- 31 | 'x', 2004, 1 32 | 'y', 2004, 2 33 | 'x', 2005, 3 34 | 'y', 2005, 4 35 | 36 | into the new list: 37 | 38 | Year, 'x', 'y' 39 | ------------------------ 40 | 2004, 1, 2 41 | 2005, 3, 4 42 | 43 | you would do: 44 | 45 | pivot(tabulardata, 1, 0, 2) 46 | 47 | OR (requires header to exist): 48 | 49 | pivot(tabulardata, 'Year', 'Name', 'Value') 50 | """ 51 | if not isinstance(left, int): 52 | left = table.header.index(left) 53 | if not isinstance(top, int): 54 | top = table.header.index(top) 55 | if not isinstance(value, int): 56 | value = table.header.index(value) 57 | 58 | rs = TabularData() 59 | # construct double dict keyed by left values 60 | tdict = {} 61 | xvals = set() 62 | yvals = set() 63 | for row in table: 64 | xval = row[left] 65 | if not xval in tdict: 66 | tdict[xval] = {} 67 | tdict[xval][row[top]] = row[value] 68 | xvals.add(xval) 69 | yvals.add(row[top]) 70 | xvals = sorted(list(xvals)) 71 | yvals = sorted(list(yvals)) 72 | xhead = 'X' 73 | if hasattr(table, 'header') and table.header: 74 | xhead = table.header[left] 75 | rs.header = [ xhead ] + yvals 76 | rs.data = [ [x] + [ tdict[x].get(y, '') for y in yvals ] for x in xvals ] 77 | return rs 78 | 79 | -------------------------------------------------------------------------------- /datautil/tabular/tabular_json.py: -------------------------------------------------------------------------------- 1 | '''JSON Reader and Writer''' 2 | try: 3 | import json 4 | except ImportError: 5 | try: 6 | import simplejson as json 7 | except ImportError: # simplejson not installed 8 | pass 9 | from base import TabularData, ReaderBase, WriterBase 10 | 11 | 12 | class JsonReader(ReaderBase): 13 | def read(self, filepath_or_fileobj=None): 14 | '''Read JSON encoded data from source into a L{TabularData} object. 
15 | 16 | JSON encoded data should either be: 17 | * dict (with header and data attributes) 18 | * list (first row assumed to be the header) 19 | 20 | @return L{TabularData} 21 | ''' 22 | super(JsonReader, self).read(filepath_or_fileobj) 23 | jsondata = json.load(self.fileobj) 24 | if isinstance(jsondata, dict): 25 | return TabularData(header=jsondata.get('header', None), 26 | data=jsondata.get('data', None) 27 | ) 28 | elif isinstance(jsondata, list): 29 | return TabularData(header=jsondata[0], data=jsondata[1:]) 30 | else: 31 | raise Exception('Cannot load TabularData from %s' % jsondata) 32 | 33 | class JsonWriter(WriterBase): 34 | 35 | def write(self, tabular_data, fileobj, indent=2): 36 | super(JsonWriter, self).write(tabular_data, fileobj) 37 | jsondata = { u'header': tabular_data.header, 38 | u'data': tabular_data.data 39 | } 40 | json.dump(jsondata, fileobj, indent=indent) 41 | 42 | -------------------------------------------------------------------------------- /datautil/tabular/txt.py: -------------------------------------------------------------------------------- 1 | from base import WriterBase 2 | 3 | class TxtWriter(WriterBase): 4 | '''Write tabular data to plain text in nicely formatted way 5 | 6 | TODO 7 | ==== 8 | 9 | 1. allow output_width of 0 meaning use width necessary to fit all rows on one 10 | line 11 | 12 | 2. rather than truncate cell contents wrap it onto two lines (and/or allow 13 | spillover if adjacent cell is empty) 14 | 15 | * wontfix: can let terminal do this: just set width very large ... 16 | 17 | 3. (?) stream output back rather than returning all at once 18 | 19 | 4. Add support for limiting number of columns displayed. DONE 2007-08-02 20 | * TODO: add unittest 21 | ''' 22 | 23 | def __init__(self, output_width=0, number_of_columns=0, **kwargs): 24 | ''' 25 | @param output_width: display width (0 means unlimited). 
26 | @param number_of_columns: number of columns to try to display (not 27 | guaranteed to be this number if this would cause problems). (0 28 | means all columns) 29 | ''' 30 | super(TxtWriter, self).__init__(**kwargs) 31 | self.output_width = output_width 32 | self.number_of_columns = number_of_columns 33 | 34 | def write(self, tabular_data, fileobj): 35 | result = '' 36 | formatter = None 37 | row_cache = [] 38 | sample_length = 4 39 | rows = tabular_data.data 40 | if tabular_data.header: 41 | rows = [ tabular_data.header ] + rows 42 | # include header in sample rows (do we always want to?) 43 | sample_rows = rows[:sample_length] 44 | self._compute_parameters(sample_rows) 45 | result += self._write_separator() 46 | for row in rows: 47 | result += self._write_row(row) 48 | result += self._write_separator() 49 | fileobj.write(result) 50 | 51 | def _compute_parameters(self, sample_rows): 52 | maxcols = self._get_maxcols(sample_rows) 53 | if not self.number_of_columns: 54 | self.numcols = maxcols 55 | else: 56 | self.numcols = min(self.number_of_columns, maxcols) 57 | self.colwidths = [] 58 | self._set_colwidths(sample_rows) 59 | if self.colwidths[0] < 2: 60 | msg =\ 61 | u'''It is not possible to effectively format this many columns of material with 62 | this narrow an output window. Column width is: %s''' % self.colwidths[0] 63 | # TODO: log it? 64 | print msg 65 | 66 | def _write_row(self, row): 67 | '''Return the input 'python' row as an appropriately formatted string. 
68 | ''' 69 | result = '|' 70 | count = 0 71 | for cell in row[:self.numcols]: 72 | width = self.colwidths[count] 73 | result += self._format_cell(width, cell) 74 | count += 1 75 | # now pad out with extra cols as necessary 76 | while count < self.numcols: 77 | width = self.colwidths[count] 78 | result += self._format_cell(width, ' ') 79 | count += 1 80 | return result + '\n' 81 | 82 | def _write_separator(self): 83 | result = '+' 84 | for width in self.colwidths: 85 | result += '-' * (width-1) + '+' 86 | return result + '\n' 87 | 88 | def _get_maxcols(self, sample_rows): 89 | maxcols = 0 90 | for row in sample_rows: 91 | maxcols = max(maxcols, len(row)) 92 | return maxcols 93 | 94 | def _set_colwidths(self, sample_rows): 95 | # subtract -1 so that we have (at least) one spare screen column 96 | if self.output_width != 0: 97 | colwidth = int( (self.output_width - 1) / self.numcols) 98 | for ii in range(self.numcols): 99 | self.colwidths.append(colwidth) 100 | else: # make every col as wide as it needs to be 101 | self.colwidths = [0] * self.numcols 102 | for row in sample_rows: 103 | for ii in range(self.numcols): 104 | cellwidth = len(self.value_to_str(row[ii])) 105 | self.colwidths[ii] = max(self.colwidths[ii], 106 | cellwidth 107 | ) 108 | self.colwidths = [ x + 1 for x in self.colwidths ] 109 | 110 | def _format_cell(self, width, content): 111 | content = self.value_to_str(content) 112 | content = content.strip() 113 | if len(content) > width - 1: 114 | # TODO: be brutal (this *has* to be fixed) 115 | content = content[:width-1] 116 | return content.center(width-1) + '|' 117 | 118 | -------------------------------------------------------------------------------- /datautil/tabular/xls.py: -------------------------------------------------------------------------------- 1 | '''Work with Excel (xls) files. 
2 | 3 | Requires xlrd 4 | ''' 5 | try: 6 | import xlrd 7 | except ImportError: # xlrd not installed 8 | pass 9 | 10 | from base import ReaderBase, TabularData 11 | 12 | class XlsReader(ReaderBase): 13 | '''Read Excel (xls) files. 14 | 15 | Requires the xlrd package (see pypi). 16 | ''' 17 | def __init__(self, filepath_or_fileobj=None): 18 | super(XlsReader, self).__init__(filepath_or_fileobj) 19 | if self.fileobj: 20 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 21 | ## TODO: fix the rest of this 22 | 23 | def read(self, fileobj=None, sheet_index=0): 24 | '''Read an excel file (provide as fileobj) and return the specified 25 | sheet as a L{TabularData} object. 26 | 27 | For convenience also store: 28 | 29 | self.book: xlrd WorkBook object 30 | 31 | @return L{TabularData} object. 32 | ''' 33 | super(XlsReader, self).read(fileobj) 34 | if fileobj: 35 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 36 | tab = TabularData() 37 | booksheet = self.book.sheet_by_index(sheet_index) 38 | data = self.extract_sheet(booksheet, self.book) 39 | tab.data = data 40 | return tab 41 | 42 | def info(self): 43 | '''Return summary info about this Excel Workbook.''' 44 | info = '' 45 | info += 'The number of worksheets is: %s\n' % self.book.nsheets 46 | info += 'Worksheet name(s):\n' % self.book.sheet_names() 47 | count = -1 48 | for sn in self.book.sheet_names(): 49 | count += 1 50 | info += '%s %s\n' % (count, sn) 51 | return info 52 | 53 | def sheet_info(self, sheet_index): 54 | '''Summary info about an xls sheet. 55 | 56 | @return: printable string giving info. 
57 | ''' 58 | import pprint 59 | sh = self.book.sheet_by_index(sheet_index) 60 | info = sh.name + '\n' 61 | info += 'Rows: %s Cols: %s\n\n' % (sh.nrows, sh.ncols) 62 | MAX_ROWS = 30 63 | for rx in range(min(sh.nrows, MAX_ROWS)): 64 | info += str(sh.row(rx)) + '\n' 65 | return info 66 | 67 | def extract_sheet(self, sheet, book): 68 | matrix = [] 69 | nrows = sheet.nrows 70 | ncols = sheet.ncols 71 | for rx in range(nrows): 72 | outrow = [] 73 | for cx in range(ncols): 74 | cell = sheet.cell(rowx=rx, colx=cx) 75 | val = self.cell_to_python(cell, book) 76 | outrow.append(val) 77 | matrix.append(outrow) 78 | return matrix 79 | 80 | def cell_to_python(self, cell, book): 81 | # annoying need book argument for datemode 82 | # info on types: http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Cell-class 83 | if cell.ctype == xlrd.XL_CELL_NUMBER: 84 | return float(cell.value) 85 | elif cell.ctype == xlrd.XL_CELL_DATE: 86 | from datetime import date 87 | # TODO: distinguish date and datetime 88 | args = xlrd.xldate_as_tuple(cell.value, book.datemode) 89 | try: 90 | return date(args[0], args[1], args[2]) 91 | except Exception, inst: 92 | print 'Error parsing excel date (%s): %s' % (args, inst) 93 | return None 94 | elif cell.ctype == xlrd.XL_CELL_BOOLEAN: 95 | return bool(cell.value) 96 | else: 97 | return cell.value 98 | 99 | 100 | -------------------------------------------------------------------------------- /datautil/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # placeholder 2 | -------------------------------------------------------------------------------- /datautil/tests/data/xls_reader_test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/tests/data/xls_reader_test.xls -------------------------------------------------------------------------------- 
/datautil/tests/parse/test_name.py: -------------------------------------------------------------------------------- 1 | import datautil.parse.name 2 | 3 | 4 | class TestName: 5 | def test_parse_name_FL(self): 6 | name = u'Ludwig Van Beethoven' 7 | out = datautil.parse.name.parse_name(name) 8 | assert out.ln == u'Beethoven' 9 | assert out.fns == ['Ludwig', 'Van'] 10 | 11 | def test_parse_name_LF_with_extra_comma(self): 12 | out = datautil.parse.name.parse_name('More, Sir Thomas,Saint') 13 | assert out.ln == 'More', out 14 | assert out.fns == ['Sir', 'Thomas'] 15 | 16 | def test_parse_name_FL_normcase(self): 17 | name = u'Ludwig van BEETHOVEN' 18 | out = datautil.parse.name.parse_name(name) 19 | assert out.ln == 'Beethoven', out 20 | 21 | def test_parse_name_LF_with_title(self): 22 | name = u'Chandos, John [Sir]' 23 | out = datautil.parse.name.parse_name(name) 24 | assert out.ln == 'Chandos', out 25 | assert out.title == 'Sir', out 26 | 27 | def test_parse_name_FL_with_title(self): 28 | name = u'Sir John CHANDOS' 29 | out = datautil.parse.name.parse_name(name) 30 | assert out.ln == 'Chandos', out 31 | assert out.title == 'Sir', out 32 | 33 | def test_parse_name_FL_with_title_2(self): 34 | name = u'Prof Benjamin AARON' 35 | out = datautil.parse.name.parse_name(name) 36 | assert out.ln == 'Aaron', out 37 | assert out.title == 'Prof', out 38 | assert out.fns == ['Benjamin'], out 39 | assert str(out) == 'Aaron, Benjamin [Prof]' 40 | 41 | def test_parse_title_with_fullstop(self): 42 | name = 'Major. 
abc xyz' 43 | out = datautil.parse.name.parse_name(name) 44 | assert out.title == 'Major', out.title 45 | 46 | def test_parse_title_with_fullstop_2(self): 47 | name = 'Xyz, Abc [Major.]' 48 | out = datautil.parse.name.parse_name(name) 49 | print out 50 | assert out.title == 'Major', out.title 51 | 52 | def test_parse_title_with_brackets(self): 53 | name = 'Dickens, Gerald (Sir)' 54 | out = datautil.parse.name.parse_name(name) 55 | assert out.title == 'Sir', out.title 56 | 57 | name = '(Sir) Gerald Dickens' 58 | out = datautil.parse.name.parse_name(name) 59 | assert out.title == 'Sir', out.title 60 | 61 | def test_parse_name_FL_initials(self): 62 | name = 'Chekhov, A.P.' 63 | out = datautil.parse.name.parse_name(name) 64 | assert out.ln == 'Chekhov' 65 | assert out.fns == ['A.', 'P.'], out 66 | 67 | def test_strip_fullstops(self): 68 | name = 'George. Bosch' 69 | out = datautil.parse.name.normalize(name) 70 | assert out == 'Bosch, George' 71 | 72 | name = 'George. a.p. Bosch.' 73 | out = datautil.parse.name.normalize(name) 74 | assert out == 'Bosch, George A. P.', out 75 | 76 | name = 'Geo.rge. Bosch' 77 | out = datautil.parse.name.normalize(name) 78 | assert out == 'Bosch, Geo. Rge', out 79 | 80 | name = 'Geo.Smith. Bosch' 81 | out = datautil.parse.name.normalize(name) 82 | assert out == 'Bosch, Geo. 
Smith', out 83 | 84 | def test_tostr(self): 85 | name = datautil.parse.name.Name(ln='Beethoven', fns=['Ludwig', 'van']) 86 | exp = u'Beethoven, Ludwig van' 87 | out = datautil.parse.name.name_tostr(name) 88 | assert out == exp, out 89 | 90 | def test_with_no_name(self): 91 | name = datautil.parse.name.parse_name(' ') 92 | assert name.ln is '', name 93 | out = datautil.parse.name.normalize(' ') 94 | assert out == '', out 95 | 96 | def test_surname(self): 97 | name = u'SCHUBERT' 98 | out = str(datautil.parse.name.parse_name(name)) 99 | assert out == 'Schubert' 100 | 101 | -------------------------------------------------------------------------------- /datautil/tests/tabular/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/tests/tabular/__init__.py -------------------------------------------------------------------------------- /datautil/tests/tabular/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from StringIO import StringIO 3 | 4 | import datautil.tabular 5 | 6 | class TestTabularData: 7 | testlist = [ ['X', 'Y'], [1,2], [3,4] ] 8 | 9 | def test_1(self): 10 | tabular = datautil.tabular.TabularData() 11 | assert tabular.header == [] 12 | 13 | def test_from_list(self): 14 | out = datautil.tabular.TabularData.from_list(self.testlist) 15 | assert out.header == [ 'X', 'Y' ] 16 | assert out.data == [ [1,2], [3,4] ] 17 | 18 | def test_to_list(self): 19 | td = datautil.tabular.TabularData( 20 | header=['X', 'Y'], 21 | data=[ [1,2], [3,4] ] 22 | ) 23 | out = td.to_list() 24 | assert out == self.testlist 25 | 26 | 27 | class TestWriterBase: 28 | def test_value_to_str(self): 29 | w = datautil.tabular.WriterBase() # round_ndigits=None 30 | out = w.value_to_str('x') 31 | assert out == u'x', out 32 | out = w.value_to_str(1) 33 | assert out == u'1', out 34 | 
out = w.value_to_str(1.3555) 35 | assert out == u'1.3555', out 36 | 37 | w = datautil.tabular.WriterBase(round_ndigits=2) 38 | out = w.value_to_str('x') 39 | assert out == u'x', out 40 | out = w.value_to_str(1) 41 | assert out == u'1', out 42 | out = w.value_to_str(1.3555) 43 | assert out == u'1.36', out 44 | 45 | w.round_ndigits = -1 46 | out = w.value_to_str(102.34) 47 | assert out == u'100', out 48 | 49 | 50 | class TestReaderCsv(object): 51 | 52 | csvdata = \ 53 | '''"header1", "header 2" 54 | 1, 2''' 55 | header = [ 'header1', 'header 2' ] 56 | data = [ ['1', '2'] ] 57 | 58 | def setUp(self): 59 | reader = datautil.tabular.ReaderCsv() 60 | fileobj = StringIO(self.csvdata) 61 | self.tab = reader.read(fileobj) 62 | 63 | def test_header(self): 64 | assert self.header == self.tab.header 65 | 66 | def test_data(self): 67 | assert self.data == self.tab.data 68 | 69 | 70 | class TestReaderCsvUnicode(TestReaderCsv): 71 | csvdata = \ 72 | u'''"headi\xf1g", "header 2" 73 | 1, 2''' 74 | header = [ u'headi\xf1g'.encode('utf-8'), 'header 2' ] 75 | data = [ ['1', '2'] ] 76 | 77 | 78 | class TestReaderCsvEncoded(TestReaderCsvUnicode): 79 | encoding = 'utf-16' 80 | csvdata = \ 81 | u'''"headi\xf1g", "header 2" 82 | 1, 2'''.encode(encoding) 83 | 84 | def setUp(self): 85 | reader = datautil.tabular.ReaderCsv() 86 | fileobj = StringIO(self.csvdata) 87 | self.tab = reader.read(fileobj, encoding=self.encoding) 88 | 89 | 90 | class TestCsvWriter: 91 | def test_writer(self): 92 | writer = datautil.tabular.CsvWriter() 93 | fo = StringIO() 94 | td = datautil.tabular.TabularData([[1,2],[3,4]], header=['one', 95 | 'two']) 96 | writer.write(td, fo) 97 | fo.seek(0) 98 | out = fo.read() 99 | exp = \ 100 | '''one,two\r 101 | 1,2\r 102 | 3,4\r\n''' 103 | assert out == exp 104 | 105 | 106 | class TestHtmlReader: 107 | 108 | inraw1 = ''' 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
12
1983
34
120 | ''' 121 | in1 = StringIO(inraw1) 122 | 123 | exp1 = [ ['1', '2'], 124 | ['1983'], 125 | ['3', '4'], 126 | ] 127 | 128 | def test_1(self): 129 | reader = datautil.tabular.HtmlReader() 130 | tab = reader.read(self.in1) 131 | assert tab.data == self.exp1 132 | 133 | 134 | class TestHtmlWriter: 135 | 136 | def setUp(self): 137 | rawData = [[1,1], [0,1]] 138 | self.indata1 = datautil.tabular.TabularData(data=rawData) 139 | self.writer1 = datautil.tabular.HtmlWriter(table_attributes={'id':1, 'class': 'data'}) 140 | 141 | def test_0_simple(self): 142 | indata1 = [[1,1], [0,1]] 143 | expected = ''+\ 144 | '
11
01
' 145 | out1 = self.writer1.write_str(self.indata1) 146 | assert expected == out1 147 | 148 | def test_col_headings(self): 149 | self.indata1.header = [u'x','y'] 150 | caption = '' 151 | expected = ''+\ 152 | '' + \ 153 | '
xy
11
01
' 154 | # no caption but headings 155 | out1 = self.writer1.write_str(self.indata1, caption) 156 | assert expected == out1 157 | 158 | def test_row_headings(self): 159 | self.indata1.header = ['x','y'] 160 | rowHeadings = ['Date 1', 'Date 2'] 161 | caption = '' 162 | expected = '' + \ 163 | '' + \ 164 | '' + \ 165 | '
xy
Date 111
Date 201
' 166 | # no caption but headings 167 | out1 = self.writer1.write_str(self.indata1, caption, rowHeadings) 168 | assert expected == out1 169 | 170 | def test_escaping(self): 171 | tdata = datautil.tabular.TabularData(header=['s&p', 'y01' 180 | # print self.writer1.prettyPrint(in1) 181 | 182 | 183 | class TestLatexWriter: 184 | 185 | matrix = [[ 'H1', 'H2'], 186 | [1,'2%'], 187 | [3,4], 188 | ] 189 | 190 | exp = \ 191 | r'''\textbf{H1} & \textbf{H2} \\ 192 | \hline 193 | 1 & 2\% \\ 194 | \hline 195 | 3 & 4 \\ 196 | \hline 197 | ''' 198 | m2l = datautil.tabular.LatexWriter() 199 | 200 | def test_escape(self): 201 | in1 = '& % $ something' 202 | exp1 = r'\& \% $ something' 203 | assert self.m2l.escape(in1) == exp1 204 | 205 | def test_table2latex(self): 206 | out = datautil.tabular.table2latex(self.matrix) 207 | self.diff(self.exp, out) 208 | assert out == self.exp 209 | 210 | def test_write(self): 211 | td = datautil.tabular.TabularData(data=self.matrix[1:], header=self.matrix[0]) 212 | out = self.m2l.write_str(td) 213 | self.diff(self.exp, out) 214 | assert out == self.exp 215 | 216 | def diff(self, str1, str2): 217 | import difflib 218 | differ = difflib.Differ() 219 | text1 = str1.splitlines(1) 220 | text2 = str2.splitlines(1) 221 | result = list(differ.compare(text1, text2)) 222 | from pprint import pprint 223 | pprint(result) 224 | 225 | 226 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_gdocs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ConfigParser import SafeConfigParser 3 | 4 | import datautil.tabular.gdocs as gdocs 5 | from nose.plugins.skip import SkipTest 6 | 7 | 8 | cfg = SafeConfigParser() 9 | if not os.path.exists('test.ini'): 10 | msg = 'To run GDocs tests you need a config file. 
See %s for details' % __file__ 11 | raise SkipTest(msg) 12 | cfg.readfp(open('test.ini')) 13 | username = cfg.get('gdocs', 'username') 14 | password = cfg.get('gdocs', 'password') 15 | 16 | 17 | class TestGDocsTextDb: 18 | def test_01(self): 19 | source = 'okfn-datautil-gdocs-testing' 20 | reader = gdocs.GDocsReaderTextDb(source, username, password, id_is_name=True) 21 | tdata = reader.read() 22 | assert tdata.header == ['col1', 'col2'] 23 | assert len(tdata.data) == 5, tdata 24 | 25 | 26 | # not working properly yet 27 | class _TestGDocs: 28 | def test_01(self): 29 | source = 't8GZy4Lb6jhVjCL5nrqZ5TQ' 30 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password) 31 | tdata = reader.read() 32 | assert len(tdata.data) == 6, tdata 33 | 34 | def test_02_id_is_name(self): 35 | source = 'okfn-datautil-gdocs-testing' 36 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password, id_is_name=True) 37 | tdata = reader.read() 38 | assert len(tdata.data) == 6, tdata 39 | 40 | 41 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_json.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | import datautil.tabular.tabular_json as js 3 | 4 | class TestJson: 5 | in1 = { 'header': [u'a', u'b'], 6 | 'data': [[1,2], [3,4]] 7 | } 8 | in2 = [ in1['header'] ] + in1['data'] 9 | in1sio = StringIO(js.json.dumps(in1)) 10 | in1sio.seek(0) 11 | in2sio = StringIO(js.json.dumps(in2)) 12 | in2sio.seek(0) 13 | 14 | def test_JsonReader(self): 15 | reader = js.JsonReader() 16 | out = reader.read(self.in1sio) 17 | assert out.header == self.in1['header'] 18 | assert out.data == self.in1['data'] 19 | 20 | out = reader.read(self.in2sio) 21 | assert out.header == self.in1['header'] 22 | assert out.data == self.in1['data'] 23 | 24 | def test_JsonWriter(self): 25 | writer = js.JsonWriter() 26 | td = js.TabularData(header=self.in1['header'], 
data=self.in1['data']) 27 | out = writer.write_str(td) 28 | assert js.json.loads(out) == self.in1 29 | 30 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_misc.py: -------------------------------------------------------------------------------- 1 | import datautil.tabular 2 | 3 | class TestTranspose: 4 | 5 | def test_1(self): 6 | inlist = [ 7 | [ 0, 1 ], 8 | [ 1, 0 ], 9 | ] 10 | exp = [ 11 | ( 0, 1 ), 12 | ( 1, 0 ), 13 | ] 14 | out = datautil.tabular.transpose(inlist) 15 | assert out == exp, out 16 | 17 | class TestPivot: 18 | td = datautil.tabular.TabularData( 19 | header=['Name','Year','Value'], 20 | data=[ 21 | ['x',2004,1], 22 | ['y',2004,2], 23 | ['y',2005,4], 24 | ['x',2005,3], 25 | ], 26 | ) 27 | 28 | def test_pivot_with_tabular(self): 29 | out = datautil.tabular.pivot(self.td, 1, 0, 2) 30 | assert out.data[0] == [2004, 1, 2] 31 | assert out.data[-1] == [2005, 3, 4] 32 | 33 | def test_pivot_with_tabular_2(self): 34 | out = datautil.tabular.pivot(self.td, 'Year', 'Name', 'Value') 35 | assert out.data[0] == [2004, 1, 2] 36 | 37 | def test_pivot_simple_list(self): 38 | out = datautil.tabular.pivot(self.td.data, 1, 0, 2) 39 | assert out.data[0] == [2004, 1, 2] 40 | 41 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_txt.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | 3 | from datautil.tabular.txt import * 4 | from datautil.tabular import TabularData, CsvReader 5 | 6 | class TestFormatting: 7 | 8 | sample_rows = [ 9 | ['1', '2', 'head blah', 'blah blah blah'], 10 | ['a', 'b', 'c', 'd', 'e', 'g' ], 11 | ['1', '2', 'annakarenina annakarenina annakarenina'], 12 | ] 13 | output_width = 60 14 | 15 | writer = TxtWriter(output_width=output_width) 16 | writer._compute_parameters(sample_rows) 17 | 18 | def test_1(self): 19 | assert self.writer.numcols == 6 20 | 21 | def 
test_colwidths(self): 22 | exp = int ((self.output_width -1) / 6) 23 | assert self.writer.colwidths[0] == exp 24 | 25 | def test__write_1(self): 26 | out = self.writer._write_row(self.sample_rows[0]) 27 | assert len(out) <= self.output_width 28 | 29 | def test__write_2(self): 30 | out = self.writer._write_row(self.sample_rows[0]) 31 | exp = '| 1 | 2 |head bla|blah bla| | |\n' 32 | assert out == exp 33 | 34 | def test__write_separator(self): 35 | out = self.writer._write_separator() 36 | exp = '+--------+--------+--------+--------+--------+--------+\n' 37 | 38 | 39 | 40 | class TestTxtWriter: 41 | sample = \ 42 | '''"YEAR","PH","RPH","RPH_1","LN_RPH","LN_RPH_1","HH","LN_HH" 43 | 1971,7.852361625,43.9168370988587,42.9594500501036,3.78229777955476,3.76025664867788,16185,9.69184016636035 44 | ,,abc, 45 | 1972,10.504714885,55.1134791192682,43.9168370988587,4.00939431635556,3.78229777955476,16397,9.70485367024987 46 | , ,, ''' 47 | 48 | expected = \ 49 | '''+------+------+------+------+------+------+------+------+ 50 | | YEAR | PH | RPH |RPH_1 |LN_RPH|LN_RPH| HH |LN_HH | 51 | +------+------+------+------+------+------+------+------+ 52 | | 1971 |7.8523|43.916|42.959|3.7822|3.7602|16185 |9.6918| 53 | +------+------+------+------+------+------+------+------+ 54 | | | | abc | | | | | | 55 | +------+------+------+------+------+------+------+------+ 56 | | 1972 |10.504|55.113|43.916|4.0093|3.7822|16397 |9.7048| 57 | +------+------+------+------+------+------+------+------+ 58 | | | | | | | | | | 59 | +------+------+------+------+------+------+------+------+ 60 | ''' 61 | 62 | def test_simple(self): 63 | indata = TabularData(data=[range(5),range(5,10)]) 64 | writer = TxtWriter() 65 | out = writer.write_str(indata) 66 | exp = '''+-+-+-+-+-+ 67 | |0|1|2|3|4| 68 | +-+-+-+-+-+ 69 | |5|6|7|8|9| 70 | +-+-+-+-+-+ 71 | ''' 72 | print out 73 | print exp 74 | assert out == exp 75 | 76 | def test_output_width(self): 77 | indata = TabularData(data=[range(5),range(5,10)]) 78 | writer = 
TxtWriter(output_width=16) 79 | out = writer.write_str(indata) 80 | outlen = len(out.splitlines()[0]) 81 | assert outlen == 16, outlen 82 | 83 | def test_using_csv(self): 84 | fileobj = StringIO.StringIO(self.sample) 85 | in_tdata = CsvReader(fileobj).read() 86 | writer = TxtWriter(output_width=60) 87 | out = writer.write_str(in_tdata) 88 | print out 89 | print self.expected 90 | assert self.expected == out, out 91 | 92 | -------------------------------------------------------------------------------- /datautil/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import shutil 3 | import os 4 | 5 | from datautil.cache import Cache 6 | 7 | class TestCache: 8 | @classmethod 9 | def setup_class(self): 10 | self.tmp = tempfile.mkdtemp() 11 | self.path = os.path.join(self.tmp, 'abc.txt') 12 | open(self.path, 'w').write('abc') 13 | self.url = 'file://%s' % self.path 14 | 15 | @classmethod 16 | def teardown_class(self): 17 | shutil.rmtree(self.tmp) 18 | 19 | def test_basename(self): 20 | base = 'http://www.abc.org/' 21 | in1 = base + 'xyz' 22 | out = Cache.basename(in1) 23 | assert out == 'xyz' 24 | 25 | in2 = base + 'xyz/abc.txt' 26 | out = Cache.basename(in2) 27 | assert out == 'abc.txt' 28 | 29 | in3 = base + 'membersDo?body=ABC' 30 | out = Cache.basename(in3) 31 | assert out == 'membersDo?body=ABC', out 32 | 33 | in3 = base + 'membersDo?body=data/ABC' 34 | out = Cache.basename(in3) 35 | assert out == 'membersDo?body=data%47ABC', out 36 | 37 | def test_filepath(self): 38 | r = Cache() 39 | base = 'http://www.abc.org/' 40 | in1 = base + 'xyz' 41 | out = r.filepath(in1) 42 | # ./xyz 43 | assert out.endswith('xyz'), out 44 | 45 | def test_dl(self): 46 | dest = os.path.join(self.tmp, 'out.txt') 47 | Cache.dl(self.url, dest) 48 | assert os.path.exists(dest) 49 | assert open(dest).read() == 'abc' 50 | 51 | def test_cache(self): 52 | cache = os.path.join(self.tmp, 'cache') 53 | r = Cache(cache) 54 | 
r.retrieve(self.url) 55 | assert os.path.exists(os.path.join(cache, 'abc.txt')) 56 | 57 | -------------------------------------------------------------------------------- /datautil/tests/test_date.py: -------------------------------------------------------------------------------- 1 | from datautil.date import * 2 | 3 | import datetime 4 | 5 | class TestPythonStringOrdering(object): 6 | # It is impossible to find a string format such that +ve and -ve numbers 7 | # sort correctly as strings: 8 | # if (in string ordering) X < Y => -X < -Y (False!) 9 | def test_ordering(self): 10 | assert '0' < '1' 11 | assert '-10' < '10' 12 | assert '-' < '@' 13 | assert '-' < '0' 14 | assert '-100' < '-X10' 15 | assert '10' < '1000' 16 | assert '02000' < '10000' 17 | assert ' 2000' < '10000' 18 | 19 | def test_bad_ordering(self): 20 | assert ' ' < '0' 21 | assert ' ' < '-' 22 | assert not '-' < '+' 23 | assert '-100' > '-10' 24 | assert not '-100' < '-010' 25 | assert not '-100' < '- 10' 26 | assert not '-100' < ' -10' 27 | assert '10000' < '2000' 28 | assert not '-10' < ' 1' 29 | 30 | 31 | class TestFlexiDate(object): 32 | def test_init(self): 33 | fd = FlexiDate() 34 | assert fd.year == '', fd 35 | assert fd.month == '', fd 36 | 37 | fd = FlexiDate(2000, 1,1) 38 | assert fd.month == '01', fd 39 | assert fd.day== '01', fd 40 | 41 | def test_str(self): 42 | fd = FlexiDate(2000, 1, 23) 43 | assert str(fd) == '2000-01-23', '"%s"' % fd 44 | fd = FlexiDate(-2000, 1, 23) 45 | assert str(fd) == '-2000-01-23' 46 | fd = FlexiDate(2000) 47 | assert str(fd) == '2000' 48 | fd = FlexiDate(1760, qualifier='fl.') 49 | assert str(fd) == '1760 [fl.]', fd 50 | 51 | fd = FlexiDate(qualifier='anything') 52 | assert str(fd) == ' [anything]' 53 | 54 | 55 | def test_from_str(self): 56 | def dotest(fd): 57 | out = FlexiDate.from_str(str(fd)) 58 | assert str(out) == str(fd) 59 | 60 | fd = FlexiDate(2000, 1, 23) 61 | dotest(fd) 62 | fd = FlexiDate(1760, qualifier='fl.') 63 | dotest(fd) 64 | fd = 
FlexiDate(-1760, 1, 3, qualifier='fl.') 65 | dotest(fd) 66 | 67 | def test_as_float(self): 68 | fd = FlexiDate(2000) 69 | assert fd.as_float() == float(2000), fd.as_float() 70 | fd = FlexiDate(1760, 1, 2) 71 | exp = 1760 + 1/12.0 + 2/365.0 72 | assert fd.as_float() == exp, fd.as_float() 73 | fd = FlexiDate(-1000) 74 | assert fd.as_float() == float(-1000) 75 | 76 | def test_as_datetime(self): 77 | fd = FlexiDate(2000) 78 | out = fd.as_datetime() 79 | assert out == datetime.datetime(2000, 1, 1), out 80 | fd = FlexiDate(1760, 1, 2) 81 | out = fd.as_datetime() 82 | assert out == datetime.datetime(1760,1,2), out 83 | 84 | 85 | class TestDateParsers(object): 86 | def test_using_datetime(self): 87 | parser = PythonDateParser() 88 | 89 | d1 = datetime.date(2000, 1, 23) 90 | fd = parser.parse(d1) 91 | assert fd.year == '2000' 92 | 93 | d1 = datetime.datetime(2000, 1, 23) 94 | fd = parser.parse(d1) 95 | # assert str(fd) == '2000-01-23T00:00:00', fd 96 | assert str(fd) == '2000-01-23', fd 97 | 98 | def test_using_dateutil(self): 99 | parser = DateutilDateParser() 100 | 101 | in1 = '2001-02' 102 | fd = parser.parse(in1) 103 | assert str(fd) == in1, fd 104 | 105 | in1 = 'March 1762' 106 | fd = parser.parse(in1) 107 | assert str(fd) == '1762-03' 108 | 109 | in1 = 'March 1762' 110 | fd = parser.parse(in1) 111 | assert str(fd) == '1762-03' 112 | 113 | in1 = '1768 AD' 114 | fd = parser.parse(in1) 115 | assert str(fd) == '1768', fd 116 | 117 | in1 = '1768 A.D.' 118 | fd = parser.parse(in1) 119 | assert str(fd) == '1768', fd 120 | 121 | in1 = '-1850' 122 | fd = parser.parse(in1) 123 | assert str(fd) == '-1850', fd 124 | 125 | in1 = '1762 BC' 126 | fd = parser.parse(in1) 127 | assert str(fd) == '-1762', fd 128 | 129 | in1 = '4 BC' 130 | fd = parser.parse(in1) 131 | assert str(fd) == '-0004', fd 132 | 133 | in1 = '4 B.C.' 
134 | fd = parser.parse(in1) 135 | assert str(fd) == '-0004', fd 136 | 137 | in1 = 'Wed, 06 Jan 2010 09:30:00 GMT' 138 | fd = parser.parse(in1) 139 | assert str(fd) == '2010-01-06', fd 140 | 141 | in1 = 'Tue, 07 Dec 2010 10:00:00 GMT' 142 | fd = parser.parse(in1) 143 | assert str(fd) == '2010-12-07', fd 144 | 145 | def test_parse(self): 146 | d1 = datetime.datetime(2000, 1, 23) 147 | fd = parse(d1) 148 | assert fd.year == '2000' 149 | 150 | fd = parse('March 1762') 151 | assert str(fd) == '1762-03' 152 | 153 | fd = parse(1966) 154 | assert str(fd) == '1966' 155 | 156 | fd = parse('22/07/2010') 157 | assert fd.month == '07', fd.month 158 | 159 | def test_parse_ambiguous_day_month(self): 160 | fd = parse('05/07/2010') 161 | assert fd.month == '07', fd.month 162 | assert fd.day == '05', fd.month 163 | 164 | def test_parse_with_none(self): 165 | d1 = parse(None) 166 | assert d1 is None 167 | 168 | def test_parse_wildcards(self): 169 | fd = parse('198?') 170 | assert fd.year == '', fd.year # expect this to not parse 171 | # TODO but we should have a float if possible 172 | # assert fd.as_float() == u'1980', fd.as_float() 173 | 174 | def test_parse_with_qualifiers(self): 175 | 176 | fd = parse('1985?') 177 | assert fd.year == u'1985', fd 178 | assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier 179 | 180 | # match '[c|c. |c.] {date}' 181 | fd = parse('c.1780') 182 | assert fd.year == u'1780', fd 183 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 184 | 185 | fd = parse('c. 
1780') 186 | assert fd.year == u'1780', fd 187 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 188 | 189 | fd = parse('c1780') 190 | assert fd.year == '1780', fd 191 | assert fd.qualifier == u"Note 'circa' : c1780", fd 192 | 193 | fd = parse('c 1780') 194 | assert fd.year == u'1780', fd 195 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 196 | 197 | # match 'circa {date}' | circa{date}' 198 | fd = parse('circa1780') 199 | assert fd.year == u'1780', fd 200 | assert fd.qualifier == u"Note 'circa' : circa1780", fd 201 | 202 | fd = parse('circa 1780') 203 | assert fd.year == u'1780', fd 204 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 205 | 206 | # match '[circ|circ. |circ.] {date}' 207 | fd = parse('circ1780') 208 | assert fd.year == u'1780', fd 209 | assert fd.qualifier == u"Note 'circa' : circ1780", fd 210 | 211 | fd = parse('circ 1780') 212 | assert fd.year == u'1780', fd 213 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 214 | 215 | fd = parse('circ.1780') 216 | assert fd.year == u'1780', fd 217 | assert fd.qualifier == u"Note 'circa' : circ.1780", fd 218 | 219 | fd = parse('circ. 1780') 220 | assert fd.year == u'1780', fd 221 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 222 | 223 | # match '[cca|cca. |cca.] {date}' 224 | fd = parse('cca1780') 225 | assert fd.year == u'1780', fd 226 | assert fd.qualifier == u"Note 'circa' : cca1780", fd 227 | 228 | fd = parse('cca 1780') 229 | assert fd.year == u'1780', fd 230 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 231 | 232 | fd = parse('cca.1780') 233 | assert fd.year == u'1780', fd 234 | assert fd.qualifier == u"Note 'circa' : cca.1780", fd 235 | 236 | fd = parse('cca. 1780') 237 | assert fd.year == u'1780', fd 238 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 239 | 240 | # match '[ca|ca. |ca.] {date}' 241 | 242 | fd = parse('ca. 1780') 243 | assert fd.year == u'1780', fd 244 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 245 | 246 | fd = parse('ca. 
1780') 247 | assert fd.year == u'1780', fd 248 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 249 | 250 | fd = parse('ca.1780') 251 | assert fd.year == u'1780', fd 252 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 253 | 254 | fd = parse('ca.1780') 255 | assert fd.year == u'1780', fd 256 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 257 | 258 | fd = parse('ca.1780') 259 | assert fd.year == u'1780', fd 260 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 261 | 262 | 263 | 264 | def test_ambiguous(self): 265 | # TODO: have to be careful here ... 266 | fd = parse('1068/1069') 267 | 268 | def test_small_years(self): 269 | in1 = '23' 270 | fd = parse(in1) 271 | assert str(fd) == '0023', fd 272 | assert fd.as_float() == 23, fd.as_float() 273 | 274 | def test_small_years_with_zeros(self): 275 | in1 = '0023' 276 | fd = parse(in1) 277 | assert str(fd) == '0023', fd 278 | assert fd.as_float() == 23, fd.as_float() 279 | 280 | def test_years_with_alpha_prefix(self): 281 | in1 = "p1980" 282 | fd = parse(in1) 283 | assert str(fd) == "1980", fd 284 | -------------------------------------------------------------------------------- /datautil/tests/test_id.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import datautil.id 4 | 5 | def test_compress_and_uncompress_uuid(): 6 | hexversion = '86c3f19d-8854-4ef5-8d88-f008e0817871' 7 | 8 | out = datautil.id.compress_uuid(hexversion) 9 | assert len(out) == 22 10 | 11 | orig = datautil.id.uncompress_uuid(out) 12 | assert orig == hexversion 13 | 14 | # test unicode 15 | orig = datautil.id.uncompress_uuid(unicode(out)) 16 | assert orig == hexversion 17 | 18 | u1 = uuid.UUID(hexversion) 19 | out = datautil.id.compress_uuid(u1) 20 | assert len(out) == 22 21 | 22 | 23 | def test_int_to_b32(): 24 | def check(int_): 25 | out = datautil.id.int_to_b32(int_) 26 | assert isinstance(out, basestring) 27 | assert len(out) == 7, out 28 | 29 | back = 
datautil.id.b32_to_int(out) 30 | assert back == int_, (int_,back) 31 | 32 | check(1) 33 | check(2**28+1) 34 | check(2**30-1) 35 | 36 | -------------------------------------------------------------------------------- /datautil/tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from datautil.misc import * 2 | 3 | class TestFloatify: 4 | def test_floatify_1(self): 5 | x = '10' 6 | assert floatify(x) == 10.0 7 | 8 | def test_floatify_2(self): 9 | x = '1,030' 10 | assert floatify(x) == 1030.0 11 | 12 | def test_floatify_2(self): 13 | x = '' 14 | out = floatify(x) 15 | assert out == None, out 16 | x = '#' 17 | out = floatify(x) 18 | assert out == None, out 19 | 20 | def test_floatify_matrix(self): 21 | x = [ 22 | ['1', '2'], 23 | ['abc', '3.0'] 24 | ] 25 | exp = [ 26 | [1.0, 2.0], 27 | ['abc', 3.0] 28 | ] 29 | out = floatify_matrix(x) 30 | assert out == exp 31 | 32 | 33 | class TestMakeSeries: 34 | 35 | def test_make_series(self): 36 | indata = [ [ '1980', '100', '50' ], 37 | [ '1981', '101', '51' ], 38 | [ '1982', '102', '' ], 39 | ] 40 | exp = [ 41 | [ (1980.0, 100.0), (1981.0, 101.0), (1982.0, 102.0) ], 42 | [ (1980.0, 50.0), (1981.0, 51.0) ] 43 | ] 44 | out = make_series(indata, xcol=0, ycols=[1,2]) 45 | assert out == exp, out 46 | 47 | -------------------------------------------------------------------------------- /datautil/tests/test_xls.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | import datautil.tabular 4 | 5 | class TestXlsReader: 6 | 7 | def test_stuff(self): 8 | fo = pkg_resources.resource_stream('datautil', 9 | 'tests/data/xls_reader_test.xls') 10 | reader = datautil.tabular.XlsReader(fo) 11 | tab = reader.read() 12 | assert tab.data[0][0] == 1850 13 | assert tab.data[19][1] == 12.3 14 | 15 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | import sys 4 | sys.path.insert(0, '.') 5 | from datautil import __version__, __doc__ as __long_description__ 6 | 7 | setup( 8 | name='datautil', 9 | version=__version__, 10 | license='MIT', 11 | description='Utilities for Data Work', 12 | long_description=__long_description__, 13 | author='Open Knowledge Foundation', 14 | author_email='info@okfn.org', 15 | url='http://okfn.org/projects/datautil/', 16 | download_url='https://github.com/okfn/datautil/', 17 | install_requires=[ 18 | # python-dateutil 2.0 has different _parse method, so stick to 1.4.1 19 | 'python-dateutil>=1.0,<1.99', 20 | # (optional) for excel handling 21 | # xlrd 22 | # (optional) for google docs handling 23 | # gdata 24 | ], 25 | packages=find_packages(), 26 | include_package_data=True, 27 | zip_safe=False, 28 | classifiers = [ 29 | 'Development Status :: 5 - Production/Stable', 30 | 'Environment :: Console', 31 | 'Intended Audience :: Developers', 32 | 'Operating System :: OS Independent', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: 2 :: Only', 35 | 'Topic :: Software Development :: Libraries :: Python Modules' 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /swiss/__init__.py: -------------------------------------------------------------------------------- 1 | '''Swiss Army Knife for Data Work 2 | ============================== 3 | 4 | The swiss package provides various utilities for working with data: 5 | 6 | * cache: Url caching and scraping 7 | * tabular/*: Processing and transforming tabular data to and from various 8 | formats including csv, json, google spreadsheets, xls 9 | * misc, date: Cleaning up and parsing data especially dates. 
10 | * id: ID generation and shortenening 11 | * clitools.py: Command line tools such as creating optparse object and usage 12 | from a module of object. 13 | * deliveranceproxy.py: Deliverance proxy helper 14 | 15 | 16 | CHANGELOG 17 | ========= 18 | 19 | v0.3 2010-08-01 20 | --------------- 21 | 22 | * Support for google docs spreadsheets as sources for TabularData 23 | * Improve documentation of date module and add FlexiDate.as_datetime() 24 | * New clitools module incorporating existing cli tools 25 | * deliveranceproxy.py: Deliverance proxy helper for proxying to remote 26 | websites and retheming with deliverance. 27 | * parse/name.py: new (human) name parsing code 28 | 29 | v0.2 2009-10-23 30 | --------------- 31 | 32 | * Extensive refactoring of tabular module/package 33 | * Standardized interface with BaseReader and BaseWriter 34 | * JsonReader and JsonWriter providing json reading and writing 35 | * TxtWriter to support writing to plain text 36 | * Improvements to date parsing (support for circa, 'c.', etc) 37 | * New id module to do 'compression' of uuids using 32 and 64 bit encoding 38 | 39 | 40 | v0.1 2009-06-03 41 | --------------- 42 | 43 | * Bring together existing code (from last 2+ years) into new 'swiss' package 44 | * Url caching and scraping 45 | * Tabular data handling including csv reader/writer, xls reader, latex writer 46 | and associated utilities (such as pivot_table) 47 | * Cleaning and parsing data especially dates (misc and date modules) 48 | ''' 49 | __version__ = '0.3' 50 | 51 | import tabular 52 | from cache import * 53 | from misc import * 54 | from id import * 55 | -------------------------------------------------------------------------------- /swiss/cache.py: -------------------------------------------------------------------------------- 1 | '''A local file cache with url retrieving builtin. 
2 | 3 | NB: this module has zero dependencies on modules outside of the 4 | standard lib so that it is easily reusable in other libraries and applications 5 | that do not require any other parts of the swiss package. 6 | ''' 7 | import urlparse 8 | import urllib 9 | import os 10 | import sys 11 | 12 | 13 | # have to define before Cache as used in classmethod 14 | class _Progress(object): 15 | def __init__(self): 16 | self.count = -1 17 | 18 | def dl_progress(self, count, block_size, total_size): 19 | if total_size == 0: # total_size is weird so return to avoid errors 20 | return 21 | if self.count == -1: 22 | print 'Total size: %s' % self.format_size(total_size) 23 | last_percent = int(self.count*block_size*100/total_size) 24 | percent = int(count*block_size*100/total_size) 25 | if percent > last_percent: 26 | # TODO: is this acceptable? Do we want to do something nicer? 27 | sys.stdout.write('.') 28 | sys.stdout.flush() 29 | self.count = count 30 | 31 | def format_size(self, bytes): 32 | if bytes > 1000*1000: 33 | return '%.1fMb' % (bytes/1000.0/1000) 34 | elif bytes > 10*1000: 35 | return '%iKb' % (bytes/1000) 36 | elif bytes > 1000: 37 | return '%.1fKb' % (bytes/1000.0) 38 | else: 39 | return '%ibytes' % bytes 40 | 41 | 42 | class Cache(object): 43 | '''A local file cache (and url retriever). 
44 | ''' 45 | 46 | def __init__(self, path='.'): 47 | ''' 48 | @param path: path to cache (defaults to current directory) 49 | ''' 50 | self.path = path 51 | if not os.path.exists(self.path): 52 | os.makedirs(path) 53 | 54 | def retrieve(self, url, force=False): 55 | '''Retrieve url into cache and return the local path to it.''' 56 | dest = self.cache_path(url) 57 | if not os.path.exists(dest) or force: 58 | self.download(url, dest) 59 | return dest 60 | 61 | def cache_path(self, url): 62 | '''Local path for url within cache.''' 63 | name = self.basename(url) 64 | dest = os.path.join(self.path, name) 65 | return dest 66 | 67 | def filepath(self, url): 68 | '''Deprecated: use cache_path''' 69 | return self.cache_path(url) 70 | 71 | def stream(self, url): 72 | fp = self.cache_path(url) 73 | if not os.path.exists(fp): 74 | return None 75 | else: 76 | return open(fp) 77 | 78 | @classmethod 79 | def basename(self, url): 80 | scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) 81 | result = path.split('/')[-1] 82 | if query: 83 | # escape '/' as otherwise path problems 84 | result += '?' + query.replace('/', '%47') 85 | return result 86 | 87 | @classmethod 88 | def download(self, url, dest=None): 89 | '''Download a file from a url. 
90 | ''' 91 | if not dest: 92 | dest = self.basename(url) 93 | print 'Retrieving %s' % url 94 | prog = _Progress() 95 | urllib.urlretrieve(url, dest, reporthook=prog.dl_progress) 96 | 97 | # for backwards compatability 98 | @classmethod 99 | def dl(self, url, dest=None): 100 | return self.download(url, dest) 101 | 102 | -------------------------------------------------------------------------------- /swiss/clitools.py: -------------------------------------------------------------------------------- 1 | '''Expose methods or functions as commands on the command line 2 | 3 | Example usage:: 4 | 5 | # in your code 6 | from swiss.clitools import _main 7 | if __name__ == '__main__': 8 | # expose everything in current module 9 | _main(locals()) 10 | # or if you have an object MyObject with methods you want to expose 11 | _main(MyObject) 12 | ''' 13 | import os 14 | import sys 15 | import optparse 16 | import inspect 17 | 18 | def _object_methods(obj): 19 | methods = inspect.getmembers(obj, inspect.ismethod) 20 | methods = filter(lambda (name,y): not name.startswith('_'), methods) 21 | methods = dict(methods) 22 | return methods 23 | 24 | def _module_functions(functions): 25 | local_functions = dict(functions) 26 | for k,v in local_functions.items(): 27 | if not inspect.isfunction(v) or k.startswith('_'): 28 | del local_functions[k] 29 | return local_functions 30 | 31 | def _main(functions_or_object): 32 | isobject = inspect.isclass(functions_or_object) 33 | if isobject: 34 | _methods = _object_methods(functions_or_object) 35 | else: 36 | _methods = _module_functions(functions_or_object) 37 | 38 | usage = '''%prog {action} 39 | 40 | Actions: 41 | ''' 42 | usage += '\n '.join( 43 | [ '%s: %s' % (name, m.__doc__.split('\n')[0] if m.__doc__ else '') for (name,m) 44 | in sorted(_methods.items()) ]) 45 | parser = optparse.OptionParser(usage) 46 | # Optional: for a config file 47 | # parser.add_option('-c', '--config', dest='config', 48 | # help='Config file to use.') 49 | 
class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
        * Allow a trailing qualifiers e.g. fl.
        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
          can do 18??

    Restriction on ISO8601:
        * Truncation (e.g. of centuries) is *not* permitted.
        * No week and day representation e.g. 1999-W01
    """

    def __init__(self, year=None, month=None, day=None, qualifier=''):
        # force = month or day or qualifier
        force = False
        self.year = self._cvt(year, rjust=4, force=force)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        '''Normalize one date component to a zero-padded string ('' if unset).'''
        if val:
            tmp = unicode(val).strip()
            if tmp.startswith('-'):
                tmp = '-' + tmp[1:].rjust(rjust, '0')
            else:
                tmp = tmp.rjust(rjust, '0')
            return tmp
        elif force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        else:
            return ''

    def __str__(self):
        out = self.isoformat()
        if self.qualifier:
            # leading space is important as ensures when no year sort in right
            # order as ' ' < '1'
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        WARNING: does not replace '?' in dates unless strict=True.
        '''
        out = self.year
        # what do we do when no year ...
        for val in [self.month, self.day]:
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # BUGFIX: the named groups (<year> etc.) had been lost from the pattern,
    # making it fail to compile; from_str reads them back via
    # m.group('year') / ('month') / ('day') / ('qualifier').
    our_re_pat = r'''
        (?P<year> -?[\d?]+)
        (?:
                \s* - (?P<month> [\d?]{1,2})
            (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier>[^]]*) \])?
        '''
    our_re = re.compile(our_re_pat, re.VERBOSE)

    @classmethod
    def from_str(cls, instr):
        '''Undo affect of __str__'''
        if not instr:
            return cls()

        out = cls.our_re.match(instr)
        if out is None:  # no match TODO: raise Exception?
            return None
        else:
            return cls(
                out.group('year'),
                out.group('month'),
                out.group('day'),
                qualifier=out.group('qualifier')
            )

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0

        @return: float.
        '''
        if not self.year:
            return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
        if self.day:
            out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)
class DateParserBase(object):
    '''Interface for date parsers: turn a value into a FlexiDate.'''

    def parse(self, date):
        raise NotImplementedError

    def norm(self, date):
        '''Parse then render back as the normalized string form.'''
        return str(self.parse(date))


class PythonDateParser(object):
    '''Parser for values that are already datetime.date/datetime.datetime.'''

    def parse(self, date):
        return FlexiDate(date.year, date.month, date.day)


try:
    import dateutil.parser
    dateutil_parser = dateutil.parser.parser()
except Exception:
    # dateutil is an optional dependency; DateutilDateParser.parse then
    # simply returns None. (Narrowed from a bare except:.)
    dateutil_parser = None


class DateutilDateParser(DateParserBase):
    '''String date parser built on dateutil, producing FlexiDate objects.'''

    _numeric = re.compile(r"^[0-9]+$")

    def parse(self, date, **kwargs):
        '''Parse string `date` into a FlexiDate (None if unparseable or if
        dateutil is unavailable).

        :param **kwargs: any kwargs accepted by dateutil.parse function.
        '''
        qualifiers = []
        if dateutil_parser is None:
            return None
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates
        # NOTE: 'B.C.' has already been rewritten to 'BC' above, so testing
        # for 'BC' alone suffices (the removed 'B.C.' check was unreachable).
        if date.startswith('-') or 'BC' in date:
            pre0AD = True
        else:
            pre0AD = False
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: 'c.1950' or 'c1950'
        circa_match = re.match(r'(.*)c\.?\s*(\d+.*)', date)
        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            date = ''.join(circa_match.groups())

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records
        p_match = re.match(r"^p(\d+)", date)
        if p_match:
            date = date[1:]

        # Deal with uncertainty: '1985?'
        uncertainty_match = re.match(r'([0-9xX]{4})\?', date)
        if uncertainty_match:
            # remove the ?
            date = date[:-1]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        # NOTE: relies on dateutil 1.x _parse return shape (see setup.py pin)
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        # Note: Years of less than 3 digits not interpreted by
        # dateutil correctly
        #   e.g. 87 -> 1987
        #        4  -> day 4 (no year)
        # Both cases are handled in this routine
        if res.year is None and res.day:
            year = res.day
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff
        if pre0AD:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
27 | # you can omit this if local path and remote path are the same 28 | environ['PATH_INFO'] = '/my_destination_path' 29 | deliverance_proxy(environ, start_response) 30 | ''' 31 | import logging 32 | 33 | import paste.urlmap 34 | import deliverance.middleware 35 | import paste.proxy 36 | from webob import Request, Response 37 | from deliverance.middleware import DeliveranceMiddleware, SubrequestRuleGetter 38 | from deliverance.log import PrintingLogger 39 | 40 | 41 | default_deliverance_rules = \ 42 | ''' 43 | 44 | 45 | 47 | 48 | 49 | 50 | 51 | 54 | 55 | 56 | ''' 57 | 58 | def create_deliverance_proxy(proxy_base_url, theme_html, rules_xml=None): 59 | '''Proxy to another url with re-theming using deliverance. 60 | 61 | Based on http://rufuspollock.org/code/deliverance 62 | 63 | :param proxy_base_url: base destination url we are proxying to. 64 | :param theme_html: string providing html theme to use for re-themeing. 65 | :param rules_xml: (optional) deliverance rules xml as a string. If not 66 | provided use `default_deliverance_rules`. For info on rulesets see 67 | deliverance docs. We require that ruleset support a single 68 | substitution string '%s' which is used to insert internal mountpoint 69 | for the them ('/_deliverance_theme.html'). 
70 | ''' 71 | theme_url = '/_deliverance_theme.html' 72 | # use a urlmap so we can mount theme and urlset 73 | app = paste.urlmap.URLMap() 74 | # set up theme consistent with our rules file 75 | app[theme_url] = Response(theme_html) 76 | 77 | if rules_xml: 78 | rules = rules_xml 79 | else: 80 | rules = default_deliverance_rules 81 | rules = rules % theme_url 82 | app['/_deliverance_rules.xml'] = Response(rules, content_type="application/xml") 83 | 84 | class MyProxy(object): 85 | def __init__(self, proxy_base_url): 86 | self.proxy = paste.proxy.Proxy(proxy_base_url) 87 | 88 | def __call__(self, environ, start_response): 89 | req = Request(environ) 90 | res = req.get_response(self.proxy) 91 | res.decode_content() 92 | return res(environ, start_response) 93 | 94 | app['/'] = MyProxy(proxy_base_url) 95 | deliv = DeliveranceMiddleware(app, SubrequestRuleGetter('/_deliverance_rules.xml'), 96 | PrintingLogger, 97 | log_factory_kw=dict(print_level=logging.WARNING)) 98 | return deliv 99 | 100 | -------------------------------------------------------------------------------- /swiss/id.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import uuid 3 | 4 | def compress_uuid(_uuid): 5 | '''Provided shortened string representation of UUID via base64 encoding. 6 | 7 | @return: 22 character base64 encoded version of UUID. 8 | ''' 9 | if isinstance(_uuid, basestring): 10 | _uuid = uuid.UUID(_uuid) 11 | encode = base64.b64encode(_uuid.bytes, '_-') 12 | # throw away trailing == 13 | return encode[:22] 14 | 15 | def uncompress_uuid(b64_encoded): 16 | '''Reverse compress_uuid 17 | 18 | @return: 36 char str representation of uuid. 
19 | ''' 20 | b64_encoded = str(b64_encoded) 21 | if not b64_encoded.endswith('=='): 22 | b64_encoded += '==' 23 | out = base64.b64decode(b64_encoded, '_-') 24 | _uuid = uuid.UUID(bytes=out) 25 | return str(_uuid) 26 | 27 | 28 | import struct 29 | def int_to_b32(int_): 30 | out = struct.pack('1i', int_) 31 | out = base64.b32encode(out) 32 | # throw away trailing '=' 33 | return out[:-1] 34 | 35 | def b32_to_int(b32): 36 | out = base64.b32decode(b32+'=', casefold=True) 37 | out = struct.unpack('1i', out)[0] 38 | return out 39 | 40 | -------------------------------------------------------------------------------- /swiss/misc.py: -------------------------------------------------------------------------------- 1 | # TODO: create a strict option where None is returned on failed convert rather 2 | # than original value 3 | placeholders = [ '', '-', '#' ] 4 | def floatify(value): 5 | '''Convert value to a float if possible. 6 | 7 | @return: Floatified value. If value is blank or placeholder ('-') return 8 | None. Can deal with ',' in value. Will also floatify dates. If nothing 9 | works returns original value. 10 | ''' 11 | if value is None: 12 | return None 13 | if isinstance(value, basestring): 14 | stripped = value.strip() 15 | if not stripped or stripped in placeholders: 16 | return None 17 | else: 18 | # often numbers have commas in them like 1,030 19 | v = value.replace(',', '') 20 | try: 21 | newval = float(v) 22 | return newval 23 | except: 24 | pass 25 | # will return original value if fails 26 | return date_to_float(value) 27 | 28 | def floatify_matrix(matrix): 29 | return [ [ floatify(col) for col in row ] for row in matrix ] 30 | 31 | # TODO: remove/convert to using date.FlexiDate.as_float() 32 | import datetime 33 | def date_to_float(date): 34 | '''Convert a date to float. 
35 | 36 | Accepts either a date object or a string parseable to a date object 37 | 38 | @return: converted value or original if conversion fails 39 | ''' 40 | import dateutil.parser 41 | if isinstance(date, basestring): 42 | try: # simple year 43 | return float(date) 44 | except: 45 | pass 46 | try: 47 | val = dateutil.parser.parse(date, default=datetime.date(1,1,1)) 48 | except: 49 | return date 50 | else: 51 | val = date 52 | 53 | if isinstance(val, datetime.date): 54 | fval = val.year + val.month / 12.0 + val.day / 365.0 55 | return round(fval, 3) 56 | else: 57 | return val 58 | 59 | def make_series(matrix, xcol, ycols=None): 60 | '''Take a matrix and return series (i.e. list of tuples) corresponding to 61 | specified column indices. 62 | 63 | E.g. if matrix is: 64 | [ [1,2,3,4] 65 | [5,6,7,8] ] 66 | 67 | and xcol = 0, ycols=[1,3] then output is: 68 | 69 | [ 70 | [ [1,2], [5,6] ], 71 | [ [1,4], [5,8] ], 72 | ] 73 | 74 | If ycols not defined then return all possible series (excluding xcol 75 | with itself. 
76 | ''' 77 | cols = zip(*matrix) 78 | if ycols is None: 79 | ycols = range(len(cols)) 80 | del ycols[xcol] 81 | cols = floatify_matrix(cols) 82 | def is_good(value): 83 | if value is None: return False 84 | tv = str(value) 85 | stopchars = [ '', '-' ] 86 | if tv in stopchars: 87 | return False 88 | return True 89 | def is_good_tuple(tuple): 90 | return is_good(tuple[0]) and is_good(tuple[1]) 91 | 92 | xcoldata = cols[xcol] 93 | ycols = [ cols[ii] for ii in ycols ] 94 | series = [ filter(is_good_tuple, zip(xcoldata, col)) for col in ycols ] 95 | return series 96 | 97 | -------------------------------------------------------------------------------- /swiss/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/parse/__init__.py -------------------------------------------------------------------------------- /swiss/parse/name.py: -------------------------------------------------------------------------------- 1 | '''Parse names of people into a standard format.''' 2 | 3 | import re 4 | 5 | titles = [ 6 | u'Ayatollah', 7 | u'Baron', 8 | u'Bishop', 9 | u'Dame', 10 | u'Dr', 11 | u'Fr', 12 | u'Graf', 13 | u'King', 14 | u'Lady', 15 | u'Maj', 16 | u'Major', 17 | u'Mrs', 18 | u'Prof', 19 | u'Rev', 20 | u'Sir', 21 | u'St', 22 | ] 23 | 24 | class Name(object): 25 | '''A name of a person or entity. 26 | 27 | Not a domain object but a convenient way to handle/parse names. 
28 | 29 | Attributes: 30 | title 31 | ln: last name 32 | firstnames: first names as list 33 | ''' 34 | def __init__(self, ln='', fns=None, title=''): 35 | self.ln = ln 36 | self.fns = fns 37 | if self.fns is None: self.fns = [] 38 | self.title = title 39 | 40 | def norm(self): 41 | '''Return normalised name string (LastFirst format) 42 | ''' 43 | return name_tostr(self) 44 | 45 | def __str__(self): 46 | '''Display name using normalised format 47 | ''' 48 | return self.norm() 49 | 50 | class NameParserBase(object): 51 | regex_remove_fullstops = re.compile(r'(\w{2,})\.(\W|$)', re.UNICODE) 52 | 53 | def parse(self, fullname): 54 | '''Parse the `fullname` string into a `Name` object. 55 | 56 | @return: `Name` object for `fullname` 57 | ''' 58 | if fullname is None: 59 | return Name() 60 | fullname = unicode(fullname.strip()) 61 | if not fullname: 62 | return Name() 63 | 64 | # remove words ending '.', e.g. 'Bosch.' 65 | fullname = self.regex_remove_fullstops.sub(r'\1\2', fullname) 66 | 67 | # make sure initials are separted by ' ' 68 | # but first deal with special edge case like [Major.] 69 | # fullname = fullname.replace('.]', ']') 70 | fullname = fullname.replace('.', '. ') 71 | name = self._toparts(fullname) 72 | name.ln = self.normcase(name.ln) 73 | name.fns = [ self.normcase(x) for x in name.fns ] 74 | name.title = self.normcase(name.title) 75 | return name 76 | 77 | def _toparts(self, fullname): 78 | '''Implement in inheriting classes, called by parse. 79 | ''' 80 | raise NotImplementedError() 81 | 82 | def tostr(self, name): 83 | '''Convert name object back into a string. 84 | ''' 85 | raise NotImplementedError() 86 | 87 | def normcase(self, name): 88 | # useful to handle none and you often get this from regexes 89 | if name is None: 90 | return '' 91 | name = name.strip() 92 | if name.upper() == name or name.lower() == name: 93 | return name.capitalize() 94 | # avoid issues with e.g. 
McTaggart 95 | else: 96 | return name 97 | 98 | def untitlize(self, _str): 99 | '''Return title contained in _str if a title else return empty string. 100 | ''' 101 | title = _str.strip() 102 | title = _str.strip('()') 103 | if title in titles: 104 | return title 105 | # always assume something in square brackets is a title 106 | elif title.startswith('[') and title.endswith(']'): 107 | return title[1:-1].strip() 108 | else: 109 | return '' 110 | 111 | def titlize(self, _str): 112 | return u'[' + _str + u']' 113 | 114 | def norm(self, date): 115 | return str(self.parse(date)) 116 | 117 | 118 | class LastFirst(NameParserBase): 119 | '''Parse and creates names of form: 120 | 121 | lastname, first-names-in-order [title] 122 | ''' 123 | def _toparts(self, fullname): 124 | if ',' not in fullname and ' ' in fullname: 125 | raise ValueError('Expected "," in name: %s' % fullname) 126 | name = Name() 127 | # NB: if more than 2 commas just ignore stuff after 2nd one 128 | parts = fullname.split(',') 129 | name.ln = parts[0] 130 | name.fns = parts[1].strip().split() 131 | if name.fns: 132 | title = self.untitlize(name.fns[-1]) 133 | if title: 134 | name.title = title 135 | del name.fns[-1] 136 | return name 137 | 138 | def tostr(self, name): 139 | if name.ln or name.fns: 140 | fns = ' '.join(name.fns) 141 | if not fns: 142 | out = name.ln 143 | else: 144 | out = unicode(', '.join((name.ln, ' '.join(name.fns)))) 145 | else: 146 | return '' 147 | if name.title: 148 | out = out + u' [%s]' % name.title 149 | return out 150 | 151 | 152 | class FirstLast(NameParserBase): 153 | '''Parse and create names of form: 154 | 155 | [title] first-names last-name 156 | ''' 157 | def _toparts(self, fullname): 158 | name = Name() 159 | if ',' in fullname: 160 | raise ValueError('Should not have "," in FirstLast type name: %s' % 161 | fullname) 162 | parts = fullname.split() 163 | name.ln = parts[-1] 164 | name.fns = parts[:-1] 165 | if name.fns: 166 | title = self.untitlize(name.fns[0]) 167 | 
class TabularData(object):
    """Holder for tabular data

    NB:
        * Assume data organized in rows.
        * No type conversion so all data will be as entered.

    Properties:
        * data: data itself provided as array of arrays
        * header: associated header columns (if they exist)

    TODO: handling of large datasets (iterators?)
    """

    def __init__(self, data=None, header=None):
        """
        Initialize object. If data or header not set they are defaulted to
        empty list.

        NB: must use None as default value for arguments rather than []
        because [] is mutable and using it will result in subtle bugs. See:
        'Default parameter values are evaluated when the function definition
        is executed.' [http://www.python.org/doc/current/ref/function.html]
        """
        self.data = []
        self.header = []
        if data is not None:
            self.data = data
        if header is not None:
            self.header = header

    def __repr__(self):
        out = []
        if self.header:
            out.append(self.header)
        # limit to 10 items
        out += self.data[0:10]
        return repr(out)

    def __str__(self):
        return repr(self)

    def __iter__(self):
        return self.data.__iter__()

    @classmethod
    def from_list(cls, list_, header=True):
        """Build a TabularData from a list of rows.

        @param header: if True (default) consume the first row of list_ as
            the header. BUGFIX: this flag was previously accepted but
            ignored (the first row was always taken as the header); it is
            now honoured, with the default preserving the old behaviour.
        """
        if header:
            return cls(header=list_[0], data=list_[1:])
        else:
            return cls(data=list_)

    def to_list(self):
        """Inverse of from_list: header row (if present) then data rows."""
        if self.header:
            return [self.header] + self.data
        else:
            return self.data
import csv
import codecs
class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8

    (Python 2 helper: the stdlib csv module cannot consume unicode directly,
    so we feed it utf-8 encoded byte strings instead.)
    """
    def __init__(self, f, encoding=None):
        # f: file-like object; encoding: charset of f, or None when f
        # already yields unicode strings
        if encoding:
            self.reader = codecs.getreader(encoding)(f)
        else: # already unicode so just return f
            self.reader = f

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: next line of the stream, utf-8 encoded
        return self.reader.next().encode('utf-8')

class CsvReader(ReaderBase):
    """Read data from a csv file into a TabularData structure

    Note that the (Python 2) csv module does *not* support unicode:

    > This version of the csv module doesn't support Unicode input. Also, there
    > are currently some issues regarding ASCII NUL characters. Accordingly,
    > all input should be UTF-8 or printable ASCII to be safe; see the examples
    > in section 9.1.5. These restrictions will be removed in the future.
    """

    def read(self, filepath_or_fileobj=None, encoding=None, **kwargs):
        """Read in a csv file and return a TabularData object.

        @param filepath_or_fileobj: path or file like object.
        @param encoding: if set use this instead of default encoding set in
            __init__ to decode the file like object. NB: will check if fileobj
            already in unicode in which case this is ignored.
        @param kwargs: all further kwargs are passed to the underlying
            `csv.reader` function
        @return tabular data object (all values encoded as utf-8).
        """
        super(CsvReader, self).read(filepath_or_fileobj)
        if encoding:
            self.encoding = encoding
        tabData = TabularData()

        # read the whole stream once so the Sniffer can inspect it for a
        # header row; we seek(0) afterwards to re-read the real data
        sample = self.fileobj.read()
        # first do a simple test -- maybe sample is already unicode
        if type(sample) == unicode:
            encoded_fo = UTF8Recoder(self.fileobj, None)
        else:
            sample = sample.decode(self.encoding)
            encoded_fo = UTF8Recoder(self.fileobj, self.encoding)
        sample = sample.encode('utf-8')
        sniffer = csv.Sniffer()
        hasHeader = sniffer.has_header(sample)

        # rewind: the sample read above consumed the entire stream.
        # NOTE(review): assumes the file object is seekable -- confirm for
        # callers that pass sockets/pipes.
        self.fileobj.seek(0)
        ourkwargs = {
            'skipinitialspace': True
        }
        if kwargs:
            ourkwargs.update(kwargs)

        reader = csv.reader(encoded_fo, **ourkwargs)
        if hasHeader:
            tabData.header = reader.next()
        for row in reader:
            tabData.data.append(row)
        return tabData

# for backwards compatibility
ReaderCsv = CsvReader
class LatexWriter(WriterBase):
    '''Write tabular data as rows of a LaTeX table: cells joined by "&",
    rows terminated by "\\\\" and "\\hline".
    '''

    def write(self, tabular_data, fileobj, has_row_headings=False):
        '''Write tabular_data to fileobj as LaTeX table rows.

        @param has_row_headings: if True, the first cell of every row is
            rendered bold like a heading.
        '''
        self.has_row_headings = has_row_headings
        # work on a copy so we never permanently insert the header row into
        # the caller's data (bug fix: previous code mutated
        # tabular_data.data, duplicating the header on repeated writes)
        matrix = list(tabular_data.data)
        has_header = len(tabular_data.header) > 0
        if has_header:
            matrix.insert(0, tabular_data.header)
        out = self._write(matrix, has_header)
        fileobj.write(out)

    def _write(self, matrix, has_header=True):
        '''Render a list of rows (first row optionally a header) to LaTeX.'''
        # bug fix: return '' rather than None for empty input so that
        # write() never passes None to fileobj.write()
        if len(matrix) == 0:
            return ''
        # no hline on first row as this seems to mess up latex \input
        # http://groups.google.com/group/comp.text.tex/browse_thread/thread/1e1db553a958ebd8/0e590a22cb59f43d
        out = '%s' % self.process_row(matrix[0], has_header)
        for row in matrix[1:]:
            out += self.process_row(row)
        return out

    def process_row(self, row, heading=False):
        '''Render a single row; first cell is bold when heading or
        has_row_headings is set.'''
        if len(row) == 0:
            # bug fix: was None (implicit), which broke the string
            # concatenation in _write
            return ''
        out = '%s' % self.process_cell(row[0], heading or self.has_row_headings)
        for cell in row[1:]:
            out += ' & %s' % self.process_cell(cell, heading)
        out += ' \\\\\n\hline\n'
        return out

    def process_cell(self, cell, heading=False):
        '''Convert one cell value to escaped LaTeX text (bold if heading).'''
        cell_text = self.value_to_str(cell)
        cell_text = self.escape(cell_text)
        if heading:
            return '\\textbf{%s}' % cell_text
        else:
            return cell_text

    def escape(self, text):
        '''Backslash-escape the LaTeX special characters & and %.'''
        escape_chars = [ '&', '%' ]
        out = text
        for ch in escape_chars:
            out = out.replace(ch, '\\%s' % ch)
        return out
'''TabularData from a Google Docs Spreadsheet.
'''
from base import ReaderBase, TabularData
import gdata.spreadsheet.service
import gdata.spreadsheet.text_db


class GDocsReaderTextDb(ReaderBase):
    '''Read a google docs spreadsheet using the gdata.spreadsheet.text_db
    library.

    NB: any blank line in spreadsheet will be taken as terminating data.
    '''
    def __init__(self, spreadsheet_id, username=None, password=None,
            id_is_name=False):
        '''
        @param spreadsheet_id: gdoc id or name (?key={id} in url). If name you
        must set id_is_name to True.
        @param username: google account username.
        @param password: google account password.
        '''
        # do not pass spreadsheet_id down as it will be url or sheet name
        super(GDocsReaderTextDb, self).__init__()
        self.source = spreadsheet_id
        self.id_is_name = id_is_name
        self.gd_client = gdata.spreadsheet.text_db.DatabaseClient(
            username=username,
            password=password)

    def load_text_db_table(self, sheet_name='Sheet1'):
        '''Load text_db Table object corresponding to specified sheet_name.
        '''
        super(GDocsReaderTextDb, self).read(None)
        # a spreadsheet is a "database"; look it up by name or by key
        if self.id_is_name:
            dbs = self.gd_client.GetDatabases(name=self.source)
        else:
            dbs = self.gd_client.GetDatabases(spreadsheet_key=self.source)
        assert len(dbs) >= 1, 'No spreadsheet of that name/id'
        db = dbs[0]
        # each worksheet is a "table" within the database
        table = db.GetTables(name=sheet_name)[0]
        return table

    def read(self, sheet_name='Sheet1'):
        '''Load the specified google spreadsheet worksheet as a L{TabularData}
        object.

        @return L{TabularData} object.
        '''
        text_db_table = self.load_text_db_table(sheet_name)
        tdata = TabularData()
        # LookupFields populates .fields with the worksheet's column names
        text_db_table.LookupFields()
        tdata.header = text_db_table.fields
        # finds all records it seems (empty query string == no filter)
        rows = text_db_table.FindRecords('')
        for row in rows:
            rowdata = []
            for colname in tdata.header:
                rowdata.append(row.content[colname])
            tdata.data.append(rowdata)
        return tdata
import re
from HTMLParser import HTMLParser

from base import TabularData, ReaderBase, WriterBase


class HtmlReader(ReaderBase):
    '''Read data from HTML table into L{TabularData}.
    '''
    def read(self, filepath_or_fileobj=None, table_index=0):
        '''Read data from fileobj.

        NB: post read all tables extracted are in attribute named 'tables'.

        @arg table_index: if multiple tables in the html return table at this
        index.
        @return: L{TabularData} object (all content in the data part, i.e. no
        header).
        '''
        super(HtmlReader, self).read(filepath_or_fileobj)
        parser = _OurTableExtractor()
        parser.reset()
        parser.feed(self.fileobj.read())
        # keep every extracted table around for callers that want more than
        # the one at table_index
        self.tables = parser.tables
        return self.tables[table_index]


class _OurTableExtractor(HTMLParser):
    '''HTMLParser subclass that collects each table in the input as a
    L{TabularData} (rows of cell text) in self.tables.

    # TODO: tbody, thead etc
    # TODO: nested tables

    # TODO: will barf on bad html so may need to run tidy first ...
    # tidy -w 0 -b -omit -asxml -ascii
    '''
    def reset(self):
        HTMLParser.reset(self)
        # accumulation state: all tables / current table rows / current row
        # cells / current cell text
        self.tables = []
        self._rows = []
        self._row = []
        self._text = ''

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self._row = []
        elif tag == 'td' or tag == 'th':
            self._text = ''
        elif tag == 'br':
            # preserve explicit line breaks inside a cell
            self._text += '\n'

    def handle_endtag(self, tag):
        if tag == 'tr':
            self._rows.append(self._row)
        if tag == 'td' or tag == 'th':
            self._row.append(self._text)
        if tag == 'table':
            self.tables.append(TabularData(data=self._rows))
            self._rows = []

    def handle_data(self, data):
        # strip whitespace around each text fragment before accumulating
        self._text += data.strip()
84 | Allow for addition of row and column headings 85 | 86 | @return xhtml table containing data 87 | 88 | @param data: table of data that makes up table 89 | @param caption: the caption for the table (if empty no caption created) 90 | @param rowHeadings: additional headings for rows (separate from 91 | tabulardata) 92 | """ 93 | columnHeadings = tabulardata.header 94 | data = tabulardata.data 95 | haveRowHeadings = (len(rowHeadings) > 0) 96 | 97 | htmlTable = ' 0: 110 | if haveRowHeadings and numColHeads == len(data[0]): 111 | # [[TODO: is this dangerous? should i make a copy ...]] 112 | columnHeadings.insert(0, '') 113 | htmlTable += self.writeHeading(columnHeadings) 114 | 115 | htmlTable += '' 116 | if self.pretty_print: 117 | htmlTable += '\n' 118 | 119 | for ii in range(0, len(data)): 120 | # have to add 1 as first row is headings 121 | if haveRowHeadings: 122 | htmlTable += self.writeRow(data[ii], rowHeadings[ii]) 123 | else: 124 | htmlTable += self.writeRow(data[ii]) 125 | 126 | htmlTable += '' 127 | 128 | if self.pretty_print: 129 | fileobj.write(self.prettyPrint(htmlTable)) 130 | else: 131 | fileobj.write(htmlTable) 132 | 133 | def value_to_str(self, value): 134 | import cgi 135 | out = super(HtmlWriter, self).value_to_str(value) 136 | out = cgi.escape(out) 137 | return out 138 | 139 | def writeHeading(self, row): 140 | """ 141 | Write heading for html table () 142 | """ 143 | result = '' 144 | result += self.writeGeneralRow(row, 'th') 145 | result += '' 146 | if self.pretty_print: 147 | result += '\n' 148 | return result 149 | 150 | def writeRow(self, row, rowHeading = ''): 151 | result = '' 152 | if rowHeading != '': 153 | result = '%s' % self.value_to_str(rowHeading) 154 | result += self.writeGeneralRow(row, 'td') 155 | result = '%s' % result 156 | if self.pretty_print: 157 | result += '\n' 158 | return result 159 | 160 | def writeGeneralRow(self, row, tagName): 161 | result = '' 162 | for ii in range(len(row)): 163 | result += '<%s>%s' % (tagName, 
def transpose(data):
    '''Transpose a list of lists.

    Or do it directly: data = zip(*data)

    NB: under Python 3 ``zip`` returns an iterator, so wrap the result in
    ``list`` if you need indexing or repeated iteration.
    '''
    return zip(*data)

def select_columns(matrix, cols):
    '''Return a matrix with only those column indexes in cols.

    Columns appear in ascending index order (as before). Bug fix: the
    caller's ``cols`` list is no longer sorted in place, and the transposed
    matrix is materialized so indexing works even where ``zip`` returns an
    iterator.
    '''
    tsp = list(transpose(matrix))
    out = [tsp[c] for c in sorted(cols)]
    return transpose(out)


def pivot(table, left, top, value):
    """Unnormalize (pivot) a normalised input set of tabular data.

    @param table: simple list of lists or a L{TabularData} object.
    @param left: column (index or header name) whose values label the rows.
    @param top: column (index or header name) whose values become columns.
    @param value: column (index or header name) supplying the cell values.

    Eg. To transform the tabular data like

        Name, Year, Value
        -----------------------
        'x', 2004, 1
        'y', 2004, 2
        'x', 2005, 3
        'y', 2005, 4

    into the new list:

        Year, 'x', 'y'
        ------------------------
        2004, 1, 2
        2005, 3, 4

    you would do:

        pivot(tabulardata, 1, 0, 2)

    OR (requires header to exist):

        pivot(tabulardata, 'Year', 'Name', 'Value')
    """
    # allow columns to be named by header rather than index
    if not isinstance(left, int):
        left = table.header.index(left)
    if not isinstance(top, int):
        top = table.header.index(top)
    if not isinstance(value, int):
        value = table.header.index(value)

    rs = TabularData()
    # construct double dict keyed by left values
    tdict = {}
    xvals = set()
    yvals = set()
    for row in table:
        xval = row[left]
        if xval not in tdict:
            tdict[xval] = {}
        tdict[xval][row[top]] = row[value]
        xvals.add(xval)
        yvals.add(row[top])
    xvals = sorted(list(xvals))
    yvals = sorted(list(yvals))
    xhead = 'X'
    if hasattr(table, 'header') and table.header:
        xhead = table.header[left]
    rs.header = [ xhead ] + yvals
    # missing (x, y) combinations become empty strings
    rs.data = [ [x] + [ tdict[x].get(y, '') for y in yvals ] for x in xvals ]
    return rs
class JsonWriter(WriterBase):
    '''Serialize a L{TabularData} object as a JSON dict with "header" and
    "data" keys.
    '''

    def write(self, tabular_data, fileobj, indent=2):
        '''Dump tabular_data to fileobj as JSON.

        @param indent: indentation level passed through to json.dump.
        '''
        super(JsonWriter, self).write(tabular_data, fileobj)
        payload = {
            u'header': tabular_data.header,
            u'data': tabular_data.data,
        }
        json.dump(payload, fileobj, indent=indent)
    def write(self, tabular_data, fileobj):
        '''Format tabular_data as an ascii-art table and write to fileobj.

        Column widths are computed from a small sample of leading rows
        (including the header) rather than the whole dataset.
        '''
        result = ''
        formatter = None
        row_cache = []
        sample_length = 4
        rows = tabular_data.data
        if tabular_data.header:
            rows = [ tabular_data.header ] + rows
        # include header in sample rows (do we always want to?)
        sample_rows = rows[:sample_length]
        self._compute_parameters(sample_rows)
        result += self._write_separator()
        for row in rows:
            result += self._write_row(row)
        result += self._write_separator()
        fileobj.write(result)

    def _compute_parameters(self, sample_rows):
        '''Set self.numcols and self.colwidths from the sample rows.'''
        maxcols = self._get_maxcols(sample_rows)
        # 0 means "show all columns"
        if not self.number_of_columns:
            self.numcols = maxcols
        else:
            self.numcols = min(self.number_of_columns, maxcols)
        self.colwidths = []
        self._set_colwidths(sample_rows)
        if self.colwidths[0] < 2:
            msg =\
u'''It is not possible to effectively format this many columns of material with
this narrow an output window. Column width is: %s''' % self.colwidths[0]
            # TODO: log it?
            print msg

    def _write_row(self, row):
        '''Return the input 'python' row as an appropriately formatted string.
        '''
        result = '|'
        count = 0
        for cell in row[:self.numcols]:
            width = self.colwidths[count]
            result += self._format_cell(width, cell)
            count += 1
        # now pad out with extra cols as necessary
        while count < self.numcols:
            width = self.colwidths[count]
            result += self._format_cell(width, ' ')
            count += 1
        return result + '\n'

    def _write_separator(self):
        '''Return a "+----+----+" style horizontal rule line.'''
        result = '+'
        for width in self.colwidths:
            result += '-' * (width-1) + '+'
        return result + '\n'

    def _get_maxcols(self, sample_rows):
        '''Return the largest column count seen across the sample rows.'''
        maxcols = 0
        for row in sample_rows:
            maxcols = max(maxcols, len(row))
        return maxcols

    def _set_colwidths(self, sample_rows):
        '''Populate self.colwidths: fixed equal widths when output_width is
        set, otherwise each column as wide as its widest sample cell.'''
        # subtract -1 so that we have (at least) one spare screen column
        if self.output_width != 0:
            colwidth = int( (self.output_width - 1) / self.numcols)
            for ii in range(self.numcols):
                self.colwidths.append(colwidth)
        else: # make every col as wide as it needs to be
            self.colwidths = [0] * self.numcols
            for row in sample_rows:
                for ii in range(self.numcols):
                    cellwidth = len(self.value_to_str(row[ii]))
                    self.colwidths[ii] = max(self.colwidths[ii],
                            cellwidth
                            )
            self.colwidths = [ x + 1 for x in self.colwidths ]

    def _format_cell(self, width, content):
        '''Center content in a cell of the given width, truncating if the
        text does not fit, and append the "|" column separator.'''
        content = self.value_to_str(content)
        content = content.strip()
        if len(content) > width - 1:
            # TODO: be brutal (this *has* to be fixed)
            content = content[:width-1]
        return content.center(width-1) + '|'
2 | 3 | Requires xlrd 4 | ''' 5 | try: 6 | import xlrd 7 | except ImportError: # xlrd not installed 8 | pass 9 | 10 | from base import ReaderBase, TabularData 11 | 12 | class XlsReader(ReaderBase): 13 | '''Read Excel (xls) files. 14 | 15 | Requires the xlrd package (see pypi). 16 | ''' 17 | def __init__(self, filepath_or_fileobj=None): 18 | super(XlsReader, self).__init__(filepath_or_fileobj) 19 | if self.fileobj: 20 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 21 | ## TODO: fix the rest of this 22 | 23 | def read(self, fileobj=None, sheet_index=0): 24 | '''Read an excel file (provide as fileobj) and return the specified 25 | sheet as a L{TabularData} object. 26 | 27 | For convenience also store: 28 | 29 | self.book: xlrd WorkBook object 30 | 31 | @return L{TabularData} object. 32 | ''' 33 | super(XlsReader, self).read(fileobj) 34 | if fileobj: 35 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 36 | tab = TabularData() 37 | booksheet = self.book.sheet_by_index(sheet_index) 38 | data = self.extract_sheet(booksheet, self.book) 39 | tab.data = data 40 | return tab 41 | 42 | def info(self): 43 | '''Return summary info about this Excel Workbook.''' 44 | info = '' 45 | info += 'The number of worksheets is: %s\n' % self.book.nsheets 46 | info += 'Worksheet name(s):\n' % self.book.sheet_names() 47 | count = -1 48 | for sn in self.book.sheet_names(): 49 | count += 1 50 | info += '%s %s\n' % (count, sn) 51 | return info 52 | 53 | def sheet_info(self, sheet_index): 54 | '''Summary info about an xls sheet. 55 | 56 | @return: printable string giving info. 
57 | ''' 58 | import pprint 59 | sh = self.book.sheet_by_index(sheet_index) 60 | info = sh.name + '\n' 61 | info += 'Rows: %s Cols: %s\n\n' % (sh.nrows, sh.ncols) 62 | MAX_ROWS = 30 63 | for rx in range(min(sh.nrows, MAX_ROWS)): 64 | info += str(sh.row(rx)) + '\n' 65 | return info 66 | 67 | def extract_sheet(self, sheet, book): 68 | matrix = [] 69 | nrows = sheet.nrows 70 | ncols = sheet.ncols 71 | for rx in range(nrows): 72 | outrow = [] 73 | for cx in range(ncols): 74 | cell = sheet.cell(rowx=rx, colx=cx) 75 | val = self.cell_to_python(cell, book) 76 | outrow.append(val) 77 | matrix.append(outrow) 78 | return matrix 79 | 80 | def cell_to_python(self, cell, book): 81 | # annoying need book argument for datemode 82 | # info on types: http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Cell-class 83 | if cell.ctype == xlrd.XL_CELL_NUMBER: 84 | return float(cell.value) 85 | elif cell.ctype == xlrd.XL_CELL_DATE: 86 | from datetime import date 87 | # TODO: distinguish date and datetime 88 | args = xlrd.xldate_as_tuple(cell.value, book.datemode) 89 | try: 90 | return date(args[0], args[1], args[2]) 91 | except Exception, inst: 92 | print 'Error parsing excel date (%s): %s' % (args, inst) 93 | return None 94 | elif cell.ctype == xlrd.XL_CELL_BOOLEAN: 95 | return bool(cell.value) 96 | else: 97 | return cell.value 98 | 99 | 100 | -------------------------------------------------------------------------------- /swiss/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # placeholder 2 | -------------------------------------------------------------------------------- /swiss/tests/data/xls_reader_test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/tests/data/xls_reader_test.xls -------------------------------------------------------------------------------- 
/swiss/tests/parse/test_name.py: -------------------------------------------------------------------------------- 1 | import swiss.parse.name 2 | 3 | 4 | class TestName: 5 | def test_parse_name_FL(self): 6 | name = u'Ludwig Van Beethoven' 7 | out = swiss.parse.name.parse_name(name) 8 | assert out.ln == u'Beethoven' 9 | assert out.fns == ['Ludwig', 'Van'] 10 | 11 | def test_parse_name_LF_with_extra_comma(self): 12 | out = swiss.parse.name.parse_name('More, Sir Thomas,Saint') 13 | assert out.ln == 'More', out 14 | assert out.fns == ['Sir', 'Thomas'] 15 | 16 | def test_parse_name_FL_normcase(self): 17 | name = u'Ludwig van BEETHOVEN' 18 | out = swiss.parse.name.parse_name(name) 19 | assert out.ln == 'Beethoven', out 20 | 21 | def test_parse_name_LF_with_title(self): 22 | name = u'Chandos, John [Sir]' 23 | out = swiss.parse.name.parse_name(name) 24 | assert out.ln == 'Chandos', out 25 | assert out.title == 'Sir', out 26 | 27 | def test_parse_name_FL_with_title(self): 28 | name = u'Sir John CHANDOS' 29 | out = swiss.parse.name.parse_name(name) 30 | assert out.ln == 'Chandos', out 31 | assert out.title == 'Sir', out 32 | 33 | def test_parse_name_FL_with_title_2(self): 34 | name = u'Prof Benjamin AARON' 35 | out = swiss.parse.name.parse_name(name) 36 | assert out.ln == 'Aaron', out 37 | assert out.title == 'Prof', out 38 | assert out.fns == ['Benjamin'], out 39 | assert str(out) == 'Aaron, Benjamin [Prof]' 40 | 41 | def test_parse_title_with_fullstop(self): 42 | name = 'Major. 
abc xyz' 43 | out = swiss.parse.name.parse_name(name) 44 | assert out.title == 'Major', out.title 45 | 46 | def test_parse_title_with_fullstop_2(self): 47 | name = 'Xyz, Abc [Major.]' 48 | out = swiss.parse.name.parse_name(name) 49 | print out 50 | assert out.title == 'Major', out.title 51 | 52 | def test_parse_title_with_brackets(self): 53 | name = 'Dickens, Gerald (Sir)' 54 | out = swiss.parse.name.parse_name(name) 55 | assert out.title == 'Sir', out.title 56 | 57 | name = '(Sir) Gerald Dickens' 58 | out = swiss.parse.name.parse_name(name) 59 | assert out.title == 'Sir', out.title 60 | 61 | def test_parse_name_FL_initials(self): 62 | name = 'Chekhov, A.P.' 63 | out = swiss.parse.name.parse_name(name) 64 | assert out.ln == 'Chekhov' 65 | assert out.fns == ['A.', 'P.'], out 66 | 67 | def test_strip_fullstops(self): 68 | name = 'George. Bosch' 69 | out = swiss.parse.name.normalize(name) 70 | assert out == 'Bosch, George' 71 | 72 | name = 'George. a.p. Bosch.' 73 | out = swiss.parse.name.normalize(name) 74 | assert out == 'Bosch, George A. P.', out 75 | 76 | name = 'Geo.rge. Bosch' 77 | out = swiss.parse.name.normalize(name) 78 | assert out == 'Bosch, Geo. Rge', out 79 | 80 | name = 'Geo.Smith. Bosch' 81 | out = swiss.parse.name.normalize(name) 82 | assert out == 'Bosch, Geo. 
Smith', out 83 | 84 | def test_tostr(self): 85 | name = swiss.parse.name.Name(ln='Beethoven', fns=['Ludwig', 'van']) 86 | exp = u'Beethoven, Ludwig van' 87 | out = swiss.parse.name.name_tostr(name) 88 | assert out == exp, out 89 | 90 | def test_with_no_name(self): 91 | name = swiss.parse.name.parse_name(' ') 92 | assert name.ln is '', name 93 | out = swiss.parse.name.normalize(' ') 94 | assert out == '', out 95 | 96 | def test_surname(self): 97 | name = u'SCHUBERT' 98 | out = str(swiss.parse.name.parse_name(name)) 99 | assert out == 'Schubert' 100 | 101 | -------------------------------------------------------------------------------- /swiss/tests/tabular/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/tests/tabular/__init__.py -------------------------------------------------------------------------------- /swiss/tests/tabular/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from StringIO import StringIO 3 | 4 | import swiss.tabular 5 | 6 | class TestTabularData: 7 | testlist = [ ['X', 'Y'], [1,2], [3,4] ] 8 | 9 | def test_1(self): 10 | tabular = swiss.tabular.TabularData() 11 | assert tabular.header == [] 12 | 13 | def test_from_list(self): 14 | out = swiss.tabular.TabularData.from_list(self.testlist) 15 | assert out.header == [ 'X', 'Y' ] 16 | assert out.data == [ [1,2], [3,4] ] 17 | 18 | def test_to_list(self): 19 | td = swiss.tabular.TabularData( 20 | header=['X', 'Y'], 21 | data=[ [1,2], [3,4] ] 22 | ) 23 | out = td.to_list() 24 | assert out == self.testlist 25 | 26 | 27 | class TestWriterBase: 28 | def test_value_to_str(self): 29 | w = swiss.tabular.WriterBase() # round_ndigits=None 30 | out = w.value_to_str('x') 31 | assert out == u'x', out 32 | out = w.value_to_str(1) 33 | assert out == u'1', out 34 | out = w.value_to_str(1.3555) 35 | assert 
out == u'1.3555', out 36 | 37 | w = swiss.tabular.WriterBase(round_ndigits=2) 38 | out = w.value_to_str('x') 39 | assert out == u'x', out 40 | out = w.value_to_str(1) 41 | assert out == u'1', out 42 | out = w.value_to_str(1.3555) 43 | assert out == u'1.36', out 44 | 45 | w.round_ndigits = -1 46 | out = w.value_to_str(102.34) 47 | assert out == u'100', out 48 | 49 | 50 | class TestReaderCsv(object): 51 | 52 | csvdata = \ 53 | '''"header1", "header 2" 54 | 1, 2''' 55 | header = [ 'header1', 'header 2' ] 56 | data = [ ['1', '2'] ] 57 | 58 | def setUp(self): 59 | reader = swiss.tabular.ReaderCsv() 60 | fileobj = StringIO(self.csvdata) 61 | self.tab = reader.read(fileobj) 62 | 63 | def test_header(self): 64 | assert self.header == self.tab.header 65 | 66 | def test_data(self): 67 | assert self.data == self.tab.data 68 | 69 | 70 | class TestReaderCsvUnicode(TestReaderCsv): 71 | csvdata = \ 72 | u'''"headi\xf1g", "header 2" 73 | 1, 2''' 74 | header = [ u'headi\xf1g'.encode('utf-8'), 'header 2' ] 75 | data = [ ['1', '2'] ] 76 | 77 | 78 | class TestReaderCsvEncoded(TestReaderCsvUnicode): 79 | encoding = 'utf-16' 80 | csvdata = \ 81 | u'''"headi\xf1g", "header 2" 82 | 1, 2'''.encode(encoding) 83 | 84 | def setUp(self): 85 | reader = swiss.tabular.ReaderCsv() 86 | fileobj = StringIO(self.csvdata) 87 | self.tab = reader.read(fileobj, encoding=self.encoding) 88 | 89 | 90 | class TestCsvWriter: 91 | def test_writer(self): 92 | writer = swiss.tabular.CsvWriter() 93 | fo = StringIO() 94 | td = swiss.tabular.TabularData([[1,2],[3,4]], header=['one', 95 | 'two']) 96 | writer.write(td, fo) 97 | fo.seek(0) 98 | out = fo.read() 99 | exp = \ 100 | '''one,two\r 101 | 1,2\r 102 | 3,4\r\n''' 103 | assert out == exp 104 | 105 | 106 | class TestHtmlReader: 107 | 108 | inraw1 = ''' 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
12
1983
34
120 | ''' 121 | in1 = StringIO(inraw1) 122 | 123 | exp1 = [ ['1', '2'], 124 | ['1983'], 125 | ['3', '4'], 126 | ] 127 | 128 | def test_1(self): 129 | reader = swiss.tabular.HtmlReader() 130 | tab = reader.read(self.in1) 131 | assert tab.data == self.exp1 132 | 133 | 134 | class TestHtmlWriter: 135 | 136 | def setUp(self): 137 | rawData = [[1,1], [0,1]] 138 | self.indata1 = swiss.tabular.TabularData(data=rawData) 139 | self.writer1 = swiss.tabular.HtmlWriter(table_attributes={'id':1, 'class': 'data'}) 140 | 141 | def test_0_simple(self): 142 | indata1 = [[1,1], [0,1]] 143 | expected = ''+\ 144 | '
11
01
' 145 | out1 = self.writer1.write_str(self.indata1) 146 | assert expected == out1 147 | 148 | def test_col_headings(self): 149 | self.indata1.header = [u'x','y'] 150 | caption = '' 151 | expected = ''+\ 152 | '' + \ 153 | '
xy
11
01
' 154 | # no caption but headings 155 | out1 = self.writer1.write_str(self.indata1, caption) 156 | assert expected == out1 157 | 158 | def test_row_headings(self): 159 | self.indata1.header = ['x','y'] 160 | rowHeadings = ['Date 1', 'Date 2'] 161 | caption = '' 162 | expected = '' + \ 163 | '' + \ 164 | '' + \ 165 | '
xy
Date 111
Date 201
' 166 | # no caption but headings 167 | out1 = self.writer1.write_str(self.indata1, caption, rowHeadings) 168 | assert expected == out1 169 | 170 | def test_escaping(self): 171 | tdata = swiss.tabular.TabularData(header=['s&p', 'y01' 180 | # print self.writer1.prettyPrint(in1) 181 | 182 | 183 | class TestLatexWriter: 184 | 185 | matrix = [[ 'H1', 'H2'], 186 | [1,'2%'], 187 | [3,4], 188 | ] 189 | 190 | exp = \ 191 | r'''\textbf{H1} & \textbf{H2} \\ 192 | \hline 193 | 1 & 2\% \\ 194 | \hline 195 | 3 & 4 \\ 196 | \hline 197 | ''' 198 | m2l = swiss.tabular.LatexWriter() 199 | 200 | def test_escape(self): 201 | in1 = '& % $ something' 202 | exp1 = r'\& \% $ something' 203 | assert self.m2l.escape(in1) == exp1 204 | 205 | def test_table2latex(self): 206 | out = swiss.tabular.table2latex(self.matrix) 207 | self.diff(self.exp, out) 208 | assert out == self.exp 209 | 210 | def test_write(self): 211 | td = swiss.tabular.TabularData(data=self.matrix[1:], header=self.matrix[0]) 212 | out = self.m2l.write_str(td) 213 | self.diff(self.exp, out) 214 | assert out == self.exp 215 | 216 | def diff(self, str1, str2): 217 | import difflib 218 | differ = difflib.Differ() 219 | text1 = str1.splitlines(1) 220 | text2 = str2.splitlines(1) 221 | result = list(differ.compare(text1, text2)) 222 | from pprint import pprint 223 | pprint(result) 224 | 225 | 226 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_gdocs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ConfigParser import SafeConfigParser 3 | 4 | import swiss.tabular.gdocs as gdocs 5 | 6 | 7 | cfg = SafeConfigParser() 8 | if not os.path.exists('test.ini'): 9 | msg = 'To run these tests you need a config file. 
See this file for details' 10 | raise Exception(msg) 11 | cfg.readfp(open('test.ini')) 12 | username = cfg.get('gdocs', 'username') 13 | password = cfg.get('gdocs', 'password') 14 | 15 | 16 | class TestGDocsTextDb: 17 | def test_01(self): 18 | source = 'okfn-swiss-gdocs-testing' 19 | reader = gdocs.GDocsReaderTextDb(source, username, password, id_is_name=True) 20 | tdata = reader.read() 21 | assert tdata.header == ['col1', 'col2'] 22 | assert len(tdata.data) == 5, tdata 23 | 24 | 25 | # not working properly yet 26 | class _TestGDocs: 27 | def test_01(self): 28 | source = 't8GZy4Lb6jhVjCL5nrqZ5TQ' 29 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password) 30 | tdata = reader.read() 31 | assert len(tdata.data) == 6, tdata 32 | 33 | def test_02_id_is_name(self): 34 | source = 'okfn-swiss-gdocs-testing' 35 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password, id_is_name=True) 36 | tdata = reader.read() 37 | assert len(tdata.data) == 6, tdata 38 | 39 | 40 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_json.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | import swiss.tabular.tabular_json as js 3 | 4 | class TestJson: 5 | in1 = { 'header': [u'a', u'b'], 6 | 'data': [[1,2], [3,4]] 7 | } 8 | in2 = [ in1['header'] ] + in1['data'] 9 | in1sio = StringIO(js.json.dumps(in1)) 10 | in1sio.seek(0) 11 | in2sio = StringIO(js.json.dumps(in2)) 12 | in2sio.seek(0) 13 | 14 | def test_JsonReader(self): 15 | reader = js.JsonReader() 16 | out = reader.read(self.in1sio) 17 | assert out.header == self.in1['header'] 18 | assert out.data == self.in1['data'] 19 | 20 | out = reader.read(self.in2sio) 21 | assert out.header == self.in1['header'] 22 | assert out.data == self.in1['data'] 23 | 24 | def test_JsonWriter(self): 25 | writer = js.JsonWriter() 26 | td = js.TabularData(header=self.in1['header'], data=self.in1['data']) 27 | out = 
writer.write_str(td) 28 | assert js.json.loads(out) == self.in1 29 | 30 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_misc.py: -------------------------------------------------------------------------------- 1 | import swiss.tabular 2 | 3 | class TestTranspose: 4 | 5 | def test_1(self): 6 | inlist = [ 7 | [ 0, 1 ], 8 | [ 1, 0 ], 9 | ] 10 | exp = [ 11 | ( 0, 1 ), 12 | ( 1, 0 ), 13 | ] 14 | out = swiss.tabular.transpose(inlist) 15 | assert out == exp, out 16 | 17 | class TestPivot: 18 | td = swiss.tabular.TabularData( 19 | header=['Name','Year','Value'], 20 | data=[ 21 | ['x',2004,1], 22 | ['y',2004,2], 23 | ['y',2005,4], 24 | ['x',2005,3], 25 | ], 26 | ) 27 | 28 | def test_pivot_with_tabular(self): 29 | out = swiss.tabular.pivot(self.td, 1, 0, 2) 30 | assert out.data[0] == [2004, 1, 2] 31 | assert out.data[-1] == [2005, 3, 4] 32 | 33 | def test_pivot_with_tabular_2(self): 34 | out = swiss.tabular.pivot(self.td, 'Year', 'Name', 'Value') 35 | assert out.data[0] == [2004, 1, 2] 36 | 37 | def test_pivot_simple_list(self): 38 | out = swiss.tabular.pivot(self.td.data, 1, 0, 2) 39 | assert out.data[0] == [2004, 1, 2] 40 | 41 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_txt.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | 3 | from swiss.tabular.txt import * 4 | from swiss.tabular import TabularData, CsvReader 5 | 6 | class TestFormatting: 7 | 8 | sample_rows = [ 9 | ['1', '2', 'head blah', 'blah blah blah'], 10 | ['a', 'b', 'c', 'd', 'e', 'g' ], 11 | ['1', '2', 'annakarenina annakarenina annakarenina'], 12 | ] 13 | output_width = 60 14 | 15 | writer = TxtWriter(output_width=output_width) 16 | writer._compute_parameters(sample_rows) 17 | 18 | def test_1(self): 19 | assert self.writer.numcols == 6 20 | 21 | def test_colwidths(self): 22 | exp = int ((self.output_width -1) / 6) 23 | assert 
self.writer.colwidths[0] == exp 24 | 25 | def test__write_1(self): 26 | out = self.writer._write_row(self.sample_rows[0]) 27 | assert len(out) <= self.output_width 28 | 29 | def test__write_2(self): 30 | out = self.writer._write_row(self.sample_rows[0]) 31 | exp = '| 1 | 2 |head bla|blah bla| | |\n' 32 | assert out == exp 33 | 34 | def test__write_separator(self): 35 | out = self.writer._write_separator() 36 | exp = '+--------+--------+--------+--------+--------+--------+\n' 37 | 38 | 39 | 40 | class TestTxtWriter: 41 | sample = \ 42 | '''"YEAR","PH","RPH","RPH_1","LN_RPH","LN_RPH_1","HH","LN_HH" 43 | 1971,7.852361625,43.9168370988587,42.9594500501036,3.78229777955476,3.76025664867788,16185,9.69184016636035 44 | ,,abc, 45 | 1972,10.504714885,55.1134791192682,43.9168370988587,4.00939431635556,3.78229777955476,16397,9.70485367024987 46 | , ,, ''' 47 | 48 | expected = \ 49 | '''+------+------+------+------+------+------+------+------+ 50 | | YEAR | PH | RPH |RPH_1 |LN_RPH|LN_RPH| HH |LN_HH | 51 | +------+------+------+------+------+------+------+------+ 52 | | 1971 |7.8523|43.916|42.959|3.7822|3.7602|16185 |9.6918| 53 | +------+------+------+------+------+------+------+------+ 54 | | | | abc | | | | | | 55 | +------+------+------+------+------+------+------+------+ 56 | | 1972 |10.504|55.113|43.916|4.0093|3.7822|16397 |9.7048| 57 | +------+------+------+------+------+------+------+------+ 58 | | | | | | | | | | 59 | +------+------+------+------+------+------+------+------+ 60 | ''' 61 | 62 | def test_simple(self): 63 | indata = TabularData(data=[range(5),range(5,10)]) 64 | writer = TxtWriter() 65 | out = writer.write_str(indata) 66 | exp = '''+-+-+-+-+-+ 67 | |0|1|2|3|4| 68 | +-+-+-+-+-+ 69 | |5|6|7|8|9| 70 | +-+-+-+-+-+ 71 | ''' 72 | print out 73 | print exp 74 | assert out == exp 75 | 76 | def test_output_width(self): 77 | indata = TabularData(data=[range(5),range(5,10)]) 78 | writer = TxtWriter(output_width=16) 79 | out = writer.write_str(indata) 80 | outlen = 
len(out.splitlines()[0]) 81 | assert outlen == 16, outlen 82 | 83 | def test_using_csv(self): 84 | fileobj = StringIO.StringIO(self.sample) 85 | in_tdata = CsvReader(fileobj).read() 86 | writer = TxtWriter(output_width=60) 87 | out = writer.write_str(in_tdata) 88 | print out 89 | print self.expected 90 | assert self.expected == out, out 91 | 92 | -------------------------------------------------------------------------------- /swiss/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import shutil 3 | import os 4 | 5 | from swiss.cache import Cache 6 | 7 | class TestCache: 8 | @classmethod 9 | def setup_class(self): 10 | self.tmp = tempfile.mkdtemp() 11 | self.path = os.path.join(self.tmp, 'abc.txt') 12 | open(self.path, 'w').write('abc') 13 | self.url = 'file://%s' % self.path 14 | 15 | @classmethod 16 | def teardown_class(self): 17 | shutil.rmtree(self.tmp) 18 | 19 | def test_basename(self): 20 | base = 'http://www.abc.org/' 21 | in1 = base + 'xyz' 22 | out = Cache.basename(in1) 23 | assert out == 'xyz' 24 | 25 | in2 = base + 'xyz/abc.txt' 26 | out = Cache.basename(in2) 27 | assert out == 'abc.txt' 28 | 29 | in3 = base + 'membersDo?body=ABC' 30 | out = Cache.basename(in3) 31 | assert out == 'membersDo?body=ABC', out 32 | 33 | in3 = base + 'membersDo?body=data/ABC' 34 | out = Cache.basename(in3) 35 | assert out == 'membersDo?body=data%47ABC', out 36 | 37 | def test_filepath(self): 38 | r = Cache() 39 | base = 'http://www.abc.org/' 40 | in1 = base + 'xyz' 41 | out = r.filepath(in1) 42 | # ./xyz 43 | assert out.endswith('xyz'), out 44 | 45 | def test_dl(self): 46 | dest = os.path.join(self.tmp, 'out.txt') 47 | Cache.dl(self.url, dest) 48 | assert os.path.exists(dest) 49 | assert open(dest).read() == 'abc' 50 | 51 | def test_cache(self): 52 | cache = os.path.join(self.tmp, 'cache') 53 | r = Cache(cache) 54 | r.retrieve(self.url) 55 | assert os.path.exists(os.path.join(cache, 'abc.txt')) 56 | 57 
| -------------------------------------------------------------------------------- /swiss/tests/test_date.py: -------------------------------------------------------------------------------- 1 | from swiss.date import * 2 | 3 | import datetime 4 | 5 | class TestPythonStringOrdering(object): 6 | # It is impossible to find a string format such that +ve and -ve numbers 7 | # sort correctly as strings: 8 | # if (in string ordering) X < Y => -X < -Y (False!) 9 | def test_ordering(self): 10 | assert '0' < '1' 11 | assert '-10' < '10' 12 | assert '-' < '@' 13 | assert '-' < '0' 14 | assert '-100' < '-X10' 15 | assert '10' < '1000' 16 | assert '02000' < '10000' 17 | assert ' 2000' < '10000' 18 | 19 | def test_bad_ordering(self): 20 | assert ' ' < '0' 21 | assert ' ' < '-' 22 | assert not '-' < '+' 23 | assert '-100' > '-10' 24 | assert not '-100' < '-010' 25 | assert not '-100' < '- 10' 26 | assert not '-100' < ' -10' 27 | assert '10000' < '2000' 28 | assert not '-10' < ' 1' 29 | 30 | 31 | class TestFlexiDate(object): 32 | def test_init(self): 33 | fd = FlexiDate() 34 | assert fd.year == '', fd 35 | assert fd.month == '', fd 36 | 37 | fd = FlexiDate(2000, 1,1) 38 | assert fd.month == '01', fd 39 | assert fd.day== '01', fd 40 | 41 | def test_str(self): 42 | fd = FlexiDate(2000, 1, 23) 43 | assert str(fd) == '2000-01-23', '"%s"' % fd 44 | fd = FlexiDate(-2000, 1, 23) 45 | assert str(fd) == '-2000-01-23' 46 | fd = FlexiDate(2000) 47 | assert str(fd) == '2000' 48 | fd = FlexiDate(1760, qualifier='fl.') 49 | assert str(fd) == '1760 [fl.]', fd 50 | 51 | fd = FlexiDate(qualifier='anything') 52 | assert str(fd) == ' [anything]' 53 | 54 | 55 | def test_from_str(self): 56 | def dotest(fd): 57 | out = FlexiDate.from_str(str(fd)) 58 | assert str(out) == str(fd) 59 | 60 | fd = FlexiDate(2000, 1, 23) 61 | dotest(fd) 62 | fd = FlexiDate(1760, qualifier='fl.') 63 | dotest(fd) 64 | fd = FlexiDate(-1760, 1, 3, qualifier='fl.') 65 | dotest(fd) 66 | 67 | def test_as_float(self): 68 | fd = 
FlexiDate(2000) 69 | assert fd.as_float() == float(2000), fd.as_float() 70 | fd = FlexiDate(1760, 1, 2) 71 | exp = 1760 + 1/12.0 + 2/365.0 72 | assert fd.as_float() == exp, fd.as_float() 73 | fd = FlexiDate(-1000) 74 | assert fd.as_float() == float(-1000) 75 | 76 | def test_as_datetime(self): 77 | fd = FlexiDate(2000) 78 | out = fd.as_datetime() 79 | assert out == datetime.datetime(2000, 1, 1), out 80 | fd = FlexiDate(1760, 1, 2) 81 | out = fd.as_datetime() 82 | assert out == datetime.datetime(1760,1,2), out 83 | 84 | 85 | class TestDateParsers(object): 86 | def test_using_datetime(self): 87 | parser = PythonDateParser() 88 | 89 | d1 = datetime.date(2000, 1, 23) 90 | fd = parser.parse(d1) 91 | assert fd.year == '2000' 92 | 93 | d1 = datetime.datetime(2000, 1, 23) 94 | fd = parser.parse(d1) 95 | # assert str(fd) == '2000-01-23T00:00:00', fd 96 | assert str(fd) == '2000-01-23', fd 97 | 98 | def test_using_dateutil(self): 99 | parser = DateutilDateParser() 100 | 101 | in1 = '2001-02' 102 | fd = parser.parse(in1) 103 | assert str(fd) == in1, fd 104 | 105 | in1 = 'March 1762' 106 | fd = parser.parse(in1) 107 | assert str(fd) == '1762-03' 108 | 109 | in1 = 'March 1762' 110 | fd = parser.parse(in1) 111 | assert str(fd) == '1762-03' 112 | 113 | in1 = '1768 AD' 114 | fd = parser.parse(in1) 115 | assert str(fd) == '1768', fd 116 | 117 | in1 = '1768 A.D.' 118 | fd = parser.parse(in1) 119 | assert str(fd) == '1768', fd 120 | 121 | in1 = '-1850' 122 | fd = parser.parse(in1) 123 | assert str(fd) == '-1850', fd 124 | 125 | in1 = '1762 BC' 126 | fd = parser.parse(in1) 127 | assert str(fd) == '-1762', fd 128 | 129 | in1 = '4 BC' 130 | fd = parser.parse(in1) 131 | assert str(fd) == '-0004', fd 132 | 133 | in1 = '4 B.C.' 
134 | fd = parser.parse(in1) 135 | assert str(fd) == '-0004', fd 136 | 137 | def test_parse(self): 138 | d1 = datetime.datetime(2000, 1, 23) 139 | fd = parse(d1) 140 | assert fd.year == '2000' 141 | 142 | fd = parse('March 1762') 143 | assert str(fd) == '1762-03' 144 | 145 | fd = parse(1966) 146 | assert str(fd) == '1966' 147 | 148 | fd = parse('22/07/2010') 149 | assert fd.month == '07', fd.month 150 | 151 | def test_parse_ambiguous_day_month(self): 152 | fd = parse('05/07/2010') 153 | assert fd.month == '07', fd.month 154 | assert fd.day == '05', fd.month 155 | 156 | def test_parse_with_none(self): 157 | d1 = parse(None) 158 | assert d1 is None 159 | 160 | def test_parse_wildcards(self): 161 | fd = parse('198?') 162 | assert fd.year == '', fd.year # expect this to not parse 163 | # TODO but we should have a float if possible 164 | # assert fd.as_float() == u'1980', fd.as_float() 165 | 166 | def test_parse_with_qualifiers(self): 167 | fd = parse('1985?') 168 | assert fd.year == u'1985', fd 169 | assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier 170 | 171 | fd = parse('c.1780') 172 | assert fd.year == u'1780', fd 173 | assert fd.qualifier == u"Note 'circa' : c.1780", fd 174 | 175 | fd = parse('c. 1780') 176 | assert fd.year == u'1780', fd 177 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 178 | 179 | def test_ambiguous(self): 180 | # TODO: have to be careful here ... 
181 | fd = parse('1068/1069') 182 | 183 | def test_small_years(self): 184 | in1 = '23' 185 | fd = parse(in1) 186 | assert str(fd) == '0023', fd 187 | assert fd.as_float() == 23, fd.as_float() 188 | 189 | def test_small_years_with_zeros(self): 190 | in1 = '0023' 191 | fd = parse(in1) 192 | assert str(fd) == '0023', fd 193 | assert fd.as_float() == 23, fd.as_float() 194 | 195 | def test_years_with_alpha_prefix(self): 196 | in1 = "p1980" 197 | fd = parse(in1) 198 | assert str(fd) == "1980", fd 199 | 200 | -------------------------------------------------------------------------------- /swiss/tests/test_id.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import swiss.id 4 | 5 | def test_compress_and_uncompress_uuid(): 6 | hexversion = '86c3f19d-8854-4ef5-8d88-f008e0817871' 7 | 8 | out = swiss.id.compress_uuid(hexversion) 9 | assert len(out) == 22 10 | 11 | orig = swiss.id.uncompress_uuid(out) 12 | assert orig == hexversion 13 | 14 | # test unicode 15 | orig = swiss.id.uncompress_uuid(unicode(out)) 16 | assert orig == hexversion 17 | 18 | u1 = uuid.UUID(hexversion) 19 | out = swiss.id.compress_uuid(u1) 20 | assert len(out) == 22 21 | 22 | 23 | def test_int_to_b32(): 24 | def check(int_): 25 | out = swiss.id.int_to_b32(int_) 26 | assert isinstance(out, basestring) 27 | assert len(out) == 7, out 28 | 29 | back = swiss.id.b32_to_int(out) 30 | assert back == int_, (int_,back) 31 | 32 | check(1) 33 | check(2**28+1) 34 | check(2**30-1) 35 | 36 | -------------------------------------------------------------------------------- /swiss/tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from swiss.misc import * 2 | 3 | class TestFloatify: 4 | def test_floatify_1(self): 5 | x = '10' 6 | assert floatify(x) == 10.0 7 | 8 | def test_floatify_2(self): 9 | x = '1,030' 10 | assert floatify(x) == 1030.0 11 | 12 | def test_floatify_2(self): 13 | x = '' 14 | out = floatify(x) 15 
| assert out == None, out 16 | x = '#' 17 | out = floatify(x) 18 | assert out == None, out 19 | 20 | def test_floatify_matrix(self): 21 | x = [ 22 | ['1', '2'], 23 | ['abc', '3.0'] 24 | ] 25 | exp = [ 26 | [1.0, 2.0], 27 | ['abc', 3.0] 28 | ] 29 | out = floatify_matrix(x) 30 | assert out == exp 31 | 32 | 33 | class TestMakeSeries: 34 | 35 | def test_make_series(self): 36 | indata = [ [ '1980', '100', '50' ], 37 | [ '1981', '101', '51' ], 38 | [ '1982', '102', '' ], 39 | ] 40 | exp = [ 41 | [ (1980.0, 100.0), (1981.0, 101.0), (1982.0, 102.0) ], 42 | [ (1980.0, 50.0), (1981.0, 51.0) ] 43 | ] 44 | out = make_series(indata, xcol=0, ycols=[1,2]) 45 | assert out == exp, out 46 | 47 | -------------------------------------------------------------------------------- /swiss/tests/test_xls.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | import swiss.tabular 4 | 5 | class TestXlsReader: 6 | 7 | def test_stuff(self): 8 | fo = pkg_resources.resource_stream('swiss', 9 | 'tests/data/xls_reader_test.xls') 10 | reader = swiss.tabular.XlsReader(fo) 11 | tab = reader.read() 12 | assert tab.data[0][0] == 1850 13 | assert tab.data[19][1] == 12.3 14 | 15 | --------------------------------------------------------------------------------