├── .gitignore ├── .hgignore ├── .hgtags ├── README.txt ├── datautil ├── __init__.py ├── cache.py ├── cli.py.command ├── clitools.py ├── date.py ├── deliveranceproxy.py ├── id.py ├── misc.py ├── normalization │ ├── __init__.py │ ├── table_based.py │ └── text.py ├── parse │ ├── __init__.py │ └── name.py ├── scrape.py ├── tabular │ ├── __init__.py │ ├── base.py │ ├── gdocs.py │ ├── html.py │ ├── misc.py │ ├── tabular_json.py │ ├── txt.py │ └── xls.py └── tests │ ├── __init__.py │ ├── data │ └── xls_reader_test.xls │ ├── parse │ └── test_name.py │ ├── tabular │ ├── __init__.py │ ├── test_base.py │ ├── test_gdocs.py │ ├── test_json.py │ ├── test_misc.py │ └── test_txt.py │ ├── test_cache.py │ ├── test_date.py │ ├── test_id.py │ ├── test_misc.py │ └── test_xls.py ├── setup.py └── swiss ├── __init__.py ├── cache.py ├── clitools.py ├── date.py ├── deliveranceproxy.py ├── id.py ├── misc.py ├── parse ├── __init__.py └── name.py ├── tabular ├── __init__.py ├── base.py ├── gdocs.py ├── html.py ├── misc.py ├── tabular_json.py ├── txt.py └── xls.py └── tests ├── __init__.py ├── data └── xls_reader_test.xls ├── parse └── test_name.py ├── tabular ├── __init__.py ├── test_base.py ├── test_gdocs.py ├── test_json.py ├── test_misc.py └── test_txt.py ├── test_cache.py ├── test_date.py ├── test_id.py ├── test_misc.py └── test_xls.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .idea/* 3 | *.pyc 4 | docs/build/* -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.egg-info/* 3 | *.pyc 4 | *.swp 5 | *.swo 6 | sandbox/* 7 | 8 | syntax: regexp 9 | ^build$ 10 | ^pyenv$ 11 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 3e61713892d3525675712a96fbcbc439837151d0 
0.1 2 | 5d28eda958146bb213aee67ef89bc04ec5a1e06e 0.2 3 | 99c63b2a432dbfe32f7a9359d3cb8076412aa164 0.3 4 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Swiss Army Knife for Data Work. 2 | 3 | For details read the main package docstring. 4 | 5 | Open source software licensed under the MIT license. 6 | 7 | ## Install 8 | 9 | 1. Install setuptools 10 | 11 | 2. Either install directy from PyPI usinging easy_install: 12 | 13 | $ easy_install datautil 14 | 15 | OR install from the source obtainable from the mercurial repository: 16 | 17 | $ hg clone https://github.com/okfn/datautil 18 | 19 | ## Tests 20 | 21 | 1. Ensure you also have install 'xlrd' and 'gdata' (options mentioned 22 | in setup.py) and nose (for running tests): 23 | 24 | $ easy_install nose xlrd gdata 25 | 26 | 2. Run the tests: 27 | 28 | $ nosetests datautil/tests/ 29 | -------------------------------------------------------------------------------- /datautil/__init__.py: -------------------------------------------------------------------------------- 1 | '''Utilities for Data Work 2 | ======================= 3 | 4 | The datautil package provides various utilities for working with data: 5 | 6 | * cache: Url caching and scraping 7 | * tabular/*: Processing and transforming tabular data to and from various 8 | formats including csv, json, google spreadsheets, xls 9 | * misc, date: Cleaning up and parsing data especially dates. 10 | * id: ID generation and shortenening 11 | * clitools.py: Command line tools such as creating optparse object and usage 12 | from a module of object. 13 | * deliveranceproxy.py: Deliverance proxy helper 14 | 15 | 16 | CHANGELOG 17 | ========= 18 | 19 | v0.5 2011-??-?? 
20 | --------------- 21 | 22 | * Minor improvements to cache 23 | 24 | v0.4 2011-01-05 25 | --------------- 26 | 27 | * Rename swiss to datautil 28 | 29 | v0.3 2010-08-01 30 | --------------- 31 | 32 | * Support for google docs spreadsheets as sources for TabularData 33 | * Improve documentation of date module and add FlexiDate.as_datetime() 34 | * New clitools module incorporating existing cli tools 35 | * deliveranceproxy.py: Deliverance proxy helper for proxying to remote 36 | websites and retheming with deliverance. 37 | * parse/name.py: new (human) name parsing code 38 | 39 | v0.2 2009-10-23 40 | --------------- 41 | 42 | * Extensive refactoring of tabular module/package 43 | * Standardized interface with BaseReader and BaseWriter 44 | * JsonReader and JsonWriter providing json reading and writing 45 | * TxtWriter to support writing to plain text 46 | * Improvements to date parsing (support for circa, 'c.', etc) 47 | * New id module to do 'compression' of uuids using 32 and 64 bit encoding 48 | 49 | 50 | v0.1 2009-06-03 51 | --------------- 52 | 53 | * Bring together existing code (from last 2+ years) into new 'datautil' package 54 | * Url caching and scraping 55 | * Tabular data handling including csv reader/writer, xls reader, latex writer 56 | and associated utilities (such as pivot_table) 57 | * Cleaning and parsing data especially dates (misc and date modules) 58 | ''' 59 | __version__ = '0.4' 60 | 61 | try: 62 | import tabular 63 | except ImportError: 64 | tabular = None 65 | from cache import * 66 | from misc import * 67 | from id import * 68 | -------------------------------------------------------------------------------- /datautil/cache.py: -------------------------------------------------------------------------------- 1 | '''A local file cache with url retrieving builtin. 
2 | 3 | NB: this module has zero dependencies on modules outside of the 4 | standard lib so that it is easily reusable in other libraries and applications 5 | that do not require any other parts of the datautil package. 6 | ''' 7 | import urlparse 8 | import urllib 9 | import os 10 | import sys 11 | 12 | 13 | # have to define before Cache as used in classmethod 14 | class _Progress(object): 15 | def __init__(self): 16 | self.count = -1 17 | 18 | def dl_progress(self, count, block_size, total_size): 19 | if total_size == 0: # total_size is weird so return to avoid errors 20 | return 21 | if self.count == -1: 22 | print 'Total size: %s' % self.format_size(total_size) 23 | last_percent = int(self.count*block_size*100/total_size) 24 | percent = int(count*block_size*100/total_size) 25 | if percent > last_percent: 26 | # TODO: is this acceptable? Do we want to do something nicer? 27 | sys.stdout.write('.') 28 | sys.stdout.flush() 29 | self.count = count 30 | 31 | def format_size(self, bytes): 32 | if bytes > 1000*1000: 33 | return '%.1fMb' % (bytes/1000.0/1000) 34 | elif bytes > 10*1000: 35 | return '%iKb' % (bytes/1000) 36 | elif bytes > 1000: 37 | return '%.1fKb' % (bytes/1000.0) 38 | else: 39 | return '%ibytes' % bytes 40 | 41 | 42 | class Cache(object): 43 | '''A local file cache (and url retriever). 44 | ''' 45 | 46 | def __init__(self, path='.'): 47 | ''' 48 | @param path: path to cache (defaults to current directory) 49 | ''' 50 | self.path = path 51 | if not os.path.exists(self.path): 52 | os.makedirs(path) 53 | 54 | def retrieve(self, url, overwrite=False): 55 | '''Retrieve url into cache and return the local path to it. 56 | 57 | :param url: url to retrieve. 58 | :return: path to file retrieved. 
59 | ''' 60 | dest = self.cache_path(url) 61 | self.download(url, dest, overwrite) 62 | return dest 63 | 64 | def cache_path(self, url): 65 | '''Local path for url within cache.''' 66 | name = self.basename(url) 67 | dest = os.path.join(self.path, name) 68 | return dest 69 | 70 | def filepath(self, url): 71 | '''Deprecated: use cache_path''' 72 | return self.cache_path(url) 73 | 74 | def stream(self, url): 75 | fp = self.cache_path(url) 76 | if not os.path.exists(fp): 77 | return None 78 | else: 79 | return open(fp) 80 | 81 | @classmethod 82 | def basename(self, url): 83 | scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) 84 | result = path.split('/')[-1] 85 | if query: 86 | # escape '/' as otherwise path problems 87 | result += '?' + query.replace('/', '%47') 88 | return result 89 | 90 | @classmethod 91 | def download(self, url, dest, overwrite=False): 92 | '''Download a file from a url. 93 | 94 | :param url: the source url 95 | :param dest: the destination path to save to. 96 | :param overwrite: overwrite destination file if it exists (defaults to 97 | False). 
98 | ''' 99 | url = url.encode('utf-8') 100 | if not os.path.exists(dest) or overwrite: 101 | print 'Retrieving %s' % url 102 | prog = _Progress() 103 | urllib.urlretrieve(url, dest, reporthook=prog.dl_progress) 104 | else: 105 | print 'Skipping download as dest already exists: %s' % url 106 | 107 | # for backwards compatability 108 | @classmethod 109 | def dl(self, url, dest=None): 110 | return self.download(url, dest) 111 | 112 | -------------------------------------------------------------------------------- /datautil/cli.py.command: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import optparse 4 | import logging 5 | from StringIO import StringIO 6 | import traceback 7 | import time 8 | 9 | parser = optparse.OptionParser() 10 | 11 | parser.add_option( 12 | '-v', '--verbose', 13 | dest='verbose', 14 | action='count', 15 | default=0, 16 | help='Give more output') 17 | parser.add_option( 18 | '-q', '--quiet', 19 | dest='quiet', 20 | action='count', 21 | default=0, 22 | help='Give less output') 23 | 24 | class Command(object): 25 | name = None 26 | usage = None 27 | default_parser = None 28 | all_commands = [] 29 | 30 | def __init__(self): 31 | assert self.name 32 | self.parser = optparse.OptionParser( 33 | usage=self.usage, 34 | prog='%s %s' % (sys.argv[0], self.name), 35 | version=parser.version) 36 | for option in self.default_parser.option_list: 37 | if not option.dest: 38 | # -h, --version, etc 39 | continue 40 | self.parser.add_option(option) 41 | Command.all_commands[self.name] = self 42 | 43 | def merge_options(self, initial_options, options): 44 | for attr in ['log']: 45 | setattr(options, attr, getattr(initial_options, attr) or getattr(options, attr)) 46 | options.quiet += initial_options.quiet 47 | options.verbose += initial_options.verbose 48 | 49 | def main(self, complete_args, args, initial_options): 50 | options = initial_options 51 | discarded_options, args = 
self.parser.parse_args(args) 52 | # From pip but not needed by us I think 53 | # self.merge_options(initial_options, options) 54 | self.options = options 55 | self.verbose = options.verbose 56 | 57 | level = 1 58 | level += options.verbose 59 | level -= options.quiet 60 | complete_log = [] 61 | if options.log: 62 | log_fp = open_logfile_append(options.log) 63 | logger.consumers.append((logger.DEBUG, log_fp)) 64 | else: 65 | log_fp = None 66 | 67 | exit = 0 68 | try: 69 | self.run(options, args) 70 | except: 71 | logger.fatal('Exception:\n%s' % format_exc()) 72 | exit = 2 73 | 74 | if log_fp is not None: 75 | log_fp.close() 76 | if exit: 77 | log_fn = 'datapkg-log.txt' 78 | text = '\n'.join(complete_log) 79 | # Not sure we need to tell people ... 80 | # logger.fatal('Storing complete log in %s' % log_fn) 81 | log_fp = open_logfile_append(log_fn) 82 | log_fp.write(text) 83 | log_fp.close() 84 | sys.exit(exit) 85 | -------------------------------------------------------------------------------- /datautil/clitools.py: -------------------------------------------------------------------------------- 1 | '''Expose methods or functions as commands on the command line 2 | 3 | Example usage:: 4 | 5 | # in your code 6 | from datautil.clitools import _main 7 | if __name__ == '__main__': 8 | # expose everything in current module 9 | _main(locals()) 10 | # or if you have an object MyObject with methods you want to expose 11 | _main(MyObject) 12 | ''' 13 | import os 14 | import sys 15 | import optparse 16 | import inspect 17 | 18 | def _object_methods(obj): 19 | methods = inspect.getmembers(obj, inspect.ismethod) 20 | methods = filter(lambda (name,y): not name.startswith('_'), methods) 21 | methods = dict(methods) 22 | return methods 23 | 24 | def _module_functions(functions): 25 | local_functions = dict(functions) 26 | for k,v in local_functions.items(): 27 | if not inspect.isfunction(v) or k.startswith('_'): 28 | del local_functions[k] 29 | return local_functions 30 | 31 | def 
_main(functions_or_object): 32 | isobject = inspect.isclass(functions_or_object) 33 | if isobject: 34 | _methods = _object_methods(functions_or_object) 35 | else: 36 | _methods = _module_functions(functions_or_object) 37 | 38 | usage = '''%prog {action} 39 | 40 | Actions: 41 | ''' 42 | usage += '\n '.join( 43 | [ '%s: %s' % (name, m.__doc__.split('\n')[0] if m.__doc__ else '') for (name,m) 44 | in sorted(_methods.items()) ]) 45 | parser = optparse.OptionParser(usage) 46 | # Optional: for a config file 47 | # parser.add_option('-c', '--config', dest='config', 48 | # help='Config file to use.') 49 | options, args = parser.parse_args() 50 | 51 | if not args or not args[0] in _methods: 52 | parser.print_help() 53 | sys.exit(1) 54 | 55 | method = args[0] 56 | if isobject: 57 | getattr(functions_or_object(), method)(*args[1:]) 58 | else: 59 | _methods[method](*args[1:]) 60 | 61 | __all__ = [ '_main' ] 62 | 63 | if __name__ == '__main__': 64 | _main(locals()) 65 | 66 | -------------------------------------------------------------------------------- /datautil/date.py: -------------------------------------------------------------------------------- 1 | """ 2 | Date parsing and normalization utilities based on FlexiDate. 3 | 4 | To parse dates use parse(), e.g.:: 5 | 6 | from datautil.date import parse 7 | 8 | parse('1890') -> FlexiDate(year=u'1890') 9 | parse('1890?') -> FlexiDate(year=u'1890', qualifier='Uncertainty: 1985?') 10 | 11 | Once you have a FlexiDate you can get access to attributes (strings of course 12 | ...):: 13 | 14 | fd = parse('Jan 1890') 15 | fd.year # u'1890' 16 | fd.month # u'01' 17 | 18 | And convert to other forms:: 19 | 20 | fd.as_float() # 1890 21 | fd.as_datetime() # datetime(1890,01,01) 22 | 23 | Background 24 | ========== 25 | 26 | FlexiDate is focused on supporting: 27 | 28 | 1. Dates outside of Python (or DB) supported period (esp. dates < 0 AD) 29 | 2. Imprecise dates (c.1860, 18??, fl. 1534, etc) 30 | 3. 
import re
import datetime


class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
        * Allow a trailing qualifiers e.g. fl.
        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
          can do 18??

    Restriction on ISO8601:
        * Truncation (e.g. of centuries) is *not* permitted.
        * No week and day representation e.g. 1999-W01
    """

    def __init__(self, year=None, month=None, day=None, qualifier=''):
        # force = month or day or qualifier
        force = False
        self.year = self._cvt(year, rjust=4, force=force)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        '''Normalize one date component to a zero-padded text string
        (e.g. 1 -> '01', -50 -> '-0050' with rjust=4).'''
        if val:
            # text coercion that works on both Python 2 and 3
            # (was unicode(val), which is a NameError on Python 3)
            tmp = (u'%s' % val).strip()
            if tmp.startswith('-'):
                # keep the sign, pad the digits only
                tmp = '-' + tmp[1:].rjust(rjust, '0')
            else:
                tmp = tmp.rjust(rjust, '0')
            return tmp
        elif force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        else:
            return ''

    def __str__(self):
        out = self.isoformat()
        if self.qualifier:
            # leading space is important as ensures when no year sort in right
            # order as ' ' < '1'
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        WARNING: does not replace '?' in dates unless strict=True.
        '''
        out = self.year
        # what do we do when no year ...
        for val in [self.month, self.day]:
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # NOTE: named groups reconstructed (angle brackets were stripped from the
    # original source); names match the out.group(...) calls in from_str.
    our_re_pat = r'''
        (?P<year> -?[\d?]+)
        (?:
                \s* - (?P<month> [\d?]{1,2})
            (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier>[^]]*) \])?
        '''
    our_re = re.compile(our_re_pat, re.VERBOSE)

    @classmethod
    def from_str(cls, instr):
        '''Undo affect of __str__'''
        if not instr:
            return FlexiDate()

        out = cls.our_re.match(instr)
        if out is None:  # no match TODO: raise Exception?
            return None
        else:
            return FlexiDate(
                out.group('year'),
                out.group('month'),
                out.group('day'),
                qualifier=out.group('qualifier')
            )

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0

        @return: float.
        '''
        if not self.year:
            return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
        if self.day:
            out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)


def parse(date, dayfirst=True):
    '''Parse a `date` into a `FlexiDate`.

    @param date: the date to parse - may be a string, datetime.date,
    datetime.datetime or FlexiDate.
    @param dayfirst: prefer day-first reading of ambiguous numeric dates
    such as 01/02/2010 (passed through to dateutil).

    TODO: support for quarters e.g. Q4 1980 or 1954 Q3
    TODO: support latin stuff like M.DCC.LIII
    TODO: convert '-' to '?' when used that way
          e.g. had this date [181-]
    '''
    if not date:
        return None
    if isinstance(date, FlexiDate):
        return date
    if isinstance(date, int):
        return FlexiDate(year=date)
    elif isinstance(date, datetime.date):
        parser = PythonDateParser()
        return parser.parse(date)
    else:  # assuming its a string
        parser = DateutilDateParser()
        out = parser.parse(date, **{'dayfirst': dayfirst})
        if out is not None:
            return out
        # msg = 'Unable to parse %s' % date
        # raise ValueError(date)
        val = 'UNPARSED: %s' % date
        val = val.encode('ascii', 'ignore')
        if not isinstance(val, str):  # Python 3: encode gave bytes, go back to text
            val = val.decode('ascii')
        return FlexiDate(qualifier=val)
class DateParserBase(object):
    '''Interface for date parsers that turn a value into a FlexiDate.'''

    def parse(self, date):
        '''Parse `date` and return a FlexiDate (or None on failure).'''
        raise NotImplementedError

    def norm(self, date):
        '''Parse `date` and return its normalized string form.'''
        return str(self.parse(date))


class PythonDateParser(DateParserBase):
    '''Parser for objects already carrying year/month/day attributes
    (datetime.date / datetime.datetime).

    Now subclasses DateParserBase (was a plain object) so it shares the
    norm() helper and the common interface.
    '''

    def parse(self, date):
        return FlexiDate(date.year, date.month, date.day)


try:
    import dateutil.parser
    dateutil_parser = dateutil.parser.parser()
except Exception:  # dateutil is optional; string parsing degrades gracefully
    dateutil_parser = None


class DateutilDateParser(DateParserBase):
    '''String date parser built on dateutil, with extra handling for BC
    dates, 'circa', uncertainty ('1985?') and 2-digit years.'''

    _numeric = re.compile(r"^[0-9]+$")

    def parse(self, date, **kwargs):
        '''
        :param **kwargs: any kwargs accepted by dateutil.parse function.
        '''
        qualifiers = []
        if dateutil_parser is None:
            return None
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates ('B.C.' has already been normalized to
        # 'BC' above so testing 'BC' covers both spellings - the original
        # also tested 'B.C.' here, which was dead code)
        if date.startswith('-') or 'BC' in date:
            pre0AD = True
        else:
            pre0AD = False
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: expressed as [c|ca|cca|circ|circa] with or without an appended period
        # and with or without a space, followed by a date
        # 'c.1950' or 'c1950' 'ca. 1980' 'circ 198?' 'cca. 1980' 'c 1029' 'circa 1960' etc.
        # see http://en.wikipedia.org/wiki/Circa
        # TODO: dates like 'circa 178?' and 'circa 178-' fail poorly
        # 'UNPARSED: circa 178?' / u"Note 'circa' : circa 178-"

        # note that the match deliberately does not capture the circa text match
        # this is done to remove circa bit below
        # circa_match = re.match('([^a-zA-Z]*)c\.?\s*(\d+.*)', date)

        # use non-matching groups (?:) to avoid refactoring the rest of the parsing
        circa_match = re.match(
            r'([^a-zA-Z]*)(?:circa|circ\.?|cca\.?|ca\.?|c\.?)(?:\s*?)([\d\?-]+\s?\?*)',
            date)

        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            # date = ''.join(circa_match.groups())
            # if an element in circa_match.groups() is None, an exception is thrown
            # so instead join the match groups from circa_match that are not none
            date = ''.join(list(el for el in circa_match.groups() if el))

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records
        p_match = re.match(r"^p(\d+)", date)
        if p_match:
            date = date[1:]

        # Deal with uncertainty: '1985?'
        uncertainty_match = re.match(r'([0-9xX]{4})\?', date)
        if uncertainty_match:
            # remove the ?
            date = date[:-1]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        # Note: Years of less than 3 digits not interpreted by
        # dateutil correctly
        # e.g. 87 -> 1987
        #       4 -> day 4 (no year)
        # Both cases are handled in this routine
        if res.year is None and res.day:
            year = res.day
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff
        if pre0AD:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
20 | # or 21 | # my_deliverance_rules = open('/my/path/to/rules.xml').read() 22 | deliverance_proxy = create_deliverance_proxy(mytheme, dest, 23 | my_deliverance_rules) 24 | 25 | # from in wsgi app 26 | # path on remote destination url you want to proxy to ... 27 | # you can omit this if local path and remote path are the same 28 | environ['PATH_INFO'] = '/my_destination_path' 29 | deliverance_proxy(environ, start_response) 30 | ''' 31 | import logging 32 | 33 | import paste.urlmap 34 | import deliverance.middleware 35 | import paste.proxy 36 | from webob import Request, Response 37 | from deliverance.middleware import DeliveranceMiddleware, SubrequestRuleGetter 38 | from deliverance.log import PrintingLogger 39 | 40 | 41 | default_deliverance_rules = \ 42 | ''' 43 | 44 | 45 | 47 | 48 | 49 | 50 | 51 | 54 | 55 | 56 | ''' 57 | 58 | def create_deliverance_proxy(proxy_base_url, theme_html, rules_xml=None): 59 | '''Proxy to another url with re-theming using deliverance. 60 | 61 | Based on http://rufuspollock.org/code/deliverance 62 | 63 | :param proxy_base_url: base destination url we are proxying to. 64 | :param theme_html: string providing html theme to use for re-themeing. 65 | :param rules_xml: (optional) deliverance rules xml as a string. If not 66 | provided use `default_deliverance_rules`. For info on rulesets see 67 | deliverance docs. We require that ruleset support a single 68 | substitution string '%s' which is used to insert internal mountpoint 69 | for the them ('/_deliverance_theme.html'). 
import base64
import datetime
import struct
import uuid

# Python 2/3 compatibility for string-type checks (was `basestring`)
try:  # Python 2
    string_types = basestring
except NameError:  # Python 3
    string_types = str


def compress_uuid(_uuid):
    '''Provided shortened string representation of UUID via base64 encoding.

    @param _uuid: a uuid.UUID instance or its 36-char string form.
    @return: 22 character base64 encoded version of UUID.
    '''
    if isinstance(_uuid, string_types):
        _uuid = uuid.UUID(_uuid)
    encoded = base64.b64encode(_uuid.bytes, b'_-')
    if not isinstance(encoded, str):  # Python 3 returns bytes
        encoded = encoded.decode('ascii')
    # throw away trailing ==
    return encoded[:22]


def uncompress_uuid(b64_encoded):
    '''Reverse compress_uuid

    @return: 36 char str representation of uuid.
    '''
    b64_encoded = str(b64_encoded)
    if not b64_encoded.endswith('=='):
        b64_encoded += '=='
    out = base64.b64decode(b64_encoded, b'_-')
    _uuid = uuid.UUID(bytes=out)
    return str(_uuid)


def int_to_b32(int_):
    '''Encode a 32-bit int as a 7-char base32 string (trailing '=' dropped).'''
    packed = struct.pack('1i', int_)
    encoded = base64.b32encode(packed)
    if not isinstance(encoded, str):  # Python 3 returns bytes
        encoded = encoded.decode('ascii')
    # throw away trailing '='
    return encoded[:-1]


def b32_to_int(b32):
    '''Reverse int_to_b32.'''
    out = base64.b32decode(b32 + '=', casefold=True)
    return struct.unpack('1i', out)[0]


# TODO: create a strict option where None is returned on failed convert rather
# than original value
placeholders = ['', '-', '#']


def floatify(value):
    '''Convert value to a float if possible.

    @return: Floatified value. If value is blank or placeholder ('-') return
    None. Can deal with ',' in value. Will also floatify dates. If nothing
    works returns original value.
    '''
    if value is None:
        return None
    if isinstance(value, string_types):
        stripped = value.strip()
        if not stripped or stripped in placeholders:
            return None
        else:
            # often numbers have commas in them like 1,030
            v = value.replace(',', '')
            try:
                return float(v)
            except (TypeError, ValueError):
                pass
    # will return original value if fails
    return date_to_float(value)


def floatify_matrix(matrix):
    '''Apply floatify to every cell of a matrix (list of rows).'''
    return [[floatify(col) for col in row] for row in matrix]


# TODO: remove/convert to using date.FlexiDate.as_float()
def date_to_float(date):
    '''Convert a date to float.

    Accepts either a date object or a string parseable to a date object

    @return: converted value or original if conversion fails
    '''
    if isinstance(date, string_types):
        try:  # simple year
            return float(date)
        except (TypeError, ValueError):
            pass
        # dateutil imported lazily so the simple-year and date-object paths
        # work even without dateutil installed (it was imported eagerly
        # before, breaking all calls when absent)
        try:
            import dateutil.parser
            val = dateutil.parser.parse(date, default=datetime.date(1, 1, 1))
        except Exception:
            return date
    else:
        val = date

    if isinstance(val, datetime.date):
        fval = val.year + val.month / 12.0 + val.day / 365.0
        return round(fval, 3)
    else:
        return val


def make_series(matrix, xcol, ycols=None):
    '''Take a matrix and return series (i.e. list of tuples) corresponding to
    specified column indices.

    E.g. if matrix is:
        [ [1,2,3,4]
          [5,6,7,8] ]

    and xcol = 0, ycols=[1,3] then output is:

        [
          [ [1,2], [5,6] ],
          [ [1,4], [5,8] ],
        ]

    If ycols not defined then return all possible series (excluding xcol
    with itself).
    '''
    # materialize: Python 3 zip is lazy and we index/len it below
    cols = list(zip(*matrix))
    if ycols is None:
        # every column except the x column
        ycols = [ii for ii in range(len(cols)) if ii != xcol]
    cols = floatify_matrix(cols)

    def is_good(value):
        # filter out None and placeholder cells
        if value is None:
            return False
        tv = str(value)
        stopchars = ['', '-']
        if tv in stopchars:
            return False
        return True

    def is_good_tuple(pair):
        return is_good(pair[0]) and is_good(pair[1])

    xcoldata = cols[xcol]
    ydata = [cols[ii] for ii in ycols]
    series = [[pair for pair in zip(xcoldata, col) if is_good_tuple(pair)]
              for col in ydata]
    return series
76 | ''' 77 | cols = zip(*matrix) 78 | if ycols is None: 79 | ycols = range(len(cols)) 80 | del ycols[xcol] 81 | cols = floatify_matrix(cols) 82 | def is_good(value): 83 | if value is None: return False 84 | tv = str(value) 85 | stopchars = [ '', '-' ] 86 | if tv in stopchars: 87 | return False 88 | return True 89 | def is_good_tuple(tuple): 90 | return is_good(tuple[0]) and is_good(tuple[1]) 91 | 92 | xcoldata = cols[xcol] 93 | ycols = [ cols[ii] for ii in ycols ] 94 | series = [ filter(is_good_tuple, zip(xcoldata, col)) for col in ycols ] 95 | return series 96 | 97 | -------------------------------------------------------------------------------- /datautil/normalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/normalization/__init__.py -------------------------------------------------------------------------------- /datautil/normalization/table_based.py: -------------------------------------------------------------------------------- 1 | import gdata.spreadsheet.text_db 2 | 3 | def _transform_key(key): 4 | return key.lower().strip() 5 | 6 | class Normalizer(object): 7 | 8 | def __init__(self, username, password, doc_id, sheet, key_row): 9 | self.client = gdata.spreadsheet.text_db.DatabaseClient( 10 | username=username, password=password) 11 | self._get_table(doc_id, sheet) 12 | self.key_row = key_row 13 | self._records = None 14 | 15 | @property 16 | def records(self): 17 | if self._records is None: 18 | self._records = [r.content for r in self.table.FindRecords('')] 19 | return self._records 20 | 21 | def _get_table(self, doc_id, sheet): 22 | db = self.client.GetDatabases(doc_id)[0] 23 | self.table = db.GetTables(name=sheet)[0] 24 | self.table.LookupFields() 25 | 26 | def keys(self): 27 | return set([r.get(self.key_row) for r in self.records \ 28 | if r.get(self.key_row) is not None]) 29 | 30 | def 
__contains__(self, item): 31 | return item in self.keys() 32 | 33 | def get(self, key, source_hint=None): 34 | if key is None: 35 | return {} 36 | record = self.lookup(key) 37 | if record: 38 | return record 39 | return self.add(_transform_key(key), source_hint).content 40 | 41 | def lookup(self, key): 42 | if key is None: 43 | return {} 44 | local_key = _transform_key(unicode(key)) 45 | for record in self.records: 46 | # TODO #1: figure out FindRecords syntax 47 | # TODO #2: fuzzy matching for longer keys 48 | if record.get(self.key_row) == local_key: 49 | return record 50 | 51 | 52 | def add(self, value, source_hint): 53 | fields = self.table.fields 54 | row = dict(zip(fields, [None] * len(fields))) 55 | row[self.key_row] = value 56 | if source_hint is not None: 57 | row['source'] = source_hint 58 | self._records.append(row) 59 | return self.table.AddRecord(row) 60 | 61 | class NormalizerJoin(object): 62 | 63 | def __init__(self, first, second): 64 | self.first = first 65 | self.second = second 66 | 67 | def get(self, key, source_hint=None): 68 | if key in self.second: 69 | return self.second.get(key) 70 | data = self.first.get(key, source_hint=source_hint) 71 | if self.second.key_row in data: 72 | data.update(self.second.get(data.get(self.second.key_row))) 73 | return data 74 | 75 | def Licenses(username, password): 76 | doc_id = 'thlRT-WO0EVweyjiwtYLslA' 77 | first = Normalizer(username, password, doc_id, 'Forms', 'original') 78 | second = Normalizer(username, password, doc_id, 'Licenses', 'code') 79 | return NormalizerJoin(first, second) 80 | 81 | def Formats(username, password): 82 | doc_id = 'tO-VTk7QwloOt0EP3YpCC4A' 83 | first = Normalizer(username, password, doc_id, 'Forms', 'original') 84 | second = Normalizer(username, password, doc_id, 'Formats', 'mimetype') 85 | return NormalizerJoin(first, second) 86 | 87 | 88 | -------------------------------------------------------------------------------- /datautil/normalization/text.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | import string 4 | 5 | KILL_DASHES = re.compile("\\-+") 6 | 7 | def compose(text): 8 | return unicodedata.normalize('NFKC', text) 9 | 10 | def decompose(text): 11 | return unicodedata.normalize('NFKD', text) 12 | 13 | def recompose(text): 14 | return compose(decompose(text)) 15 | 16 | def url_slug(text): 17 | """ Convert arbitrary text to something that can be a url slug. """ 18 | out = [] 19 | for c in decompose(text): 20 | cat = unicodedata.category(c)[0].upper() 21 | if cat == 'Z': 22 | out.append('-') 23 | if c in string.ascii_letters or c in string.digits: 24 | out.append(c) 25 | if c in ['-', '.', '+', '_']: 26 | out.append(c) 27 | text = u"".join(out).lower() 28 | return KILL_DASHES.sub('-', text) 29 | 30 | 31 | -------------------------------------------------------------------------------- /datautil/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/parse/__init__.py -------------------------------------------------------------------------------- /datautil/parse/name.py: -------------------------------------------------------------------------------- 1 | '''Parse names of people into a standard format.''' 2 | 3 | import re 4 | 5 | titles = [ 6 | u'Ayatollah', 7 | u'Baron', 8 | u'Bishop', 9 | u'Dame', 10 | u'Dr', 11 | u'Fr', 12 | u'Graf', 13 | u'King', 14 | u'Lady', 15 | u'Maj', 16 | u'Major', 17 | u'Mrs', 18 | u'Prof', 19 | u'Rev', 20 | u'Sir', 21 | u'St', 22 | ] 23 | 24 | class Name(object): 25 | '''A name of a person or entity. 26 | 27 | Not a domain object but a convenient way to handle/parse names. 
28 | 29 | Attributes: 30 | title 31 | ln: last name 32 | firstnames: first names as list 33 | ''' 34 | def __init__(self, ln='', fns=None, title=''): 35 | self.ln = ln 36 | self.fns = fns 37 | if self.fns is None: self.fns = [] 38 | self.title = title 39 | 40 | def norm(self): 41 | '''Return normalised name string (LastFirst format) 42 | ''' 43 | return name_tostr(self) 44 | 45 | def __str__(self): 46 | '''Display name using normalised format 47 | ''' 48 | return self.norm() 49 | 50 | class NameParserBase(object): 51 | regex_remove_fullstops = re.compile(r'(\w{2,})\.(\W|$)', re.UNICODE) 52 | 53 | def parse(self, fullname): 54 | '''Parse the `fullname` string into a `Name` object. 55 | 56 | @return: `Name` object for `fullname` 57 | ''' 58 | if fullname is None: 59 | return Name() 60 | fullname = unicode(fullname.strip()) 61 | if not fullname: 62 | return Name() 63 | 64 | # remove words ending '.', e.g. 'Bosch.' 65 | fullname = self.regex_remove_fullstops.sub(r'\1\2', fullname) 66 | 67 | # make sure initials are separted by ' ' 68 | # but first deal with special edge case like [Major.] 69 | # fullname = fullname.replace('.]', ']') 70 | fullname = fullname.replace('.', '. ') 71 | name = self._toparts(fullname) 72 | name.ln = self.normcase(name.ln) 73 | name.fns = [ self.normcase(x) for x in name.fns ] 74 | name.title = self.normcase(name.title) 75 | return name 76 | 77 | def _toparts(self, fullname): 78 | '''Implement in inheriting classes, called by parse. 79 | ''' 80 | raise NotImplementedError() 81 | 82 | def tostr(self, name): 83 | '''Convert name object back into a string. 84 | ''' 85 | raise NotImplementedError() 86 | 87 | def normcase(self, name): 88 | # useful to handle none and you often get this from regexes 89 | if name is None: 90 | return '' 91 | name = name.strip() 92 | if name.upper() == name or name.lower() == name: 93 | return name.capitalize() 94 | # avoid issues with e.g. 
McTaggart 95 | else: 96 | return name 97 | 98 | def untitlize(self, _str): 99 | '''Return title contained in _str if a title else return empty string. 100 | ''' 101 | title = _str.strip() 102 | title = _str.strip('()') 103 | if title in titles: 104 | return title 105 | # always assume something in square brackets is a title 106 | elif title.startswith('[') and title.endswith(']'): 107 | return title[1:-1].strip() 108 | else: 109 | return '' 110 | 111 | def titlize(self, _str): 112 | return u'[' + _str + u']' 113 | 114 | def norm(self, date): 115 | return str(self.parse(date)) 116 | 117 | 118 | class LastFirst(NameParserBase): 119 | '''Parse and creates names of form: 120 | 121 | lastname, first-names-in-order [title] 122 | ''' 123 | def _toparts(self, fullname): 124 | if ',' not in fullname and ' ' in fullname: 125 | raise ValueError('Expected "," in name: %s' % fullname) 126 | name = Name() 127 | # NB: if more than 2 commas just ignore stuff after 2nd one 128 | parts = fullname.split(',') 129 | name.ln = parts[0] 130 | name.fns = parts[1].strip().split() 131 | if name.fns: 132 | title = self.untitlize(name.fns[-1]) 133 | if title: 134 | name.title = title 135 | del name.fns[-1] 136 | return name 137 | 138 | def tostr(self, name): 139 | if name.ln or name.fns: 140 | fns = ' '.join(name.fns) 141 | if not fns: 142 | out = name.ln 143 | else: 144 | out = unicode(', '.join((name.ln, ' '.join(name.fns)))) 145 | else: 146 | return '' 147 | if name.title: 148 | out = out + u' [%s]' % name.title 149 | return out 150 | 151 | 152 | class FirstLast(NameParserBase): 153 | '''Parse and create names of form: 154 | 155 | [title] first-names last-name 156 | ''' 157 | def _toparts(self, fullname): 158 | name = Name() 159 | if ',' in fullname: 160 | raise ValueError('Should not have "," in FirstLast type name: %s' % 161 | fullname) 162 | parts = fullname.split() 163 | name.ln = parts[-1] 164 | name.fns = parts[:-1] 165 | if name.fns: 166 | title = self.untitlize(name.fns[0]) 167 | 
if title: 168 | name.title = title 169 | del name.fns[0] 170 | return name 171 | 172 | def tostr(self, name): 173 | if name.fns or name.ln: 174 | out = u' '.join(name.fns) + ' ' + name.ln 175 | else: 176 | return '' 177 | if name.title: 178 | out = u'[%s]' % name.title + out 179 | return out 180 | 181 | 182 | def parse_name(fullname): 183 | if ',' in fullname: 184 | parser = LastFirst() 185 | else: 186 | parser = FirstLast() 187 | return parser.parse(fullname) 188 | 189 | def name_tostr(name, parser_class=LastFirst): 190 | parser = parser_class() 191 | return parser.tostr(name) 192 | 193 | def normalize(name_str, parser_class=LastFirst): 194 | name = parse_name(name_str) 195 | return name_tostr(name, parser_class) 196 | 197 | 198 | -------------------------------------------------------------------------------- /datautil/scrape.py: -------------------------------------------------------------------------------- 1 | # taken from http://effbot.org/zone/re-sub.htm#unescape-html 2 | import re, htmlentitydefs 3 | 4 | ## 5 | # Removes HTML or XML character references and entities from a text string. 6 | # 7 | # @param text The HTML (or XML) source text. 8 | # @return The plain text, as a Unicode string, if necessary. 
9 | 10 | def unescape(text): 11 | def fixup(m): 12 | text = m.group(0) 13 | if text[:2] == "&#": 14 | # character reference 15 | try: 16 | if text[:3] == "&#x": 17 | return unichr(int(text[3:-1], 16)) 18 | else: 19 | return unichr(int(text[2:-1])) 20 | except ValueError: 21 | pass 22 | else: 23 | # named entity 24 | try: 25 | text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) 26 | except KeyError: 27 | pass 28 | return text # leave as is 29 | return re.sub("&#?\w+;", fixup, text) 30 | -------------------------------------------------------------------------------- /datautil/tabular/__init__.py: -------------------------------------------------------------------------------- 1 | from base import * 2 | from misc import * 3 | from xls import XlsReader 4 | from html import * 5 | from tabular_json import JsonReader, JsonWriter 6 | from txt import TxtWriter 7 | 8 | -------------------------------------------------------------------------------- /datautil/tabular/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for dealing with tabular data 3 | """ 4 | 5 | class TabularData(object): 6 | """Holder for tabular data 7 | 8 | NB: 9 | * Assume data organized in rows. 10 | * No type conversion so all data will be as entered. 11 | 12 | Properties: 13 | * data: data itself provided as array of arrays 14 | * header: associated header columns (if they exist) 15 | 16 | TODO: handling of large datasets (iterators?) 17 | """ 18 | 19 | def __init__(self, data=None, header=None): 20 | """ 21 | Initialize object. If data or header not set they are defaulted to 22 | empty list. 23 | 24 | NB: must use None as default value for arguments rather than [] 25 | because [] is mutable and using it will result in subtle bugs. See: 26 | 'Default parameter values are evaluated when the function definition 27 | is executed.' 
[http://www.python.org/doc/current/ref/function.html] 28 | """ 29 | self.data = [] 30 | self.header = [] 31 | if data is not None: 32 | self.data = data 33 | if header is not None: 34 | self.header = header 35 | 36 | def __repr__(self): 37 | out = [] 38 | if self.header: 39 | out.append(self.header) 40 | # limit to 10 items 41 | out += self.data[0:10] 42 | return repr(out) 43 | 44 | def __str__(self): 45 | return repr(self) 46 | 47 | def __iter__(self): 48 | return self.data.__iter__() 49 | 50 | @classmethod 51 | def from_list(self, list_, header=True): 52 | return TabularData(header=list_[0], data=list_[1:]) 53 | 54 | def to_list(self): 55 | if self.header: 56 | return [ self.header ] + self.data 57 | else: 58 | return self.data 59 | 60 | 61 | class ReaderBase(object): 62 | def __init__(self, filepath_or_fileobj=None, encoding='utf8'): 63 | self.filepath = None 64 | self.fileobj = None 65 | self._filepath_or_fileobj(filepath_or_fileobj) 66 | self.encoding = 'utf8' 67 | 68 | def _filepath_or_fileobj(self, filepath_or_fileobj): 69 | if filepath_or_fileobj is None: # do not overwrite any existing value 70 | pass 71 | elif isinstance(filepath_or_fileobj, basestring): 72 | self.filepath = filepath_or_fileobj 73 | self.fileobj = open(self.filepath) 74 | else: 75 | self.filepath = None 76 | self.fileobj = filepath_or_fileobj 77 | 78 | def read(self, filepath_or_fileobj=None): 79 | self._filepath_or_fileobj(filepath_or_fileobj) 80 | 81 | 82 | class WriterBase(object): 83 | ''' 84 | Extra arguments to write methods: 85 | has_row_headings: first col of each row is a heading. 
86 | ''' 87 | def __init__(self, round_ndigits=None, **kwargs): 88 | ''' 89 | @round_ndigits: number of decimal places to use when rounding numerical 90 | values when textifying for output 91 | ''' 92 | self.round_ndigits = round_ndigits 93 | 94 | def write(self, tabular_data, fileobj, *args, **kwargs): 95 | pass 96 | 97 | def write_str(self, tabular_data, *args, **kwargs): 98 | from StringIO import StringIO 99 | holder = StringIO() 100 | self.write(tabular_data, holder, *args, **kwargs) 101 | holder.seek(0) 102 | return holder.read() 103 | 104 | def value_to_str(self, value): 105 | '''Convert value to text (rounding floats/ints as necessary). 106 | ''' 107 | if value is None: 108 | return '' 109 | if self.round_ndigits is not None and \ 110 | (isinstance(value, int) or isinstance(value, float)): 111 | roundedResult = round(value, self.round_ndigits) 112 | if self.round_ndigits <= 0: # o/w will have in .0 at end 113 | roundedResult = int(roundedResult) 114 | roundedResult = str(roundedResult) 115 | # deal with case when rounding has added unnecessary digits 116 | if len(str(value)) < len(roundedResult): 117 | return str(value) 118 | else: 119 | return roundedResult 120 | else: 121 | return unicode(value) 122 | 123 | 124 | import csv 125 | import codecs 126 | class UTF8Recoder: 127 | """ 128 | Iterator that reads an encoded stream and reencodes the input to UTF-8 129 | 130 | From: 131 | """ 132 | def __init__(self, f, encoding=None): 133 | if encoding: 134 | self.reader = codecs.getreader(encoding)(f) 135 | else: # already unicode so just return f 136 | self.reader = f 137 | 138 | def __iter__(self): 139 | return self 140 | 141 | def next(self): 142 | return self.reader.next().encode('utf-8') 143 | 144 | class CsvReader(ReaderBase): 145 | """Read data from a csv file into a TabularData structure 146 | 147 | Note that the csv module does *not* support unicode: 148 | 149 | > This version of the csv module doesn't support Unicode input. 
Also, there 150 | > are currently some issues regarding ASCII NUL characters. Accordingly, 151 | > all input should be UTF-8 or printable ASCII to be safe; see the examples 152 | > in section 9.1.5. These restrictions will be removed in the future. 153 | > 154 | """ 155 | 156 | def read(self, filepath_or_fileobj=None, encoding=None, **kwargs): 157 | """Read in a csv file and return a TabularData object. 158 | 159 | @param fileobj: file like object. 160 | @param encoding: if set use this instead of default encoding set in 161 | __init__ to decode the file like object. NB: will check if fileobj 162 | already in unicode in which case this is ignored. 163 | @param kwargs: all further kwargs are passed to the underlying `csv.reader` function 164 | @return tabular data object (all values encoded as utf-8). 165 | """ 166 | super(CsvReader, self).read(filepath_or_fileobj) 167 | if encoding: 168 | self.encoding = encoding 169 | tabData = TabularData() 170 | 171 | sample = self.fileobj.read() 172 | # first do a simple test -- maybe sample is already unicode 173 | if type(sample) == unicode: 174 | encoded_fo = UTF8Recoder(self.fileobj, None) 175 | else: 176 | sample = sample.decode(self.encoding) 177 | encoded_fo = UTF8Recoder(self.fileobj, self.encoding) 178 | sample = sample.encode('utf-8') 179 | sniffer = csv.Sniffer() 180 | hasHeader = sniffer.has_header(sample) 181 | 182 | self.fileobj.seek(0) 183 | ourkwargs = { 184 | 'skipinitialspace': True 185 | } 186 | if kwargs: 187 | ourkwargs.update(kwargs) 188 | 189 | reader = csv.reader(encoded_fo, **ourkwargs) 190 | if hasHeader: 191 | tabData.header = reader.next() 192 | for row in reader: 193 | tabData.data.append(row) 194 | return tabData 195 | 196 | # for backwards compatibility 197 | ReaderCsv = CsvReader 198 | 199 | class CsvWriter(WriterBase): 200 | # TODO: unicode support a la CsvReader 201 | def write(self, tabular_data, fileobj, encoding='utf-8'): 202 | writer = csv.writer(fileobj) 203 | if tabular_data.header: 204 | 
writer.writerow(tabular_data.header) 205 | for row in tabular_data.data: 206 | writer.writerow(row) 207 | fileobj.flush() 208 | 209 | 210 | ## -------------------------------- 211 | ## Converting to Latex 212 | 213 | class LatexWriter(WriterBase): 214 | 215 | def write(self, tabular_data, fileobj, has_row_headings=False): 216 | self.has_row_headings = has_row_headings 217 | matrix = tabular_data.data 218 | has_header = len(tabular_data.header) > 0 219 | if has_header: 220 | matrix.insert(0, tabular_data.header) 221 | out = self._write(matrix, has_header) 222 | fileobj.write(out) 223 | 224 | def _write(self, matrix, has_header=True): 225 | if len(matrix) == 0: return 226 | # no hline on first row as this seems to mess up latex \input 227 | # http://groups.google.com/group/comp.text.tex/browse_thread/thread/1e1db553a958ebd8/0e590a22cb59f43d 228 | out = '%s' % self.process_row(matrix[0], has_header) 229 | for row in matrix[1:]: 230 | out += self.process_row(row) 231 | return out 232 | 233 | def process_row(self, row, heading=False): 234 | if len(row) == 0: return 235 | out = '%s' % self.process_cell(row[0], heading or self.has_row_headings) 236 | for cell in row[1:]: 237 | out += ' & %s' % self.process_cell(cell, heading) 238 | out += ' \\\\\n\hline\n' 239 | return out 240 | 241 | def process_cell(self, cell, heading=False): 242 | cell_text = self.value_to_str(cell) 243 | cell_text = self.escape(cell_text) 244 | if heading: 245 | return '\\textbf{%s}' % cell_text 246 | else: 247 | return cell_text 248 | 249 | def escape(self, text): 250 | escape_chars = [ '&', '%' ] 251 | out = text 252 | for ch in escape_chars: 253 | out = out.replace(ch, '\\%s' % ch) 254 | return out 255 | 256 | 257 | # TODO: 2009-08-05 deprecate 258 | def table2latex(matrix, has_header=True, has_row_headings=False): 259 | m2l = LatexWriter() 260 | m2l.has_row_headings = has_row_headings 261 | return m2l._write(matrix, has_header) 262 | 263 | 
-------------------------------------------------------------------------------- /datautil/tabular/gdocs.py: -------------------------------------------------------------------------------- 1 | '''TabularData from a Google Docs Spreadsheet. 2 | ''' 3 | from base import ReaderBase, TabularData 4 | import gdata.spreadsheet.service 5 | import gdata.spreadsheet.text_db 6 | 7 | 8 | class GDocsReaderTextDb(ReaderBase): 9 | '''Read a google docs spreadsheet using the gdata.spreadsheet.text_db 10 | library. 11 | 12 | NB: any blank line in spreadsheet will be taken as terminating data. 13 | ''' 14 | def __init__(self, spreadsheet_id, username=None, password=None, 15 | id_is_name=False): 16 | ''' 17 | @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you 18 | must set id_is_name to True. 19 | ''' 20 | # do not pass spreadsheet_id down as it will be url or sheet name 21 | super(GDocsReaderTextDb, self).__init__() 22 | self.source = spreadsheet_id 23 | self.id_is_name = id_is_name 24 | self.gd_client = gdata.spreadsheet.text_db.DatabaseClient( 25 | username=username, 26 | password=password) 27 | 28 | def load_text_db_table(self, sheet_name='Sheet1'): 29 | '''Load text_db Table object corresponding to specified sheet_name. 30 | ''' 31 | super(GDocsReaderTextDb, self).read(None) 32 | if self.id_is_name: 33 | dbs = self.gd_client.GetDatabases(name=self.source) 34 | else: 35 | dbs = self.gd_client.GetDatabases(spreadsheet_key=self.source) 36 | assert len(dbs) >= 1, 'No spreadsheet of that name/id' 37 | db = dbs[0] 38 | table = db.GetTables(name=sheet_name)[0] 39 | return table 40 | 41 | def read(self, sheet_name='Sheet1'): 42 | '''Load the specified google spreadsheet worksheet as a L{TabularData} 43 | object. 44 | 45 | @return L{TabularData} object. 
46 | ''' 47 | text_db_table = self.load_text_db_table(sheet_name) 48 | tdata = TabularData() 49 | text_db_table.LookupFields() 50 | tdata.header = text_db_table.fields 51 | # finds all records it seems 52 | rows = text_db_table.FindRecords('') 53 | for row in rows: 54 | rowdata = [] 55 | for colname in tdata.header: 56 | rowdata.append(row.content[colname]) 57 | tdata.data.append(rowdata) 58 | return tdata 59 | 60 | 61 | # not yet working properly (cannot work out ListFeed yet ...) 62 | # textdb is nicer but Spreadsheet allows one to get all cells using CellsFeed 63 | # (even when blank lines) (this is not true when using ListFeed though ...) 64 | # class GDocsReaderSpreadsheet(ReaderBase): 65 | # ''' 66 | # 67 | # From Docs for the API: 68 | # 69 | # 70 | # > The list feed contains all rows after the first row up to the first blank 71 | # row. The first blank row terminates the data set. If expected data isn't 72 | # appearing in a feed, check the worksheet manually to see whether there's an 73 | # unexpected blank row in the middle of the data. In particular, if the 74 | # second row of the spreadsheet is blank, then the list feed will contain no 75 | # data. 76 | # ''' 77 | # def __init__(self, spreadsheet_id, username=None, password=None, 78 | # id_is_name=False): 79 | # ''' 80 | # @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you 81 | # must set id_is_name to True. 82 | # ''' 83 | # # do not pass spreadsheet_id down as it will be url or sheet name 84 | # super(GDocsReaderSpreadsheet, self).__init__() 85 | # self.source = spreadsheet_id 86 | # self.id_is_name = id_is_name 87 | # self.gd_client = gdata.spreadsheet.service.SpreadsheetsService() 88 | # self.gd_client.email = username 89 | # self.gd_client.password = password 90 | # 91 | # def read(self, sheet_index=0): 92 | # '''Load the specified google spreadsheet worksheet as a L{TabularData} 93 | # object. 94 | # 95 | # @return L{TabularData} object. 
96 | # ''' 97 | # super(GDocsReaderSpreadsheet, self).read(None) 98 | # self.gd_client.source = self.source 99 | # self.gd_client.ProgrammaticLogin() 100 | # if self.id_is_name: 101 | # feed = self.gd_client.GetSpreadsheetsFeed() 102 | # # no len on feed ... 103 | # # assert len(feed) > 0, 'No spreadsheets found for: %s' % self.source 104 | # spreadsheet_id = feed.entry[0].id.text.split('/')[-1] 105 | # else: 106 | # spreadsheet_id = self.source 107 | # sheetfeed = self.gd_client.GetWorksheetsFeed(spreadsheet_id) 108 | # wrksht_id = sheetfeed.entry[sheet_index].id.text.split('/')[-1] 109 | # row_feed = self.gd_client.GetListFeed(spreadsheet_id, wrksht_id) 110 | # 111 | # tdata = TabularData() 112 | # # tdata.header 113 | # # how do we get rows rather than just all the cells? 114 | # for i, entry in enumerate(row_feed.entry): 115 | # print entry.content['col1'] 116 | # print entry.content 117 | # tdata.data.append([entry.content.text]) 118 | # return tdata 119 | 120 | -------------------------------------------------------------------------------- /datautil/tabular/html.py: -------------------------------------------------------------------------------- 1 | import re 2 | from HTMLParser import HTMLParser 3 | 4 | from base import TabularData, ReaderBase, WriterBase 5 | 6 | 7 | class HtmlReader(ReaderBase): 8 | '''Read data from HTML table into L{TabularData}. 9 | 10 | ''' 11 | def read(self, filepath_or_fileobj=None, table_index=0): 12 | '''Read data from fileobj. 13 | 14 | NB: post read all tables extracted are in attribute named 'tables'. 15 | 16 | @arg table_index: if multiple tables in the html return table at this 17 | index. 18 | @return: L{TabularData} object (all content in the data part, i.e. no 19 | header). 
20 | ''' 21 | super(HtmlReader, self).read(filepath_or_fileobj) 22 | parser = _OurTableExtractor() 23 | parser.reset() 24 | parser.feed(self.fileobj.read()) 25 | self.tables = parser.tables 26 | return self.tables[table_index] 27 | 28 | 29 | class _OurTableExtractor(HTMLParser): 30 | ''' 31 | # TODO: tbody, thead etc 32 | # TODO: nested tables 33 | 34 | # TODO: will barf on bad html so may need to run tidy first ... 35 | # tidy -w 0 -b -omit -asxml -ascii 36 | ''' 37 | def reset(self): 38 | HTMLParser.reset(self) 39 | self.tables = [] 40 | self._rows = [] 41 | self._row = [] 42 | self._text = '' 43 | 44 | def handle_starttag(self, tag, attrs): 45 | if tag == 'tr': 46 | self._row = [] 47 | elif tag == 'td' or tag == 'th': 48 | self._text = '' 49 | elif tag == 'br': 50 | self._text += '\n' 51 | 52 | def handle_endtag(self, tag): 53 | if tag == 'tr': 54 | self._rows.append(self._row) 55 | if tag == 'td' or tag == 'th': 56 | self._row.append(self._text) 57 | if tag == 'table': 58 | self.tables.append(TabularData(data=self._rows)) 59 | self._rows = [] 60 | 61 | def handle_data(self, data): 62 | self._text += data.strip() 63 | 64 | 65 | import re 66 | class HtmlWriter(WriterBase): 67 | """ 68 | Write tabular data to xhtml 69 | """ 70 | 71 | def __init__(self, round_ndigits=2, pretty_print=False, table_attributes = {'class': 'data'}): 72 | """ 73 | @pretty_print: whether to pretty print (indent) output 74 | @table_attributes: dictionary of html attribute name/value pairs to be 75 | added to the table element 76 | """ 77 | super(HtmlWriter, self).__init__(round_ndigits) 78 | self.pretty_print = pretty_print 79 | self.table_attributes = table_attributes 80 | 81 | def write(self, tabulardata, fileobj, caption = '', rowHeadings = []): 82 | """ 83 | Write matrix of data to xhtml table. 
84 | Allow for addition of row and column headings 85 | 86 | @return xhtml table containing data 87 | 88 | @param data: table of data that makes up table 89 | @param caption: the caption for the table (if empty no caption created) 90 | @param rowHeadings: additional headings for rows (separate from 91 | tabulardata) 92 | """ 93 | columnHeadings = tabulardata.header 94 | data = tabulardata.data 95 | haveRowHeadings = (len(rowHeadings) > 0) 96 | 97 | htmlTable = ' 0: 110 | if haveRowHeadings and numColHeads == len(data[0]): 111 | # [[TODO: is this dangerous? should i make a copy ...]] 112 | columnHeadings.insert(0, '') 113 | htmlTable += self.writeHeading(columnHeadings) 114 | 115 | htmlTable += '' 116 | if self.pretty_print: 117 | htmlTable += '\n' 118 | 119 | for ii in range(0, len(data)): 120 | # have to add 1 as first row is headings 121 | if haveRowHeadings: 122 | htmlTable += self.writeRow(data[ii], rowHeadings[ii]) 123 | else: 124 | htmlTable += self.writeRow(data[ii]) 125 | 126 | htmlTable += '' 127 | 128 | if self.pretty_print: 129 | fileobj.write(self.prettyPrint(htmlTable)) 130 | else: 131 | fileobj.write(htmlTable) 132 | 133 | def value_to_str(self, value): 134 | import cgi 135 | out = super(HtmlWriter, self).value_to_str(value) 136 | out = cgi.escape(out) 137 | return out 138 | 139 | def writeHeading(self, row): 140 | """ 141 | Write heading for html table () 142 | """ 143 | result = '' 144 | result += self.writeGeneralRow(row, 'th') 145 | result += '' 146 | if self.pretty_print: 147 | result += '\n' 148 | return result 149 | 150 | def writeRow(self, row, rowHeading = ''): 151 | result = '' 152 | if rowHeading != '': 153 | result = '%s' % self.value_to_str(rowHeading) 154 | result += self.writeGeneralRow(row, 'td') 155 | result = '%s' % result 156 | if self.pretty_print: 157 | result += '\n' 158 | return result 159 | 160 | def writeGeneralRow(self, row, tagName): 161 | result = '' 162 | for ii in range(len(row)): 163 | result += '<%s>%s' % (tagName, 
self.value_to_str(row[ii]), tagName) 164 | return result 165 | 166 | def prettyPrint(self, html): 167 | """pretty print html using HTMLTidy""" 168 | # [[TODO: strip out html wrapper stuff that is added (head, body etc) 169 | try: 170 | import mx.Tidy 171 | out = mx.Tidy.tidy(html, None, None, wrap = 0, indent = 'yes')[2] 172 | except: 173 | out = html 174 | return self.tabify(out) 175 | 176 | def tabify(self, instr, tabsize = 2): 177 | """ 178 | tabify text by replacing spaces of size tabSize by tabs 179 | """ 180 | whitespace = tabsize * ' ' 181 | return re.sub(whitespace, '\t', instr) 182 | 183 | 184 | # for backwards compatibility 185 | # 2008-05-30 186 | WriterHtml = HtmlWriter 187 | 188 | 189 | -------------------------------------------------------------------------------- /datautil/tabular/misc.py: -------------------------------------------------------------------------------- 1 | '''General Helper methods for tabular data. 2 | ''' 3 | from base import TabularData 4 | 5 | def transpose(data): 6 | '''Transpose a list of lists. 7 | 8 | Or do it directy: data = zip(*data) 9 | ''' 10 | return zip(*data) 11 | 12 | def select_columns(matrix, cols): 13 | '''Return a matrix with only those column indexes in cols.''' 14 | tsp = transpose(matrix) 15 | out = [] 16 | cols.sort() 17 | for c in cols: 18 | out.append(tsp[c]) 19 | return transpose(out) 20 | 21 | 22 | def pivot(table, left, top, value): 23 | """Unnormalize (pivot) a normalised input set of tabular data. 24 | 25 | @param table: simple list of lists or a L{TabularData} object. 26 | 27 | Eg. 
To transform the tabular data like 28 | 29 | Name, Year, Value 30 | ----------------------- 31 | 'x', 2004, 1 32 | 'y', 2004, 2 33 | 'x', 2005, 3 34 | 'y', 2005, 4 35 | 36 | into the new list: 37 | 38 | Year, 'x', 'y' 39 | ------------------------ 40 | 2004, 1, 2 41 | 2005, 3, 4 42 | 43 | you would do: 44 | 45 | pivot(tabulardata, 1, 0, 2) 46 | 47 | OR (requires header to exist): 48 | 49 | pivot(tabulardata, 'Year', 'Name', 'Value') 50 | """ 51 | if not isinstance(left, int): 52 | left = table.header.index(left) 53 | if not isinstance(top, int): 54 | top = table.header.index(top) 55 | if not isinstance(value, int): 56 | value = table.header.index(value) 57 | 58 | rs = TabularData() 59 | # construct double dict keyed by left values 60 | tdict = {} 61 | xvals = set() 62 | yvals = set() 63 | for row in table: 64 | xval = row[left] 65 | if not xval in tdict: 66 | tdict[xval] = {} 67 | tdict[xval][row[top]] = row[value] 68 | xvals.add(xval) 69 | yvals.add(row[top]) 70 | xvals = sorted(list(xvals)) 71 | yvals = sorted(list(yvals)) 72 | xhead = 'X' 73 | if hasattr(table, 'header') and table.header: 74 | xhead = table.header[left] 75 | rs.header = [ xhead ] + yvals 76 | rs.data = [ [x] + [ tdict[x].get(y, '') for y in yvals ] for x in xvals ] 77 | return rs 78 | 79 | -------------------------------------------------------------------------------- /datautil/tabular/tabular_json.py: -------------------------------------------------------------------------------- 1 | '''JSON Reader and Writer''' 2 | try: 3 | import json 4 | except ImportError: 5 | try: 6 | import simplejson as json 7 | except ImportError: # simplejson not installed 8 | pass 9 | from base import TabularData, ReaderBase, WriterBase 10 | 11 | 12 | class JsonReader(ReaderBase): 13 | def read(self, filepath_or_fileobj=None): 14 | '''Read JSON encoded data from source into a L{TabularData} object. 
15 | 16 | JSON encoded data should either be: 17 | * dict (with header and data attributes) 18 | * list (first row assumed to be the header) 19 | 20 | @return L{TabularData} 21 | ''' 22 | super(JsonReader, self).read(filepath_or_fileobj) 23 | jsondata = json.load(self.fileobj) 24 | if isinstance(jsondata, dict): 25 | return TabularData(header=jsondata.get('header', None), 26 | data=jsondata.get('data', None) 27 | ) 28 | elif isinstance(jsondata, list): 29 | return TabularData(header=jsondata[0], data=jsondata[1:]) 30 | else: 31 | raise Exception('Cannot load TabularData from %s' % jsondata) 32 | 33 | class JsonWriter(WriterBase): 34 | 35 | def write(self, tabular_data, fileobj, indent=2): 36 | super(JsonWriter, self).write(tabular_data, fileobj) 37 | jsondata = { u'header': tabular_data.header, 38 | u'data': tabular_data.data 39 | } 40 | json.dump(jsondata, fileobj, indent=indent) 41 | 42 | -------------------------------------------------------------------------------- /datautil/tabular/txt.py: -------------------------------------------------------------------------------- 1 | from base import WriterBase 2 | 3 | class TxtWriter(WriterBase): 4 | '''Write tabular data to plain text in nicely formatted way 5 | 6 | TODO 7 | ==== 8 | 9 | 1. allow output_width of 0 meaning use width necessary to fit all rows on one 10 | line 11 | 12 | 2. rather than truncate cell contents wrap it onto two lines (and/or allow 13 | spillover if adjacent cell is empty) 14 | 15 | * wontfix: can let terminal do this: just set width very large ... 16 | 17 | 3. (?) stream output back rather than returning all at once 18 | 19 | 4. Add support for limiting number of columns displayed. DONE 2007-08-02 20 | * TODO: add unittest 21 | ''' 22 | 23 | def __init__(self, output_width=0, number_of_columns=0, **kwargs): 24 | ''' 25 | @param output_width: display width (0 means unlimited). 
26 | @param number_of_columns: number of columns to try to display (not 27 | guaranteed to be this number if this would cause problems). (0 28 | means all columns) 29 | ''' 30 | super(TxtWriter, self).__init__(**kwargs) 31 | self.output_width = output_width 32 | self.number_of_columns = number_of_columns 33 | 34 | def write(self, tabular_data, fileobj): 35 | result = '' 36 | formatter = None 37 | row_cache = [] 38 | sample_length = 4 39 | rows = tabular_data.data 40 | if tabular_data.header: 41 | rows = [ tabular_data.header ] + rows 42 | # include header in sample rows (do we always want to?) 43 | sample_rows = rows[:sample_length] 44 | self._compute_parameters(sample_rows) 45 | result += self._write_separator() 46 | for row in rows: 47 | result += self._write_row(row) 48 | result += self._write_separator() 49 | fileobj.write(result) 50 | 51 | def _compute_parameters(self, sample_rows): 52 | maxcols = self._get_maxcols(sample_rows) 53 | if not self.number_of_columns: 54 | self.numcols = maxcols 55 | else: 56 | self.numcols = min(self.number_of_columns, maxcols) 57 | self.colwidths = [] 58 | self._set_colwidths(sample_rows) 59 | if self.colwidths[0] < 2: 60 | msg =\ 61 | u'''It is not possible to effectively format this many columns of material with 62 | this narrow an output window. Column width is: %s''' % self.colwidths[0] 63 | # TODO: log it? 64 | print msg 65 | 66 | def _write_row(self, row): 67 | '''Return the input 'python' row as an appropriately formatted string. 
68 | ''' 69 | result = '|' 70 | count = 0 71 | for cell in row[:self.numcols]: 72 | width = self.colwidths[count] 73 | result += self._format_cell(width, cell) 74 | count += 1 75 | # now pad out with extra cols as necessary 76 | while count < self.numcols: 77 | width = self.colwidths[count] 78 | result += self._format_cell(width, ' ') 79 | count += 1 80 | return result + '\n' 81 | 82 | def _write_separator(self): 83 | result = '+' 84 | for width in self.colwidths: 85 | result += '-' * (width-1) + '+' 86 | return result + '\n' 87 | 88 | def _get_maxcols(self, sample_rows): 89 | maxcols = 0 90 | for row in sample_rows: 91 | maxcols = max(maxcols, len(row)) 92 | return maxcols 93 | 94 | def _set_colwidths(self, sample_rows): 95 | # subtract -1 so that we have (at least) one spare screen column 96 | if self.output_width != 0: 97 | colwidth = int( (self.output_width - 1) / self.numcols) 98 | for ii in range(self.numcols): 99 | self.colwidths.append(colwidth) 100 | else: # make every col as wide as it needs to be 101 | self.colwidths = [0] * self.numcols 102 | for row in sample_rows: 103 | for ii in range(self.numcols): 104 | cellwidth = len(self.value_to_str(row[ii])) 105 | self.colwidths[ii] = max(self.colwidths[ii], 106 | cellwidth 107 | ) 108 | self.colwidths = [ x + 1 for x in self.colwidths ] 109 | 110 | def _format_cell(self, width, content): 111 | content = self.value_to_str(content) 112 | content = content.strip() 113 | if len(content) > width - 1: 114 | # TODO: be brutal (this *has* to be fixed) 115 | content = content[:width-1] 116 | return content.center(width-1) + '|' 117 | 118 | -------------------------------------------------------------------------------- /datautil/tabular/xls.py: -------------------------------------------------------------------------------- 1 | '''Work with Excel (xls) files. 
2 | 3 | Requires xlrd 4 | ''' 5 | try: 6 | import xlrd 7 | except ImportError: # xlrd not installed 8 | pass 9 | 10 | from base import ReaderBase, TabularData 11 | 12 | class XlsReader(ReaderBase): 13 | '''Read Excel (xls) files. 14 | 15 | Requires the xlrd package (see pypi). 16 | ''' 17 | def __init__(self, filepath_or_fileobj=None): 18 | super(XlsReader, self).__init__(filepath_or_fileobj) 19 | if self.fileobj: 20 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 21 | ## TODO: fix the rest of this 22 | 23 | def read(self, fileobj=None, sheet_index=0): 24 | '''Read an excel file (provide as fileobj) and return the specified 25 | sheet as a L{TabularData} object. 26 | 27 | For convenience also store: 28 | 29 | self.book: xlrd WorkBook object 30 | 31 | @return L{TabularData} object. 32 | ''' 33 | super(XlsReader, self).read(fileobj) 34 | if fileobj: 35 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 36 | tab = TabularData() 37 | booksheet = self.book.sheet_by_index(sheet_index) 38 | data = self.extract_sheet(booksheet, self.book) 39 | tab.data = data 40 | return tab 41 | 42 | def info(self): 43 | '''Return summary info about this Excel Workbook.''' 44 | info = '' 45 | info += 'The number of worksheets is: %s\n' % self.book.nsheets 46 | info += 'Worksheet name(s):\n' % self.book.sheet_names() 47 | count = -1 48 | for sn in self.book.sheet_names(): 49 | count += 1 50 | info += '%s %s\n' % (count, sn) 51 | return info 52 | 53 | def sheet_info(self, sheet_index): 54 | '''Summary info about an xls sheet. 55 | 56 | @return: printable string giving info. 
57 | ''' 58 | import pprint 59 | sh = self.book.sheet_by_index(sheet_index) 60 | info = sh.name + '\n' 61 | info += 'Rows: %s Cols: %s\n\n' % (sh.nrows, sh.ncols) 62 | MAX_ROWS = 30 63 | for rx in range(min(sh.nrows, MAX_ROWS)): 64 | info += str(sh.row(rx)) + '\n' 65 | return info 66 | 67 | def extract_sheet(self, sheet, book): 68 | matrix = [] 69 | nrows = sheet.nrows 70 | ncols = sheet.ncols 71 | for rx in range(nrows): 72 | outrow = [] 73 | for cx in range(ncols): 74 | cell = sheet.cell(rowx=rx, colx=cx) 75 | val = self.cell_to_python(cell, book) 76 | outrow.append(val) 77 | matrix.append(outrow) 78 | return matrix 79 | 80 | def cell_to_python(self, cell, book): 81 | # annoying need book argument for datemode 82 | # info on types: http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Cell-class 83 | if cell.ctype == xlrd.XL_CELL_NUMBER: 84 | return float(cell.value) 85 | elif cell.ctype == xlrd.XL_CELL_DATE: 86 | from datetime import date 87 | # TODO: distinguish date and datetime 88 | args = xlrd.xldate_as_tuple(cell.value, book.datemode) 89 | try: 90 | return date(args[0], args[1], args[2]) 91 | except Exception, inst: 92 | print 'Error parsing excel date (%s): %s' % (args, inst) 93 | return None 94 | elif cell.ctype == xlrd.XL_CELL_BOOLEAN: 95 | return bool(cell.value) 96 | else: 97 | return cell.value 98 | 99 | 100 | -------------------------------------------------------------------------------- /datautil/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # placeholder 2 | -------------------------------------------------------------------------------- /datautil/tests/data/xls_reader_test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/tests/data/xls_reader_test.xls -------------------------------------------------------------------------------- 
/datautil/tests/parse/test_name.py: -------------------------------------------------------------------------------- 1 | import datautil.parse.name 2 | 3 | 4 | class TestName: 5 | def test_parse_name_FL(self): 6 | name = u'Ludwig Van Beethoven' 7 | out = datautil.parse.name.parse_name(name) 8 | assert out.ln == u'Beethoven' 9 | assert out.fns == ['Ludwig', 'Van'] 10 | 11 | def test_parse_name_LF_with_extra_comma(self): 12 | out = datautil.parse.name.parse_name('More, Sir Thomas,Saint') 13 | assert out.ln == 'More', out 14 | assert out.fns == ['Sir', 'Thomas'] 15 | 16 | def test_parse_name_FL_normcase(self): 17 | name = u'Ludwig van BEETHOVEN' 18 | out = datautil.parse.name.parse_name(name) 19 | assert out.ln == 'Beethoven', out 20 | 21 | def test_parse_name_LF_with_title(self): 22 | name = u'Chandos, John [Sir]' 23 | out = datautil.parse.name.parse_name(name) 24 | assert out.ln == 'Chandos', out 25 | assert out.title == 'Sir', out 26 | 27 | def test_parse_name_FL_with_title(self): 28 | name = u'Sir John CHANDOS' 29 | out = datautil.parse.name.parse_name(name) 30 | assert out.ln == 'Chandos', out 31 | assert out.title == 'Sir', out 32 | 33 | def test_parse_name_FL_with_title_2(self): 34 | name = u'Prof Benjamin AARON' 35 | out = datautil.parse.name.parse_name(name) 36 | assert out.ln == 'Aaron', out 37 | assert out.title == 'Prof', out 38 | assert out.fns == ['Benjamin'], out 39 | assert str(out) == 'Aaron, Benjamin [Prof]' 40 | 41 | def test_parse_title_with_fullstop(self): 42 | name = 'Major. 
abc xyz' 43 | out = datautil.parse.name.parse_name(name) 44 | assert out.title == 'Major', out.title 45 | 46 | def test_parse_title_with_fullstop_2(self): 47 | name = 'Xyz, Abc [Major.]' 48 | out = datautil.parse.name.parse_name(name) 49 | print out 50 | assert out.title == 'Major', out.title 51 | 52 | def test_parse_title_with_brackets(self): 53 | name = 'Dickens, Gerald (Sir)' 54 | out = datautil.parse.name.parse_name(name) 55 | assert out.title == 'Sir', out.title 56 | 57 | name = '(Sir) Gerald Dickens' 58 | out = datautil.parse.name.parse_name(name) 59 | assert out.title == 'Sir', out.title 60 | 61 | def test_parse_name_FL_initials(self): 62 | name = 'Chekhov, A.P.' 63 | out = datautil.parse.name.parse_name(name) 64 | assert out.ln == 'Chekhov' 65 | assert out.fns == ['A.', 'P.'], out 66 | 67 | def test_strip_fullstops(self): 68 | name = 'George. Bosch' 69 | out = datautil.parse.name.normalize(name) 70 | assert out == 'Bosch, George' 71 | 72 | name = 'George. a.p. Bosch.' 73 | out = datautil.parse.name.normalize(name) 74 | assert out == 'Bosch, George A. P.', out 75 | 76 | name = 'Geo.rge. Bosch' 77 | out = datautil.parse.name.normalize(name) 78 | assert out == 'Bosch, Geo. Rge', out 79 | 80 | name = 'Geo.Smith. Bosch' 81 | out = datautil.parse.name.normalize(name) 82 | assert out == 'Bosch, Geo. 
Smith', out 83 | 84 | def test_tostr(self): 85 | name = datautil.parse.name.Name(ln='Beethoven', fns=['Ludwig', 'van']) 86 | exp = u'Beethoven, Ludwig van' 87 | out = datautil.parse.name.name_tostr(name) 88 | assert out == exp, out 89 | 90 | def test_with_no_name(self): 91 | name = datautil.parse.name.parse_name(' ') 92 | assert name.ln is '', name 93 | out = datautil.parse.name.normalize(' ') 94 | assert out == '', out 95 | 96 | def test_surname(self): 97 | name = u'SCHUBERT' 98 | out = str(datautil.parse.name.parse_name(name)) 99 | assert out == 'Schubert' 100 | 101 | -------------------------------------------------------------------------------- /datautil/tests/tabular/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/tests/tabular/__init__.py -------------------------------------------------------------------------------- /datautil/tests/tabular/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from StringIO import StringIO 3 | 4 | import datautil.tabular 5 | 6 | class TestTabularData: 7 | testlist = [ ['X', 'Y'], [1,2], [3,4] ] 8 | 9 | def test_1(self): 10 | tabular = datautil.tabular.TabularData() 11 | assert tabular.header == [] 12 | 13 | def test_from_list(self): 14 | out = datautil.tabular.TabularData.from_list(self.testlist) 15 | assert out.header == [ 'X', 'Y' ] 16 | assert out.data == [ [1,2], [3,4] ] 17 | 18 | def test_to_list(self): 19 | td = datautil.tabular.TabularData( 20 | header=['X', 'Y'], 21 | data=[ [1,2], [3,4] ] 22 | ) 23 | out = td.to_list() 24 | assert out == self.testlist 25 | 26 | 27 | class TestWriterBase: 28 | def test_value_to_str(self): 29 | w = datautil.tabular.WriterBase() # round_ndigits=None 30 | out = w.value_to_str('x') 31 | assert out == u'x', out 32 | out = w.value_to_str(1) 33 | assert out == u'1', out 34 | 
out = w.value_to_str(1.3555) 35 | assert out == u'1.3555', out 36 | 37 | w = datautil.tabular.WriterBase(round_ndigits=2) 38 | out = w.value_to_str('x') 39 | assert out == u'x', out 40 | out = w.value_to_str(1) 41 | assert out == u'1', out 42 | out = w.value_to_str(1.3555) 43 | assert out == u'1.36', out 44 | 45 | w.round_ndigits = -1 46 | out = w.value_to_str(102.34) 47 | assert out == u'100', out 48 | 49 | 50 | class TestReaderCsv(object): 51 | 52 | csvdata = \ 53 | '''"header1", "header 2" 54 | 1, 2''' 55 | header = [ 'header1', 'header 2' ] 56 | data = [ ['1', '2'] ] 57 | 58 | def setUp(self): 59 | reader = datautil.tabular.ReaderCsv() 60 | fileobj = StringIO(self.csvdata) 61 | self.tab = reader.read(fileobj) 62 | 63 | def test_header(self): 64 | assert self.header == self.tab.header 65 | 66 | def test_data(self): 67 | assert self.data == self.tab.data 68 | 69 | 70 | class TestReaderCsvUnicode(TestReaderCsv): 71 | csvdata = \ 72 | u'''"headi\xf1g", "header 2" 73 | 1, 2''' 74 | header = [ u'headi\xf1g'.encode('utf-8'), 'header 2' ] 75 | data = [ ['1', '2'] ] 76 | 77 | 78 | class TestReaderCsvEncoded(TestReaderCsvUnicode): 79 | encoding = 'utf-16' 80 | csvdata = \ 81 | u'''"headi\xf1g", "header 2" 82 | 1, 2'''.encode(encoding) 83 | 84 | def setUp(self): 85 | reader = datautil.tabular.ReaderCsv() 86 | fileobj = StringIO(self.csvdata) 87 | self.tab = reader.read(fileobj, encoding=self.encoding) 88 | 89 | 90 | class TestCsvWriter: 91 | def test_writer(self): 92 | writer = datautil.tabular.CsvWriter() 93 | fo = StringIO() 94 | td = datautil.tabular.TabularData([[1,2],[3,4]], header=['one', 95 | 'two']) 96 | writer.write(td, fo) 97 | fo.seek(0) 98 | out = fo.read() 99 | exp = \ 100 | '''one,two\r 101 | 1,2\r 102 | 3,4\r\n''' 103 | assert out == exp 104 | 105 | 106 | class TestHtmlReader: 107 | 108 | inraw1 = ''' 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
12
1983
34
120 | ''' 121 | in1 = StringIO(inraw1) 122 | 123 | exp1 = [ ['1', '2'], 124 | ['1983'], 125 | ['3', '4'], 126 | ] 127 | 128 | def test_1(self): 129 | reader = datautil.tabular.HtmlReader() 130 | tab = reader.read(self.in1) 131 | assert tab.data == self.exp1 132 | 133 | 134 | class TestHtmlWriter: 135 | 136 | def setUp(self): 137 | rawData = [[1,1], [0,1]] 138 | self.indata1 = datautil.tabular.TabularData(data=rawData) 139 | self.writer1 = datautil.tabular.HtmlWriter(table_attributes={'id':1, 'class': 'data'}) 140 | 141 | def test_0_simple(self): 142 | indata1 = [[1,1], [0,1]] 143 | expected = ''+\ 144 | '
11
01
' 145 | out1 = self.writer1.write_str(self.indata1) 146 | assert expected == out1 147 | 148 | def test_col_headings(self): 149 | self.indata1.header = [u'x','y'] 150 | caption = '' 151 | expected = ''+\ 152 | '' + \ 153 | '
xy
11
01
' 154 | # no caption but headings 155 | out1 = self.writer1.write_str(self.indata1, caption) 156 | assert expected == out1 157 | 158 | def test_row_headings(self): 159 | self.indata1.header = ['x','y'] 160 | rowHeadings = ['Date 1', 'Date 2'] 161 | caption = '' 162 | expected = '' + \ 163 | '' + \ 164 | '' + \ 165 | '
xy
Date 111
Date 201
' 166 | # no caption but headings 167 | out1 = self.writer1.write_str(self.indata1, caption, rowHeadings) 168 | assert expected == out1 169 | 170 | def test_escaping(self): 171 | tdata = datautil.tabular.TabularData(header=['s&p', 'y01' 180 | # print self.writer1.prettyPrint(in1) 181 | 182 | 183 | class TestLatexWriter: 184 | 185 | matrix = [[ 'H1', 'H2'], 186 | [1,'2%'], 187 | [3,4], 188 | ] 189 | 190 | exp = \ 191 | r'''\textbf{H1} & \textbf{H2} \\ 192 | \hline 193 | 1 & 2\% \\ 194 | \hline 195 | 3 & 4 \\ 196 | \hline 197 | ''' 198 | m2l = datautil.tabular.LatexWriter() 199 | 200 | def test_escape(self): 201 | in1 = '& % $ something' 202 | exp1 = r'\& \% $ something' 203 | assert self.m2l.escape(in1) == exp1 204 | 205 | def test_table2latex(self): 206 | out = datautil.tabular.table2latex(self.matrix) 207 | self.diff(self.exp, out) 208 | assert out == self.exp 209 | 210 | def test_write(self): 211 | td = datautil.tabular.TabularData(data=self.matrix[1:], header=self.matrix[0]) 212 | out = self.m2l.write_str(td) 213 | self.diff(self.exp, out) 214 | assert out == self.exp 215 | 216 | def diff(self, str1, str2): 217 | import difflib 218 | differ = difflib.Differ() 219 | text1 = str1.splitlines(1) 220 | text2 = str2.splitlines(1) 221 | result = list(differ.compare(text1, text2)) 222 | from pprint import pprint 223 | pprint(result) 224 | 225 | 226 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_gdocs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ConfigParser import SafeConfigParser 3 | 4 | import datautil.tabular.gdocs as gdocs 5 | from nose.plugins.skip import SkipTest 6 | 7 | 8 | cfg = SafeConfigParser() 9 | if not os.path.exists('test.ini'): 10 | msg = 'To run GDocs tests you need a config file. 
See %s for details' % __file__ 11 | raise SkipTest(msg) 12 | cfg.readfp(open('test.ini')) 13 | username = cfg.get('gdocs', 'username') 14 | password = cfg.get('gdocs', 'password') 15 | 16 | 17 | class TestGDocsTextDb: 18 | def test_01(self): 19 | source = 'okfn-datautil-gdocs-testing' 20 | reader = gdocs.GDocsReaderTextDb(source, username, password, id_is_name=True) 21 | tdata = reader.read() 22 | assert tdata.header == ['col1', 'col2'] 23 | assert len(tdata.data) == 5, tdata 24 | 25 | 26 | # not working properly yet 27 | class _TestGDocs: 28 | def test_01(self): 29 | source = 't8GZy4Lb6jhVjCL5nrqZ5TQ' 30 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password) 31 | tdata = reader.read() 32 | assert len(tdata.data) == 6, tdata 33 | 34 | def test_02_id_is_name(self): 35 | source = 'okfn-datautil-gdocs-testing' 36 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password, id_is_name=True) 37 | tdata = reader.read() 38 | assert len(tdata.data) == 6, tdata 39 | 40 | 41 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_json.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | import datautil.tabular.tabular_json as js 3 | 4 | class TestJson: 5 | in1 = { 'header': [u'a', u'b'], 6 | 'data': [[1,2], [3,4]] 7 | } 8 | in2 = [ in1['header'] ] + in1['data'] 9 | in1sio = StringIO(js.json.dumps(in1)) 10 | in1sio.seek(0) 11 | in2sio = StringIO(js.json.dumps(in2)) 12 | in2sio.seek(0) 13 | 14 | def test_JsonReader(self): 15 | reader = js.JsonReader() 16 | out = reader.read(self.in1sio) 17 | assert out.header == self.in1['header'] 18 | assert out.data == self.in1['data'] 19 | 20 | out = reader.read(self.in2sio) 21 | assert out.header == self.in1['header'] 22 | assert out.data == self.in1['data'] 23 | 24 | def test_JsonWriter(self): 25 | writer = js.JsonWriter() 26 | td = js.TabularData(header=self.in1['header'], 
data=self.in1['data']) 27 | out = writer.write_str(td) 28 | assert js.json.loads(out) == self.in1 29 | 30 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_misc.py: -------------------------------------------------------------------------------- 1 | import datautil.tabular 2 | 3 | class TestTranspose: 4 | 5 | def test_1(self): 6 | inlist = [ 7 | [ 0, 1 ], 8 | [ 1, 0 ], 9 | ] 10 | exp = [ 11 | ( 0, 1 ), 12 | ( 1, 0 ), 13 | ] 14 | out = datautil.tabular.transpose(inlist) 15 | assert out == exp, out 16 | 17 | class TestPivot: 18 | td = datautil.tabular.TabularData( 19 | header=['Name','Year','Value'], 20 | data=[ 21 | ['x',2004,1], 22 | ['y',2004,2], 23 | ['y',2005,4], 24 | ['x',2005,3], 25 | ], 26 | ) 27 | 28 | def test_pivot_with_tabular(self): 29 | out = datautil.tabular.pivot(self.td, 1, 0, 2) 30 | assert out.data[0] == [2004, 1, 2] 31 | assert out.data[-1] == [2005, 3, 4] 32 | 33 | def test_pivot_with_tabular_2(self): 34 | out = datautil.tabular.pivot(self.td, 'Year', 'Name', 'Value') 35 | assert out.data[0] == [2004, 1, 2] 36 | 37 | def test_pivot_simple_list(self): 38 | out = datautil.tabular.pivot(self.td.data, 1, 0, 2) 39 | assert out.data[0] == [2004, 1, 2] 40 | 41 | -------------------------------------------------------------------------------- /datautil/tests/tabular/test_txt.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | 3 | from datautil.tabular.txt import * 4 | from datautil.tabular import TabularData, CsvReader 5 | 6 | class TestFormatting: 7 | 8 | sample_rows = [ 9 | ['1', '2', 'head blah', 'blah blah blah'], 10 | ['a', 'b', 'c', 'd', 'e', 'g' ], 11 | ['1', '2', 'annakarenina annakarenina annakarenina'], 12 | ] 13 | output_width = 60 14 | 15 | writer = TxtWriter(output_width=output_width) 16 | writer._compute_parameters(sample_rows) 17 | 18 | def test_1(self): 19 | assert self.writer.numcols == 6 20 | 21 | def 
test_colwidths(self): 22 | exp = int ((self.output_width -1) / 6) 23 | assert self.writer.colwidths[0] == exp 24 | 25 | def test__write_1(self): 26 | out = self.writer._write_row(self.sample_rows[0]) 27 | assert len(out) <= self.output_width 28 | 29 | def test__write_2(self): 30 | out = self.writer._write_row(self.sample_rows[0]) 31 | exp = '| 1 | 2 |head bla|blah bla| | |\n' 32 | assert out == exp 33 | 34 | def test__write_separator(self): 35 | out = self.writer._write_separator() 36 | exp = '+--------+--------+--------+--------+--------+--------+\n' 37 | 38 | 39 | 40 | class TestTxtWriter: 41 | sample = \ 42 | '''"YEAR","PH","RPH","RPH_1","LN_RPH","LN_RPH_1","HH","LN_HH" 43 | 1971,7.852361625,43.9168370988587,42.9594500501036,3.78229777955476,3.76025664867788,16185,9.69184016636035 44 | ,,abc, 45 | 1972,10.504714885,55.1134791192682,43.9168370988587,4.00939431635556,3.78229777955476,16397,9.70485367024987 46 | , ,, ''' 47 | 48 | expected = \ 49 | '''+------+------+------+------+------+------+------+------+ 50 | | YEAR | PH | RPH |RPH_1 |LN_RPH|LN_RPH| HH |LN_HH | 51 | +------+------+------+------+------+------+------+------+ 52 | | 1971 |7.8523|43.916|42.959|3.7822|3.7602|16185 |9.6918| 53 | +------+------+------+------+------+------+------+------+ 54 | | | | abc | | | | | | 55 | +------+------+------+------+------+------+------+------+ 56 | | 1972 |10.504|55.113|43.916|4.0093|3.7822|16397 |9.7048| 57 | +------+------+------+------+------+------+------+------+ 58 | | | | | | | | | | 59 | +------+------+------+------+------+------+------+------+ 60 | ''' 61 | 62 | def test_simple(self): 63 | indata = TabularData(data=[range(5),range(5,10)]) 64 | writer = TxtWriter() 65 | out = writer.write_str(indata) 66 | exp = '''+-+-+-+-+-+ 67 | |0|1|2|3|4| 68 | +-+-+-+-+-+ 69 | |5|6|7|8|9| 70 | +-+-+-+-+-+ 71 | ''' 72 | print out 73 | print exp 74 | assert out == exp 75 | 76 | def test_output_width(self): 77 | indata = TabularData(data=[range(5),range(5,10)]) 78 | writer = 
TxtWriter(output_width=16) 79 | out = writer.write_str(indata) 80 | outlen = len(out.splitlines()[0]) 81 | assert outlen == 16, outlen 82 | 83 | def test_using_csv(self): 84 | fileobj = StringIO.StringIO(self.sample) 85 | in_tdata = CsvReader(fileobj).read() 86 | writer = TxtWriter(output_width=60) 87 | out = writer.write_str(in_tdata) 88 | print out 89 | print self.expected 90 | assert self.expected == out, out 91 | 92 | -------------------------------------------------------------------------------- /datautil/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import shutil 3 | import os 4 | 5 | from datautil.cache import Cache 6 | 7 | class TestCache: 8 | @classmethod 9 | def setup_class(self): 10 | self.tmp = tempfile.mkdtemp() 11 | self.path = os.path.join(self.tmp, 'abc.txt') 12 | open(self.path, 'w').write('abc') 13 | self.url = 'file://%s' % self.path 14 | 15 | @classmethod 16 | def teardown_class(self): 17 | shutil.rmtree(self.tmp) 18 | 19 | def test_basename(self): 20 | base = 'http://www.abc.org/' 21 | in1 = base + 'xyz' 22 | out = Cache.basename(in1) 23 | assert out == 'xyz' 24 | 25 | in2 = base + 'xyz/abc.txt' 26 | out = Cache.basename(in2) 27 | assert out == 'abc.txt' 28 | 29 | in3 = base + 'membersDo?body=ABC' 30 | out = Cache.basename(in3) 31 | assert out == 'membersDo?body=ABC', out 32 | 33 | in3 = base + 'membersDo?body=data/ABC' 34 | out = Cache.basename(in3) 35 | assert out == 'membersDo?body=data%47ABC', out 36 | 37 | def test_filepath(self): 38 | r = Cache() 39 | base = 'http://www.abc.org/' 40 | in1 = base + 'xyz' 41 | out = r.filepath(in1) 42 | # ./xyz 43 | assert out.endswith('xyz'), out 44 | 45 | def test_dl(self): 46 | dest = os.path.join(self.tmp, 'out.txt') 47 | Cache.dl(self.url, dest) 48 | assert os.path.exists(dest) 49 | assert open(dest).read() == 'abc' 50 | 51 | def test_cache(self): 52 | cache = os.path.join(self.tmp, 'cache') 53 | r = Cache(cache) 54 | 
r.retrieve(self.url) 55 | assert os.path.exists(os.path.join(cache, 'abc.txt')) 56 | 57 | -------------------------------------------------------------------------------- /datautil/tests/test_date.py: -------------------------------------------------------------------------------- 1 | from datautil.date import * 2 | 3 | import datetime 4 | 5 | class TestPythonStringOrdering(object): 6 | # It is impossible to find a string format such that +ve and -ve numbers 7 | # sort correctly as strings: 8 | # if (in string ordering) X < Y => -X < -Y (False!) 9 | def test_ordering(self): 10 | assert '0' < '1' 11 | assert '-10' < '10' 12 | assert '-' < '@' 13 | assert '-' < '0' 14 | assert '-100' < '-X10' 15 | assert '10' < '1000' 16 | assert '02000' < '10000' 17 | assert ' 2000' < '10000' 18 | 19 | def test_bad_ordering(self): 20 | assert ' ' < '0' 21 | assert ' ' < '-' 22 | assert not '-' < '+' 23 | assert '-100' > '-10' 24 | assert not '-100' < '-010' 25 | assert not '-100' < '- 10' 26 | assert not '-100' < ' -10' 27 | assert '10000' < '2000' 28 | assert not '-10' < ' 1' 29 | 30 | 31 | class TestFlexiDate(object): 32 | def test_init(self): 33 | fd = FlexiDate() 34 | assert fd.year == '', fd 35 | assert fd.month == '', fd 36 | 37 | fd = FlexiDate(2000, 1,1) 38 | assert fd.month == '01', fd 39 | assert fd.day== '01', fd 40 | 41 | def test_str(self): 42 | fd = FlexiDate(2000, 1, 23) 43 | assert str(fd) == '2000-01-23', '"%s"' % fd 44 | fd = FlexiDate(-2000, 1, 23) 45 | assert str(fd) == '-2000-01-23' 46 | fd = FlexiDate(2000) 47 | assert str(fd) == '2000' 48 | fd = FlexiDate(1760, qualifier='fl.') 49 | assert str(fd) == '1760 [fl.]', fd 50 | 51 | fd = FlexiDate(qualifier='anything') 52 | assert str(fd) == ' [anything]' 53 | 54 | 55 | def test_from_str(self): 56 | def dotest(fd): 57 | out = FlexiDate.from_str(str(fd)) 58 | assert str(out) == str(fd) 59 | 60 | fd = FlexiDate(2000, 1, 23) 61 | dotest(fd) 62 | fd = FlexiDate(1760, qualifier='fl.') 63 | dotest(fd) 64 | fd = 
FlexiDate(-1760, 1, 3, qualifier='fl.') 65 | dotest(fd) 66 | 67 | def test_as_float(self): 68 | fd = FlexiDate(2000) 69 | assert fd.as_float() == float(2000), fd.as_float() 70 | fd = FlexiDate(1760, 1, 2) 71 | exp = 1760 + 1/12.0 + 2/365.0 72 | assert fd.as_float() == exp, fd.as_float() 73 | fd = FlexiDate(-1000) 74 | assert fd.as_float() == float(-1000) 75 | 76 | def test_as_datetime(self): 77 | fd = FlexiDate(2000) 78 | out = fd.as_datetime() 79 | assert out == datetime.datetime(2000, 1, 1), out 80 | fd = FlexiDate(1760, 1, 2) 81 | out = fd.as_datetime() 82 | assert out == datetime.datetime(1760,1,2), out 83 | 84 | 85 | class TestDateParsers(object): 86 | def test_using_datetime(self): 87 | parser = PythonDateParser() 88 | 89 | d1 = datetime.date(2000, 1, 23) 90 | fd = parser.parse(d1) 91 | assert fd.year == '2000' 92 | 93 | d1 = datetime.datetime(2000, 1, 23) 94 | fd = parser.parse(d1) 95 | # assert str(fd) == '2000-01-23T00:00:00', fd 96 | assert str(fd) == '2000-01-23', fd 97 | 98 | def test_using_dateutil(self): 99 | parser = DateutilDateParser() 100 | 101 | in1 = '2001-02' 102 | fd = parser.parse(in1) 103 | assert str(fd) == in1, fd 104 | 105 | in1 = 'March 1762' 106 | fd = parser.parse(in1) 107 | assert str(fd) == '1762-03' 108 | 109 | in1 = 'March 1762' 110 | fd = parser.parse(in1) 111 | assert str(fd) == '1762-03' 112 | 113 | in1 = '1768 AD' 114 | fd = parser.parse(in1) 115 | assert str(fd) == '1768', fd 116 | 117 | in1 = '1768 A.D.' 118 | fd = parser.parse(in1) 119 | assert str(fd) == '1768', fd 120 | 121 | in1 = '-1850' 122 | fd = parser.parse(in1) 123 | assert str(fd) == '-1850', fd 124 | 125 | in1 = '1762 BC' 126 | fd = parser.parse(in1) 127 | assert str(fd) == '-1762', fd 128 | 129 | in1 = '4 BC' 130 | fd = parser.parse(in1) 131 | assert str(fd) == '-0004', fd 132 | 133 | in1 = '4 B.C.' 
134 | fd = parser.parse(in1) 135 | assert str(fd) == '-0004', fd 136 | 137 | in1 = 'Wed, 06 Jan 2010 09:30:00 GMT' 138 | fd = parser.parse(in1) 139 | assert str(fd) == '2010-01-06', fd 140 | 141 | in1 = 'Tue, 07 Dec 2010 10:00:00 GMT' 142 | fd = parser.parse(in1) 143 | assert str(fd) == '2010-12-07', fd 144 | 145 | def test_parse(self): 146 | d1 = datetime.datetime(2000, 1, 23) 147 | fd = parse(d1) 148 | assert fd.year == '2000' 149 | 150 | fd = parse('March 1762') 151 | assert str(fd) == '1762-03' 152 | 153 | fd = parse(1966) 154 | assert str(fd) == '1966' 155 | 156 | fd = parse('22/07/2010') 157 | assert fd.month == '07', fd.month 158 | 159 | def test_parse_ambiguous_day_month(self): 160 | fd = parse('05/07/2010') 161 | assert fd.month == '07', fd.month 162 | assert fd.day == '05', fd.month 163 | 164 | def test_parse_with_none(self): 165 | d1 = parse(None) 166 | assert d1 is None 167 | 168 | def test_parse_wildcards(self): 169 | fd = parse('198?') 170 | assert fd.year == '', fd.year # expect this to not parse 171 | # TODO but we should have a float if possible 172 | # assert fd.as_float() == u'1980', fd.as_float() 173 | 174 | def test_parse_with_qualifiers(self): 175 | 176 | fd = parse('1985?') 177 | assert fd.year == u'1985', fd 178 | assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier 179 | 180 | # match '[c|c. |c.] {date}' 181 | fd = parse('c.1780') 182 | assert fd.year == u'1780', fd 183 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 184 | 185 | fd = parse('c. 
1780') 186 | assert fd.year == u'1780', fd 187 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 188 | 189 | fd = parse('c1780') 190 | assert fd.year == '1780', fd 191 | assert fd.qualifier == u"Note 'circa' : c1780", fd 192 | 193 | fd = parse('c 1780') 194 | assert fd.year == u'1780', fd 195 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 196 | 197 | # match 'circa {date}' | circa{date}' 198 | fd = parse('circa1780') 199 | assert fd.year == u'1780', fd 200 | assert fd.qualifier == u"Note 'circa' : circa1780", fd 201 | 202 | fd = parse('circa 1780') 203 | assert fd.year == u'1780', fd 204 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 205 | 206 | # match '[circ|circ. |circ.] {date}' 207 | fd = parse('circ1780') 208 | assert fd.year == u'1780', fd 209 | assert fd.qualifier == u"Note 'circa' : circ1780", fd 210 | 211 | fd = parse('circ 1780') 212 | assert fd.year == u'1780', fd 213 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 214 | 215 | fd = parse('circ.1780') 216 | assert fd.year == u'1780', fd 217 | assert fd.qualifier == u"Note 'circa' : circ.1780", fd 218 | 219 | fd = parse('circ. 1780') 220 | assert fd.year == u'1780', fd 221 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 222 | 223 | # match '[cca|cca. |cca.] {date}' 224 | fd = parse('cca1780') 225 | assert fd.year == u'1780', fd 226 | assert fd.qualifier == u"Note 'circa' : cca1780", fd 227 | 228 | fd = parse('cca 1780') 229 | assert fd.year == u'1780', fd 230 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 231 | 232 | fd = parse('cca.1780') 233 | assert fd.year == u'1780', fd 234 | assert fd.qualifier == u"Note 'circa' : cca.1780", fd 235 | 236 | fd = parse('cca. 1780') 237 | assert fd.year == u'1780', fd 238 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 239 | 240 | # match '[ca|ca. |ca.] {date}' 241 | 242 | fd = parse('ca. 1780') 243 | assert fd.year == u'1780', fd 244 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 245 | 246 | fd = parse('ca. 
1780') 247 | assert fd.year == u'1780', fd 248 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 249 | 250 | fd = parse('ca.1780') 251 | assert fd.year == u'1780', fd 252 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 253 | 254 | fd = parse('ca.1780') 255 | assert fd.year == u'1780', fd 256 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 257 | 258 | fd = parse('ca.1780') 259 | assert fd.year == u'1780', fd 260 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 261 | 262 | 263 | 264 | def test_ambiguous(self): 265 | # TODO: have to be careful here ... 266 | fd = parse('1068/1069') 267 | 268 | def test_small_years(self): 269 | in1 = '23' 270 | fd = parse(in1) 271 | assert str(fd) == '0023', fd 272 | assert fd.as_float() == 23, fd.as_float() 273 | 274 | def test_small_years_with_zeros(self): 275 | in1 = '0023' 276 | fd = parse(in1) 277 | assert str(fd) == '0023', fd 278 | assert fd.as_float() == 23, fd.as_float() 279 | 280 | def test_years_with_alpha_prefix(self): 281 | in1 = "p1980" 282 | fd = parse(in1) 283 | assert str(fd) == "1980", fd 284 | -------------------------------------------------------------------------------- /datautil/tests/test_id.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import datautil.id 4 | 5 | def test_compress_and_uncompress_uuid(): 6 | hexversion = '86c3f19d-8854-4ef5-8d88-f008e0817871' 7 | 8 | out = datautil.id.compress_uuid(hexversion) 9 | assert len(out) == 22 10 | 11 | orig = datautil.id.uncompress_uuid(out) 12 | assert orig == hexversion 13 | 14 | # test unicode 15 | orig = datautil.id.uncompress_uuid(unicode(out)) 16 | assert orig == hexversion 17 | 18 | u1 = uuid.UUID(hexversion) 19 | out = datautil.id.compress_uuid(u1) 20 | assert len(out) == 22 21 | 22 | 23 | def test_int_to_b32(): 24 | def check(int_): 25 | out = datautil.id.int_to_b32(int_) 26 | assert isinstance(out, basestring) 27 | assert len(out) == 7, out 28 | 29 | back = 
datautil.id.b32_to_int(out) 30 | assert back == int_, (int_,back) 31 | 32 | check(1) 33 | check(2**28+1) 34 | check(2**30-1) 35 | 36 | -------------------------------------------------------------------------------- /datautil/tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from datautil.misc import * 2 | 3 | class TestFloatify: 4 | def test_floatify_1(self): 5 | x = '10' 6 | assert floatify(x) == 10.0 7 | 8 | def test_floatify_2(self): 9 | x = '1,030' 10 | assert floatify(x) == 1030.0 11 | 12 | def test_floatify_2(self): 13 | x = '' 14 | out = floatify(x) 15 | assert out == None, out 16 | x = '#' 17 | out = floatify(x) 18 | assert out == None, out 19 | 20 | def test_floatify_matrix(self): 21 | x = [ 22 | ['1', '2'], 23 | ['abc', '3.0'] 24 | ] 25 | exp = [ 26 | [1.0, 2.0], 27 | ['abc', 3.0] 28 | ] 29 | out = floatify_matrix(x) 30 | assert out == exp 31 | 32 | 33 | class TestMakeSeries: 34 | 35 | def test_make_series(self): 36 | indata = [ [ '1980', '100', '50' ], 37 | [ '1981', '101', '51' ], 38 | [ '1982', '102', '' ], 39 | ] 40 | exp = [ 41 | [ (1980.0, 100.0), (1981.0, 101.0), (1982.0, 102.0) ], 42 | [ (1980.0, 50.0), (1981.0, 51.0) ] 43 | ] 44 | out = make_series(indata, xcol=0, ycols=[1,2]) 45 | assert out == exp, out 46 | 47 | -------------------------------------------------------------------------------- /datautil/tests/test_xls.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | import datautil.tabular 4 | 5 | class TestXlsReader: 6 | 7 | def test_stuff(self): 8 | fo = pkg_resources.resource_stream('datautil', 9 | 'tests/data/xls_reader_test.xls') 10 | reader = datautil.tabular.XlsReader(fo) 11 | tab = reader.read() 12 | assert tab.data[0][0] == 1850 13 | assert tab.data[19][1] == 12.3 14 | 15 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | import sys 4 | sys.path.insert(0, '.') 5 | from datautil import __version__, __doc__ as __long_description__ 6 | 7 | setup( 8 | name='datautil', 9 | version=__version__, 10 | license='MIT', 11 | description='Utilities for Data Work', 12 | long_description=__long_description__, 13 | author='Open Knowledge Foundation', 14 | author_email='info@okfn.org', 15 | url='http://okfn.org/projects/datautil/', 16 | download_url='https://github.com/okfn/datautil/', 17 | install_requires=[ 18 | # python-dateutil 2.0 has different _parse method, so stick to 1.4.1 19 | 'python-dateutil>=1.0,<1.99', 20 | # (optional) for excel handling 21 | # xlrd 22 | # (optional) for google docs handling 23 | # gdata 24 | ], 25 | packages=find_packages(), 26 | include_package_data=True, 27 | zip_safe=False, 28 | classifiers = [ 29 | 'Development Status :: 5 - Production/Stable', 30 | 'Environment :: Console', 31 | 'Intended Audience :: Developers', 32 | 'Operating System :: OS Independent', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: 2 :: Only', 35 | 'Topic :: Software Development :: Libraries :: Python Modules' 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /swiss/__init__.py: -------------------------------------------------------------------------------- 1 | '''Swiss Army Knife for Data Work 2 | ============================== 3 | 4 | The swiss package provides various utilities for working with data: 5 | 6 | * cache: Url caching and scraping 7 | * tabular/*: Processing and transforming tabular data to and from various 8 | formats including csv, json, google spreadsheets, xls 9 | * misc, date: Cleaning up and parsing data especially dates. 
10 | * id: ID generation and shortenening 11 | * clitools.py: Command line tools such as creating optparse object and usage 12 | from a module of object. 13 | * deliveranceproxy.py: Deliverance proxy helper 14 | 15 | 16 | CHANGELOG 17 | ========= 18 | 19 | v0.3 2010-08-01 20 | --------------- 21 | 22 | * Support for google docs spreadsheets as sources for TabularData 23 | * Improve documentation of date module and add FlexiDate.as_datetime() 24 | * New clitools module incorporating existing cli tools 25 | * deliveranceproxy.py: Deliverance proxy helper for proxying to remote 26 | websites and retheming with deliverance. 27 | * parse/name.py: new (human) name parsing code 28 | 29 | v0.2 2009-10-23 30 | --------------- 31 | 32 | * Extensive refactoring of tabular module/package 33 | * Standardized interface with BaseReader and BaseWriter 34 | * JsonReader and JsonWriter providing json reading and writing 35 | * TxtWriter to support writing to plain text 36 | * Improvements to date parsing (support for circa, 'c.', etc) 37 | * New id module to do 'compression' of uuids using 32 and 64 bit encoding 38 | 39 | 40 | v0.1 2009-06-03 41 | --------------- 42 | 43 | * Bring together existing code (from last 2+ years) into new 'swiss' package 44 | * Url caching and scraping 45 | * Tabular data handling including csv reader/writer, xls reader, latex writer 46 | and associated utilities (such as pivot_table) 47 | * Cleaning and parsing data especially dates (misc and date modules) 48 | ''' 49 | __version__ = '0.3' 50 | 51 | import tabular 52 | from cache import * 53 | from misc import * 54 | from id import * 55 | -------------------------------------------------------------------------------- /swiss/cache.py: -------------------------------------------------------------------------------- 1 | '''A local file cache with url retrieving builtin. 
2 | 3 | NB: this module has zero dependencies on modules outside of the 4 | standard lib so that it is easily reusable in other libraries and applications 5 | that do not require any other parts of the swiss package. 6 | ''' 7 | import urlparse 8 | import urllib 9 | import os 10 | import sys 11 | 12 | 13 | # have to define before Cache as used in classmethod 14 | class _Progress(object): 15 | def __init__(self): 16 | self.count = -1 17 | 18 | def dl_progress(self, count, block_size, total_size): 19 | if total_size == 0: # total_size is weird so return to avoid errors 20 | return 21 | if self.count == -1: 22 | print 'Total size: %s' % self.format_size(total_size) 23 | last_percent = int(self.count*block_size*100/total_size) 24 | percent = int(count*block_size*100/total_size) 25 | if percent > last_percent: 26 | # TODO: is this acceptable? Do we want to do something nicer? 27 | sys.stdout.write('.') 28 | sys.stdout.flush() 29 | self.count = count 30 | 31 | def format_size(self, bytes): 32 | if bytes > 1000*1000: 33 | return '%.1fMb' % (bytes/1000.0/1000) 34 | elif bytes > 10*1000: 35 | return '%iKb' % (bytes/1000) 36 | elif bytes > 1000: 37 | return '%.1fKb' % (bytes/1000.0) 38 | else: 39 | return '%ibytes' % bytes 40 | 41 | 42 | class Cache(object): 43 | '''A local file cache (and url retriever). 
44 | ''' 45 | 46 | def __init__(self, path='.'): 47 | ''' 48 | @param path: path to cache (defaults to current directory) 49 | ''' 50 | self.path = path 51 | if not os.path.exists(self.path): 52 | os.makedirs(path) 53 | 54 | def retrieve(self, url, force=False): 55 | '''Retrieve url into cache and return the local path to it.''' 56 | dest = self.cache_path(url) 57 | if not os.path.exists(dest) or force: 58 | self.download(url, dest) 59 | return dest 60 | 61 | def cache_path(self, url): 62 | '''Local path for url within cache.''' 63 | name = self.basename(url) 64 | dest = os.path.join(self.path, name) 65 | return dest 66 | 67 | def filepath(self, url): 68 | '''Deprecated: use cache_path''' 69 | return self.cache_path(url) 70 | 71 | def stream(self, url): 72 | fp = self.cache_path(url) 73 | if not os.path.exists(fp): 74 | return None 75 | else: 76 | return open(fp) 77 | 78 | @classmethod 79 | def basename(self, url): 80 | scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) 81 | result = path.split('/')[-1] 82 | if query: 83 | # escape '/' as otherwise path problems 84 | result += '?' + query.replace('/', '%47') 85 | return result 86 | 87 | @classmethod 88 | def download(self, url, dest=None): 89 | '''Download a file from a url. 
90 | ''' 91 | if not dest: 92 | dest = self.basename(url) 93 | print 'Retrieving %s' % url 94 | prog = _Progress() 95 | urllib.urlretrieve(url, dest, reporthook=prog.dl_progress) 96 | 97 | # for backwards compatability 98 | @classmethod 99 | def dl(self, url, dest=None): 100 | return self.download(url, dest) 101 | 102 | -------------------------------------------------------------------------------- /swiss/clitools.py: -------------------------------------------------------------------------------- 1 | '''Expose methods or functions as commands on the command line 2 | 3 | Example usage:: 4 | 5 | # in your code 6 | from swiss.clitools import _main 7 | if __name__ == '__main__': 8 | # expose everything in current module 9 | _main(locals()) 10 | # or if you have an object MyObject with methods you want to expose 11 | _main(MyObject) 12 | ''' 13 | import os 14 | import sys 15 | import optparse 16 | import inspect 17 | 18 | def _object_methods(obj): 19 | methods = inspect.getmembers(obj, inspect.ismethod) 20 | methods = filter(lambda (name,y): not name.startswith('_'), methods) 21 | methods = dict(methods) 22 | return methods 23 | 24 | def _module_functions(functions): 25 | local_functions = dict(functions) 26 | for k,v in local_functions.items(): 27 | if not inspect.isfunction(v) or k.startswith('_'): 28 | del local_functions[k] 29 | return local_functions 30 | 31 | def _main(functions_or_object): 32 | isobject = inspect.isclass(functions_or_object) 33 | if isobject: 34 | _methods = _object_methods(functions_or_object) 35 | else: 36 | _methods = _module_functions(functions_or_object) 37 | 38 | usage = '''%prog {action} 39 | 40 | Actions: 41 | ''' 42 | usage += '\n '.join( 43 | [ '%s: %s' % (name, m.__doc__.split('\n')[0] if m.__doc__ else '') for (name,m) 44 | in sorted(_methods.items()) ]) 45 | parser = optparse.OptionParser(usage) 46 | # Optional: for a config file 47 | # parser.add_option('-c', '--config', dest='config', 48 | # help='Config file to use.') 49 | 
class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
        * Allow a trailing qualifiers e.g. fl.
        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
          can do 18??

    Restriction on ISO8601:
        * Truncation (e.g. of centuries) is *not* permitted.
        * No week and day representation e.g. 1999-W01
    """

    def __init__(self, year=None, month=None, day=None, qualifier=''):
        # force = month or day or qualifier
        force = False
        self.year = self._cvt(year, rjust=4, force=force)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        '''Normalize one date component to a zero-padded string ('' if unset).'''
        if val:
            tmp = unicode(val).strip()
            if tmp.startswith('-'):
                tmp = '-' + tmp[1:].rjust(rjust, '0')
            else:
                tmp = tmp.rjust(rjust, '0')
            return tmp
        elif force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        else:
            return ''

    def __str__(self):
        out = self.isoformat()
        if self.qualifier:
            # leading space is important as ensures when no year sort in right
            # order as ' ' < '1'
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        WARNING: does not replace '?' in dates unless strict=True.
        '''
        out = self.year
        # what do we do when no year ...
        for val in [self.month, self.day]:
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # BUGFIX: the named groups (<year> etc.) had been lost from the pattern,
    # making it fail to compile; from_str reads them back via
    # m.group('year') / ('month') / ('day') / ('qualifier').
    our_re_pat = r'''
        (?P<year> -?[\d?]+)
        (?:
                \s* - (?P<month> [\d?]{1,2})
            (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier>[^]]*) \])?
        '''
    our_re = re.compile(our_re_pat, re.VERBOSE)

    @classmethod
    def from_str(cls, instr):
        '''Undo affect of __str__'''
        if not instr:
            return cls()

        out = cls.our_re.match(instr)
        if out is None:  # no match TODO: raise Exception?
            return None
        else:
            return cls(
                out.group('year'),
                out.group('month'),
                out.group('day'),
                qualifier=out.group('qualifier')
            )

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0

        @return: float.
        '''
        if not self.year:
            return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
        if self.day:
            out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)
class DateParserBase(object):
    '''Interface for date parsers: turn a value into a FlexiDate.'''

    def parse(self, date):
        raise NotImplementedError

    def norm(self, date):
        '''Parse then render back as the normalized string form.'''
        return str(self.parse(date))


class PythonDateParser(object):
    '''Parser for values that are already datetime.date/datetime.datetime.'''

    def parse(self, date):
        return FlexiDate(date.year, date.month, date.day)


try:
    import dateutil.parser
    dateutil_parser = dateutil.parser.parser()
except Exception:
    # dateutil is an optional dependency; DateutilDateParser.parse then
    # simply returns None. (Narrowed from a bare except:.)
    dateutil_parser = None


class DateutilDateParser(DateParserBase):
    '''String date parser built on dateutil, producing FlexiDate objects.'''

    _numeric = re.compile(r"^[0-9]+$")

    def parse(self, date, **kwargs):
        '''Parse string `date` into a FlexiDate (None if unparseable or if
        dateutil is unavailable).

        :param **kwargs: any kwargs accepted by dateutil.parse function.
        '''
        qualifiers = []
        if dateutil_parser is None:
            return None
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates
        # NOTE: 'B.C.' has already been rewritten to 'BC' above, so testing
        # for 'BC' alone suffices (the removed 'B.C.' check was unreachable).
        if date.startswith('-') or 'BC' in date:
            pre0AD = True
        else:
            pre0AD = False
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: 'c.1950' or 'c1950'
        circa_match = re.match(r'(.*)c\.?\s*(\d+.*)', date)
        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            date = ''.join(circa_match.groups())

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records
        p_match = re.match(r"^p(\d+)", date)
        if p_match:
            date = date[1:]

        # Deal with uncertainty: '1985?'
        uncertainty_match = re.match(r'([0-9xX]{4})\?', date)
        if uncertainty_match:
            # remove the ?
            date = date[:-1]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        # NOTE: relies on dateutil 1.x _parse return shape (see setup.py pin)
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        # Note: Years of less than 3 digits not interpreted by
        # dateutil correctly
        #   e.g. 87 -> 1987
        #        4  -> day 4 (no year)
        # Both cases are handled in this routine
        if res.year is None and res.day:
            year = res.day
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff
        if pre0AD:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
27 | # you can omit this if local path and remote path are the same 28 | environ['PATH_INFO'] = '/my_destination_path' 29 | deliverance_proxy(environ, start_response) 30 | ''' 31 | import logging 32 | 33 | import paste.urlmap 34 | import deliverance.middleware 35 | import paste.proxy 36 | from webob import Request, Response 37 | from deliverance.middleware import DeliveranceMiddleware, SubrequestRuleGetter 38 | from deliverance.log import PrintingLogger 39 | 40 | 41 | default_deliverance_rules = \ 42 | ''' 43 | 44 | 45 | 47 | 48 | 49 | 50 | 51 | 54 | 55 | 56 | ''' 57 | 58 | def create_deliverance_proxy(proxy_base_url, theme_html, rules_xml=None): 59 | '''Proxy to another url with re-theming using deliverance. 60 | 61 | Based on http://rufuspollock.org/code/deliverance 62 | 63 | :param proxy_base_url: base destination url we are proxying to. 64 | :param theme_html: string providing html theme to use for re-themeing. 65 | :param rules_xml: (optional) deliverance rules xml as a string. If not 66 | provided use `default_deliverance_rules`. For info on rulesets see 67 | deliverance docs. We require that ruleset support a single 68 | substitution string '%s' which is used to insert internal mountpoint 69 | for the them ('/_deliverance_theme.html'). 
70 | ''' 71 | theme_url = '/_deliverance_theme.html' 72 | # use a urlmap so we can mount theme and urlset 73 | app = paste.urlmap.URLMap() 74 | # set up theme consistent with our rules file 75 | app[theme_url] = Response(theme_html) 76 | 77 | if rules_xml: 78 | rules = rules_xml 79 | else: 80 | rules = default_deliverance_rules 81 | rules = rules % theme_url 82 | app['/_deliverance_rules.xml'] = Response(rules, content_type="application/xml") 83 | 84 | class MyProxy(object): 85 | def __init__(self, proxy_base_url): 86 | self.proxy = paste.proxy.Proxy(proxy_base_url) 87 | 88 | def __call__(self, environ, start_response): 89 | req = Request(environ) 90 | res = req.get_response(self.proxy) 91 | res.decode_content() 92 | return res(environ, start_response) 93 | 94 | app['/'] = MyProxy(proxy_base_url) 95 | deliv = DeliveranceMiddleware(app, SubrequestRuleGetter('/_deliverance_rules.xml'), 96 | PrintingLogger, 97 | log_factory_kw=dict(print_level=logging.WARNING)) 98 | return deliv 99 | 100 | -------------------------------------------------------------------------------- /swiss/id.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import uuid 3 | 4 | def compress_uuid(_uuid): 5 | '''Provided shortened string representation of UUID via base64 encoding. 6 | 7 | @return: 22 character base64 encoded version of UUID. 8 | ''' 9 | if isinstance(_uuid, basestring): 10 | _uuid = uuid.UUID(_uuid) 11 | encode = base64.b64encode(_uuid.bytes, '_-') 12 | # throw away trailing == 13 | return encode[:22] 14 | 15 | def uncompress_uuid(b64_encoded): 16 | '''Reverse compress_uuid 17 | 18 | @return: 36 char str representation of uuid. 
19 | ''' 20 | b64_encoded = str(b64_encoded) 21 | if not b64_encoded.endswith('=='): 22 | b64_encoded += '==' 23 | out = base64.b64decode(b64_encoded, '_-') 24 | _uuid = uuid.UUID(bytes=out) 25 | return str(_uuid) 26 | 27 | 28 | import struct 29 | def int_to_b32(int_): 30 | out = struct.pack('1i', int_) 31 | out = base64.b32encode(out) 32 | # throw away trailing '=' 33 | return out[:-1] 34 | 35 | def b32_to_int(b32): 36 | out = base64.b32decode(b32+'=', casefold=True) 37 | out = struct.unpack('1i', out)[0] 38 | return out 39 | 40 | -------------------------------------------------------------------------------- /swiss/misc.py: -------------------------------------------------------------------------------- 1 | # TODO: create a strict option where None is returned on failed convert rather 2 | # than original value 3 | placeholders = [ '', '-', '#' ] 4 | def floatify(value): 5 | '''Convert value to a float if possible. 6 | 7 | @return: Floatified value. If value is blank or placeholder ('-') return 8 | None. Can deal with ',' in value. Will also floatify dates. If nothing 9 | works returns original value. 10 | ''' 11 | if value is None: 12 | return None 13 | if isinstance(value, basestring): 14 | stripped = value.strip() 15 | if not stripped or stripped in placeholders: 16 | return None 17 | else: 18 | # often numbers have commas in them like 1,030 19 | v = value.replace(',', '') 20 | try: 21 | newval = float(v) 22 | return newval 23 | except: 24 | pass 25 | # will return original value if fails 26 | return date_to_float(value) 27 | 28 | def floatify_matrix(matrix): 29 | return [ [ floatify(col) for col in row ] for row in matrix ] 30 | 31 | # TODO: remove/convert to using date.FlexiDate.as_float() 32 | import datetime 33 | def date_to_float(date): 34 | '''Convert a date to float. 
35 | 36 | Accepts either a date object or a string parseable to a date object 37 | 38 | @return: converted value or original if conversion fails 39 | ''' 40 | import dateutil.parser 41 | if isinstance(date, basestring): 42 | try: # simple year 43 | return float(date) 44 | except: 45 | pass 46 | try: 47 | val = dateutil.parser.parse(date, default=datetime.date(1,1,1)) 48 | except: 49 | return date 50 | else: 51 | val = date 52 | 53 | if isinstance(val, datetime.date): 54 | fval = val.year + val.month / 12.0 + val.day / 365.0 55 | return round(fval, 3) 56 | else: 57 | return val 58 | 59 | def make_series(matrix, xcol, ycols=None): 60 | '''Take a matrix and return series (i.e. list of tuples) corresponding to 61 | specified column indices. 62 | 63 | E.g. if matrix is: 64 | [ [1,2,3,4] 65 | [5,6,7,8] ] 66 | 67 | and xcol = 0, ycols=[1,3] then output is: 68 | 69 | [ 70 | [ [1,2], [5,6] ], 71 | [ [1,4], [5,8] ], 72 | ] 73 | 74 | If ycols not defined then return all possible series (excluding xcol 75 | with itself. 
76 | ''' 77 | cols = zip(*matrix) 78 | if ycols is None: 79 | ycols = range(len(cols)) 80 | del ycols[xcol] 81 | cols = floatify_matrix(cols) 82 | def is_good(value): 83 | if value is None: return False 84 | tv = str(value) 85 | stopchars = [ '', '-' ] 86 | if tv in stopchars: 87 | return False 88 | return True 89 | def is_good_tuple(tuple): 90 | return is_good(tuple[0]) and is_good(tuple[1]) 91 | 92 | xcoldata = cols[xcol] 93 | ycols = [ cols[ii] for ii in ycols ] 94 | series = [ filter(is_good_tuple, zip(xcoldata, col)) for col in ycols ] 95 | return series 96 | 97 | -------------------------------------------------------------------------------- /swiss/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/parse/__init__.py -------------------------------------------------------------------------------- /swiss/parse/name.py: -------------------------------------------------------------------------------- 1 | '''Parse names of people into a standard format.''' 2 | 3 | import re 4 | 5 | titles = [ 6 | u'Ayatollah', 7 | u'Baron', 8 | u'Bishop', 9 | u'Dame', 10 | u'Dr', 11 | u'Fr', 12 | u'Graf', 13 | u'King', 14 | u'Lady', 15 | u'Maj', 16 | u'Major', 17 | u'Mrs', 18 | u'Prof', 19 | u'Rev', 20 | u'Sir', 21 | u'St', 22 | ] 23 | 24 | class Name(object): 25 | '''A name of a person or entity. 26 | 27 | Not a domain object but a convenient way to handle/parse names. 
28 | 29 | Attributes: 30 | title 31 | ln: last name 32 | firstnames: first names as list 33 | ''' 34 | def __init__(self, ln='', fns=None, title=''): 35 | self.ln = ln 36 | self.fns = fns 37 | if self.fns is None: self.fns = [] 38 | self.title = title 39 | 40 | def norm(self): 41 | '''Return normalised name string (LastFirst format) 42 | ''' 43 | return name_tostr(self) 44 | 45 | def __str__(self): 46 | '''Display name using normalised format 47 | ''' 48 | return self.norm() 49 | 50 | class NameParserBase(object): 51 | regex_remove_fullstops = re.compile(r'(\w{2,})\.(\W|$)', re.UNICODE) 52 | 53 | def parse(self, fullname): 54 | '''Parse the `fullname` string into a `Name` object. 55 | 56 | @return: `Name` object for `fullname` 57 | ''' 58 | if fullname is None: 59 | return Name() 60 | fullname = unicode(fullname.strip()) 61 | if not fullname: 62 | return Name() 63 | 64 | # remove words ending '.', e.g. 'Bosch.' 65 | fullname = self.regex_remove_fullstops.sub(r'\1\2', fullname) 66 | 67 | # make sure initials are separted by ' ' 68 | # but first deal with special edge case like [Major.] 69 | # fullname = fullname.replace('.]', ']') 70 | fullname = fullname.replace('.', '. ') 71 | name = self._toparts(fullname) 72 | name.ln = self.normcase(name.ln) 73 | name.fns = [ self.normcase(x) for x in name.fns ] 74 | name.title = self.normcase(name.title) 75 | return name 76 | 77 | def _toparts(self, fullname): 78 | '''Implement in inheriting classes, called by parse. 79 | ''' 80 | raise NotImplementedError() 81 | 82 | def tostr(self, name): 83 | '''Convert name object back into a string. 84 | ''' 85 | raise NotImplementedError() 86 | 87 | def normcase(self, name): 88 | # useful to handle none and you often get this from regexes 89 | if name is None: 90 | return '' 91 | name = name.strip() 92 | if name.upper() == name or name.lower() == name: 93 | return name.capitalize() 94 | # avoid issues with e.g. 
McTaggart 95 | else: 96 | return name 97 | 98 | def untitlize(self, _str): 99 | '''Return title contained in _str if a title else return empty string. 100 | ''' 101 | title = _str.strip() 102 | title = _str.strip('()') 103 | if title in titles: 104 | return title 105 | # always assume something in square brackets is a title 106 | elif title.startswith('[') and title.endswith(']'): 107 | return title[1:-1].strip() 108 | else: 109 | return '' 110 | 111 | def titlize(self, _str): 112 | return u'[' + _str + u']' 113 | 114 | def norm(self, date): 115 | return str(self.parse(date)) 116 | 117 | 118 | class LastFirst(NameParserBase): 119 | '''Parse and creates names of form: 120 | 121 | lastname, first-names-in-order [title] 122 | ''' 123 | def _toparts(self, fullname): 124 | if ',' not in fullname and ' ' in fullname: 125 | raise ValueError('Expected "," in name: %s' % fullname) 126 | name = Name() 127 | # NB: if more than 2 commas just ignore stuff after 2nd one 128 | parts = fullname.split(',') 129 | name.ln = parts[0] 130 | name.fns = parts[1].strip().split() 131 | if name.fns: 132 | title = self.untitlize(name.fns[-1]) 133 | if title: 134 | name.title = title 135 | del name.fns[-1] 136 | return name 137 | 138 | def tostr(self, name): 139 | if name.ln or name.fns: 140 | fns = ' '.join(name.fns) 141 | if not fns: 142 | out = name.ln 143 | else: 144 | out = unicode(', '.join((name.ln, ' '.join(name.fns)))) 145 | else: 146 | return '' 147 | if name.title: 148 | out = out + u' [%s]' % name.title 149 | return out 150 | 151 | 152 | class FirstLast(NameParserBase): 153 | '''Parse and create names of form: 154 | 155 | [title] first-names last-name 156 | ''' 157 | def _toparts(self, fullname): 158 | name = Name() 159 | if ',' in fullname: 160 | raise ValueError('Should not have "," in FirstLast type name: %s' % 161 | fullname) 162 | parts = fullname.split() 163 | name.ln = parts[-1] 164 | name.fns = parts[:-1] 165 | if name.fns: 166 | title = self.untitlize(name.fns[0]) 167 | 
class TabularData(object):
    """Holder for tabular data

    NB:
        * Assume data organized in rows.
        * No type conversion so all data will be as entered.

    Properties:
        * data: data itself provided as array of arrays
        * header: associated header columns (if they exist)

    TODO: handling of large datasets (iterators?)
    """

    def __init__(self, data=None, header=None):
        """
        Initialize object. If data or header not set they are defaulted to
        empty list.

        NB: must use None as default value for arguments rather than []
        because [] is mutable and using it will result in subtle bugs. See:
        'Default parameter values are evaluated when the function definition
        is executed.' [http://www.python.org/doc/current/ref/function.html]
        """
        self.data = []
        self.header = []
        if data is not None:
            self.data = data
        if header is not None:
            self.header = header

    def __repr__(self):
        out = []
        if self.header:
            out.append(self.header)
        # limit to 10 items
        out += self.data[0:10]
        return repr(out)

    def __str__(self):
        return repr(self)

    def __iter__(self):
        return self.data.__iter__()

    @classmethod
    def from_list(cls, list_, header=True):
        """Build a TabularData from a list of rows.

        @param header: if True (default) consume the first row of list_ as
            the header. BUGFIX: this flag was previously accepted but
            ignored (the first row was always taken as the header); it is
            now honoured, with the default preserving the old behaviour.
        """
        if header:
            return cls(header=list_[0], data=list_[1:])
        else:
            return cls(data=list_)

    def to_list(self):
        """Inverse of from_list: header row (if present) then data rows."""
        if self.header:
            return [self.header] + self.data
        else:
            return self.data
import csv
import codecs
class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8

    (Python 2 helper: the stdlib csv module cannot consume unicode directly,
    so we feed it utf-8 encoded byte strings instead.)
    """
    def __init__(self, f, encoding=None):
        # f: file-like object; encoding: charset of f, or None when f
        # already yields unicode strings
        if encoding:
            self.reader = codecs.getreader(encoding)(f)
        else: # already unicode so just return f
            self.reader = f

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: next line of the stream, utf-8 encoded
        return self.reader.next().encode('utf-8')

class CsvReader(ReaderBase):
    """Read data from a csv file into a TabularData structure

    Note that the (Python 2) csv module does *not* support unicode:

    > This version of the csv module doesn't support Unicode input. Also, there
    > are currently some issues regarding ASCII NUL characters. Accordingly,
    > all input should be UTF-8 or printable ASCII to be safe; see the examples
    > in section 9.1.5. These restrictions will be removed in the future.
    """

    def read(self, filepath_or_fileobj=None, encoding=None, **kwargs):
        """Read in a csv file and return a TabularData object.

        @param filepath_or_fileobj: path or file like object.
        @param encoding: if set use this instead of default encoding set in
            __init__ to decode the file like object. NB: will check if fileobj
            already in unicode in which case this is ignored.
        @param kwargs: all further kwargs are passed to the underlying
            `csv.reader` function
        @return tabular data object (all values encoded as utf-8).
        """
        super(CsvReader, self).read(filepath_or_fileobj)
        if encoding:
            self.encoding = encoding
        tabData = TabularData()

        # read the whole stream once so the Sniffer can inspect it for a
        # header row; we seek(0) afterwards to re-read the real data
        sample = self.fileobj.read()
        # first do a simple test -- maybe sample is already unicode
        if type(sample) == unicode:
            encoded_fo = UTF8Recoder(self.fileobj, None)
        else:
            sample = sample.decode(self.encoding)
            encoded_fo = UTF8Recoder(self.fileobj, self.encoding)
        sample = sample.encode('utf-8')
        sniffer = csv.Sniffer()
        hasHeader = sniffer.has_header(sample)

        # rewind: the sample read above consumed the entire stream.
        # NOTE(review): assumes the file object is seekable -- confirm for
        # callers that pass sockets/pipes.
        self.fileobj.seek(0)
        ourkwargs = {
            'skipinitialspace': True
        }
        if kwargs:
            ourkwargs.update(kwargs)

        reader = csv.reader(encoded_fo, **ourkwargs)
        if hasHeader:
            tabData.header = reader.next()
        for row in reader:
            tabData.data.append(row)
        return tabData

# for backwards compatibility
ReaderCsv = CsvReader
class LatexWriter(WriterBase):
    '''Write tabular data as rows of a LaTeX table: cells joined by "&",
    rows terminated by "\\\\" and "\\hline".
    '''

    def write(self, tabular_data, fileobj, has_row_headings=False):
        '''Write tabular_data to fileobj as LaTeX table rows.

        @param has_row_headings: if True, the first cell of every row is
            rendered bold like a heading.
        '''
        self.has_row_headings = has_row_headings
        # work on a copy so we never permanently insert the header row into
        # the caller's data (bug fix: previous code mutated
        # tabular_data.data, duplicating the header on repeated writes)
        matrix = list(tabular_data.data)
        has_header = len(tabular_data.header) > 0
        if has_header:
            matrix.insert(0, tabular_data.header)
        out = self._write(matrix, has_header)
        fileobj.write(out)

    def _write(self, matrix, has_header=True):
        '''Render a list of rows (first row optionally a header) to LaTeX.'''
        # bug fix: return '' rather than None for empty input so that
        # write() never passes None to fileobj.write()
        if len(matrix) == 0:
            return ''
        # no hline on first row as this seems to mess up latex \input
        # http://groups.google.com/group/comp.text.tex/browse_thread/thread/1e1db553a958ebd8/0e590a22cb59f43d
        out = '%s' % self.process_row(matrix[0], has_header)
        for row in matrix[1:]:
            out += self.process_row(row)
        return out

    def process_row(self, row, heading=False):
        '''Render a single row; first cell is bold when heading or
        has_row_headings is set.'''
        if len(row) == 0:
            # bug fix: was None (implicit), which broke the string
            # concatenation in _write
            return ''
        out = '%s' % self.process_cell(row[0], heading or self.has_row_headings)
        for cell in row[1:]:
            out += ' & %s' % self.process_cell(cell, heading)
        out += ' \\\\\n\hline\n'
        return out

    def process_cell(self, cell, heading=False):
        '''Convert one cell value to escaped LaTeX text (bold if heading).'''
        cell_text = self.value_to_str(cell)
        cell_text = self.escape(cell_text)
        if heading:
            return '\\textbf{%s}' % cell_text
        else:
            return cell_text

    def escape(self, text):
        '''Backslash-escape the LaTeX special characters & and %.'''
        escape_chars = [ '&', '%' ]
        out = text
        for ch in escape_chars:
            out = out.replace(ch, '\\%s' % ch)
        return out
'''TabularData from a Google Docs Spreadsheet.
'''
from base import ReaderBase, TabularData
import gdata.spreadsheet.service
import gdata.spreadsheet.text_db


class GDocsReaderTextDb(ReaderBase):
    '''Read a google docs spreadsheet using the gdata.spreadsheet.text_db
    library.

    NB: any blank line in spreadsheet will be taken as terminating data.
    '''
    def __init__(self, spreadsheet_id, username=None, password=None,
            id_is_name=False):
        '''
        @param spreadsheet_id: gdoc id or name (?key={id} in url). If name you
        must set id_is_name to True.
        @param username: google account username.
        @param password: google account password.
        '''
        # do not pass spreadsheet_id down as it will be url or sheet name
        super(GDocsReaderTextDb, self).__init__()
        self.source = spreadsheet_id
        self.id_is_name = id_is_name
        self.gd_client = gdata.spreadsheet.text_db.DatabaseClient(
            username=username,
            password=password)

    def load_text_db_table(self, sheet_name='Sheet1'):
        '''Load text_db Table object corresponding to specified sheet_name.
        '''
        super(GDocsReaderTextDb, self).read(None)
        # a spreadsheet is a "database"; look it up by name or by key
        if self.id_is_name:
            dbs = self.gd_client.GetDatabases(name=self.source)
        else:
            dbs = self.gd_client.GetDatabases(spreadsheet_key=self.source)
        assert len(dbs) >= 1, 'No spreadsheet of that name/id'
        db = dbs[0]
        # each worksheet is a "table" within the database
        table = db.GetTables(name=sheet_name)[0]
        return table

    def read(self, sheet_name='Sheet1'):
        '''Load the specified google spreadsheet worksheet as a L{TabularData}
        object.

        @return L{TabularData} object.
        '''
        text_db_table = self.load_text_db_table(sheet_name)
        tdata = TabularData()
        # LookupFields populates .fields with the worksheet's column names
        text_db_table.LookupFields()
        tdata.header = text_db_table.fields
        # finds all records it seems (empty query string == no filter)
        rows = text_db_table.FindRecords('')
        for row in rows:
            rowdata = []
            for colname in tdata.header:
                rowdata.append(row.content[colname])
            tdata.data.append(rowdata)
        return tdata
import re
from HTMLParser import HTMLParser

from base import TabularData, ReaderBase, WriterBase


class HtmlReader(ReaderBase):
    '''Read data from HTML table into L{TabularData}.
    '''
    def read(self, filepath_or_fileobj=None, table_index=0):
        '''Read data from fileobj.

        NB: post read all tables extracted are in attribute named 'tables'.

        @arg table_index: if multiple tables in the html return table at this
        index.
        @return: L{TabularData} object (all content in the data part, i.e. no
        header).
        '''
        super(HtmlReader, self).read(filepath_or_fileobj)
        parser = _OurTableExtractor()
        parser.reset()
        parser.feed(self.fileobj.read())
        # keep every extracted table around for callers that want more than
        # the one at table_index
        self.tables = parser.tables
        return self.tables[table_index]


class _OurTableExtractor(HTMLParser):
    '''HTMLParser subclass that collects each table in the input as a
    L{TabularData} (rows of cell text) in self.tables.

    # TODO: tbody, thead etc
    # TODO: nested tables

    # TODO: will barf on bad html so may need to run tidy first ...
    # tidy -w 0 -b -omit -asxml -ascii
    '''
    def reset(self):
        HTMLParser.reset(self)
        # accumulation state: all tables / current table rows / current row
        # cells / current cell text
        self.tables = []
        self._rows = []
        self._row = []
        self._text = ''

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self._row = []
        elif tag == 'td' or tag == 'th':
            self._text = ''
        elif tag == 'br':
            # preserve explicit line breaks inside a cell
            self._text += '\n'

    def handle_endtag(self, tag):
        if tag == 'tr':
            self._rows.append(self._row)
        if tag == 'td' or tag == 'th':
            self._row.append(self._text)
        if tag == 'table':
            self.tables.append(TabularData(data=self._rows))
            self._rows = []

    def handle_data(self, data):
        # strip whitespace around each text fragment before accumulating
        self._text += data.strip()
84 | Allow for addition of row and column headings 85 | 86 | @return xhtml table containing data 87 | 88 | @param data: table of data that makes up table 89 | @param caption: the caption for the table (if empty no caption created) 90 | @param rowHeadings: additional headings for rows (separate from 91 | tabulardata) 92 | """ 93 | columnHeadings = tabulardata.header 94 | data = tabulardata.data 95 | haveRowHeadings = (len(rowHeadings) > 0) 96 | 97 | htmlTable = ' 0: 110 | if haveRowHeadings and numColHeads == len(data[0]): 111 | # [[TODO: is this dangerous? should i make a copy ...]] 112 | columnHeadings.insert(0, '') 113 | htmlTable += self.writeHeading(columnHeadings) 114 | 115 | htmlTable += '' 116 | if self.pretty_print: 117 | htmlTable += '\n' 118 | 119 | for ii in range(0, len(data)): 120 | # have to add 1 as first row is headings 121 | if haveRowHeadings: 122 | htmlTable += self.writeRow(data[ii], rowHeadings[ii]) 123 | else: 124 | htmlTable += self.writeRow(data[ii]) 125 | 126 | htmlTable += '' 127 | 128 | if self.pretty_print: 129 | fileobj.write(self.prettyPrint(htmlTable)) 130 | else: 131 | fileobj.write(htmlTable) 132 | 133 | def value_to_str(self, value): 134 | import cgi 135 | out = super(HtmlWriter, self).value_to_str(value) 136 | out = cgi.escape(out) 137 | return out 138 | 139 | def writeHeading(self, row): 140 | """ 141 | Write heading for html table () 142 | """ 143 | result = '' 144 | result += self.writeGeneralRow(row, 'th') 145 | result += '' 146 | if self.pretty_print: 147 | result += '\n' 148 | return result 149 | 150 | def writeRow(self, row, rowHeading = ''): 151 | result = '' 152 | if rowHeading != '': 153 | result = '%s' % self.value_to_str(rowHeading) 154 | result += self.writeGeneralRow(row, 'td') 155 | result = '%s' % result 156 | if self.pretty_print: 157 | result += '\n' 158 | return result 159 | 160 | def writeGeneralRow(self, row, tagName): 161 | result = '' 162 | for ii in range(len(row)): 163 | result += '<%s>%s' % (tagName, 
def transpose(data):
    '''Transpose a list of lists.

    Or do it directly: data = zip(*data)

    NB: under Python 3 ``zip`` returns an iterator, so wrap the result in
    ``list`` if you need indexing or repeated iteration.
    '''
    return zip(*data)

def select_columns(matrix, cols):
    '''Return a matrix with only those column indexes in cols.

    Columns appear in ascending index order (as before). Bug fix: the
    caller's ``cols`` list is no longer sorted in place, and the transposed
    matrix is materialized so indexing works even where ``zip`` returns an
    iterator.
    '''
    tsp = list(transpose(matrix))
    out = [tsp[c] for c in sorted(cols)]
    return transpose(out)


def pivot(table, left, top, value):
    """Unnormalize (pivot) a normalised input set of tabular data.

    @param table: simple list of lists or a L{TabularData} object.
    @param left: column (index or header name) whose values label the rows.
    @param top: column (index or header name) whose values become columns.
    @param value: column (index or header name) supplying the cell values.

    Eg. To transform the tabular data like

        Name, Year, Value
        -----------------------
        'x', 2004, 1
        'y', 2004, 2
        'x', 2005, 3
        'y', 2005, 4

    into the new list:

        Year, 'x', 'y'
        ------------------------
        2004, 1, 2
        2005, 3, 4

    you would do:

        pivot(tabulardata, 1, 0, 2)

    OR (requires header to exist):

        pivot(tabulardata, 'Year', 'Name', 'Value')
    """
    # allow columns to be named by header rather than index
    if not isinstance(left, int):
        left = table.header.index(left)
    if not isinstance(top, int):
        top = table.header.index(top)
    if not isinstance(value, int):
        value = table.header.index(value)

    rs = TabularData()
    # construct double dict keyed by left values
    tdict = {}
    xvals = set()
    yvals = set()
    for row in table:
        xval = row[left]
        if xval not in tdict:
            tdict[xval] = {}
        tdict[xval][row[top]] = row[value]
        xvals.add(xval)
        yvals.add(row[top])
    xvals = sorted(list(xvals))
    yvals = sorted(list(yvals))
    xhead = 'X'
    if hasattr(table, 'header') and table.header:
        xhead = table.header[left]
    rs.header = [ xhead ] + yvals
    # missing (x, y) combinations become empty strings
    rs.data = [ [x] + [ tdict[x].get(y, '') for y in yvals ] for x in xvals ]
    return rs
class JsonWriter(WriterBase):
    '''Serialize a L{TabularData} object as a JSON dict with "header" and
    "data" keys.
    '''

    def write(self, tabular_data, fileobj, indent=2):
        '''Dump tabular_data to fileobj as JSON.

        @param indent: indentation level passed through to json.dump.
        '''
        super(JsonWriter, self).write(tabular_data, fileobj)
        payload = {
            u'header': tabular_data.header,
            u'data': tabular_data.data,
        }
        json.dump(payload, fileobj, indent=indent)
    def write(self, tabular_data, fileobj):
        '''Format tabular_data as an ascii-art table and write to fileobj.

        Column widths are computed from a small sample of leading rows
        (including the header) rather than the whole dataset.
        '''
        result = ''
        formatter = None
        row_cache = []
        sample_length = 4
        rows = tabular_data.data
        if tabular_data.header:
            rows = [ tabular_data.header ] + rows
        # include header in sample rows (do we always want to?)
        sample_rows = rows[:sample_length]
        self._compute_parameters(sample_rows)
        result += self._write_separator()
        for row in rows:
            result += self._write_row(row)
        result += self._write_separator()
        fileobj.write(result)

    def _compute_parameters(self, sample_rows):
        '''Set self.numcols and self.colwidths from the sample rows.'''
        maxcols = self._get_maxcols(sample_rows)
        # 0 means "show all columns"
        if not self.number_of_columns:
            self.numcols = maxcols
        else:
            self.numcols = min(self.number_of_columns, maxcols)
        self.colwidths = []
        self._set_colwidths(sample_rows)
        if self.colwidths[0] < 2:
            msg =\
u'''It is not possible to effectively format this many columns of material with
this narrow an output window. Column width is: %s''' % self.colwidths[0]
            # TODO: log it?
            print msg

    def _write_row(self, row):
        '''Return the input 'python' row as an appropriately formatted string.
        '''
        result = '|'
        count = 0
        for cell in row[:self.numcols]:
            width = self.colwidths[count]
            result += self._format_cell(width, cell)
            count += 1
        # now pad out with extra cols as necessary
        while count < self.numcols:
            width = self.colwidths[count]
            result += self._format_cell(width, ' ')
            count += 1
        return result + '\n'

    def _write_separator(self):
        '''Return a "+----+----+" style horizontal rule line.'''
        result = '+'
        for width in self.colwidths:
            result += '-' * (width-1) + '+'
        return result + '\n'

    def _get_maxcols(self, sample_rows):
        '''Return the largest column count seen across the sample rows.'''
        maxcols = 0
        for row in sample_rows:
            maxcols = max(maxcols, len(row))
        return maxcols

    def _set_colwidths(self, sample_rows):
        '''Populate self.colwidths: fixed equal widths when output_width is
        set, otherwise each column as wide as its widest sample cell.'''
        # subtract -1 so that we have (at least) one spare screen column
        if self.output_width != 0:
            colwidth = int( (self.output_width - 1) / self.numcols)
            for ii in range(self.numcols):
                self.colwidths.append(colwidth)
        else: # make every col as wide as it needs to be
            self.colwidths = [0] * self.numcols
            for row in sample_rows:
                for ii in range(self.numcols):
                    cellwidth = len(self.value_to_str(row[ii]))
                    self.colwidths[ii] = max(self.colwidths[ii],
                            cellwidth
                            )
            self.colwidths = [ x + 1 for x in self.colwidths ]

    def _format_cell(self, width, content):
        '''Center content in a cell of the given width, truncating if the
        text does not fit, and append the "|" column separator.'''
        content = self.value_to_str(content)
        content = content.strip()
        if len(content) > width - 1:
            # TODO: be brutal (this *has* to be fixed)
            content = content[:width-1]
        return content.center(width-1) + '|'
2 | 3 | Requires xlrd 4 | ''' 5 | try: 6 | import xlrd 7 | except ImportError: # xlrd not installed 8 | pass 9 | 10 | from base import ReaderBase, TabularData 11 | 12 | class XlsReader(ReaderBase): 13 | '''Read Excel (xls) files. 14 | 15 | Requires the xlrd package (see pypi). 16 | ''' 17 | def __init__(self, filepath_or_fileobj=None): 18 | super(XlsReader, self).__init__(filepath_or_fileobj) 19 | if self.fileobj: 20 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 21 | ## TODO: fix the rest of this 22 | 23 | def read(self, fileobj=None, sheet_index=0): 24 | '''Read an excel file (provide as fileobj) and return the specified 25 | sheet as a L{TabularData} object. 26 | 27 | For convenience also store: 28 | 29 | self.book: xlrd WorkBook object 30 | 31 | @return L{TabularData} object. 32 | ''' 33 | super(XlsReader, self).read(fileobj) 34 | if fileobj: 35 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read()) 36 | tab = TabularData() 37 | booksheet = self.book.sheet_by_index(sheet_index) 38 | data = self.extract_sheet(booksheet, self.book) 39 | tab.data = data 40 | return tab 41 | 42 | def info(self): 43 | '''Return summary info about this Excel Workbook.''' 44 | info = '' 45 | info += 'The number of worksheets is: %s\n' % self.book.nsheets 46 | info += 'Worksheet name(s):\n' % self.book.sheet_names() 47 | count = -1 48 | for sn in self.book.sheet_names(): 49 | count += 1 50 | info += '%s %s\n' % (count, sn) 51 | return info 52 | 53 | def sheet_info(self, sheet_index): 54 | '''Summary info about an xls sheet. 55 | 56 | @return: printable string giving info. 
57 | ''' 58 | import pprint 59 | sh = self.book.sheet_by_index(sheet_index) 60 | info = sh.name + '\n' 61 | info += 'Rows: %s Cols: %s\n\n' % (sh.nrows, sh.ncols) 62 | MAX_ROWS = 30 63 | for rx in range(min(sh.nrows, MAX_ROWS)): 64 | info += str(sh.row(rx)) + '\n' 65 | return info 66 | 67 | def extract_sheet(self, sheet, book): 68 | matrix = [] 69 | nrows = sheet.nrows 70 | ncols = sheet.ncols 71 | for rx in range(nrows): 72 | outrow = [] 73 | for cx in range(ncols): 74 | cell = sheet.cell(rowx=rx, colx=cx) 75 | val = self.cell_to_python(cell, book) 76 | outrow.append(val) 77 | matrix.append(outrow) 78 | return matrix 79 | 80 | def cell_to_python(self, cell, book): 81 | # annoying need book argument for datemode 82 | # info on types: http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Cell-class 83 | if cell.ctype == xlrd.XL_CELL_NUMBER: 84 | return float(cell.value) 85 | elif cell.ctype == xlrd.XL_CELL_DATE: 86 | from datetime import date 87 | # TODO: distinguish date and datetime 88 | args = xlrd.xldate_as_tuple(cell.value, book.datemode) 89 | try: 90 | return date(args[0], args[1], args[2]) 91 | except Exception, inst: 92 | print 'Error parsing excel date (%s): %s' % (args, inst) 93 | return None 94 | elif cell.ctype == xlrd.XL_CELL_BOOLEAN: 95 | return bool(cell.value) 96 | else: 97 | return cell.value 98 | 99 | 100 | -------------------------------------------------------------------------------- /swiss/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # placeholder 2 | -------------------------------------------------------------------------------- /swiss/tests/data/xls_reader_test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/tests/data/xls_reader_test.xls -------------------------------------------------------------------------------- 
/swiss/tests/parse/test_name.py: -------------------------------------------------------------------------------- 1 | import swiss.parse.name 2 | 3 | 4 | class TestName: 5 | def test_parse_name_FL(self): 6 | name = u'Ludwig Van Beethoven' 7 | out = swiss.parse.name.parse_name(name) 8 | assert out.ln == u'Beethoven' 9 | assert out.fns == ['Ludwig', 'Van'] 10 | 11 | def test_parse_name_LF_with_extra_comma(self): 12 | out = swiss.parse.name.parse_name('More, Sir Thomas,Saint') 13 | assert out.ln == 'More', out 14 | assert out.fns == ['Sir', 'Thomas'] 15 | 16 | def test_parse_name_FL_normcase(self): 17 | name = u'Ludwig van BEETHOVEN' 18 | out = swiss.parse.name.parse_name(name) 19 | assert out.ln == 'Beethoven', out 20 | 21 | def test_parse_name_LF_with_title(self): 22 | name = u'Chandos, John [Sir]' 23 | out = swiss.parse.name.parse_name(name) 24 | assert out.ln == 'Chandos', out 25 | assert out.title == 'Sir', out 26 | 27 | def test_parse_name_FL_with_title(self): 28 | name = u'Sir John CHANDOS' 29 | out = swiss.parse.name.parse_name(name) 30 | assert out.ln == 'Chandos', out 31 | assert out.title == 'Sir', out 32 | 33 | def test_parse_name_FL_with_title_2(self): 34 | name = u'Prof Benjamin AARON' 35 | out = swiss.parse.name.parse_name(name) 36 | assert out.ln == 'Aaron', out 37 | assert out.title == 'Prof', out 38 | assert out.fns == ['Benjamin'], out 39 | assert str(out) == 'Aaron, Benjamin [Prof]' 40 | 41 | def test_parse_title_with_fullstop(self): 42 | name = 'Major. 
abc xyz' 43 | out = swiss.parse.name.parse_name(name) 44 | assert out.title == 'Major', out.title 45 | 46 | def test_parse_title_with_fullstop_2(self): 47 | name = 'Xyz, Abc [Major.]' 48 | out = swiss.parse.name.parse_name(name) 49 | print out 50 | assert out.title == 'Major', out.title 51 | 52 | def test_parse_title_with_brackets(self): 53 | name = 'Dickens, Gerald (Sir)' 54 | out = swiss.parse.name.parse_name(name) 55 | assert out.title == 'Sir', out.title 56 | 57 | name = '(Sir) Gerald Dickens' 58 | out = swiss.parse.name.parse_name(name) 59 | assert out.title == 'Sir', out.title 60 | 61 | def test_parse_name_FL_initials(self): 62 | name = 'Chekhov, A.P.' 63 | out = swiss.parse.name.parse_name(name) 64 | assert out.ln == 'Chekhov' 65 | assert out.fns == ['A.', 'P.'], out 66 | 67 | def test_strip_fullstops(self): 68 | name = 'George. Bosch' 69 | out = swiss.parse.name.normalize(name) 70 | assert out == 'Bosch, George' 71 | 72 | name = 'George. a.p. Bosch.' 73 | out = swiss.parse.name.normalize(name) 74 | assert out == 'Bosch, George A. P.', out 75 | 76 | name = 'Geo.rge. Bosch' 77 | out = swiss.parse.name.normalize(name) 78 | assert out == 'Bosch, Geo. Rge', out 79 | 80 | name = 'Geo.Smith. Bosch' 81 | out = swiss.parse.name.normalize(name) 82 | assert out == 'Bosch, Geo. 
Smith', out 83 | 84 | def test_tostr(self): 85 | name = swiss.parse.name.Name(ln='Beethoven', fns=['Ludwig', 'van']) 86 | exp = u'Beethoven, Ludwig van' 87 | out = swiss.parse.name.name_tostr(name) 88 | assert out == exp, out 89 | 90 | def test_with_no_name(self): 91 | name = swiss.parse.name.parse_name(' ') 92 | assert name.ln is '', name 93 | out = swiss.parse.name.normalize(' ') 94 | assert out == '', out 95 | 96 | def test_surname(self): 97 | name = u'SCHUBERT' 98 | out = str(swiss.parse.name.parse_name(name)) 99 | assert out == 'Schubert' 100 | 101 | -------------------------------------------------------------------------------- /swiss/tests/tabular/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/tests/tabular/__init__.py -------------------------------------------------------------------------------- /swiss/tests/tabular/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from StringIO import StringIO 3 | 4 | import swiss.tabular 5 | 6 | class TestTabularData: 7 | testlist = [ ['X', 'Y'], [1,2], [3,4] ] 8 | 9 | def test_1(self): 10 | tabular = swiss.tabular.TabularData() 11 | assert tabular.header == [] 12 | 13 | def test_from_list(self): 14 | out = swiss.tabular.TabularData.from_list(self.testlist) 15 | assert out.header == [ 'X', 'Y' ] 16 | assert out.data == [ [1,2], [3,4] ] 17 | 18 | def test_to_list(self): 19 | td = swiss.tabular.TabularData( 20 | header=['X', 'Y'], 21 | data=[ [1,2], [3,4] ] 22 | ) 23 | out = td.to_list() 24 | assert out == self.testlist 25 | 26 | 27 | class TestWriterBase: 28 | def test_value_to_str(self): 29 | w = swiss.tabular.WriterBase() # round_ndigits=None 30 | out = w.value_to_str('x') 31 | assert out == u'x', out 32 | out = w.value_to_str(1) 33 | assert out == u'1', out 34 | out = w.value_to_str(1.3555) 35 | assert 
out == u'1.3555', out 36 | 37 | w = swiss.tabular.WriterBase(round_ndigits=2) 38 | out = w.value_to_str('x') 39 | assert out == u'x', out 40 | out = w.value_to_str(1) 41 | assert out == u'1', out 42 | out = w.value_to_str(1.3555) 43 | assert out == u'1.36', out 44 | 45 | w.round_ndigits = -1 46 | out = w.value_to_str(102.34) 47 | assert out == u'100', out 48 | 49 | 50 | class TestReaderCsv(object): 51 | 52 | csvdata = \ 53 | '''"header1", "header 2" 54 | 1, 2''' 55 | header = [ 'header1', 'header 2' ] 56 | data = [ ['1', '2'] ] 57 | 58 | def setUp(self): 59 | reader = swiss.tabular.ReaderCsv() 60 | fileobj = StringIO(self.csvdata) 61 | self.tab = reader.read(fileobj) 62 | 63 | def test_header(self): 64 | assert self.header == self.tab.header 65 | 66 | def test_data(self): 67 | assert self.data == self.tab.data 68 | 69 | 70 | class TestReaderCsvUnicode(TestReaderCsv): 71 | csvdata = \ 72 | u'''"headi\xf1g", "header 2" 73 | 1, 2''' 74 | header = [ u'headi\xf1g'.encode('utf-8'), 'header 2' ] 75 | data = [ ['1', '2'] ] 76 | 77 | 78 | class TestReaderCsvEncoded(TestReaderCsvUnicode): 79 | encoding = 'utf-16' 80 | csvdata = \ 81 | u'''"headi\xf1g", "header 2" 82 | 1, 2'''.encode(encoding) 83 | 84 | def setUp(self): 85 | reader = swiss.tabular.ReaderCsv() 86 | fileobj = StringIO(self.csvdata) 87 | self.tab = reader.read(fileobj, encoding=self.encoding) 88 | 89 | 90 | class TestCsvWriter: 91 | def test_writer(self): 92 | writer = swiss.tabular.CsvWriter() 93 | fo = StringIO() 94 | td = swiss.tabular.TabularData([[1,2],[3,4]], header=['one', 95 | 'two']) 96 | writer.write(td, fo) 97 | fo.seek(0) 98 | out = fo.read() 99 | exp = \ 100 | '''one,two\r 101 | 1,2\r 102 | 3,4\r\n''' 103 | assert out == exp 104 | 105 | 106 | class TestHtmlReader: 107 | 108 | inraw1 = ''' 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
12
1983
34
120 | ''' 121 | in1 = StringIO(inraw1) 122 | 123 | exp1 = [ ['1', '2'], 124 | ['1983'], 125 | ['3', '4'], 126 | ] 127 | 128 | def test_1(self): 129 | reader = swiss.tabular.HtmlReader() 130 | tab = reader.read(self.in1) 131 | assert tab.data == self.exp1 132 | 133 | 134 | class TestHtmlWriter: 135 | 136 | def setUp(self): 137 | rawData = [[1,1], [0,1]] 138 | self.indata1 = swiss.tabular.TabularData(data=rawData) 139 | self.writer1 = swiss.tabular.HtmlWriter(table_attributes={'id':1, 'class': 'data'}) 140 | 141 | def test_0_simple(self): 142 | indata1 = [[1,1], [0,1]] 143 | expected = ''+\ 144 | '
11
01
' 145 | out1 = self.writer1.write_str(self.indata1) 146 | assert expected == out1 147 | 148 | def test_col_headings(self): 149 | self.indata1.header = [u'x','y'] 150 | caption = '' 151 | expected = ''+\ 152 | '' + \ 153 | '
xy
11
01
' 154 | # no caption but headings 155 | out1 = self.writer1.write_str(self.indata1, caption) 156 | assert expected == out1 157 | 158 | def test_row_headings(self): 159 | self.indata1.header = ['x','y'] 160 | rowHeadings = ['Date 1', 'Date 2'] 161 | caption = '' 162 | expected = '' + \ 163 | '' + \ 164 | '' + \ 165 | '
xy
Date 111
Date 201
' 166 | # no caption but headings 167 | out1 = self.writer1.write_str(self.indata1, caption, rowHeadings) 168 | assert expected == out1 169 | 170 | def test_escaping(self): 171 | tdata = swiss.tabular.TabularData(header=['s&p', 'y01' 180 | # print self.writer1.prettyPrint(in1) 181 | 182 | 183 | class TestLatexWriter: 184 | 185 | matrix = [[ 'H1', 'H2'], 186 | [1,'2%'], 187 | [3,4], 188 | ] 189 | 190 | exp = \ 191 | r'''\textbf{H1} & \textbf{H2} \\ 192 | \hline 193 | 1 & 2\% \\ 194 | \hline 195 | 3 & 4 \\ 196 | \hline 197 | ''' 198 | m2l = swiss.tabular.LatexWriter() 199 | 200 | def test_escape(self): 201 | in1 = '& % $ something' 202 | exp1 = r'\& \% $ something' 203 | assert self.m2l.escape(in1) == exp1 204 | 205 | def test_table2latex(self): 206 | out = swiss.tabular.table2latex(self.matrix) 207 | self.diff(self.exp, out) 208 | assert out == self.exp 209 | 210 | def test_write(self): 211 | td = swiss.tabular.TabularData(data=self.matrix[1:], header=self.matrix[0]) 212 | out = self.m2l.write_str(td) 213 | self.diff(self.exp, out) 214 | assert out == self.exp 215 | 216 | def diff(self, str1, str2): 217 | import difflib 218 | differ = difflib.Differ() 219 | text1 = str1.splitlines(1) 220 | text2 = str2.splitlines(1) 221 | result = list(differ.compare(text1, text2)) 222 | from pprint import pprint 223 | pprint(result) 224 | 225 | 226 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_gdocs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ConfigParser import SafeConfigParser 3 | 4 | import swiss.tabular.gdocs as gdocs 5 | 6 | 7 | cfg = SafeConfigParser() 8 | if not os.path.exists('test.ini'): 9 | msg = 'To run these tests you need a config file. 
See this file for details' 10 | raise Exception(msg) 11 | cfg.readfp(open('test.ini')) 12 | username = cfg.get('gdocs', 'username') 13 | password = cfg.get('gdocs', 'password') 14 | 15 | 16 | class TestGDocsTextDb: 17 | def test_01(self): 18 | source = 'okfn-swiss-gdocs-testing' 19 | reader = gdocs.GDocsReaderTextDb(source, username, password, id_is_name=True) 20 | tdata = reader.read() 21 | assert tdata.header == ['col1', 'col2'] 22 | assert len(tdata.data) == 5, tdata 23 | 24 | 25 | # not working properly yet 26 | class _TestGDocs: 27 | def test_01(self): 28 | source = 't8GZy4Lb6jhVjCL5nrqZ5TQ' 29 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password) 30 | tdata = reader.read() 31 | assert len(tdata.data) == 6, tdata 32 | 33 | def test_02_id_is_name(self): 34 | source = 'okfn-swiss-gdocs-testing' 35 | reader = gdocs.GDocsReaderSpreadsheet(source, username, password, id_is_name=True) 36 | tdata = reader.read() 37 | assert len(tdata.data) == 6, tdata 38 | 39 | 40 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_json.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | import swiss.tabular.tabular_json as js 3 | 4 | class TestJson: 5 | in1 = { 'header': [u'a', u'b'], 6 | 'data': [[1,2], [3,4]] 7 | } 8 | in2 = [ in1['header'] ] + in1['data'] 9 | in1sio = StringIO(js.json.dumps(in1)) 10 | in1sio.seek(0) 11 | in2sio = StringIO(js.json.dumps(in2)) 12 | in2sio.seek(0) 13 | 14 | def test_JsonReader(self): 15 | reader = js.JsonReader() 16 | out = reader.read(self.in1sio) 17 | assert out.header == self.in1['header'] 18 | assert out.data == self.in1['data'] 19 | 20 | out = reader.read(self.in2sio) 21 | assert out.header == self.in1['header'] 22 | assert out.data == self.in1['data'] 23 | 24 | def test_JsonWriter(self): 25 | writer = js.JsonWriter() 26 | td = js.TabularData(header=self.in1['header'], data=self.in1['data']) 27 | out = 
writer.write_str(td) 28 | assert js.json.loads(out) == self.in1 29 | 30 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_misc.py: -------------------------------------------------------------------------------- 1 | import swiss.tabular 2 | 3 | class TestTranspose: 4 | 5 | def test_1(self): 6 | inlist = [ 7 | [ 0, 1 ], 8 | [ 1, 0 ], 9 | ] 10 | exp = [ 11 | ( 0, 1 ), 12 | ( 1, 0 ), 13 | ] 14 | out = swiss.tabular.transpose(inlist) 15 | assert out == exp, out 16 | 17 | class TestPivot: 18 | td = swiss.tabular.TabularData( 19 | header=['Name','Year','Value'], 20 | data=[ 21 | ['x',2004,1], 22 | ['y',2004,2], 23 | ['y',2005,4], 24 | ['x',2005,3], 25 | ], 26 | ) 27 | 28 | def test_pivot_with_tabular(self): 29 | out = swiss.tabular.pivot(self.td, 1, 0, 2) 30 | assert out.data[0] == [2004, 1, 2] 31 | assert out.data[-1] == [2005, 3, 4] 32 | 33 | def test_pivot_with_tabular_2(self): 34 | out = swiss.tabular.pivot(self.td, 'Year', 'Name', 'Value') 35 | assert out.data[0] == [2004, 1, 2] 36 | 37 | def test_pivot_simple_list(self): 38 | out = swiss.tabular.pivot(self.td.data, 1, 0, 2) 39 | assert out.data[0] == [2004, 1, 2] 40 | 41 | -------------------------------------------------------------------------------- /swiss/tests/tabular/test_txt.py: -------------------------------------------------------------------------------- 1 | import StringIO 2 | 3 | from swiss.tabular.txt import * 4 | from swiss.tabular import TabularData, CsvReader 5 | 6 | class TestFormatting: 7 | 8 | sample_rows = [ 9 | ['1', '2', 'head blah', 'blah blah blah'], 10 | ['a', 'b', 'c', 'd', 'e', 'g' ], 11 | ['1', '2', 'annakarenina annakarenina annakarenina'], 12 | ] 13 | output_width = 60 14 | 15 | writer = TxtWriter(output_width=output_width) 16 | writer._compute_parameters(sample_rows) 17 | 18 | def test_1(self): 19 | assert self.writer.numcols == 6 20 | 21 | def test_colwidths(self): 22 | exp = int ((self.output_width -1) / 6) 23 | assert 
self.writer.colwidths[0] == exp 24 | 25 | def test__write_1(self): 26 | out = self.writer._write_row(self.sample_rows[0]) 27 | assert len(out) <= self.output_width 28 | 29 | def test__write_2(self): 30 | out = self.writer._write_row(self.sample_rows[0]) 31 | exp = '| 1 | 2 |head bla|blah bla| | |\n' 32 | assert out == exp 33 | 34 | def test__write_separator(self): 35 | out = self.writer._write_separator() 36 | exp = '+--------+--------+--------+--------+--------+--------+\n' 37 | 38 | 39 | 40 | class TestTxtWriter: 41 | sample = \ 42 | '''"YEAR","PH","RPH","RPH_1","LN_RPH","LN_RPH_1","HH","LN_HH" 43 | 1971,7.852361625,43.9168370988587,42.9594500501036,3.78229777955476,3.76025664867788,16185,9.69184016636035 44 | ,,abc, 45 | 1972,10.504714885,55.1134791192682,43.9168370988587,4.00939431635556,3.78229777955476,16397,9.70485367024987 46 | , ,, ''' 47 | 48 | expected = \ 49 | '''+------+------+------+------+------+------+------+------+ 50 | | YEAR | PH | RPH |RPH_1 |LN_RPH|LN_RPH| HH |LN_HH | 51 | +------+------+------+------+------+------+------+------+ 52 | | 1971 |7.8523|43.916|42.959|3.7822|3.7602|16185 |9.6918| 53 | +------+------+------+------+------+------+------+------+ 54 | | | | abc | | | | | | 55 | +------+------+------+------+------+------+------+------+ 56 | | 1972 |10.504|55.113|43.916|4.0093|3.7822|16397 |9.7048| 57 | +------+------+------+------+------+------+------+------+ 58 | | | | | | | | | | 59 | +------+------+------+------+------+------+------+------+ 60 | ''' 61 | 62 | def test_simple(self): 63 | indata = TabularData(data=[range(5),range(5,10)]) 64 | writer = TxtWriter() 65 | out = writer.write_str(indata) 66 | exp = '''+-+-+-+-+-+ 67 | |0|1|2|3|4| 68 | +-+-+-+-+-+ 69 | |5|6|7|8|9| 70 | +-+-+-+-+-+ 71 | ''' 72 | print out 73 | print exp 74 | assert out == exp 75 | 76 | def test_output_width(self): 77 | indata = TabularData(data=[range(5),range(5,10)]) 78 | writer = TxtWriter(output_width=16) 79 | out = writer.write_str(indata) 80 | outlen = 
len(out.splitlines()[0]) 81 | assert outlen == 16, outlen 82 | 83 | def test_using_csv(self): 84 | fileobj = StringIO.StringIO(self.sample) 85 | in_tdata = CsvReader(fileobj).read() 86 | writer = TxtWriter(output_width=60) 87 | out = writer.write_str(in_tdata) 88 | print out 89 | print self.expected 90 | assert self.expected == out, out 91 | 92 | -------------------------------------------------------------------------------- /swiss/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import shutil 3 | import os 4 | 5 | from swiss.cache import Cache 6 | 7 | class TestCache: 8 | @classmethod 9 | def setup_class(self): 10 | self.tmp = tempfile.mkdtemp() 11 | self.path = os.path.join(self.tmp, 'abc.txt') 12 | open(self.path, 'w').write('abc') 13 | self.url = 'file://%s' % self.path 14 | 15 | @classmethod 16 | def teardown_class(self): 17 | shutil.rmtree(self.tmp) 18 | 19 | def test_basename(self): 20 | base = 'http://www.abc.org/' 21 | in1 = base + 'xyz' 22 | out = Cache.basename(in1) 23 | assert out == 'xyz' 24 | 25 | in2 = base + 'xyz/abc.txt' 26 | out = Cache.basename(in2) 27 | assert out == 'abc.txt' 28 | 29 | in3 = base + 'membersDo?body=ABC' 30 | out = Cache.basename(in3) 31 | assert out == 'membersDo?body=ABC', out 32 | 33 | in3 = base + 'membersDo?body=data/ABC' 34 | out = Cache.basename(in3) 35 | assert out == 'membersDo?body=data%47ABC', out 36 | 37 | def test_filepath(self): 38 | r = Cache() 39 | base = 'http://www.abc.org/' 40 | in1 = base + 'xyz' 41 | out = r.filepath(in1) 42 | # ./xyz 43 | assert out.endswith('xyz'), out 44 | 45 | def test_dl(self): 46 | dest = os.path.join(self.tmp, 'out.txt') 47 | Cache.dl(self.url, dest) 48 | assert os.path.exists(dest) 49 | assert open(dest).read() == 'abc' 50 | 51 | def test_cache(self): 52 | cache = os.path.join(self.tmp, 'cache') 53 | r = Cache(cache) 54 | r.retrieve(self.url) 55 | assert os.path.exists(os.path.join(cache, 'abc.txt')) 56 | 57 
| -------------------------------------------------------------------------------- /swiss/tests/test_date.py: -------------------------------------------------------------------------------- 1 | from swiss.date import * 2 | 3 | import datetime 4 | 5 | class TestPythonStringOrdering(object): 6 | # It is impossible to find a string format such that +ve and -ve numbers 7 | # sort correctly as strings: 8 | # if (in string ordering) X < Y => -X < -Y (False!) 9 | def test_ordering(self): 10 | assert '0' < '1' 11 | assert '-10' < '10' 12 | assert '-' < '@' 13 | assert '-' < '0' 14 | assert '-100' < '-X10' 15 | assert '10' < '1000' 16 | assert '02000' < '10000' 17 | assert ' 2000' < '10000' 18 | 19 | def test_bad_ordering(self): 20 | assert ' ' < '0' 21 | assert ' ' < '-' 22 | assert not '-' < '+' 23 | assert '-100' > '-10' 24 | assert not '-100' < '-010' 25 | assert not '-100' < '- 10' 26 | assert not '-100' < ' -10' 27 | assert '10000' < '2000' 28 | assert not '-10' < ' 1' 29 | 30 | 31 | class TestFlexiDate(object): 32 | def test_init(self): 33 | fd = FlexiDate() 34 | assert fd.year == '', fd 35 | assert fd.month == '', fd 36 | 37 | fd = FlexiDate(2000, 1,1) 38 | assert fd.month == '01', fd 39 | assert fd.day== '01', fd 40 | 41 | def test_str(self): 42 | fd = FlexiDate(2000, 1, 23) 43 | assert str(fd) == '2000-01-23', '"%s"' % fd 44 | fd = FlexiDate(-2000, 1, 23) 45 | assert str(fd) == '-2000-01-23' 46 | fd = FlexiDate(2000) 47 | assert str(fd) == '2000' 48 | fd = FlexiDate(1760, qualifier='fl.') 49 | assert str(fd) == '1760 [fl.]', fd 50 | 51 | fd = FlexiDate(qualifier='anything') 52 | assert str(fd) == ' [anything]' 53 | 54 | 55 | def test_from_str(self): 56 | def dotest(fd): 57 | out = FlexiDate.from_str(str(fd)) 58 | assert str(out) == str(fd) 59 | 60 | fd = FlexiDate(2000, 1, 23) 61 | dotest(fd) 62 | fd = FlexiDate(1760, qualifier='fl.') 63 | dotest(fd) 64 | fd = FlexiDate(-1760, 1, 3, qualifier='fl.') 65 | dotest(fd) 66 | 67 | def test_as_float(self): 68 | fd = 
FlexiDate(2000) 69 | assert fd.as_float() == float(2000), fd.as_float() 70 | fd = FlexiDate(1760, 1, 2) 71 | exp = 1760 + 1/12.0 + 2/365.0 72 | assert fd.as_float() == exp, fd.as_float() 73 | fd = FlexiDate(-1000) 74 | assert fd.as_float() == float(-1000) 75 | 76 | def test_as_datetime(self): 77 | fd = FlexiDate(2000) 78 | out = fd.as_datetime() 79 | assert out == datetime.datetime(2000, 1, 1), out 80 | fd = FlexiDate(1760, 1, 2) 81 | out = fd.as_datetime() 82 | assert out == datetime.datetime(1760,1,2), out 83 | 84 | 85 | class TestDateParsers(object): 86 | def test_using_datetime(self): 87 | parser = PythonDateParser() 88 | 89 | d1 = datetime.date(2000, 1, 23) 90 | fd = parser.parse(d1) 91 | assert fd.year == '2000' 92 | 93 | d1 = datetime.datetime(2000, 1, 23) 94 | fd = parser.parse(d1) 95 | # assert str(fd) == '2000-01-23T00:00:00', fd 96 | assert str(fd) == '2000-01-23', fd 97 | 98 | def test_using_dateutil(self): 99 | parser = DateutilDateParser() 100 | 101 | in1 = '2001-02' 102 | fd = parser.parse(in1) 103 | assert str(fd) == in1, fd 104 | 105 | in1 = 'March 1762' 106 | fd = parser.parse(in1) 107 | assert str(fd) == '1762-03' 108 | 109 | in1 = 'March 1762' 110 | fd = parser.parse(in1) 111 | assert str(fd) == '1762-03' 112 | 113 | in1 = '1768 AD' 114 | fd = parser.parse(in1) 115 | assert str(fd) == '1768', fd 116 | 117 | in1 = '1768 A.D.' 118 | fd = parser.parse(in1) 119 | assert str(fd) == '1768', fd 120 | 121 | in1 = '-1850' 122 | fd = parser.parse(in1) 123 | assert str(fd) == '-1850', fd 124 | 125 | in1 = '1762 BC' 126 | fd = parser.parse(in1) 127 | assert str(fd) == '-1762', fd 128 | 129 | in1 = '4 BC' 130 | fd = parser.parse(in1) 131 | assert str(fd) == '-0004', fd 132 | 133 | in1 = '4 B.C.' 
134 | fd = parser.parse(in1) 135 | assert str(fd) == '-0004', fd 136 | 137 | def test_parse(self): 138 | d1 = datetime.datetime(2000, 1, 23) 139 | fd = parse(d1) 140 | assert fd.year == '2000' 141 | 142 | fd = parse('March 1762') 143 | assert str(fd) == '1762-03' 144 | 145 | fd = parse(1966) 146 | assert str(fd) == '1966' 147 | 148 | fd = parse('22/07/2010') 149 | assert fd.month == '07', fd.month 150 | 151 | def test_parse_ambiguous_day_month(self): 152 | fd = parse('05/07/2010') 153 | assert fd.month == '07', fd.month 154 | assert fd.day == '05', fd.month 155 | 156 | def test_parse_with_none(self): 157 | d1 = parse(None) 158 | assert d1 is None 159 | 160 | def test_parse_wildcards(self): 161 | fd = parse('198?') 162 | assert fd.year == '', fd.year # expect this to not parse 163 | # TODO but we should have a float if possible 164 | # assert fd.as_float() == u'1980', fd.as_float() 165 | 166 | def test_parse_with_qualifiers(self): 167 | fd = parse('1985?') 168 | assert fd.year == u'1985', fd 169 | assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier 170 | 171 | fd = parse('c.1780') 172 | assert fd.year == u'1780', fd 173 | assert fd.qualifier == u"Note 'circa' : c.1780", fd 174 | 175 | fd = parse('c. 1780') 176 | assert fd.year == u'1780', fd 177 | assert fd.qualifier.startswith(u"Note 'circa'"), fd 178 | 179 | def test_ambiguous(self): 180 | # TODO: have to be careful here ... 
181 | fd = parse('1068/1069') 182 | 183 | def test_small_years(self): 184 | in1 = '23' 185 | fd = parse(in1) 186 | assert str(fd) == '0023', fd 187 | assert fd.as_float() == 23, fd.as_float() 188 | 189 | def test_small_years_with_zeros(self): 190 | in1 = '0023' 191 | fd = parse(in1) 192 | assert str(fd) == '0023', fd 193 | assert fd.as_float() == 23, fd.as_float() 194 | 195 | def test_years_with_alpha_prefix(self): 196 | in1 = "p1980" 197 | fd = parse(in1) 198 | assert str(fd) == "1980", fd 199 | 200 | -------------------------------------------------------------------------------- /swiss/tests/test_id.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import swiss.id 4 | 5 | def test_compress_and_uncompress_uuid(): 6 | hexversion = '86c3f19d-8854-4ef5-8d88-f008e0817871' 7 | 8 | out = swiss.id.compress_uuid(hexversion) 9 | assert len(out) == 22 10 | 11 | orig = swiss.id.uncompress_uuid(out) 12 | assert orig == hexversion 13 | 14 | # test unicode 15 | orig = swiss.id.uncompress_uuid(unicode(out)) 16 | assert orig == hexversion 17 | 18 | u1 = uuid.UUID(hexversion) 19 | out = swiss.id.compress_uuid(u1) 20 | assert len(out) == 22 21 | 22 | 23 | def test_int_to_b32(): 24 | def check(int_): 25 | out = swiss.id.int_to_b32(int_) 26 | assert isinstance(out, basestring) 27 | assert len(out) == 7, out 28 | 29 | back = swiss.id.b32_to_int(out) 30 | assert back == int_, (int_,back) 31 | 32 | check(1) 33 | check(2**28+1) 34 | check(2**30-1) 35 | 36 | -------------------------------------------------------------------------------- /swiss/tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from swiss.misc import * 2 | 3 | class TestFloatify: 4 | def test_floatify_1(self): 5 | x = '10' 6 | assert floatify(x) == 10.0 7 | 8 | def test_floatify_2(self): 9 | x = '1,030' 10 | assert floatify(x) == 1030.0 11 | 12 | def test_floatify_2(self): 13 | x = '' 14 | out = floatify(x) 15 
| assert out == None, out 16 | x = '#' 17 | out = floatify(x) 18 | assert out == None, out 19 | 20 | def test_floatify_matrix(self): 21 | x = [ 22 | ['1', '2'], 23 | ['abc', '3.0'] 24 | ] 25 | exp = [ 26 | [1.0, 2.0], 27 | ['abc', 3.0] 28 | ] 29 | out = floatify_matrix(x) 30 | assert out == exp 31 | 32 | 33 | class TestMakeSeries: 34 | 35 | def test_make_series(self): 36 | indata = [ [ '1980', '100', '50' ], 37 | [ '1981', '101', '51' ], 38 | [ '1982', '102', '' ], 39 | ] 40 | exp = [ 41 | [ (1980.0, 100.0), (1981.0, 101.0), (1982.0, 102.0) ], 42 | [ (1980.0, 50.0), (1981.0, 51.0) ] 43 | ] 44 | out = make_series(indata, xcol=0, ycols=[1,2]) 45 | assert out == exp, out 46 | 47 | -------------------------------------------------------------------------------- /swiss/tests/test_xls.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | import swiss.tabular 4 | 5 | class TestXlsReader: 6 | 7 | def test_stuff(self): 8 | fo = pkg_resources.resource_stream('swiss', 9 | 'tests/data/xls_reader_test.xls') 10 | reader = swiss.tabular.XlsReader(fo) 11 | tab = reader.read() 12 | assert tab.data[0][0] == 1850 13 | assert tab.data[19][1] == 12.3 14 | 15 | --------------------------------------------------------------------------------