├── .gitignore
├── .hgignore
├── .hgtags
├── README.txt
├── datautil
├── __init__.py
├── cache.py
├── cli.py.command
├── clitools.py
├── date.py
├── deliveranceproxy.py
├── id.py
├── misc.py
├── normalization
│ ├── __init__.py
│ ├── table_based.py
│ └── text.py
├── parse
│ ├── __init__.py
│ └── name.py
├── scrape.py
├── tabular
│ ├── __init__.py
│ ├── base.py
│ ├── gdocs.py
│ ├── html.py
│ ├── misc.py
│ ├── tabular_json.py
│ ├── txt.py
│ └── xls.py
└── tests
│ ├── __init__.py
│ ├── data
│ └── xls_reader_test.xls
│ ├── parse
│ └── test_name.py
│ ├── tabular
│ ├── __init__.py
│ ├── test_base.py
│ ├── test_gdocs.py
│ ├── test_json.py
│ ├── test_misc.py
│ └── test_txt.py
│ ├── test_cache.py
│ ├── test_date.py
│ ├── test_id.py
│ ├── test_misc.py
│ └── test_xls.py
├── setup.py
└── swiss
├── __init__.py
├── cache.py
├── clitools.py
├── date.py
├── deliveranceproxy.py
├── id.py
├── misc.py
├── parse
├── __init__.py
└── name.py
├── tabular
├── __init__.py
├── base.py
├── gdocs.py
├── html.py
├── misc.py
├── tabular_json.py
├── txt.py
└── xls.py
└── tests
├── __init__.py
├── data
└── xls_reader_test.xls
├── parse
└── test_name.py
├── tabular
├── __init__.py
├── test_base.py
├── test_gdocs.py
├── test_json.py
├── test_misc.py
└── test_txt.py
├── test_cache.py
├── test_date.py
├── test_id.py
├── test_misc.py
└── test_xls.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .idea/*
3 | *.pyc
4 | docs/build/*
--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | *.egg-info/*
3 | *.pyc
4 | *.swp
5 | *.swo
6 | sandbox/*
7 |
8 | syntax: regexp
9 | ^build$
10 | ^pyenv$
11 |
--------------------------------------------------------------------------------
/.hgtags:
--------------------------------------------------------------------------------
1 | 3e61713892d3525675712a96fbcbc439837151d0 0.1
2 | 5d28eda958146bb213aee67ef89bc04ec5a1e06e 0.2
3 | 99c63b2a432dbfe32f7a9359d3cb8076412aa164 0.3
4 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | Swiss Army Knife for Data Work.
2 |
3 | For details read the main package docstring.
4 |
5 | Open source software licensed under the MIT license.
6 |
7 | ## Install
8 |
9 | 1. Install setuptools
10 |
 11 | 2. Either install directly from PyPI using easy_install:
12 |
13 | $ easy_install datautil
14 |
15 | OR install from the source obtainable from the mercurial repository:
16 |
17 | $ hg clone https://github.com/okfn/datautil
18 |
19 | ## Tests
20 |
 21 | 1. Ensure you have also installed 'xlrd' and 'gdata' (options mentioned
22 | in setup.py) and nose (for running tests):
23 |
24 | $ easy_install nose xlrd gdata
25 |
26 | 2. Run the tests:
27 |
28 | $ nosetests datautil/tests/
29 |
--------------------------------------------------------------------------------
/datautil/__init__.py:
--------------------------------------------------------------------------------
1 | '''Utilities for Data Work
2 | =======================
3 |
4 | The datautil package provides various utilities for working with data:
5 |
6 | * cache: Url caching and scraping
7 | * tabular/*: Processing and transforming tabular data to and from various
8 | formats including csv, json, google spreadsheets, xls
9 | * misc, date: Cleaning up and parsing data especially dates.
 10 | * id: ID generation and shortening
11 | * clitools.py: Command line tools such as creating optparse object and usage
 12 |   from a module or object.
13 | * deliveranceproxy.py: Deliverance proxy helper
14 |
15 |
16 | CHANGELOG
17 | =========
18 |
19 | v0.5 2011-??-??
20 | ---------------
21 |
22 | * Minor improvements to cache
23 |
24 | v0.4 2011-01-05
25 | ---------------
26 |
27 | * Rename swiss to datautil
28 |
29 | v0.3 2010-08-01
30 | ---------------
31 |
32 | * Support for google docs spreadsheets as sources for TabularData
33 | * Improve documentation of date module and add FlexiDate.as_datetime()
34 | * New clitools module incorporating existing cli tools
35 | * deliveranceproxy.py: Deliverance proxy helper for proxying to remote
36 | websites and retheming with deliverance.
37 | * parse/name.py: new (human) name parsing code
38 |
39 | v0.2 2009-10-23
40 | ---------------
41 |
42 | * Extensive refactoring of tabular module/package
43 | * Standardized interface with BaseReader and BaseWriter
44 | * JsonReader and JsonWriter providing json reading and writing
45 | * TxtWriter to support writing to plain text
46 | * Improvements to date parsing (support for circa, 'c.', etc)
47 | * New id module to do 'compression' of uuids using 32 and 64 bit encoding
48 |
49 |
50 | v0.1 2009-06-03
51 | ---------------
52 |
53 | * Bring together existing code (from last 2+ years) into new 'datautil' package
54 | * Url caching and scraping
55 | * Tabular data handling including csv reader/writer, xls reader, latex writer
56 | and associated utilities (such as pivot_table)
57 | * Cleaning and parsing data especially dates (misc and date modules)
58 | '''
# NOTE(review): the docstring changelog above already lists a v0.5 entry --
# confirm whether this should be bumped.
__version__ = '0.4'

# tabular depends on optional third-party packages (xlrd, gdata, ...);
# degrade to None rather than fail at package import time when absent.
try:
    import tabular
except ImportError:
    tabular = None
# re-export the dependency-free core modules at package level
from cache import *
from misc import *
from id import *
68 |
--------------------------------------------------------------------------------
/datautil/cache.py:
--------------------------------------------------------------------------------
1 | '''A local file cache with url retrieving builtin.
2 |
3 | NB: this module has zero dependencies on modules outside of the
4 | standard lib so that it is easily reusable in other libraries and applications
5 | that do not require any other parts of the datautil package.
6 | '''
7 | import urlparse
8 | import urllib
9 | import os
10 | import sys
11 |
12 |
13 | # have to define before Cache as used in classmethod
class _Progress(object):
    '''Reporthook helper for urllib.urlretrieve: print the total size once,
    then one dot per percentage point of download progress.'''

    def __init__(self):
        # block count from the previous callback; -1 means "not started yet"
        self.count = -1

    def dl_progress(self, count, block_size, total_size):
        '''urlretrieve-style reporthook(count, block_size, total_size).'''
        if total_size == 0:  # total_size is weird so return to avoid errors
            return
        if self.count == -1:
            # first callback: announce the overall download size
            print('Total size: %s' % self.format_size(total_size))
        previous = int(self.count*block_size*100/total_size)
        current = int(count*block_size*100/total_size)
        if current > previous:
            # TODO: is this acceptable? Do we want to do something nicer?
            sys.stdout.write('.')
            sys.stdout.flush()
        self.count = count

    def format_size(self, bytes):
        '''Render a byte count as a human-readable Mb/Kb/bytes string.'''
        if bytes > 1000*1000:
            return '%.1fMb' % (bytes/1000.0/1000)
        if bytes > 10*1000:
            return '%iKb' % (bytes/1000)
        if bytes > 1000:
            return '%.1fKb' % (bytes/1000.0)
        return '%ibytes' % bytes
40 |
41 |
class Cache(object):
    '''A local file cache (and url retriever).
    '''

    def __init__(self, path='.'):
        '''
        @param path: path to cache (defaults to current directory); created
            (with intermediate directories) if it does not yet exist.
        '''
        self.path = path
        if not os.path.exists(self.path):
            os.makedirs(path)

    def retrieve(self, url, overwrite=False):
        '''Retrieve url into cache and return the local path to it.

        :param url: url to retrieve.
        :param overwrite: re-download even if already cached (default False).
        :return: path to file retrieved.
        '''
        dest = self.cache_path(url)
        self.download(url, dest, overwrite)
        return dest

    def cache_path(self, url):
        '''Local path for url within cache.'''
        return os.path.join(self.path, self.basename(url))

    def filepath(self, url):
        '''Deprecated: use cache_path'''
        return self.cache_path(url)

    def stream(self, url):
        '''Open file object for the cached copy of url, or None if not cached.'''
        fp = self.cache_path(url)
        if not os.path.exists(fp):
            return None
        else:
            return open(fp)

    @classmethod
    def basename(cls, url):
        '''Filename under which url is stored in the cache.'''
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        result = path.split('/')[-1]
        if query:
            # escape '/' as otherwise path problems
            # NOTE(review): '%47' is not the percent-encoding of '/' (that is
            # '%2F'); kept as-is since changing it would rename files in
            # existing caches.
            result += '?' + query.replace('/', '%47')
        return result

    @classmethod
    def download(cls, url, dest, overwrite=False):
        '''Download a file from a url.

        :param url: the source url
        :param dest: the destination path to save to.
        :param overwrite: overwrite destination file if it exists (defaults to
        False).
        '''
        url = url.encode('utf-8')
        if not os.path.exists(dest) or overwrite:
            print('Retrieving %s' % url)
            prog = _Progress()
            urllib.urlretrieve(url, dest, reporthook=prog.dl_progress)
        else:
            print('Skipping download as dest already exists: %s' % url)

    # for backwards compatability
    @classmethod
    def dl(cls, url, dest=None):
        '''Deprecated: use download.

        Fix: dest=None used to be passed straight through to download, which
        crashed on os.path.exists(None); it now defaults to the url's
        basename in the current directory.
        '''
        if dest is None:
            dest = cls.basename(url)
        return cls.download(url, dest)
111 |
112 |
--------------------------------------------------------------------------------
/datautil/cli.py.command:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import optparse
4 | import logging
5 | from StringIO import StringIO
6 | import traceback
7 | import time
8 |
# Shared top-level option parser: every Command copies these global options
# onto its own parser (machinery adapted from pip; note this file's
# '.command' suffix means it is currently parked, not importable).
parser = optparse.OptionParser()

parser.add_option(
    '-v', '--verbose',
    dest='verbose',
    action='count',
    default=0,
    help='Give more output')
parser.add_option(
    '-q', '--quiet',
    dest='quiet',
    action='count',
    default=0,
    help='Give less output')
23 |
class Command(object):
    '''Base class for CLI subcommands (adapted from pip's command machinery).

    Subclasses set `name`, `usage` and `default_parser` and implement
    run(options, args).
    NOTE(review): several names used below (logger, format_exc,
    open_logfile_append) are not defined anywhere in this file -- it appears
    half-ported from pip and is parked under a '.command' suffix.
    '''
    # subcommand name as typed on the command line
    name = None
    usage = None
    # parser whose global options are copied onto each subcommand's parser
    default_parser = None
    # registry of instantiated commands
    # NOTE(review): initialised as a list but indexed by name in __init__
    # (all_commands[self.name] = self) -- presumably this should be {}.
    all_commands = []

    def __init__(self):
        assert self.name
        self.parser = optparse.OptionParser(
            usage=self.usage,
            prog='%s %s' % (sys.argv[0], self.name),
            version=parser.version)
        # copy the shared global options over (skip -h/--version style ones)
        for option in self.default_parser.option_list:
            if not option.dest:
                # -h, --version, etc
                continue
            self.parser.add_option(option)
        Command.all_commands[self.name] = self

    def merge_options(self, initial_options, options):
        # overlay global option values onto the subcommand's options
        for attr in ['log']:
            setattr(options, attr, getattr(initial_options, attr) or getattr(options, attr))
        options.quiet += initial_options.quiet
        options.verbose += initial_options.verbose

    def main(self, complete_args, args, initial_options):
        '''Parse `args`, run the command, log any failure, then sys.exit.'''
        options = initial_options
        # NOTE(review): the subcommand's own parsed options are discarded
        discarded_options, args = self.parser.parse_args(args)
        # From pip but not needed by us I think
        # self.merge_options(initial_options, options)
        self.options = options
        self.verbose = options.verbose

        # verbosity level; computed but currently unused
        level = 1
        level += options.verbose
        level -= options.quiet
        complete_log = []
        if options.log:
            log_fp = open_logfile_append(options.log)
            logger.consumers.append((logger.DEBUG, log_fp))
        else:
            log_fp = None

        exit = 0
        try:
            self.run(options, args)
        except:
            logger.fatal('Exception:\n%s' % format_exc())
            exit = 2

        if log_fp is not None:
            log_fp.close()
        if exit:
            # on failure, dump the (currently always empty) log to a file
            log_fn = 'datapkg-log.txt'
            text = '\n'.join(complete_log)
            # Not sure we need to tell people ...
            # logger.fatal('Storing complete log in %s' % log_fn)
            log_fp = open_logfile_append(log_fn)
            log_fp.write(text)
            log_fp.close()
        sys.exit(exit)
85 |
--------------------------------------------------------------------------------
/datautil/clitools.py:
--------------------------------------------------------------------------------
1 | '''Expose methods or functions as commands on the command line
2 |
3 | Example usage::
4 |
5 | # in your code
6 | from datautil.clitools import _main
7 | if __name__ == '__main__':
8 | # expose everything in current module
9 | _main(locals())
10 | # or if you have an object MyObject with methods you want to expose
11 | _main(MyObject)
12 | '''
13 | import os
14 | import sys
15 | import optparse
16 | import inspect
17 |
def _object_methods(obj):
    '''Return {name: method} for obj's methods, skipping _private ones.'''
    all_methods = inspect.getmembers(obj, inspect.ismethod)
    public = [pair for pair in all_methods if not pair[0].startswith('_')]
    return dict(public)
23 |
def _module_functions(functions):
    '''Filter a namespace dict down to its public (non-underscore) functions.'''
    return dict(
        (name, obj) for name, obj in dict(functions).items()
        if inspect.isfunction(obj) and not name.startswith('_')
    )
30 |
def _main(functions_or_object):
    '''Expose callables as CLI actions and dispatch on sys.argv.

    @param functions_or_object: either a class (its public methods become
        actions; it is instantiated with no arguments before dispatch) or a
        namespace dict such as locals() (its public functions become actions).
    Exits with status 1 (after printing help) on a missing/unknown action.
    '''
    isobject = inspect.isclass(functions_or_object)
    if isobject:
        _methods = _object_methods(functions_or_object)
    else:
        _methods = _module_functions(functions_or_object)

    # usage text lists each action with the first line of its docstring
    usage = '''%prog {action}

Actions:
    '''
    usage += '\n '.join(
        [ '%s: %s' % (name, m.__doc__.split('\n')[0] if m.__doc__ else '') for (name,m)
        in sorted(_methods.items()) ])
    parser = optparse.OptionParser(usage)
    # Optional: for a config file
    # parser.add_option('-c', '--config', dest='config',
    #         help='Config file to use.')
    options, args = parser.parse_args()

    if not args or not args[0] in _methods:
        parser.print_help()
        sys.exit(1)

    method = args[0]
    # remaining argv entries are passed through as positional arguments
    if isobject:
        getattr(functions_or_object(), method)(*args[1:])
    else:
        _methods[method](*args[1:])
60 |
# only the _main helper is part of the public API
__all__ = [ '_main' ]

# running this file directly exposes its own functions as CLI actions
if __name__ == '__main__':
    _main(locals())
65 |
66 |
--------------------------------------------------------------------------------
/datautil/date.py:
--------------------------------------------------------------------------------
1 | """
2 | Date parsing and normalization utilities based on FlexiDate.
3 |
4 | To parse dates use parse(), e.g.::
5 |
6 | from datautil.date import parse
7 |
8 | parse('1890') -> FlexiDate(year=u'1890')
  9 |     parse('1890?') -> FlexiDate(year=u'1890', qualifier='Uncertainty : 1890?')
10 |
11 | Once you have a FlexiDate you can get access to attributes (strings of course
12 | ...)::
13 |
14 | fd = parse('Jan 1890')
15 | fd.year # u'1890'
16 | fd.month # u'01'
17 |
18 | And convert to other forms::
19 |
20 | fd.as_float() # 1890
21 | fd.as_datetime() # datetime(1890,01,01)
22 |
23 | Background
24 | ==========
25 |
26 | FlexiDate is focused on supporting:
27 |
28 | 1. Dates outside of Python (or DB) supported period (esp. dates < 0 AD)
29 | 2. Imprecise dates (c.1860, 18??, fl. 1534, etc)
30 | 3. Normalization of dates to machine processable versions
31 | 4. Sortable in the database (in correct date order)
32 |
33 | For more information see:
34 |
35 | `Flexible Dates in Python (including BC) `_
36 |
37 | --------------------
38 |
39 | """
40 | import re
41 | import datetime
42 |
class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
        * Allow a trailing qualifiers e.g. fl.
        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
          can do 18??

    Restriction on ISO8601:
        * Truncation (e.g. of centuries) is *not* permitted.
        * No week and day representation e.g. 1999-W01
    """

    def __init__(self, year=None, month=None, day=None, qualifier=''):
        '''
        @param year/month/day: int or string; stored as zero-padded strings
            (year to 4 chars, month/day to 2).
        @param qualifier: free-text qualifier, e.g. "Uncertainty : 1890?".
        '''
        # force = month or day or qualifier
        force = False
        self.year = self._cvt(year, rjust=4, force=force)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        '''Normalize a date component to a zero-padded string ('' if empty).'''
        if val:
            tmp = unicode(val).strip()
            # keep a leading '-' (BC years) outside the zero padding
            if tmp.startswith('-'):
                tmp = '-' + tmp[1:].rjust(rjust, '0')
            else:
                tmp = tmp.rjust(rjust, '0')
            return tmp
        elif force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        else:
            return ''

    def __str__(self):
        out = self.isoformat()
        if self.qualifier:
            # leading space is important as ensures when no year sort in right
            # order as ' ' < '1'
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        WARNING: does not replace '?' in dates unless strict=True.
        '''
        out = self.year
        # what do we do when no year ...
        for val in [ self.month, self.day ]:
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # Pattern for parsing the output of __str__ / isoformat back into parts.
    # FIX: the named groups had been stripped from this pattern (leaving
    # invalid '(?P ...)' constructs that made re.compile raise at import
    # time); restored to match the out.group(...) calls in from_str below.
    our_re_pat = '''
        (?P<year> -?[\d?]+)
        (?:
                \s* - (?P<month> [\d?]{1,2})
                (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier> [^]]*) \])?
    '''
    our_re = re.compile(our_re_pat, re.VERBOSE)

    @classmethod
    def from_str(cls, instr):
        '''Undo affect of __str__'''
        if not instr:
            return FlexiDate()

        out = cls.our_re.match(instr)
        if out is None:  # no match TODO: raise Exception?
            return None
        else:
            return FlexiDate(
                out.group('year'),
                out.group('month'),
                out.group('day'),
                qualifier=out.group('qualifier')
            )

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0

        @return: float, or None when there is no year.
        '''
        if not self.year: return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
        if self.day:
            out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)
161 |
162 |
def parse(date, dayfirst=True):
    '''Parse a `date` into a `FlexiDate`.

    @param date: the date to parse - may be a string, datetime.date,
    datetime.datetime or FlexiDate.
    @param dayfirst: for ambiguous strings like 01/02/2003, treat the first
        number as the day (handed through to dateutil).

    TODO: support for quarters e.g. Q4 1980 or 1954 Q3
    TODO: support latin stuff like M.DCC.LIII
    TODO: convert '-' to '?' when used that way
          e.g. had this date [181-]
    '''
    if not date:
        return None
    if isinstance(date, FlexiDate):
        # nothing to do
        return date
    if isinstance(date, int):
        return FlexiDate(year=date)
    if isinstance(date, datetime.date):
        # covers datetime.datetime too (it subclasses datetime.date)
        return PythonDateParser().parse(date)
    # assume it is a string
    out = DateutilDateParser().parse(date, dayfirst=dayfirst)
    if out is not None:
        return out
    # unparseable: keep the raw input in the qualifier instead of raising
    # msg = 'Unable to parse %s' % date
    # raise ValueError(date)
    val = 'UNPARSED: %s' % date
    val = val.encode('ascii', 'ignore')
    return FlexiDate(qualifier=val)
193 |
194 |
class DateParserBase(object):
    '''Common interface for date parsers: subclasses implement parse().'''

    def norm(self, date):
        '''Parse `date` and return its normalized string form.'''
        return str(self.parse(date))

    def parse(self, date):
        '''Parse `date` into a FlexiDate. Abstract.'''
        raise NotImplementedError
201 |
class PythonDateParser(object):
    '''Convert python datetime.date / datetime.datetime objects to FlexiDate.

    NOTE(review): does not inherit from DateParserBase (so lacks norm());
    presumably an oversight -- confirm before changing.
    '''
    def parse(self, date):
        # date must expose year/month/day attributes (datetime.date does)
        return FlexiDate(date.year, date.month, date.day)
205 |
# Optional dependency: dateutil supplies the actual parsing engine.
try:
    import dateutil.parser
    dateutil_parser = dateutil.parser.parser()
except:
    # bare except keeps this module importable without dateutil installed;
    # DateutilDateParser.parse then degrades to returning None
    dateutil_parser = None
211 |
class DateutilDateParser(DateParserBase):
    '''Parser built on dateutil's low-level _parse, with extra handling for
    BC years, 'circa' prefixes and trailing-'?' uncertainty markers.'''
    # matches strings consisting solely of digits
    _numeric = re.compile("^[0-9]+$")
    def parse(self, date, **kwargs):
        '''
        :param **kwargs: any kwargs accepted by dateutil.parse function.
        '''
        qualifiers = []
        # dateutil is optional; without it we simply cannot parse
        if dateutil_parser is None:
            return None
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates
        # NOTE(review): the "'B.C.' in date" test can no longer fire --
        # 'B.C.' was rewritten to 'BC' two lines above.
        if date.startswith('-') or 'BC' in date or 'B.C.' in date:
            pre0AD = True
        else:
            pre0AD = False
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: expressed as [c|ca|cca|circ|circa] with or without an appended period
        # and with or without a space, followed by a date
        # 'c.1950' or 'c1950' 'ca. 1980' 'circ 198?' 'cca. 1980' 'c 1029' 'circa 1960' etc.
        # see http://en.wikipedia.org/wiki/Circa
        # TODO: dates like 'circa 178?' and 'circa 178-' fail poorly
        # 'UNPARSED: circa 178?' / u"Note 'circa' : circa 178-"


        # note that the match deliberately does not capture the circa text match
        # this is done to remove circa bit below
        #circa_match = re.match('([^a-zA-Z]*)c\.?\s*(\d+.*)', date)

        # use non-matching groups (?:) to avoid refactoring the rest of the parsing
        circa_match = re.match(r'([^a-zA-Z]*)(?:circa|circ\.?|cca\.?|ca\.?|c\.?)(?:\s*?)([\d\?-]+\s?\?*)', date)

        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            #date = ''.join(circa_match.groups())
            # if an element in circa_match.groups() is None, an exception is thrown
            # so instead join the match groups from circa_match that are not none
            date = ''.join(list(el for el in circa_match.groups() if el))

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records
        p_match = re.match("^p(\d+)", date)
        if p_match:
            # strip the leading 'p' and parse the remainder
            date = date[1:]

        # Deal with uncertainty: '1985?'
        uncertainty_match = re.match('([0-9xX]{4})\?', date)
        if uncertainty_match:
            # remove the ?
            date = date[:-1]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        #Note: Years of less than 3 digits not interpreted by
        # dateutil correctly
        # e.g. 87 -> 1987
        # 4 -> day 4 (no year)
        # Both cases are handled in this routine
        if res.year is None and res.day:
            year = res.day
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff
        if pre0AD:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            # e.g. "Note 'circa', Uncertainty : c.198?"
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
304 |
305 |
--------------------------------------------------------------------------------
/datautil/deliveranceproxy.py:
--------------------------------------------------------------------------------
1 | '''Use deliverance_ for proxying and re-theming.
2 |
3 | .. _deliverance: http://packages.python.org/Deliverance/
4 |
5 | Usage requirements (in pip-requirements.txt format)::
6 |
7 | # suggest installing lxml directly
8 | lxml
9 | deliverance>=0.3a
10 | # for urlmap and proxy
11 | paste
12 | # for Response
13 | webob
14 |
15 | Example usage::
16 |
17 | dest = 'http://myremotes.ite/'
18 | mytheme = '....'
19 | my_deliverance_rules = ' ...'
20 | # or
21 | # my_deliverance_rules = open('/my/path/to/rules.xml').read()
 22 |     deliverance_proxy = create_deliverance_proxy(dest, mytheme,
23 | my_deliverance_rules)
24 |
25 | # from in wsgi app
26 | # path on remote destination url you want to proxy to ...
27 | # you can omit this if local path and remote path are the same
28 | environ['PATH_INFO'] = '/my_destination_path'
29 | deliverance_proxy(environ, start_response)
30 | '''
31 | import logging
32 |
33 | import paste.urlmap
34 | import deliverance.middleware
35 | import paste.proxy
36 | from webob import Request, Response
37 | from deliverance.middleware import DeliveranceMiddleware, SubrequestRuleGetter
38 | from deliverance.log import PrintingLogger
39 |
40 |
41 | default_deliverance_rules = \
42 | '''
43 |
44 |
45 |
47 |
48 |
49 |
50 |
51 |
54 |
55 |
56 | '''
57 |
def create_deliverance_proxy(proxy_base_url, theme_html, rules_xml=None):
    '''Proxy to another url with re-theming using deliverance.

    Based on http://rufuspollock.org/code/deliverance

    :param proxy_base_url: base destination url we are proxying to.
    :param theme_html: string providing html theme to use for re-themeing.
    :param rules_xml: (optional) deliverance rules xml as a string. If not
        provided use `default_deliverance_rules`. For info on rulesets see
        deliverance docs. We require that ruleset support a single
        substitution string '%s' which is used to insert internal mountpoint
        for the them ('/_deliverance_theme.html').
    :return: a DeliveranceMiddleware WSGI application.
    '''
    theme_url = '/_deliverance_theme.html'
    # use a urlmap so we can mount theme and urlset
    app = paste.urlmap.URLMap()
    # set up theme consistent with our rules file
    app[theme_url] = Response(theme_html)

    if rules_xml:
        rules = rules_xml
    else:
        rules = default_deliverance_rules
    # NOTE(review): '%s' substitution is applied to caller-supplied rules_xml
    # too, so custom rulesets must contain exactly one '%s'.
    rules = rules % theme_url
    app['/_deliverance_rules.xml'] = Response(rules, content_type="application/xml")

    # minimal WSGI proxy that forwards to proxy_base_url and decompresses
    # the response body so deliverance can re-theme plain html
    class MyProxy(object):
        def __init__(self, proxy_base_url):
            self.proxy = paste.proxy.Proxy(proxy_base_url)

        def __call__(self, environ, start_response):
            req = Request(environ)
            res = req.get_response(self.proxy)
            res.decode_content()
            return res(environ, start_response)

    app['/'] = MyProxy(proxy_base_url)
    deliv = DeliveranceMiddleware(app, SubrequestRuleGetter('/_deliverance_rules.xml'),
            PrintingLogger,
            log_factory_kw=dict(print_level=logging.WARNING))
    return deliv
99 |
100 |
--------------------------------------------------------------------------------
/datautil/id.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import uuid
3 |
def compress_uuid(_uuid):
    '''Provided shortened string representation of UUID via base64 encoding
    (with '_' and '-' as the two non-alphanumeric characters).

    @return: 22 character base64 encoded version of UUID.
    '''
    if isinstance(_uuid, basestring):
        _uuid = uuid.UUID(_uuid)
    # b64 of the 16 raw bytes is 24 chars; the final '==' padding carries
    # no information, so drop it
    return base64.b64encode(_uuid.bytes, '_-')[:22]
14 |
def uncompress_uuid(b64_encoded):
    '''Reverse compress_uuid

    @return: 36 char str representation of uuid.
    '''
    padded = str(b64_encoded)
    # restore the '==' padding that compress_uuid threw away
    if not padded.endswith('=='):
        padded += '=='
    raw = base64.b64decode(padded, '_-')
    return str(uuid.UUID(bytes=raw))
26 |
27 |
import struct

def int_to_b32(int_):
    '''Encode a (native-endian) 32-bit int as base32 without the pad char.'''
    packed = struct.pack('1i', int_)
    encoded = base64.b32encode(packed)
    # 4 bytes -> 7 base32 chars plus exactly one trailing '=' of padding
    return encoded[:-1]
34 |
def b32_to_int(b32):
    '''Reverse int_to_b32: decode 7 base32 chars back into the packed int.'''
    # re-append the single '=' pad stripped by int_to_b32
    raw = base64.b32decode(b32 + '=', casefold=True)
    return struct.unpack('1i', raw)[0]
39 |
40 |
--------------------------------------------------------------------------------
/datautil/misc.py:
--------------------------------------------------------------------------------
# TODO: create a strict option where None is returned on failed convert rather
# than original value
# values treated as "missing" rather than convertible
placeholders = ['', '-', '#']

def floatify(value):
    '''Convert value to a float if possible.

    @return: Floatified value. If value is blank or placeholder ('-') return
    None. Can deal with ',' in value. Will also floatify dates. If nothing
    works returns original value.
    '''
    if value is None:
        return None
    if isinstance(value, basestring):
        stripped = value.strip()
        if not stripped or stripped in placeholders:
            return None
        # numbers often carry thousands separators, e.g. '1,030'
        try:
            return float(value.replace(',', ''))
        except:
            pass
    # fall back to date conversion (returns value unchanged on failure)
    return date_to_float(value)
27 |
def floatify_matrix(matrix):
    '''Apply floatify to every cell of a row-major matrix of values.'''
    return [[floatify(cell) for cell in row] for row in matrix]
30 |
# TODO: remove/convert to using date.FlexiDate.as_float()
import datetime

def date_to_float(date):
    '''Convert a date to float.

    Accepts either a date object or a string parseable to a date object

    @return: converted value or original if conversion fails
    '''
    import dateutil.parser
    if isinstance(date, basestring):
        # fast path: a plain year like '1890'
        try:
            return float(date)
        except:
            pass
        try:
            val = dateutil.parser.parse(date, default=datetime.date(1,1,1))
        except:
            return date
    else:
        val = date

    if isinstance(val, datetime.date):
        # crude fraction: months/days treated as equal-length slices of a year
        fval = val.year + val.month / 12.0 + val.day / 365.0
        return round(fval, 3)
    return val
58 |
def make_series(matrix, xcol, ycols=None):
    '''Take a matrix and return series (i.e. list of tuples) corresponding to
    specified column indices.

    E.g. if matrix is:
        [ [1,2,3,4]
          [5,6,7,8] ]

    and xcol = 0, ycols=[1,3] then output is:

        [
        [ [1,2], [5,6] ],
        [ [1,4], [5,8] ],
        ]

    @param xcol: index of the column supplying x values.
    @param ycols: indices of the y columns; if not given every column except
        xcol is used.
    @return: list of series, with pairs containing blank/placeholder values
        filtered out.
    '''
    cols = zip(*matrix)
    if ycols is None:
        ycols = range(len(cols))
        del ycols[xcol]
    cols = floatify_matrix(cols)

    def usable(value):
        # reject missing values and common placeholder strings
        if value is None:
            return False
        return str(value) not in ['', '-']

    def usable_pair(pair):
        return usable(pair[0]) and usable(pair[1])

    xdata = cols[xcol]
    ydata = [cols[idx] for idx in ycols]
    return [filter(usable_pair, zip(xdata, col)) for col in ydata]
96 |
97 |
--------------------------------------------------------------------------------
/datautil/normalization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/normalization/__init__.py
--------------------------------------------------------------------------------
/datautil/normalization/table_based.py:
--------------------------------------------------------------------------------
1 | import gdata.spreadsheet.text_db
2 |
def _transform_key(key):
    '''Normalize a lookup key: lowercase it, then strip surrounding whitespace.'''
    lowered = key.lower()
    return lowered.strip()
5 |
class Normalizer(object):
    '''Lookup/insert table of normalized values backed by a Google
    spreadsheet (gdata text_db). Logs in at construction time (network).'''

    def __init__(self, username, password, doc_id, sheet, key_row):
        self.client = gdata.spreadsheet.text_db.DatabaseClient(
            username=username, password=password)
        self._get_table(doc_id, sheet)
        # name of the column holding the normalized key
        self.key_row = key_row
        # lazy cache of row content dicts; filled on first .records access
        self._records = None

    @property
    def records(self):
        # all row content dicts, fetched once and cached
        if self._records is None:
            self._records = [r.content for r in self.table.FindRecords('')]
        return self._records

    def _get_table(self, doc_id, sheet):
        # resolve the spreadsheet and the named worksheet
        db = self.client.GetDatabases(doc_id)[0]
        self.table = db.GetTables(name=sheet)[0]
        self.table.LookupFields()

    def keys(self):
        # all non-empty values of the key column
        return set([r.get(self.key_row) for r in self.records \
                if r.get(self.key_row) is not None])

    def __contains__(self, item):
        return item in self.keys()

    def get(self, key, source_hint=None):
        # Return the record dict for key, inserting a new (mostly empty) row
        # when the key is unknown. A None key yields {}.
        if key is None:
            return {}
        record = self.lookup(key)
        if record:
            return record
        return self.add(_transform_key(key), source_hint).content

    def lookup(self, key):
        # Return the cached record whose key column equals the normalized
        # key; {} for a None key, implicit None when nothing matches.
        if key is None:
            return {}
        local_key = _transform_key(unicode(key))
        for record in self.records:
            # TODO #1: figure out FindRecords syntax
            # TODO #2: fuzzy matching for longer keys
            if record.get(self.key_row) == local_key:
                return record


    def add(self, value, source_hint):
        # Append a row with only the key (and optional 'source') populated.
        # NOTE(review): assumes .records has been accessed already (as via
        # get/lookup); otherwise _records is still None and append fails.
        fields = self.table.fields
        row = dict(zip(fields, [None] * len(fields)))
        row[self.key_row] = value
        if source_hint is not None:
            row['source'] = source_hint
        self._records.append(row)
        return self.table.AddRecord(row)
60 |
class NormalizerJoin(object):
    '''Chain two normalizers: keys resolve through `first`, and any result
    carrying `second.key_row` is enriched with the matching row of `second`.
    '''

    def __init__(self, first, second):
        self.first = first
        self.second = second

    def get(self, key, source_hint=None):
        '''Resolve `key`; direct hits on `second` short-circuit the chain.'''
        if key in self.second:
            return self.second.get(key)
        data = self.first.get(key, source_hint=source_hint)
        link = self.second.key_row
        if link in data:
            extra = self.second.get(data.get(link))
            data.update(extra)
        return data
74 |
def Licenses(username, password):
    '''Build the license normaliser: raw 'Forms' strings joined to 'Licenses'
    codes in a fixed Google Docs spreadsheet.
    '''
    doc_id = 'thlRT-WO0EVweyjiwtYLslA'
    forms = Normalizer(username, password, doc_id, 'Forms', 'original')
    codes = Normalizer(username, password, doc_id, 'Licenses', 'code')
    return NormalizerJoin(forms, codes)
80 |
def Formats(username, password):
    '''Build the format normaliser: raw 'Forms' strings joined to 'Formats'
    mimetypes in a fixed Google Docs spreadsheet.
    '''
    doc_id = 'tO-VTk7QwloOt0EP3YpCC4A'
    forms = Normalizer(username, password, doc_id, 'Forms', 'original')
    mimetypes = Normalizer(username, password, doc_id, 'Formats', 'mimetype')
    return NormalizerJoin(forms, mimetypes)
86 |
87 |
88 |
--------------------------------------------------------------------------------
/datautil/normalization/text.py:
--------------------------------------------------------------------------------
1 | import re
2 | import unicodedata
3 | import string
4 |
# Collapses a run of one or more '-' into a single dash (used by url_slug).
KILL_DASHES = re.compile("\\-+")
6 |
def compose(text):
    '''Return `text` in Unicode normal form NFKC (compatibility composed).'''
    form = 'NFKC'
    return unicodedata.normalize(form, text)
9 |
def decompose(text):
    '''Return `text` in Unicode normal form NFKD (compatibility decomposed).'''
    form = 'NFKD'
    return unicodedata.normalize(form, text)
12 |
def recompose(text):
    '''Decompose then re-compose `text`, canonicalising mixed forms.'''
    decomposed = decompose(text)
    return compose(decomposed)
15 |
def url_slug(text):
    """ Convert arbitrary text to something that can be a url slug. """
    keep_punct = ('-', '.', '+', '_')
    chars = []
    # NFKD-decompose first so accented letters split into an ASCII letter
    # plus combining marks, which are simply dropped below.
    for ch in decompose(text):
        category = unicodedata.category(ch)[0].upper()
        if category == 'Z':
            # any kind of separator (space etc.) becomes a dash
            chars.append('-')
        if ch in string.ascii_letters or ch in string.digits:
            chars.append(ch)
        if ch in keep_punct:
            chars.append(ch)
    slug = u"".join(chars).lower()
    return KILL_DASHES.sub('-', slug)
29 |
30 |
31 |
--------------------------------------------------------------------------------
/datautil/parse/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/parse/__init__.py
--------------------------------------------------------------------------------
/datautil/parse/name.py:
--------------------------------------------------------------------------------
1 | '''Parse names of people into a standard format.'''
2 |
3 | import re
4 |
# Honorific/title tokens recognised by NameParserBase.untitlize; membership
# is an exact, case-sensitive comparison against these entries.
titles = [
    u'Ayatollah',
    u'Baron',
    u'Bishop',
    u'Dame',
    u'Dr',
    u'Fr',
    u'Graf',
    u'King',
    u'Lady',
    u'Maj',
    u'Major',
    u'Mrs',
    u'Prof',
    u'Rev',
    u'Sir',
    u'St',
    ]
23 |
class Name(object):
    '''A name of a person or entity.

    Not a domain object but a convenient way to handle/parse names.

    Attributes:
        title
        ln: last name
        firstnames: first names as list
    '''
    def __init__(self, ln='', fns=None, title=''):
        self.ln = ln
        # default of None (not []) so instances never share one mutable list
        self.fns = [] if fns is None else fns
        self.title = title

    def norm(self):
        '''Return normalised name string (LastFirst format)
        '''
        return name_tostr(self)

    def __str__(self):
        '''Display name using normalised format
        '''
        return self.norm()
49 |
class NameParserBase(object):
    '''Shared machinery for converting between name strings and `Name`s.

    Subclasses implement `_toparts` (string -> Name) and `tostr`
    (Name -> string).
    '''
    # matches a word of 2+ characters followed by '.', so 'Bosch.' loses its
    # dot but single initials like 'J.' are untouched
    regex_remove_fullstops = re.compile(r'(\w{2,})\.(\W|$)', re.UNICODE)

    def parse(self, fullname):
        '''Parse the `fullname` string into a `Name` object.

        @return: `Name` object for `fullname` (an empty `Name` for
            None/blank input).
        '''
        if fullname is None:
            return Name()
        fullname = unicode(fullname.strip())
        if not fullname:
            return Name()

        # remove words ending '.', e.g. 'Bosch.'
        fullname = self.regex_remove_fullstops.sub(r'\1\2', fullname)

        # make sure initials are separated by ' '
        # but first deal with special edge case like [Major.]
        # fullname = fullname.replace('.]', ']')
        fullname = fullname.replace('.', '. ')
        name = self._toparts(fullname)
        name.ln = self.normcase(name.ln)
        name.fns = [ self.normcase(x) for x in name.fns ]
        name.title = self.normcase(name.title)
        return name

    def _toparts(self, fullname):
        '''Implement in inheriting classes, called by parse.
        '''
        raise NotImplementedError()

    def tostr(self, name):
        '''Convert name object back into a string.
        '''
        raise NotImplementedError()

    def normcase(self, name):
        '''Normalise case: all-upper or all-lower input is capitalized.

        Mixed-case input (e.g. McTaggart) is left alone; None becomes ''.
        '''
        # useful to handle none and you often get this from regexes
        if name is None:
            return ''
        name = name.strip()
        if name.upper() == name or name.lower() == name:
            return name.capitalize()
        # avoid issues with e.g. McTaggart
        else:
            return name

    def untitlize(self, _str):
        '''Return title contained in _str if a title else return empty string.
        '''
        # Fix: strip whitespace *then* parentheses. The original assigned
        # _str.strip() and immediately overwrote it with _str.strip('()'),
        # so whitespace-padded titles like ' Dr ' were never recognised.
        title = _str.strip().strip('()')
        if title in titles:
            return title
        # always assume something in square brackets is a title
        elif title.startswith('[') and title.endswith(']'):
            return title[1:-1].strip()
        else:
            return ''

    def titlize(self, _str):
        '''Wrap a title in the canonical square-bracket form.'''
        return u'[' + _str + u']'

    def norm(self, date):
        '''Parse the given name string and re-serialise it in normal form.'''
        return str(self.parse(date))
116 |
117 |
class LastFirst(NameParserBase):
    '''Parse and creates names of form:

        lastname, first-names-in-order [title]
    '''
    def _toparts(self, fullname):
        if ',' not in fullname and ' ' in fullname:
            raise ValueError('Expected "," in name: %s' % fullname)
        name = Name()
        # NB: if more than 2 commas just ignore stuff after 2nd one
        parts = fullname.split(',')
        name.ln = parts[0]
        # Fix: a bare single-word name ('Smith') has no comma and passed the
        # guard above, but then raised IndexError on parts[1]; treat it as a
        # last name with no first names.
        name.fns = parts[1].strip().split() if len(parts) > 1 else []
        if name.fns:
            title = self.untitlize(name.fns[-1])
            if title:
                name.title = title
                del name.fns[-1]
        return name

    def tostr(self, name):
        '''Serialise `name` as 'ln, fns [title]' ('' when there is no name).'''
        if not (name.ln or name.fns):
            return ''
        fns = ' '.join(name.fns)
        if not fns:
            out = name.ln
        else:
            out = unicode(', '.join((name.ln, fns)))
        if name.title:
            out = out + u' [%s]' % name.title
        return out
150 |
151 |
class FirstLast(NameParserBase):
    '''Parse and create names of form:

        [title] first-names last-name
    '''
    def _toparts(self, fullname):
        name = Name()
        if ',' in fullname:
            raise ValueError('Should not have "," in FirstLast type name: %s' %
                    fullname)
        parts = fullname.split()
        name.ln = parts[-1]
        name.fns = parts[:-1]
        if name.fns:
            title = self.untitlize(name.fns[0])
            if title:
                name.title = title
                del name.fns[0]
        return name

    def tostr(self, name):
        '''Serialise `name` as '[title] first-names last-name'.'''
        if not (name.fns or name.ln):
            return ''
        # Fix: joining fns and ln together avoids the stray leading space the
        # original produced when there were no first names (' Smith').
        out = u' '.join(name.fns + [name.ln])
        if name.title:
            # Fix: keep a space after the bracketed title so the output
            # round-trips through _toparts (previously '[Dr]John Smith').
            out = u'[%s] ' % name.title + out
        return out
180 |
181 |
def parse_name(fullname):
    '''Parse `fullname`, picking the parser by the presence of a comma.'''
    parser = LastFirst() if ',' in fullname else FirstLast()
    return parser.parse(fullname)
188 |
def name_tostr(name, parser_class=LastFirst):
    '''Serialise a `Name` using `parser_class` (LastFirst by default).'''
    return parser_class().tostr(name)
192 |
def normalize(name_str, parser_class=LastFirst):
    '''Parse `name_str` and re-serialise it via `parser_class`.'''
    return name_tostr(parse_name(name_str), parser_class)
196 |
197 |
198 |
--------------------------------------------------------------------------------
/datautil/scrape.py:
--------------------------------------------------------------------------------
1 | # taken from http://effbot.org/zone/re-sub.htm#unescape-html
2 | import re, htmlentitydefs
3 |
4 | ##
5 | # Removes HTML or XML character references and entities from a text string.
6 | #
7 | # @param text The HTML (or XML) source text.
8 | # @return The plain text, as a Unicode string, if necessary.
9 |
def unescape(text):
    """Replace HTML/XML character references and entities in `text`.

    Decimal ('&#65;') and hex ('&#x41;') character references and named
    entities ('&amp;') are replaced by the corresponding unicode character;
    anything unrecognised is left untouched.

    NB: the '&#' literals had been stripped from this copy of the code;
    restored per the effbot recipe credited at the top of this module.
    """
    # py2/py3-safe aliases (the module-level htmlentitydefs import is py2)
    try:
        _unichr = unichr
    except NameError: # Python 3
        _unichr = chr
    try:
        from htmlentitydefs import name2codepoint
    except ImportError: # Python 3
        from html.entities import name2codepoint

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference (decimal or hex)
            try:
                if text[:3] == "&#x":
                    return _unichr(int(text[3:-1], 16))
                else:
                    return _unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = _unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text # leave as is
    return re.sub("&#?\w+;", fixup, text)
30 |
--------------------------------------------------------------------------------
/datautil/tabular/__init__.py:
--------------------------------------------------------------------------------
1 | from base import *
2 | from misc import *
3 | from xls import XlsReader
4 | from html import *
5 | from tabular_json import JsonReader, JsonWriter
6 | from txt import TxtWriter
7 |
8 |
--------------------------------------------------------------------------------
/datautil/tabular/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Tools for dealing with tabular data
3 | """
4 |
class TabularData(object):
    """Holder for tabular data

    NB:
      * Assume data organized in rows.
      * No type conversion so all data will be as entered.

    Properties:
      * data: data itself provided as array of arrays
      * header: associated header columns (if they exist)

    TODO: handling of large datasets (iterators?)
    """

    def __init__(self, data=None, header=None):
        """Initialize with optional `data` (list of rows) and `header`.

        None (rather than []) is used for the defaults because default
        argument values are evaluated once, at definition time, so a
        mutable [] default would be shared between instances.
        """
        self.data = data if data is not None else []
        self.header = header if header is not None else []

    def __repr__(self):
        # show the header (if any) followed by at most the first 10 rows
        preview = []
        if self.header:
            preview.append(self.header)
        preview.extend(self.data[0:10])
        return repr(preview)

    def __str__(self):
        return self.__repr__()

    def __iter__(self):
        return iter(self.data)

    @classmethod
    def from_list(cls, list_, header=True):
        '''Build a TabularData from a list whose first row is the header.'''
        return TabularData(header=list_[0], data=list_[1:])

    def to_list(self):
        '''Inverse of from_list: header row (if any) followed by the data.'''
        if not self.header:
            return self.data
        return [ self.header ] + self.data
59 |
60 |
class ReaderBase(object):
    '''Base class for tabular readers: resolves a filepath-or-fileobj source
    and records the text encoding subclasses should use to decode it.
    '''
    def __init__(self, filepath_or_fileobj=None, encoding='utf8'):
        '''
        @param filepath_or_fileobj: a path string (opened here) or a
            file-like object (used as is); may be None and supplied later
            via `read`.
        @param encoding: encoding used by subclasses when decoding the file.
        '''
        self.filepath = None
        self.fileobj = None
        self._filepath_or_fileobj(filepath_or_fileobj)
        # Fix: honour the encoding argument -- it was previously ignored and
        # 'utf8' hard-coded.
        self.encoding = encoding

    def _filepath_or_fileobj(self, filepath_or_fileobj):
        if filepath_or_fileobj is None: # do not overwrite any existing value
            pass
        elif isinstance(filepath_or_fileobj, basestring):
            self.filepath = filepath_or_fileobj
            self.fileobj = open(self.filepath)
        else:
            self.filepath = None
            self.fileobj = filepath_or_fileobj

    def read(self, filepath_or_fileobj=None):
        '''(Re)bind the source; subclasses extend this to do the reading.'''
        self._filepath_or_fileobj(filepath_or_fileobj)
80 |
81 |
class WriterBase(object):
    '''
    Extra arguments to write methods:
        has_row_headings: first col of each row is a heading.
    '''
    def __init__(self, round_ndigits=None, **kwargs):
        '''
        @round_ndigits: number of decimal places to use when rounding numerical
        values when textifying for output
        '''
        self.round_ndigits = round_ndigits

    def write(self, tabular_data, fileobj, *args, **kwargs):
        '''Write `tabular_data` to `fileobj` -- implemented by subclasses.'''
        pass

    def write_str(self, tabular_data, *args, **kwargs):
        '''Like `write` but returns the output as a string.'''
        from StringIO import StringIO
        buf = StringIO()
        self.write(tabular_data, buf, *args, **kwargs)
        buf.seek(0)
        return buf.read()

    def value_to_str(self, value):
        '''Convert value to text (rounding floats/ints as necessary).
        '''
        if value is None:
            return ''
        is_number = isinstance(value, (int, float))
        if self.round_ndigits is None or not is_number:
            return unicode(value)
        rounded = round(value, self.round_ndigits)
        if self.round_ndigits <= 0: # o/w will have in .0 at end
            rounded = int(rounded)
        rounded = str(rounded)
        # deal with case when rounding has added unnecessary digits
        if len(str(value)) < len(rounded):
            return str(value)
        return rounded
122 |
123 |
124 | import csv
125 | import codecs
126 | class UTF8Recoder:
127 | """
128 | Iterator that reads an encoded stream and reencodes the input to UTF-8
129 |
130 | From:
131 | """
132 | def __init__(self, f, encoding=None):
133 | if encoding:
134 | self.reader = codecs.getreader(encoding)(f)
135 | else: # already unicode so just return f
136 | self.reader = f
137 |
138 | def __iter__(self):
139 | return self
140 |
141 | def next(self):
142 | return self.reader.next().encode('utf-8')
143 |
class CsvReader(ReaderBase):
    """Read data from a csv file into a TabularData structure

    Note that the csv module does *not* support unicode:

    > This version of the csv module doesn't support Unicode input. Also, there
    > are currently some issues regarding ASCII NUL characters. Accordingly,
    > all input should be UTF-8 or printable ASCII to be safe; see the examples
    > in section 9.1.5. These restrictions will be removed in the future.
    >
    """

    def read(self, filepath_or_fileobj=None, encoding=None, **kwargs):
        """Read in a csv file and return a TabularData object.

        @param fileobj: file like object.
        @param encoding: if set use this instead of default encoding set in
        __init__ to decode the file like object. NB: will check if fileobj
        already in unicode in which case this is ignored.
        @param kwargs: all further kwargs are passed to the underlying `csv.reader` function
        @return tabular data object (all values encoded as utf-8).
        """
        super(CsvReader, self).read(filepath_or_fileobj)
        if encoding:
            self.encoding = encoding
        tabData = TabularData()

        # read the *whole* file as a sample for the Sniffer, then rewind
        # below; NOTE(review): assumes self.fileobj is seekable -- confirm
        # for stream inputs.
        sample = self.fileobj.read()
        # first do a simple test -- maybe sample is already unicode
        if type(sample) == unicode:
            encoded_fo = UTF8Recoder(self.fileobj, None)
        else:
            sample = sample.decode(self.encoding)
            encoded_fo = UTF8Recoder(self.fileobj, self.encoding)
        # at this point sample is unicode either way; Sniffer gets utf-8 bytes
        sample = sample.encode('utf-8')
        sniffer = csv.Sniffer()
        # heuristic guess at whether the first row is a header
        hasHeader = sniffer.has_header(sample)

        # rewind so the csv reader sees the data from the start again
        self.fileobj.seek(0)
        ourkwargs = {
            'skipinitialspace': True
        }
        if kwargs:
            ourkwargs.update(kwargs)

        reader = csv.reader(encoded_fo, **ourkwargs)
        if hasHeader:
            # consume the first row as the header
            tabData.header = reader.next()
        for row in reader:
            tabData.data.append(row)
        return tabData
195 |
196 | # for backwards compatibility
197 | ReaderCsv = CsvReader
198 |
class CsvWriter(WriterBase):
    # TODO: unicode support a la CsvReader
    def write(self, tabular_data, fileobj, encoding='utf-8'):
        '''Write `tabular_data` (header first, if any) as CSV to `fileobj`.

        NB: `encoding` is currently unused (see TODO above).
        '''
        csvwriter = csv.writer(fileobj)
        if tabular_data.header:
            csvwriter.writerow(tabular_data.header)
        csvwriter.writerows(tabular_data.data)
        fileobj.flush()
208 |
209 |
210 | ## --------------------------------
211 | ## Converting to Latex
212 |
class LatexWriter(WriterBase):
    '''Write tabular data as LaTeX table rows (cells joined by &,
    rows terminated by \\\\ and \\hline).
    '''

    def write(self, tabular_data, fileobj, has_row_headings=False):
        '''Write `tabular_data` to `fileobj` as LaTeX.

        @param has_row_headings: treat the first cell of each row as a
            heading (rendered bold).
        '''
        self.has_row_headings = has_row_headings
        # Fix: copy the rows so inserting the header does not mutate the
        # caller's tabular_data.data in place.
        matrix = list(tabular_data.data)
        has_header = len(tabular_data.header) > 0
        if has_header:
            matrix.insert(0, tabular_data.header)
        out = self._write(matrix, has_header)
        # Fix: _write returns None for empty input; write nothing then
        # (previously fileobj.write(None) raised TypeError).
        fileobj.write(out or '')

    def _write(self, matrix, has_header=True):
        if len(matrix) == 0: return
        # no hline on first row as this seems to mess up latex \input
        # http://groups.google.com/group/comp.text.tex/browse_thread/thread/1e1db553a958ebd8/0e590a22cb59f43d
        out = '%s' % self.process_row(matrix[0], has_header)
        for row in matrix[1:]:
            out += self.process_row(row)
        return out

    def process_row(self, row, heading=False):
        '''Render one row; first cell is bold when row headings are on.'''
        if len(row) == 0: return
        out = '%s' % self.process_cell(row[0], heading or self.has_row_headings)
        for cell in row[1:]:
            out += ' & %s' % self.process_cell(cell, heading)
        out += ' \\\\\n\hline\n'
        return out

    def process_cell(self, cell, heading=False):
        '''Render one cell (escaped; headings wrapped in \\textbf).'''
        cell_text = self.value_to_str(cell)
        cell_text = self.escape(cell_text)
        if heading:
            return '\\textbf{%s}' % cell_text
        else:
            return cell_text

    def escape(self, text):
        '''Backslash-escape the LaTeX special characters & and %.'''
        escape_chars = [ '&', '%' ]
        out = text
        for ch in escape_chars:
            out = out.replace(ch, '\\%s' % ch)
        return out
255 |
256 |
257 | # TODO: 2009-08-05 deprecate
def table2latex(matrix, has_header=True, has_row_headings=False):
    '''Render `matrix` (a list of rows) as a LaTeX string.

    Deprecated (2009-08-05): use LatexWriter directly.
    '''
    writer = LatexWriter()
    writer.has_row_headings = has_row_headings
    return writer._write(matrix, has_header)
262 |
263 |
--------------------------------------------------------------------------------
/datautil/tabular/gdocs.py:
--------------------------------------------------------------------------------
1 | '''TabularData from a Google Docs Spreadsheet.
2 | '''
3 | from base import ReaderBase, TabularData
4 | import gdata.spreadsheet.service
5 | import gdata.spreadsheet.text_db
6 |
7 |
class GDocsReaderTextDb(ReaderBase):
    '''Read a google docs spreadsheet using the gdata.spreadsheet.text_db
    library.

    NB: any blank line in spreadsheet will be taken as terminating data.
    '''
    def __init__(self, spreadsheet_id, username=None, password=None,
            id_is_name=False):
        '''
        @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you
        must set id_is_name to True.
        '''
        # do not pass spreadsheet_id down as it will be url or sheet name
        super(GDocsReaderTextDb, self).__init__()
        self.source = spreadsheet_id
        self.id_is_name = id_is_name
        self.gd_client = gdata.spreadsheet.text_db.DatabaseClient(
            username=username, password=password)

    def load_text_db_table(self, sheet_name='Sheet1'):
        '''Load text_db Table object corresponding to specified sheet_name.
        '''
        super(GDocsReaderTextDb, self).read(None)
        if self.id_is_name:
            matches = self.gd_client.GetDatabases(name=self.source)
        else:
            matches = self.gd_client.GetDatabases(spreadsheet_key=self.source)
        assert len(matches) >= 1, 'No spreadsheet of that name/id'
        db = matches[0]
        return db.GetTables(name=sheet_name)[0]

    def read(self, sheet_name='Sheet1'):
        '''Load the specified google spreadsheet worksheet as a L{TabularData}
        object.

        @return L{TabularData} object.
        '''
        text_db_table = self.load_text_db_table(sheet_name)
        text_db_table.LookupFields()
        tdata = TabularData()
        tdata.header = text_db_table.fields
        # finds all records it seems
        for record in text_db_table.FindRecords(''):
            tdata.data.append(
                [record.content[colname] for colname in tdata.header])
        return tdata
59 |
60 |
61 | # not yet working properly (cannot work out ListFeed yet ...)
62 | # textdb is nicer but Spreadsheet allows one to get all cells using CellsFeed
63 | # (even when blank lines) (this is not true when using ListFeed though ...)
64 | # class GDocsReaderSpreadsheet(ReaderBase):
65 | # '''
66 | #
67 | # From Docs for the API:
68 | #
69 | #
70 | # > The list feed contains all rows after the first row up to the first blank
71 | # row. The first blank row terminates the data set. If expected data isn't
72 | # appearing in a feed, check the worksheet manually to see whether there's an
73 | # unexpected blank row in the middle of the data. In particular, if the
74 | # second row of the spreadsheet is blank, then the list feed will contain no
75 | # data.
76 | # '''
77 | # def __init__(self, spreadsheet_id, username=None, password=None,
78 | # id_is_name=False):
79 | # '''
80 | # @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you
81 | # must set id_is_name to True.
82 | # '''
83 | # # do not pass spreadsheet_id down as it will be url or sheet name
84 | # super(GDocsReaderSpreadsheet, self).__init__()
85 | # self.source = spreadsheet_id
86 | # self.id_is_name = id_is_name
87 | # self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
88 | # self.gd_client.email = username
89 | # self.gd_client.password = password
90 | #
91 | # def read(self, sheet_index=0):
92 | # '''Load the specified google spreadsheet worksheet as a L{TabularData}
93 | # object.
94 | #
95 | # @return L{TabularData} object.
96 | # '''
97 | # super(GDocsReaderSpreadsheet, self).read(None)
98 | # self.gd_client.source = self.source
99 | # self.gd_client.ProgrammaticLogin()
100 | # if self.id_is_name:
101 | # feed = self.gd_client.GetSpreadsheetsFeed()
102 | # # no len on feed ...
103 | # # assert len(feed) > 0, 'No spreadsheets found for: %s' % self.source
104 | # spreadsheet_id = feed.entry[0].id.text.split('/')[-1]
105 | # else:
106 | # spreadsheet_id = self.source
107 | # sheetfeed = self.gd_client.GetWorksheetsFeed(spreadsheet_id)
108 | # wrksht_id = sheetfeed.entry[sheet_index].id.text.split('/')[-1]
109 | # row_feed = self.gd_client.GetListFeed(spreadsheet_id, wrksht_id)
110 | #
111 | # tdata = TabularData()
112 | # # tdata.header
113 | # # how do we get rows rather than just all the cells?
114 | # for i, entry in enumerate(row_feed.entry):
115 | # print entry.content['col1']
116 | # print entry.content
117 | # tdata.data.append([entry.content.text])
118 | # return tdata
119 |
120 |
--------------------------------------------------------------------------------
/datautil/tabular/html.py:
--------------------------------------------------------------------------------
1 | import re
2 | from HTMLParser import HTMLParser
3 |
4 | from base import TabularData, ReaderBase, WriterBase
5 |
6 |
class HtmlReader(ReaderBase):
    '''Read data from HTML table into L{TabularData}.

    '''
    def read(self, filepath_or_fileobj=None, table_index=0):
        '''Read data from fileobj.

        NB: post read all tables extracted are in attribute named 'tables'.

        @arg table_index: if multiple tables in the html return table at this
        index.
        @return: L{TabularData} object (all content in the data part, i.e. no
        header).
        '''
        super(HtmlReader, self).read(filepath_or_fileobj)
        extractor = _OurTableExtractor()
        extractor.reset()
        extractor.feed(self.fileobj.read())
        self.tables = extractor.tables
        return self.tables[table_index]
27 |
28 |
class _OurTableExtractor(HTMLParser):
    '''
    # TODO: tbody, thead etc
    # TODO: nested tables

    # TODO: will barf on bad html so may need to run tidy first ...
    # tidy -w 0 -b -omit -asxml -ascii
    '''
    def reset(self):
        '''Clear accumulated tables plus per-table/row/cell state.'''
        HTMLParser.reset(self)
        self.tables = []
        self._rows = []
        self._row = []
        self._text = ''

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self._row = []
        elif tag in ('td', 'th'):
            self._text = ''
        elif tag == 'br':
            self._text += '\n'

    def handle_endtag(self, tag):
        if tag == 'tr':
            self._rows.append(self._row)
        if tag in ('td', 'th'):
            self._row.append(self._text)
        if tag == 'table':
            self.tables.append(TabularData(data=self._rows))
            self._rows = []

    def handle_data(self, data):
        # accumulate (whitespace-stripped) text for the current cell
        self._text += data.strip()
63 |
64 |
65 | import re
class HtmlWriter(WriterBase):
    """
    Write tabular data to xhtml

    NOTE(review): several HTML tag string literals in this class had been
    stripped/garbled in this copy of the source (leaving invalid syntax);
    they are reconstructed below -- confirm against upstream datautil.
    """

    def __init__(self, round_ndigits=2, pretty_print=False, table_attributes = {'class': 'data'}):
        """
        @pretty_print: whether to pretty print (indent) output
        @table_attributes: dictionary of html attribute name/value pairs to be
        added to the table element

        NB: table_attributes uses a shared mutable default -- treated as
        read-only here.
        """
        super(HtmlWriter, self).__init__(round_ndigits)
        self.pretty_print = pretty_print
        self.table_attributes = table_attributes

    def write(self, tabulardata, fileobj, caption = '', rowHeadings = []):
        """
        Write matrix of data to xhtml table.
        Allow for addition of row and column headings

        @return xhtml table containing data

        @param data: table of data that makes up table
        @param caption: the caption for the table (if empty no caption created)
        @param rowHeadings: additional headings for rows (separate from
        tabulardata)
        """
        columnHeadings = tabulardata.header
        data = tabulardata.data
        haveRowHeadings = (len(rowHeadings) > 0)

        # open the table element with any configured attributes
        htmlTable = '<table'
        for key, value in self.table_attributes.items():
            htmlTable += ' %s="%s"' % (key, value)
        htmlTable += '>'

        # deal with caption
        if caption != '':
            htmlTable += '<caption>%s</caption>' % caption

        # deal with col headings
        # if we there are rowHeadings may want to add blank column at front
        numColHeads = len(columnHeadings)
        if numColHeads > 0:
            if haveRowHeadings and numColHeads == len(data[0]):
                # [[TODO: is this dangerous? should i make a copy ...]]
                columnHeadings.insert(0, '')
            htmlTable += self.writeHeading(columnHeadings)

        htmlTable += '<tbody>'
        if self.pretty_print:
            htmlTable += '\n'

        for ii in range(0, len(data)):
            # have to add 1 as first row is headings
            if haveRowHeadings:
                htmlTable += self.writeRow(data[ii], rowHeadings[ii])
            else:
                htmlTable += self.writeRow(data[ii])

        htmlTable += '</tbody></table>'

        if self.pretty_print:
            fileobj.write(self.prettyPrint(htmlTable))
        else:
            fileobj.write(htmlTable)

    def value_to_str(self, value):
        '''As the base class, but html-escape the result.'''
        import cgi
        out = super(HtmlWriter, self).value_to_str(value)
        out = cgi.escape(out)
        return out

    def writeHeading(self, row):
        """
        Write heading for html table (<thead><tr>...</tr></thead>)
        """
        result = '<thead><tr>'
        result += self.writeGeneralRow(row, 'th')
        result += '</tr></thead>'
        if self.pretty_print:
            result += '\n'
        return result

    def writeRow(self, row, rowHeading = ''):
        '''Write one body row; `rowHeading`, if given, is prepended as a th.'''
        result = ''
        if rowHeading != '':
            result = '<th>%s</th>' % self.value_to_str(rowHeading)
        result += self.writeGeneralRow(row, 'td')
        result = '<tr>%s</tr>' % result
        if self.pretty_print:
            result += '\n'
        return result

    def writeGeneralRow(self, row, tagName):
        '''Render the cells of `row` each wrapped in <tagName> elements.'''
        result = ''
        for ii in range(len(row)):
            # Fix: the closing tag had lost its '</' in the garbled source.
            result += '<%s>%s</%s>' % (tagName, self.value_to_str(row[ii]), tagName)
        return result

    def prettyPrint(self, html):
        """pretty print html using HTMLTidy"""
        # [[TODO: strip out html wrapper stuff that is added (head, body etc)
        try:
            import mx.Tidy
            out = mx.Tidy.tidy(html, None, None, wrap = 0, indent = 'yes')[2]
        except:
            out = html
        return self.tabify(out)

    def tabify(self, instr, tabsize = 2):
        """
        tabify text by replacing spaces of size tabSize by tabs
        """
        whitespace = tabsize * ' '
        return re.sub(whitespace, '\t', instr)
182 |
183 |
184 | # for backwards compatibility
185 | # 2008-05-30
186 | WriterHtml = HtmlWriter
187 |
188 |
189 |
--------------------------------------------------------------------------------
/datautil/tabular/misc.py:
--------------------------------------------------------------------------------
1 | '''General Helper methods for tabular data.
2 | '''
3 | from base import TabularData
4 |
def transpose(data):
    '''Transpose a list of lists.

    Or do it directy: data = zip(*data)
    '''
    transposed = zip(*data)
    return transposed
11 |
def select_columns(matrix, cols):
    '''Return a matrix with only those column indexes in cols.

    Columns are returned in ascending index order.
    Fix: the caller's `cols` list is no longer sorted in place (the original
    mutated its argument via cols.sort()).
    '''
    columns = list(zip(*matrix))
    picked = [columns[c] for c in sorted(cols)]
    return list(zip(*picked))
20 |
21 |
def pivot(table, left, top, value):
    """Unnormalize (pivot) a normalised input set of tabular data.

    @param table: simple list of lists or a L{TabularData} object.

    Eg. To transform the tabular data like

        Name, Year, Value
        -----------------------
        'x', 2004, 1
        'y', 2004, 2
        'x', 2005, 3
        'y', 2005, 4

    into the new list:

        Year, 'x', 'y'
        ------------------------
        2004, 1, 2
        2005, 3, 4

    you would do:

        pivot(tabulardata, 1, 0, 2)

    OR (requires header to exist):

        pivot(tabulardata, 'Year', 'Name', 'Value')
    """
    def _colindex(col):
        # accept a column index directly, or a header name (needs a header)
        return col if isinstance(col, int) else table.header.index(col)

    left = _colindex(left)
    top = _colindex(top)
    value = _colindex(value)

    # double dict keyed by left value then top value
    cells = {}
    xvals = set()
    yvals = set()
    for row in table:
        xval = row[left]
        cells.setdefault(xval, {})[row[top]] = row[value]
        xvals.add(xval)
        yvals.add(row[top])
    xvals = sorted(xvals)
    yvals = sorted(yvals)

    result = TabularData()
    xhead = 'X'
    if hasattr(table, 'header') and table.header:
        xhead = table.header[left]
    result.header = [ xhead ] + yvals
    result.data = [
        [x] + [ cells[x].get(y, '') for y in yvals ]
        for x in xvals
        ]
    return result
78 |
79 |
--------------------------------------------------------------------------------
/datautil/tabular/tabular_json.py:
--------------------------------------------------------------------------------
1 | '''JSON Reader and Writer'''
2 | try:
3 | import json
4 | except ImportError:
5 | try:
6 | import simplejson as json
7 | except ImportError: # simplejson not installed
8 | pass
9 | from base import TabularData, ReaderBase, WriterBase
10 |
11 |
class JsonReader(ReaderBase):
    def read(self, filepath_or_fileobj=None):
        '''Read JSON encoded data from source into a L{TabularData} object.

        JSON encoded data should either be:
          * dict (with header and data attributes)
          * list (first row assumed to be the header)

        @return L{TabularData}
        '''
        super(JsonReader, self).read(filepath_or_fileobj)
        decoded = json.load(self.fileobj)
        if isinstance(decoded, dict):
            return TabularData(header=decoded.get('header', None),
                               data=decoded.get('data', None))
        if isinstance(decoded, list):
            # first row is the header, the rest is data
            return TabularData(header=decoded[0], data=decoded[1:])
        raise Exception('Cannot load TabularData from %s' % decoded)
32 |
class JsonWriter(WriterBase):

    def write(self, tabular_data, fileobj, indent=2):
        '''Serialise `tabular_data` to `fileobj` as {"header":..., "data":...}.'''
        super(JsonWriter, self).write(tabular_data, fileobj)
        payload = {
            u'header': tabular_data.header,
            u'data': tabular_data.data,
        }
        json.dump(payload, fileobj, indent=indent)
41 |
42 |
--------------------------------------------------------------------------------
/datautil/tabular/txt.py:
--------------------------------------------------------------------------------
1 | from base import WriterBase
2 |
3 | class TxtWriter(WriterBase):
4 | '''Write tabular data to plain text in nicely formatted way
5 |
6 | TODO
7 | ====
8 |
9 | 1. allow output_width of 0 meaning use width necessary to fit all rows on one
10 | line
11 |
12 | 2. rather than truncate cell contents wrap it onto two lines (and/or allow
13 | spillover if adjacent cell is empty)
14 |
15 | * wontfix: can let terminal do this: just set width very large ...
16 |
17 | 3. (?) stream output back rather than returning all at once
18 |
19 | 4. Add support for limiting number of columns displayed. DONE 2007-08-02
20 | * TODO: add unittest
21 | '''
22 |
23 | def __init__(self, output_width=0, number_of_columns=0, **kwargs):
24 | '''
25 | @param output_width: display width (0 means unlimited).
26 | @param number_of_columns: number of columns to try to display (not
27 | guaranteed to be this number if this would cause problems). (0
28 | means all columns)
29 | '''
30 | super(TxtWriter, self).__init__(**kwargs)
31 | self.output_width = output_width
32 | self.number_of_columns = number_of_columns
33 |
34 | def write(self, tabular_data, fileobj):
35 | result = ''
36 | formatter = None
37 | row_cache = []
38 | sample_length = 4
39 | rows = tabular_data.data
40 | if tabular_data.header:
41 | rows = [ tabular_data.header ] + rows
42 | # include header in sample rows (do we always want to?)
43 | sample_rows = rows[:sample_length]
44 | self._compute_parameters(sample_rows)
45 | result += self._write_separator()
46 | for row in rows:
47 | result += self._write_row(row)
48 | result += self._write_separator()
49 | fileobj.write(result)
50 |
51 | def _compute_parameters(self, sample_rows):
52 | maxcols = self._get_maxcols(sample_rows)
53 | if not self.number_of_columns:
54 | self.numcols = maxcols
55 | else:
56 | self.numcols = min(self.number_of_columns, maxcols)
57 | self.colwidths = []
58 | self._set_colwidths(sample_rows)
59 | if self.colwidths[0] < 2:
60 | msg =\
61 | u'''It is not possible to effectively format this many columns of material with
62 | this narrow an output window. Column width is: %s''' % self.colwidths[0]
63 | # TODO: log it?
64 | print msg
65 |
66 | def _write_row(self, row):
67 | '''Return the input 'python' row as an appropriately formatted string.
68 | '''
69 | result = '|'
70 | count = 0
71 | for cell in row[:self.numcols]:
72 | width = self.colwidths[count]
73 | result += self._format_cell(width, cell)
74 | count += 1
75 | # now pad out with extra cols as necessary
76 | while count < self.numcols:
77 | width = self.colwidths[count]
78 | result += self._format_cell(width, ' ')
79 | count += 1
80 | return result + '\n'
81 |
82 | def _write_separator(self):
83 | result = '+'
84 | for width in self.colwidths:
85 | result += '-' * (width-1) + '+'
86 | return result + '\n'
87 |
88 | def _get_maxcols(self, sample_rows):
89 | maxcols = 0
90 | for row in sample_rows:
91 | maxcols = max(maxcols, len(row))
92 | return maxcols
93 |
94 | def _set_colwidths(self, sample_rows):
95 | # subtract -1 so that we have (at least) one spare screen column
96 | if self.output_width != 0:
97 | colwidth = int( (self.output_width - 1) / self.numcols)
98 | for ii in range(self.numcols):
99 | self.colwidths.append(colwidth)
100 | else: # make every col as wide as it needs to be
101 | self.colwidths = [0] * self.numcols
102 | for row in sample_rows:
103 | for ii in range(self.numcols):
104 | cellwidth = len(self.value_to_str(row[ii]))
105 | self.colwidths[ii] = max(self.colwidths[ii],
106 | cellwidth
107 | )
108 | self.colwidths = [ x + 1 for x in self.colwidths ]
109 |
110 | def _format_cell(self, width, content):
111 | content = self.value_to_str(content)
112 | content = content.strip()
113 | if len(content) > width - 1:
114 | # TODO: be brutal (this *has* to be fixed)
115 | content = content[:width-1]
116 | return content.center(width-1) + '|'
117 |
118 |
--------------------------------------------------------------------------------
/datautil/tabular/xls.py:
--------------------------------------------------------------------------------
1 | '''Work with Excel (xls) files.
2 |
3 | Requires xlrd
4 | '''
5 | try:
6 | import xlrd
7 | except ImportError: # xlrd not installed
8 | pass
9 |
10 | from base import ReaderBase, TabularData
11 |
12 | class XlsReader(ReaderBase):
13 | '''Read Excel (xls) files.
14 |
15 | Requires the xlrd package (see pypi).
16 | '''
17 | def __init__(self, filepath_or_fileobj=None):
18 | super(XlsReader, self).__init__(filepath_or_fileobj)
19 | if self.fileobj:
20 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read())
21 | ## TODO: fix the rest of this
22 |
23 | def read(self, fileobj=None, sheet_index=0):
24 | '''Read an excel file (provide as fileobj) and return the specified
25 | sheet as a L{TabularData} object.
26 |
27 | For convenience also store:
28 |
29 | self.book: xlrd WorkBook object
30 |
31 | @return L{TabularData} object.
32 | '''
33 | super(XlsReader, self).read(fileobj)
34 | if fileobj:
35 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read())
36 | tab = TabularData()
37 | booksheet = self.book.sheet_by_index(sheet_index)
38 | data = self.extract_sheet(booksheet, self.book)
39 | tab.data = data
40 | return tab
41 |
42 | def info(self):
43 | '''Return summary info about this Excel Workbook.'''
44 | info = ''
45 | info += 'The number of worksheets is: %s\n' % self.book.nsheets
46 | info += 'Worksheet name(s):\n' % self.book.sheet_names()
47 | count = -1
48 | for sn in self.book.sheet_names():
49 | count += 1
50 | info += '%s %s\n' % (count, sn)
51 | return info
52 |
53 | def sheet_info(self, sheet_index):
54 | '''Summary info about an xls sheet.
55 |
56 | @return: printable string giving info.
57 | '''
58 | import pprint
59 | sh = self.book.sheet_by_index(sheet_index)
60 | info = sh.name + '\n'
61 | info += 'Rows: %s Cols: %s\n\n' % (sh.nrows, sh.ncols)
62 | MAX_ROWS = 30
63 | for rx in range(min(sh.nrows, MAX_ROWS)):
64 | info += str(sh.row(rx)) + '\n'
65 | return info
66 |
67 | def extract_sheet(self, sheet, book):
68 | matrix = []
69 | nrows = sheet.nrows
70 | ncols = sheet.ncols
71 | for rx in range(nrows):
72 | outrow = []
73 | for cx in range(ncols):
74 | cell = sheet.cell(rowx=rx, colx=cx)
75 | val = self.cell_to_python(cell, book)
76 | outrow.append(val)
77 | matrix.append(outrow)
78 | return matrix
79 |
80 | def cell_to_python(self, cell, book):
81 | # annoying need book argument for datemode
82 | # info on types: http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Cell-class
83 | if cell.ctype == xlrd.XL_CELL_NUMBER:
84 | return float(cell.value)
85 | elif cell.ctype == xlrd.XL_CELL_DATE:
86 | from datetime import date
87 | # TODO: distinguish date and datetime
88 | args = xlrd.xldate_as_tuple(cell.value, book.datemode)
89 | try:
90 | return date(args[0], args[1], args[2])
91 | except Exception, inst:
92 | print 'Error parsing excel date (%s): %s' % (args, inst)
93 | return None
94 | elif cell.ctype == xlrd.XL_CELL_BOOLEAN:
95 | return bool(cell.value)
96 | else:
97 | return cell.value
98 |
99 |
100 |
--------------------------------------------------------------------------------
/datautil/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # placeholder
2 |
--------------------------------------------------------------------------------
/datautil/tests/data/xls_reader_test.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/tests/data/xls_reader_test.xls
--------------------------------------------------------------------------------
/datautil/tests/parse/test_name.py:
--------------------------------------------------------------------------------
1 | import datautil.parse.name
2 |
3 |
4 | class TestName:
5 | def test_parse_name_FL(self):
6 | name = u'Ludwig Van Beethoven'
7 | out = datautil.parse.name.parse_name(name)
8 | assert out.ln == u'Beethoven'
9 | assert out.fns == ['Ludwig', 'Van']
10 |
11 | def test_parse_name_LF_with_extra_comma(self):
12 | out = datautil.parse.name.parse_name('More, Sir Thomas,Saint')
13 | assert out.ln == 'More', out
14 | assert out.fns == ['Sir', 'Thomas']
15 |
16 | def test_parse_name_FL_normcase(self):
17 | name = u'Ludwig van BEETHOVEN'
18 | out = datautil.parse.name.parse_name(name)
19 | assert out.ln == 'Beethoven', out
20 |
21 | def test_parse_name_LF_with_title(self):
22 | name = u'Chandos, John [Sir]'
23 | out = datautil.parse.name.parse_name(name)
24 | assert out.ln == 'Chandos', out
25 | assert out.title == 'Sir', out
26 |
27 | def test_parse_name_FL_with_title(self):
28 | name = u'Sir John CHANDOS'
29 | out = datautil.parse.name.parse_name(name)
30 | assert out.ln == 'Chandos', out
31 | assert out.title == 'Sir', out
32 |
33 | def test_parse_name_FL_with_title_2(self):
34 | name = u'Prof Benjamin AARON'
35 | out = datautil.parse.name.parse_name(name)
36 | assert out.ln == 'Aaron', out
37 | assert out.title == 'Prof', out
38 | assert out.fns == ['Benjamin'], out
39 | assert str(out) == 'Aaron, Benjamin [Prof]'
40 |
41 | def test_parse_title_with_fullstop(self):
42 | name = 'Major. abc xyz'
43 | out = datautil.parse.name.parse_name(name)
44 | assert out.title == 'Major', out.title
45 |
46 | def test_parse_title_with_fullstop_2(self):
47 | name = 'Xyz, Abc [Major.]'
48 | out = datautil.parse.name.parse_name(name)
49 | print out
50 | assert out.title == 'Major', out.title
51 |
52 | def test_parse_title_with_brackets(self):
53 | name = 'Dickens, Gerald (Sir)'
54 | out = datautil.parse.name.parse_name(name)
55 | assert out.title == 'Sir', out.title
56 |
57 | name = '(Sir) Gerald Dickens'
58 | out = datautil.parse.name.parse_name(name)
59 | assert out.title == 'Sir', out.title
60 |
61 | def test_parse_name_FL_initials(self):
62 | name = 'Chekhov, A.P.'
63 | out = datautil.parse.name.parse_name(name)
64 | assert out.ln == 'Chekhov'
65 | assert out.fns == ['A.', 'P.'], out
66 |
67 | def test_strip_fullstops(self):
68 | name = 'George. Bosch'
69 | out = datautil.parse.name.normalize(name)
70 | assert out == 'Bosch, George'
71 |
72 | name = 'George. a.p. Bosch.'
73 | out = datautil.parse.name.normalize(name)
74 | assert out == 'Bosch, George A. P.', out
75 |
76 | name = 'Geo.rge. Bosch'
77 | out = datautil.parse.name.normalize(name)
78 | assert out == 'Bosch, Geo. Rge', out
79 |
80 | name = 'Geo.Smith. Bosch'
81 | out = datautil.parse.name.normalize(name)
82 | assert out == 'Bosch, Geo. Smith', out
83 |
84 | def test_tostr(self):
85 | name = datautil.parse.name.Name(ln='Beethoven', fns=['Ludwig', 'van'])
86 | exp = u'Beethoven, Ludwig van'
87 | out = datautil.parse.name.name_tostr(name)
88 | assert out == exp, out
89 |
90 | def test_with_no_name(self):
91 | name = datautil.parse.name.parse_name(' ')
92 | assert name.ln is '', name
93 | out = datautil.parse.name.normalize(' ')
94 | assert out == '', out
95 |
96 | def test_surname(self):
97 | name = u'SCHUBERT'
98 | out = str(datautil.parse.name.parse_name(name))
99 | assert out == 'Schubert'
100 |
101 |
--------------------------------------------------------------------------------
/datautil/tests/tabular/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/datautil/tests/tabular/__init__.py
--------------------------------------------------------------------------------
/datautil/tests/tabular/test_base.py:
--------------------------------------------------------------------------------
1 | import os
2 | from StringIO import StringIO
3 |
4 | import datautil.tabular
5 |
class TestTabularData:
    # round-trip fixture: a header row followed by two data rows
    testlist = [ ['X', 'Y'], [1,2], [3,4] ]

    def test_1(self):
        # a freshly constructed TabularData has an empty header
        td = datautil.tabular.TabularData()
        assert td.header == []

    def test_from_list(self):
        td = datautil.tabular.TabularData.from_list(self.testlist)
        assert td.header == [ 'X', 'Y' ]
        assert td.data == [ [1,2], [3,4] ]

    def test_to_list(self):
        td = datautil.tabular.TabularData(header=['X', 'Y'],
                                          data=[ [1,2], [3,4] ])
        assert td.to_list() == self.testlist
25 |
26 |
class TestWriterBase:
    def test_value_to_str(self):
        # default writer: no rounding (round_ndigits=None)
        writer = datautil.tabular.WriterBase()
        for value, expected in [('x', u'x'), (1, u'1'), (1.3555, u'1.3555')]:
            got = writer.value_to_str(value)
            assert got == expected, got

        # rounding to two decimal places
        writer = datautil.tabular.WriterBase(round_ndigits=2)
        for value, expected in [('x', u'x'), (1, u'1'), (1.3555, u'1.36')]:
            got = writer.value_to_str(value)
            assert got == expected, got

        # negative ndigits rounds to the nearest ten
        writer.round_ndigits = -1
        got = writer.value_to_str(102.34)
        assert got == u'100', got
48 |
49 |
class TestReaderCsv(object):
    # minimal csv fixture: quoted header row plus one data row
    csvdata = \
'''"header1", "header 2"
1, 2'''
    header = [ 'header1', 'header 2' ]
    data = [ ['1', '2'] ]

    def setUp(self):
        fileobj = StringIO(self.csvdata)
        self.tab = datautil.tabular.ReaderCsv().read(fileobj)

    def test_header(self):
        assert self.tab.header == self.header

    def test_data(self):
        assert self.tab.data == self.data
68 |
69 |
class TestReaderCsvUnicode(TestReaderCsv):
    # same as TestReaderCsv but with a non-ascii character in the header
    csvdata = \
u'''"headi\xf1g", "header 2"
1, 2'''
    header = [ u'headi\xf1g'.encode('utf-8'), 'header 2' ]
    data = [ ['1', '2'] ]
76 |
77 |
class TestReaderCsvEncoded(TestReaderCsvUnicode):
    # exercise the explicit ``encoding`` argument with utf-16 encoded bytes
    encoding = 'utf-16'
    csvdata = \
u'''"headi\xf1g", "header 2"
1, 2'''.encode(encoding)

    def setUp(self):
        fileobj = StringIO(self.csvdata)
        self.tab = datautil.tabular.ReaderCsv().read(fileobj,
                                                     encoding=self.encoding)
88 |
89 |
class TestCsvWriter:
    def test_writer(self):
        td = datautil.tabular.TabularData([[1,2],[3,4]],
                                          header=['one', 'two'])
        buf = StringIO()
        datautil.tabular.CsvWriter().write(td, buf)
        buf.seek(0)
        out = buf.read()
        # csv module terminates rows with \r\n
        exp = \
'''one,two\r
1,2\r
3,4\r\n'''
        assert out == exp
104 |
105 |
class TestHtmlReader:
    # NOTE(review): the ``inraw1`` fixture below appears to have lost its
    # HTML markup in transit (tags stripped) -- confirm against repository
    # history before relying on this block.

    inraw1 = '''


1 | 2 |


1983 |


3 | 4 |


'''
    in1 = StringIO(inraw1)

    # expected parse: one list per table row, cells as strings
    exp1 = [ ['1', '2'],
        ['1983'],
        ['3', '4'],
        ]

    def test_1(self):
        reader = datautil.tabular.HtmlReader()
        tab = reader.read(self.in1)
        assert tab.data == self.exp1
132 |
133 |
class TestHtmlWriter:
    # NOTE(review): the expected-output string literals in this class look
    # mangled (HTML tags stripped and several lines lost in transit), which
    # leaves some literals syntactically broken -- restore from repository
    # history before running.

    def setUp(self):
        rawData = [[1,1], [0,1]]
        self.indata1 = datautil.tabular.TabularData(data=rawData)
        self.writer1 = datautil.tabular.HtmlWriter(table_attributes={'id':1, 'class': 'data'})

    def test_0_simple(self):
        indata1 = [[1,1], [0,1]]
        expected = ''
        out1 = self.writer1.write_str(self.indata1)
        assert expected == out1

    def test_col_headings(self):
        self.indata1.header = [u'x','y']
        caption = ''
        expected = 'x | y |
'+\
            '1 | 1 |
 0 | ' + \
            '1 |
'
        # no caption but headings
        out1 = self.writer1.write_str(self.indata1, caption)
        assert expected == out1

    def test_row_headings(self):
        self.indata1.header = ['x','y']
        rowHeadings = ['Date 1', 'Date 2']
        caption = ''
        expected = ' | x | ' + \
            'y |
---|
Date 1 | 1 | ' + \
            '1 |
---|
Date 2 | 0 | 1 |
' + \
            '
'
        # no caption but headings
        out1 = self.writer1.write_str(self.indata1, caption, rowHeadings)
        assert expected == out1

    def test_escaping(self):
        tdata = datautil.tabular.TabularData(header=['s&p', 'y0 | 1 | '
        # print self.writer1.prettyPrint(in1)

182 |
class TestLatexWriter:
    # shared fixture: header row plus two data rows (incl. a '%' needing escape)
    matrix = [[ 'H1', 'H2'],
        [1,'2%'],
        [3,4],
        ]

    exp = \
r'''\textbf{H1} & \textbf{H2} \\
\hline
1 & 2\% \\
\hline
3 & 4 \\
\hline
'''
    m2l = datautil.tabular.LatexWriter()

    def test_escape(self):
        assert self.m2l.escape('& % $ something') == r'\& \% $ something'

    def test_table2latex(self):
        got = datautil.tabular.table2latex(self.matrix)
        self.diff(self.exp, got)
        assert got == self.exp

    def test_write(self):
        td = datautil.tabular.TabularData(data=self.matrix[1:],
                                          header=self.matrix[0])
        got = self.m2l.write_str(td)
        self.diff(self.exp, got)
        assert got == self.exp

    def diff(self, str1, str2):
        # print a line-by-line diff to aid debugging when an assert fails
        import difflib
        from pprint import pprint
        differ = difflib.Differ()
        pprint(list(differ.compare(str1.splitlines(1), str2.splitlines(1))))
224 |
225 |
226 |
--------------------------------------------------------------------------------
/datautil/tests/tabular/test_gdocs.py:
--------------------------------------------------------------------------------
1 | import os
2 | from ConfigParser import SafeConfigParser
3 |
4 | import datautil.tabular.gdocs as gdocs
5 | from nose.plugins.skip import SkipTest
6 |
7 |
# Load GDocs credentials from a local 'test.ini'; these tests talk to a
# live Google Docs service, so skip the whole module when no config exists.
cfg = SafeConfigParser()
if not os.path.exists('test.ini'):
    msg = 'To run GDocs tests you need a config file. See %s for details' % __file__
    raise SkipTest(msg)
cfg.readfp(open('test.ini'))
# credentials used by every test class below
username = cfg.get('gdocs', 'username')
password = cfg.get('gdocs', 'password')
15 |
16 |
class TestGDocsTextDb:
    def test_01(self):
        # read a known shared test spreadsheet, addressed by name
        src = 'okfn-datautil-gdocs-testing'
        reader = gdocs.GDocsReaderTextDb(src, username, password,
                                         id_is_name=True)
        tdata = reader.read()
        assert tdata.header == ['col1', 'col2']
        assert len(tdata.data) == 5, tdata
24 |
25 |
26 | # not working properly yet
class _TestGDocs:
    def test_01(self):
        # spreadsheet addressed by key
        src = 't8GZy4Lb6jhVjCL5nrqZ5TQ'
        tdata = gdocs.GDocsReaderSpreadsheet(src, username, password).read()
        assert len(tdata.data) == 6, tdata

    def test_02_id_is_name(self):
        # same spreadsheet addressed by name
        src = 'okfn-datautil-gdocs-testing'
        tdata = gdocs.GDocsReaderSpreadsheet(src, username, password,
                                             id_is_name=True).read()
        assert len(tdata.data) == 6, tdata
39 |
40 |
41 |
--------------------------------------------------------------------------------
/datautil/tests/tabular/test_json.py:
--------------------------------------------------------------------------------
1 | from StringIO import StringIO
2 | import datautil.tabular.tabular_json as js
3 |
class TestJson:
    # dict form (explicit header/data) and equivalent list form fixtures
    in1 = { 'header': [u'a', u'b'],
            'data': [[1,2], [3,4]]
            }
    in2 = [ in1['header'] ] + in1['data']
    in1sio = StringIO(js.json.dumps(in1))
    in1sio.seek(0)
    in2sio = StringIO(js.json.dumps(in2))
    in2sio.seek(0)

    def test_JsonReader(self):
        reader = js.JsonReader()
        # dict form: header/data keys
        td = reader.read(self.in1sio)
        assert td.header == self.in1['header']
        assert td.data == self.in1['data']
        # list form: first row is the header
        td = reader.read(self.in2sio)
        assert td.header == self.in1['header']
        assert td.data == self.in1['data']

    def test_JsonWriter(self):
        td = js.TabularData(header=self.in1['header'],
                            data=self.in1['data'])
        out = js.JsonWriter().write_str(td)
        assert js.json.loads(out) == self.in1
29 |
30 |
--------------------------------------------------------------------------------
/datautil/tests/tabular/test_misc.py:
--------------------------------------------------------------------------------
1 | import datautil.tabular
2 |
class TestTranspose:

    def test_1(self):
        # transpose turns input rows (lists) into column tuples
        matrix = [
            [ 0, 1 ],
            [ 1, 0 ],
            ]
        expected = [
            ( 0, 1 ),
            ( 1, 0 ),
            ]
        got = datautil.tabular.transpose(matrix)
        assert got == expected, got
16 |
class TestPivot:
    # long-format fixture: one (Name, Year, Value) triple per row
    td = datautil.tabular.TabularData(
        header=['Name','Year','Value'],
        data=[
            ['x',2004,1],
            ['y',2004,2],
            ['y',2005,4],
            ['x',2005,3],
            ],
        )

    def test_pivot_with_tabular(self):
        # columns addressed by index
        pivoted = datautil.tabular.pivot(self.td, 1, 0, 2)
        assert pivoted.data[0] == [2004, 1, 2]
        assert pivoted.data[-1] == [2005, 3, 4]

    def test_pivot_with_tabular_2(self):
        # columns addressed by header name
        pivoted = datautil.tabular.pivot(self.td, 'Year', 'Name', 'Value')
        assert pivoted.data[0] == [2004, 1, 2]

    def test_pivot_simple_list(self):
        # pivot also accepts a bare list of rows
        pivoted = datautil.tabular.pivot(self.td.data, 1, 0, 2)
        assert pivoted.data[0] == [2004, 1, 2]
40 |
41 |
--------------------------------------------------------------------------------
/datautil/tests/tabular/test_txt.py:
--------------------------------------------------------------------------------
1 | import StringIO
2 |
3 | from datautil.tabular.txt import *
4 | from datautil.tabular import TabularData, CsvReader
5 |
class TestFormatting:
    '''Unit tests for TxtWriter internals (_compute_parameters et al).'''

    sample_rows = [
        ['1', '2', 'head blah', 'blah blah blah'],
        ['a', 'b', 'c', 'd', 'e', 'g' ],
        ['1', '2', 'annakarenina annakarenina annakarenina'],
        ]
    output_width = 60

    writer = TxtWriter(output_width=output_width)
    writer._compute_parameters(sample_rows)

    def test_1(self):
        # widest sample row has 6 columns
        assert self.writer.numcols == 6

    def test_colwidths(self):
        exp = int ((self.output_width -1) / 6)
        assert self.writer.colwidths[0] == exp

    def test__write_1(self):
        out = self.writer._write_row(self.sample_rows[0])
        assert len(out) <= self.output_width

    def test__write_2(self):
        out = self.writer._write_row(self.sample_rows[0])
        exp = '| 1 | 2 |head bla|blah bla| | |\n'
        assert out == exp

    def test__write_separator(self):
        out = self.writer._write_separator()
        exp = '+--------+--------+--------+--------+--------+--------+\n'
        # fix: the assertion was missing, so this test always passed
        # regardless of the separator produced
        assert out == exp, out
37 |
38 |
39 |
40 | class TestTxtWriter:
41 | sample = \
42 | '''"YEAR","PH","RPH","RPH_1","LN_RPH","LN_RPH_1","HH","LN_HH"
43 | 1971,7.852361625,43.9168370988587,42.9594500501036,3.78229777955476,3.76025664867788,16185,9.69184016636035
44 | ,,abc,
45 | 1972,10.504714885,55.1134791192682,43.9168370988587,4.00939431635556,3.78229777955476,16397,9.70485367024987
46 | , ,, '''
47 |
48 | expected = \
49 | '''+------+------+------+------+------+------+------+------+
50 | | YEAR | PH | RPH |RPH_1 |LN_RPH|LN_RPH| HH |LN_HH |
51 | +------+------+------+------+------+------+------+------+
52 | | 1971 |7.8523|43.916|42.959|3.7822|3.7602|16185 |9.6918|
53 | +------+------+------+------+------+------+------+------+
54 | | | | abc | | | | | |
55 | +------+------+------+------+------+------+------+------+
56 | | 1972 |10.504|55.113|43.916|4.0093|3.7822|16397 |9.7048|
57 | +------+------+------+------+------+------+------+------+
58 | | | | | | | | | |
59 | +------+------+------+------+------+------+------+------+
60 | '''
61 |
62 | def test_simple(self):
63 | indata = TabularData(data=[range(5),range(5,10)])
64 | writer = TxtWriter()
65 | out = writer.write_str(indata)
66 | exp = '''+-+-+-+-+-+
67 | |0|1|2|3|4|
68 | +-+-+-+-+-+
69 | |5|6|7|8|9|
70 | +-+-+-+-+-+
71 | '''
72 | print out
73 | print exp
74 | assert out == exp
75 |
76 | def test_output_width(self):
77 | indata = TabularData(data=[range(5),range(5,10)])
78 | writer = TxtWriter(output_width=16)
79 | out = writer.write_str(indata)
80 | outlen = len(out.splitlines()[0])
81 | assert outlen == 16, outlen
82 |
83 | def test_using_csv(self):
84 | fileobj = StringIO.StringIO(self.sample)
85 | in_tdata = CsvReader(fileobj).read()
86 | writer = TxtWriter(output_width=60)
87 | out = writer.write_str(in_tdata)
88 | print out
89 | print self.expected
90 | assert self.expected == out, out
91 |
92 |
--------------------------------------------------------------------------------
/datautil/tests/test_cache.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import shutil
3 | import os
4 |
5 | from datautil.cache import Cache
6 |
class TestCache:
    '''Tests for datautil.cache.Cache, run against a temporary directory.'''

    @classmethod
    def setup_class(cls):
        # fix: classmethod parameter renamed self -> cls (idiom)
        cls.tmp = tempfile.mkdtemp()
        cls.path = os.path.join(cls.tmp, 'abc.txt')
        # fix: close the file handle deterministically instead of relying
        # on garbage collection of the anonymous file object
        with open(cls.path, 'w') as fixture:
            fixture.write('abc')
        cls.url = 'file://%s' % cls.path

    @classmethod
    def teardown_class(cls):
        shutil.rmtree(cls.tmp)

    def test_basename(self):
        base = 'http://www.abc.org/'
        in1 = base + 'xyz'
        out = Cache.basename(in1)
        assert out == 'xyz'

        in2 = base + 'xyz/abc.txt'
        out = Cache.basename(in2)
        assert out == 'abc.txt'

        in3 = base + 'membersDo?body=ABC'
        out = Cache.basename(in3)
        assert out == 'membersDo?body=ABC', out

        in3 = base + 'membersDo?body=data/ABC'
        out = Cache.basename(in3)
        assert out == 'membersDo?body=data%47ABC', out

    def test_filepath(self):
        r = Cache()
        base = 'http://www.abc.org/'
        in1 = base + 'xyz'
        out = r.filepath(in1)
        # ./xyz
        assert out.endswith('xyz'), out

    def test_dl(self):
        dest = os.path.join(self.tmp, 'out.txt')
        Cache.dl(self.url, dest)
        assert os.path.exists(dest)
        # fix: close the downloaded file after reading it
        with open(dest) as downloaded:
            assert downloaded.read() == 'abc'

    def test_cache(self):
        cache = os.path.join(self.tmp, 'cache')
        r = Cache(cache)
        r.retrieve(self.url)
        assert os.path.exists(os.path.join(cache, 'abc.txt'))
56 |
57 |
--------------------------------------------------------------------------------
/datautil/tests/test_date.py:
--------------------------------------------------------------------------------
1 | from datautil.date import *
2 |
3 | import datetime
4 |
class TestPythonStringOrdering(object):
    # It is impossible to find a string format such that +ve and -ve numbers
    # sort correctly as strings:
    # if (in string ordering) X < Y => -X < -Y (False!)

    def test_ordering(self):
        # pairs where lexicographic order happens to match intuition
        ordered_pairs = [
            ('0', '1'),
            ('-10', '10'),
            ('-', '@'),
            ('-', '0'),
            ('-100', '-X10'),
            ('10', '1000'),
            ('02000', '10000'),
            (' 2000', '10000'),
        ]
        for lower, higher in ordered_pairs:
            assert lower < higher, (lower, higher)

    def test_bad_ordering(self):
        # counter-examples: string order disagrees with numeric order
        assert ' ' < '0'
        assert ' ' < '-'
        assert not '-' < '+'
        assert '-100' > '-10'
        assert not '-100' < '-010'
        assert not '-100' < '- 10'
        assert not '-100' < ' -10'
        assert '10000' < '2000'
        assert not '-10' < ' 1'
29 |
30 |
class TestFlexiDate(object):
    def test_init(self):
        # empty construction gives blank components
        blank = FlexiDate()
        assert blank.year == '', blank
        assert blank.month == '', blank

        # components are stored zero-padded
        d = FlexiDate(2000, 1,1)
        assert d.month == '01', d
        assert d.day== '01', d

    def test_str(self):
        cases = [
            (FlexiDate(2000, 1, 23), '2000-01-23'),
            (FlexiDate(-2000, 1, 23), '-2000-01-23'),
            (FlexiDate(2000), '2000'),
            (FlexiDate(1760, qualifier='fl.'), '1760 [fl.]'),
            (FlexiDate(qualifier='anything'), ' [anything]'),
        ]
        for d, expected in cases:
            assert str(d) == expected, '"%s"' % d

    def test_from_str(self):
        # str -> FlexiDate -> str must round-trip
        def roundtrip(d):
            parsed = FlexiDate.from_str(str(d))
            assert str(parsed) == str(d)

        roundtrip(FlexiDate(2000, 1, 23))
        roundtrip(FlexiDate(1760, qualifier='fl.'))
        roundtrip(FlexiDate(-1760, 1, 3, qualifier='fl.'))

    def test_as_float(self):
        d = FlexiDate(2000)
        assert d.as_float() == float(2000), d.as_float()
        d = FlexiDate(1760, 1, 2)
        expected = 1760 + 1/12.0 + 2/365.0
        assert d.as_float() == expected, d.as_float()
        d = FlexiDate(-1000)
        assert d.as_float() == float(-1000)

    def test_as_datetime(self):
        out = FlexiDate(2000).as_datetime()
        assert out == datetime.datetime(2000, 1, 1), out
        out = FlexiDate(1760, 1, 2).as_datetime()
        assert out == datetime.datetime(1760,1,2), out
83 |
84 |
85 | class TestDateParsers(object):
86 | def test_using_datetime(self):
87 | parser = PythonDateParser()
88 |
89 | d1 = datetime.date(2000, 1, 23)
90 | fd = parser.parse(d1)
91 | assert fd.year == '2000'
92 |
93 | d1 = datetime.datetime(2000, 1, 23)
94 | fd = parser.parse(d1)
95 | # assert str(fd) == '2000-01-23T00:00:00', fd
96 | assert str(fd) == '2000-01-23', fd
97 |
98 | def test_using_dateutil(self):
99 | parser = DateutilDateParser()
100 |
101 | in1 = '2001-02'
102 | fd = parser.parse(in1)
103 | assert str(fd) == in1, fd
104 |
105 | in1 = 'March 1762'
106 | fd = parser.parse(in1)
107 | assert str(fd) == '1762-03'
108 |
109 | in1 = 'March 1762'
110 | fd = parser.parse(in1)
111 | assert str(fd) == '1762-03'
112 |
113 | in1 = '1768 AD'
114 | fd = parser.parse(in1)
115 | assert str(fd) == '1768', fd
116 |
117 | in1 = '1768 A.D.'
118 | fd = parser.parse(in1)
119 | assert str(fd) == '1768', fd
120 |
121 | in1 = '-1850'
122 | fd = parser.parse(in1)
123 | assert str(fd) == '-1850', fd
124 |
125 | in1 = '1762 BC'
126 | fd = parser.parse(in1)
127 | assert str(fd) == '-1762', fd
128 |
129 | in1 = '4 BC'
130 | fd = parser.parse(in1)
131 | assert str(fd) == '-0004', fd
132 |
133 | in1 = '4 B.C.'
134 | fd = parser.parse(in1)
135 | assert str(fd) == '-0004', fd
136 |
137 | in1 = 'Wed, 06 Jan 2010 09:30:00 GMT'
138 | fd = parser.parse(in1)
139 | assert str(fd) == '2010-01-06', fd
140 |
141 | in1 = 'Tue, 07 Dec 2010 10:00:00 GMT'
142 | fd = parser.parse(in1)
143 | assert str(fd) == '2010-12-07', fd
144 |
145 | def test_parse(self):
146 | d1 = datetime.datetime(2000, 1, 23)
147 | fd = parse(d1)
148 | assert fd.year == '2000'
149 |
150 | fd = parse('March 1762')
151 | assert str(fd) == '1762-03'
152 |
153 | fd = parse(1966)
154 | assert str(fd) == '1966'
155 |
156 | fd = parse('22/07/2010')
157 | assert fd.month == '07', fd.month
158 |
159 | def test_parse_ambiguous_day_month(self):
160 | fd = parse('05/07/2010')
161 | assert fd.month == '07', fd.month
162 | assert fd.day == '05', fd.month
163 |
164 | def test_parse_with_none(self):
165 | d1 = parse(None)
166 | assert d1 is None
167 |
168 | def test_parse_wildcards(self):
169 | fd = parse('198?')
170 | assert fd.year == '', fd.year # expect this to not parse
171 | # TODO but we should have a float if possible
172 | # assert fd.as_float() == u'1980', fd.as_float()
173 |
174 | def test_parse_with_qualifiers(self):
175 |
176 | fd = parse('1985?')
177 | assert fd.year == u'1985', fd
178 | assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier
179 |
180 | # match '[c|c. |c.] {date}'
181 | fd = parse('c.1780')
182 | assert fd.year == u'1780', fd
183 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
184 |
185 | fd = parse('c. 1780')
186 | assert fd.year == u'1780', fd
187 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
188 |
189 | fd = parse('c1780')
190 | assert fd.year == '1780', fd
191 | assert fd.qualifier == u"Note 'circa' : c1780", fd
192 |
193 | fd = parse('c 1780')
194 | assert fd.year == u'1780', fd
195 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
196 |
197 | # match 'circa {date}' | circa{date}'
198 | fd = parse('circa1780')
199 | assert fd.year == u'1780', fd
200 | assert fd.qualifier == u"Note 'circa' : circa1780", fd
201 |
202 | fd = parse('circa 1780')
203 | assert fd.year == u'1780', fd
204 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
205 |
206 | # match '[circ|circ. |circ.] {date}'
207 | fd = parse('circ1780')
208 | assert fd.year == u'1780', fd
209 | assert fd.qualifier == u"Note 'circa' : circ1780", fd
210 |
211 | fd = parse('circ 1780')
212 | assert fd.year == u'1780', fd
213 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
214 |
215 | fd = parse('circ.1780')
216 | assert fd.year == u'1780', fd
217 | assert fd.qualifier == u"Note 'circa' : circ.1780", fd
218 |
219 | fd = parse('circ. 1780')
220 | assert fd.year == u'1780', fd
221 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
222 |
223 | # match '[cca|cca. |cca.] {date}'
224 | fd = parse('cca1780')
225 | assert fd.year == u'1780', fd
226 | assert fd.qualifier == u"Note 'circa' : cca1780", fd
227 |
228 | fd = parse('cca 1780')
229 | assert fd.year == u'1780', fd
230 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
231 |
232 | fd = parse('cca.1780')
233 | assert fd.year == u'1780', fd
234 | assert fd.qualifier == u"Note 'circa' : cca.1780", fd
235 |
236 | fd = parse('cca. 1780')
237 | assert fd.year == u'1780', fd
238 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
239 |
240 | # match '[ca|ca. |ca.] {date}'
241 |
242 | fd = parse('ca. 1780')
243 | assert fd.year == u'1780', fd
244 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
245 |
246 | fd = parse('ca. 1780')
247 | assert fd.year == u'1780', fd
248 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
249 |
250 | fd = parse('ca.1780')
251 | assert fd.year == u'1780', fd
252 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
253 |
254 | fd = parse('ca.1780')
255 | assert fd.year == u'1780', fd
256 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
257 |
258 | fd = parse('ca.1780')
259 | assert fd.year == u'1780', fd
260 | assert fd.qualifier.startswith(u"Note 'circa'"), fd
261 |
262 |
263 |
    def test_ambiguous(self):
        # TODO: have to be careful here ...
        # NOTE(review): no assertion yet -- this only checks that parse() does
        # not raise on a year-range style value; expected semantics undecided.
        fd = parse('1068/1069')
267 |
268 | def test_small_years(self):
269 | in1 = '23'
270 | fd = parse(in1)
271 | assert str(fd) == '0023', fd
272 | assert fd.as_float() == 23, fd.as_float()
273 |
274 | def test_small_years_with_zeros(self):
275 | in1 = '0023'
276 | fd = parse(in1)
277 | assert str(fd) == '0023', fd
278 | assert fd.as_float() == 23, fd.as_float()
279 |
280 | def test_years_with_alpha_prefix(self):
281 | in1 = "p1980"
282 | fd = parse(in1)
283 | assert str(fd) == "1980", fd
284 |
--------------------------------------------------------------------------------
/datautil/tests/test_id.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | import datautil.id
4 |
def test_compress_and_uncompress_uuid():
    """Round-trip a UUID through compress/uncompress, including unicode input."""
    hexversion = '86c3f19d-8854-4ef5-8d88-f008e0817871'

    compressed = datautil.id.compress_uuid(hexversion)
    assert len(compressed) == 22

    # round-trip back to the canonical 36-char form
    assert datautil.id.uncompress_uuid(compressed) == hexversion
    # unicode input is accepted too
    assert datautil.id.uncompress_uuid(unicode(compressed)) == hexversion

    # a uuid.UUID instance compresses just like its string form
    assert len(datautil.id.compress_uuid(uuid.UUID(hexversion))) == 22
22 |
def test_int_to_b32():
    """int -> 7-char base32 string and back, for a spread of values."""
    for value in (1, 2**28 + 1, 2**30 - 1):
        encoded = datautil.id.int_to_b32(value)
        assert isinstance(encoded, basestring)
        assert len(encoded) == 7, encoded
        assert datautil.id.b32_to_int(encoded) == value, (value, encoded)
36 |
--------------------------------------------------------------------------------
/datautil/tests/test_misc.py:
--------------------------------------------------------------------------------
1 | from datautil.misc import *
2 |
class TestFloatify:
    """Tests for floatify / floatify_matrix."""

    def test_floatify_1(self):
        assert floatify('10') == 10.0

    def test_floatify_2(self):
        # commas are stripped before conversion
        assert floatify('1,030') == 1030.0

    def test_floatify_3(self):
        # BUG FIX: this method was also named test_floatify_2, shadowing the
        # comma-handling test above so it never ran.
        # blank and placeholder values convert to None
        out = floatify('')
        assert out == None, out
        out = floatify('#')
        assert out == None, out

    def test_floatify_matrix(self):
        x = [
            ['1', '2'],
            ['abc', '3.0']
        ]
        exp = [
            [1.0, 2.0],
            ['abc', 3.0]
        ]
        out = floatify_matrix(x)
        assert out == exp
31 |
32 |
class TestMakeSeries:
    """Tests for make_series."""

    def test_make_series(self):
        matrix = [
            ['1980', '100', '50'],
            ['1981', '101', '51'],
            ['1982', '102', ''],
        ]
        expected = [
            [(1980.0, 100.0), (1981.0, 101.0), (1982.0, 102.0)],
            [(1980.0, 50.0), (1981.0, 51.0)],
        ]
        result = make_series(matrix, xcol=0, ycols=[1, 2])
        assert result == expected, result
46 |
47 |
--------------------------------------------------------------------------------
/datautil/tests/test_xls.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 |
3 | import datautil.tabular
4 |
class TestXlsReader:
    # Reads the bundled fixture spreadsheet and spot-checks two cells.

    def test_stuff(self):
        # resource_stream keeps this working from an egg/zip install
        fo = pkg_resources.resource_stream('datautil',
            'tests/data/xls_reader_test.xls')
        reader = datautil.tabular.XlsReader(fo)
        tab = reader.read()
        # spot-check the first cell and a value further down the sheet
        assert tab.data[0][0] == 1850
        assert tab.data[19][1] == 12.3
14 |
15 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | import sys
4 | sys.path.insert(0, '.')
5 | from datautil import __version__, __doc__ as __long_description__
6 |
# Package metadata; __version__ and the long description are imported from
# datautil/__init__.py above.
setup(
    name='datautil',
    version=__version__,
    license='MIT',
    description='Utilities for Data Work',
    long_description=__long_description__,
    author='Open Knowledge Foundation',
    author_email='info@okfn.org',
    url='http://okfn.org/projects/datautil/',
    download_url='https://github.com/okfn/datautil/',
    install_requires=[
        # python-dateutil 2.0 has different _parse method, so stick to 1.4.1
        'python-dateutil>=1.0,<1.99',
        # (optional) for excel handling
        # xlrd
        # (optional) for google docs handling
        # gdata
    ],
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
    classifiers = [
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2 :: Only',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
)
38 |
--------------------------------------------------------------------------------
/swiss/__init__.py:
--------------------------------------------------------------------------------
1 | '''Swiss Army Knife for Data Work
2 | ==============================
3 |
4 | The swiss package provides various utilities for working with data:
5 |
6 | * cache: Url caching and scraping
7 | * tabular/*: Processing and transforming tabular data to and from various
8 | formats including csv, json, google spreadsheets, xls
9 | * misc, date: Cleaning up and parsing data especially dates.
10 | * id: ID generation and shortening
11 | * clitools.py: Command line tools such as creating optparse object and usage
12 | from a module or object.
13 | * deliveranceproxy.py: Deliverance proxy helper
14 |
15 |
16 | CHANGELOG
17 | =========
18 |
19 | v0.3 2010-08-01
20 | ---------------
21 |
22 | * Support for google docs spreadsheets as sources for TabularData
23 | * Improve documentation of date module and add FlexiDate.as_datetime()
24 | * New clitools module incorporating existing cli tools
25 | * deliveranceproxy.py: Deliverance proxy helper for proxying to remote
26 | websites and retheming with deliverance.
27 | * parse/name.py: new (human) name parsing code
28 |
29 | v0.2 2009-10-23
30 | ---------------
31 |
32 | * Extensive refactoring of tabular module/package
33 | * Standardized interface with BaseReader and BaseWriter
34 | * JsonReader and JsonWriter providing json reading and writing
35 | * TxtWriter to support writing to plain text
36 | * Improvements to date parsing (support for circa, 'c.', etc)
37 | * New id module to do 'compression' of uuids using 32 and 64 bit encoding
38 |
39 |
40 | v0.1 2009-06-03
41 | ---------------
42 |
43 | * Bring together existing code (from last 2+ years) into new 'swiss' package
44 | * Url caching and scraping
45 | * Tabular data handling including csv reader/writer, xls reader, latex writer
46 | and associated utilities (such as pivot_table)
47 | * Cleaning and parsing data especially dates (misc and date modules)
48 | '''
49 | __version__ = '0.3'
50 |
51 | import tabular
52 | from cache import *
53 | from misc import *
54 | from id import *
55 |
--------------------------------------------------------------------------------
/swiss/cache.py:
--------------------------------------------------------------------------------
1 | '''A local file cache with url retrieving builtin.
2 |
3 | NB: this module has zero dependencies on modules outside of the
4 | standard lib so that it is easily reusable in other libraries and applications
5 | that do not require any other parts of the swiss package.
6 | '''
7 | import urlparse
8 | import urllib
9 | import os
10 | import sys
11 |
12 |
# have to define before Cache as used in classmethod
class _Progress(object):
    '''Console progress reporter for urllib.urlretrieve downloads.

    Tracks the last block count seen so a dot is printed only when the
    percentage complete has advanced.
    '''
    def __init__(self):
        # -1 marks "no progress callback received yet"
        self.count = -1

    def dl_progress(self, count, block_size, total_size):
        '''urlretrieve reporthook: print the total once, then a dot per percent.'''
        if total_size == 0: # total_size is weird so return to avoid errors
            return
        if self.count == -1:
            print 'Total size: %s' % self.format_size(total_size)
        last_percent = int(self.count*block_size*100/total_size)
        percent = int(count*block_size*100/total_size)
        if percent > last_percent:
            # TODO: is this acceptable? Do we want to do something nicer?
            sys.stdout.write('.')
            sys.stdout.flush()
        self.count = count

    def format_size(self, bytes):
        '''Human-readable size string (Mb/Kb/bytes) for a byte count.'''
        if bytes > 1000*1000:
            return '%.1fMb' % (bytes/1000.0/1000)
        elif bytes > 10*1000:
            return '%iKb' % (bytes/1000)
        elif bytes > 1000:
            return '%.1fKb' % (bytes/1000.0)
        else:
            return '%ibytes' % bytes
40 |
41 |
42 | class Cache(object):
43 | '''A local file cache (and url retriever).
44 | '''
45 |
46 | def __init__(self, path='.'):
47 | '''
48 | @param path: path to cache (defaults to current directory)
49 | '''
50 | self.path = path
51 | if not os.path.exists(self.path):
52 | os.makedirs(path)
53 |
54 | def retrieve(self, url, force=False):
55 | '''Retrieve url into cache and return the local path to it.'''
56 | dest = self.cache_path(url)
57 | if not os.path.exists(dest) or force:
58 | self.download(url, dest)
59 | return dest
60 |
61 | def cache_path(self, url):
62 | '''Local path for url within cache.'''
63 | name = self.basename(url)
64 | dest = os.path.join(self.path, name)
65 | return dest
66 |
67 | def filepath(self, url):
68 | '''Deprecated: use cache_path'''
69 | return self.cache_path(url)
70 |
71 | def stream(self, url):
72 | fp = self.cache_path(url)
73 | if not os.path.exists(fp):
74 | return None
75 | else:
76 | return open(fp)
77 |
78 | @classmethod
79 | def basename(self, url):
80 | scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
81 | result = path.split('/')[-1]
82 | if query:
83 | # escape '/' as otherwise path problems
84 | result += '?' + query.replace('/', '%47')
85 | return result
86 |
87 | @classmethod
88 | def download(self, url, dest=None):
89 | '''Download a file from a url.
90 | '''
91 | if not dest:
92 | dest = self.basename(url)
93 | print 'Retrieving %s' % url
94 | prog = _Progress()
95 | urllib.urlretrieve(url, dest, reporthook=prog.dl_progress)
96 |
97 | # for backwards compatability
98 | @classmethod
99 | def dl(self, url, dest=None):
100 | return self.download(url, dest)
101 |
102 |
--------------------------------------------------------------------------------
/swiss/clitools.py:
--------------------------------------------------------------------------------
1 | '''Expose methods or functions as commands on the command line
2 |
3 | Example usage::
4 |
5 | # in your code
6 | from swiss.clitools import _main
7 | if __name__ == '__main__':
8 | # expose everything in current module
9 | _main(locals())
10 | # or if you have an object MyObject with methods you want to expose
11 | _main(MyObject)
12 | '''
13 | import os
14 | import sys
15 | import optparse
16 | import inspect
17 |
18 | def _object_methods(obj):
19 | methods = inspect.getmembers(obj, inspect.ismethod)
20 | methods = filter(lambda (name,y): not name.startswith('_'), methods)
21 | methods = dict(methods)
22 | return methods
23 |
24 | def _module_functions(functions):
25 | local_functions = dict(functions)
26 | for k,v in local_functions.items():
27 | if not inspect.isfunction(v) or k.startswith('_'):
28 | del local_functions[k]
29 | return local_functions
30 |
def _main(functions_or_object):
    '''Build an optparse-based CLI exposing public methods/functions, then
    dispatch to the action named in argv.

    @param functions_or_object: either a class (whose public methods become
        commands; an instance is created for dispatch) or a namespace dict of
        functions such as ``locals()``.

    Prints help and exits with status 1 when no valid action is given.
    '''
    isobject = inspect.isclass(functions_or_object)
    if isobject:
        _methods = _object_methods(functions_or_object)
    else:
        _methods = _module_functions(functions_or_object)

    # usage text lists each action with the first line of its docstring
    usage = '''%prog {action}

Actions:
    '''
    usage += '\n    '.join(
        [ '%s: %s' % (name, m.__doc__.split('\n')[0] if m.__doc__ else '') for (name,m)
        in sorted(_methods.items()) ])
    parser = optparse.OptionParser(usage)
    # Optional: for a config file
    # parser.add_option('-c', '--config', dest='config',
    #         help='Config file to use.')
    options, args = parser.parse_args()

    if not args or not args[0] in _methods:
        parser.print_help()
        sys.exit(1)

    method = args[0]
    # remaining positional args are passed straight through to the action
    if isobject:
        getattr(functions_or_object(), method)(*args[1:])
    else:
        _methods[method](*args[1:])
60 |
61 | __all__ = [ '_main' ]
62 |
63 | if __name__ == '__main__':
64 | _main(locals())
65 |
66 |
--------------------------------------------------------------------------------
/swiss/date.py:
--------------------------------------------------------------------------------
1 | '''Date parsing and normalization utilities based on FlexiDate.
2 |
3 | To parse dates use parse, e.g.::
4 |
5 | parse('1890') -> FlexiDate(year=u'1890')
6 | parse('1890?') -> FlexiDate(year=u'1890', qualifier='Uncertainty: 1985?')
7 |
8 | Once you have a FlexiDate you can get access to attributes (strings of course
9 | ...)::
10 |
11 | fd = parse('Jan 1890')
12 | fd.year # u'1890'
13 | fd.month # u'01'
14 |
15 | And convert to other forms:
16 |
17 | fd.as_float() # 1890
18 | fd.as_datetime() # datetime(1890,01,01)
19 |
20 | Background
21 | ==========
22 |
23 | FlexiDate is focused on supporting:
24 |
25 | 1. Dates outside of Python (or DB) supported period (esp. dates < 0 AD)
26 | 2. Imprecise dates (c.1860, 18??, fl. 1534, etc)
27 | 3. Normalization of dates to machine processable versions
28 | 4. Sortable in the database (in correct date order)
29 |
30 | For more information see:
31 |
32 | http://www.rufuspollock.org/2009/06/18/flexible-dates-in-python/
33 | '''
34 | import re
35 | import datetime
36 |
class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
        * Allow a trailing qualifiers e.g. fl.
        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
          can do 18??

    Restriction on ISO8601:
        * Truncation (e.g. of centuries) is *not* permitted.
        * No week and day representation e.g. 1999-W01
    """
    def __init__(self, year=None, month=None, day=None, qualifier=''):
        '''
        @param year/month/day: values convertible to unicode; stored
            zero-padded (year to 4 chars, month/day to 2).
        @param qualifier: free-text note (e.g. uncertainty) kept alongside.
        '''
        # force = month or day or qualifier
        force = False
        self.year = self._cvt(year, rjust=4, force=force)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        '''Normalize a date component to a zero-padded unicode string.'''
        if val:
            tmp = unicode(val).strip()
            # keep a leading '-' (BC years) outside of the zero padding
            if tmp.startswith('-'):
                tmp = '-' + tmp[1:].rjust(rjust, '0')
            else:
                tmp = tmp.rjust(rjust, '0')
            return tmp
        elif force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        else:
            return ''

    def __str__(self):
        out = self.isoformat()
        if self.qualifier:
            # leading space is important as ensures when no year sort in right
            # order as ' ' < '1'
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        WARNING: does not replace '?' in dates unless strict=True.
        '''
        out = self.year
        # what do we do when no year ...
        for val in [ self.month, self.day ]:
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # BUG FIX: the named groups below had lost their '<name>' markers
    # (e.g. '(?P -?[\d?]+)'), which is a regex syntax error; restored using
    # the group names referenced in from_str: year, month, day, qualifier.
    our_re_pat = '''
        (?P<year> -?[\d?]+)
        (?:
                \s* - (?P<month> [\d?]{1,2})
            (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier>[^]]*) \])?
    '''
    our_re = re.compile(our_re_pat, re.VERBOSE)
    @classmethod
    def from_str(cls, instr):
        '''Undo affect of __str__'''
        if not instr:
            return cls()

        out = cls.our_re.match(instr)
        if out is None: # no match TODO: raise Exception?
            return None
        else:
            return cls(
                out.group('year'),
                out.group('month'),
                out.group('day'),
                qualifier=out.group('qualifier')
                )

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0

        @return: float.
        '''
        if not self.year: return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
        if self.day:
            out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)
155 |
156 |
def parse(date, dayfirst=True):
    '''Parse a `date` into a `FlexiDate`.

    @param date: the date to parse - may be a string, datetime.date,
    datetime.datetime or FlexiDate.
    @param dayfirst: treat the first number of an ambiguous date such as
    05/07/2010 as the day (passed through to the string parser).

    TODO: support for quarters e.g. Q4 1980 or 1954 Q3
    TODO: support latin stuff like M.DCC.LIII
    TODO: convert '-' to '?' when used that way
    e.g. had this date [181-]
    '''
    if not date:
        return None
    # already parsed -- nothing to do
    if isinstance(date, FlexiDate):
        return date
    if isinstance(date, int):
        return FlexiDate(year=date)
    if isinstance(date, datetime.date):
        return PythonDateParser().parse(date)
    # otherwise assume it is a string
    result = DateutilDateParser().parse(date, dayfirst=dayfirst)
    if result is not None:
        return result
    # msg = 'Unable to parse %s' % date
    # raise ValueError(date)
    # could not parse: record the raw input in the qualifier instead of raising
    note = 'UNPARSED: %s' % date
    note = note.encode('ascii', 'ignore')
    return FlexiDate(qualifier=note)
187 |
188 |
class DateParserBase(object):
    '''Interface for date parsers: subclasses implement parse().'''

    def parse(self, date):
        '''Parse `date` into a FlexiDate. Must be overridden.'''
        raise NotImplementedError

    def norm(self, date):
        '''Parse `date` and return its normalised string form.'''
        return str(self.parse(date))
195 |
class PythonDateParser(object):
    '''Convert python datetime.date / datetime.datetime objects to FlexiDate.'''
    def parse(self, date):
        return FlexiDate(year=date.year, month=date.month, day=date.day)
199 |
200 | try:
201 | import dateutil.parser
202 | dateutil_parser = dateutil.parser.parser()
203 | except:
204 | dateutil_parser = None
205 |
class DateutilDateParser(DateParserBase):
    '''Date parser built on dateutil's low-level _parse, which (unlike the
    public dateutil.parser.parse) does not fill in default values.'''
    # matches strings consisting solely of digits
    _numeric = re.compile("^[0-9]+$")
    def parse(self, date, **kwargs):
        '''
        :param **kwargs: any kwargs accepted by dateutil.parse function.
        '''
        qualifiers = []
        # dateutil failed to import -- cannot parse strings at all
        if dateutil_parser is None:
            return None
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates
        if date.startswith('-') or 'BC' in date or 'B.C.' in date:
            pre0AD = True
        else:
            pre0AD = False
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: 'c.1950' or 'c1950'
        circa_match = re.match('(.*)c\.?\s*(\d+.*)', date)
        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            date = ''.join(circa_match.groups())

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records
        p_match = re.match("^p(\d+)", date)
        if p_match:
            date = date[1:]

        # Deal with uncertainty: '1985?'
        uncertainty_match = re.match('([0-9xX]{4})\?', date)
        if uncertainty_match:
            # remove the ?
            date = date[:-1]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        # NOTE(review): with python-dateutil >= 2.0 _parse returns a tuple;
        # this code assumes the 1.x API (see the version pin in setup.py).
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        #Note: Years of less than 3 digits not interpreted by
        #      dateutil correctly
        #      e.g. 87 -> 1987
        #           4  -> day 4 (no year)
        # Both cases are handled in this routine
        if res.year is None and res.day:
            year = res.day
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff
        if pre0AD:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            # e.g. "Note 'circa', Uncertainty : c.1985?"
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
282 |
283 |
--------------------------------------------------------------------------------
/swiss/deliveranceproxy.py:
--------------------------------------------------------------------------------
1 | '''Use deliverance_ for proxying and re-theming.
2 |
3 | .. _deliverance: http://packages.python.org/Deliverance/
4 |
5 | Usage requirements (in pip-requirements.txt format)::
6 |
7 | # suggest installing lxml directly
8 | lxml
9 | deliverance>=0.3a
10 | # for urlmap and proxy
11 | paste
12 | # for Response
13 | webob
14 |
15 | Example usage::
16 |
17 | dest = 'http://myremotes.ite/'
18 | mytheme = '....'
19 | my_deliverance_rules = ' ...'
20 | # or
21 | # my_deliverance_rules = open('/my/path/to/rules.xml').read()
22 | deliverance_proxy = create_deliverance_proxy(mytheme, dest,
23 | my_deliverance_rules)
24 |
25 | # from in wsgi app
26 | # path on remote destination url you want to proxy to ...
27 | # you can omit this if local path and remote path are the same
28 | environ['PATH_INFO'] = '/my_destination_path'
29 | deliverance_proxy(environ, start_response)
30 | '''
31 | import logging
32 |
33 | import paste.urlmap
34 | import deliverance.middleware
35 | import paste.proxy
36 | from webob import Request, Response
37 | from deliverance.middleware import DeliveranceMiddleware, SubrequestRuleGetter
38 | from deliverance.log import PrintingLogger
39 |
40 |
41 | default_deliverance_rules = \
42 | '''
43 |
44 |
45 |
47 |
48 |
49 |
50 |
51 |
54 |
55 |
56 | '''
57 |
def create_deliverance_proxy(proxy_base_url, theme_html, rules_xml=None):
    '''Proxy to another url with re-theming using deliverance.

    Based on http://rufuspollock.org/code/deliverance

    :param proxy_base_url: base destination url we are proxying to.
    :param theme_html: string providing html theme to use for re-themeing.
    :param rules_xml: (optional) deliverance rules xml as a string. If not
        provided use `default_deliverance_rules`. For info on rulesets see
        deliverance docs. We require that ruleset support a single
        substitution string '%s' which is used to insert internal mountpoint
        for the them ('/_deliverance_theme.html').
    :return: a WSGI application (DeliveranceMiddleware) that proxies and
        re-themes responses.
    '''
    theme_url = '/_deliverance_theme.html'
    # use a urlmap so we can mount theme and urlset
    app = paste.urlmap.URLMap()
    # set up theme consistent with our rules file
    app[theme_url] = Response(theme_html)

    if rules_xml:
        rules = rules_xml
    else:
        rules = default_deliverance_rules
    # substitute the internal theme mountpoint into the ruleset
    rules = rules % theme_url
    app['/_deliverance_rules.xml'] = Response(rules, content_type="application/xml")

    class MyProxy(object):
        # thin wrapper around paste.proxy.Proxy that decompresses responses
        # so deliverance can rewrite the HTML
        def __init__(self, proxy_base_url):
            self.proxy = paste.proxy.Proxy(proxy_base_url)

        def __call__(self, environ, start_response):
            req = Request(environ)
            res = req.get_response(self.proxy)
            # undo gzip/deflate content-encoding before re-theming
            res.decode_content()
            return res(environ, start_response)

    app['/'] = MyProxy(proxy_base_url)
    deliv = DeliveranceMiddleware(app, SubrequestRuleGetter('/_deliverance_rules.xml'),
            PrintingLogger,
            log_factory_kw=dict(print_level=logging.WARNING))
    return deliv
99 |
100 |
--------------------------------------------------------------------------------
/swiss/id.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import uuid
3 |
def compress_uuid(_uuid):
    '''Provided shortened string representation of UUID via base64 encoding.

    @return: 22 character base64 encoded version of UUID.
    '''
    # accept either a UUID instance or its string form
    if isinstance(_uuid, basestring):
        _uuid = uuid.UUID(_uuid)
    # url-ish alphabet: '_' and '-' replace the standard '+' and '/'
    encoded = base64.b64encode(_uuid.bytes, '_-')
    # 16 bytes always encode to 22 chars plus '==' padding; drop the padding
    return encoded[:22]
14 |
def uncompress_uuid(b64_encoded):
    '''Reverse compress_uuid

    @return: 36 char str representation of uuid.
    '''
    # coerce unicode to str and restore the '==' padding dropped on encode
    padded = str(b64_encoded)
    if not padded.endswith('=='):
        padded += '=='
    raw = base64.b64decode(padded, '_-')
    return str(uuid.UUID(bytes=raw))
26 |
27 |
28 | import struct
29 | def int_to_b32(int_):
30 | out = struct.pack('1i', int_)
31 | out = base64.b32encode(out)
32 | # throw away trailing '='
33 | return out[:-1]
34 |
def b32_to_int(b32):
    '''Decode a 7 character base32 string (as from int_to_b32) to an integer.'''
    # restore the single '=' pad stripped by int_to_b32; accept lowercase too
    raw = base64.b32decode(b32 + '=', casefold=True)
    return struct.unpack('1i', raw)[0]
39 |
40 |
--------------------------------------------------------------------------------
/swiss/misc.py:
--------------------------------------------------------------------------------
1 | # TODO: create a strict option where None is returned on failed convert rather
2 | # than original value
3 | placeholders = [ '', '-', '#' ]
def floatify(value):
    '''Convert value to a float if possible.

    @return: Floatified value. If value is blank or placeholder ('-') return
    None. Can deal with ',' in value. Will also floatify dates. If nothing
    works returns original value.
    '''
    if value is None:
        return None
    if isinstance(value, basestring):
        stripped = value.strip()
        if not stripped or stripped in placeholders:
            return None
        else:
            # often numbers have commas in them like 1,030
            v = value.replace(',', '')
            try:
                return float(v)
            # FIX: was a bare except, which also swallowed KeyboardInterrupt
            # and SystemExit; float() only raises these two here
            except (ValueError, TypeError):
                pass
    # will return original value if fails
    return date_to_float(value)
27 |
def floatify_matrix(matrix):
    '''Apply floatify to every cell of a matrix (list of rows).'''
    return [ [ floatify(col) for col in row ] for row in matrix ]
30 |
31 | # TODO: remove/convert to using date.FlexiDate.as_float()
32 | import datetime
def date_to_float(date):
    '''Convert a date to float.

    Accepts either a date object or a string parseable to a date object

    @return: converted value or original if conversion fails
    '''
    import dateutil.parser
    if isinstance(date, basestring):
        try: # simple year
            return float(date)
        except:
            pass
        try:
            # default supplies missing fields; year 1 marks "not specified"
            val = dateutil.parser.parse(date, default=datetime.date(1,1,1))
        except:
            # unparseable string: hand back the original unchanged
            return date
    else:
        val = date

    if isinstance(val, datetime.date):
        # year plus fractional month/day (months assumed equal length)
        fval = val.year + val.month / 12.0 + val.day / 365.0
        return round(fval, 3)
    else:
        return val
58 |
def make_series(matrix, xcol, ycols=None):
    '''Take a matrix and return series (i.e. list of tuples) corresponding to
    specified column indices.

    E.g. if matrix is:
        [ [1,2,3,4]
          [5,6,7,8] ]

    and xcol = 0, ycols=[1,3] then output is:

        [
          [ [1,2], [5,6] ],
          [ [1,4], [5,8] ],
        ]

    If ycols not defined then return all possible series (excluding xcol
    with itself).

    @param matrix: list of rows.
    @param xcol: index of the column providing x values.
    @param ycols: indices of the y columns (default: every column but xcol).
    @return: list of series; tuples with a blank/placeholder member dropped.
    '''
    # NOTE(review): relies on Python 2 semantics -- zip()/filter() returning
    # lists and range() supporting del; confirm before porting to Python 3.
    cols = zip(*matrix)
    if ycols is None:
        ycols = range(len(cols))
        del ycols[xcol]
    cols = floatify_matrix(cols)
    def is_good(value):
        # drop missing values (None from floatify) and placeholder strings
        if value is None: return False
        tv = str(value)
        stopchars = [ '', '-' ]
        if tv in stopchars:
            return False
        return True
    def is_good_tuple(tuple):
        # keep a point only if both coordinates are usable
        return is_good(tuple[0]) and is_good(tuple[1])

    xcoldata = cols[xcol]
    ycols = [ cols[ii] for ii in ycols ]
    series = [ filter(is_good_tuple, zip(xcoldata, col)) for col in ycols ]
    return series
96 |
97 |
--------------------------------------------------------------------------------
/swiss/parse/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/parse/__init__.py
--------------------------------------------------------------------------------
/swiss/parse/name.py:
--------------------------------------------------------------------------------
1 | '''Parse names of people into a standard format.'''
2 |
3 | import re
4 |
5 | titles = [
6 | u'Ayatollah',
7 | u'Baron',
8 | u'Bishop',
9 | u'Dame',
10 | u'Dr',
11 | u'Fr',
12 | u'Graf',
13 | u'King',
14 | u'Lady',
15 | u'Maj',
16 | u'Major',
17 | u'Mrs',
18 | u'Prof',
19 | u'Rev',
20 | u'Sir',
21 | u'St',
22 | ]
23 |
class Name(object):
    '''A name of a person or entity.

    Not a domain object but a convenient way to handle/parse names.

    Attributes:
        title
        ln: last name
        firstnames: first names as list
    '''
    def __init__(self, ln='', fns=None, title=''):
        self.ln = ln
        # avoid a shared mutable default: each instance gets its own list
        self.fns = [] if fns is None else fns
        self.title = title

    def norm(self):
        '''Return normalised name string (LastFirst format)
        '''
        return name_tostr(self)

    def __str__(self):
        '''Display name using normalised format
        '''
        return self.norm()
49 |
class NameParserBase(object):
    '''Shared machinery for name parsers: case normalisation, title handling.

    Subclasses implement _toparts (string -> Name) and tostr (Name -> string).
    '''
    # 'Bosch.' -> 'Bosch' : strip a full stop following a word of 2+ letters
    regex_remove_fullstops = re.compile(r'(\w{2,})\.(\W|$)', re.UNICODE)

    def parse(self, fullname):
        '''Parse the `fullname` string into a `Name` object.

        @return: `Name` object for `fullname` (empty Name for None/blank).
        '''
        if fullname is None:
            return Name()
        fullname = unicode(fullname.strip())
        if not fullname:
            return Name()

        # remove words ending '.', e.g. 'Bosch.'
        fullname = self.regex_remove_fullstops.sub(r'\1\2', fullname)

        # make sure initials are separated by ' '
        # but first deal with special edge case like [Major.]
        # fullname = fullname.replace('.]', ']')
        fullname = fullname.replace('.', '. ')
        name = self._toparts(fullname)
        name.ln = self.normcase(name.ln)
        name.fns = [ self.normcase(x) for x in name.fns ]
        name.title = self.normcase(name.title)
        return name

    def _toparts(self, fullname):
        '''Implement in inheriting classes, called by parse.
        '''
        raise NotImplementedError()

    def tostr(self, name):
        '''Convert name object back into a string.
        '''
        raise NotImplementedError()

    def normcase(self, name):
        '''Normalise capitalisation: all-upper or all-lower -> capitalized.'''
        # useful to handle none and you often get this from regexes
        if name is None:
            return ''
        name = name.strip()
        if name.upper() == name or name.lower() == name:
            return name.capitalize()
        # avoid issues with e.g. McTaggart
        else:
            return name

    def untitlize(self, _str):
        '''Return title contained in _str if a title else return empty string.
        '''
        # BUG FIX: strip whitespace and parentheses cumulatively; previously
        # the second strip restarted from _str, discarding the whitespace
        # strip, so e.g. ' (Dr) ' was never recognised as a title.
        title = _str.strip()
        title = title.strip('()')
        if title in titles:
            return title
        # always assume something in square brackets is a title
        elif title.startswith('[') and title.endswith(']'):
            return title[1:-1].strip()
        else:
            return ''

    def titlize(self, _str):
        '''Wrap a title in the canonical square brackets.'''
        return u'[' + _str + u']'

    def norm(self, date):
        '''Parse and return the normalised string form.'''
        return str(self.parse(date))
117 |
class LastFirst(NameParserBase):
    '''Parse and creates names of form:

        lastname, first-names-in-order [title]
    '''
    def _toparts(self, fullname):
        '''Split a "Last, First names [Title]" string into a `Name`.

        @raise ValueError: if `fullname` contains spaces but no comma (it
            is probably a FirstLast-style name).
        '''
        if ',' not in fullname and ' ' in fullname:
            raise ValueError('Expected "," in name: %s' % fullname)
        name = Name()
        # NB: if more than 2 commas just ignore stuff after 2nd one
        parts = fullname.split(',')
        name.ln = parts[0]
        # Fix: a single-token name (e.g. 'Smith') has no comma, in which
        # case parts[1] used to raise IndexError -- treat it as having no
        # first names.
        if len(parts) > 1:
            name.fns = parts[1].strip().split()
        else:
            name.fns = []
        if name.fns:
            title = self.untitlize(name.fns[-1])
            if title:
                name.title = title
                del name.fns[-1]
        return name

    def tostr(self, name):
        '''Serialise `name` as 'Last, First names [Title]' ('' if empty).'''
        if name.ln or name.fns:
            fns = ' '.join(name.fns)
            if not fns:
                out = name.ln
            else:
                out = unicode(', '.join((name.ln, ' '.join(name.fns))))
        else:
            return ''
        if name.title:
            out = out + u' [%s]' % name.title
        return out
150 |
151 |
class FirstLast(NameParserBase):
    '''Parse and create names of form:

        [title] first-names last-name
    '''
    def _toparts(self, fullname):
        '''Split a "[Title] First names Last" string into a `Name`.

        @raise ValueError: if `fullname` contains a comma (it is probably
            a LastFirst-style name).
        '''
        name = Name()
        if ',' in fullname:
            raise ValueError('Should not have "," in FirstLast type name: %s' %
                    fullname)
        parts = fullname.split()
        # Fix: an empty/whitespace-only string used to IndexError on
        # parts[-1]; return an empty Name instead.
        if not parts:
            return name
        name.ln = parts[-1]
        name.fns = parts[:-1]
        if name.fns:
            title = self.untitlize(name.fns[0])
            if title:
                name.title = title
                del name.fns[0]
        return name

    def tostr(self, name):
        '''Serialise `name` as '[Title] First names Last' ('' if empty).'''
        if name.fns or name.ln:
            # Fix: join only the parts that exist -- previously a name with
            # no first names came out with a spurious leading space (and a
            # missing last name left a trailing one).
            parts = list(name.fns)
            if name.ln:
                parts.append(name.ln)
            out = u' '.join(parts)
        else:
            return ''
        if name.title:
            # Fix: separate the title from the name (was '[Dr]John Smith')
            # so the output round-trips through _toparts/untitlize.
            out = u'[%s] ' % name.title + out
        return out
180 |
181 |
def parse_name(fullname):
    '''Parse `fullname`, auto-detecting the format.

    A comma selects the LastFirst parser; otherwise FirstLast is used.
    '''
    parser = LastFirst() if ',' in fullname else FirstLast()
    return parser.parse(fullname)
188 |
def name_tostr(name, parser_class=LastFirst):
    '''Serialise a `Name` object via `parser_class` (default: LastFirst).'''
    return parser_class().tostr(name)
192 |
def normalize(name_str, parser_class=LastFirst):
    '''Parse `name_str` and return it re-serialised via `parser_class`.'''
    return name_tostr(parse_name(name_str), parser_class)
196 |
197 |
198 |
--------------------------------------------------------------------------------
/swiss/tabular/__init__.py:
--------------------------------------------------------------------------------
1 | from base import *
2 | from misc import *
3 | from xls import XlsReader
4 | from html import *
5 | from tabular_json import JsonReader, JsonWriter
6 | from txt import TxtWriter
7 |
8 |
--------------------------------------------------------------------------------
/swiss/tabular/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Tools for dealing with tabular data
3 | """
4 |
class TabularData(object):
    """Holder for tabular data

    NB:
      * Assume data organized in rows.
      * No type conversion so all data will be as entered.

    Properties:
      * data: data itself provided as array of arrays
      * header: associated header columns (if they exist)

    TODO: handling of large datasets (iterators?)
    """

    def __init__(self, data=None, header=None):
        """
        Initialize object. If data or header not set they are defaulted to
        empty list.

        NB: must use None as default value for arguments rather than []
        because [] is mutable and using it will result in subtle bugs. See:
        'Default parameter values are evaluated when the function definition
        is executed.' [http://www.python.org/doc/current/ref/function.html]
        """
        self.data = []
        self.header = []
        if data is not None:
            self.data = data
        if header is not None:
            self.header = header

    def __repr__(self):
        out = []
        if self.header:
            out.append(self.header)
        # limit to 10 items so huge datasets stay printable
        out += self.data[0:10]
        return repr(out)

    def __str__(self):
        return repr(self)

    def __iter__(self):
        # iterate over data rows only (header is not included)
        return self.data.__iter__()

    @classmethod
    def from_list(cls, list_, header=True):
        """Build a TabularData from a plain list of rows.

        @param header: if True (default) the first row is taken as the
            header; if False all rows are data.

        Fix: the `header` flag used to be ignored (the first row was always
        consumed as a header), and the constructor was hard-coded to
        TabularData with `self` misused as the classmethod's first argument
        -- now uses `cls` so subclasses construct instances of themselves.
        """
        if header:
            return cls(header=list_[0], data=list_[1:])
        else:
            return cls(data=list(list_))

    def to_list(self):
        """Return the data as a plain list of rows, header (if any) first."""
        if self.header:
            return [ self.header ] + self.data
        else:
            return self.data
59 |
60 |
class ReaderBase(object):
    '''Base class for tabular readers: resolves a filepath or file object
    and records the encoding used to decode it.
    '''
    def __init__(self, filepath_or_fileobj=None, encoding='utf8'):
        '''
        @param filepath_or_fileobj: path to a file or a file-like object.
        @param encoding: encoding of the underlying data (default 'utf8').
        '''
        self.filepath = None
        self.fileobj = None
        self._filepath_or_fileobj(filepath_or_fileobj)
        # Fix: honour the caller-supplied encoding -- it was previously
        # ignored and hard-coded to 'utf8'.
        self.encoding = encoding

    def _filepath_or_fileobj(self, filepath_or_fileobj):
        # Resolve the argument into (self.filepath, self.fileobj).
        if filepath_or_fileobj is None: # do not overwrite any existing value
            pass
        elif isinstance(filepath_or_fileobj, basestring):
            self.filepath = filepath_or_fileobj
            self.fileobj = open(self.filepath)
        else:
            self.filepath = None
            self.fileobj = filepath_or_fileobj

    def read(self, filepath_or_fileobj=None):
        '''(Re)bind the source; subclasses extend this to do the reading.'''
        self._filepath_or_fileobj(filepath_or_fileobj)
80 |
81 |
class WriterBase(object):
    '''Base class for tabular writers.

    Extra arguments to write methods:
        has_row_headings: first col of each row is a heading.
    '''
    def __init__(self, round_ndigits=None, **kwargs):
        '''
        @round_ndigits: number of decimal places to use when rounding numerical
        values when textifying for output
        '''
        self.round_ndigits = round_ndigits

    def write(self, tabular_data, fileobj, *args, **kwargs):
        '''Write `tabular_data` to `fileobj`; overridden by subclasses.'''
        pass

    def write_str(self, tabular_data, *args, **kwargs):
        '''Like `write` but returns the output as a string.'''
        from StringIO import StringIO
        buf = StringIO()
        self.write(tabular_data, buf, *args, **kwargs)
        buf.seek(0)
        return buf.read()

    def value_to_str(self, value):
        '''Convert value to text (rounding floats/ints as necessary).
        '''
        if value is None:
            return ''
        is_number = isinstance(value, (int, float))
        if self.round_ndigits is None or not is_number:
            return unicode(value)
        rounded = round(value, self.round_ndigits)
        if self.round_ndigits <= 0: # o/w will have in .0 at end
            rounded = int(rounded)
        rounded = str(rounded)
        # deal with case when rounding has added unnecessary digits
        if len(str(value)) < len(rounded):
            return str(value)
        return rounded
122 |
123 |
124 | import csv
125 | import codecs
126 | class UTF8Recoder:
127 | """
128 | Iterator that reads an encoded stream and reencodes the input to UTF-8
129 |
130 | From:
131 | """
132 | def __init__(self, f, encoding=None):
133 | if encoding:
134 | self.reader = codecs.getreader(encoding)(f)
135 | else: # already unicode so just return f
136 | self.reader = f
137 |
138 | def __iter__(self):
139 | return self
140 |
141 | def next(self):
142 | return self.reader.next().encode('utf-8')
143 |
class CsvReader(ReaderBase):
    """Read data from a csv file into a TabularData structure

    Note that the csv module does *not* support unicode:

    > This version of the csv module doesn't support Unicode input. Also, there
    > are currently some issues regarding ASCII NUL characters. Accordingly,
    > all input should be UTF-8 or printable ASCII to be safe; see the examples
    > in section 9.1.5. These restrictions will be removed in the future.
    >
    """

    def read(self, filepath_or_fileobj=None, encoding=None, **kwargs):
        """Read in a csv file and return a TabularData object.

        @param fileobj: file like object.
        @param encoding: if set use this instead of default encoding set in
            __init__ to decode the file like object. NB: will check if fileobj
            already in unicode in which case this is ignored.
        @param kwargs: all further kwargs are passed to the underlying
            `csv.reader` function
        @return tabular data object (all values encoded as utf-8).
        """
        super(CsvReader, self).read(filepath_or_fileobj)
        if encoding:
            self.encoding = encoding
        tabData = TabularData()

        # NOTE(review): this reads the *entire* stream just to sniff for a
        # header -- fine for small files, memory-hungry for large ones.
        sample = self.fileobj.read()
        # first do a simple test -- maybe sample is already unicode
        if type(sample) == unicode:
            encoded_fo = UTF8Recoder(self.fileobj, None)
        else:
            sample = sample.decode(self.encoding)
            encoded_fo = UTF8Recoder(self.fileobj, self.encoding)
        # the csv module is byte-oriented, so the sniffer gets utf-8 bytes
        sample = sample.encode('utf-8')
        sniffer = csv.Sniffer()
        hasHeader = sniffer.has_header(sample)

        # rewind: the sniff consumed the stream (requires a seekable fileobj)
        self.fileobj.seek(0)
        ourkwargs = {
            'skipinitialspace': True
        }
        if kwargs:
            ourkwargs.update(kwargs)

        reader = csv.reader(encoded_fo, **ourkwargs)
        if hasHeader:
            # first row becomes the header rather than a data row
            tabData.header = reader.next()
        for row in reader:
            tabData.data.append(row)
        return tabData

# for backwards compatibility
ReaderCsv = CsvReader
198 |
class CsvWriter(WriterBase):
    '''Write a TabularData object out as CSV (header row first, if any).'''
    # TODO: unicode support a la CsvReader
    def write(self, tabular_data, fileobj, encoding='utf-8'):
        '''Write `tabular_data` to `fileobj` as CSV.

        NB: `encoding` is currently unused (see TODO above).
        '''
        writer = csv.writer(fileobj)
        rows = list(tabular_data.data)
        if tabular_data.header:
            rows.insert(0, tabular_data.header)
        for row in rows:
            writer.writerow(row)
        fileobj.flush()
208 |
209 |
210 | ## --------------------------------
211 | ## Converting to Latex
212 |
class LatexWriter(WriterBase):
    '''Write tabular data out as the body of a LaTeX table.'''

    def write(self, tabular_data, fileobj, has_row_headings=False):
        '''Render `tabular_data` as LaTeX table rows and write to `fileobj`.

        @param has_row_headings: treat the first cell of each row as a
            heading (rendered bold).
        '''
        self.has_row_headings = has_row_headings
        has_header = len(tabular_data.header) > 0
        # Fix: build a fresh list rather than insert()-ing the header into
        # tabular_data.data, which mutated the caller's object (so a second
        # write duplicated the header row).
        if has_header:
            matrix = [tabular_data.header] + list(tabular_data.data)
        else:
            matrix = tabular_data.data
        out = self._write(matrix, has_header)
        fileobj.write(out)

    def _write(self, matrix, has_header=True):
        '''Render `matrix` (a list of rows) to a LaTeX string.'''
        if len(matrix) == 0: return
        # no hline on first row as this seems to mess up latex \input
        # http://groups.google.com/group/comp.text.tex/browse_thread/thread/1e1db553a958ebd8/0e590a22cb59f43d
        out = '%s' % self.process_row(matrix[0], has_header)
        for row in matrix[1:]:
            out += self.process_row(row)
        return out

    def process_row(self, row, heading=False):
        '''Render one row, terminated with \\\\ and an \\hline.'''
        if len(row) == 0: return
        out = '%s' % self.process_cell(row[0], heading or self.has_row_headings)
        for cell in row[1:]:
            out += ' & %s' % self.process_cell(cell, heading)
        out += ' \\\\\n\hline\n'
        return out

    def process_cell(self, cell, heading=False):
        '''Render one cell; headings are emboldened.'''
        cell_text = self.value_to_str(cell)
        cell_text = self.escape(cell_text)
        if heading:
            return '\\textbf{%s}' % cell_text
        else:
            return cell_text

    def escape(self, text):
        '''Backslash-escape the LaTeX special characters handled here.'''
        escape_chars = [ '&', '%' ]
        out = text
        for ch in escape_chars:
            out = out.replace(ch, '\\%s' % ch)
        return out
255 |
256 |
# TODO: 2009-08-05 deprecate
def table2latex(matrix, has_header=True, has_row_headings=False):
    '''Render a plain list-of-rows matrix to LaTeX (legacy helper).'''
    writer = LatexWriter()
    writer.has_row_headings = has_row_headings
    return writer._write(matrix, has_header)
262 |
263 |
--------------------------------------------------------------------------------
/swiss/tabular/gdocs.py:
--------------------------------------------------------------------------------
1 | '''TabularData from a Google Docs Spreadsheet.
2 | '''
3 | from base import ReaderBase, TabularData
4 | import gdata.spreadsheet.service
5 | import gdata.spreadsheet.text_db
6 |
7 |
class GDocsReaderTextDb(ReaderBase):
    '''Read a google docs spreadsheet using the gdata.spreadsheet.text_db
    library.

    NB: any blank line in spreadsheet will be taken as terminating data.
    '''
    def __init__(self, spreadsheet_id, username=None, password=None,
            id_is_name=False):
        '''
        @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you
        must set id_is_name to True.
        @param username, password: google account credentials, handed to the
            gdata text_db client (network access happens on read, not here).
        '''
        # do not pass spreadsheet_id down as it will be url or sheet name
        super(GDocsReaderTextDb, self).__init__()
        self.source = spreadsheet_id
        self.id_is_name = id_is_name
        self.gd_client = gdata.spreadsheet.text_db.DatabaseClient(
            username=username,
            password=password)

    def load_text_db_table(self, sheet_name='Sheet1'):
        '''Load text_db Table object corresponding to specified sheet_name.

        @raise AssertionError: if no spreadsheet matches self.source.
        '''
        super(GDocsReaderTextDb, self).read(None)
        if self.id_is_name:
            dbs = self.gd_client.GetDatabases(name=self.source)
        else:
            dbs = self.gd_client.GetDatabases(spreadsheet_key=self.source)
        assert len(dbs) >= 1, 'No spreadsheet of that name/id'
        # if several spreadsheets share the name, use the first match
        db = dbs[0]
        table = db.GetTables(name=sheet_name)[0]
        return table

    def read(self, sheet_name='Sheet1'):
        '''Load the specified google spreadsheet worksheet as a L{TabularData}
        object.

        @return L{TabularData} object.
        '''
        text_db_table = self.load_text_db_table(sheet_name)
        tdata = TabularData()
        text_db_table.LookupFields()
        tdata.header = text_db_table.fields
        # finds all records it seems
        rows = text_db_table.FindRecords('')
        for row in rows:
            # rebuild each row in header-column order
            rowdata = []
            for colname in tdata.header:
                rowdata.append(row.content[colname])
            tdata.data.append(rowdata)
        return tdata
59 |
60 |
61 | # not yet working properly (cannot work out ListFeed yet ...)
62 | # textdb is nicer but Spreadsheet allows one to get all cells using CellsFeed
63 | # (even when blank lines) (this is not true when using ListFeed though ...)
64 | # class GDocsReaderSpreadsheet(ReaderBase):
65 | # '''
66 | #
67 | # From Docs for the API:
68 | #
69 | #
70 | # > The list feed contains all rows after the first row up to the first blank
71 | # row. The first blank row terminates the data set. If expected data isn't
72 | # appearing in a feed, check the worksheet manually to see whether there's an
73 | # unexpected blank row in the middle of the data. In particular, if the
74 | # second row of the spreadsheet is blank, then the list feed will contain no
75 | # data.
76 | # '''
77 | # def __init__(self, spreadsheet_id, username=None, password=None,
78 | # id_is_name=False):
79 | # '''
80 | # @param: spreadsheet_id: gdoc id or name (?key={id} in url). If name you
81 | # must set id_is_name to True.
82 | # '''
83 | # # do not pass spreadsheet_id down as it will be url or sheet name
84 | # super(GDocsReaderSpreadsheet, self).__init__()
85 | # self.source = spreadsheet_id
86 | # self.id_is_name = id_is_name
87 | # self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
88 | # self.gd_client.email = username
89 | # self.gd_client.password = password
90 | #
91 | # def read(self, sheet_index=0):
92 | # '''Load the specified google spreadsheet worksheet as a L{TabularData}
93 | # object.
94 | #
95 | # @return L{TabularData} object.
96 | # '''
97 | # super(GDocsReaderSpreadsheet, self).read(None)
98 | # self.gd_client.source = self.source
99 | # self.gd_client.ProgrammaticLogin()
100 | # if self.id_is_name:
101 | # feed = self.gd_client.GetSpreadsheetsFeed()
102 | # # no len on feed ...
103 | # # assert len(feed) > 0, 'No spreadsheets found for: %s' % self.source
104 | # spreadsheet_id = feed.entry[0].id.text.split('/')[-1]
105 | # else:
106 | # spreadsheet_id = self.source
107 | # sheetfeed = self.gd_client.GetWorksheetsFeed(spreadsheet_id)
108 | # wrksht_id = sheetfeed.entry[sheet_index].id.text.split('/')[-1]
109 | # row_feed = self.gd_client.GetListFeed(spreadsheet_id, wrksht_id)
110 | #
111 | # tdata = TabularData()
112 | # # tdata.header
113 | # # how do we get rows rather than just all the cells?
114 | # for i, entry in enumerate(row_feed.entry):
115 | # print entry.content['col1']
116 | # print entry.content
117 | # tdata.data.append([entry.content.text])
118 | # return tdata
119 |
120 |
--------------------------------------------------------------------------------
/swiss/tabular/html.py:
--------------------------------------------------------------------------------
1 | import re
2 | from HTMLParser import HTMLParser
3 |
4 | from base import TabularData, ReaderBase, WriterBase
5 |
6 |
class HtmlReader(ReaderBase):
    '''Read data from HTML table into L{TabularData}.
    '''
    def read(self, filepath_or_fileobj=None, table_index=0):
        '''Read data from fileobj.

        NB: post read all tables extracted are in attribute named 'tables'.

        @arg table_index: if multiple tables in the html return table at this
        index.
        @return: L{TabularData} object (all content in the data part, i.e. no
        header).
        '''
        super(HtmlReader, self).read(filepath_or_fileobj)
        html = self.fileobj.read()
        extractor = _OurTableExtractor()
        extractor.reset()
        extractor.feed(html)
        # keep every table found; hand back only the requested one
        self.tables = extractor.tables
        return self.tables[table_index]
27 |
28 |
class _OurTableExtractor(HTMLParser):
    '''HTMLParser subclass that collects each <table> in the input as a
    TabularData object; results are available in self.tables after feed().

    # TODO: tbody, thead etc
    # TODO: nested tables

    # TODO: will barf on bad html so may need to run tidy first ...
    # tidy -w 0 -b -omit -asxml -ascii
    '''
    def reset(self):
        # reset the underlying parser state *and* our accumulators
        HTMLParser.reset(self)
        self.tables = []   # completed tables (TabularData objects)
        self._rows = []    # rows of the table currently being parsed
        self._row = []     # cells of the row currently being parsed
        self._text = ''    # text of the cell currently being parsed

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self._row = []
        elif tag == 'td' or tag == 'th':
            self._text = ''
        elif tag == 'br':
            # preserve explicit line breaks inside a cell
            self._text += '\n'

    def handle_endtag(self, tag):
        if tag == 'tr':
            self._rows.append(self._row)
        if tag == 'td' or tag == 'th':
            self._row.append(self._text)
        if tag == 'table':
            # everything goes in data; no header detection is attempted
            self.tables.append(TabularData(data=self._rows))
            self._rows = []

    def handle_data(self, data):
        # NOTE(review): strips whitespace per chunk, so text split across
        # entities/child elements is concatenated without separating spaces.
        self._text += data.strip()
63 |
64 |
65 | import re
# NOTE(review): several HTML tag string literals in this class appear to have
# been stripped by whatever produced this copy of the file (table/caption/
# thead/tr/th/td tags are missing, some literals are split across lines and a
# few source lines are absent entirely), leaving the class unparseable as-is.
# The code is left untouched below pending recovery of the original; comments
# document apparent intent only.
class HtmlWriter(WriterBase):
    """
    Write tabular data to xhtml
    """

    def __init__(self, round_ndigits=2, pretty_print=False, table_attributes = {'class': 'data'}):
        """
        @pretty_print: whether to pretty print (indent) output
        @table_attributes: dictionary of html attribute name/value pairs to be
        added to the table element
        """
        # NOTE(review): the dict default is shared across all instances
        # (mutable default argument) -- confirm/fix upstream.
        super(HtmlWriter, self).__init__(round_ndigits)
        self.pretty_print = pretty_print
        self.table_attributes = table_attributes

    def write(self, tabulardata, fileobj, caption = '', rowHeadings = []):
        """
        Write matrix of data to xhtml table.
        Allow for addition of row and column headings

        @return xhtml table containing data

        @param data: table of data that makes up table
        @param caption: the caption for the table (if empty no caption created)
        @param rowHeadings: additional headings for rows (separate from
        tabulardata)
        """
        # NOTE(review): rowHeadings=[] is also a shared mutable default.
        columnHeadings = tabulardata.header
        data = tabulardata.data
        haveRowHeadings = (len(rowHeadings) > 0)

        htmlTable = ''

        # deal with caption
        if caption != '':
            # NOTE(review): presumably wrapped caption in <caption> tags --
            # the tag literals are missing from this copy.
            htmlTable += '%s' % caption

        # deal with col headings
        # if we there are rowHeadings may want to add blank column at front
        numColHeads = len(columnHeadings)
        if numColHeads > 0:
            if haveRowHeadings and numColHeads == len(data[0]):
                # [[TODO: is this dangerous? should i make a copy ...]]
                # NOTE(review): this mutates tabulardata.header in place.
                columnHeadings.insert(0, '')
            htmlTable += self.writeHeading(columnHeadings)

        htmlTable += ''
        if self.pretty_print:
            htmlTable += '\n'

        for ii in range(0, len(data)):
            # have to add 1 as first row is headings
            if haveRowHeadings:
                htmlTable += self.writeRow(data[ii], rowHeadings[ii])
            else:
                htmlTable += self.writeRow(data[ii])

        htmlTable += '
'

        if self.pretty_print:
            fileobj.write(self.prettyPrint(htmlTable))
        else:
            fileobj.write(htmlTable)

    def value_to_str(self, value):
        # html-escape on top of the base class textification
        import cgi
        out = super(HtmlWriter, self).value_to_str(value)
        out = cgi.escape(out)
        return out

    def writeHeading(self, row):
        """
        Write heading for html table ()
        """
        result = ''
        result += self.writeGeneralRow(row, 'th')
        result += '
'
        if self.pretty_print:
            result += '\n'
        return result

    def writeRow(self, row, rowHeading = ''):
        # optional row heading cell, then the data cells
        result = ''
        if rowHeading != '':
            result = '%s | ' % self.value_to_str(rowHeading)
        result += self.writeGeneralRow(row, 'td')
        result = '%s
' % result
        if self.pretty_print:
            result += '\n'
        return result

    def writeGeneralRow(self, row, tagName):
        # emit each cell wrapped in tagName (th or td)
        result = ''
        for ii in range(len(row)):
            result += '<%s>%s%s>' % (tagName, self.value_to_str(row[ii]), tagName)
        return result

    def prettyPrint(self, html):
        """pretty print html using HTMLTidy"""
        # [[TODO: strip out html wrapper stuff that is added (head, body etc)
        try:
            import mx.Tidy
            out = mx.Tidy.tidy(html, None, None, wrap = 0, indent = 'yes')[2]
        except:
            # best-effort: fall back to the raw html if mx.Tidy is absent
            out = html
        return self.tabify(out)

    def tabify(self, instr, tabsize = 2):
        """
        tabify text by replacing spaces of size tabSize by tabs
        """
        whitespace = tabsize * ' '
        return re.sub(whitespace, '\t', instr)


# for backwards compatibility
# 2008-05-30
WriterHtml = HtmlWriter
187 |
188 |
189 |
--------------------------------------------------------------------------------
/swiss/tabular/misc.py:
--------------------------------------------------------------------------------
1 | '''General Helper methods for tabular data.
2 | '''
3 | from base import TabularData
4 |
def transpose(data):
    '''Transpose a list of lists (rows become columns).

    Equivalent to calling zip(*data) directly.
    '''
    return zip(*data)
11 |
def select_columns(matrix, cols):
    '''Return a matrix with only those column indexes in `cols`.

    Columns appear in ascending index order regardless of the order given.

    Fix: no longer sorts the caller's `cols` list in place (the argument
    used to be mutated as a side effect).
    '''
    columns = list(zip(*matrix))
    selected = [columns[c] for c in sorted(cols)]
    return list(zip(*selected))
20 |
21 |
def pivot(table, left, top, value):
    """Unnormalize (pivot) a normalised input set of tabular data.

    @param table: simple list of lists or a L{TabularData} object.
    @param left: column (index, or header name) whose values label the rows
        of the result.
    @param top: column (index, or header name) whose values become the
        result's column headings.
    @param value: column (index, or header name) supplying the cell values.

    Eg. To transform the tabular data like

    Name, Year, Value
    -----------------------
    'x', 2004, 1
    'y', 2004, 2
    'x', 2005, 3
    'y', 2005, 4

    into the new list:

    Year, 'x', 'y'
    ------------------------
    2004, 1, 2
    2005, 3, 4

    you would do:

        pivot(tabulardata, 1, 0, 2)

    OR (requires header to exist):

        pivot(tabulardata, 'Year', 'Name', 'Value')
    """
    # resolve header names to column indexes (requires table.header)
    if not isinstance(left, int):
        left = table.header.index(left)
    if not isinstance(top, int):
        top = table.header.index(top)
    if not isinstance(value, int):
        value = table.header.index(value)

    rs = TabularData()
    # construct double dict keyed by left values
    tdict = {}
    xvals = set()   # distinct row labels (left column)
    yvals = set()   # distinct column headings (top column)
    for row in table:
        xval = row[left]
        if not xval in tdict:
            tdict[xval] = {}
        # NB: duplicate (left, top) pairs keep only the last value seen
        tdict[xval][row[top]] = row[value]
        xvals.add(xval)
        yvals.add(row[top])
    xvals = sorted(list(xvals))
    yvals = sorted(list(yvals))
    xhead = 'X'
    if hasattr(table, 'header') and table.header:
        xhead = table.header[left]
    rs.header = [ xhead ] + yvals
    # missing (x, y) combinations are filled with ''
    rs.data = [ [x] + [ tdict[x].get(y, '') for y in yvals ] for x in xvals ]
    return rs
78 |
79 |
--------------------------------------------------------------------------------
/swiss/tabular/tabular_json.py:
--------------------------------------------------------------------------------
1 | '''JSON Reader and Writer'''
2 | try:
3 | import json
4 | except ImportError:
5 | try:
6 | import simplejson as json
7 | except ImportError: # simplejson not installed
8 | pass
9 | from base import TabularData, ReaderBase, WriterBase
10 |
11 |
class JsonReader(ReaderBase):
    '''Read JSON-encoded tabular data.'''
    def read(self, filepath_or_fileobj=None):
        '''Read JSON encoded data from source into a L{TabularData} object.

        JSON encoded data should either be:
          * dict (with header and data attributes)
          * list (first row assumed to be the header)

        @return L{TabularData}
        '''
        super(JsonReader, self).read(filepath_or_fileobj)
        jsondata = json.load(self.fileobj)
        if isinstance(jsondata, dict):
            header = jsondata.get('header', None)
            data = jsondata.get('data', None)
            return TabularData(header=header, data=data)
        if isinstance(jsondata, list):
            return TabularData(header=jsondata[0], data=jsondata[1:])
        raise Exception('Cannot load TabularData from %s' % jsondata)
32 |
class JsonWriter(WriterBase):
    '''Write a TabularData object out as JSON.'''

    def write(self, tabular_data, fileobj, indent=2):
        '''Dump `tabular_data` to `fileobj` as {"header": ..., "data": ...}.'''
        super(JsonWriter, self).write(tabular_data, fileobj)
        payload = {
            u'header': tabular_data.header,
            u'data': tabular_data.data,
        }
        json.dump(payload, fileobj, indent=indent)
41 |
42 |
--------------------------------------------------------------------------------
/swiss/tabular/txt.py:
--------------------------------------------------------------------------------
1 | from base import WriterBase
2 |
class TxtWriter(WriterBase):
    '''Write tabular data to plain text in nicely formatted way

    TODO
    ====

    1. allow output_width of 0 meaning use width necessary to fit all rows on one
    line

    2. rather than truncate cell contents wrap it onto two lines (and/or allow
    spillover if adjacent cell is empty)

      * wontfix: can let terminal do this: just set width very large ...

    3. (?) stream output back rather than returning all at once

    4. Add support for limiting number of columns displayed. DONE 2007-08-02
      * TODO: add unittest
    '''

    def __init__(self, output_width=0, number_of_columns=0, **kwargs):
        '''
        @param output_width: display width (0 means unlimited).
        @param number_of_columns: number of columns to try to display (not
            guaranteed to be this number if this would cause problems). (0
            means all columns)
        '''
        super(TxtWriter, self).__init__(**kwargs)
        self.output_width = output_width
        self.number_of_columns = number_of_columns

    def write(self, tabular_data, fileobj):
        '''Render `tabular_data` as an ascii-art table and write to `fileobj`.'''
        result = ''
        formatter = None   # unused
        row_cache = []     # unused
        sample_length = 4  # number of rows used to estimate column widths
        rows = tabular_data.data
        if tabular_data.header:
            rows = [ tabular_data.header ] + rows
        # include header in sample rows (do we always want to?)
        sample_rows = rows[:sample_length]
        self._compute_parameters(sample_rows)
        result += self._write_separator()
        for row in rows:
            result += self._write_row(row)
        result += self._write_separator()
        fileobj.write(result)

    def _compute_parameters(self, sample_rows):
        # decide how many columns to show (self.numcols) and how wide each
        # should be (self.colwidths), based on the sample rows
        maxcols = self._get_maxcols(sample_rows)
        if not self.number_of_columns:
            self.numcols = maxcols
        else:
            self.numcols = min(self.number_of_columns, maxcols)
        self.colwidths = []
        self._set_colwidths(sample_rows)
        if self.colwidths[0] < 2:
            msg =\
u'''It is not possible to effectively format this many columns of material with
this narrow an output window. Column width is: %s''' % self.colwidths[0]
            # TODO: log it?
            print msg

    def _write_row(self, row):
        '''Return the input 'python' row as an appropriately formatted string.
        '''
        result = '|'
        count = 0
        for cell in row[:self.numcols]:
            width = self.colwidths[count]
            result += self._format_cell(width, cell)
            count += 1
        # now pad out with extra cols as necessary
        while count < self.numcols:
            width = self.colwidths[count]
            result += self._format_cell(width, ' ')
            count += 1
        return result + '\n'

    def _write_separator(self):
        # horizontal rule, e.g. '+----+----+'
        result = '+'
        for width in self.colwidths:
            result += '-' * (width-1) + '+'
        return result + '\n'

    def _get_maxcols(self, sample_rows):
        # widest row (in cell count) among the sample
        maxcols = 0
        for row in sample_rows:
            maxcols = max(maxcols, len(row))
        return maxcols

    def _set_colwidths(self, sample_rows):
        # subtract -1 so that we have (at least) one spare screen column
        if self.output_width != 0:
            # fixed total width: split it evenly across columns
            colwidth = int( (self.output_width - 1) / self.numcols)
            for ii in range(self.numcols):
                self.colwidths.append(colwidth)
        else: # make every col as wide as it needs to be
            self.colwidths = [0] * self.numcols
            for row in sample_rows:
                for ii in range(self.numcols):
                    # NOTE(review): assumes every sample row has at least
                    # numcols cells -- a short row would IndexError here.
                    cellwidth = len(self.value_to_str(row[ii]))
                    self.colwidths[ii] = max(self.colwidths[ii],
                            cellwidth
                            )
            self.colwidths = [ x + 1 for x in self.colwidths ]

    def _format_cell(self, width, content):
        # centre the (truncated) content in width-1 chars, plus a '|'
        content = self.value_to_str(content)
        content = content.strip()
        if len(content) > width - 1:
            # TODO: be brutal (this *has* to be fixed)
            content = content[:width-1]
        return content.center(width-1) + '|'
117 |
118 |
--------------------------------------------------------------------------------
/swiss/tabular/xls.py:
--------------------------------------------------------------------------------
1 | '''Work with Excel (xls) files.
2 |
3 | Requires xlrd
4 | '''
5 | try:
6 | import xlrd
7 | except ImportError: # xlrd not installed
8 | pass
9 |
10 | from base import ReaderBase, TabularData
11 |
12 | class XlsReader(ReaderBase):
13 | '''Read Excel (xls) files.
14 |
15 | Requires the xlrd package (see pypi).
16 | '''
17 | def __init__(self, filepath_or_fileobj=None):
18 | super(XlsReader, self).__init__(filepath_or_fileobj)
19 | if self.fileobj:
20 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read())
21 | ## TODO: fix the rest of this
22 |
23 | def read(self, fileobj=None, sheet_index=0):
24 | '''Read an excel file (provide as fileobj) and return the specified
25 | sheet as a L{TabularData} object.
26 |
27 | For convenience also store:
28 |
29 | self.book: xlrd WorkBook object
30 |
31 | @return L{TabularData} object.
32 | '''
33 | super(XlsReader, self).read(fileobj)
34 | if fileobj:
35 | self.book = xlrd.open_workbook(file_contents=self.fileobj.read())
36 | tab = TabularData()
37 | booksheet = self.book.sheet_by_index(sheet_index)
38 | data = self.extract_sheet(booksheet, self.book)
39 | tab.data = data
40 | return tab
41 |
42 | def info(self):
43 | '''Return summary info about this Excel Workbook.'''
44 | info = ''
45 | info += 'The number of worksheets is: %s\n' % self.book.nsheets
46 | info += 'Worksheet name(s):\n' % self.book.sheet_names()
47 | count = -1
48 | for sn in self.book.sheet_names():
49 | count += 1
50 | info += '%s %s\n' % (count, sn)
51 | return info
52 |
53 | def sheet_info(self, sheet_index):
54 | '''Summary info about an xls sheet.
55 |
56 | @return: printable string giving info.
57 | '''
58 | import pprint
59 | sh = self.book.sheet_by_index(sheet_index)
60 | info = sh.name + '\n'
61 | info += 'Rows: %s Cols: %s\n\n' % (sh.nrows, sh.ncols)
62 | MAX_ROWS = 30
63 | for rx in range(min(sh.nrows, MAX_ROWS)):
64 | info += str(sh.row(rx)) + '\n'
65 | return info
66 |
67 | def extract_sheet(self, sheet, book):
68 | matrix = []
69 | nrows = sheet.nrows
70 | ncols = sheet.ncols
71 | for rx in range(nrows):
72 | outrow = []
73 | for cx in range(ncols):
74 | cell = sheet.cell(rowx=rx, colx=cx)
75 | val = self.cell_to_python(cell, book)
76 | outrow.append(val)
77 | matrix.append(outrow)
78 | return matrix
79 |
80 | def cell_to_python(self, cell, book):
81 | # annoying need book argument for datemode
82 | # info on types: http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Cell-class
83 | if cell.ctype == xlrd.XL_CELL_NUMBER:
84 | return float(cell.value)
85 | elif cell.ctype == xlrd.XL_CELL_DATE:
86 | from datetime import date
87 | # TODO: distinguish date and datetime
88 | args = xlrd.xldate_as_tuple(cell.value, book.datemode)
89 | try:
90 | return date(args[0], args[1], args[2])
91 | except Exception, inst:
92 | print 'Error parsing excel date (%s): %s' % (args, inst)
93 | return None
94 | elif cell.ctype == xlrd.XL_CELL_BOOLEAN:
95 | return bool(cell.value)
96 | else:
97 | return cell.value
98 |
99 |
100 |
--------------------------------------------------------------------------------
/swiss/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # placeholder
2 |
--------------------------------------------------------------------------------
/swiss/tests/data/xls_reader_test.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/tests/data/xls_reader_test.xls
--------------------------------------------------------------------------------
/swiss/tests/parse/test_name.py:
--------------------------------------------------------------------------------
1 | import swiss.parse.name
2 |
3 |
4 | class TestName:
5 | def test_parse_name_FL(self):
6 | name = u'Ludwig Van Beethoven'
7 | out = swiss.parse.name.parse_name(name)
8 | assert out.ln == u'Beethoven'
9 | assert out.fns == ['Ludwig', 'Van']
10 |
11 | def test_parse_name_LF_with_extra_comma(self):
12 | out = swiss.parse.name.parse_name('More, Sir Thomas,Saint')
13 | assert out.ln == 'More', out
14 | assert out.fns == ['Sir', 'Thomas']
15 |
16 | def test_parse_name_FL_normcase(self):
17 | name = u'Ludwig van BEETHOVEN'
18 | out = swiss.parse.name.parse_name(name)
19 | assert out.ln == 'Beethoven', out
20 |
21 | def test_parse_name_LF_with_title(self):
22 | name = u'Chandos, John [Sir]'
23 | out = swiss.parse.name.parse_name(name)
24 | assert out.ln == 'Chandos', out
25 | assert out.title == 'Sir', out
26 |
27 | def test_parse_name_FL_with_title(self):
28 | name = u'Sir John CHANDOS'
29 | out = swiss.parse.name.parse_name(name)
30 | assert out.ln == 'Chandos', out
31 | assert out.title == 'Sir', out
32 |
33 | def test_parse_name_FL_with_title_2(self):
34 | name = u'Prof Benjamin AARON'
35 | out = swiss.parse.name.parse_name(name)
36 | assert out.ln == 'Aaron', out
37 | assert out.title == 'Prof', out
38 | assert out.fns == ['Benjamin'], out
39 | assert str(out) == 'Aaron, Benjamin [Prof]'
40 |
41 | def test_parse_title_with_fullstop(self):
42 | name = 'Major. abc xyz'
43 | out = swiss.parse.name.parse_name(name)
44 | assert out.title == 'Major', out.title
45 |
46 | def test_parse_title_with_fullstop_2(self):
47 | name = 'Xyz, Abc [Major.]'
48 | out = swiss.parse.name.parse_name(name)
49 | print out
50 | assert out.title == 'Major', out.title
51 |
52 | def test_parse_title_with_brackets(self):
53 | name = 'Dickens, Gerald (Sir)'
54 | out = swiss.parse.name.parse_name(name)
55 | assert out.title == 'Sir', out.title
56 |
57 | name = '(Sir) Gerald Dickens'
58 | out = swiss.parse.name.parse_name(name)
59 | assert out.title == 'Sir', out.title
60 |
61 | def test_parse_name_FL_initials(self):
62 | name = 'Chekhov, A.P.'
63 | out = swiss.parse.name.parse_name(name)
64 | assert out.ln == 'Chekhov'
65 | assert out.fns == ['A.', 'P.'], out
66 |
67 | def test_strip_fullstops(self):
68 | name = 'George. Bosch'
69 | out = swiss.parse.name.normalize(name)
70 | assert out == 'Bosch, George'
71 |
72 | name = 'George. a.p. Bosch.'
73 | out = swiss.parse.name.normalize(name)
74 | assert out == 'Bosch, George A. P.', out
75 |
76 | name = 'Geo.rge. Bosch'
77 | out = swiss.parse.name.normalize(name)
78 | assert out == 'Bosch, Geo. Rge', out
79 |
80 | name = 'Geo.Smith. Bosch'
81 | out = swiss.parse.name.normalize(name)
82 | assert out == 'Bosch, Geo. Smith', out
83 |
84 | def test_tostr(self):
85 | name = swiss.parse.name.Name(ln='Beethoven', fns=['Ludwig', 'van'])
86 | exp = u'Beethoven, Ludwig van'
87 | out = swiss.parse.name.name_tostr(name)
88 | assert out == exp, out
89 |
90 | def test_with_no_name(self):
91 | name = swiss.parse.name.parse_name(' ')
92 | assert name.ln is '', name
93 | out = swiss.parse.name.normalize(' ')
94 | assert out == '', out
95 |
96 | def test_surname(self):
97 | name = u'SCHUBERT'
98 | out = str(swiss.parse.name.parse_name(name))
99 | assert out == 'Schubert'
100 |
101 |
--------------------------------------------------------------------------------
/swiss/tests/tabular/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rufuspollock-okfn/datautil/6bcb8795013d305f02467967367ecca15ad351e2/swiss/tests/tabular/__init__.py
--------------------------------------------------------------------------------
/swiss/tests/tabular/test_base.py:
--------------------------------------------------------------------------------
1 | import os
2 | from StringIO import StringIO
3 |
4 | import swiss.tabular
5 |
class TestTabularData:
    '''Construction and list round-tripping of TabularData.'''
    testlist = [ ['X', 'Y'], [1,2], [3,4] ]

    def test_1(self):
        # A freshly constructed instance starts with an empty header.
        empty = swiss.tabular.TabularData()
        assert empty.header == []

    def test_from_list(self):
        # First row becomes the header, the rest become data.
        converted = swiss.tabular.TabularData.from_list(self.testlist)
        assert converted.header == [ 'X', 'Y' ]
        assert converted.data == [ [1,2], [3,4] ]

    def test_to_list(self):
        # to_list prepends the header back onto the data rows.
        tdata = swiss.tabular.TabularData(header=['X', 'Y'], data=[ [1,2], [3,4] ])
        assert tdata.to_list() == self.testlist
25 |
26 |
class TestWriterBase:
    '''WriterBase.value_to_str: unicode conversion and optional rounding.'''

    def test_value_to_str(self):
        # With no rounding configured values are converted verbatim.
        plain = swiss.tabular.WriterBase() # round_ndigits=None
        for value, expected in [('x', u'x'), (1, u'1'), (1.3555, u'1.3555')]:
            got = plain.value_to_str(value)
            assert got == expected, got

        # round_ndigits=2 rounds floats but leaves strings and ints alone.
        rounding = swiss.tabular.WriterBase(round_ndigits=2)
        for value, expected in [('x', u'x'), (1, u'1'), (1.3555, u'1.36')]:
            got = rounding.value_to_str(value)
            assert got == expected, got

        # A negative digit count rounds left of the decimal point.
        rounding.round_ndigits = -1
        got = rounding.value_to_str(102.34)
        assert got == u'100', got
48 |
49 |
class TestReaderCsv(object):
    '''ReaderCsv parses CSV text into a header row plus string data rows.

    Subclasses reuse test_header/test_data and override ``csvdata`` (and,
    where needed, ``setUp``) to cover unicode and explicitly-encoded input.
    '''

    # raw CSV fixture; the space after each comma is expected to be stripped
    # by the reader -- see the ``header`` expectation below.
    csvdata = \
'''"header1", "header 2"
1, 2'''
    header = [ 'header1', 'header 2' ]
    data = [ ['1', '2'] ]

    def setUp(self):
        # Parse the fixture once; the tests inspect the resulting object.
        reader = swiss.tabular.ReaderCsv()
        fileobj = StringIO(self.csvdata)
        self.tab = reader.read(fileobj)

    def test_header(self):
        assert self.header == self.tab.header

    def test_data(self):
        # all cell values come back as strings, not numbers
        assert self.data == self.tab.data
68 |
69 |
class TestReaderCsvUnicode(TestReaderCsv):
    '''Same assertions as TestReaderCsv but with a unicode fixture; the
    reader is expected to hand back utf-8 encoded byte strings.'''
    csvdata = \
u'''"headi\xf1g", "header 2"
1, 2'''
    header = [ u'headi\xf1g'.encode('utf-8'), 'header 2' ]
    data = [ ['1', '2'] ]
76 |
77 |
class TestReaderCsvEncoded(TestReaderCsvUnicode):
    '''As TestReaderCsvUnicode but the raw bytes are utf-16 encoded and the
    encoding is passed explicitly to ReaderCsv.read.'''
    encoding = 'utf-16'
    csvdata = \
u'''"headi\xf1g", "header 2"
1, 2'''.encode(encoding)

    def setUp(self):
        reader = swiss.tabular.ReaderCsv()
        fileobj = StringIO(self.csvdata)
        self.tab = reader.read(fileobj, encoding=self.encoding)
88 |
89 |
class TestCsvWriter:
    '''CsvWriter emits CRLF-terminated CSV with the header row first.'''

    def test_writer(self):
        sink = StringIO()
        tdata = swiss.tabular.TabularData([[1,2],[3,4]], header=['one', 'two'])
        swiss.tabular.CsvWriter().write(tdata, sink)
        sink.seek(0)
        expected = 'one,two\r\n1,2\r\n3,4\r\n'
        assert sink.read() == expected
104 |
105 |
class TestHtmlReader:
    # NOTE(review): the fixture below appears mangled -- it was presumably an
    # HTML <table> whose tags were stripped when this file was exported.
    # Confirm the original markup against upstream before relying on it.

    inraw1 = '''


1 | 2 |


1983 |


3 | 4 |


'''
    in1 = StringIO(inraw1)

    # one list per table row; ragged rows are preserved as-is
    exp1 = [ ['1', '2'],
            ['1983'],
            ['3', '4'],
            ]

    def test_1(self):
        reader = swiss.tabular.HtmlReader()
        tab = reader.read(self.in1)
        assert tab.data == self.exp1
132 |
133 |
class TestHtmlWriter:
    # NOTE(review): the expected-output literals in this class are corrupted
    # in this copy of the file (HTML tags stripped and several original lines
    # missing), so parts below are not valid Python. Restore from upstream
    # before editing; kept byte-identical here.

    def setUp(self):
        rawData = [[1,1], [0,1]]
        self.indata1 = swiss.tabular.TabularData(data=rawData)
        self.writer1 = swiss.tabular.HtmlWriter(table_attributes={'id':1, 'class': 'data'})

    def test_0_simple(self):
        # NOTE(review): the expected HTML string and at least one source line
        # are missing from this copy -- confirm against upstream.
        indata1 = [[1,1], [0,1]]
        expected = ''
        out1 = self.writer1.write_str(self.indata1)
        assert expected == out1

    def test_col_headings(self):
        self.indata1.header = [u'x','y']
        caption = ''
        expected = 'x | y |
'+\
                '1 | 1 |
0 | ' + \
                '1 |
'
        # no caption but headings
        out1 = self.writer1.write_str(self.indata1, caption)
        assert expected == out1

    def test_row_headings(self):
        self.indata1.header = ['x','y']
        rowHeadings = ['Date 1', 'Date 2']
        caption = ''
        expected = ' | x | ' + \
                'y |
---|
Date 1 | 1 | ' + \
                '1 |
---|
Date 2 | 0 | 1 |
' + \
                '
'
        # no caption but headings
        out1 = self.writer1.write_str(self.indata1, caption, rowHeadings)
        assert expected == out1

    def test_escaping(self):
        # NOTE(review): the body of this test (several lines) is missing from
        # this copy of the file.
        tdata = swiss.tabular.TabularData(header=['s&p', 'y0 | 1 | '
        # print self.writer1.prettyPrint(in1)
181 |
182 |
class TestLatexWriter:
    '''LatexWriter escaping and tabular rendering.'''

    matrix = [[ 'H1', 'H2'],
            [1,'2%'],
            [3,4],
            ]

    exp = \
r'''\textbf{H1} & \textbf{H2} \\
\hline
1 & 2\% \\
\hline
3 & 4 \\
\hline
'''
    m2l = swiss.tabular.LatexWriter()

    def test_escape(self):
        # & and % get backslash-escaped; $ is passed through untouched.
        raw = '& % $ something'
        escaped = self.m2l.escape(raw)
        assert escaped == r'\& \% $ something'

    def test_table2latex(self):
        rendered = swiss.tabular.table2latex(self.matrix)
        self.diff(self.exp, rendered)
        assert rendered == self.exp

    def test_write(self):
        tdata = swiss.tabular.TabularData(data=self.matrix[1:], header=self.matrix[0])
        rendered = self.m2l.write_str(tdata)
        self.diff(self.exp, rendered)
        assert rendered == self.exp

    def diff(self, str1, str2):
        # Dump a line-level diff of the two strings to aid debugging.
        import difflib
        from pprint import pprint
        comparison = difflib.Differ().compare(str1.splitlines(1), str2.splitlines(1))
        pprint(list(comparison))
224 |
225 |
226 |
--------------------------------------------------------------------------------
/swiss/tests/tabular/test_gdocs.py:
--------------------------------------------------------------------------------
1 | import os
2 | from ConfigParser import SafeConfigParser
3 |
4 | import swiss.tabular.gdocs as gdocs
5 |
6 |
# Load gdocs credentials from a local test.ini; these tests cannot run
# without it.
cfg = SafeConfigParser()
if not os.path.exists('test.ini'):
    msg = 'To run these tests you need a config file. See this file for details'
    raise Exception(msg)
# Bug fix: close the config file instead of leaking the handle (the original
# passed an anonymous open() straight into readfp).
inifile = open('test.ini')
try:
    cfg.readfp(inifile)
finally:
    inifile.close()
username = cfg.get('gdocs', 'username')
password = cfg.get('gdocs', 'password')
14 |
15 |
class TestGDocsTextDb:
    '''Read a known test spreadsheet through GDocsReaderTextDb.'''

    def test_01(self):
        sheet_name = 'okfn-swiss-gdocs-testing'
        db_reader = gdocs.GDocsReaderTextDb(sheet_name, username, password, id_is_name=True)
        result = db_reader.read()
        assert result.header == ['col1', 'col2']
        assert len(result.data) == 5, result
23 |
24 |
# not working properly yet
class _TestGDocs:
    '''Spreadsheet-API reader tests (disabled via the leading underscore).'''

    def test_01(self):
        spreadsheet_key = 't8GZy4Lb6jhVjCL5nrqZ5TQ'
        sheet_reader = gdocs.GDocsReaderSpreadsheet(spreadsheet_key, username, password)
        result = sheet_reader.read()
        assert len(result.data) == 6, result

    def test_02_id_is_name(self):
        spreadsheet_name = 'okfn-swiss-gdocs-testing'
        sheet_reader = gdocs.GDocsReaderSpreadsheet(spreadsheet_name, username, password, id_is_name=True)
        result = sheet_reader.read()
        assert len(result.data) == 6, result
38 |
39 |
40 |
--------------------------------------------------------------------------------
/swiss/tests/tabular/test_json.py:
--------------------------------------------------------------------------------
1 | from StringIO import StringIO
2 | import swiss.tabular.tabular_json as js
3 |
class TestJson:
    '''JsonReader/JsonWriter handle dict-shaped and list-shaped JSON tables.'''

    # dict form: explicit header/data keys
    in1 = { 'header': [u'a', u'b'],
            'data': [[1,2], [3,4]]
            }
    # list form: header row followed by the data rows
    in2 = [ in1['header'] ] + in1['data']
    in1sio = StringIO(js.json.dumps(in1))
    in1sio.seek(0)
    in2sio = StringIO(js.json.dumps(in2))
    in2sio.seek(0)

    def test_JsonReader(self):
        parser = js.JsonReader()
        # dict-shaped input
        result = parser.read(self.in1sio)
        assert result.header == self.in1['header']
        assert result.data == self.in1['data']
        # list-shaped input parses to the same tabular data
        result = parser.read(self.in2sio)
        assert result.header == self.in1['header']
        assert result.data == self.in1['data']

    def test_JsonWriter(self):
        serializer = js.JsonWriter()
        tdata = js.TabularData(header=self.in1['header'], data=self.in1['data'])
        serialized = serializer.write_str(tdata)
        assert js.json.loads(serialized) == self.in1
29 |
30 |
--------------------------------------------------------------------------------
/swiss/tests/tabular/test_misc.py:
--------------------------------------------------------------------------------
1 | import swiss.tabular
2 |
class TestTranspose:
    '''transpose swaps rows and columns, yielding tuples.'''

    def test_1(self):
        rows = [
            [ 0, 1 ],
            [ 1, 0 ],
        ]
        expected = [
            ( 0, 1 ),
            ( 1, 0 ),
        ]
        result = swiss.tabular.transpose(rows)
        assert result == expected, result
16 |
class TestPivot:
    '''pivot() accepts column indices or column names, and plain row lists.'''

    td = swiss.tabular.TabularData(
        header=['Name','Year','Value'],
        data=[
            ['x',2004,1],
            ['y',2004,2],
            ['y',2005,4],
            ['x',2005,3],
            ],
        )

    def test_pivot_with_tabular(self):
        # columns given by index
        result = swiss.tabular.pivot(self.td, 1, 0, 2)
        assert result.data[0] == [2004, 1, 2]
        assert result.data[-1] == [2005, 3, 4]

    def test_pivot_with_tabular_2(self):
        # columns given by header name
        result = swiss.tabular.pivot(self.td, 'Year', 'Name', 'Value')
        assert result.data[0] == [2004, 1, 2]

    def test_pivot_simple_list(self):
        # a bare list of rows works too
        result = swiss.tabular.pivot(self.td.data, 1, 0, 2)
        assert result.data[0] == [2004, 1, 2]
40 |
41 |
--------------------------------------------------------------------------------
/swiss/tests/tabular/test_txt.py:
--------------------------------------------------------------------------------
1 | import StringIO
2 |
3 | from swiss.tabular.txt import *
4 | from swiss.tabular import TabularData, CsvReader
5 |
class TestFormatting:
    '''TxtWriter column-layout parameters computed from ragged sample rows.'''

    sample_rows = [
        ['1', '2', 'head blah', 'blah blah blah'],
        ['a', 'b', 'c', 'd', 'e', 'g' ],
        ['1', '2', 'annakarenina annakarenina annakarenina'],
    ]
    output_width = 60

    writer = TxtWriter(output_width=output_width)
    writer._compute_parameters(sample_rows)

    def test_1(self):
        # column count is taken from the widest sample row
        assert self.writer.numcols == 6

    def test_colwidths(self):
        exp = int ((self.output_width -1) / 6)
        assert self.writer.colwidths[0] == exp

    def test__write_1(self):
        out = self.writer._write_row(self.sample_rows[0])
        assert len(out) <= self.output_width

    def test__write_2(self):
        out = self.writer._write_row(self.sample_rows[0])
        exp = '| 1 | 2 |head bla|blah bla| | |\n'
        assert out == exp

    def test__write_separator(self):
        out = self.writer._write_separator()
        exp = '+--------+--------+--------+--------+--------+--------+\n'
        # Bug fix: the expected value was computed but never asserted, so
        # this test could not fail.
        assert out == exp, out
37 |
38 |
39 |
class TestTxtWriter:
    '''End-to-end rendering of TabularData as an ASCII grid.'''

    # CSV fixture including blank and ragged rows; whitespace here is
    # significant to the expected output below.
    sample = \
'''"YEAR","PH","RPH","RPH_1","LN_RPH","LN_RPH_1","HH","LN_HH"
1971,7.852361625,43.9168370988587,42.9594500501036,3.78229777955476,3.76025664867788,16185,9.69184016636035
,,abc,
1972,10.504714885,55.1134791192682,43.9168370988587,4.00939431635556,3.78229777955476,16397,9.70485367024987
, ,, '''

    # NOTE(review): this literal may have lost internal spacing in this copy
    # of the file (cells look narrower than the separators) -- confirm
    # against upstream before relying on it.
    expected = \
'''+------+------+------+------+------+------+------+------+
| YEAR | PH | RPH |RPH_1 |LN_RPH|LN_RPH| HH |LN_HH |
+------+------+------+------+------+------+------+------+
| 1971 |7.8523|43.916|42.959|3.7822|3.7602|16185 |9.6918|
+------+------+------+------+------+------+------+------+
| | | abc | | | | | |
+------+------+------+------+------+------+------+------+
| 1972 |10.504|55.113|43.916|4.0093|3.7822|16397 |9.7048|
+------+------+------+------+------+------+------+------+
| | | | | | | | |
+------+------+------+------+------+------+------+------+
'''

    def test_simple(self):
        # Default writer sizes columns to fit the data.
        indata = TabularData(data=[range(5),range(5,10)])
        writer = TxtWriter()
        out = writer.write_str(indata)
        exp = '''+-+-+-+-+-+
|0|1|2|3|4|
+-+-+-+-+-+
|5|6|7|8|9|
+-+-+-+-+-+
'''
        print out
        print exp
        assert out == exp

    def test_output_width(self):
        # The rendered grid is capped at the requested total width.
        indata = TabularData(data=[range(5),range(5,10)])
        writer = TxtWriter(output_width=16)
        out = writer.write_str(indata)
        outlen = len(out.splitlines()[0])
        assert outlen == 16, outlen

    def test_using_csv(self):
        # CSV fixture -> CsvReader -> TxtWriter round trip.
        fileobj = StringIO.StringIO(self.sample)
        in_tdata = CsvReader(fileobj).read()
        writer = TxtWriter(output_width=60)
        out = writer.write_str(in_tdata)
        print out
        print self.expected
        assert self.expected == out, out
91 |
92 |
--------------------------------------------------------------------------------
/swiss/tests/test_cache.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import shutil
3 | import os
4 |
5 | from swiss.cache import Cache
6 |
class TestCache:
    '''Tests for swiss.cache.Cache against a file:// url in a temp dir.'''

    @classmethod
    def setup_class(self):
        self.tmp = tempfile.mkdtemp()
        self.path = os.path.join(self.tmp, 'abc.txt')
        # Bug fix: close the file handle instead of leaking it (the original
        # relied on CPython refcounting to flush and close).
        fo = open(self.path, 'w')
        try:
            fo.write('abc')
        finally:
            fo.close()
        self.url = 'file://%s' % self.path

    @classmethod
    def teardown_class(self):
        shutil.rmtree(self.tmp)

    def test_basename(self):
        # basename extracts the last path segment of the url ...
        base = 'http://www.abc.org/'
        in1 = base + 'xyz'
        out = Cache.basename(in1)
        assert out == 'xyz'

        in2 = base + 'xyz/abc.txt'
        out = Cache.basename(in2)
        assert out == 'abc.txt'

        # ... keeping query strings but escaping '/' inside them.
        in3 = base + 'membersDo?body=ABC'
        out = Cache.basename(in3)
        assert out == 'membersDo?body=ABC', out

        in3 = base + 'membersDo?body=data/ABC'
        out = Cache.basename(in3)
        assert out == 'membersDo?body=data%47ABC', out

    def test_filepath(self):
        r = Cache()
        base = 'http://www.abc.org/'
        in1 = base + 'xyz'
        out = r.filepath(in1)
        # ./xyz
        assert out.endswith('xyz'), out

    def test_dl(self):
        dest = os.path.join(self.tmp, 'out.txt')
        Cache.dl(self.url, dest)
        assert os.path.exists(dest)
        # Bug fix: close the file after reading instead of leaking it.
        fo = open(dest)
        try:
            assert fo.read() == 'abc'
        finally:
            fo.close()

    def test_cache(self):
        cache = os.path.join(self.tmp, 'cache')
        r = Cache(cache)
        r.retrieve(self.url)
        assert os.path.exists(os.path.join(cache, 'abc.txt'))
56 |
57 |
--------------------------------------------------------------------------------
/swiss/tests/test_date.py:
--------------------------------------------------------------------------------
1 | from swiss.date import *
2 |
3 | import datetime
4 |
class TestPythonStringOrdering(object):
    '''Demonstrate that no string format sorts signed numbers correctly:
    in string ordering, X < Y does not imply -X < -Y.'''

    def test_ordering(self):
        # Pairs where lexicographic comparison happens to match intuition.
        ordered_pairs = [
            ('0', '1'),
            ('-10', '10'),
            ('-', '@'),
            ('-', '0'),
            ('-100', '-X10'),
            ('10', '1000'),
            ('02000', '10000'),
            (' 2000', '10000'),
        ]
        for smaller, larger in ordered_pairs:
            assert smaller < larger

    def test_bad_ordering(self):
        # Cases where lexicographic order contradicts numeric order.
        assert ' ' < '0'
        assert ' ' < '-'
        assert not '-' < '+'
        assert '-100' > '-10'
        for lhs, rhs in [('-100', '-010'), ('-100', '- 10'), ('-100', ' -10')]:
            assert not lhs < rhs
        assert '10000' < '2000'
        assert not '-10' < ' 1'
29 |
30 |
class TestFlexiDate(object):
    '''Construction, str() round-tripping and numeric conversion of FlexiDate.'''

    def test_init(self):
        # Default construction leaves all the fields empty.
        blank = FlexiDate()
        assert blank.year == '', blank
        assert blank.month == '', blank

        # Components are normalised to zero-padded strings.
        d = FlexiDate(2000, 1,1)
        assert d.month == '01', d
        assert d.day== '01', d

    def test_str(self):
        d = FlexiDate(2000, 1, 23)
        assert str(d) == '2000-01-23', '"%s"' % d
        d = FlexiDate(-2000, 1, 23)
        assert str(d) == '-2000-01-23'
        d = FlexiDate(2000)
        assert str(d) == '2000'
        # a qualifier is appended in square brackets
        d = FlexiDate(1760, qualifier='fl.')
        assert str(d) == '1760 [fl.]', d

        d = FlexiDate(qualifier='anything')
        assert str(d) == ' [anything]'

    def test_from_str(self):
        # Parsing str(d) must reproduce str(d) exactly.
        def roundtrip(d):
            reparsed = FlexiDate.from_str(str(d))
            assert str(reparsed) == str(d)

        roundtrip(FlexiDate(2000, 1, 23))
        roundtrip(FlexiDate(1760, qualifier='fl.'))
        roundtrip(FlexiDate(-1760, 1, 3, qualifier='fl.'))

    def test_as_float(self):
        d = FlexiDate(2000)
        assert d.as_float() == float(2000), d.as_float()
        # month/day contribute fractional parts
        d = FlexiDate(1760, 1, 2)
        expected = 1760 + 1/12.0 + 2/365.0
        assert d.as_float() == expected, d.as_float()
        d = FlexiDate(-1000)
        assert d.as_float() == float(-1000)

    def test_as_datetime(self):
        # Missing month/day default to January 1st.
        got = FlexiDate(2000).as_datetime()
        assert got == datetime.datetime(2000, 1, 1), got
        got = FlexiDate(1760, 1, 2).as_datetime()
        assert got == datetime.datetime(1760,1,2), got
83 |
84 |
class TestDateParsers(object):
    '''Behaviour of PythonDateParser, DateutilDateParser and the parse() helper.'''

    def test_using_datetime(self):
        p = PythonDateParser()

        got = p.parse(datetime.date(2000, 1, 23))
        assert got.year == '2000'

        # a datetime's time-of-day is dropped
        got = p.parse(datetime.datetime(2000, 1, 23))
        # assert str(got) == '2000-01-23T00:00:00', got
        assert str(got) == '2000-01-23', got

    def test_using_dateutil(self):
        p = DateutilDateParser()

        # (input text, expected str()) pairs, including AD/BC suffix handling
        cases = [
            ('2001-02', '2001-02'),
            ('March 1762', '1762-03'),
            ('1768 AD', '1768'),
            ('1768 A.D.', '1768'),
            ('-1850', '-1850'),
            ('1762 BC', '-1762'),
            ('4 BC', '-0004'),
            ('4 B.C.', '-0004'),
        ]
        for text, expected in cases:
            got = p.parse(text)
            assert str(got) == expected, got

    def test_parse(self):
        got = parse(datetime.datetime(2000, 1, 23))
        assert got.year == '2000'

        got = parse('March 1762')
        assert str(got) == '1762-03'

        # plain integers are accepted as years
        got = parse(1966)
        assert str(got) == '1966'

        got = parse('22/07/2010')
        assert got.month == '07', got.month

    def test_parse_ambiguous_day_month(self):
        # day-first interpretation for ambiguous dd/mm input
        got = parse('05/07/2010')
        assert got.month == '07', got.month
        assert got.day == '05', got.month

    def test_parse_with_none(self):
        assert parse(None) is None

    def test_parse_wildcards(self):
        got = parse('198?')
        assert got.year == '', got.year # expect this to not parse
        # TODO but we should have a float if possible
        # assert got.as_float() == u'1980', got.as_float()

    def test_parse_with_qualifiers(self):
        # trailing '?' and 'c.'/'circa' prefixes are recorded as qualifiers
        got = parse('1985?')
        assert got.year == u'1985', got
        assert got.qualifier == u'Uncertainty : 1985?', got.qualifier

        got = parse('c.1780')
        assert got.year == u'1780', got
        assert got.qualifier == u"Note 'circa' : c.1780", got

        got = parse('c. 1780')
        assert got.year == u'1780', got
        assert got.qualifier.startswith(u"Note 'circa'"), got

    def test_ambiguous(self):
        # TODO: have to be careful here ...
        parse('1068/1069')

    def test_small_years(self):
        got = parse('23')
        assert str(got) == '0023', got
        assert got.as_float() == 23, got.as_float()

    def test_small_years_with_zeros(self):
        got = parse('0023')
        assert str(got) == '0023', got
        assert got.as_float() == 23, got.as_float()

    def test_years_with_alpha_prefix(self):
        got = parse("p1980")
        assert str(got) == "1980", got
199 |
200 |
--------------------------------------------------------------------------------
/swiss/tests/test_id.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | import swiss.id
4 |
def test_compress_and_uncompress_uuid():
    '''compress_uuid shrinks a hex UUID to 22 chars; uncompress_uuid inverts it.'''
    hexversion = '86c3f19d-8854-4ef5-8d88-f008e0817871'

    compressed = swiss.id.compress_uuid(hexversion)
    assert len(compressed) == 22

    assert swiss.id.uncompress_uuid(compressed) == hexversion

    # unicode input round-trips too
    assert swiss.id.uncompress_uuid(unicode(compressed)) == hexversion

    # a uuid.UUID instance is accepted as well as a hex string
    compressed = swiss.id.compress_uuid(uuid.UUID(hexversion))
    assert len(compressed) == 22
21 |
22 |
def test_int_to_b32():
    '''int_to_b32 yields a 7-char string that b32_to_int inverts.'''
    def roundtrip(value):
        encoded = swiss.id.int_to_b32(value)
        assert isinstance(encoded, basestring)
        assert len(encoded) == 7, encoded
        assert swiss.id.b32_to_int(encoded) == value, (value, encoded)

    for value in (1, 2**28 + 1, 2**30 - 1):
        roundtrip(value)
35 |
36 |
--------------------------------------------------------------------------------
/swiss/tests/test_misc.py:
--------------------------------------------------------------------------------
1 | from swiss.misc import *
2 |
class TestFloatify:
    '''floatify converts numeric strings (with thousands separators) to
    floats; floatify_matrix maps it over rows, leaving non-numbers alone.'''

    def test_floatify_1(self):
        x = '10'
        assert floatify(x) == 10.0

    def test_floatify_2(self):
        # thousands separators are handled
        x = '1,030'
        assert floatify(x) == 1030.0

    # Bug fix: this method was also named test_floatify_2, which shadowed
    # the previous test so it never ran.
    def test_floatify_3(self):
        # non-numeric input yields None rather than raising
        x = ''
        out = floatify(x)
        assert out == None, out
        x = '#'
        out = floatify(x)
        assert out == None, out

    def test_floatify_matrix(self):
        x = [
            ['1', '2'],
            ['abc', '3.0']
            ]
        exp = [
            [1.0, 2.0],
            ['abc', 3.0]
            ]
        out = floatify_matrix(x)
        assert out == exp
31 |
32 |
class TestMakeSeries:
    '''make_series builds (x, y) pair lists per y-column, skipping blanks.'''

    def test_make_series(self):
        rows = [ [ '1980', '100', '50' ],
                 [ '1981', '101', '51' ],
                 [ '1982', '102', '' ],
                 ]
        # the second series is shorter because the '' cell is dropped
        expected = [
            [ (1980.0, 100.0), (1981.0, 101.0), (1982.0, 102.0) ],
            [ (1980.0, 50.0), (1981.0, 51.0) ]
        ]
        result = make_series(rows, xcol=0, ycols=[1,2])
        assert result == expected, result
46 |
47 |
--------------------------------------------------------------------------------
/swiss/tests/test_xls.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 |
3 | import swiss.tabular
4 |
class TestXlsReader:
    '''Read the bundled xls fixture and spot-check two cells.'''

    def test_stuff(self):
        stream = pkg_resources.resource_stream(
            'swiss', 'tests/data/xls_reader_test.xls')
        tdata = swiss.tabular.XlsReader(stream).read()
        assert tdata.data[0][0] == 1850
        assert tdata.data[19][1] == 12.3
14 |
15 |
--------------------------------------------------------------------------------