5 | #
6 | # MIT Licence. See http://opensource.org/licenses/MIT
7 | #
8 | # Created on 2016-05-21
9 | #
10 |
11 | """CLI command for I Sheet You Not"""
12 |
13 | from __future__ import print_function, unicode_literals, absolute_import
14 |
15 | from isheetyounot.cli import main
16 | from isheetyounot.aw3 import rescue
17 |
if __name__ == '__main__':
    # Run main() through rescue(), which wraps the call so failures are
    # reported by the workflow helper instead of a bare traceback.
    # NOTE(review): exact rescue() semantics live in isheetyounot.aw3 -- confirm.
    rescue(main)
20 |
--------------------------------------------------------------------------------
/src/xlrd/__init__.py:
--------------------------------------------------------------------------------
1 | from os import path
2 |
3 | from .info import __VERSION__
4 |
5 | # Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
6 | # This module is part of the xlrd package, which is released under a
7 | # BSD-style licence.
8 |
9 | from . import licences
10 |
11 | ##
12 | # A Python module for extracting data from MS Excel (TM) spreadsheet files.
13 | #
14 | # Version 0.7.4 -- April 2012
15 | #
16 | #
17 | # General information
18 | #
19 | # Acknowledgements
20 | #
21 | #
22 | # Development of this module would not have been possible without the document
23 | # "OpenOffice.org's Documentation of the Microsoft Excel File Format"
24 | # ("OOo docs" for short).
25 | # The latest version is available from OpenOffice.org in
26 | # PDF format
27 | # and
28 | # ODT format.
29 | # Small portions of the OOo docs are reproduced in this
30 | # document. A study of the OOo docs is recommended for those who wish a
31 | # deeper understanding of the Excel file layout than the xlrd docs can provide.
32 | #
33 | #
34 | # Backporting to Python 2.1 was partially funded by
35 | #
36 | # Journyx - provider of timesheet and project accounting solutions.
37 | #
38 | #
39 | #
40 | # Provision of formatting information in version 0.6.1 was funded by
41 | #
42 | # Simplistix Ltd.
43 | #
44 | #
45 | #
46 | # Unicode
47 | #
48 | # This module presents all text strings as Python unicode objects.
49 | # From Excel 97 onwards, text in Excel spreadsheets has been stored as Unicode.
50 | # Older files (Excel 95 and earlier) don't keep strings in Unicode;
51 | # a CODEPAGE record provides a codepage number (for example, 1252) which is
52 | # used by xlrd to derive the encoding (for same example: "cp1252") which is
53 | # used to translate to Unicode.
54 | #
55 | # If the CODEPAGE record is missing (possible if the file was created
56 | # by third-party software), xlrd will assume that the encoding is ascii, and keep going.
57 | # If the actual encoding is not ascii, a UnicodeDecodeError exception will be raised and
58 | # you will need to determine the encoding yourself, and tell xlrd:
59 | #
60 | # book = xlrd.open_workbook(..., encoding_override="cp1252")
61 | #
62 | # If the CODEPAGE record exists but is wrong (for example, the codepage
63 | # number is 1251, but the strings are actually encoded in koi8_r),
64 | # it can be overridden using the same mechanism.
65 | # The supplied runxlrd.py has a corresponding command-line argument, which
66 | # may be used for experimentation:
67 | #
68 | # runxlrd.py -e koi8_r 3rows myfile.xls
69 | #
70 | # The first place to look for an encoding ("codec name") is
71 | #
72 | # the Python documentation.
73 | #
74 | #
75 | #
76 | # Dates in Excel spreadsheets
77 | #
78 | # In reality, there are no such things. What you have are floating point
79 | # numbers and pious hope.
80 | # There are several problems with Excel dates:
81 | #
82 | # (1) Dates are not stored as a separate data type; they are stored as
83 | # floating point numbers and you have to rely on
84 | # (a) the "number format" applied to them in Excel and/or
85 | # (b) knowing which cells are supposed to have dates in them.
86 | # This module helps with (a) by inspecting the
87 | # format that has been applied to each number cell;
88 | # if it appears to be a date format, the cell
89 | # is classified as a date rather than a number. Feedback on this feature,
90 | # especially from non-English-speaking locales, would be appreciated.
91 | #
92 | # (2) Excel for Windows stores dates by default as the number of
93 | # days (or fraction thereof) since 1899-12-31T00:00:00. Excel for
94 | # Macintosh uses a default start date of 1904-01-01T00:00:00. The date
95 | # system can be changed in Excel on a per-workbook basis (for example:
96 | # Tools -> Options -> Calculation, tick the "1904 date system" box).
97 | # This is of course a bad idea if there are already dates in the
98 | # workbook. There is no good reason to change it even if there are no
99 | # dates in the workbook. Which date system is in use is recorded in the
100 | # workbook. A workbook transported from Windows to Macintosh (or vice
101 | # versa) will work correctly with the host Excel. When using this
102 | # module's xldate_as_tuple function to convert numbers from a workbook,
103 | # you must use the datemode attribute of the Book object. If you guess,
104 | # or make a judgement depending on where you believe the workbook was
105 | # created, you run the risk of being 1462 days out of kilter.
106 | #
107 | # Reference:
108 | # http://support.microsoft.com/default.aspx?scid=KB;EN-US;q180162
109 | #
110 | #
111 | # (3) The Excel implementation of the Windows-default 1900-based date system works on the
112 | # incorrect premise that 1900 was a leap year. It interprets the number 60 as meaning 1900-02-29,
113 | # which is not a valid date. Consequently any number less than 61 is ambiguous. Example: is 59 the
114 | # result of 1900-02-28 entered directly, or is it 1900-03-01 minus 2 days? The OpenOffice.org Calc
115 | # program "corrects" the Microsoft problem; entering 1900-02-27 causes the number 59 to be stored.
116 | # Save as an XLS file, then open the file with Excel -- you'll see 1900-02-28 displayed.
117 | #
118 | # Reference: http://support.microsoft.com/default.aspx?scid=kb;en-us;214326
119 | #
120 | # (4) The Macintosh-default 1904-based date system counts 1904-01-02 as day 1 and 1904-01-01 as day zero.
121 | # Thus any number such that (0.0 <= number < 1.0) is ambiguous. Is 0.625 a time of day (15:00:00),
122 | # independent of the calendar,
123 | # or should it be interpreted as an instant on a particular day (1904-01-01T15:00:00)?
124 | # The xldate_* functions in this module
125 | # take the view that such a number is a calendar-independent time of day (like Python's datetime.time type) for both
126 | # date systems. This is consistent with more recent Microsoft documentation
127 | # (for example, the help file for Excel 2002 which says that the first day
128 | # in the 1904 date system is 1904-01-02).
129 | #
130 | #
# (5) Usage of the Excel DATE() function may leave strange dates in a spreadsheet. Quoting the help file,
131 | # in respect of the 1900 date system: "If year is between 0 (zero) and 1899 (inclusive),
132 | # Excel adds that value to 1900 to calculate the year. For example, DATE(108,1,2) returns January 2, 2008 (1900+108)."
133 | # This gimmick, semi-defensible only for arguments up to 99 and only in the pre-Y2K-awareness era,
134 | # means that DATE(1899, 12, 31) is interpreted as 3799-12-31.
135 | #
136 | # For further information, please refer to the documentation for the xldate_* functions.
137 | #
138 | # Named references, constants, formulas, and macros
139 | #
140 | #
141 | # A name is used to refer to a cell, a group of cells, a constant
142 | # value, a formula, or a macro. Usually the scope of a name is global
143 | # across the whole workbook. However it can be local to a worksheet.
144 | # For example, if the sales figures are in different cells in
145 | # different sheets, the user may define the name "Sales" in each
146 | # sheet. There are built-in names, like "Print_Area" and
147 | # "Print_Titles"; these two are naturally local to a sheet.
148 | #
149 | # To inspect the names with a user interface like MS Excel, OOo Calc,
150 | # or Gnumeric, click on Insert/Names/Define. This will show the global
151 | # names, plus those local to the currently selected sheet.
152 | #
153 | # A Book object provides two dictionaries (name_map and
154 | # name_and_scope_map) and a list (name_obj_list) which allow various
155 | # ways of accessing the Name objects. There is one Name object for
156 | # each NAME record found in the workbook. Name objects have many
157 | # attributes, several of which are relevant only when obj.macro is 1.
158 | #
159 | # In the examples directory you will find namesdemo.xls which
160 | # showcases the many different ways that names can be used, and
161 | # xlrdnamesAPIdemo.py which offers 3 different queries for inspecting
162 | # the names in your files, and shows how to extract whatever a name is
163 | # referring to. There is currently one "convenience method",
164 | # Name.cell(), which extracts the value in the case where the name
165 | # refers to a single cell. More convenience methods are planned. The
166 | # source code for Name.cell (in __init__.py) is an extra source of
167 | # information on how the Name attributes hang together.
168 | #
169 | #
170 | # Name information is not extracted from files older than
171 | # Excel 5.0 (Book.biff_version < 50)
172 | #
173 | # Formatting
174 | #
175 | # Introduction
176 | #
177 | # This collection of features, new in xlrd version 0.6.1, is intended
178 | # to provide the information needed to (1) display/render spreadsheet contents
179 | # (say) on a screen or in a PDF file, and (2) copy spreadsheet data to another
180 | # file without losing the ability to display/render it.
181 | #
182 | # The Palette; Colour Indexes
183 | #
184 | # A colour is represented in Excel as a (red, green, blue) ("RGB") tuple
185 | # with each component in range(256). However it is not possible to access an
186 | # unlimited number of colours; each spreadsheet is limited to a palette of 64 different
187 | # colours (24 in Excel 3.0 and 4.0, 8 in Excel 2.0). Colours are referenced by an index
188 | # ("colour index") into this palette.
189 | #
190 | # Colour indexes 0 to 7 represent 8 fixed built-in colours: black, white, red, green, blue,
191 | # yellow, magenta, and cyan.
192 | #
193 | # The remaining colours in the palette (8 to 63 in Excel 5.0 and later)
194 | # can be changed by the user. In the Excel 2003 UI, Tools/Options/Color presents a palette
195 | # of 7 rows of 8 colours. The last two rows are reserved for use in charts.
196 | # The correspondence between this grid and the assigned
197 | # colour indexes is NOT left-to-right top-to-bottom.
198 | # Indexes 8 to 15 correspond to changeable
199 | # parallels of the 8 fixed colours -- for example, index 7 is forever cyan;
200 | # index 15 starts off being cyan but can be changed by the user.
201 | #
202 | # The default colour for each index depends on the file version; tables of the defaults
203 | # are available in the source code. If the user changes one or more colours,
204 | # a PALETTE record appears in the XLS file -- it gives the RGB values for *all* changeable
205 | # indexes.
206 | # Note that colours can be used in "number formats": "[CYAN]...." and "[COLOR8]...." refer
207 | # to colour index 7; "[COLOR16]...." will produce cyan
208 | # unless the user changes colour index 15 to something else.
209 | #
210 | #
# In addition, there are several "magic" colour indexes used by Excel:
211 | # 0x18 (BIFF3-BIFF4), 0x40 (BIFF5-BIFF8): System window text colour for border lines
212 | # (used in XF, CF, and WINDOW2 records)
213 | # 0x19 (BIFF3-BIFF4), 0x41 (BIFF5-BIFF8): System window background colour for pattern background
214 | # (used in XF and CF records )
215 | # 0x43: System face colour (dialogue background colour)
216 | # 0x4D: System window text colour for chart border lines
217 | # 0x4E: System window background colour for chart areas
218 | # 0x4F: Automatic colour for chart border lines (seems to be always Black)
219 | # 0x50: System ToolTip background colour (used in note objects)
220 | # 0x51: System ToolTip text colour (used in note objects)
221 | # 0x7FFF: System window text colour for fonts (used in FONT and CF records)
222 | # Note 0x7FFF appears to be the *default* colour index. It appears quite often in FONT
223 | # records.
224 | #
225 | #
# Default Formatting
226 | #
227 | # Default formatting is applied to all empty cells (those not described by a cell record).
228 | # Firstly row default information (ROW record, Rowinfo class) is used if available.
229 | # Failing that, column default information (COLINFO record, Colinfo class) is used if available.
230 | # As a last resort the worksheet/workbook default cell format will be used; this
231 | # should always be present in an Excel file,
232 | # described by the XF record with the fixed index 15 (0-based). By default, it uses the
233 | # worksheet/workbook default cell style, described by the very first XF record (index 0).
234 | #
235 | # Formatting features not included in xlrd version 0.6.1
236 | #
237 | # - Rich text i.e. strings containing partial bold italic
238 | # and underlined text, change of font inside a string, etc.
239 | # See OOo docs s3.4 and s3.2.
240 | # Rich text is included in version 0.7.2
241 | # - Asian phonetic text (known as "ruby"), used for Japanese furigana. See OOo docs
242 | # s3.4.2 (p15)
243 | # - Conditional formatting. See OOo docs
244 | # s5.12, s6.21 (CONDFMT record), s6.16 (CF record)
245 | # - Miscellaneous sheet-level and book-level items e.g. printing layout, screen panes.
246 | # - Modern Excel file versions don't keep most of the built-in
247 | # "number formats" in the file; Excel loads formats according to the
248 | # user's locale. Currently xlrd's emulation of this is limited to
249 | # a hard-wired table that applies to the US English locale. This may mean
250 | # that currency symbols, date order, thousands separator, decimals separator, etc
251 | # are inappropriate. Note that this does not affect users who are copying XLS
252 | # files, only those who are visually rendering cells.
253 | #
254 | #
255 | # Loading worksheets on demand
256 | #
257 | # This feature, new in version 0.7.1, is governed by the on_demand argument
258 | # to the open_workbook() function and allows saving memory and time by loading
259 | # only those sheets that the caller is interested in, and releasing sheets
260 | # when no longer required.
261 | #
262 | # on_demand=False (default): No change. open_workbook() loads global data
263 | # and all sheets, releases resources no longer required (principally the
264 | # str or mmap object containing the Workbook stream), and returns.
265 | #
266 | # on_demand=True and BIFF version < 5.0: A warning message is emitted,
267 | # on_demand is recorded as False, and the old process is followed.
268 | #
269 | # on_demand=True and BIFF version >= 5.0: open_workbook() loads global
270 | # data and returns without releasing resources. At this stage, the only
271 | # information available about sheets is Book.nsheets and Book.sheet_names().
272 | #
273 | # Book.sheet_by_name() and Book.sheet_by_index() will load the requested
274 | # sheet if it is not already loaded.
275 | #
276 | # Book.sheets() will load all/any unloaded sheets.
277 | #
278 | # The caller may save memory by calling
279 | # Book.unload_sheet(sheet_name_or_index) when finished with the sheet.
280 | # This applies irrespective of the state of on_demand.
281 | #
282 | # The caller may re-load an unloaded sheet by calling Book.sheet_by_xxxx()
283 | # -- except if those required resources have been released (which will
284 | # have happened automatically when on_demand is false). This is the only
285 | # case where an exception will be raised.
286 | #
287 | # The caller may query the state of a sheet:
288 | # Book.sheet_loaded(sheet_name_or_index) -> a bool
289 | #
290 | # Book.release_resources() may used to save memory and close
291 | # any memory-mapped file before proceding to examine already-loaded
292 | # sheets. Once resources are released, no further sheets can be loaded.
293 | #
294 | # When using on-demand, it is advisable to ensure that
295 | # Book.release_resources() is always called even if an exception
296 | # is raised in your own code; otherwise if the input file has been
297 | # memory-mapped, the mmap.mmap object will not be closed and you will
298 | # not be able to access the physical file until your Python process
299 | # terminates. This can be done by calling Book.release_resources()
300 | # explicitly in the finally suite of a try/finally block.
301 | # New in xlrd 0.7.2: the Book object is a "context manager", so if
302 | # using Python 2.5 or later, you can wrap your code in a "with"
303 | # statement.
304 | ##
305 |
306 | import sys, zipfile, pprint
307 | from . import timemachine
308 | from .biffh import (
309 | XLRDError,
310 | biff_text_from_num,
311 | error_text_from_code,
312 | XL_CELL_BLANK,
313 | XL_CELL_TEXT,
314 | XL_CELL_BOOLEAN,
315 | XL_CELL_ERROR,
316 | XL_CELL_EMPTY,
317 | XL_CELL_DATE,
318 | XL_CELL_NUMBER
319 | )
320 | from .formula import * # is constrained by __all__
321 | from .book import Book, colname #### TODO #### formula also has `colname` (restricted to 256 cols)
322 | from .sheet import empty_cell
323 | from .xldate import XLDateError, xldate_as_tuple
324 |
325 | if sys.version.startswith("IronPython"):
326 | # print >> sys.stderr, "...importing encodings"
327 | import encodings
328 |
329 | try:
330 | import mmap
331 | MMAP_AVAILABLE = 1
332 | except ImportError:
333 | MMAP_AVAILABLE = 0
334 | USE_MMAP = MMAP_AVAILABLE
335 |
336 | ##
337 | #
338 | # Open a spreadsheet file for data extraction.
339 | #
340 | # @param filename The path to the spreadsheet file to be opened.
341 | #
342 | # @param logfile An open file to which messages and diagnostics are written.
343 | #
344 | # @param verbosity Increases the volume of trace material written to the logfile.
345 | #
346 | # @param use_mmap Whether to use the mmap module is determined heuristically.
347 | # Use this arg to override the result. Current heuristic: mmap is used if it exists.
348 | #
349 | # @param file_contents ... as a string or an mmap.mmap object or some other behave-alike object.
350 | # If file_contents is supplied, filename will not be used, except (possibly) in messages.
351 | #
352 | # @param encoding_override Used to overcome missing or bad codepage information
353 | # in older-version files. Refer to discussion in the Unicode section above.
354 | #
# -- New in version 0.6.0
355 | #
356 | # @param formatting_info Governs provision of a reference to an XF (eXtended Format) object
357 | # for each cell in the worksheet.
358 | #
# Default is False. This is backwards compatible and saves memory.
359 | # "Blank" cells (those with their own formatting information but no data) are treated as empty
360 | # (by ignoring the file's BLANK and MULBLANK records).
361 | # It cuts off any bottom "margin" of rows of empty (and blank) cells and
362 | # any right "margin" of columns of empty (and blank) cells.
363 | # Only cell_value and cell_type are available.
364 | #
# True provides all cells, including empty and blank cells.
365 | # XF information is available for each cell.
366 | #
# -- New in version 0.6.1
367 | #
368 | # @param on_demand Governs whether sheets are all loaded initially or when demanded
369 | # by the caller. Please refer back to the section "Loading worksheets on demand" for details.
370 | #
# -- New in version 0.7.1
371 | #
372 | # @param ragged_rows False (the default) means all rows are padded out with empty cells so that all
373 | # rows have the same size (Sheet.ncols). True means that there are no empty cells at the ends of rows.
374 | # This can result in substantial memory savings if rows are of widely varying sizes. See also the
375 | # Sheet.row_len() method.
376 | #
# -- New in version 0.7.2
377 | #
378 | # @return An instance of the Book class.
379 |
def open_workbook(filename=None,
                  logfile=sys.stdout,
                  verbosity=0,
                  use_mmap=USE_MMAP,
                  file_contents=None,
                  encoding_override=None,
                  formatting_info=False,
                  on_demand=False,
                  ragged_rows=False,
                  ):
    """Open a spreadsheet file for data extraction.

    filename -- path of the spreadsheet file to be opened.
    logfile -- an open file to which messages and diagnostics are written.
    verbosity -- increases the volume of trace material written to logfile.
    use_mmap -- whether to use the mmap module; the default is determined
        heuristically (mmap is used if it is available).
    file_contents -- the file data as a string/bytes, an mmap.mmap object, or
        some other behave-alike; if supplied, filename is not used except
        (possibly) in messages.
    encoding_override -- codec name used to overcome missing or bad codepage
        information in older-version files (see the Unicode section above).
    formatting_info -- True provides all cells, including empty and blank
        cells, with XF information available for each cell; the default False
        is backwards compatible and saves memory.
    on_demand -- governs whether sheets are all loaded initially or only when
        demanded by the caller (see "Loading worksheets on demand" above).
    ragged_rows -- True means rows are not padded out with empty cells to a
        common Sheet.ncols; can save substantial memory for ragged data.

    Returns an instance of the Book class.
    Raises XLRDError for ZIP containers that are not xlsx workbooks.
    """
    peeksz = 4
    if file_contents:
        peek = file_contents[:peeksz]
    else:
        # "with" guarantees the handle is closed even if read() raises;
        # the previous open()/read()/close() sequence leaked it on error.
        with open(filename, "rb") as f:
            peek = f.read(peeksz)
    if peek == b"PK\x03\x04": # a ZIP file
        if file_contents:
            zf = zipfile.ZipFile(timemachine.BYTES_IO(file_contents))
        else:
            zf = zipfile.ZipFile(filename)

        # Workaround for some third party files that use forward slashes and
        # lower case names. We map the expected name in lowercase to the
        # actual filename in the zip container.
        component_names = dict([(name.replace('\\', '/').lower(), name)
                                for name in zf.namelist()])

        if verbosity:
            logfile.write('ZIP component_names:\n')
            pprint.pprint(component_names, logfile)
        if 'xl/workbook.xml' in component_names:
            from . import xlsx
            bk = xlsx.open_workbook_2007_xml(
                zf,
                component_names,
                logfile=logfile,
                verbosity=verbosity,
                use_mmap=use_mmap,
                formatting_info=formatting_info,
                on_demand=on_demand,
                ragged_rows=ragged_rows,
                )
            return bk
        if 'xl/workbook.bin' in component_names:
            raise XLRDError('Excel 2007 xlsb file; not supported')
        if 'content.xml' in component_names:
            raise XLRDError('Openoffice.org ODS file; not supported')
        raise XLRDError('ZIP file contents not a known type of workbook')

    # Not a ZIP container: assume classic BIFF (.xls) and delegate.
    from . import book
    bk = book.open_workbook_xls(
        filename=filename,
        logfile=logfile,
        verbosity=verbosity,
        use_mmap=use_mmap,
        file_contents=file_contents,
        encoding_override=encoding_override,
        formatting_info=formatting_info,
        on_demand=on_demand,
        ragged_rows=ragged_rows,
        )
    return bk
444 |
445 | ##
446 | # For debugging: dump an XLS file's BIFF records in char & hex.
447 | # @param filename The path to the file to be dumped.
448 | # @param outfile An open file, to which the dump is written.
449 | # @param unnumbered If true, omit offsets (for meaningful diffs).
450 |
def dump(filename, outfile=sys.stdout, unnumbered=False):
    """For debugging: dump an XLS file's BIFF records in char & hex.

    filename -- path of the file to be dumped.
    outfile -- an open file, to which the dump is written.
    unnumbered -- if true, omit offsets (for meaningful diffs).
    """
    from .biffh import biff_dump
    workbook = Book()
    workbook.biff2_8_load(filename=filename, logfile=outfile)
    biff_dump(workbook.mem, workbook.base, workbook.stream_len, 0, outfile, unnumbered)
456 |
457 | ##
458 | # For debugging and analysis: summarise the file's BIFF records.
459 | # I.e. produce a sorted file of (record_name, count).
460 | # @param filename The path to the file to be summarised.
461 | # @param outfile An open file, to which the summary is written.
462 |
def count_records(filename, outfile=sys.stdout):
    """For debugging and analysis: summarise the file's BIFF records,
    i.e. produce a sorted listing of (record_name, count).

    filename -- path of the file to be summarised.
    outfile -- an open file, to which the summary is written.
    """
    from .biffh import biff_count_records
    workbook = Book()
    workbook.biff2_8_load(filename=filename, logfile=outfile)
    biff_count_records(workbook.mem, workbook.base, workbook.stream_len, outfile)
468 |
--------------------------------------------------------------------------------
/src/xlrd/biffh.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/biffh.py
--------------------------------------------------------------------------------
/src/xlrd/compdoc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: cp1252 -*-
2 |
3 | ##
4 | # Implements the minimal functionality required
5 | # to extract a "Workbook" or "Book" stream (as one big string)
6 | # from an OLE2 Compound Document file.
7 | # Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
8 | # This module is part of the xlrd package, which is released under a BSD-style licence.
9 | ##
10 |
11 | # No part of the content of this file was derived from the works of David Giffin.
12 |
13 | # 2008-11-04 SJM Avoid assertion error when -1 used instead of -2 for first_SID of empty SCSS [Frank Hoffsuemmer]
14 | # 2007-09-08 SJM Warning message if sector sizes are extremely large.
15 | # 2007-05-07 SJM Meaningful exception instead of IndexError if a SAT (sector allocation table) is corrupted.
16 | # 2007-04-22 SJM Missing "<" in a struct.unpack call => can't open files on bigendian platforms.
17 |
18 | from __future__ import print_function
19 | import sys
20 | from struct import unpack
21 | from .timemachine import *
22 | import array
23 |
24 | ##
25 | # Magic cookie that should appear in the first 8 bytes of the file.
26 | SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
27 |
28 | EOCSID = -2
29 | FREESID = -1
30 | SATSID = -3
31 | MSATSID = -4
32 | EVILSID = -5
33 |
class CompDocError(Exception):
    """Raised when the OLE2 compound document is structurally invalid or corrupt."""
    pass
36 |
37 | class DirNode(object):
38 |
39 | def __init__(self, DID, dent, DEBUG=0, logfile=sys.stdout):
40 | # dent is the 128-byte directory entry
41 | self.DID = DID
42 | self.logfile = logfile
43 | (cbufsize, self.etype, self.colour, self.left_DID, self.right_DID,
44 | self.root_DID) = \
45 | unpack(' 20: # allows for 2**20 bytes i.e. 1MB
98 | print("WARNING: sector size (2**%d) is preposterous; assuming 512 and continuing ..." \
99 | % ssz, file=logfile)
100 | ssz = 9
101 | if sssz > ssz:
102 | print("WARNING: short stream sector size (2**%d) is preposterous; assuming 64 and continuing ..." \
103 | % sssz, file=logfile)
104 | sssz = 6
105 | self.sec_size = sec_size = 1 << ssz
106 | self.short_sec_size = 1 << sssz
107 | if self.sec_size != 512 or self.short_sec_size != 64:
108 | print("@@@@ sec_size=%d short_sec_size=%d" % (self.sec_size, self.short_sec_size), file=logfile)
109 | (
110 | SAT_tot_secs, self.dir_first_sec_sid, _unused, self.min_size_std_stream,
111 | SSAT_first_sec_sid, SSAT_tot_secs,
112 | MSATX_first_sec_sid, MSATX_tot_secs,
113 | # ) = unpack(' 1:
154 | print('MSATX: sid=%d (0x%08X)' % (sid, sid), file=logfile)
155 | if sid >= mem_data_secs:
156 | msg = "MSAT extension: accessing sector %d but only %d in file" % (sid, mem_data_secs)
157 | if DEBUG > 1:
158 | print(msg, file=logfile)
159 | break
160 | raise CompDocError(msg)
161 | elif sid < 0:
162 | raise CompDocError("MSAT extension: invalid sector id: %d" % sid)
163 | if seen[sid]:
164 | raise CompDocError("MSAT corruption: seen[%d] == %d" % (sid, seen[sid]))
165 | seen[sid] = 1
166 | actual_MSATX_sectors += 1
167 | if DEBUG and actual_MSATX_sectors > expected_MSATX_sectors:
168 | print("[1]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile)
169 | offset = 512 + sec_size * sid
170 | MSAT.extend(unpack(fmt, mem[offset:offset+sec_size]))
171 | sid = MSAT.pop() # last sector id is sid of next sector in the chain
172 |
173 | if DEBUG and actual_MSATX_sectors != expected_MSATX_sectors:
174 | print("[2]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile)
175 | if DEBUG:
176 | print("MSAT: len =", len(MSAT), file=logfile)
177 | dump_list(MSAT, 10, logfile)
178 | #
179 | # === build the SAT ===
180 | #
181 | self.SAT = []
182 | actual_SAT_sectors = 0
183 | dump_again = 0
184 | for msidx in xrange(len(MSAT)):
185 | msid = MSAT[msidx]
186 | if msid in (FREESID, EOCSID):
187 | # Specification: the MSAT array may be padded with trailing FREESID entries.
188 | # Toleration: a FREESID or EOCSID entry anywhere in the MSAT array will be ignored.
189 | continue
190 | if msid >= mem_data_secs:
191 | if not trunc_warned:
192 | print("WARNING *** File is truncated, or OLE2 MSAT is corrupt!!", file=logfile)
193 | print("INFO: Trying to access sector %d but only %d available" \
194 | % (msid, mem_data_secs), file=logfile)
195 | trunc_warned = 1
196 | MSAT[msidx] = EVILSID
197 | dump_again = 1
198 | continue
199 | elif msid < -2:
200 | raise CompDocError("MSAT: invalid sector id: %d" % msid)
201 | if seen[msid]:
202 | raise CompDocError("MSAT extension corruption: seen[%d] == %d" % (msid, seen[msid]))
203 | seen[msid] = 2
204 | actual_SAT_sectors += 1
205 | if DEBUG and actual_SAT_sectors > SAT_sectors_reqd:
206 | print("[3]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, actual_SAT_sectors, msid, file=logfile)
207 | offset = 512 + sec_size * msid
208 | self.SAT.extend(unpack(fmt, mem[offset:offset+sec_size]))
209 |
210 | if DEBUG:
211 | print("SAT: len =", len(self.SAT), file=logfile)
212 | dump_list(self.SAT, 10, logfile)
213 | # print >> logfile, "SAT ",
214 | # for i, s in enumerate(self.SAT):
215 | # print >> logfile, "entry: %4d offset: %6d, next entry: %4d" % (i, 512 + sec_size * i, s)
216 | # print >> logfile, "%d:%d " % (i, s),
217 | print(file=logfile)
218 | if DEBUG and dump_again:
219 | print("MSAT: len =", len(MSAT), file=logfile)
220 | dump_list(MSAT, 10, logfile)
221 | for satx in xrange(mem_data_secs, len(self.SAT)):
222 | self.SAT[satx] = EVILSID
223 | print("SAT: len =", len(self.SAT), file=logfile)
224 | dump_list(self.SAT, 10, logfile)
225 | #
226 | # === build the directory ===
227 | #
228 | dbytes = self._get_stream(
229 | self.mem, 512, self.SAT, self.sec_size, self.dir_first_sec_sid,
230 | name="directory", seen_id=3)
231 | dirlist = []
232 | did = -1
233 | for pos in xrange(0, len(dbytes), 128):
234 | did += 1
235 | dirlist.append(DirNode(did, dbytes[pos:pos+128], 0, logfile))
236 | self.dirlist = dirlist
237 | _build_family_tree(dirlist, 0, dirlist[0].root_DID) # and stand well back ...
238 | if DEBUG:
239 | for d in dirlist:
240 | d.dump(DEBUG)
241 | #
242 | # === get the SSCS ===
243 | #
244 | sscs_dir = self.dirlist[0]
245 | assert sscs_dir.etype == 5 # root entry
246 | if sscs_dir.first_SID < 0 or sscs_dir.tot_size == 0:
247 | # Problem reported by Frank Hoffsuemmer: some software was
248 | # writing -1 instead of -2 (EOCSID) for the first_SID
249 | # when the SCCS was empty. Not having EOCSID caused assertion
250 | # failure in _get_stream.
251 | # Solution: avoid calling _get_stream in any case when the
252 | # SCSS appears to be empty.
253 | self.SSCS = ""
254 | else:
255 | self.SSCS = self._get_stream(
256 | self.mem, 512, self.SAT, sec_size, sscs_dir.first_SID,
257 | sscs_dir.tot_size, name="SSCS", seen_id=4)
258 | # if DEBUG: print >> logfile, "SSCS", repr(self.SSCS)
259 | #
260 | # === build the SSAT ===
261 | #
262 | self.SSAT = []
263 | if SSAT_tot_secs > 0 and sscs_dir.tot_size == 0:
264 | print("WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero", file=logfile)
265 | if sscs_dir.tot_size > 0:
266 | sid = SSAT_first_sec_sid
267 | nsecs = SSAT_tot_secs
268 | while sid >= 0 and nsecs > 0:
269 | if seen[sid]:
270 | raise CompDocError("SSAT corruption: seen[%d] == %d" % (sid, seen[sid]))
271 | seen[sid] = 5
272 | nsecs -= 1
273 | start_pos = 512 + sid * sec_size
274 | news = list(unpack(fmt, mem[start_pos:start_pos+sec_size]))
275 | self.SSAT.extend(news)
276 | sid = self.SAT[sid]
277 | if DEBUG: print("SSAT last sid %d; remaining sectors %d" % (sid, nsecs), file=logfile)
278 | assert nsecs == 0 and sid == EOCSID
279 | if DEBUG:
280 | print("SSAT", file=logfile)
281 | dump_list(self.SSAT, 10, logfile)
282 | if DEBUG:
283 | print("seen", file=logfile)
284 | dump_list(seen, 20, logfile)
285 |
    def _get_stream(self, mem, base, sat, sec_size, start_sid, size=None, name='', seen_id=None):
        """Follow a sector chain through *sat* and return the stream as bytes.

        mem -- buffer holding the whole container (main file or SSCS).
        base -- byte offset of sector 0 within mem.
        sat -- sector allocation table; sat[s] is the sector after s.
        sec_size -- size of one sector in bytes.
        start_sid -- id of the chain's first sector.
        size -- expected total byte size, or None if unknown.
        name -- stream name, used only in diagnostics.
        seen_id -- if not None, mark visited sectors in self.seen so that
            two chains sharing a sector are detected as corruption.

        Raises CompDocError on a shared sector or an invalid SAT entry.
        """
        # print >> self.logfile, "_get_stream", base, sec_size, start_sid, size
        sectors = []
        s = start_sid
        if size is None:
            # nothing to check against
            while s >= 0:
                if seen_id is not None:
                    if self.seen[s]:
                        raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s]))
                    self.seen[s] = seen_id
                start_pos = base + s * sec_size
                sectors.append(mem[start_pos:start_pos+sec_size])
                try:
                    s = sat[s]
                except IndexError:
                    raise CompDocError(
                        "OLE2 stream %r: sector allocation table invalid entry (%d)" %
                        (name, s)
                        )
            # chain must terminate with the end-of-chain sentinel
            assert s == EOCSID
        else:
            todo = size
            while s >= 0:
                if seen_id is not None:
                    if self.seen[s]:
                        raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s]))
                    self.seen[s] = seen_id
                start_pos = base + s * sec_size
                # take only as many bytes as the declared size still needs
                grab = sec_size
                if grab > todo:
                    grab = todo
                todo -= grab
                sectors.append(mem[start_pos:start_pos+grab])
                try:
                    s = sat[s]
                except IndexError:
                    raise CompDocError(
                        "OLE2 stream %r: sector allocation table invalid entry (%d)" %
                        (name, s)
                        )
            assert s == EOCSID
            if todo != 0:
                # chain was shorter than the declared size: warn, keep going
                fprintf(self.logfile,
                    "WARNING *** OLE2 stream %r: expected size %d, actual size %d\n",
                    name, size, size - todo)

        return b''.join(sectors)
334 |
335 | def _dir_search(self, path, storage_DID=0):
336 | # Return matching DirNode instance, or None
337 | head = path[0]
338 | tail = path[1:]
339 | dl = self.dirlist
340 | for child in dl[storage_DID].children:
341 | if dl[child].name.lower() == head.lower():
342 | et = dl[child].etype
343 | if et == 2:
344 | return dl[child]
345 | if et == 1:
346 | if not tail:
347 | raise CompDocError("Requested component is a 'storage'")
348 | return self._dir_search(tail, child)
349 | dl[child].dump(1)
350 | raise CompDocError("Requested stream is not a 'user stream'")
351 | return None
352 |
353 | ##
354 | # Interrogate the compound document's directory; return the stream as a string if found, otherwise
355 | # return None.
356 | # @param qname Name of the desired stream e.g. u'Workbook'. Should be in Unicode or convertible thereto.
357 |
358 | def get_named_stream(self, qname):
359 | d = self._dir_search(qname.split("/"))
360 | if d is None:
361 | return None
362 | if d.tot_size >= self.min_size_std_stream:
363 | return self._get_stream(
364 | self.mem, 512, self.SAT, self.sec_size, d.first_SID,
365 | d.tot_size, name=qname, seen_id=d.DID+6)
366 | else:
367 | return self._get_stream(
368 | self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID,
369 | d.tot_size, name=qname + " (from SSCS)", seen_id=None)
370 |
371 | ##
372 | # Interrogate the compound document's directory.
373 | # If the named stream is not found, (None, 0, 0) will be returned.
374 | # If the named stream is found and is contiguous within the original byte sequence ("mem")
375 | # used when the document was opened,
376 | # then (mem, offset_to_start_of_stream, length_of_stream) is returned.
377 | # Otherwise a new string is built from the fragments and (new_string, 0, length_of_stream) is returned.
378 | # @param qname Name of the desired stream e.g. u'Workbook'. Should be in Unicode or convertible thereto.
379 |
380 | def locate_named_stream(self, qname):
381 | d = self._dir_search(qname.split("/"))
382 | if d is None:
383 | return (None, 0, 0)
384 | if d.tot_size > self.mem_data_len:
385 | raise CompDocError("%r stream length (%d bytes) > file data size (%d bytes)"
386 | % (qname, d.tot_size, self.mem_data_len))
387 | if d.tot_size >= self.min_size_std_stream:
388 | result = self._locate_stream(
389 | self.mem, 512, self.SAT, self.sec_size, d.first_SID,
390 | d.tot_size, qname, d.DID+6)
391 | if self.DEBUG:
392 | print("\nseen", file=self.logfile)
393 | dump_list(self.seen, 20, self.logfile)
394 | return result
395 | else:
396 | return (
397 | self._get_stream(
398 | self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID,
399 | d.tot_size, qname + " (from SSCS)", None),
400 | 0,
401 | d.tot_size
402 | )
403 |
    def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id):
        """Locate the stream starting at sector *start_sid* without copying
        it when possible.

        Follows the SAT chain, marking each visited sector in self.seen.
        If every sector turns out to be physically contiguous in *mem*,
        returns (mem, start_offset, expected_stream_size) -- zero-copy.
        Otherwise the fragments are joined and (new_bytes, 0, size) is
        returned.  Raises CompDocError on chain loops or when the chain is
        longer than the expected size allows.
        """
        # print >> self.logfile, "_locate_stream", base, sec_size, start_sid, expected_stream_size
        s = start_sid
        if s < 0:
            raise CompDocError("_locate_stream: start_sid (%d) is -ve" % start_sid)
        p = -99 # dummy previous SID
        start_pos = -9999
        end_pos = -8888
        slices = []
        tot_found = 0
        # Maximum sector count the chain may legally contain (size rounded up).
        found_limit = (expected_stream_size + sec_size - 1) // sec_size
        while s >= 0:
            if self.seen[s]:
                print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile)
                raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s]))
            self.seen[s] = seen_id
            tot_found += 1
            if tot_found > found_limit:
                raise CompDocError(
                    "%s: size exceeds expected %d bytes; corrupt?"
                    % (qname, found_limit * sec_size)
                ) # Note: expected size rounded up to higher sector
            if s == p+1:
                # contiguous sectors
                end_pos += sec_size
            else:
                # start new slice
                if p >= 0:
                    # not first time
                    slices.append((start_pos, end_pos))
                start_pos = base + s * sec_size
                end_pos = start_pos + sec_size
            p = s
            s = sat[s]
        assert s == EOCSID
        assert tot_found == found_limit
        # print >> self.logfile, "_locate_stream(%s): seen" % qname; dump_list(self.seen, 20, self.logfile)
        if not slices:
            # The stream is contiguous ... just what we like!
            return (mem, start_pos, expected_stream_size)
        # Flush the final run, then assemble the fragments into one string.
        slices.append((start_pos, end_pos))
        # print >> self.logfile, "+++>>> %d fragments" % len(slices)
        return (b''.join([mem[start_pos:end_pos] for start_pos, end_pos in slices]), 0, expected_stream_size)
447 |
448 | # ==========================================================================================
def x_dump_line(alist, stride, f, dpos, equal=0):
    """Print one dump line to *f*: a 5-wide position, a ' '/'=' marker,
    then *stride* values of *alist* starting at *dpos*."""
    marker = "=" if equal else " "
    fields = [str(v) for v in alist[dpos:dpos + stride]]
    print("%5d%s" % (dpos, marker), *fields, end=' ', file=f)
    print(file=f)
454 |
def dump_list(alist, stride, f=sys.stdout):
    """Dump *alist* to *f*, *stride* values per line.

    Runs of identical lines are collapsed: only the first line of a run is
    printed, and the run's final line is printed with a '=' marker to show
    that everything in between repeats it.
    """
    def show(dpos, equal=0):
        fields = [str(v) for v in alist[dpos:dpos + stride]]
        print("%5d%s" % (dpos, "=" if equal else " "), *fields, end=' ', file=f)
        print(file=f)
    pos = None
    oldpos = None
    for pos in range(0, len(alist), stride):
        if oldpos is None or alist[pos:pos + stride] != alist[oldpos:oldpos + stride]:
            if oldpos is not None and pos - oldpos > stride:
                # Close out the repeated run with its final line.
                show(pos - stride, equal=1)
            show(pos)
            oldpos = pos
    if oldpos is not None and pos is not None and pos != oldpos:
        # The list ended inside a repeated run; show its last line.
        show(pos, equal=1)
474 |
--------------------------------------------------------------------------------
/src/xlrd/doc/compdoc.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | The compdoc Module
6 |
7 |
8 | The compdoc Module
9 | Implements the minimal functionality required
10 | to extract a "Workbook" or "Book" stream (as one big string)
11 | from an OLE2 Compound Document file.
12 |
Copyright © 2005-2012 Stephen John Machin, Lingfo Pty Ltd
13 | This module is part of the xlrd package, which is released under a BSD-style licence.
14 | Module Contents
15 |
16 | - CompDoc(mem, logfile=sys.stdout, DEBUG=0) (class) [#]
17 | -
18 |
Compound document handler.
19 |
20 | - mem
21 | -
22 | The raw contents of the file, as a string, or as an mmap.mmap() object. The
23 | only operation it needs to support is slicing.
24 |
25 | For more information about this class, see The CompDoc Class.
26 |
27 | - SIGNATURE (variable) [#]
28 | -
29 |
Magic cookie that should appear in the first 8 bytes of the file.
30 |
31 |
32 |
33 |
34 | - CompDoc(mem, logfile=sys.stdout, DEBUG=0) (class) [#]
35 | -
36 |
Compound document handler.
37 |
38 | - mem
39 | -
40 | The raw contents of the file, as a string, or as an mmap.mmap() object. The
41 | only operation it needs to support is slicing.
42 |
43 |
44 | - get_named_stream(qname) [#]
45 | -
46 |
Interrogate the compound document's directory; return the stream as a string if found, otherwise
47 | return None.
48 |
49 | - qname
50 | -
51 | Name of the desired stream e.g. u'Workbook'. Should be in Unicode or convertible thereto.
52 |
53 |
54 | - locate_named_stream(qname) [#]
55 | -
56 |
Interrogate the compound document's directory.
57 | If the named stream is not found, (None, 0, 0) will be returned.
58 | If the named stream is found and is contiguous within the original byte sequence ("mem")
59 | used when the document was opened,
60 | then (mem, offset_to_start_of_stream, length_of_stream) is returned.
61 | Otherwise a new string is built from the fragments and (new_string, 0, length_of_stream) is returned.
62 |
63 | - qname
64 | -
65 | Name of the desired stream e.g. u'Workbook'. Should be in Unicode or convertible thereto.
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/src/xlrd/examples/namesdemo.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/examples/namesdemo.xls
--------------------------------------------------------------------------------
/src/xlrd/examples/xlrdnameAPIdemo.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/examples/xlrdnameAPIdemo.py
--------------------------------------------------------------------------------
/src/xlrd/formatting.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/formatting.py
--------------------------------------------------------------------------------
/src/xlrd/formula.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/formula.py
--------------------------------------------------------------------------------
/src/xlrd/info.py:
--------------------------------------------------------------------------------
__VERSION__ = "0.9.4"  # xlrd release version string, re-exported as xlrd.__VERSION__
2 |
--------------------------------------------------------------------------------
/src/xlrd/licences.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/licences.py
--------------------------------------------------------------------------------
/src/xlrd/sheet.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/sheet.py
--------------------------------------------------------------------------------
/src/xlrd/timemachine.py:
--------------------------------------------------------------------------------
1 | ##
2 | # Copyright (c) 2006-2012 Stephen John Machin, Lingfo Pty Ltd
3 | # This module is part of the xlrd package, which is released under a BSD-style licence.
4 | ##
5 |
6 | # timemachine.py -- adaptation for single codebase.
7 | # Currently supported: 2.6 to 2.7, 3.2+
8 | # usage: from timemachine import *
9 |
10 | from __future__ import print_function
11 | import sys
12 |
# Single-source Python 2/3 shims: the rest of xlrd uses the names defined
# below so one codebase runs unchanged on both major versions.
python_version = sys.version_info[:2] # e.g. version 2.6 -> (2, 6)

if python_version >= (3, 0):
    # Python 3
    BYTES_LITERAL = lambda x: x.encode('latin1')
    UNICODE_LITERAL = lambda x: x
    BYTES_ORD = lambda byte: byte # indexing bytes already yields an int on py3
    from io import BytesIO as BYTES_IO
    def fprintf(f, fmt, *vargs):
        # %a keeps repr-style output ASCII-safe on Python 3
        fmt = fmt.replace("%r", "%a")
        if fmt.endswith('\n'):
            print(fmt[:-1] % vargs, file=f)
        else:
            print(fmt % vargs, end=' ', file=f)
    EXCEL_TEXT_TYPES = (str, bytes, bytearray) # xlwt: isinstance(obj, EXCEL_TEXT_TYPES)
    REPR = ascii
    xrange = range
    unicode = lambda b, enc: b.decode(enc) # py2-style unicode(bytes, encoding)
    ensure_unicode = lambda s: s
    unichr = chr
else:
    # Python 2
    BYTES_LITERAL = lambda x: x
    UNICODE_LITERAL = lambda x: x.decode('latin1')
    BYTES_ORD = ord
    from cStringIO import StringIO as BYTES_IO
    def fprintf(f, fmt, *vargs):
        if fmt.endswith('\n'):
            print(fmt[:-1] % vargs, file=f)
        else:
            print(fmt % vargs, end=' ', file=f)
    try:
        EXCEL_TEXT_TYPES = basestring # xlwt: isinstance(obj, EXCEL_TEXT_TYPES)
    except NameError:
        EXCEL_TEXT_TYPES = (str, unicode)
    REPR = repr
    xrange = xrange
    # following used only to overcome 2.x ElementTree gimmick which
    # returns text as `str` if it's ascii, otherwise `unicode`
    ensure_unicode = unicode # used only in xlsx.py
53 |
--------------------------------------------------------------------------------
/src/xlrd/xldate.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanishe/i-sheet-you-not/ffebaa0248336fad333ba408f31f9001aefafb8a/src/xlrd/xldate.py
--------------------------------------------------------------------------------
/src/xlrd/xlsx.py:
--------------------------------------------------------------------------------
1 | ##
2 | # Portions copyright (c) 2008-2012 Stephen John Machin, Lingfo Pty Ltd
3 | # This module is part of the xlrd package, which is released under a BSD-style licence.
4 | ##
5 |
6 | from __future__ import print_function, unicode_literals
7 |
8 | DEBUG = 0
9 |
10 | import sys
11 | import re
12 | from .timemachine import *
13 | from .book import Book, Name
14 | from .biffh import error_text_from_code, XLRDError, XL_CELL_BLANK, XL_CELL_TEXT, XL_CELL_BOOLEAN, XL_CELL_ERROR
15 | from .formatting import is_date_format_string, Format, XF
16 | from .sheet import Sheet
17 |
18 | DLF = sys.stdout # Default Log File
19 |
ET = None                  # ElementTree implementation, chosen lazily by ensure_elementtree_imported()
ET_has_iterparse = False   # True once iterparse has been probed and found usable
Element_has_iter = False   # True if the implementation has the newer .iter() API
23 |
def ensure_elementtree_imported(verbosity, logfile):
    """Import an ElementTree implementation on first use and probe it.

    Tries the fastest implementation available, falling back through the
    historical alternatives.  Sets the module globals ET (the chosen
    implementation), ET_has_iterparse (iterparse actually works -- it is
    defined but unusable on IronPython) and Element_has_iter (newer
    .iter() API present).  Idempotent: returns at once if ET is set.
    """
    global ET, ET_has_iterparse, Element_has_iter
    if ET is not None:
        return
    if "IronPython" in sys.version:
        import xml.etree.ElementTree as ET
        #### 2.7.2.1: fails later with
        #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
    else:
        try: import xml.etree.cElementTree as ET
        except ImportError:
            try: import cElementTree as ET
            except ImportError:
                try: import lxml.etree as ET
                except ImportError:
                    try: import xml.etree.ElementTree as ET
                    except ImportError:
                        try: import elementtree.ElementTree as ET
                        except ImportError:
                            raise Exception("Failed to import an ElementTree implementation")
    if hasattr(ET, 'iterparse'):
        # Probe with an empty stream: merely having the attribute is not
        # enough (IronPython raises NotImplementedError on actual use).
        _dummy_stream = BYTES_IO(b'')
        try:
            ET.iterparse(_dummy_stream)
            ET_has_iterparse = True
        except NotImplementedError:
            pass
    Element_has_iter = hasattr(ET.ElementTree, 'iter')
    if verbosity:
        etree_version = repr([
            (item, getattr(ET, item))
            for item in ET.__dict__.keys()
            if item.lower().replace('_', '') == 'version'
        ])
        print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)
59 |
def split_tag(tag):
    """Split a Clark-notation tag into (namespace-with-braces, local name).

    '{uri}name' -> ('{uri}', 'name'); a tag without a usable namespace
    part comes back unchanged as ('', tag).
    """
    brace = tag.rfind('}')
    if brace < 1:
        # No '}' at all, or one at index 0 (empty namespace body).
        return '', tag
    cut = brace + 1
    return tag[:cut], tag[cut:]
65 |
def augment_keys(adict, uri):
    """For every key k in *adict*, add an alias entry uri+k mapping to the
    same value.  *uri* must already be enclosed in {}.  Mutates *adict*
    in place; the snapshot via list() keeps iteration safe while adding.
    """
    for key, value in list(adict.items()):
        adict[uri + key] = value
70 |
_UPPERCASE_1_REL_INDEX = {} # Used in fast conversion of column names (e.g. "XFD") to indices (16383)
# "A" -> 1 ... "Z" -> 26 (1-relative, for base-26 accumulation); digits map
# to 0 so the letters/row-number boundary in a cell name can be detected.
for _x in xrange(26):
    _UPPERCASE_1_REL_INDEX["ABCDEFGHIJKLMNOPQRSTUVWXYZ"[_x]] = _x + 1
for _x in "123456789":
    _UPPERCASE_1_REL_INDEX[_x] = 0
del _x
77 |
def cell_name_to_rowx_colx(cell_name, letter_value=_UPPERCASE_1_REL_INDEX):
    """Convert an A1-style cell reference to zero-based (rowx, colx).

    Column letters decode as A -> 0, Z -> 25, AA -> 26, XFD -> 16383.
    Raises Exception on a character that is neither A-Z nor a digit.
    """
    colx = 0
    charx = -1
    for charx, ch in enumerate(cell_name):
        try:
            lv = letter_value[ch]
        except KeyError:
            raise Exception('Unexpected character %r in cell name %r' % (ch, cell_name))
        if lv == 0:
            # A digit marks the start of the row number (a row number
            # can't start with '0', so zero uniquely flags the boundary).
            colx -= 1
            assert 0 <= colx < X12_MAX_COLS
            break
        colx = colx * 26 + lv
    rowx = int(cell_name[charx:]) - 1
    return rowx, colx
97 |
# Inverse of biffh.error_text_from_code: maps error text (e.g. '#DIV/0!')
# back to the corresponding BIFF error code.
error_code_from_text = {}
for _code, _text in error_text_from_code.items():
    error_code_from_text[_text] = _code
101 |
102 | # === X12 === Excel 2007 .xlsx ===============================================
103 |
# Clark-notation namespace prefixes for the OOXML schemas used by .xlsx parts.
U_SSML12 = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}"
U_ODREL = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"
U_PKGREL = "{http://schemas.openxmlformats.org/package/2006/relationships}"
U_CP = "{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}"
U_DC = "{http://purl.org/dc/elements/1.1/}"
U_DCTERMS = "{http://purl.org/dc/terms/}"
XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
XML_WHITESPACE = "\t\n \r"
# Excel 2007+ grid limits: 1048576 rows x 16384 columns.
X12_MAX_ROWS = 2 ** 20
X12_MAX_COLS = 2 ** 14
V_TAG = U_SSML12 + 'v' # cell child: value
F_TAG = U_SSML12 + 'f' # cell child: formula
IS_TAG = U_SSML12 + 'is' # cell child: inline string
117 |
def unescape(s,
        subber=re.compile(r'_x[0-9A-Fa-f]{4,4}_', re.UNICODE).sub,
        repl=lambda mobj: unichr(int(mobj.group(0)[2:6], 16)),
        ):
    """Decode OOXML _xHHHH_ character escapes in *s*.

    The cheap '_' containment test skips the regex machinery for the
    common case of a string containing no escapes at all.
    """
    return subber(repl, s) if "_" in s else s
125 |
def cooked_text(self, elem):
    """Return *elem*'s text as unescaped unicode.

    Missing text yields ''; surrounding XML whitespace is stripped unless
    the element carries xml:space="preserve".
    """
    raw = elem.text
    if raw is None:
        return ''
    if elem.get(XML_SPACE_ATTR) != 'preserve':
        raw = raw.strip(XML_WHITESPACE)
    return ensure_unicode(unescape(raw))
133 |
def get_text_from_si_or_is(self, elem, r_tag=U_SSML12+'r', t_tag=U_SSML12 +'t'):
    """Return the concatenated text of an <si> or <is> element as
    unescaped unicode, collecting both plain <t> children and <t> nodes
    nested inside rich-text <r> runs."""
    pieces = []
    for child in elem:
        # self.dump_elem(child)
        if child.tag == t_tag:
            piece = cooked_text(self, child)
            if piece: # note: .text attribute can be None
                pieces.append(piece)
        elif child.tag == r_tag:
            for run_node in child:
                if run_node.tag == t_tag:
                    piece = cooked_text(self, run_node)
                    if piece:
                        pieces.append(piece)
    return ''.join(pieces)
151 |
def map_attributes(amap, elem, obj):
    """Transfer XML attributes of *elem* onto *obj* per the spec table *amap*.

    Each row is (xml_attr, obj_attr, cnv_func_or_const): an empty xml_attr
    stores the third field directly as a constant; an empty obj_attr drops
    the attribute; otherwise the converter is applied to elem.get(xml_attr).
    """
    for xml_attr, obj_attr, cnv_func_or_const in amap:
        if not xml_attr:
            # No source attribute: the third field is a constant default.
            setattr(obj, obj_attr, cnv_func_or_const)
        elif obj_attr:
            setattr(obj, obj_attr, cnv_func_or_const(elem.get(xml_attr)))
        # else: attribute parsed but deliberately ignored (#### FIX ME in original)
161 |
def cnv_ST_Xstring(s):
    """Convert an ST_Xstring attribute value to unicode; missing -> ''."""
    return "" if s is None else ensure_unicode(s)
165 |
def cnv_xsd_unsignedInt(s):
    """Parse an xsd:unsignedInt attribute value.

    Returns None for a missing/empty attribute, otherwise the
    (non-negative) integer value.
    """
    if s:
        result = int(s)
        assert result >= 0
        return result
    return None
172 |
def cnv_xsd_boolean(s):
    """Map an xsd:boolean attribute value to int 1 or 0.

    A missing/empty attribute counts as false; anything outside the
    schema's legal spellings raises ValueError.
    """
    if not s:
        return 0
    if s in ("0", "false", "off"):
        return 0
    if s in ("1", "true", "on"):
        return 1
    raise ValueError("unexpected xsd:boolean value: %r" % s)
181 |
182 |
# Spec table consumed by map_attributes() when building a Name object from a
# <definedName> element: rows are (xml_attr, Name attr, converter-or-const).
# An empty xml_attr sets a constant default; an empty Name attr means the
# XML attribute is recognised but discarded for now.
_defined_name_attribute_map = (
    ("name", "name", cnv_ST_Xstring, ),
    ("comment", "", cnv_ST_Xstring, ),
    ("customMenu", "", cnv_ST_Xstring, ),
    ("description", "", cnv_ST_Xstring, ),
    ("help", "", cnv_ST_Xstring, ),
    ("statusBar", "", cnv_ST_Xstring, ),
    ("localSheetId", "scope", cnv_xsd_unsignedInt, ),
    ("hidden", "hidden", cnv_xsd_boolean, ),
    ("function", "func", cnv_xsd_boolean, ),
    ("vbProcedure", "vbasic", cnv_xsd_boolean, ),
    ("xlm", "macro", cnv_xsd_boolean, ),
    ("functionGroupId", "funcgroup", cnv_xsd_unsignedInt, ),
    ("shortcutKey", "", cnv_ST_Xstring, ),
    ("publishToServer", "", cnv_xsd_boolean, ),
    ("workbookParameter", "", cnv_xsd_boolean, ),
    ("", "any_err", 0, ),
    ("", "any_external", 0, ),
    ("", "any_rel", 0, ),
    ("", "basic_formula_len", 0, ),
    ("", "binary", 0, ),
    ("", "builtin", 0, ),
    ("", "complex", 0, ),
    ("", "evaluated", 0, ),
    ("", "excel_sheet_index", 0, ),
    ("", "excel_sheet_num", 0, ),
    ("", "option_flags", 0, ),
    ("", "result", None, ),
    ("", "stack", None, ),
)
213 |
def make_name_access_maps(bk):
    """Build bk.name_and_scope_map and bk.name_map from bk.name_obj_list.

    name_and_scope_map: (name.lower(), scope) -> Name object; on duplicate
    keys the last definition wins, with a warning on bk.logfile when
    bk.verbosity is set (the original dead ``if 0: raise XLRDError`` branch
    has been removed -- duplicates were never fatal).
    name_map: name.lower() -> list of Name objects ordered by (scope,
    definition order), so more global definitions follow more local ones.
    """
    name_and_scope_map = {} # (name.lower(), scope): Name_object
    name_map = {}           # name.lower() : list of Name_objects
    for namex, nobj in enumerate(bk.name_obj_list):
        name_lcase = nobj.name.lower()
        key = (name_lcase, nobj.scope)
        if key in name_and_scope_map and bk.verbosity:
            print('Duplicate entry %r in name_and_scope_map' % (key, ),
                  file=bk.logfile)
        name_and_scope_map[key] = nobj
        # namex breaks ties so Name objects themselves are never compared.
        name_map.setdefault(name_lcase, []).append((nobj.scope, namex, nobj))
    for key in name_map:
        name_map[key] = [entry[2] for entry in sorted(name_map[key])]
    bk.name_and_scope_map = name_and_scope_map
    bk.name_map = name_map
241 |
class X12General(object):
    """Base class for the X12 (.xlsx) part processors.

    Subclasses define ``tag2meth``, a dict mapping element tags to plain
    (unbound) handler functions; process_stream() parses one zip-member
    stream with ElementTree and dispatches each element to its handler.
    """

    def process_stream(self, stream, heading=None):
        # Parse the whole stream, then walk every element and dispatch to
        # the handler registered for its tag (if any).
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        self.tree = ET.parse(stream)
        getmethod = self.tag2meth.get
        for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator():
            if self.verbosity >= 3:
                self.dump_elem(elem)
            meth = getmethod(elem.tag)
            if meth:
                # tag2meth holds plain functions, so pass self explicitly.
                meth(self, elem)
        self.finish_off()

    def finish_off(self):
        # Hook for subclasses; called once after the whole stream is handled.
        pass

    def dump_elem(self, elem):
        # Debug helper: one-line summary of an element.
        fprintf(self.logfile,
            "===\ntag=%r len=%d attrib=%r text=%r tail=%r\n",
            split_tag(elem.tag)[1], len(elem), elem.attrib, elem.text, elem.tail)

    def dumpout(self, fmt, *vargs):
        # Debug helper: indented, formatted line on the log file.
        text = (12 * ' ' + fmt + '\n') % vargs
        self.logfile.write(text)
268 |
class X12Book(X12General):
    """Processor for the workbook part (xl/workbook.xml) and its related
    relationship and core-properties parts; populates the Book object
    supplied to __init__."""

    def __init__(self, bk, logfile=DLF, verbosity=False):
        self.bk = bk
        self.logfile = logfile
        self.verbosity = verbosity
        self.bk.nsheets = 0
        self.bk.props = {}
        self.relid2path = {} # relationship Id -> zip member path
        self.relid2reltype = {} # relationship Id -> relationship type
        self.sheet_targets = [] # indexed by sheetx
        self.sheetIds = [] # indexed by sheetx

    # Maps core-properties element tags to (Book.props key, converter).
    core_props_menu = {
        U_CP+"lastModifiedBy": ("last_modified_by", cnv_ST_Xstring),
        U_DC+"creator": ("creator", cnv_ST_Xstring),
        U_DCTERMS+"modified": ("modified", cnv_ST_Xstring),
        U_DCTERMS+"created": ("created", cnv_ST_Xstring),
    }

    def process_coreprops(self, stream):
        """Parse docProps/core.xml into self.bk.props and bk.user_name."""
        if self.verbosity >= 2:
            fprintf(self.logfile, "\n=== coreProps ===\n")
        self.tree = ET.parse(stream)
        getmenu = self.core_props_menu.get
        props = {}
        for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator():
            if self.verbosity >= 3:
                self.dump_elem(elem)
            menu = getmenu(elem.tag)
            if menu:
                attr, func = menu
                value = func(elem.text)
                props[attr] = value
        self.bk.user_name = props.get('last_modified_by') or props.get('creator')
        self.bk.props = props
        if self.verbosity >= 2:
            fprintf(self.logfile, "props: %r\n", props)
        self.finish_off()

    def process_rels(self, stream):
        """Parse a .rels part, filling relid2reltype and relid2path."""
        if self.verbosity >= 2:
            fprintf(self.logfile, "\n=== Relationships ===\n")
        tree = ET.parse(stream)
        r_tag = U_PKGREL + 'Relationship'
        for elem in tree.findall(r_tag):
            rid = elem.get('Id')
            target = elem.get('Target')
            reltype = elem.get('Type').split('/')[-1]
            if self.verbosity >= 2:
                self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target)
            self.relid2reltype[rid] = reltype
            # self.relid2path[rid] = 'xl/' + target
            if target.startswith('/'):
                self.relid2path[rid] = target[1:] # drop the /
            else:
                self.relid2path[rid] = 'xl/' + target

    def do_defined_name(self, elem):
        """Handle one <definedName> element: build and register a Name."""
        #### UNDER CONSTRUCTION ####
        if 0 and self.verbosity >= 3:
            self.dump_elem(elem)
        nobj = Name()
        bk = self.bk
        nobj.bk = bk
        nobj.name_index = len(bk.name_obj_list)
        bk.name_obj_list.append(nobj)
        nobj.name = elem.get('name')
        nobj.raw_formula = None # compiled bytecode formula -- not in XLSX
        nobj.formula_text = cooked_text(self, elem)
        map_attributes(_defined_name_attribute_map, elem, nobj)
        if nobj.scope is None:
            nobj.scope = -1 # global
        if nobj.name.startswith("_xlnm."):
            nobj.builtin = 1
        if self.verbosity >= 2:
            nobj.dump(header='=== Name object ===')

    def do_defined_names(self, elem):
        # Process every child <definedName>, then rebuild the lookup maps.
        for child in elem:
            self.do_defined_name(child)
        make_name_access_maps(self.bk)

    def do_sheet(self, elem):
        """Handle one <sheet> element: create a Sheet shell and record its
        target path, sheetId and visibility; non-worksheet parts (charts
        etc.) are skipped."""
        bk = self.bk
        sheetx = bk.nsheets
        # print elem.attrib
        rid = elem.get(U_ODREL + 'id')
        sheetId = int(elem.get('sheetId'))
        name = unescape(ensure_unicode(elem.get('name')))
        reltype = self.relid2reltype[rid]
        target = self.relid2path[rid]
        if self.verbosity >= 2:
            self.dumpout(
                'sheetx=%d sheetId=%r rid=%r type=%r name=%r',
                sheetx, sheetId, rid, reltype, name)
        if reltype != 'worksheet':
            if self.verbosity >= 2:
                self.dumpout('Ignoring sheet of type %r (name=%r)', reltype, name)
            return
        state = elem.get('state')
        visibility_map = {
            None: 0,
            'visible': 0,
            'hidden': 1,
            'veryHidden': 2
        }
        bk._sheet_visibility.append(visibility_map[state])
        sheet = Sheet(bk, position=None, name=name, number=sheetx)
        sheet.utter_max_rows = X12_MAX_ROWS
        sheet.utter_max_cols = X12_MAX_COLS
        bk._sheet_list.append(sheet)
        bk._sheet_names.append(name)
        bk.nsheets += 1
        self.sheet_targets.append(target)
        self.sheetIds.append(sheetId)


    def do_workbookpr(self, elem):
        # date1904 is the Excel-for-Mac epoch flag; stored as Book.datemode.
        datemode = cnv_xsd_boolean(elem.get('date1904'))
        if self.verbosity >= 2:
            self.dumpout('datemode=%r', datemode)
        self.bk.datemode = datemode

    # Dispatch table used by X12General.process_stream.
    tag2meth = {
        'definedNames': do_defined_names,
        'workbookPr': do_workbookpr,
        'sheet': do_sheet,
    }
    augment_keys(tag2meth, U_SSML12)
399 |
class X12SST(X12General):
    """Processor for the shared-strings part (xl/sharedStrings.xml).

    Appends each <si> entry's text to bk._sharedstrings.  Uses the
    incremental iterparse API when available (much lower memory use),
    otherwise falls back to a full parse plus findall.
    """

    def __init__(self, bk, logfile=DLF, verbosity=0):
        self.bk = bk
        self.logfile = logfile
        self.verbosity = verbosity
        if ET_has_iterparse:
            self.process_stream = self.process_stream_iterparse
        else:
            self.process_stream = self.process_stream_findall

    def process_stream_iterparse(self, stream, heading=None):
        # Incremental parse: handle each <si> as it completes, then free it.
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        si_tag = U_SSML12 + 'si'
        elemno = -1
        sst = self.bk._sharedstrings
        for event, elem in ET.iterparse(stream):
            if elem.tag != si_tag: continue
            elemno = elemno + 1
            if self.verbosity >= 3:
                fprintf(self.logfile, "element #%d\n", elemno)
                self.dump_elem(elem)
            result = get_text_from_si_or_is(self, elem)
            sst.append(result)
            elem.clear() # destroy all child elements
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(sst))
        if self.verbosity >= 3:
            for x, s in enumerate(sst):
                fprintf(self.logfile, "SST x=%d s=%r\n", x, s)

    def process_stream_findall(self, stream, heading=None):
        # Whole-tree fallback for ElementTree builds without iterparse.
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        self.tree = ET.parse(stream)
        si_tag = U_SSML12 + 'si'
        elemno = -1
        sst = self.bk._sharedstrings
        for elem in self.tree.findall(si_tag):
            elemno = elemno + 1
            if self.verbosity >= 3:
                fprintf(self.logfile, "element #%d\n", elemno)
                self.dump_elem(elem)
            result = get_text_from_si_or_is(self, elem)
            sst.append(result)
        if self.verbosity >= 2:
            self.dumpout('Entries in SST: %d', len(sst))
448 |
class X12Styles(X12General):
    """Processor for the styles part (xl/styles.xml).

    Tracks which number formats denote dates so cell values can later be
    typed as dates versus numbers via bk._xf_index_to_xl_type_map.
    """

    def __init__(self, bk, logfile=DLF, verbosity=0):
        self.bk = bk
        self.logfile = logfile
        self.verbosity = verbosity
        self.xf_counts = [0, 0] # counts of [style XFs, cell XFs] seen
        self.xf_type = None # 0 => inside cellStyleXfs, 1 => inside cellXfs
        self.fmt_is_date = {} # numFmtId -> 1 if a date format
        for x in list(range(14, 23)) + list(range(45, 48)): #### hard-coding FIX ME ####
            self.fmt_is_date[x] = 1
        # dummy entry for XF 0 in case no Styles section
        self.bk._xf_index_to_xl_type_map[0] = 2
        # fill_in_standard_formats(bk) #### pre-integration kludge

    def do_cellstylexfs(self, elem):
        # Entering the <cellStyleXfs> section.
        self.xf_type = 0

    def do_cellxfs(self, elem):
        # Entering the <cellXfs> section.
        self.xf_type = 1

    def do_numfmt(self, elem):
        """Record a custom number format and whether it denotes dates."""
        formatCode = ensure_unicode(elem.get('formatCode'))
        numFmtId = int(elem.get('numFmtId'))
        is_date = is_date_format_string(self.bk, formatCode)
        self.fmt_is_date[numFmtId] = is_date
        fmt_obj = Format(numFmtId, is_date + 2, formatCode)
        self.bk.format_map[numFmtId] = fmt_obj
        if self.verbosity >= 3:
            self.dumpout('numFmtId=%d formatCode=%r is_date=%d', numFmtId, formatCode, is_date)

    def do_xf(self, elem):
        """Record one cell XF: its format key and date-vs-number type."""
        if self.xf_type != 1:
            #### ignoring style XFs for the moment
            return
        xfx = self.xf_counts[self.xf_type]
        self.xf_counts[self.xf_type] = xfx + 1
        xf = XF()
        self.bk.xf_list.append(xf)
        self.bk.xfcount += 1
        numFmtId = int(elem.get('numFmtId', '0'))
        xf.format_key = numFmtId
        is_date = self.fmt_is_date.get(numFmtId, 0)
        self.bk._xf_index_to_xl_type_map[xfx] = is_date + 2
        if self.verbosity >= 3:
            self.dumpout(
                'xfx=%d numFmtId=%d',
                xfx, numFmtId,
            )
            self.dumpout(repr(self.bk._xf_index_to_xl_type_map))

    # Dispatch table used by X12General.process_stream.
    tag2meth = {
        'cellStyleXfs': do_cellstylexfs,
        'cellXfs': do_cellxfs,
        'numFmt': do_numfmt,
        'xf': do_xf,
    }
    augment_keys(tag2meth, U_SSML12)
507 |
class X12Sheet(X12General):
    """Parser for one xl/worksheets/sheetN.xml part of a .xlsx file.

    Populates the wrapped Sheet object with cell values, dimensions and
    merged-cell ranges, and (via process_comments_stream) cell notes.
    """

    def __init__(self, sheet, logfile=DLF, verbosity=0):
        self.sheet = sheet
        self.logfile = logfile
        self.verbosity = verbosity
        self.rowx = -1 # We may need to count them.
        self.bk = sheet.book
        self.sst = self.bk._sharedstrings
        self.merged_cells = sheet.merged_cells
        # One-shot warning flags: warn only once per sheet about rows/cells
        # that omit their optional 'r' attribute.
        self.warned_no_cell_name = 0
        self.warned_no_row_num = 0
        if ET_has_iterparse:
            # Prefer the incremental parser when the ElementTree in use
            # supports it: rows can be discarded as they are processed,
            # keeping memory usage flat on large sheets.
            self.process_stream = self.own_process_stream

    def own_process_stream(self, stream, heading=None):
        """Incremental (iterparse-based) replacement for the inherited
        X12General.process_stream; handles row/dimension/mergeCell elements
        as their end tags arrive instead of building the whole tree."""
        if self.verbosity >= 2 and heading is not None:
            fprintf(self.logfile, "\n=== %s ===\n", heading)
        getmethod = self.tag2meth.get  # NOTE(review): unused here; retained from the generic path
        row_tag = U_SSML12 + "row"
        self_do_row = self.do_row  # hoisted bound-method lookup for the hot loop
        for event, elem in ET.iterparse(stream):
            if elem.tag == row_tag:
                self_do_row(elem)
                elem.clear() # destroy all child elements (cells)
            elif elem.tag == U_SSML12 + "dimension":
                self.do_dimension(elem)
            elif elem.tag == U_SSML12 + "mergeCell":
                self.do_merge_cell(elem)
        self.finish_off()

    def process_comments_stream(self, stream):
        """Parse an xl/commentsN.xml part and record a Note per commented
        cell in self.sheet.cell_note_map, keyed by (rowx, colx)."""
        root = ET.parse(stream).getroot()
        author_list = root[0]
        assert author_list.tag == U_SSML12 + 'authors'
        authors = [elem.text for elem in author_list]
        comment_list = root[1]
        assert comment_list.tag == U_SSML12 + 'commentList'
        cell_note_map = self.sheet.cell_note_map
        from .sheet import Note
        text_tag = U_SSML12 + 'text'
        r_tag = U_SSML12 + 'r'
        t_tag = U_SSML12 + 't'
        for elem in comment_list.findall(U_SSML12 + 'comment'):
            # Collect plain <t> runs, then <t> runs nested in rich-text <r>
            # elements; their texts are concatenated into one note string.
            ts = elem.findall('./' + text_tag + '/' + t_tag)
            ts += elem.findall('./' + text_tag + '/' + r_tag + '/' + t_tag)
            ref = elem.get('ref')
            note = Note()
            note.author = authors[int(elem.get('authorId'))]
            note.rowx, note.colx = coords = cell_name_to_rowx_colx(ref)
            note.text = ''
            for t in ts:
                note.text += cooked_text(self, t)
            cell_note_map[coords] = note

    def do_dimension(self, elem):
        """Record the sheet's declared extent from a <dimension> element."""
        ref = elem.get('ref') # example: "A1:Z99" or just "A1"
        if ref:
            # print >> self.logfile, "dimension: ref=%r" % ref
            last_cell_ref = ref.split(':')[-1] # example: "Z99"
            rowx, colx = cell_name_to_rowx_colx(last_cell_ref)
            self.sheet._dimnrows = rowx + 1
            self.sheet._dimncols = colx + 1

    def do_merge_cell(self, elem):
        """Append one merged range to self.merged_cells as the half-open
        tuple (first_rowx, last_rowx+1, first_colx, last_colx+1)."""
        # The ref attribute should be a cell range like "B1:D5".
        ref = elem.get('ref')
        if ref:
            first_cell_ref, last_cell_ref = ref.split(':')
            first_rowx, first_colx = cell_name_to_rowx_colx(first_cell_ref)
            last_rowx, last_colx = cell_name_to_rowx_colx(last_cell_ref)
            self.merged_cells.append((first_rowx, last_rowx + 1,
                first_colx, last_colx + 1))

    def do_row(self, row_elem):
        """Process one <row> element: decode each cell's position and type,
        and put its converted value into self.sheet."""

        def bad_child_tag(child_tag):
            # Shared error path for a child tag that a cell type never has.
            raise Exception('cell type %s has unexpected child <%s> at rowx=%r colx=%r' % (cell_type, child_tag, rowx, colx))

        row_number = row_elem.get('r')
        if row_number is None: # Yes, it's optional.
            self.rowx += 1
            explicit_row_number = 0
            if self.verbosity and not self.warned_no_row_num:
                self.dumpout("no row number; assuming rowx=%d", self.rowx)
                self.warned_no_row_num = 1
        else:
            self.rowx = int(row_number) - 1
            explicit_row_number = 1
        assert 0 <= self.rowx < X12_MAX_ROWS
        rowx = self.rowx
        colx = -1
        if self.verbosity >= 3:
            self.dumpout(" row_number=%r rowx=%d explicit=%d",
                row_number, self.rowx, explicit_row_number)
        letter_value = _UPPERCASE_1_REL_INDEX
        for cell_elem in row_elem:
            cell_name = cell_elem.get('r')
            if cell_name is None: # Yes, it's optional.
                colx += 1
                if self.verbosity and not self.warned_no_cell_name:
                    self.dumpout("no cellname; assuming rowx=%d colx=%d", rowx, colx)
                    self.warned_no_cell_name = 1
            else:
                # Extract column index from cell name
                # A => 0, Z =>25, AA => 26, XFD => 16383
                colx = 0
                charx = -1
                try:
                    for c in cell_name:
                        charx += 1
                        if c == '$':
                            continue
                        lv = letter_value[c]
                        if lv:
                            colx = colx * 26 + lv
                        else: # start of row number; can't be '0'
                            colx = colx - 1
                            assert 0 <= colx < X12_MAX_COLS
                            break
                except KeyError:
                    raise Exception('Unexpected character %r in cell name %r' % (c, cell_name))
                # charx now indexes the first digit: the tail of the name
                # must agree with the row's own 'r' attribute if present.
                if explicit_row_number and cell_name[charx:] != row_number:
                    raise Exception('cell name %r but row number is %r' % (cell_name, row_number))
            xf_index = int(cell_elem.get('s', '0'))
            cell_type = cell_elem.get('t', 'n')
            tvalue = None
            formula = None
            if cell_type == 'n':
                # n = number. Most frequent type.
                # <v> child contains plain text which can go straight into float()
                # OR there's no text in which case it's a BLANK cell
                for child in cell_elem:
                    child_tag = child.tag
                    if child_tag == V_TAG:
                        tvalue = child.text
                    elif child_tag == F_TAG:
                        formula = cooked_text(self, child)
                    else:
                        raise Exception('unexpected tag %r' % child_tag)
                if not tvalue:
                    # Blank cell: only recorded when formatting info is kept.
                    if self.bk.formatting_info:
                        self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
                else:
                    self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
            elif cell_type == "s":
                # s = index into shared string table. 2nd most frequent type
                # <v> child contains plain text which can go straight into int()
                for child in cell_elem:
                    child_tag = child.tag
                    if child_tag == V_TAG:
                        tvalue = child.text
                    elif child_tag == F_TAG:
                        # formula not expected here, but gnumeric does it.
                        formula = child.text
                    else:
                        bad_child_tag(child_tag)
                if not tvalue:
                    # Empty shared-string cell: treated as blank.
                    if self.bk.formatting_info:
                        self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
                else:
                    value = self.sst[int(tvalue)]
                    self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, value, xf_index)
            elif cell_type == "str":
                # str = string result from formula.
                # Should have <f> (formula) child; however in one file, all text cells are str with no formula.
                # <v> child can contain escapes
                for child in cell_elem:
                    child_tag = child.tag
                    if child_tag == V_TAG:
                        tvalue = cooked_text(self, child)
                    elif child_tag == F_TAG:
                        formula = cooked_text(self, child)
                    else:
                        bad_child_tag(child_tag)
                # assert tvalue is not None and formula is not None
                # Yuk. Fails with file created by gnumeric -- no tvalue!
                self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index)
            elif cell_type == "b":
                # b = boolean
                # <v> child contains "0" or "1"
                # Maybe the data should be converted with cnv_xsd_boolean;
                # ECMA standard is silent; Excel 2007 writes 0 or 1
                for child in cell_elem:
                    child_tag = child.tag
                    if child_tag == V_TAG:
                        tvalue = child.text
                    elif child_tag == F_TAG:
                        formula = cooked_text(self, child)
                    else:
                        bad_child_tag(child_tag)
                self.sheet.put_cell(rowx, colx, XL_CELL_BOOLEAN, int(tvalue), xf_index)
            elif cell_type == "e":
                # e = error
                # <v> child contains e.g. "#REF!"
                for child in cell_elem:
                    child_tag = child.tag
                    if child_tag == V_TAG:
                        tvalue = child.text
                    elif child_tag == F_TAG:
                        formula = cooked_text(self, child)
                    else:
                        bad_child_tag(child_tag)
                value = error_code_from_text[tvalue]
                self.sheet.put_cell(rowx, colx, XL_CELL_ERROR, value, xf_index)
            elif cell_type == "inlineStr":
                # Not expected in files produced by Excel.
                # Only possible child is <is>.
                # It's a way of allowing 3rd party s/w to write text (including rich text) cells
                # without having to build a shared string table
                for child in cell_elem:
                    child_tag = child.tag
                    if child_tag == IS_TAG:
                        tvalue = get_text_from_si_or_is(self, child)
                    else:
                        bad_child_tag(child_tag)
                assert tvalue is not None
                self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index)
            else:
                raise Exception("Unknown cell type %r in rowx=%d colx=%d" % (cell_type, rowx, colx))

    # Dispatch table used by the generic (non-iterparse) process_stream;
    # augment_keys() adds a namespace-qualified duplicate of each key.
    tag2meth = {
        'row': do_row,
        }
    augment_keys(tag2meth, U_SSML12)
734 |
def open_workbook_2007_xml(
    zf,
    component_names,
    logfile=sys.stdout,
    verbosity=0,
    use_mmap=0,
    formatting_info=0,
    on_demand=0,
    ragged_rows=0,
    ):
    """Build and return a Book object from an already-opened .xlsx zip archive.

    zf -- an open zipfile.ZipFile for the document.
    component_names -- mapping from normalised component name (e.g.
        'xl/workbook.xml') to the actual archive member name to pass
        to zf.open().
    logfile, verbosity, use_mmap, formatting_info, on_demand, ragged_rows --
        as for the top-level open_workbook(); mmap is never used for this
        format, formatting_info=True raises NotImplementedError, and
        on_demand=True falls back to False with a warning.

    Fix vs. previous revision: each component stream obtained from
    zf.open() is now explicitly close()d after use. The old code merely
    did `del zflo` (and forgot even that for docprops/core.xml), which
    only releases the handle promptly on refcounting interpreters.
    """
    ensure_elementtree_imported(verbosity, logfile)
    bk = Book()
    bk.logfile = logfile
    bk.verbosity = verbosity
    bk.formatting_info = formatting_info
    if formatting_info:
        raise NotImplementedError("formatting_info=True not yet implemented")
    bk.use_mmap = False #### Not supported initially
    bk.on_demand = on_demand
    if on_demand:
        if verbosity:
            print("WARNING *** on_demand=True not yet implemented; falling back to False", file=bk.logfile)
        bk.on_demand = False
    bk.ragged_rows = ragged_rows

    # Workbook-level parts: relationships, then the workbook itself.
    x12book = X12Book(bk, logfile, verbosity)
    zflo = zf.open(component_names['xl/_rels/workbook.xml.rels'])
    x12book.process_rels(zflo)
    zflo.close()
    zflo = zf.open(component_names['xl/workbook.xml'])
    x12book.process_stream(zflo, 'Workbook')
    zflo.close()
    props_name = 'docprops/core.xml'
    if props_name in component_names:
        zflo = zf.open(component_names[props_name])
        x12book.process_coreprops(zflo)
        zflo.close()

    # Styles part is optional.
    x12sty = X12Styles(bk, logfile, verbosity)
    if 'xl/styles.xml' in component_names:
        zflo = zf.open(component_names['xl/styles.xml'])
        x12sty.process_stream(zflo, 'styles')
        zflo.close()
    else:
        # seen in MS sample file MergedCells.xlsx
        pass

    # Shared-string table is optional too.
    sst_fname = 'xl/sharedstrings.xml'
    x12sst = X12SST(bk, logfile, verbosity)
    if sst_fname in component_names:
        zflo = zf.open(component_names[sst_fname])
        x12sst.process_stream(zflo, 'SST')
        zflo.close()

    # One pass per worksheet, plus its comments part if present.
    for sheetx in range(bk.nsheets):
        fname = x12book.sheet_targets[sheetx]
        zflo = zf.open(component_names[fname])
        sheet = bk._sheet_list[sheetx]
        x12sheet = X12Sheet(sheet, logfile, verbosity)
        heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname)
        x12sheet.process_stream(zflo, heading)
        zflo.close()
        # NOTE(review): assumes comments parts are numbered in sheet order
        # (xl/comments1.xml pairs with the first sheet, etc.) -- confirm.
        comments_fname = 'xl/comments%d.xml' % (sheetx + 1)
        if comments_fname in component_names:
            comments_stream = zf.open(component_names[comments_fname])
            x12sheet.process_comments_stream(comments_stream)
            comments_stream.close()

        sheet.tidy_dimensions()

    return bk
805 |
--------------------------------------------------------------------------------