. Given titles as
8 | Open XML files, in order, this will output them as an Akoma Ntoso XML file.
9 |
10 | Usage: scrape-statcode-us-co [options] file
11 | Arguments:
12 |
13 | file input .docx file from the bulk download site
14 | -o file output file ('-' for stdout) (default: stdout)
15 | -c string The StarOffice connect-string (default: 'socket,host=localhost,port=2002,tcpNoDelay=1')
16 | -d enable debugging output (default: warnings only)
17 |
18 | To run LibreOffice:
19 |
20 | soffice --writer --accept='socket,host=localhost,port=2002,tcpNoDelay=1;urp;' --norestore --nologo --headless --nolockcheck
21 | """
22 |
23 | import sys
24 | import os
25 | import getopt
26 | import lxml.etree as etree
27 | import uno
28 | import unohelper
29 | import logging
30 | import mimetypes
31 | import enum
32 | import collections
33 | import threading
34 | import queue
35 | import types
36 | import re
37 |
# Module-level logger shared by every function and class in this file.
# main() attaches the extra SUPERDEBUG/UBERDEBUG levels and helpers to it.
logger = logging.getLogger(__name__)
39 |
40 | ##
41 | # Parse parameters, call processing function.
42 | #
def main():
    """Parse the command line, convert the input file and write the XML.

    Options:
        -o FILE      output file; '-' (or no option) writes to stdout
        -c STRING    UNO connect string, without the 'uno:' prefix and the
                     ';urp;StarOffice.ComponentContext' suffix
        -d, --debug  raise verbosity one step per occurrence
                     (INFO -> DEBUG -> SUPERDEBUG -> UBERDEBUG)
        -h, --help   print the usage text and exit

    Returns the process exit status: 0 on success or after printing the
    usage text, 1 for a command-line error, 2 for an unsupported input file
    type, 3 when parsing produced no XML.

    NOTE(review): relies on a module-level `usage` string that is defined
    outside the section of the file reviewed here.
    """
    fout = sys.stdout.buffer
    debug = logging.INFO
    connect_string = 'uno:socket,host=localhost,port=2002,tcpNoDelay=1;urp;StarOffice.ComponentContext'
    # XXX monkey fix: hang two extra, more verbose levels off the logger.
    logger.SUPERDEBUG = logging.DEBUG - 2
    logger.UBERDEBUG = logging.DEBUG - 4

    # Configure logging
    logging.basicConfig(format='{levelname} {process}/{thread}/{funcName} {message}', style='{', level=debug)
    logging.addLevelName(logger.SUPERDEBUG, 'SUPERDEBUG')
    logging.addLevelName(logger.UBERDEBUG, 'UBERDEBUG')
    logger.superdebug = lambda msg, *args, **kwargs: logger.log(logger.SUPERDEBUG, msg, *args, **kwargs)
    logger.uberdebug = lambda msg, *args, **kwargs: logger.log(logger.UBERDEBUG, msg, *args, **kwargs)

    # Parse arguments. The long options are registered so that the
    # '--debug' / '--help' spellings tested below are actually accepted.
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'o:c:dh', ['debug', 'help'])
    except getopt.GetoptError:
        logger.critical(f"getopt error {usage}")
        return 1

    # Each -d moves one step down this ladder. Levels are plain ints, so
    # they are compared by value (dict lookup), not by identity.
    next_level = {
        logging.INFO: logging.DEBUG,
        logging.DEBUG: logger.SUPERDEBUG,
        logger.SUPERDEBUG: logger.UBERDEBUG,
    }

    for opt, arg in opts:
        if opt in {'-d', '--debug'}:
            if debug in next_level:
                debug = next_level[debug]
            else:
                logger.warning("main unknown debugging level")
                debug = logging.DEBUG
        elif opt == '-o':
            # '-' means stdout, as documented in the usage text; anything
            # else is a file name handed to ElementTree.write().
            fout = sys.stdout.buffer if arg == '-' else arg
        elif opt == '-c':
            connect_string = f"uno:{arg};urp;StarOffice.ComponentContext"
        elif opt in {'-h', '--help'}:
            print(usage)
            return 0
        else:
            logger.critical(f"invalid flag {opt}{usage}")
            return 1

    logger.setLevel(debug)
    if len(args) != 1:
        logger.critical(f"Missing input file {usage}")
        return 1
    fn = args[0]

    # Verify file type. This fails on Windows?
    # (The check is skipped entirely when os.name is 'nt'.)
    mime = mimetypes.guess_type(fn)
    accepted = ('application/rtf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
    if mime[0] not in accepted and os.name != 'nt':
        logger.critical(f"Unknown filetype: {mime} {fn}")
        return 2

    xml = do_parse(connect_string, fn)
    if xml is None:
        logger.critical("Parsing returned no XML")
        return 3

    tree = etree.ElementTree(xml)
    tree.write(fout)

    return 0
108 |
109 | ##
110 | # Parse a file and return Akoma Ntoso XML.
111 | #
def do_parse(connect_string, fn):
    """Parse a file and return Akoma Ntoso XML.

    Builds the akomaNtoso/act/(meta, body) skeleton, then runs three worker
    threads connected by queues:

        OOFile.run -> paraq -> OOFileParser.run -> xmlq -> XMLBuilder.run -> outq

    The single item read from `outq` is the top element built by
    XMLBuilder; it is appended to `body`.

    Args:
        connect_string: UNO connect string passed to OOFile.run.
        fn: path of the input document.

    Returns:
        The root `akomaNtoso` element, or None when the threads could not
        be set up, when the pipeline was poisoned (outq yielded False), or
        when the builder had no element to return.
    """
    logger.info('do_parse parsing: {}'.format(fn))

    akn = etree.Element('akomaNtoso', nsmap={None: "http://docs.oasis-open.org/legaldocml/ns/akn/3.0", "xsi": "http://www.w3.org/2001/XMLSchema-instance"})
    akn.attrib['{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'] = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0 ../schemas/akomantoso30.xsd"
    act = etree.SubElement(akn, 'act')
    etree.SubElement(act, 'meta')  # empty placeholder, nothing fills it here
    body = etree.SubElement(act, 'body')

    #css = etree.ProcessingInstruction("xml-stylesheet", text='type="text/css" href="akn.css"')
    #akn.addprevious(css)

    try:
        paraq = queue.Queue(100)
        xmlq = queue.Queue(50)
        outq = queue.Queue() # XXX should we bound these?
        errq = queue.Queue()

        # open files, build threads
        filethread = threading.Thread(target=OOFile.run, args=(connect_string, fn, errq, paraq))
        parserthread = threading.Thread(target=OOFileParser.run, args=(paraq, xmlq, errq))
        builderthread = threading.Thread(target=XMLBuilder.run, args=(xmlq, outq, errq))

        # parse: consumers are started before the producer
        builderthread.start()
        parserthread.start()
        filethread.start()
        xml = outq.get()
    except OSError as e:
        logger.critical('do_parse opening files: {}'.format(e))
        return None
    except BaseException as e:
        logger.critical('do_parse exception: {} {}'.format(type(e), e))
        return None

    # Wait for completion of threads. Is this necessary?
    logger.info('joining threads: {}'.format(fn))
    for worker in (filethread, parserthread, builderthread):
        worker.join()

    # TODO implement better queue poisoning. Maybe use exception objects?
    # False is the poison marker and None means the builder never saw a
    # hierarchy element. Either way there is no document, so report failure
    # rather than return an empty but valid-looking tree.
    if xml is False or xml is None:
        logger.critical('do_parse pipeline produced no document: {}'.format(fn))
        return None

    body.append(xml)
    return akn
157 |
158 | ##
159 | # A state machine that parses a stream of semi-structured document lines
160 | # into partial Akoma Ntoso XML. The parse() function will consume the input
161 | # and output an XML object.
162 | #
class XMLBuilder:
    """State machine that assembles parser messages into an element tree.

    Messages are dicts with at least a 'type' key ('heirarchy' or 'text').
    Each heirarchy message becomes a new element attached to the nearest
    open ancestor; each text message becomes a `p` inside the `content`
    child of the most recently created element.
    """

    # An enumeration letter that is also a roman numeral is a
    # sub-subparagraph only when it directly follows the letter before it
    # in the alphabet (H -> I, U -> V, W -> X).
    _AMBIGUOUS_LETTERS = {'I': 'H', 'V': 'U', 'X': 'W'}

    def __init__(self):
        # `state' is an ordered dictionary with the top-most element of
        # each type, which represents our heirarchy of elements. The key
        # order is the nesting order, outermost first.
        self.state = collections.OrderedDict.fromkeys((
            'title',
            'article',
            'part',
            'subpart',
            'section',
            'subsection',
            'paragraph',
            'subparagraph',
            'subsubparagraph',
        ))
        # Most recently created element; text messages are attached to it.
        self.last = None

    @staticmethod
    def run(inq, outq, errq):
        """Thread entry point: build one document from `inq`.

        Any exception is logged and converted into a poison marker
        (False) on both `outq` and `errq`.
        """
        try:
            builder = XMLBuilder()
            builder.parse(inq, outq, errq)
        except BaseException as e:
            logger.critical('XMLBuilder.run exception: {} {}'.format(type(e), e), exc_info=True)
            outq.put(False) # poison queue
            errq.put(False)

    def parse(self, inq, outq, errq):
        """Consume messages from `inq` until EOF or poison.

        None on `inq` means end of input: the top-most element is put on
        `outq`. False means an upstream failure: False is forwarded to
        `outq` and `errq`.
        """
        assert inq is not None
        while True:
            msg = inq.get()
            if msg is None: # EOF
                outq.put(self.get_state_top())
                break
            elif msg is False: # poison pill
                outq.put(False) # poison queue
                errq.put(False)
                break
            logger.superdebug('XMLBuilder.parse: {}'.format(msg))
            self.event(msg)

    def event(self, signal):
        """Dispatch one message on its 'type'.

        Raises:
            RuntimeError: if the type is neither 'heirarchy' nor 'text'.
        """
        typ = signal['type']
        if typ == 'heirarchy':
            self.event_heirarchy(signal)
        elif typ == 'text':
            self.event_text(signal)
        else:
            raise RuntimeError('XMLBuilder: unknown event: {}'.format(signal))

    def event_text(self, signal):
        """Append the message content as a `p` to the current element.

        The `content` child of the last created element is created on
        demand. Text that arrives before any heirarchy element is dropped
        with a warning.
        """
        text = signal['content']
        last_el = self.get_last()
        if last_el is None:
            logger.warning(f'ignoring text {text}')
            return
        # Get or create content element.
        content_el = last_el.find("./content")
        if content_el is None:
            content_el = etree.SubElement(last_el, 'content')
        # Append new p element.
        pel = etree.SubElement(content_el, 'p')
        pel.text = text

    def event_heirarchy(self, signal):
        """Create a heirarchy element, attach it to its parent, update state.

        All heirarchical elements are built the same way: an `eId`
        attribute, an optional `status` attribute and optional `num`,
        `heading` and `content/p` children. When the message carries no
        subtype it is derived from the enumeration.
        """
        typ = signal['subtype']
        enum, head = signal['enum'], signal['heading']
        text, status = signal['content'], signal['status']
        # determine subtype
        if typ is None:
            typ = self.parse_heirarchy_type(enum)
        # create element
        el = etree.Element(typ)
        # el.attrib['title'] = self.get_name(typ, enum)
        el.attrib['eId'] = self.get_id(typ, enum)
        if status:
            el.attrib['status'] = status
        if enum:
            etree.SubElement(el, 'num').text = enum
        if head:
            etree.SubElement(el, 'heading').text = head
        if text:
            tel = etree.SubElement(el, 'content')
            etree.SubElement(tel, 'p').text = text
        # get parent (only title has no parent) and attach
        parentel = self.get_state_parent(typ)
        if parentel is not None:
            parentel.append(el)
        else:
            logger.warning('event_section no parent: {}'.format(signal))
        # update state
        self.set_state(el, typ)

    def parse_heirarchy_type(self, s):
        """Determine the type of element from its enumeration.

        -------------------------
        | s  | type             |
        -------------------------
        | 1  | subsection       |
        | a  | paragraph        |
        | IV | subparagraph     |
        | A  | sub-subparagraph |
        -------------------------

        'I', 'V' and 'X' are ambiguous: each is a sub-subparagraph when
        the current sub-subparagraph is numbered 'H', 'U' or 'W'
        respectively, otherwise a roman-numeral subparagraph.
        """
        if s.isdecimal():
            return 'subsection'
        if s.islower():
            return 'paragraph'
        if 'I' not in s and 'V' not in s and 'X' not in s:
            return 'subsubparagraph'
        predecessor = self._AMBIGUOUS_LETTERS.get(s)
        if predecessor is not None and self._current_enum('subsubparagraph') == predecessor:
            return 'subsubparagraph'
        logger.superdebug('heirarchy_type assume roman num: {}'.format(s))
        return 'subparagraph'

    def _current_enum(self, typ):
        """Return the `num` text of the open element of type `typ`, or None."""
        el = self.state[typ]
        if el is None:
            return None
        return el.findtext('num')

    def get_name(self, typ, enum):
        """Return a display name: capitalised type plus the enumeration."""
        assert typ is not None
        name = typ[0].upper() + typ[1:]
        if enum is not None: # XXX if no enum, is this required to be unique?
            name += ' ' + enum
        return name

    def get_id(self, typ, enum):
        """Build the eId for a new element of type `typ` numbered `enum`.

        Elements without a parent, and sections, get '<type>-<enum>'.
        Types above section get '<parent eId>-<type>-<enum>'; types below
        section get '<parent eId>-<enum>'.

        XXX requires non-None parent to have id attribute?

        Raises:
            RuntimeError: if `typ` is in none of those groups.
        """
        assert typ is not None and enum is not None
        parentel = self.get_state_parent(typ)
        # XXX only top-most element's parent will be None?
        if parentel is None or typ == 'section':
            return typ + '-' + enum
        if XMLBuilder.test_above_section(typ):
            return parentel.attrib['eId'] + '-' + typ + '-' + enum
        if XMLBuilder.test_below_section(typ):
            return parentel.attrib['eId'] + '-' + enum
        logger.critical('get_id unknown type: {}'.format(typ))
        raise RuntimeError('get_id unknown type: {}'.format(typ))

    @staticmethod
    def test_below_section(typ):
        """Test if type is below section type.

        TODO should probably make more robust to changes in heirarchy tree
        """
        return typ in {'subsection', 'paragraph', 'subparagraph', 'subsubparagraph'}

    @staticmethod
    def test_above_section(typ):
        """Test if type is above section type.

        TODO should probably make more robust to changes in heirarchy tree
        """
        return typ in {'title', 'article', 'part', 'subpart'}

    def get_state_parent(self, typ):
        """Get the lowest non-None element above type, or None if there is none.

        Raises:
            ValueError: if `typ` is not one of the state keys.
        """
        keys = list(self.state)
        # walk the keys above typ from the innermost outwards
        for key in reversed(keys[:keys.index(typ)]):
            if self.state[key] is not None:
                return self.state[key]
        return None

    def get_state_top(self):
        """Return the top-most open element, or None when nothing is open."""
        for el in self.state.values():
            if el is not None:
                return el
        return None

    def get_last(self):
        """Get the last heirarchy element that was set."""
        return self.last

    def set_state(self, el, typ):
        """Set (normalize and update) state.

        Stores `el` as the open element of type `typ` and clears every
        type below it.

        NOTE: Setting this will change which element gets the current text.
        """
        self.state[typ] = el
        # normalize state: clear all elements below type from state
        keys = list(self.state)
        for key in keys[keys.index(typ) + 1:]:
            self.state[key] = None
        # Reset the latest
        self.last = el
388 |
# Pattern for a line that opens a section:
#   group 1  the section number: digits, '-', digits/dots, '-', digits/dots
#   then a literal '.' followed by at least two whitespace characters
#   group 2  the heading (shortest match), ended by '.' and whitespace
#   group 3  the rest of the line
regex_sec = r'^(\d+\-[\d\.]+\-[\d\.]+)\.\s\s+(.+?)\.\s+(.+)'
390 |
391 | ##
392 | #
393 | # Here we do essential data processing.
394 | #
395 | # @input A stream of document lines.
396 | # @output A stream of heirarchy and text element data.
397 | #
class OOFileParser:
    """State machine that classifies document paragraphs.

    Input is a stream of paragraph messages (dicts with 'line', 'adjust',
    'lmargin', 'weight', 'align' keys). Output is a stream of heirarchy
    and text messages in the form
    {'type':, 'subtype':, 'name':, 'enum':, 'heading':, 'content':, 'status':}.
    """

    class StateEnum(enum.IntEnum):
        """Parser states. Only init, idle, heirarchy, section and text
        have an event handler; the others are never entered by this class.
        """
        init = 1 # TODO needed?
        idle = 3
        heirarchy = 4
        section = 5
        section_idle = 6
        section_note = 7
        section_note_one = 8
        section_note_two = 9
        text = 10

    # One-off repairs of known-bad paragraphs, applied by fixup() in this
    # order: section-level replacements, then the enumeration-prefix
    # repair, then the remaining replacements.
    _SECTION_FIXUPS = (
        ('this part\xa05', 'this part 5'),
        ('property\xa0-\xa0nonprofit', 'property - nonprofit'),
        ('defend\xa0-\xa0standing', 'defend - standing'),
        ('complaint\xa0-\xa0service', 'complaint - service'),
        ('article\xa064', 'article 64'),
        ('8-17-105.Compliance standard.', '8-17-105.\xa0\xa0Compliance standard.'),
    )
    _TEXT_FIXUPS = (
        # subsections
        ('BTU/H\xa0FT', 'BTU/H FT'),
        ('by section\xa07-62-1104', 'by section 7-62-1104'),
        ('of subsections\xa0(1) and', 'of subsections (1) and'),
        ('title\xa0shall', 'title shall'),
        ('article\xa060', 'article 60'),
        ('section\xa05-12-102', 'section 5-12-102'),
        ('section\xa07-64-1205', 'section 7-64-1205'),
        ('section\xa07-64-601', 'section 7-64-601'),
        # can't remember
        ('article\xa0V', 'article V'),
        ('§§\xa01', '§§ 1'),
        (' §\xa038-35-106.5', ' § 38-35-106.5'),
    )

    def __init__(self):
        self.state = self.StateEnum.init
        # Used by the idle state so that centered text following a line
        # that ended with ':' is not mistaken for a subheader.
        self.last_line_ended_with_colon = False
        # Heirarchy message held back until its subheader lines are read.
        self.stash = None

    @staticmethod
    def run(inq, outq, errq):
        """Thread entry point: parse everything on `inq` into `outq`.

        Any exception is logged and converted into a poison marker
        (False) on both `outq` and `errq`.
        """
        try:
            parser = OOFileParser()
            parser.parse(inq, outq, errq)
        except BaseException as e:
            logger.critical('OOFileParser.run exception: {} {}'.format(type(e), e), exc_info=True)
            outq.put(False) # poison queue
            errq.put(False)

    def parse(self, inq, outq, errq):
        """Parse messages from `inq` and put the results on `outq`.

        None on `inq` is end of input and is forwarded as None; a
        heirarchy still held in the stash is emitted first so that a
        trailing heading is not lost. False is an upstream failure and is
        forwarded to `outq` and `errq`.
        """
        assert inq is not None and outq is not None
        while True:
            inmsg = inq.get()
            if inmsg is None: # end of input
                if self.stash is not None:
                    outq.put(self.stash)
                    self.stash = None
                outq.put(None) # poison queue
                break
            elif inmsg is False:
                outq.put(False)
                errq.put(False)
                break
            for outmsg in self.event(inmsg):
                outq.put(outmsg)

    def event(self, signal):
        """Consume one paragraph message and return a list of output messages.

        The event function is chosen by current state; the transition
        function is (then) chosen by current state and the signal.
        The line is stripped and passed through fixup() in place first.

        Raises:
            RuntimeError: if the current state has no event handler.
        """
        # XXX strip line, then apply fixups
        signal['line'] = OOFileParser.fixup(signal['line'].strip())
        handlers = {
            self.StateEnum.init: self.event_init,
            self.StateEnum.idle: self.event_idle,
            self.StateEnum.heirarchy: self.event_heirarchy,
            self.StateEnum.section: self.event_section,
            self.StateEnum.text: self.event_text,
        }
        handler = handlers.get(self.state)
        if handler is None:
            raise RuntimeError('OOFileParser: no handler for state: {}'.format(self.state))
        ret = handler(signal)
        # XXX keep track of centered text preceeded by lines ending with ':'
        # (tested against the state the handler just moved to)
        if self.state != self.StateEnum.idle:
            self.last_line_ended_with_colon = signal['line'].endswith(':')
        return ret

    @staticmethod
    def _is_indented(lmargin):
        """True when a left margin is known and greater than zero."""
        return lmargin is not None and lmargin > 0

    @staticmethod
    def _is_forced_text(line):
        """True for lines that must be treated as text whatever they look like."""
        # fixup 8-42-107
        if line.endswith('\xa0weeks') or line == 'the use of an artificial limb':
            return True
        # fixup 9-4-109 and various others
        return line.startswith('$')

    def event_init(self, signal):
        """Initial state: the first line is skipped."""
        logger.uberdebug('init')
        # XXX skip first line
        return self.transition_idle(signal)

    def event_idle(self, signal):
        """Idle state: classify the line from scratch."""
        logger.uberdebug('idle')
        line, adjust, lmargin, weight, align = signal['line'], signal['adjust'], signal['lmargin'], signal['weight'], signal['align']
        if line == '':
            return self.transition_self(signal)
        if self._is_indented(lmargin):
            return self.transition_text(signal)
        if OOFileParser.test_sec(line, adjust):
            return self.transition_section(signal)
        if OOFileParser.test_heirarchy(line):
            return self.transition_heirarchy(signal)
        if OOFileParser.test_anonymous_heirarchy(line, adjust, weight):
            # XXX skip anonymous heirarchies
            return self.transition_self(signal)
        # XXX should we only be able to enter subheader state
        # from heirarchy state to prevent mistaking text for subheaders?
        if (adjust == 'center' or align == 'CENTER') and self.last_line_ended_with_colon is False:
            return self.transition_heirarchy_subheader(signal)
        # assume text attached to previous section/subsection
        return self.transition_text(signal)

    def event_heirarchy(self, signal):
        """Heirarchy state: collect subheaders for the stashed heirarchy.

        NOTE if we transition away, flush stashed output signal.
        """
        logger.uberdebug('heirarchy')
        line, adjust, weight, align = signal['line'], signal['adjust'], signal['weight'], signal['align']
        if line == '':
            # don't transition because we may get subheader
            return self.transition_self(signal)
        if OOFileParser.test_sec(line, adjust):
            return self.transition_heirarchy_flush(self.transition_section, signal)
        if OOFileParser.test_heirarchy(line):
            return self.transition_heirarchy_flush(self.transition_heirarchy, signal)
        if OOFileParser.test_anonymous_heirarchy(line, adjust, weight):
            # XXX skip anonymous heirarchies
            return self.transition_self(signal)
        if adjust == 'center' or align == 'CENTER': # XXX should we test on last_line_ended_with_colon?
            return self.transition_heirarchy_subheader(signal)
        # XXX is there something better to do here? will a subheader ever not be centered?
        return self.transition_heirarchy_flush(self.transition_text, signal)

    def event_section(self, signal):
        """Section state: a blank line ends it, a new section restarts it,
        anything else is text."""
        logger.uberdebug('section')
        line, adjust = signal['line'], signal['adjust']
        if line == '':
            return self.transition_idle(signal)
        # XXX put fixups into fixups()?
        if self._is_forced_text(line):
            return self.transition_text(signal)
        if OOFileParser.test_sec(line, adjust):
            return self.transition_section(signal)
        return self.transition_text(signal) # XXX

    def event_text(self, signal):
        """Text state: stay in text unless a section or heirarchy starts."""
        logger.uberdebug('text')
        line, adjust, lmargin = signal['line'], signal['adjust'], signal['lmargin']
        if line == '':
            return self.transition_self(signal)
        if self._is_indented(lmargin) or self._is_forced_text(line):
            return self.transition_text(signal)
        if OOFileParser.test_sec(line, adjust):
            return self.transition_section(signal)
        if OOFileParser.test_heirarchy(line):
            return self.transition_heirarchy(signal)
        # assume text attached to previous section/subsection
        return self.transition_text(signal)

    def transition_self(self, signal):
        """Stay in the current state and emit nothing."""
        logger.uberdebug('self: {}'.format(signal))
        return []

    def transition_idle(self, signal):
        """Move to idle and emit nothing."""
        logger.uberdebug('idle: {}'.format(signal))
        self.state = self.StateEnum.idle
        return []

    def transition_heirarchy(self, signal):
        """Start a typed heirarchy ('TITLE 1', 'PART 2', ...).

        Stash the output signal away and flush it when we leave the
        heirarchy state. The first word, lower-cased, is the subtype; the
        rest of the line is the enumeration.
        """
        logger.superdebug('heirarchy: {}'.format(signal))
        line = signal['line']
        typ, enum = line.split(' ', 1)
        self.stash = {'type': 'heirarchy', 'subtype': typ.lower(), 'name': line, 'enum': enum, 'heading': None, 'content': None, 'status': None}
        self.state = self.StateEnum.heirarchy
        return []

    def transition_heirarchy_subheader(self, signal):
        """Append input signal information to stashed output signal.

        Heading and status fragments are joined with a single space.

        XXX Always guard against anonymous heirarchies to avoid
        crashes on lack of incomplete heirarchy in stash.
        """
        logger.superdebug('subheader: {}'.format(signal))
        if self.stash is None:
            logger.warning('subheader stash is None')
        else:
            head, status = OOFileParser.parse_subheader(signal['line'])
            for key, value in (('heading', head), ('status', status)):
                if value is None:
                    continue
                if self.stash[key] is not None:
                    self.stash[key] += ' ' + value
                else:
                    self.stash[key] = value
        self.state = self.StateEnum.heirarchy
        return []

    def transition_heirarchy_flush(self, f, signal):
        """Flush stashed output signal, then run transition `f`.

        The stashed heirarchy, if there is one, is placed ahead of
        whatever `f(signal)` emits. Nothing is inserted when the stash is
        empty: a None in the output stream would be read as end-of-input
        by the consumer.
        """
        logger.uberdebug(f'h_flush {signal}')
        assert isinstance(f, types.MethodType)
        sig = self.stash
        logger.uberdebug(f'h_flush {sig}')
        self.stash = None
        ret = f(signal)
        if sig is not None:
            ret.insert(0, sig)
        logger.uberdebug(f'h_flush {ret}')
        return ret

    def transition_section(self, signal):
        """Emit a section heirarchy message and move to the section state."""
        logger.uberdebug('section: {}'.format(signal))
        sec = OOFileParser.tokenize_section(signal['line'])
        logger.uberdebug(f'section sec: {sec}')
        enum, heading, status, text, _ = sec
        self.state = self.StateEnum.section
        return [{'type': 'heirarchy', 'subtype': 'section', 'name': None, 'enum': enum, 'heading': heading, 'content': text, 'status': status}]

    def transition_text(self, signal):
        """Emit the line as a text message and move to the text state."""
        logger.superdebug('text: {}'.format(signal))
        self.state = self.StateEnum.text
        return [{'type': 'text', 'subtype': None, 'name': None, 'enum': None, 'heading': None, 'content': signal['line'], 'status': None}]

    #
    # XXX these methods are complete hacks
    #
    @staticmethod
    def test_sec(line, adjust):
        """True when `line` opens a section (matches regex_sec).

        `adjust` is accepted but not used.
        """
        return re.search(regex_sec, line) is not None

    @staticmethod
    def test_heirarchy(line):
        """True when `line` names a typed heirarchy.

        The line must start with 'TITLE ' or 'ARTICLE ' followed by a
        digit, or with 'PART ' or 'SUBPART ' followed by an alphanumeric
        character, and must not end with '.'.

        XXX should there be a space after each?
        XXX is it always a digit after the word?
        XXX Title 24, Article 60, Part 22/25 have articles within!?
        XXX Section 14-5-609 starts Part C, so alphanumeric?
        """
        if not line or line.endswith('.'):
            return False
        keywords = (
            ('TITLE ', str.isdigit),
            ('PART ', str.isalnum),
            ('SUBPART ', str.isalnum),
            ('ARTICLE ', str.isdigit),
        )
        for keyword, accepts in keywords:
            if line.startswith(keyword):
                rest = line[len(keyword):]
                # A bare keyword has nothing to inspect and is not a heirarchy.
                return bool(rest) and accepts(rest[0])
        return False

    @staticmethod
    def test_anonymous_heirarchy(line, adjust, weight):
        """Test for anonymous (untyped, only with heading) heirarchies.

        True for centered lines that are bold or start with 'A.' to 'D.'.

        XXX need more robust logic for checking 'A.' types
        """
        return adjust == 'center' and (weight == 'bold' or line.startswith(('A.', 'B.', 'C.', 'D.')))

    @staticmethod
    def parse_subheader(s):
        """Split a subheader line into (heading, status).

        A trailing '(Reserved)' is removed and reported as status
        'incomplete'. The heading is None when nothing else remains.
        """
        marker = '(Reserved)'
        status = None
        if s.endswith(marker):
            # Drop the marker and the whitespace that separated it.
            s = s[:-len(marker)].rstrip()
            status = 'incomplete'
        return s or None, status

    @staticmethod
    def tokenize_section(line):
        """Return a parsed section as (enum, heading, status, text, None).

        Lines matching regex_sec are split by the pattern. Otherwise the
        line is split on no-break spaces: the first non-empty token is the
        enumeration, the second the heading, and the rest is joined into
        the text.

        XXX sometimes the header element has the first enum, e.g., 'header (a)'
        """
        m = re.search(regex_sec, line)
        if m:
            return m.group(1), m.group(2), None, m.group(3), None
        tokens = line.split('\xa0')
        logger.uberdebug('tokenize_section: {}'.format(tokens))
        enum = head = status = None
        textl = [] # TODO should we join? or should they be separate ?
        for s in (token.strip() for token in tokens):
            if s == '':
                continue
            if enum is None:
                enum = OOFileParser.parse_sec_enum(s)
                logger.uberdebug(f'tokenize_section enum: {enum}')
            elif head is None:
                head, status = OOFileParser.parse_sec_head(s)
                logger.uberdebug('tokenize_section head: {} {}'.format(head, status))
            else:
                textl.append(s)
        text = ' '.join(textl) or None
        return enum, head, status, text, None

    @staticmethod
    def parse_sec_enum(s):
        """Return the section number: group 1 of regex_sec if it matches,
        otherwise `s` without trailing periods."""
        m = re.search(regex_sec, s)
        if m:
            return m.group(1)
        return s.rstrip('.')

    @staticmethod
    def parse_sec_head(s):
        """Return (heading, status) for a section heading string.

        A trailing '(Repealed)' is removed and reported as status
        'removed'; surrounding whitespace and trailing periods are
        stripped from the heading.
        """
        status = None
        if s.endswith('(Repealed)'):
            s, _ = s.rsplit('(Repealed)', 1)
            status = 'removed'
        return s.strip().rstrip('.'), status

    @staticmethod
    def fixup(line):
        """Perform specific fixups on string and return fixed-up string."""
        orig = line
        # sections
        for old, new in OOFileParser._SECTION_FIXUPS:
            line = line.replace(old, new)
        # subsections
        # NOTE(review): '(4) ' is four characters but five are dropped, so
        # one character of text is lost. Possibly the literal lost a space
        # in transit. Left unchanged; confirm against the real source.
        if line.startswith('(4) '):
            line = '(4)\xa0\xa0' + line[5:]
        elif line.startswith('(II) '):
            line = '(II)\xa0\xa0' + line[5:]
        for old, new in OOFileParser._TEXT_FIXUPS:
            line = line.replace(old, new)
        # Compare by value: whether replace() returns the same object for
        # an unchanged string is an implementation detail.
        if orig != line:
            logger.superdebug('fixup replace: {} {}'.format(repr(orig), repr(line)))
        return line
753 |
754 | ##
755 | # Represents a file.
756 | #
class OOFile():
    """Reads a document through a running soffice instance.

    Every member is a static method; the class is a namespace for the
    producer end of the pipeline, which turns each paragraph of the
    document into a message dict.
    """

    @staticmethod
    def run(connect_string, fn, errq, outq):
        """Thread entry point: enqueue the paragraphs of `fn` on `outq`.

        Any exception is logged and converted into a poison marker
        (False) on `outq`.
        """
        try:
            OOFile.parse(connect_string, fn, errq, outq)
        except BaseException as e:
            logger.critical('OOFile.run exception: {} {}'.format(type(e), e), exc_info=True)
            outq.put(False) # poison queue

    @staticmethod
    def parse(connect_string, fn, errq, outq):
        """Open file using desktop and enqueue messages representing paragraphs.

        One message per paragraph is put on `outq`, followed by None at
        the end of the document. On any failure, or as soon as something
        shows up on `errq`, False is put on `outq` instead and the method
        returns.

        Args:
            connect_string: UNO URL of the soffice instance.
            fn: path of the document, opened read-only.
            errq: queue on which downstream stages signal failure.
            outq: queue that receives the paragraph messages.
        """
        assert fn is not None and outq is not None
        # get desktop
        desktop = OOFile.connect_soffice(connect_string)
        if not desktop:
            logger.critical('OOFile.parse no desktop')
            outq.put(False)
            return
        # open file
        doc = None
        url = unohelper.systemPathToFileUrl(os.path.abspath(fn))
        try:
            doc = desktop.loadComponentFromURL(url ,'_blank', 0, (uno.createUnoStruct('com.sun.star.beans.PropertyValue', 'ReadOnly', 0, True, 0),))
        except uno.getClass('com.sun.star.lang.IllegalArgumentException') as e:
            logger.critical('OOFile.parse file not found: {}'.format(e))
            outq.put(False)
            return
        except uno.getClass('com.sun.star.lang.DisposedException') as e:
            logger.critical('OOFile.parse desktop bridge died: {}'.format(e))
            outq.put(False)
            return
        except uno.getClass('com.sun.star.uno.RuntimeException') as e:
            logger.critical('OOFile.parse desktop exception: {}'.format(e))
            outq.put(False)
            return
        if doc is None:
            logger.critical('OOFile.parse doc is None')
            outq.put(False)
            return
        # get the com.sun.star.text.Text service and get an XEnumeration of
        # com.sun.star.text.Paragraph objects from the XEnumerationAccess
        for para in OOFile.XEnumeration(doc.getText()):
            # skip non-paragraphs
            if not para.supportsService('com.sun.star.text.Paragraph'):
                continue
            message = OOFile._paragraph_message(para)
            # check for error message
            if errq.qsize() > 0:
                try:
                    errq.get(block=False)
                except queue.Empty:
                    logger.warning('OOFile.parse errq size weirdness')
                else:
                    OOFile.close(doc)
                    outq.put(False) # poison output queue and exit
                    return
            # enqueue message
            outq.put(message)
        # Close the document that was opened above. The desktop object is
        # not what loadComponentFromURL returned and is left alone.
        OOFile.close(doc)
        # poison queue
        outq.put(None)

    @staticmethod
    def _paragraph_message(para):
        """Build the message dict for one paragraph.

        Keys: 'line' (text of all 'Text' portions joined), 'lmargin',
        'adjust' ('center', 'left', 'block' or None), 'align', 'weight'
        ('normal', 'bold' or None) and 'style'. Weight and style come from
        the last text portion. Properties the paragraph does not expose
        are reported as None.
        """
        lmargin = adjustn = align = style = None
        weightn = -1
        if para.supportsService('com.sun.star.style.ParagraphProperties'):
            # get left margin
            if hasattr(para, 'ParaLeftMargin'):
                lmargin = para.ParaLeftMargin
            # get adjustment
            if hasattr(para, 'ParaAdjust'):
                adjustn = para.ParaAdjust
            # get alignment of the first tab stop, when there is one
            if hasattr(para, 'ParaTabStops'):
                tabstops = para.ParaTabStops
                if tabstops and hasattr(tabstops[0], "Alignment"):
                    align = tabstops[0].Alignment.value
        ss = []
        # get an XEnumeration of com.sun.star.text.TextPortion objects
        for portion in OOFile.XEnumeration(para):
            # skip non-text portions
            if portion.TextPortionType != 'Text':
                continue
            # get portion string
            ss.append(portion.getString())
            # get the last portion's weight
            if portion.supportsService('com.sun.star.style.CharacterProperties') and hasattr(portion, 'CharWeight'):
                weightn = portion.CharWeight
            # get the last portion's style
            if portion.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(portion, 'ParaStyleName'):
                style = portion.ParaStyleName # XXX need to strip?
        # interpret data
        if adjustn == 3: # com.sun.star.style.ParagraphAdjust
            adjust = 'center'
        elif adjustn == 0:
            adjust = 'left'
        elif adjustn == 2:
            adjust = 'block'
        else:
            logger.warning('OOFile.parse unknown adjust: {}'.format(adjustn))
            adjust = None
        if round(weightn) == 100: # com.sun.star.awt.FontWeight
            weight = 'normal'
        elif round(weightn) == 150:
            weight = 'bold'
        elif weightn == -1: # no text portion reported a weight
            weight = None
        else:
            logger.warning('OOFile.parse unknown weight: {}'.format(weightn))
            weight = None
        return {'align': align, 'adjust': adjust, 'lmargin': lmargin, 'weight': weight, 'style': style, 'line': ''.join(ss)}

    @staticmethod
    def close(desktop):
        """Close the given object by calling its close(True).

        The parameter is named `desktop`, but parse() passes the loaded
        document. None is ignored. Failures are logged and never raised.
        """
        logger.debug('Closing desktop')
        try:
            if desktop is not None:
                # XXX we should check for the com.sun.star.util.XCloseable interface first
                desktop.close(True)
        except uno.getClass('com.sun.star.lang.DisposedException') as e:
            logger.critical('OOFile.parse uno.DisposedException: {} {}'.format(desktop, e))
        except uno.getClass('com.sun.star.uno.RuntimeException') as e:
            logger.critical('OOFile.parse uno.RuntimeException: {} {}'.format(desktop, e))
        except Exception as e:
            logger.critical('exception: {} {}'.format(type(e), e))

    @staticmethod
    def XEnumeration(obj):
        """Yield the elements of an object supporting XEnumerationAccess.

        Calls obj.createEnumeration() once, then yields nextElement()
        for as long as hasMoreElements() is true.
        """
        xenum = obj.createEnumeration()
        while xenum.hasMoreElements():
            yield xenum.nextElement()

    @staticmethod
    def connect_soffice(connect_string, tries=5, sleep=5):
        """Connect to a running soffice instance and return a XDesktop object.

        Returns None when the connection fails; the reason is logged.

        NOTE(review): `tries` and `sleep` are accepted but unused; a
        single connection attempt is made.
        """
        desktop = None

        try:
            ctxLocal = uno.getComponentContext()
            smgrLocal = ctxLocal.ServiceManager
            resolver = smgrLocal.createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', ctxLocal)
            ctx = resolver.resolve(connect_string)
            smgr = ctx.ServiceManager
            logger.superdebug(f"smgr: {smgr}")
            desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)

        # The specific handlers come first; the catch-all must be last or
        # it would shadow them.
        except uno.getClass('com.sun.star.lang.DisposedException') as e:
            logger.critical(f"Bridge died: {e}")
        except uno.getClass('com.sun.star.connection.NoConnectException') as e:
            logger.critical(f"Failed to connect: {e}")
        except Exception as e:
            logger.critical(f"{type(e)} {e}")

        if desktop is None:
            logger.critical(f"Desktop is None")

        return desktop
926 |
# do it: when run as a script, use main()'s return value as the exit status
if __name__ == "__main__":
    sys.exit(main())
930 |
--------------------------------------------------------------------------------