├── util ├── akn2html-single.xsl ├── aknnltk.py ├── akncat └── AkomaNtoso.psm1 ├── README.md ├── src ├── us-co │ ├── crs.xsl │ └── scrape-statcode-us-co └── us-ca │ ├── scrape-statcode-us-ca │ └── scrape-regcode-us-ca ├── test └── oofile.clj └── LICENSE /util/akn2html-single.xsl: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Colorado Revised Statutes 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
24 | 25 | 26 | 27 | inactive 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | [Repealed] 45 | 46 | 47 | [Reserved] 48 | 49 |
50 | 51 | 52 |
53 | 54 |
55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # openlaw 2 | 3 | This builds toward a minimal toolset for the display and analysis of legislative, regulatory, and judicial documents with the [Akoma Ntoso](http://www.akomantoso.org/) and related legal document standards. 4 | 5 | ## Rationale 6 | 7 | "Ignorance of the law is no excuse", as the saying goes, but a majority of citizens, nationals and residents are not informed about the law due to its inaccessibility, cost, language, complexity, volume, and rapid change. The law shapes economics and society, and thus the majority are denied the full profits of their democratic labor. 8 | 9 | Overcoming these barriers is a necessary step for democratic control of states' and polical parties' governments. This is meant as a working demonstration of using standard formats to reduce the burden. 10 | 11 | ## Utilities 12 | 13 | * The `akncat` utility provides functionality similar to the UNIX [`cat`](https://en.wikipedia.org/wiki/Cat_(Unix)) utility but for Akoma Ntoso documents. 14 | * The `Import-AkomaNtoso` [PowerShell](https://en.wikipedia.org/wiki/Windows_Terminal) cmdlet provides similarly for the .NET environment. 15 | * There is also an [NLTK](http://www.nltk.org/) corpus reader for natural language processing in Python. 16 | 17 | ## Data 18 | 19 | The current legal datasets have conversions to Akoma Ntoso: 20 | 21 | * Colorado Revised Statutes: ¯\\_(ツ)_/¯ 22 | * ~~United States Code~~: Cornell LII LexCraft was superseded by USLM. 23 | * ~~California codified statutes~~: Out of date? 24 | 25 | ## License 26 | 27 | Copyright is waived, see the CC0 in the LICENSE file. 28 | 29 | ## Caveat emptor 30 | 31 | This is by me for me. These legal dataset conversions are experimental, and should not be considered a faithful or official reproduction. 
They are currently not conformant to the Akoma Ntoso standard and no tests are done for their validity or compatibility. Given the nature of the collection processes, e.g., lack of documentation, much has been interpreted and misinterpreted. The code will probably break things! See the LICENSE file. 32 | -------------------------------------------------------------------------------- /util/aknnltk.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 -uW ignore 2 | # -*- coding: utf-8 -*- 3 | ## 4 | # Corpus reader for Akoma Ntoso documents. 5 | # 6 | 7 | from nltk.tokenize import word_tokenize, sent_tokenize 8 | from nltk.corpus.reader.xmldocs import XMLCorpusReader 9 | import unittest 10 | 11 | class AKNCorpusReader(XMLCorpusReader): 12 | """ 13 | Corpus reader for Akoma Ntoso documents. 14 | 15 | """ 16 | def __init__(self, root, fileids): 17 | XMLCorpusReader.__init__(self, root, fileids) 18 | 19 | def words(self, fileid=None): 20 | """ 21 | Returns a list of strings representing word tokens. 22 | """ 23 | return (val for subl in self.sents(fileid) for val in subl) 24 | 25 | def sents(self, fileid=None): 26 | """ 27 | Returns a list of lists of strings representing word tokens with 28 | sentence boundaries intact. 29 | """ 30 | return (word_tokenize(sent) for sent in self._sents(fileid)) 31 | 32 | def _sents(self, fileid=None): 33 | """ 34 | Returns all of the sentences in the specified file. 35 | 36 | Returns all of the Akoma Ntoso '//section//content/p' text nodes 37 | in the specified file. 38 | """ 39 | els = self.xml(fileid).iterfind('.//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0/WD17}section//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0/WD17}content') 40 | paras = (''.join(el.itertext()) for el in els) 41 | return (val for subl in (sent_tokenize(p) for p in paras) for val in subl) 42 | 43 | class USLMCorpusReader(XMLCorpusReader): 44 | """ 45 | Corpus reader for Akoma Ntoso documents. 
46 | 47 | """ 48 | def __init__(self, root, fileids): 49 | XMLCorpusReader.__init__(self, root, fileids) 50 | 51 | def words(self, fileid=None): 52 | """ 53 | Returns a list of strings representing word tokens. 54 | """ 55 | return (val for subl in self.sents(fileid) for val in subl) 56 | 57 | def sents(self, fileid=None): 58 | """ 59 | Returns a list of lists of strings representing word tokens with 60 | sentence boundaries intact. 61 | """ 62 | return (word_tokenize(sent) for sent in self._sents(fileid)) 63 | 64 | def _sents(self, fileid=None): 65 | """ 66 | Returns all of the sentences in the specified file. 67 | 68 | Returns all of the Akoma Ntoso '//section//content/p' text nodes 69 | in the specified file. 70 | """ 71 | els = self.xml(fileid).iterfind('.//{http://schemas.gpo.gov/xml/uslm}section//{http://schemas.gpo.gov/xml/uslm}content') 72 | paras = (''.join(el.itertext()) for el in els) 73 | return (val for subl in (sent_tokenize(p) for p in paras) for val in subl) 74 | 75 | class MyTest(unittest.TestCase): 76 | def test(self): 77 | from nltk.text import Text 78 | corpus = USLMCorpusReader('./', '.*\.xml') 79 | t1 = Text(corpus.words('COMPS-10273.xml')) 80 | t1.concordance('secretary',lines=float('inf')) 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | 85 | -------------------------------------------------------------------------------- /src/us-co/crs.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | <num><xsl:value-of select="substring(normalize-space(CRS/TITLE_NUM/text()), string-length('TITLE')+2)"/></num> 17 | <heading><xsl:value-of select="normalize-space(CRS/TITLE_TEXT/text())"/></heading> 18 | <xsl:apply-templates select="CRS/ARTICLE_NUM"/> 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
27 | 28 | 29 | 30 |
31 |
32 | 33 | 34 | 35 | 36 | 37 |
38 | 39 | removed 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
54 |
55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 |
77 | 78 | -------------------------------------------------------------------------------- /util/akncat: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 -uW all 2 | # -*- coding: utf-8 -*- 3 | 4 | usage="""Usage: akncat [OPTION]... [FILE]... 5 | Concatenate Akoma Ntoso FILE(s) sections to standard output. 6 | 7 | With no FILE, or when FILE is -, read standard input. 8 | 9 | -a, --abbr toggle output of document abbreviation (default: yes) 10 | -n, --nums toggle output of section numbers (default: yes) 11 | -t, --headings toggle output of title/heading/subheading (default: yes) 12 | -c, --content toggle output of section content (default: yes) 13 | -r, --refs toggle output of section references (default: no) 14 | -s, --status toggle output of sections with statusType "removed" (default: no) 15 | -v, --verbose increase verbosity 16 | -h, --help display this help and exit 17 | 18 | Examples: 19 | curl -fL --no-progress-meter \\ 20 | 'https://www.govinfo.gov/link/bills/117/hr/1319?billversion=mostrecent&link-type=uslm' | \\ 21 | akncat -anc 22 | 23 | curl -fL --no-progress-meter \\ 24 | 'https://www.govinfo.gov/link/plaw/117/public/2?link-type=uslm' | \\ 25 | akncat -anc 26 | 27 | curl -fL --no-progress-meter \\ 28 | https://www.govinfo.gov/content/pkg/COMPS-16472/uslm/COMPS-16472.xml | \\ 29 | akncat -anc 30 | """ 31 | 32 | import sys 33 | import getopt 34 | import logging 35 | import os 36 | import re 37 | # XXX lxml is required! 38 | try: import lxml.etree as etree 39 | except ImportError: import xml.etree.ElementTree as etree 40 | 41 | xmlns = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0" 42 | uslm_ns = "http://schemas.gpo.gov/xml/uslm" 43 | ns = {'uslm': uslm_ns} 44 | 45 | do_repealed = False 46 | do_abbr = False 47 | do_number = True 48 | do_heading = True 49 | do_content = True 50 | do_notes = False 51 | 52 | ## 53 | # Entry function. Parse parameters, call cat_file(). 
#
def main():
    """Parse command-line options into the do_* globals, then cat each FILE.

    With no FILE arguments (or FILE == '-') reads standard input.
    Exits 2 on bad options, 1 after printing help.
    """
    global do_repealed
    global do_abbr
    global do_number
    global do_heading
    global do_content
    global do_notes
    debug = logging.WARN
    logging.basicConfig(format='akncat: {message}', style='{', level=debug)
    try:
        # The long option names were documented in the usage text but never
        # registered with getopt, so e.g. --headings was rejected; register
        # them here (all are toggles taking no argument).
        opts, args = getopt.getopt(sys.argv[1:], 'trncashv',
                                   ['headings', 'refs', 'nums', 'content',
                                    'abbr', 'status', 'verbose', 'help'])
    except getopt.GetoptError as e:
        logging.fatal(f'getopt error: {e}')
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in {'-t', '--headings'}:
            do_heading = not do_heading
        elif opt in {'-r', '--refs'}:
            do_notes = not do_notes
        elif opt in {'-n', '--nums'}:
            do_number = not do_number
        elif opt in {'-c', '--content'}:
            do_content = not do_content
        elif opt in {'-a', '--abbr'}:
            do_abbr = not do_abbr
        elif opt in {'-s', '--status'}:
            do_repealed = not do_repealed
        elif opt in {'-v', '--verbose'}:
            debug -= 10
            logging.getLogger().setLevel(debug)
        elif opt in {'-h', '--help'}:
            print(usage)
            sys.exit(1)
    if len(args) < 1:
        cat_file(sys.stdin)
    else:
        for arg in args:
            cat_file(arg)
    # Close the underlying fd so a downstream pipe sees EOF promptly.
    os.close(sys.stdout.fileno())

def cat_file(path):
    """Parse one document (a path, '-', or sys.stdin) and print its sections.

    Dispatches on the root element's default namespace to the Akoma Ntoso
    or USLM handler; documents in any other namespace are silently ignored,
    and unparsable input is logged at INFO level.
    """
    if path == '-':
        path = sys.stdin
    try:
        tree = etree.parse(path)
        # Derive the default namespace from the root tag ('{ns}local')
        # rather than lxml's nsmap: this also works with the xml.etree
        # fallback import (which has no .nsmap) and does not KeyError on
        # documents without a default namespace.
        tag = tree.getroot().tag
        root_ns = tag[1:].partition('}')[0] if isinstance(tag, str) and tag.startswith('{') else None
        if root_ns == xmlns:
            parse_akoma_ntoso(tree)
        elif root_ns == uslm_ns:
            parse_uslm(tree)
    except (BrokenPipeError, KeyboardInterrupt):
        pass
    # Both xml.etree's ParseError and lxml's XMLSyntaxError subclass
    # SyntaxError, so this clause is portable across the fallback import.
    except SyntaxError:
        if path is sys.stdin:
            path = '-'
        logging.info(f"Failed to parse {path}")

##
# Parse a USLM ElementTree.
#
# TODO What is the algorithm here?
# TODO Should we be editing the in-memory representation?
#
##
# Parse a USLM ElementTree and print its sections to stdout.
#
# Strips elided-text markers, then for each section (outside quoted
# content) optionally prefixes an identifier and prunes content, note,
# num, and heading nodes according to the do_* option globals before
# printing the remaining text, whitespace-normalized, one section per
# line.
#
# TODO What is the algorithm here?
# TODO Should we be editing the in-memory representation?
#
def parse_uslm(tree):
    # Drop elided-text placeholders everywhere first.
    for bad in tree.xpath("//*[self::uslm:elided]", namespaces=ns):
        remove_node(bad)

    for section in tree.xpath('//uslm:section[not(ancestor::uslm:quotedContent)]', namespaces=ns):
        if do_abbr:
            # XXX We can't delete editorialNote[@role='uscRef'] until after this!
            ident = get_id(section)
            if ident:
                section.text = ident + ' ' + (section.text or '')
        if not do_content:
            for bad in section.xpath(".//*[self::uslm:content or self::uslm:chapeau or self::uslm:subsection or self::uslm:paragraph or self::uslm:continuation]", namespaces=ns):
                remove_node(bad)
        if not do_notes:
            for bad in section.xpath(".//*[self::uslm:editorialNote or self::uslm:footnote or self::uslm:sourceCredit or self::uslm:sidenote]", namespaces=ns):
                remove_node(bad)
        if not do_number:
            # TODO Should we normalize or remove descendant tags?
            for bad in section.xpath("./uslm:num", namespaces=ns):
                remove_node(bad)
        if not do_heading:
            for bad in section.xpath("./uslm:heading", namespaces=ns):
                remove_node(bad)
        print(' '.join((etree.tostring(section, encoding='unicode', method='text')).split()))

##
# Remove an element from its (lxml) tree without losing its tail text.
#
# Previously the tail was dropped along with the element (the old
# "XXX Don't remove tail text!" bug); now it is spliced onto the
# previous sibling's tail, or onto the parent's text when the element
# is the first child, so surrounding text flow is preserved.
#
def remove_node(element):
    parent = element.getparent()
    tail = element.tail
    if tail:
        prev = element.getprevious()
        if prev is not None:
            prev.tail = (prev.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(element)

##
# Get an identifier for an element, or None.
#
# From the element or its nearest ancestor, prefer its USC identifier
# over its normal identifier.
152 | # 153 | def get_id(element): 154 | while element is not None: 155 | for ref in element.xpath("./uslm:editorialNote[@role='uscRef']/uslm:ref/@href", namespaces=ns): 156 | return str(ref) 157 | if "identifier" in element.attrib: 158 | return str(element.attrib["identifier"]) 159 | element = element.getparent() 160 | 161 | ## 162 | # TODO 163 | # 164 | def parse_akoma_ntoso(tree): 165 | numre = '{' + xmlns + '}num' 166 | headingre = '{' + xmlns + '}heading' 167 | contentre = './/{' + xmlns + '}content/*' 168 | flatcontentre = './/{' + xmlns + '}content' 169 | metare = '{' + xmlns + '}meta' 170 | abbrre = '{' + xmlns + '}abbr' 171 | if do_repealed: 172 | path = '//akn:section' 173 | else: 174 | path = '//akn:section[not(contains(@status, "removed"))]' 175 | sections = tree.xpath(path, namespaces={'akn': xmlns}) 176 | trans = str.maketrans('“”’\t', '""\' ', '\n') 177 | abbr = '' 178 | if do_abbr: 179 | el = tree.getroot()[0] 180 | for subel in el: 181 | if subel.tag != metare and len(subel) and abbrre in subel[0].attrib: 182 | abbr = subel[0].attrib[abbrre] + ' ' 183 | for section in sections: 184 | enum = '' 185 | heading = '' 186 | content = '' 187 | if do_number: 188 | el = section.find(numre) 189 | if el is not None: 190 | enum = el.text+'. ' 191 | if do_heading: 192 | el = section.find(headingre) 193 | if el is not None: 194 | heading = el.text+'. 
' 195 | if do_content: 196 | els = section.iterfind(contentre) 197 | content = ' '.join(''.join(el.itertext()).translate(trans).strip() for el in els) 198 | if len(content) == 0: 199 | els = section.iterfind(flatcontentre) 200 | content = ' '.join(''.join(el.itertext()).translate(trans).strip() for el in els) 201 | print(abbr + enum + heading + content) 202 | 203 | if __name__ == "__main__": 204 | main() 205 | 206 | -------------------------------------------------------------------------------- /util/AkomaNtoso.psm1: -------------------------------------------------------------------------------- 1 | <# 2 | .Synopsis 3 | Import Akoma Ntoso. 4 | 5 | .Description 6 | Import Akoma Ntoso or USLM documents from wherever. 7 | 8 | .Parameter Document 9 | The object, path, URI or array thereof to parse. 10 | 11 | .Parameter IncludeRepealed 12 | Include repealed sections. 13 | 14 | .Example 15 | Import-Module './AkomaNtoso.psm1' 16 | $Bill = Import-Akn 'https://www.govinfo.gov/link/bills/117/hr/1319?link-type=uslm' 17 | $Act = Import-Akn 'https://www.govinfo.gov/link/plaw/117/public/2?link-type=uslm' 18 | $Comps = Import-Akn 'https://www.govinfo.gov/content/pkg/COMPS-16472/uslm/COMPS-16472.xml' 19 | Compare-Object $Bill.Num $Act.Num 20 | Compare-Object $Act.Num $Comps.Num 21 | Import-Akn 'https://www.legislation.gov.uk/ukpga/1982/11/data.akn' | Format-List * 22 | #> 23 | function Import-AkomaNtoso { 24 | [CmdletBinding()] 25 | Param( 26 | [Parameter(Mandatory,ValueFromPipeline)]$Document, 27 | [switch]$IncludeAll 28 | ) 29 | 30 | Process { 31 | # 32 | # Parse the user input. Be flexible. 33 | # 34 | 35 | if ($Document -is [xml]) {} # TODO Check that it's Akoma Ntoso. 
36 | elseif ($Document -is [string] -and [System.Uri]::IsWellFormedUriString($Document, [System.UriKind]::Absolute)) { 37 | $proxy = [System.Net.WebRequest]::GetSystemWebProxy() 38 | $proxyParameter = @{} 39 | if ($proxy.GetProxy($Document)) { $proxyParameter = @{Proxy = $proxy.GetProxy($Document); ProxyUseDefaultCredentials = $true } } 40 | $Document = Invoke-RestMethod $Document @proxyParameter # TODO Why is this not terminating without explicitly throwing? 41 | } 42 | elseif ($Document -and $Document.GetType() -in @([string], [System.IO.Stream], [System.IO.TextReader], [System.Xml.XmlReader])) { 43 | $DocumentInput = $Document 44 | $Document = [System.Xml.XmlDocument]::new() 45 | $Document.Load($DocumentInput) 46 | } 47 | elseif ($Document -is [object[]]) { return ($Document | Import-Akn) } 48 | else { return } # TODO Respectfully throw. 49 | if ($null -eq $Document -or $Document -isnot [System.Xml.XmlDocument]) {return} 50 | 51 | ([AkomaNtosoDocument]::new($Document, $IncludeAll)).Sections 52 | } 53 | } 54 | 55 | class AkomaNtosoSection { 56 | [string[]]$Id 57 | [string]$Num 58 | [string]$Heading 59 | [boolean]$IsRepealed 60 | [string]$Content 61 | 62 | hidden static [string]$xPathUSCIdentifierAttr = "./uslm:editorialNote[@role='uscRef']/uslm:ref/@href" 63 | hidden static [string]$xPathUSCSidenoteIdentifierAttr = "./uslm:sidenote//uslm:ref/@href" # TODO 64 | hidden static [string]$xPathNumAttr = "./akn:num|./uslm:num/@value" 65 | hidden static [string]$xPathHeading = "./akn:heading|./uslm:heading" 66 | hidden static [string]$xPathSelf = "." 67 | hidden static [string]$xPathRemoveAll = "./akn:num|./akn:heading|./uslm:num|./uslm:heading|.//uslm:editorialNote[@role='uscRef']|.//*[self::uslm:footnote or self::uslm:sourceCredit or self::uslm:sidenote or self::uslm:page]" 68 | hidden static [string]$reIsRepealed = '^([. ]+)$' # TODO Process notes. 
69 | 70 | AkomaNtosoSection( 71 | [System.Xml.XmlNode]$Xml, 72 | [System.Xml.XmlNamespaceManager]$XmlNamespaceManager 73 | ){ 74 | $this.Id = [AkomaNtosoSection]::GetId($Xml, $XmlNamespaceManager) 75 | $this.Num = [AkomaNtosoSection]::GetNum($Xml, $XmlNamespaceManager) 76 | $this.Heading = [AkomaNtosoSection]::GetXmlInnerText($_.SelectSingleNode([AkomaNtosoSection]::xPathHeading, $XmlNamespaceManager)) 77 | $this.IsRepealed = [AkomaNtosoSection]::TestHeadingIsRepealed($this.Heading) 78 | $this.Content = [AkomaNtosoSection]::GetXmlInnerText($_.SelectSingleNode([AkomaNtosoSection]::xPathSelf, $XmlNamespaceManager)) 79 | } 80 | 81 | static [string[]]GetId( 82 | [System.Xml.XmlNode]$Element, 83 | [System.Xml.XmlNamespaceManager]$NamespaceManager 84 | ){ 85 | # Amoma Ntoso eID 86 | $AknId = $Element.Attributes["eId"] 87 | if ($AknId) {$AknId = $AknId.Value} 88 | 89 | # USLM identifier 90 | $UslmId = [AkomaNtosoSection]::GetXmlAttributeValue($Element.Attributes["identifier"]) 91 | 92 | # USLM USC identifier 93 | $UslmUscId = [AkomaNtosoSection]::GetXmlAttributeValue($Element.SelectSingleNode([AkomaNtosoSection]::xPathUSCIdentifierAttr, $NamespaceManager)) 94 | $UslmUscId = [AkomaNtosoSection]::GetXmlAttributeValue($Element.SelectSingleNode([AkomaNtosoSection]::xPathUSCSidenoteIdentifierAttr, $NamespaceManager)) 95 | 96 | return (@($AknId, $UslmId, $UslmUscId, $UslmUscId) | ? {$_}) 97 | } 98 | 99 | static [string]GetNum( 100 | [System.Xml.XmlNode]$Element, 101 | [System.Xml.XmlNamespaceManager]$NamespaceManager 102 | ){ 103 | $AknNum = [AkomaNtosoSection]::GetXmlInnerText($Element.SelectSingleNode([AkomaNtosoSection]::xPathNumAttr, $NamespaceManager)) 104 | $UslmNum = [AkomaNtosoSection]::GetXmlAttributeValue($Element.SelectSingleNode([AkomaNtosoSection]::xPathNumAttr, $NamespaceManager)) 105 | 106 | return ($AknNum ? 
$AknNum : $UslmNum) 107 | } 108 | 109 | static [boolean]TestHeadingIsRepealed([string]$heading){ 110 | if ($heading -match [AkomaNtosoSection]::reIsRepealed) {return $true} 111 | return $false 112 | } 113 | 114 | static [string]GetXmlAttributeValue($obj){if ($obj -is [System.Xml.XmlAttribute]) {return $obj.Value}; return $null;} 115 | static [string]GetXmlInnerText([System.Xml.XmlNode]$obj){if ($obj -and $obj.InnerText) {return $obj.InnerText.Trim()}; return $null;} 116 | } 117 | 118 | class AkomaNtosoDocument { 119 | [AkomaNtosoSection[]]$Sections 120 | [int]$Count 121 | 122 | hidden static [string]$xPathSection = "//akn:section[not(ancestor::akn:hcontainer)]|//uslm:section[not(ancestor::uslm:quotedContent)]" 123 | hidden static [string]$xPathSectionAll = "//akn:section|//uslm:section" 124 | 125 | AkomaNtosoDocument( 126 | [System.Xml.XmlDocument]$XmlDocument, 127 | [switch]$IncludeAll 128 | ){ 129 | $path = [AkomaNtosoDocument]::xPathSection 130 | if ($IncludeAll) {$path = [AkomaNtosoDocument]::xPathSectionAll} 131 | 132 | $XmlNamespaceManager = New-Object -TypeName System.Xml.XmlNamespaceManager -ArgumentList $XmlDocument.NameTable 133 | $XmlNamespaceManager.AddNamespace("akn", "http://docs.oasis-open.org/legaldocml/ns/akn/3.0") 134 | $XmlNamespaceManager.AddNamespace("uslm", "http://schemas.gpo.gov/xml/uslm") 135 | 136 | $Nodes = $XmlDocument.DocumentElement.SelectNodes($path, $XmlNamespaceManager) 137 | $this.Count = $Nodes.Count 138 | $this.Sections = $Nodes.forEach{ 139 | # TODO Should we sanity check $_ type? 
140 | [AkomaNtosoSection]::new($_, $XmlNamespaceManager) 141 | } 142 | } 143 | } 144 | 145 | Update-TypeData -TypeName "AkomaNtosoDocument" -DefaultDisplayPropertySet "Count","Sections" -DefaultKeyPropertySet "Sections" -ErrorAction SilentlyContinue 146 | Update-TypeData -TypeName "AkomaNtosoSection" -DefaultDisplayPropertySet "Num", "Heading" -DefaultKeyPropertySet "Num" -ErrorAction SilentlyContinue 147 | New-Alias Import-Akn Import-AkomaNtoso 148 | Export-ModuleMember -Function * -Alias * 149 | -------------------------------------------------------------------------------- /test/oofile.clj: -------------------------------------------------------------------------------- 1 | ; 2 | ; java -cp $HOME/Downloads/clojure-1.6.0/clojure-1.6.0-slim.jar:/usr/share/java/tablelayout.jar:/usr/share/java/java_uno.jar:/usr/share/java/juh.jar:/usr/share/java/jurt.jar:/usr/share/java/ridl.jar:/usr/share/java/unoloader.jar:/usr/share/java/unoil.jar -Djava.library.path=/usr/lib/ure/lib clojure.main 3 | ; 4 | 5 | (ns org.linkedlaw.oofile) 6 | 7 | ;; process functions 8 | 9 | ; XXX relies on PATH, existence of /dev/null 10 | (defn spawn-soffice 11 | "Spawn an soffice process and return the Process object." 12 | [] 13 | (let [pb (java.lang.ProcessBuilder. 14 | ["soffice" 15 | "--accept=pipe,name=officepipe;urp;StarOffice.ServiceManager" 16 | "--norestore" 17 | "--nologo" 18 | "--headless" 19 | "--nolockcheck"])] 20 | (.redirectErrorStream pb true) 21 | (.redirectOutput pb 22 | (java.lang.ProcessBuilder$Redirect/appendTo 23 | (java.io.File. "/dev/null"))) 24 | (.start pb))) 25 | 26 | (defn get-uno-desktop 27 | "Connect to a running soffice process and return a Desktop frame." 
28 | [] 29 | (let [xLocalContext (com.sun.star.comp.helper.Bootstrap/createInitialComponentContext nil) 30 | xLocalServiceManager (.getServiceManager xLocalContext) 31 | oUrlResolver (.createInstanceWithContext xLocalServiceManager "com.sun.star.bridge.UnoUrlResolver" xLocalContext) 32 | xUrlResolver (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.bridge.XUnoUrlResolver oUrlResolver)] 33 | (if-let [oContext (first (filter some? 34 | (for [i (range 3)] 35 | (try 36 | (.resolve xUrlResolver "uno:pipe,name=officepipe;urp;StarOffice.ComponentContext") 37 | (catch java.lang.Exception e (java.lang.Thread/sleep 5000))))))] 38 | (let [xContext (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.uno.XComponentContext oContext) 39 | oMCF (.getServiceManager xContext) 40 | xMCF (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.lang.XMultiComponentFactory oMCF)] 41 | (.createInstanceWithContext xMCF "com.sun.star.frame.Desktop" xContext))))) 42 | 43 | (defn terminate-uno-desktop 44 | "Terminate the Desktop." 45 | [oDesktop] 46 | (let [xDesktop (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.frame.XDesktop oDesktop)] 47 | (.terminate xDesktop))) 48 | 49 | (defn get-uno-doc 50 | "Given a Desktop frame and filename, open a file and return a XTextDocument object." 51 | [oDesktop file] 52 | (let [xCLoader (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.frame.XComponentLoader oDesktop) 53 | oDocument (.loadComponentFromURL xCLoader file "_blank" 0 (into-array com.sun.star.beans.PropertyValue []))] 54 | (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.text.XTextDocument oDocument))) 55 | 56 | ;; helper functions 57 | 58 | (defn uno-enumeration-seq 59 | "Wrap a XEnumeration object as a seq." 
60 | [^com.sun.star.container.XEnumeration xEnum] 61 | (enumeration-seq 62 | (reify java.util.Enumeration 63 | (^boolean hasMoreElements [this] (.hasMoreElements xEnum)) 64 | (nextElement [this] (.nextElement xEnum))))) 65 | 66 | (defn supports-service? 67 | "Given a service name, determine if a XComponent object supports the such service." 68 | [service xComponent] 69 | (let [xServiceInfo (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.lang.XServiceInfo xComponent)] 70 | (.supportsService xServiceInfo service))) 71 | 72 | (defn get-property 73 | "Given a property name and a XComponent object supporting the XPropertySet service, return that property." 74 | [prop xComp] 75 | (let [xPropertySet (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.beans.XPropertySet xComp)] 76 | (.getPropertyValue xPropertySet prop))) 77 | 78 | (defn is-text-portion? 79 | "Given a XComponent object supporting the TextPortion service, determine if its TextPortionType is Text." 80 | [xTextPortion] 81 | ; (if (supports-service? "com.sun.star.text.TextPortion" xTextPortion) ; XXX only blank lines are meeting both tests 82 | (let [xTextPortionType (get-property "TextPortionType" xTextPortion)] 83 | (= xTextPortionType "Text"))) 84 | 85 | ;; text functions 86 | 87 | (defstruct signal-struct :line :adjust :margin :weight) 88 | 89 | (defn make-portion-map 90 | "Given a signal struct and a XComponent object collection with each supporting the TextPortion and CharacterProperties services, 91 | return a signal struct with a concatenated string and the last object's CharWeight property." 92 | [signal coll] 93 | (loop [sb (java.lang.StringBuilder.) 
94 | weight nil 95 | more coll] 96 | (if more 97 | (let [xComp (first more)] 98 | (recur (.append sb (.getString xComp)) 99 | (get-property "CharWeight" xComp) 100 | (next more))) 101 | (assoc signal :line (str sb) :weight weight)))) 102 | 103 | (defn get-content-map 104 | "Given a XTextContent object, return a string that is a concatenation of the sequence of XTextPortion objects that have 'Text' TextPortionType values." 105 | [signal] 106 | (let [xTextElement (:xtext signal) 107 | sig (dissoc signal :xtext)] 108 | (make-portion-map sig 109 | (filter is-text-portion? 110 | (map #(com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.text.XTextRange %) 111 | (uno-enumeration-seq 112 | (.createEnumeration 113 | (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.container.XEnumerationAccess xTextElement)))))))) 114 | 115 | ; XXX doesnt do properties at the moment 116 | (defn make-content-map 117 | "Given a XTextContent object, return a signal struct with the XTextContent object and their associated properties ParaLeftMargin and ParaAdjust." 118 | [xTextElement] 119 | (let [signal (struct signal-struct nil nil nil nil)] 120 | (assoc signal :xtext xTextElement))) ; NOTE rm or memory leak! 121 | 122 | (defn get-doc-content 123 | "Given a XTextDocument object, return a sequence of signal structs containing XTextContent objects that support the Paragraph service, 124 | as well as their associated ParaLeftMargin and ParaAdjust." 125 | [xDoc] 126 | (map make-content-map 127 | (filter #(supports-service? "com.sun.star.text.Paragraph" %) 128 | ; (map #(com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.text.XTextContent %) ; not needed for some reason 129 | (uno-enumeration-seq 130 | (.createEnumeration 131 | (com.sun.star.uno.UnoRuntime/queryInterface com.sun.star.container.XEnumerationAccess (.getText xDoc))))))) 132 | 133 | (defn get-doc-maps 134 | "Given a XTextDocument object, return a sequence of maps representing paragraphs and their properties." 
135 | [xDoc] 136 | (map get-content-map (get-doc-content xDoc))) 137 | 138 | ;; test functions 139 | 140 | (defn t-get-uno-doc 141 | "" 142 | [] 143 | (let [p (spawn-soffice) 144 | desktop (get-uno-desktop) 145 | doc-one (get-uno-doc desktop "file:///home/msr/src/openlaw-test/us-co-law/crs2013/CRS%20Title%2024%20(2013).rtf")] 146 | (if (some? doc-one) 147 | (do 148 | (doseq [m (get-doc-maps doc-one)] 149 | (prn m)) 150 | (.dispose doc-one) 151 | (terminate-uno-desktop desktop) 152 | (println "Terminated soffice.") 153 | (.waitFor p)) 154 | (println "No document.")))) 155 | 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. 
rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. 
Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. 
Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /src/us-ca/scrape-statcode-us-ca: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 -uW all 2 | # -*- coding: utf-8 -*- 3 | 4 | usage=""" 5 | scrape-statcode-us-ca - convert the California Codes into Akoma Ntoso 6 | 7 | Get the data from e.g. 8 | 9 | 10 | Usage: scrape-statcode-us-ca [options] zipfile 11 | Arguments: 12 | 13 | zipfile ZIP file of the LC data 14 | -d enable debugging (twice for verbose) 15 | -c code output specific codes (one flag for each) 16 | -h show this help and exit 17 | 18 | NOTE: To use on Windows console, "SET PYTHONIOENCODING=cp437:replace". 
19 | """ 20 | 21 | import sys 22 | import getopt 23 | import logging 24 | import zipfile 25 | import csv 26 | import string 27 | import io 28 | import itertools 29 | import re 30 | try: 31 | import lxml.etree as etree 32 | except ImportError: 33 | import xml.etree.ElementTree as etree 34 | 35 | xslt = etree.XSLT(etree.XML(""" 36 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | """)) 77 | 78 | ## 79 | # Entry function. Parse parameters, call main function. 80 | # 81 | def main(): 82 | codes = [] 83 | debug = False 84 | loglvl = logging.WARN 85 | try: 86 | opts, args = getopt.getopt(sys.argv[1:], 'hdt:c:') 87 | except getopt.GetoptError as e: 88 | logging.fatal('getopt error: %s %s', e, usage) 89 | sys.exit(1) 90 | if len(args) < 1: 91 | logging.fatal('need filename %s', usage) 92 | sys.exit(1) 93 | zfn = args[0] 94 | for opt, arg in opts: 95 | if opt in ('-h', '--help'): 96 | print(usage) 97 | sys.exit(0) 98 | elif opt in {'-d', '--debug'}: 99 | if not debug: 100 | loglvl = logging.INFO 101 | else: 102 | loglvl = logging.DEBUG 103 | debug = True 104 | elif opt in {'-c', '--code'}: 105 | codes.append(arg.upper()) 106 | # else: 107 | # logging.fatal('invalid option: %s %s', opt, usage) 108 | # sys.exit(1) 109 | logging.basicConfig(format='{levelname:8s}: {message}', style='{', level=loglvl) 110 | do_it(zfn, codes) 111 | 112 | ## 113 | # Build the organizational graph, match sections to their data, and read 114 | # section text from file as we convert t,,, 115 | # 116 | def do_it(zfn, codes): 117 | logging.info('parsing law db...') 118 | law = parse_org(zfn) 119 | 120 | logging.info('matching sections...') 121 | matchsecs(law, zfn) 122 | 123 | # TODO parallelize 124 | for code in filter(lambda x: not len(codes) or x in codes, list(law.keys())): 125 | code2akn(zfn, law, code) 126 | del law[code] 127 | 128 | ## 129 | # Driver 
function to convert legal code into Akoma Ntoso. 130 | # 131 | def code2akn(zfn, law, code): 132 | logging.info('converting {}...'.format(code)) 133 | zf = zipfile.ZipFile(zfn) 134 | # akn = etree.Element('akomaNtoso', nsmap={None: "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD08", "proposed": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD05.proposed", "uslm": "http://xml.house.gov/schemas/uslm/1.0", "xhtml": "http://www.w3.org/1999/xhtml", "dc": "http://purl.org/dc/elements/1.1/", "dcterms": "http://purl.org/dc/terms/"}) 135 | # akn = etree.Element('akomaNtoso', nsmap={None: "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD14"}) 136 | akn = etree.Element('akomaNtoso', nsmap={None: "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD14", "proposed": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD15.proposed"}) 137 | # akn = etree.Element('akomaNtoso') 138 | doc = akn.makeelement('doc', attrib={'name': 'code'}) 139 | akn.append(doc) 140 | meta = doc.makeelement('meta') 141 | doc.append(meta) 142 | body = doc.makeelement('mainBody') 143 | doc.append(body) 144 | code2akn_r(zf, body, law[code], itertools.count(1)) 145 | zf.close() 146 | fn = code + '.xml' 147 | logging.info('writing {}...'.format(fn)) 148 | akn.getroottree().write(fn) 149 | 150 | ## 151 | # 152 | # 153 | def code2akn_r(zf, parent_el, item, unk_count): 154 | # create this node 155 | typ, enum, head = item[0] 156 | elid = build_hcontainer_id(typ, enum, parent_el, unk_count) 157 | # elname = build_name(typ, enum, head) 158 | # this_el = parent_el.makeelement(typ, attrib={'id': elid, 'title': elname}) 159 | if typ == 'book': 160 | # this_el = parent_el.makeelement(typ, attrib={'id': elid, '{http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD15.proposed}abbrev': enum}, nsmap={'proposed': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD15.proposed'}) 161 | this_el = parent_el.makeelement(typ, attrib={'id': elid, 
'{http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD15.proposed}abbrev': enum}) 162 | else: 163 | this_el = parent_el.makeelement(typ, attrib={'id': elid}) 164 | parent_el.append(this_el) 165 | if enum is not None: 166 | num_el = this_el.makeelement('num') 167 | num_el.text = enum 168 | this_el.append(num_el) 169 | if head is not None: 170 | heading_el = this_el.makeelement('heading') 171 | heading_el.text = head 172 | this_el.append(heading_el) 173 | 174 | # sub org items 175 | for subitem in item[1]: 176 | code2akn_r(zf, this_el, subitem, unk_count) 177 | 178 | # sub sec items 179 | for subitem in item[2]: 180 | enum, fn, (stat_y, stat_c, stat_s, hist) = subitem 181 | elid = build_section_id(enum, parent_el) 182 | # elname = build_name('section', enum) 183 | # section_el = this_el.makeelement('section', attrib={'id': elid, 'title': elname}) 184 | # section_el = this_el.makeelement('section', attrib={'id': elid}) 185 | section_el = this_el.makeelement('section', attrib={'id': elid, 'fn': fn}) 186 | this_el.append(section_el) 187 | if enum is not None: 188 | num_el = section_el.makeelement('num') 189 | num_el.text = enum 190 | section_el.append(num_el) 191 | with zf.open(fn) as f: 192 | caml_el = xslt(etree.parse(f)).getroot() 193 | caml2akn(section_el, caml_el) 194 | 195 | ## 196 | # Build an ID attribute for an organizational element. 197 | # 198 | def build_hcontainer_id(typ, enum, parent_el, unk_count=None): 199 | if typ == 'book': 200 | return '/' + enum.lower() 201 | elif typ == 'hcontainer': 202 | return parent_el.attrib['id'] + '/unnamed-' + str(next(unk_count)) 203 | else: 204 | return parent_el.attrib['id'] + '/' + typ + '-' + enum 205 | 206 | ## 207 | # Build an ID attribute for a section element. 208 | # 209 | def build_section_id(enum, parent_el): 210 | while parent_el.tag != 'book': 211 | parent_el = parent_el.getparent() 212 | return parent_el.attrib['id'] + '/section-' + enum 213 | 214 | ## 215 | # Build a name attribute for an element. 
#
# If there is no enum, or the type is 'book', the heading itself
# serves as the name.
#
def build_name(typ, enum, head=None):
	if enum is None:
		return head
	if typ == 'book':
		return head
	return typ[0].upper() + typ[1:] + ' ' + enum

##
# Parse the organizational structure into a nested structure.
#
# Returns law, mapping each code (e.g. 'PEN') to
# [header, org-children, sections] where header is ('book', CODE,
# TITLE), org-children is a SparseList of the same three-item shape,
# and sections is a SparseList filled in later by matchsecs().
#
def parse_org(zfn):
	zf = zipfile.ZipFile(zfn)
	law = {}

	# codes_tbl: (CODE, TITLE)
	with io.TextIOWrapper(zf.open('CODES_TBL.dat'), encoding='utf-8', newline='') as codes_tbl:
		for r in csv.reader(codes_tbl, 'excel-tab', quotechar='`'):
			code = r[0]
			head = r[1].strip('* ').split(' - ')[0]
			law[code] = [('book', code, head), SparseList(), SparseList()]

	# law_toc_tbl: (LAW_CODE, DIVISION, TITLE, PART, CHAPTER, ARTICLE,
	# HEADING, ACTIVE_FLG, TRANS_UID, TRANS_UPDATE, NODE_SEQUENCE,
	# NODE_LEVEL, NODE_POSITION, NODE_TREEPATH, CONTAINS_LAW_SECTIONS,
	# HISTORY_NOTE, OP_STATUES, OP_CHAPTER, OP_SECTION)
	with io.TextIOWrapper(zf.open('LAW_TOC_TBL.dat'), encoding='utf-8', newline='') as law_toc_tbl:
		for row in csv.reader(law_toc_tbl, 'excel-tab', quotechar='`'):
			# parse row
			code = row[0]
			if row[7] == 'Y':
				active = True
			elif row[7] == 'N':
				active = False
			else:
				logging.fatal('unknown row[7]')
				sys.exit(1)
			path = row[13]
			typ, enum, head = parse_header(row[6])
			if row[14] == 'Y':
				empty = False
			elif row[14] == 'N':
				empty = True
			else:
				logging.fatal('unknown row[14]')
				sys.exit(1)
			op_stat = None if row[16] == 'NULL' else row[16]
			op_ch = None if row[17] == 'NULL' else row[17]
			op_sec = None if row[18] == 'NULL' else row[18]
			# sanity logging only; none of these abort the conversion
			if not active:
				logging.debug('not active: {} {} {} {}'.format(code, typ, enum, head))
			if (op_ch and not op_stat) or (op_sec and (not op_stat or not op_ch)):
				logging.debug('~stat&(ch|sec): {} {} {}'.format(op_stat, op_ch, op_sec))
			if op_stat:
				try:
					y = int(op_stat)
				except ValueError:
					logging.debug('years are in N: {} {} {}'.format(op_stat, op_ch, op_sec))
				else:
					# NOTE(review): upper bound looks like the data vintage; update as needed
					if y < 1849 or y > 2013:
						logging.debug('stat: {} {} {}'.format(op_stat, op_ch, op_sec))
			# op_stat
			org_start(law[code], path, (typ, enum, head))

	zf.close()
	return law

##
# Parse the type, enumeration, and heading out of a TOC heading string.
#
# Returns (typ, enum, head). typ falls back to 'hcontainer' (with no
# enum) when the heading does not follow the usual 'Type N. Head'
# form. Previously, malformed headings (a '.' without a following
# space, no space before the dot, or a non-standard type without a
# parenthetical) raised ValueError; they now fall back gracefully.
#
def parse_header(s):
	typs = {'division', 'part', 'subpart', 'title', 'subtitle', 'chapter', 'subchapter', 'article'}
	if s[0] == '[':  # TODO mark as repealed
		s = s.split(']', 1)[0][1:]
	if '.' not in s:
		logging.debug('no typ+enum: {}'.format(s))
		return 'hcontainer', None, s.strip()
	first, sep, rest = s.partition('. ')
	if not sep or ' ' not in first:
		# has a period but not the 'Type N. Head' shape
		logging.warning('typ not recognized: {}'.format(s))
		return 'hcontainer', None, s
	typ, enum = first.split(' ', 1)
	typ = typ.lower()
	if typ not in typs:
		# e.g. 'Heading (Chapter 2 ...)': try to recover the type and
		# enum from the parenthetical
		prefix, sep, paren = s.partition(' (')
		if sep:
			low = paren.lower()
			for t in typs:
				if t in low:
					tail = low.split(t, 1)[1].lstrip()
					enum = tail.split(' ', 1)[0].strip()
					logging.debug('extracted header: {} {} {}'.format(t, enum, prefix))
					return t, enum, prefix
		logging.warning('typ not recognized: {}'.format(s))
		return 'hcontainer', None, s
	head = rest.split(' [', 1)[0]
	return typ, enum, head

##
# Use a DB "treepath" (one-based indexing separated by a period) to
# traverse a nested list (actually a SparseList), creating a list at
# each traversal if necessary.
#
# A list represents an organizational element: item 0 is the element's
# own (typ, enum, head) data, item 1 the SparseList of child
# organizational elements, and item 2 the SparseList of sections.
#
def org_get(l, path):
	for p in path.split('.'):
		i = int(p) - 1  # paths are one-based
		if l[1][i] is None:
			l[1][i] = [None, SparseList(), SparseList()]
		l = l[1][i]
	return l

##
# Traverse a list and add the data to the zeroth position of the list
# at that level. Used for organizational elements as the zeroth item
# is always the organizational element's data.
#
def org_start(l, path, data):
	l = org_get(l, path)
	l[0] = data

##
# Traverse a list and append the section data at position pos
# (one-based) of the section list at that level.
#
def org_app(l, path, pos, data):
	l = org_get(l, path)
	i = int(pos) - 1  # positions are one-based
	l[2][i] = data

##
# A list that grows automatically on assignment, filling skipped slots
# with None, and returning None for out-of-range reads.
#
# (Pattern from the well-known Stack Overflow SparseList recipe.)
#
class SparseList(list):
	def __setitem__(self, index, value):
		missing = index - len(self) + 1
		if missing > 0:
			self.extend([None] * missing)
		list.__setitem__(self, index, value)

	def __getitem__(self, index):
		try:
			return list.__getitem__(self, index)
		except IndexError:
			return None

##
# Match all sections and add their data to the organization data
# structure. Only one element, the deepest element, gets the data.
#
# TODO: what do brackets in section mean?
# TODO: mod for use in CONS
#
def matchsecs(law, zfn):
	zf = zipfile.ZipFile(zfn)
	rows = {}

	logging.info('parsing and matching section tables...')

	# law_toc_sections_tbl: (ID, LAW_CODE, NODE_TREEPATH, SECTION_NUM,
	# SECTION_ORDER, TITLE, OP_STATUES, OP_CHAPTER, OP_SECTION,
	# TRANS_UID, TRANS_UPDATE, LAW_SECTION_VERSION_ID, SEQ_NUM)
	with io.TextIOWrapper(zf.open('LAW_TOC_SECTIONS_TBL.dat'), encoding='utf-8', newline='') as law_toc_sec_tbl:
		for r1 in csv.reader(law_toc_sec_tbl, 'excel-tab', quotechar='`'):
			key = r1[11]
			code = r1[1]
			path = r1[2]
			sec = r1[3].strip('[]').rstrip('.')  # not sure what brackets mean
			pos = r1[4]
			assert int(pos) != 0  # positions are one-based
			if ' ' in sec:
				sec = sec.split()[-1]
			rows[key] = [code, path, sec, pos]

	# law_section_tbl: (id, law_code, section_num, op_statutes,
	# op_chapter, op_section, effective_date, law_section_version_id,
	# division, title, part, chapter, article, history, content_xml,
	# active_flg, trans_uid, trans_update)
	with io.TextIOWrapper(zf.open('LAW_SECTION_TBL.dat'), encoding='utf-8', newline='') as law_sec_tbl:
		for r2 in csv.reader(law_sec_tbl, 'excel-tab', quotechar='`'):
			key = r2[7]
			hist = r2[13]
			fn = r2[14]  # filename of the section's CAML content
			act = r2[15]

			if act != 'Y':
				# BUG FIX: this previously referenced an undefined
				# name 'row' and raised NameError instead of logging
				logging.fatal('row not active! %s', r2)
				sys.exit(1)

			# 'NULL' markers become None
			stat_y, stat_c, stat_s = (None if v == 'NULL' else v for v in (r2[3], r2[4], r2[5]))
			stat = (stat_y, stat_c, stat_s, hist)

			rows[key].append(fn)
			rows[key].append(stat)

	zf.close()

	logging.info('adding section tables to law structure...')

	for key in rows:
		code, path, sec, pos, fn, stat = rows[key]
		org_app(law[code], path, pos, (sec, fn, stat))

# section-intro patterns: one, two, or three parenthesized enumerators
# at the start of a paragraph (raw strings to avoid invalid escapes)
caml2akn_re1 = re.compile(r'^\(([^)]+)\)(.*)')
caml2akn_re2 = re.compile(r'^\(([^)]+)\)\s*\(([^)]+)\)(.*)')
caml2akn_re3 = re.compile(r'^\(([^)]+)\)\s*\(([^)]+)\)\s*\(([^)]+)\)(.*)')

##
# Return AKN given (transformed) CAML.
#
# For each non-empty 'p' child in the top-level Content element,
# check for a 3, 2, or 1 section intro. Make sure to check
# if it's a valid enum. If there are multiple section intros,
# attach each to its parent element. If it doesn't have a
# section intro, attach to the apparent parent element.
#
# TODO: Some 'p' elements without section intros should be
# attached to the top-level section element, not their
# apparent parent. We may have to manually decide these cases.
# Examples include:
#
#   /pen/section-182/subdivision-a
#
def caml2akn(sectionel, camlel):
	# per-rank state: the most recent open element of each type,
	# stored as (enum, element)
	top = {'section': (None, sectionel),
		'subdivision': (None, None),
		'paragraph': (None, None),
		'subparagraph': (None, None),
		'clause': (None, None)}
	for childel in camlel:
		# only non-empty 'p' children are candidates for conversion
		if childel.tag != 'p' or childel.text is None:
			caml2akn_attachcontent(top, childel)
			continue
		# try the 3-, 2-, then 1-enumerator intro patterns
		enums = None
		for pattern in (caml2akn_re3, caml2akn_re2, caml2akn_re1):
			m = pattern.search(childel.text)
			if m:
				*enums, rest = m.groups()
				break
		if enums is None:
			# no section intro: plain content
			caml2akn_attachcontent(top, childel)
			continue
		# every enumerator must look valid, else treat as content
		if any(caml2akn_gethtype(top, e) is None for e in enums):
			caml2akn_attachcontent(top, childel)
			continue
		# attach each new hcontainer under the previous new one
		newel = None
		for e in enums:
			newel, typ = caml2akn_attachhcontainer(top, e, newel)
		# attach the remaining text to the innermost new hcontainer
		childel.text = rest.lstrip()
		caml2akn_attachcontent(top, childel, newel)
		# normalize the state
		caml2akn_normalizetop(top, typ)

caml2akn_topidxl = ['section',
	'subdivision',
	'paragraph',
	'subparagraph',
	'clause']
caml2akn_topidx = {t: i for i, t in enumerate(caml2akn_topidxl)}

##
# Return the lowest element above 'typ', or the lowest element of any
# rank if 'typ' is None.
#
def caml2akn_getparent(top, typ=None):
	start = 4 if typ is None else caml2akn_topidx[typ] - 1
	for idx in range(start, -1, -1):
		candidate = top[caml2akn_topidxl[idx]][1]
		if candidate is not None:
			return candidate
	return None  # XXX do we ever?

##
# After setting a new element in 'top', clear all lower ranks so they
# are not mistakenly chosen as parents later.
#
def caml2akn_normalizetop(top, typ):
	for lower in caml2akn_topidxl[caml2akn_topidx[typ] + 1:]:
		top[lower] = (None, None)

##
# Create an hcontainer for 'enum', attach it to 'parentel' (or to the
# parent implied by the state), record it in the state, and return
# (element, type).
#
def caml2akn_attachhcontainer(top, enum, parentel=None):
	typ = caml2akn_gethtype(top, enum)
	if parentel is None:
		parentel = caml2akn_getparent(top, typ)
	newel = parentel.makeelement(typ, attrib={'id': build_hcontainer_id(typ, enum, parentel)})
	parentel.append(newel)
	numel = newel.makeelement('num')
	numel.text = enum
	newel.append(numel)
	top[typ] = (enum, newel)
	return newel, typ

##
# Attach a child element to the parent element's 'content' element,
# creating the 'content' element on first use.
#
def caml2akn_attachcontent(top, childel, parentel=None):
	if parentel is None:
		parentel = caml2akn_getparent(top)
	content = parentel.find('content')
	if content is None:
		content = parentel.makeelement('content')
		parentel.append(content)
	content.append(childel)

##
# Determine the type of element. Note that 'i' may be a clause, or it
# may be a subdivision that comes after 'h' etc.
667 | # 668 | # --------------------- 669 | # | s | type | 670 | # --------------------- 671 | # | None | content | 672 | # | a | subdivision | 673 | # | 1 | paragraph | 674 | # | A | subparagraph | 675 | # | i | clause | 676 | # --------------------- 677 | # 678 | def caml2akn_gethtype(top, s): 679 | ret = None 680 | if s.isdecimal(): 681 | ret = 'paragraph' 682 | elif s.isupper(): 683 | ret = 'subparagraph' 684 | elif 'i' not in s and 'v' not in s and 'x' not in s: 685 | ret = 'subdivision' 686 | elif s == 'i' and top['subdivision'][0] == 'h': 687 | ret = 'subdivision' 688 | elif s == 'v' and top['subdivision'][0] == 'u': 689 | ret = 'subdivision' 690 | elif s == 'x' and top['subdivision'][0] == 'w': 691 | ret = 'subdivision' 692 | elif s[0] == 'a' or s[0] == 'b': 693 | ret = 'subdivision' 694 | elif s[0].isupper() or ' ' in s or 'reserved' in s: 695 | logging.debug('not htype: {}'.format(s)) 696 | ret = None 697 | else: 698 | # logging.debug('assuming roman: {}'.format(s)) 699 | ret = 'clause' 700 | return ret 701 | 702 | if __name__ == "__main__": 703 | main() 704 | 705 | -------------------------------------------------------------------------------- /src/us-ca/scrape-regcode-us-ca: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 -uW all 2 | # -*- coding: utf-8 -*- 3 | 4 | usage=""" 5 | scrape-regcode-us-ca - convert the California Code of Regulations into Akoma Ntoso 6 | 7 | See . Given titles as 8 | RTF files, in order, this will output them as an Akoma Ntoso XML file. 9 | 10 | Usage: scrape-regcode-us-ca [options] file [file ...] 
11 | Arguments: 12 | 13 | file input RTF file(s) 14 | -o file output file ('-' for stdout) (default: stdout) 15 | -n num number of parallel threads (default: 2) 16 | -p num number of Open Office processes (default: 1) 17 | -d enable debuging output (default: warnings only) 18 | """ 19 | 20 | import sys 21 | import os 22 | import getopt 23 | import lxml.etree as etree 24 | import uno 25 | import unohelper 26 | import shlex 27 | import subprocess 28 | import time 29 | import logging 30 | import mimetypes 31 | import enum 32 | import collections 33 | import concurrent.futures 34 | import threading 35 | import queue 36 | import tempfile 37 | import types 38 | import re 39 | import html 40 | 41 | ## 42 | # Entry function: Parse paramters, call main function. 43 | # 44 | def main(): 45 | fout = sys.stdout.buffer 46 | debug = logging.WARNING 47 | threads = 2 48 | processes = 1 49 | logging.SUPERDEBUG = logging.DEBUG-2 # XXX monkey fix 50 | logging.UBERDEBUG = logging.DEBUG-4 51 | 52 | # parse arguments 53 | try: 54 | opts, args = getopt.getopt(sys.argv[1:], 'o:n:p:dh') 55 | except getopt.GetoptError: 56 | logging.fatal('getopt error {}'.format(usage)) 57 | return 1 58 | 59 | for opt, arg in opts: 60 | if opt in {'-d', '--debug'}: 61 | if debug is logging.WARNING: 62 | debug = logging.INFO 63 | elif debug is logging.INFO: 64 | debug = logging.DEBUG 65 | elif debug is logging.DEBUG: 66 | debug = logging.SUPERDEBUG 67 | else: 68 | debug = logging.UBERDEBUG 69 | elif opt in {'-o'}: 70 | fout = arg 71 | elif opt in {'-n'}: 72 | threads = int(arg) 73 | elif opt in {'-p'}: 74 | processes = int(arg) 75 | elif opt in {'-h', '--help'}: 76 | print(opt, usage) 77 | return 0 78 | else: 79 | logging.fatal('invalid flag {}{}'.format(opt, usage)) 80 | return 1 81 | 82 | if len(args) < 1: 83 | logging.fatal('need input files {}'.format(usage)) 84 | return 1 85 | fns = args 86 | 87 | # configure 88 | logging.basicConfig(format='{levelname} {process}/{threadName} {message}', style='{', 
level=debug) 89 | logging.addLevelName(logging.SUPERDEBUG, 'SUPERDEBUG') 90 | logging.addLevelName(logging.UBERDEBUG, 'UBERDEBUG') 91 | # logging.Logger.superdebug = lambda inst, msg, *args, **kwargs: inst.log(logging.SUPERDEBUG, msg, *args, **kwargs) 92 | logging.superdebug = lambda msg, *args, **kwargs: logging.log(logging.SUPERDEBUG, msg, *args, **kwargs) 93 | # logging.Logger.uberdebug = lambda inst, msg, *args, **kwargs: inst.log(logging.UBERDEBUG, msg, *args, **kwargs) 94 | logging.uberdebug = lambda msg, *args, **kwargs: logging.log(logging.UBERDEBUG, msg, *args, **kwargs) 95 | 96 | # do it 97 | ret = do_it(fns, fout, processes, threads) 98 | 99 | return ret 100 | 101 | ## 102 | # Execute do_parse() against given filenames in parallel. 103 | # 104 | def do_it(fns, fout, nprocs, nthreads): 105 | ret = 0 106 | # start soffice processes 107 | procs = [] 108 | for i in range(nprocs): 109 | #pipename, tmpd, p = OOFile.start_soffice() 110 | procs.append(OOFile.start_soffice()) 111 | time.sleep(5) 112 | # xml body 113 | akn = etree.Element('akomaNtoso', nsmap={None: "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD14", "proposed": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD15.proposed"}) 114 | act = etree.SubElement(akn, 'act') 115 | meta = etree.SubElement(act, 'meta') 116 | body = etree.SubElement(act, 'body') 117 | # submit tasks 118 | executor = concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) 119 | futures = [] 120 | for i,fn in enumerate(fns): 121 | pipename = procs[i % nprocs][0] 122 | # errq signals producer to fail 123 | errq = queue.Queue() # XXX beware IPC v. 
interthread 124 | future = executor.submit(do_parse, pipename, fn, errq) 125 | futures.append((future, errq)) 126 | # complete tasks 127 | try: 128 | for xml in (future.result() for future,errq in futures): 129 | # TODO we should have a faster way to detect errors 130 | if xml is None: 131 | # breaking cleanly triggers else clause 132 | # XXX is this really shutting down executor? 133 | raise RuntimeError('do_parse failed: xml is None') 134 | body.append(xml) 135 | except BaseException as e: 136 | logging.critical('do_it exception: {} {}'.format(type(e), e)) 137 | ret = 1 138 | else: 139 | tree = etree.ElementTree(akn) 140 | tree.write(fout) 141 | finally: 142 | # cleanup 143 | logging.info('do_it canceling futures') 144 | for (n,(future,errq)) in enumerate(reversed(futures)): 145 | logging.debug('do_it canceling {}'.format(n)) 146 | future.cancel() 147 | errq.put(False) 148 | logging.debug('do_it shutting down executor') 149 | executor.shutdown() 150 | for pipename, tmpd, p in procs: 151 | logging.debug('do_it terminating {} and waiting on {}'.format(pipename, p)) 152 | if p.poll() is None: # XXX why does this look wrong? 153 | OOFile.terminate_soffice(OOFile.connect_soffice(pipename)) 154 | p.wait() 155 | tmpd.cleanup() 156 | logging.info('do_it cleaned up') 157 | return ret 158 | 159 | ## 160 | # Parse a file and return partial Akoma Ntoso XML. 161 | # 162 | def do_parse(pipename, fn, errq): 163 | logging.info('do_parse parsing: {}'.format(fn)) 164 | xml = None 165 | try: 166 | paraq = queue.Queue(100) 167 | xmlq = queue.Queue(50) 168 | outq = queue.Queue() # XXX should we bound these? 169 | # open files, build threads 170 | # TODO different types of files? 
171 | mime = mimetypes.guess_type(fn) 172 | if mime[0] == 'application/rtf': 173 | filethread = threading.Thread(target=OOFile.run, args=(pipename, fn, errq, paraq)) 174 | else: 175 | logging.critical('do_parse unknown filetype: {} {}'.format(fn, mime)) 176 | return None 177 | parserthread = threading.Thread(target=OOFileParser.run, args=(paraq, xmlq, errq)) 178 | builderthread = threading.Thread(target=XMLBuilder.run, args=(xmlq, outq, errq)) 179 | # parse 180 | builderthread.start() 181 | parserthread.start() 182 | filethread.start() 183 | xml = outq.get() 184 | if xml is False: # TODO implement better queue poisoning 185 | xml = None 186 | except OSError as e: 187 | logging.critical('do_parse opening files: {}'.format(e)) 188 | return None 189 | except BaseException as e: 190 | logging.critical('do_parse exception: {} {}'.format(type(e), e)) 191 | logging.info('do_parse done: {}'.format(fn)) 192 | # wait for completion of threads 193 | # TODO is this necessary? 194 | filethread.join() 195 | parserthread.join() 196 | builderthread.join() 197 | # return 198 | return xml 199 | 200 | ## 201 | # A state machine that parses a stream of semi-structured document lines 202 | # into partial Akoma Ntoso XML. The parse() function will consume the input 203 | # and output an XML object. 
#
class XMLBuilder:
	##
	# 'state' keeps, per element type, the top-most open element of
	# that type; insertion order encodes the hierarchy of ranks.
	#
	def __init__(self):
		self.state = collections.OrderedDict()
		for key in ('title', 'appendix', 'division', 'subdivision',
				'chapter', 'subchapter', 'group', 'subgroup',
				'article', 'subarticle', 'section', 'paragraph',
				'a',  # 'subsection'
				'1', 'A', 'i'):
			self.state[key] = None
	##
	# Thread entry point: build and run a parser, poisoning the
	# queues on any failure.
	#
	@staticmethod
	def run(inq, outq, errq):
		try:
			XMLBuilder().parse(inq, outq, errq)
		except BaseException as e:
			logging.critical('XMLBuilder.run exception: {} {}'.format(type(e), e), exc_info=True)
			outq.put(False)  # poison queue
			errq.put(False)
	##
	# Parse all messages in @inq and put the resulting XML object
	# (or False on upstream failure) on @outq.
	#
	def parse(self, inq, outq, errq):
		assert inq is not None
		# process messages
		while True:
			msg = inq.get()
			if msg is None:  # poison pill: normal end of input
				outq.put(self.get_state_top())
				break
			if msg is False:  # upstream failure
				outq.put(False)  # poison queue
				errq.put(False)
				break
			logging.superdebug('XMLBuilder.parse: {}'.format(msg))
			self.event(msg)
	##
	# Process a signal.
	#
	def event(self, signal):
		if signal['type'] == 'heirarchy':
			self.event_heirarchy(signal)
		else:
			raise RuntimeError('XMLBuilder: unknown event: {}'.format(signal))
	##
	# All heirarchical elements are essentially the same, except that only
	# sections and below have content and have different ID algorithms.
	#
	def event_heirarchy(self, signal):
		typ = signal['subtype']
		enum = signal['enum']
		head = signal['heading']
		text = signal['content']
		status = signal['status']
		# determine subtype when the parser could not
		tag = typ
		if typ is None:
			tag, typ = self.parse_heirarchy_type(enum)
		# create the element with its info attributes
		el = etree.Element(tag)
		el.attrib['title'] = self.get_name(typ, enum)
		el.attrib['id'] = self.get_id(typ, enum)
		if status:
			el.attrib['status'] = status
		if enum:
			etree.SubElement(el, 'num').text = enum
		if head:
			etree.SubElement(el, 'heading').text = head
		if text:
			etree.SubElement(etree.SubElement(el, 'content'), 'p').text = text
		# get parent (only title has no parent) and attach
		parentel = self.get_state_parent(typ)
		if parentel is not None:
			parentel.append(el)
		else:
			logging.debug('event_section no parent: {}'.format(signal))
		# update state
		self.set_state(el, typ)
	##
	# Determine the type of element from its enumeration.
	#
	# XXX is this different per title?
304 | # title 2 calls (b) subdivision and (9) subsection 305 | # 306 | # ------------------------- 307 | # | s | type | 308 | # ------------------------- 309 | # | | paragraph | 310 | # | a | subsection | 311 | # | 1 | subsection | 312 | # | A | subsection | 313 | # | iv | subsection | 314 | # ------------------------- 315 | # 316 | def parse_heirarchy_type(self, s): 317 | if s is None or len(s) == 0: 318 | ret = 'paragraph' 319 | elif s.isdecimal(): 320 | ret = '1' 321 | elif s.isupper(): 322 | ret = 'A' 323 | elif 'i' not in s and 'v' not in s and 'x' not in s: 324 | ret = 'a' 325 | elif s == 'i' and self.state['a'] is not None and self.state['a'][0] == 'h': 326 | ret = 'a' 327 | elif s == 'v' and self.state['a'] is not None and self.state['a'][0] == 'u': 328 | ret = 'a' 329 | elif s == 'x' and self.state['a'] is not None and self.state['a'][0] == 'w': 330 | ret = 'a' 331 | else: 332 | logging.superdebug('heirarchy_type assume roman num: {}'.format(s)) 333 | ret = 'i' 334 | return 'subsection', ret 335 | ## 336 | # 337 | # 338 | def get_name(self, typ, enum): 339 | assert typ is not None 340 | name = typ[0].upper() + typ[1:] 341 | if enum is not None: # XXX if no enum, is this required to be unique? 342 | name += ' ' + enum 343 | return name 344 | ## 345 | # XXX requires non-None parent to have id attribute? 346 | # 347 | def get_id(self, typ, enum): 348 | assert typ is not None and enum is not None 349 | parentel = self.get_state_parent(typ) 350 | if parentel is None: 351 | # XXX only top-most element's parent will be None? 
352 | ident = '/' + typ + '-' + enum 353 | elif typ in {'section'}: 354 | superparentel = self.get_state_top() 355 | if superparentel is None: 356 | logging.critical('get_id superparentel is None') 357 | raise RuntimeError('get_id superparentel is None') 358 | ident = superparentel.attrib['id'] + '/' + typ + '-' + enum 359 | elif XMLBuilder.test_above_section(typ): 360 | ident = parentel.attrib['id'] + '/' + typ + '-' + enum 361 | elif XMLBuilder.test_below_section(typ): 362 | ident = parentel.attrib['id'] + '/' + enum 363 | else: 364 | logging.critical('get_id unknown type: {}'.format(typ)) 365 | raise RuntimeError('get_id unknown type: {}'.format(typ)) 366 | return ident 367 | ## 368 | # Test if type is below section type. 369 | # 370 | # TODO should probably make more reboust to changes in heirarchy tree 371 | # 372 | @staticmethod 373 | def test_below_section(typ): 374 | return typ in {'paragraph', 'a', '1', 'A', 'i'} 375 | ## 376 | # Test if type is below section type. 377 | # 378 | # TODO should probably make more reboust to changes in heirarchy tree 379 | # 380 | @staticmethod 381 | def test_above_section(typ): 382 | return typ in {'title', 'appendix', 'division', 'subdivision', 'chapter', 'subchapter', 'group', 'subgroup', 'article', 'subarticle'} 383 | ## 384 | # Get the lowest non-None element above type, or None if its the highest. 385 | # 386 | def get_state_parent(self, typ): 387 | # get a reversed list of keys above typ 388 | keys = list(self.state.keys()) 389 | keys = reversed(keys[:keys.index(typ)]) 390 | # get bottom-most element above typ 391 | for key in keys: 392 | if self.state[key] is not None: 393 | return self.state[key] 394 | return None 395 | ## 396 | # Get and return the top-most element. 397 | # 398 | def get_state_top(self): 399 | for key in self.state.keys(): 400 | if self.state[key] is not None: 401 | return self.state[key] 402 | ## 403 | # Update (and normalize) state. 
404 | # 405 | def set_state(self, el, typ): 406 | # update state 407 | self.state[typ] = el 408 | # normalize state: clear all elements below type from state 409 | keys = list(self.state.keys()) 410 | keys = keys[keys.index(typ)+1:] 411 | for key in keys: 412 | self.state[key] = None 413 | 414 | ## 415 | # A state machine program that parses a stream of unstructured lines into 416 | # a stream of structured elements. 417 | # 418 | # Its essentially a Mealy machine, whose output is a list of 419 | # structured elements, which are returned on event(). The parse() function 420 | # will drive the event loop and yield each such line. 421 | # 422 | class OOFileParser: 423 | ## 424 | # 425 | # 426 | @staticmethod 427 | def run(inq, outq, errq): 428 | try: 429 | OOFileParser.parse(inq, outq, errq) 430 | except BaseException as e: 431 | logging.critical('OOFileParser.run exception: {} {}'.format(type(e), e), exc_info=True) 432 | outq.put(False) # poison queue 433 | errq.put(False) 434 | ## 435 | # Parse messages from @inq and output resulting messages in @outq. 
436 | # 437 | @staticmethod 438 | def parse(inq, outq, errq): 439 | assert inq is not None and outq is not None 440 | while True: 441 | inmsg = inq.get() 442 | if inmsg is None: # poison pill 443 | outq.put(None) # poison queue 444 | break 445 | elif inmsg is False: 446 | outq.put(False) 447 | errq.put(False) 448 | break 449 | for outmsg in OOFileParser.event(inmsg): 450 | outq.put(outmsg) 451 | ## 452 | # 453 | # 454 | @staticmethod 455 | def event(signal): 456 | line, style = signal['line'], signal['style'] 457 | logging.uberdebug('event line: {} {}'.format(style, repr(line))) 458 | if style in {'LVL0', 'LVL1', 'LVL2', 'LVL3', 'LVL4', 'LVL5', 'LVL6', 'SECTION', 'APPENDIX'}: 459 | return OOFileParser.event_org(signal) 460 | elif style in {'NOTEP', 'HISTP'}: 461 | return OOFileParser.event_nh(signal) 462 | elif style in {'SECTION PARAGRAPH', 'ANOTEP'}: 463 | return OOFileParser.event_subsec(signal) 464 | else: 465 | return [] 466 | ## 467 | # 468 | # 469 | @staticmethod 470 | def event_org(signal): 471 | line, style = signal['line'], signal['style'] 472 | typ, enum, head, status = OOFileParser.tokenize_org(line, style) 473 | output = {'type': 'heirarchy', 'subtype': typ, 'name': head, 'enum': enum, 'heading': head, 'content': None, 'status': status} 474 | return [output] 475 | ## 476 | # 477 | # 478 | @staticmethod 479 | def event_nh(signal): 480 | line, style = signal['line'], signal['style'] 481 | if style in {'NOTEP'}: 482 | typ = 'note' 483 | elif style in {'HISTP'}: 484 | typ = 'history' 485 | output = {'type': typ, 'subtype': None, 'name': None, 'enum': None, 'heading': None, 'content': line, 'status': None} 486 | return [output] 487 | ## 488 | # 489 | # 490 | @staticmethod 491 | def event_subsec(signal): 492 | line = signal['line'] 493 | enum, text = OOFileParser.tokenize_subsec(line) 494 | output = {'type': 'heirarchy', 'subtype': None, 'name': None, 'enum': enum, 'heading': None, 'content': text, 'status': None} 495 | return [output] 496 | ## 497 | # XXX 
fix return values on error to return something reasonable 498 | # 499 | @staticmethod 500 | def tokenize_org(line, style): 501 | orgre = '(TITLE|Division|Part|Subdivision|Chapter|Subchapter|Group|Subgroup|Article|Subarticle|Appendix)\s+(\d+.*)\.\s*(.*)\s+(\[Repealed\]|\[Renumbered\]|\[Reserved\])*\**' 502 | orgrens = '(TITLE|Division|Part|Subdivision|Chapter|Subchapter|Group|Subgroup|Article|Subarticle|Appendix)\s+(\d+.*)\.\s*(.*)' 503 | appre = 'Appendix\s(.+?)\s*(.*)' 504 | secre = '§(\d+.*?)\.\s(.*?)\.\s*(\[Repealed\]|\[Renumbered\]|\[Reserved\])*' 505 | secrenp = '§(\d+.*?)\.\s(.*)' # NOTE some section headings don't end with a period 506 | secrenp2 = '§(\d+.*?)\s(.*)\.*' 507 | 508 | typ = None 509 | enum = None 510 | head = None 511 | status = None 512 | 513 | if style in {'SECTION'}: 514 | typ = 'section' 515 | m = re.match(secre, line) 516 | if m is None: 517 | logging.warning('tokenize: {} did not match secre on {}'.format(style, line)) 518 | m = re.match(secrenp, line) 519 | if not m: 520 | logging.warning('tokenize: {} did not match secrenp on {}'.format(style, line)) 521 | m = re.match(secrenp2, line) 522 | if not m: 523 | logging.warning('tokenize: {} did not match secrenp2 on {}'.format(style, line)) 524 | head = line 525 | else: 526 | enum, head, status = OOFileParser.tokenize_org_groups(m.groups()) 527 | else: 528 | enum, head, status = OOFileParser.tokenize_org_groups(m.groups()) 529 | else: 530 | enum, head, status = OOFileParser.tokenize_org_groups(m.groups()) 531 | else: 532 | if re.search('\[|\*', line): 533 | m = re.match(orgre, line) 534 | else: 535 | m = re.match(orgrens, line) 536 | if not m: 537 | m = re.match(appre, line) 538 | if not m: 539 | logging.warning('tokenize: {} did not match appre on {}'.format(style, line)) 540 | head = line 541 | else: 542 | typ = 'appendix' 543 | enum = m.group(1) 544 | head = m.group(2) 545 | else: 546 | groups = m.groups() 547 | typ = html.escape(groups[0]).lower() 548 | enum = html.escape(groups[1]) 549 | 
head = html.escape(groups[2]).rstrip('.') # XXX why is the period being included? 550 | if len(groups) == 4 and groups[3]: 551 | status = OOFileParser.parse_status(groups[3].strip('[]*').lower()) 552 | else: 553 | status = None 554 | return (typ, enum, head, status) 555 | ## 556 | # 557 | # 558 | @staticmethod 559 | def tokenize_org_groups(groups): 560 | enum = head = status = None 561 | if groups[0] is not None: 562 | enum = html.escape(groups[0]) 563 | if groups[1] is not None: 564 | head = html.escape(groups[1])+'.' 565 | else: 566 | logging.warning('tokenize: secre.group(2) is None') 567 | head = line 568 | if len(groups) > 2 and groups[2] is not None: 569 | status = OOFileParser.parse_status(groups[2].strip('[]*').lower()) 570 | return (enum, head, status) 571 | ## 572 | # 573 | # 574 | @staticmethod 575 | def parse_status(s): 576 | s = s.strip('[]*').lower() 577 | status = 'unknown' 578 | if s == 'repealed': 579 | status = 'removed' 580 | elif s == 'renumbered': 581 | status = 'removed' 582 | elif s == 'reserved': 583 | status = 'incomplete' 584 | return status 585 | ## 586 | # 587 | # 588 | @staticmethod 589 | def tokenize_subsec(line): 590 | if line.startswith('('): 591 | enums, rest = line.split(')',1) 592 | enum = enums.lstrip('(') 593 | text = rest.strip() 594 | else: 595 | enum = '' # XXX this will be used in ID 596 | text = line 597 | return enum, text 598 | 599 | ## 600 | # 601 | # 602 | class OOFile(): 603 | ## 604 | # 605 | # 606 | @staticmethod 607 | def run(pipename, fn, errq, outq): 608 | try: 609 | OOFile.parse(pipename, fn, errq, outq) 610 | except BaseException as e: 611 | logging.critical('OOFile.run exception: {} {}'.format(type(e), e)) 612 | outq.put(False) # poison queue 613 | ## 614 | # Open file using desktop and parse and enqueue messages representing paragraphs. 
615 | # 616 | @staticmethod 617 | def parse(pipename, fn, errq, outq): 618 | assert fn is not None and outq is not None 619 | doc = None 620 | # get desktop 621 | desktop = OOFile.connect_soffice(pipename) 622 | if not desktop: 623 | logging.critical('OOFile.parse no desktop') 624 | outq.put(False) 625 | return 626 | # open file 627 | url = unohelper.systemPathToFileUrl(os.path.abspath(fn)) 628 | try: 629 | doc = desktop.loadComponentFromURL(url ,'_blank', 0, (uno.createUnoStruct('com.sun.star.beans.PropertyValue', 'ReadOnly', 0, True, 0),)) 630 | except uno.getClass('com.sun.star.lang.IllegalArgumentException') as e: 631 | logging.critical('OOFile.parse file not found: {}'.format(filename)) 632 | outq.put(False) 633 | return 634 | except uno.getClass('com.sun.star.lang.DisposedException') as e: 635 | logging.critical('OOFile.parse desktop bridge died: {}'.format(e)) 636 | outq.put(False) 637 | return 638 | except uno.getClass('com.sun.star.uno.RuntimeException') as e: 639 | logging.critical('OOFile.parse desktop exception: {}'.format(e)) 640 | outq.put(False) 641 | return 642 | if doc is None: 643 | logging.critical('OOFile.parse doc is None') 644 | outq.put(False) 645 | return 646 | # get the com.sun.star.text.Text service and get an XEnumeration of com.sun.star.text.Paragraph objects from the XEnumerationAccess 647 | for para in OOFile.XEnumeration(doc.getText()): 648 | lmargin = None 649 | adjust = None 650 | weightn = -1 651 | style = None 652 | # skip non-paragraphs 653 | if not para.supportsService('com.sun.star.text.Paragraph'): 654 | continue 655 | # get left margin 656 | if para.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(para, 'ParaLeftMargin'): 657 | lmargin = para.ParaLeftMargin 658 | # get adjustment 659 | if para.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(para, 'ParaAdjust'): 660 | adjustn = para.ParaAdjust 661 | ss = [] 662 | # get an XEnumeration of com.sun.star.text.TextPortion objects 663 | for 
portion in OOFile.XEnumeration(para): 664 | # skip non-text portions 665 | if portion.TextPortionType != 'Text': 666 | continue 667 | # get portion string 668 | ss.append(portion.getString()) 669 | # get the last portion's weight 670 | if portion.supportsService('com.sun.star.style.CharacterProperties') and hasattr(portion, 'CharWeight'): 671 | weightn = portion.CharWeight 672 | # get the last portion's style 673 | if portion.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(portion, 'ParaStyleName'): 674 | style = portion.ParaStyleName # XXX need to strip? 675 | # interpret data 676 | s = str.join('', ss) 677 | if adjustn == 3: # com.sun.star.style.ParagraphAdjust 678 | adjust = 'center' 679 | elif adjustn == 0: 680 | adjust = 'left' 681 | elif adjustn == 2: 682 | adjust = 'block' 683 | else: 684 | logging.warning('OOFile.parse unknown adjust: {}'.format(adjustn)) 685 | adjust = None 686 | if round(weightn) == 100: # com.sun.star.awt.FontWeight 687 | weight = 'normal' 688 | elif round(weightn) == 150: 689 | weight = 'bold' 690 | elif weightn == -1: 691 | weight = None 692 | else: 693 | logging.warning('OOFile.parse unknown weight: {}'.format(weightn)) 694 | weight = None 695 | message = {'line': s, 'adjust': adjust, 'lmargin': lmargin, 'weight': weight, 'style': style} 696 | # check for error message 697 | if errq.qsize() > 0: 698 | try: 699 | inmsg = errq.get(block=False) 700 | OOFile.close(doc) 701 | outq.put(False) # poison output queue and exit 702 | return 703 | except queue.Empty as e: 704 | logging.warning('OOFile.parse errq size weirdness') 705 | # enqueue message 706 | outq.put(message) 707 | # close file 708 | OOFile.close(doc) 709 | # poison queue 710 | outq.put(None) 711 | ## 712 | # 713 | # 714 | @staticmethod 715 | def close(doc): 716 | # See . 
717 | logging.debug('OOFile closing: {}'.format(doc)) 718 | try: 719 | if doc is not None: 720 | # XXX we should check for the com.sun.star.util.XCloseable interface first 721 | doc.close(True) 722 | except uno.getClass('com.sun.star.lang.DisposedException') as e: 723 | logging.critical('OOFile.parse uno.DisposedException: {} {}'.format(doc, e)) 724 | except uno.getClass('com.sun.star.uno.RuntimeException') as e: 725 | logging.critical('OOFile.parse uno.RuntimeException: {} {}'.format(doc, e)) 726 | ## 727 | # Get an XEnumeration of objects from a given object supporting the 728 | # XEnumerationAccess interface. 729 | # 730 | @staticmethod 731 | def XEnumeration(obj): 732 | xenum = obj.createEnumeration() 733 | while xenum.hasMoreElements(): 734 | yield xenum.nextElement() 735 | ## 736 | # TODO make POSIX/Windows agnostic, e.g., USERPROFILE instead of HOME. 737 | # 738 | @staticmethod 739 | def start_soffice(pipename=None): 740 | if pipename is None: 741 | pipename = 'officepipe-'+next(tempfile._RandomNameSequence()) 742 | tmpd = tempfile.TemporaryDirectory() 743 | cmd = 'soffice --accept="pipe,name='+pipename+';urp;StarOffice.ServiceManager" --norestore --nologo --headless --nolockcheck' 744 | p = subprocess.Popen(shlex.split(cmd), env={"HOME": tmpd.name}, stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 745 | return pipename, tmpd, p 746 | 747 | ## 748 | # Connect to a running soffice instance and return a XDesktop object. 
    #
    @staticmethod
    def connect_soffice(pipename, tries=5, sleep=5):
        # Resolve the UNO bridge over the named pipe, retrying up to
        # @tries times with @sleep seconds between failed attempts.
        # Returns the XDesktop object, or None if no connection was made.
        context = None
        desktop = None
        local = uno.getComponentContext()
        resolver = local.ServiceManager.createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', local)
        resolvurl = 'uno:pipe,name='+pipename+';urp;StarOffice.ComponentContext'
        # NOTE(review): there is no break after a successful resolve, so
        # the loop re-resolves on every iteration — confirm intended.
        for i in range(tries):
            try:
                context = resolver.resolve(resolvurl)
                if context is not None:
                    desktop = context.ServiceManager.createInstanceWithContext('com.sun.star.frame.Desktop', context)
            except uno.getClass('com.sun.star.lang.DisposedException') as e:
                # bridge is gone for good; retrying cannot help
                logging.critical('OOFile.connect_soffice bridge died: {}'.format(e))
                break
            except uno.getClass('com.sun.star.connection.NoConnectException') as e:
                # soffice may still be starting up; wait and retry
                logging.debug('OOFile.connect_soffice failed to connect {} / {}'.format(i+1, tries))
                time.sleep(sleep)
        if context is None or desktop is None:
            logging.warning('OOFile.connect_soffice failed to connect')
        return desktop

    ##
    # Ask the desktop to terminate its soffice process. Returns True on
    # success, False when there is no desktop or termination failed.
    #
    @staticmethod
    def terminate_soffice(desktop):
        if desktop is None:
            logging.debug('OOFile.terminate_soffice desktop None')
            return False
        try:
            desktop.terminate() # kills soffice
        except uno.getClass('com.sun.star.lang.DisposedException') as e: # XXX needed?
            logging.critical('OOFile.terminate_soffice uno.DisposedException: {} {}'.format(desktop, e))
            return False
        except uno.getClass('com.sun.star.uno.RuntimeException') as e:
            logging.critical('OOFile.terminate_soffice uno.RuntimeException: {} {}'.format(desktop, e))
            return False
        return True

# do it
if __name__ == "__main__":
    sys.exit(main())

--------------------------------------------------------------------------------
/src/us-co/scrape-statcode-us-co:
--------------------------------------------------------------------------------
#!/usr/bin/python3 -uW all
# -*- coding: utf-8 -*-

usage="""
scrape-statcode-us-co - convert the Colorado Revised Statutes into Akoma Ntoso

See . Given titles as
Open XML files, in order, this will output them as an Akoma Ntoso XML file.

Usage: scrape-statcode-us-co [options] file
Arguments:

  file       input .docx file from the bulk download site
  -o file    output file ('-' for stdout) (default: stdout)
  -c string  The StarOffice connect-string (default: 'socket,host=localhost,port=2002,tcpNoDelay=1')
  -d         enable debuging output (default: warnings only)

To run LibreOffice:

  soffice --writer --accept='socket,host=localhost,port=2002,tcpNoDelay=1;urp;' --norestore --nologo --headless --nolockcheck
"""

import sys
import os
import getopt
import lxml.etree as etree
import uno
import unohelper
import logging
import mimetypes
import enum
import collections
import threading
import queue
import types
import re

# module-level logger; main() monkey-patches superdebug/uberdebug onto it
logger = logging.getLogger(__name__)

##
# Parse parameters, call processing function.
#
def main():
    """Parse command-line arguments, run the conversion, and return a
    process exit status (0 on success)."""
    fout = sys.stdout.buffer
    debug = logging.INFO
    connect_string = 'uno:socket,host=localhost,port=2002,tcpNoDelay=1;urp;StarOffice.ComponentContext'
    fn = None
    logger.SUPERDEBUG = logging.DEBUG-2 # XXX monkey fix
    logger.UBERDEBUG = logging.DEBUG-4

    # Configure logging
    logging.basicConfig(format='{levelname} {process}/{thread}/{funcName} {message}', style='{', level=debug)
    logging.addLevelName(logger.SUPERDEBUG, 'SUPERDEBUG')
    logging.addLevelName(logger.UBERDEBUG, 'UBERDEBUG')
    logger.superdebug = lambda msg, *args, **kwargs: logger.log(logger.SUPERDEBUG, msg, *args, **kwargs)
    logger.uberdebug = lambda msg, *args, **kwargs: logger.log(logger.UBERDEBUG, msg, *args, **kwargs)

    # Parse arguments
    try:
        # BUG fix: '--debug' and '--help' are tested below but were never
        # declared to getopt, so passing them raised GetoptError.
        opts, args = getopt.getopt(sys.argv[1:], 'o:c:dh', ['debug', 'help'])
    except getopt.GetoptError:
        logger.fatal(f"getopt error {usage}")
        return 1

    for opt, arg in opts:
        if opt in {'-d', '--debug'}:
            # each additional -d steps one level more verbose
            if debug is logging.INFO:
                debug = logging.DEBUG
            elif debug is logging.DEBUG:
                debug = logger.SUPERDEBUG
            elif debug is logger.SUPERDEBUG:
                debug = logger.UBERDEBUG
            else:
                logger.warning("main unknown debugging level")
                debug = logging.DEBUG
        elif opt in {'-o'}:
            # BUG fix: usage documents '-' as stdout, but it was previously
            # taken literally as a file named '-'.
            fout = sys.stdout.buffer if arg == '-' else arg
        elif opt in {'-c'}:
            connect_string = f"uno:{arg};urp;StarOffice.ComponentContext"
        elif opt in {'-h', '--help'}:
            print(usage)
            return 0
        else:
            logger.fatal(f"invalid flag {opt}{usage}")
            return 1

    logger.setLevel(debug)
    if len(args) != 1:
        logger.fatal(f"Missing input file {usage}")
        return 1
    fn = args[0]

    # Verify file type. This fails on Windows?
    mime = mimetypes.guess_type(fn)
    if mime[0] not in ('application/rtf','application/vnd.openxmlformats-officedocument.wordprocessingml.document') and os.name != 'nt':
        logger.critical(f"Unknown filetype: {mime} {fn}")
        return 2

    xml = do_parse(connect_string, fn)
    if xml is None:
        logger.critical("Parsing returned no XML")
        return 3

    tree = etree.ElementTree(xml)
    tree.write(fout)

    return 0

##
# Parse a file and return Akoma Ntoso XML.
#
def do_parse(connect_string, fn):
    """Run the OOFile -> OOFileParser -> XMLBuilder thread pipeline over
    @fn and return the akomaNtoso root element, or None on failure."""
    logger.info('do_parse parsing: {}'.format(fn))

    akn = etree.Element('akomaNtoso', nsmap={None: "http://docs.oasis-open.org/legaldocml/ns/akn/3.0", "xsi": "http://www.w3.org/2001/XMLSchema-instance"})
    akn.attrib['{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'] = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0 ../schemas/akomantoso30.xsd"
    act = etree.SubElement(akn, 'act')
    meta = etree.SubElement(act, 'meta')
    body = etree.SubElement(act, 'body')

    #css = etree.ProcessingInstruction("xml-stylesheet", text='type="text/css" href="akn.css"')
    #akn.addprevious(css)

    try:
        paraq = queue.Queue(100)
        xmlq = queue.Queue(50)
        outq = queue.Queue() # XXX should we bound these?
        errq = queue.Queue()

        # open files, build threads
        filethread = threading.Thread(target=OOFile.run, args=(connect_string, fn, errq, paraq))
        parserthread = threading.Thread(target=OOFileParser.run, args=(paraq, xmlq, errq))
        builderthread = threading.Thread(target=XMLBuilder.run, args=(xmlq, outq, errq))

        # parse
        builderthread.start()
        parserthread.start()
        filethread.start()
        xml = outq.get()
        # TODO implement better queue poisoning. Maybe use exception objects?
        # BUG fix: also guard against None (an empty build) — appending
        # None to an lxml element raises TypeError.
        if xml is not None and xml is not False:
            body.append(xml)
    except OSError as e:
        logger.critical('do_parse opening files: {}'.format(e))
        return None
    except BaseException as e:
        logger.critical('do_parse exception: {} {}'.format(type(e), e))
        return None

    # Wait for completion of threads. Is this necessary?
    logger.info('joining threads: {}'.format(fn))
    filethread.join()
    parserthread.join()
    builderthread.join()

    # return
    return akn

##
# A state machine that parses a stream of semi-structured document lines
# into partial Akoma Ntoso XML. The parse() function will consume the input
# and output an XML object.
#
class XMLBuilder:
    def __init__(self):
        # `state' is an ordered dictionary with the top-most
        # element of each type, which represents our heirarchy
        # of elements
        self.state = collections.OrderedDict()
        self.state['title'] = None
        self.state['article'] = None
        self.state['part'] = None
        self.state['subpart'] = None
        self.state['section'] = None
        self.state['subsection'] = None
        self.state['paragraph'] = None
        self.state['subparagraph'] = None
        self.state['subsubparagraph'] = None
        # most recently attached element; receives free-floating text
        self.last = None
    ##
    # Thread entry point: build a fresh builder and drain @inq.
    # Any failure poisons both @outq and @errq with False.
    #
    @staticmethod
    def run(inq, outq, errq):
        try:
            builder = XMLBuilder()
            builder.parse(inq, outq, errq)
        except BaseException as e:
            logger.critical('XMLBuilder.run exception: {} {}'.format(type(e), e), exc_info=True)
            outq.put(False) # poison queue
            errq.put(False)
    ##
    # Parse all messages in @inq and return an XML object.
193 | # 194 | def parse(self, inq, outq, errq): 195 | assert inq is not None 196 | # process messages 197 | while True: 198 | msg = inq.get() 199 | if msg is None: # EOF 200 | outq.put(self.get_state_top()) 201 | break 202 | elif msg is False: # poison pill 203 | outq.put(False) # poison queue 204 | errq.put(False) 205 | break 206 | logger.superdebug('XMLBuilder.parse: {}'.format(msg)) 207 | self.event(msg) 208 | ## 209 | # Process a signal. 210 | # 211 | def event(self, signal): 212 | typ,subtype = signal['type'], signal['subtype'] 213 | if typ in {'heirarchy'}: 214 | self.event_heirarchy(signal) 215 | elif typ in {'text'}: 216 | self.event_text(signal) 217 | else: 218 | raise RuntimeError('XMLBuilder: unknown event: {}'.format(signal)) 219 | ## 220 | # Append the p/text() to the current heirarchy element. If the current heirarchy element does not have a content child element, create it. 221 | # 222 | # Right now only transition_text is emitting these messages. 223 | # 224 | def event_text(self, signal): 225 | text = signal['content'] 226 | last_el = self.get_last() 227 | if last_el is None: 228 | logger.warning(f'ignoring text {text}') 229 | return 230 | # Get or create content element. 231 | content_el = last_el.find("./content") 232 | if content_el is None: 233 | content_el = etree.SubElement(last_el, 'content') 234 | # Append new p element. 235 | pel = etree.SubElement(content_el, 'p') 236 | pel.text = text 237 | ## 238 | # All heirarchical elements are essentially the same, except that only 239 | # sections and below have content and have different ID algorithms. 
240 | # 241 | def event_heirarchy(self, signal): 242 | typ, enum, head, text, status = signal['subtype'], signal['enum'], signal['heading'], signal['content'], signal['status'] 243 | # determine subtype 244 | if typ is None: 245 | typ = self.parse_heirarchy_type(enum) 246 | # create element 247 | el = etree.Element(typ) 248 | # info 249 | tel = None 250 | # el.attrib['title'] = self.get_name(typ, enum) 251 | el.attrib['eId'] = self.get_id(typ, enum) 252 | if status: 253 | el.attrib['status'] = status 254 | if enum: 255 | nel = etree.SubElement(el, 'num') 256 | nel.text = enum 257 | if head: 258 | hel = etree.SubElement(el, 'heading') 259 | hel.text = head 260 | if text: 261 | tel = etree.SubElement(el, 'content') 262 | pel = etree.SubElement(tel, 'p') 263 | pel.text = text 264 | # get parent (only title has no parent) and attach 265 | parentel = self.get_state_parent(typ) 266 | if parentel is not None: 267 | parentel.append(el) 268 | else: 269 | logger.warning('event_section no parent: {}'.format(signal)) 270 | # update state 271 | self.set_state(el, typ) 272 | ## 273 | # Determine the type of element from its enumeration. 274 | # 275 | # Note that 'I' may be a subparagraph, or it may be a 276 | # sub-subparagraph that comes after 'H' etc. 
277 | # 278 | # ------------------------- 279 | # | s | type | 280 | # ------------------------- 281 | # | 1 | subsection | 282 | # | a | paragraph | 283 | # | IV | subparagraph | 284 | # | A | sub-subparagraph | 285 | # ------------------------- 286 | # 287 | def parse_heirarchy_type(self, s): 288 | ret = 'subsection' 289 | if s.isdecimal(): 290 | ret = 'subsection' 291 | elif s.islower(): 292 | ret = 'paragraph' 293 | elif 'I' not in s and 'V' not in s and 'X' not in s: 294 | ret = 'subsubparagraph' 295 | elif s == 'I' and self.state['subsubparagraph'] is not None and self.state['subsubparagraph'][0] == 'H': 296 | ret = 'subsubparagraph' 297 | elif s == 'V' and self.state['subsubparagraph'] is not None and self.state['subsubparagraph'][0] == 'U': 298 | ret = 'subsubparagraph' 299 | elif s == 'X' and self.state['subsubparagraph'] is not None and self.state['subsubparagraph'][0] == 'W': 300 | ret = 'subdivision' 301 | else: 302 | logger.superdebug('heirarchy_type assume roman num: {}'.format(s)) 303 | ret = 'subparagraph' 304 | return ret 305 | ## 306 | # 307 | # 308 | def get_name(self, typ, enum): 309 | assert typ is not None 310 | name = typ[0].upper() + typ[1:] 311 | if enum is not None: # XXX if no enum, is this required to be unique? 312 | name += ' ' + enum 313 | return name 314 | ## 315 | # XXX requires non-None parent to have id attribute? 316 | # 317 | def get_id(self, typ, enum): 318 | assert typ is not None and enum is not None 319 | parentel = self.get_state_parent(typ) 320 | if parentel is None: 321 | # XXX only top-most element's parent will be None? 
322 | ident = typ + '-' + enum 323 | elif typ in {'section'}: 324 | ident = typ + '-' + enum 325 | elif XMLBuilder.test_above_section(typ): 326 | ident = parentel.attrib['eId'] + '-' + typ + '-' + enum 327 | elif XMLBuilder.test_below_section(typ): 328 | ident = parentel.attrib['eId'] + '-' + enum 329 | else: 330 | logger.critical('get_id unknown type: {}'.format(typ)) 331 | raise RuntimeError('get_id unknown type: {}'.format(typ)) 332 | return ident 333 | ## 334 | # Test if type is below section type. 335 | # 336 | # TODO should probably make more reboust to changes in heirarchy tree 337 | # 338 | @staticmethod 339 | def test_below_section(typ): 340 | return typ in {'subsection', 'paragraph', 'subparagraph', 'subsubparagraph'} 341 | ## 342 | # Test if type is below section type. 343 | # 344 | # TODO should probably make more reboust to changes in heirarchy tree 345 | # 346 | @staticmethod 347 | def test_above_section(typ): 348 | return typ in {'title', 'article', 'part', 'subpart'} 349 | ## 350 | # Get the lowest non-None element above type, or None if its the highest. 351 | # 352 | def get_state_parent(self, typ): 353 | # get a reversed list of keys above typ 354 | keys = list(self.state.keys()) 355 | keys = reversed(keys[:keys.index(typ)]) 356 | # get bottom-most element above typ 357 | for key in keys: 358 | if self.state[key] is not None: 359 | return self.state[key] 360 | return None 361 | ## 362 | # Get and return the top-most element. 363 | # 364 | def get_state_top(self): 365 | for key in self.state.keys(): 366 | if self.state[key] is not None: 367 | return self.state[key] 368 | ## 369 | # Get the last heirarchy element that was set. 370 | # 371 | def get_last(self): 372 | return self.last 373 | ## 374 | # Set (normalize and update) state. 375 | # 376 | # NOTE: Setting this will change which element gets the current text. 
    ##
    # Set (normalize and update) state.
    #
    # NOTE: Setting this will change which element gets the current text.
    #
    # @param el the element to record for @typ
    # @param typ the heirarchy type key into self.state
    #
    def set_state(self, el, typ):
        # update state
        self.state[typ] = el
        # normalize state: clear all elements below type from state
        # (relies on self.state preserving heirarchy order top-to-bottom)
        keys = list(self.state.keys())
        keys = keys[keys.index(typ)+1:]
        for key in keys:
            self.state[key] = None
        # Reset the latest
        self.last = el

# Matches a section line of the form 'T-A-S.  Heading.  body text',
# capturing (enum, heading, text).
regex_sec = r'^(\d+\-[\d\.]+\-[\d\.]+)\.\s\s+(.+?)\.\s+(.+)'

##
# Line-oriented state machine over document paragraph messages.
#
# Here we do essential data processing.
#
# @input A stream of document lines.
# @output A stream of heirarchy and text element data.
#
class OOFileParser:
    ##
    # Parser states.
    #
    class StateEnum(enum.IntEnum):
        init = 1 # TODO needed?
        idle = 3
        heirarchy = 4
        section = 5
        section_idle = 6
        section_note = 7
        section_note_one = 8
        section_note_two = 9
        text = 10
    ##
    # Initialize parser state.
    #
    def __init__(self):
        self.state = self.StateEnum.init
        # True when the previous non-idle line ended with ':' -- used to
        # avoid mistaking centered continuation text for subheaders
        self.last_line_ended_with_colon = False
        # pending heirarchy output signal, flushed when leaving the
        # heirarchy state
        self.stash = None
    ##
    # Process entry point: construct a parser and run it, poisoning the
    # queues on any exception.
    #
    @staticmethod
    def run(inq, outq, errq):
        try:
            parser = OOFileParser()
            parser.parse(inq, outq, errq)
        except BaseException as e:
            logger.critical('OOFileParser.run exception: {} {}'.format(type(e), e), exc_info=True)
            outq.put(False) # poison queue
            errq.put(False)
    ##
    # Parse messages from @inq and output resulting messages in @outq.
    #
    # None is the normal end-of-stream pill; False signals an upstream
    # error and is propagated to both queues.
    #
    def parse(self, inq, outq, errq):
        assert inq is not None and outq is not None
        while True:
            inmsg = inq.get()
            if inmsg is None: # poison pill
                outq.put(None) # poison queue
                break
            elif inmsg is False:
                outq.put(False)
                errq.put(False)
                break
            for outmsg in self.event(inmsg):
                outq.put(outmsg)
450 | # 451 | # The event function is chosen by current state; the transition 452 | # function is (then) chosen by current state and the signal. 453 | # 454 | def event(self, signal): 455 | # XXX strip line 456 | signal['line'] = signal['line'].strip() 457 | # XXX fixups 458 | signal['line'] = OOFileParser.fixup(signal['line']) 459 | # 460 | if self.state == self.StateEnum.init: 461 | ret = self.event_init(signal) 462 | elif self.state == self.StateEnum.idle: 463 | ret = self.event_idle(signal) 464 | elif self.state == self.StateEnum.heirarchy: 465 | ret = self.event_heirarchy(signal) 466 | elif self.state == self.StateEnum.section: 467 | ret = self.event_section(signal) 468 | elif self.state == self.StateEnum.section_note: 469 | ret = self.event_section_note(signal) 470 | elif self.state == self.StateEnum.section_note_one: 471 | ret = self.event_section_note_one(signal) 472 | elif self.state == self.StateEnum.section_note_two: 473 | ret = self.event_section_note_two(signal) 474 | elif self.state == self.StateEnum.text: 475 | ret = self.event_text(signal) 476 | # XXX keep track of centered text preceeded by lines ending with ':' 477 | if self.state != self.StateEnum.idle: 478 | self.last_line_ended_with_colon = signal['line'].endswith(':') 479 | # 480 | return ret 481 | def event_init(self, signal): 482 | logger.uberdebug('init') 483 | # XXX skip first line 484 | return self.transition_idle(signal) 485 | def event_idle(self, signal): 486 | logger.uberdebug('idle') 487 | line, adjust, lmargin, weight, align = signal['line'], signal['adjust'], signal['lmargin'], signal['weight'], signal['align'] 488 | if line == '': 489 | return self.transition_self(signal) 490 | elif lmargin > 0: 491 | return self.transition_text(signal) 492 | elif OOFileParser.test_sec(line, adjust): 493 | return self.transition_section(signal) 494 | elif OOFileParser.test_heirarchy(line): 495 | return self.transition_heirarchy(signal) 496 | elif OOFileParser.test_anonymous_heirarchy(line, adjust, 
weight): 497 | # XXX skip anonymous heirarchies 498 | return self.transition_self(signal) 499 | # XXX should we only be able to enter subheader state 500 | # from heirarchy state to prevent mistaking text for subheaders? 501 | elif (adjust == 'center' or align == 'CENTER') and self.last_line_ended_with_colon is False: 502 | return self.transition_heirarchy_subheader(signal) 503 | else: 504 | # assume text attached to previous section/subsection 505 | return self.transition_text(signal) 506 | ## 507 | # NOTE if we transition away, flush stashed output signal. 508 | # 509 | def event_heirarchy(self, signal): 510 | logger.uberdebug('heirarchy') 511 | line, adjust, weight, align = signal['line'], signal['adjust'], signal['weight'], signal['align'] 512 | if line == '': 513 | # don't transition because we may get subheader 514 | return self.transition_self(signal) 515 | elif OOFileParser.test_sec(line, adjust): 516 | return self.transition_heirarchy_flush(self.transition_section, signal) 517 | elif OOFileParser.test_heirarchy(line): 518 | return self.transition_heirarchy_flush(self.transition_heirarchy, signal) 519 | elif OOFileParser.test_anonymous_heirarchy(line, adjust, weight): 520 | # XXX skip anonymous heirarchies 521 | return self.transition_self(signal) 522 | elif adjust == 'center' or align == 'CENTER': # XXX should we test on last_line_ended_with_colon? 523 | return self.transition_heirarchy_subheader(signal) 524 | else: # XXX is there something better to do here? will a subheader ever not be centered? 525 | return self.transition_heirarchy_flush(self.transition_text, signal) 526 | ## 527 | # event_section 528 | # 529 | def event_section(self, signal): 530 | logger.uberdebug('section') 531 | line, adjust = signal['line'], signal['adjust'] 532 | if line == '': 533 | return self.transition_idle(signal) 534 | # XXX put fixups into fixups()? 
535 | elif line.endswith('\xa0weeks') or line == 'the use of an artificial limb': 536 | # fixup 8-42-107 537 | return self.transition_text(signal) 538 | elif line.startswith('$'): 539 | # fixup 9-4-109 540 | return self.transition_text(signal) 541 | elif OOFileParser.test_sec(line, adjust): 542 | return self.transition_section(signal) 543 | else: # XXX 544 | return self.transition_text(signal) 545 | def event_text(self, signal): 546 | logger.uberdebug('text') 547 | line, adjust, lmargin = signal['line'], signal['adjust'], signal['lmargin'] 548 | if line == '': 549 | return self.transition_self(signal) 550 | elif lmargin > 0: 551 | return self.transition_text(signal) 552 | elif line.endswith('\xa0weeks') or line == 'the use of an artificial limb': 553 | # XXX fixup 8-42-107 554 | return self.transition_text(signal) 555 | elif line.startswith('$'): 556 | # fixup various 557 | return self.transition_text(signal) 558 | elif OOFileParser.test_sec(line, adjust): 559 | return self.transition_section(signal) 560 | elif OOFileParser.test_heirarchy(line): 561 | return self.transition_heirarchy(signal) 562 | else: 563 | # assume text attached to previous section/subsection 564 | return self.transition_text(signal) 565 | def transition_self(self, signal): 566 | logger.uberdebug('self: {}'.format(signal)) 567 | return [] 568 | def transition_idle(self, signal): 569 | logger.uberdebug('idle: {}'.format(signal)) 570 | self.state = self.StateEnum.idle 571 | return [] 572 | ## 573 | # Stash the output signal away and flush it when we leave the 574 | # heirarchy state. 
575 | # 576 | def transition_heirarchy(self, signal): 577 | logger.superdebug('heirarchy: {}'.format(signal)) 578 | line = signal['line'] 579 | typ,enum = line.split(' ',1) 580 | typ = typ.lower() 581 | output = {'type': 'heirarchy', 'subtype': typ, 'name': line, 'enum': enum, 'heading': None, 'content': None, 'status': None} 582 | self.stash = output 583 | self.state = self.StateEnum.heirarchy 584 | return [] 585 | ## 586 | # Append input signal information to stashed output signal. 587 | # 588 | # XXX Always guard against anonymous heirarchies to avoid 589 | # crashes on lack of incomplete heirarchy in stash. 590 | # 591 | def transition_heirarchy_subheader(self, signal): 592 | logger.superdebug('subheader: {}'.format(signal)) 593 | if self.stash is not None: 594 | line, weight = signal['line'], signal['weight'] 595 | head, status = OOFileParser.parse_subheader(line) 596 | if head is not None: 597 | if self.stash['heading'] is not None: 598 | self.stash['heading'] += ' ' + head 599 | else: 600 | self.stash['heading'] = head 601 | if status is not None: 602 | if self.stash['status'] is not None: 603 | self.stash['status'] += ' ' + status 604 | else: 605 | self.stash['status'] = status 606 | else: 607 | logger.warning('subheader stash is None') 608 | self.state = self.StateEnum.heirarchy 609 | return [] 610 | ## 611 | # Flush stashed output signal 612 | # 613 | def transition_heirarchy_flush(self, f, signal): 614 | logger.uberdebug(f'h_flush {signal}') 615 | assert isinstance(f, types.MethodType) 616 | sig = self.stash 617 | logger.uberdebug(f'h_flush {sig}') 618 | self.stash = None 619 | ret = f(signal) 620 | ret.insert(0, sig) 621 | logger.uberdebug(f'h_flush {ret}') 622 | return ret 623 | def transition_section(self, signal): 624 | logger.uberdebug('section: {}'.format(signal)) 625 | sec = OOFileParser.tokenize_section(signal['line']) # return enum, head, status, text, subsecl 626 | logger.uberdebug('section sec: {sec}') 627 | ret = [{'type': 'heirarchy', 
'subtype': 'section', 'name': None, 'enum': sec[0], 'heading': sec[1], 'content': sec[3], 'status': sec[2]}] 628 | self.state = self.StateEnum.section 629 | return ret 630 | def transition_text(self, signal): 631 | logger.superdebug('text: {}'.format(signal)) 632 | ret = [{'type': 'text', 'subtype': None, 'name': None, 'enum': None, 'heading': None, 'content': signal['line'], 'status': None}] 633 | self.state = self.StateEnum.text 634 | return ret 635 | # 636 | # XXX these methods are complete hacks 637 | # 638 | @staticmethod 639 | def test_sec(line, adjust): 640 | m = re.search(regex_sec, line) 641 | return m is not None 642 | @staticmethod 643 | def test_heirarchy(line): 644 | # XXX should there be a space after each? 645 | # XXX is it always a digit after the word? 646 | # XXX Title 24, Article 60, Part 22/25 have articles within!? 647 | # XXX Section 14-5-609 starts Part C, so alphanumeric? 648 | return len(line) and (line.startswith('TITLE ') and line.split('TITLE ',1)[1][0].isdigit() or line.startswith('PART ') and line.split('PART ',1)[1][0].isalnum() or line.startswith('SUBPART ') and line.split('SUBPART ',1)[1][0].isalnum() or line.startswith('ARTICLE ') and line.split('ARTICLE ',1)[1][0].isdigit()) and not line.endswith('.') 649 | ## 650 | # Test for anonymous (untyped, only with heading) heirarchies. 651 | # 652 | # XXX need more robust logic for checking 'A.' types 653 | # 654 | @staticmethod 655 | def test_anonymous_heirarchy(line, adjust, weight): 656 | return adjust == 'center' and (weight == 'bold' or line.startswith('A.') or line.startswith('B.') or line.startswith('C.') or line.startswith('D.')) 657 | ## 658 | # 659 | # 660 | @staticmethod 661 | def parse_subheader(s): 662 | status = None 663 | if s.endswith('(Reserved)'): 664 | s,_ = s.rsplit('(Reserved)',1) 665 | status = 'incomplete' 666 | return s or None, status 667 | ## 668 | # Return a parsed section and with any subsection(s). 
669 | # 670 | # XXX sometimes the header element has the first enum, e.g., 'header (a)' 671 | # 672 | @staticmethod 673 | def tokenize_section(line): 674 | m = re.search(regex_sec, line) 675 | if m: 676 | enum = m.group(1) 677 | heading = m.group(2) 678 | text = m.group(3) 679 | return enum, heading, None, text, None 680 | l = line.split('\xa0') 681 | logger.uberdebug('tokenize_section: {}'.format(l)) 682 | l = [s.strip() for s in l] 683 | enum = head = status = subsecl = None 684 | textl = [] # TODO should we join? or should they be separate

? 685 | for n,s in enumerate(l): 686 | if s == '': 687 | pass 688 | elif enum is None: 689 | enum = OOFileParser.parse_sec_enum(s) 690 | logger.uberdebug(f'tokenize_section enum: {enum}') 691 | elif head is None: 692 | head,status = OOFileParser.parse_sec_head(s) 693 | logger.uberdebug('tokenize_section head: {} {}'.format(head, status)) 694 | else: 695 | textl.append(s) 696 | text = str.join(' ', textl) or None 697 | # return enum, head, status, text, subsecl 698 | return enum, head, status, text, None 699 | ## 700 | # Return a parsed section string. 701 | # 702 | @staticmethod 703 | def parse_sec_enum(s): 704 | m = re.search(regex_sec, s) 705 | if m: 706 | return m.group(1) 707 | return s.rstrip('.') 708 | ## 709 | # Return a parsed heading string. 710 | # 711 | @staticmethod 712 | def parse_sec_head(s): 713 | status = None 714 | if s.endswith('(Repealed)'): 715 | s,_ = s.rsplit('(Repealed)',1) 716 | status = 'removed' 717 | return s.strip().rstrip('.'), status 718 | ## 719 | # Perform specific fixups on string and return fixed-up string. 
720 | # 721 | @staticmethod 722 | def fixup(line): 723 | orig = line 724 | # sections 725 | line = line.replace('this part\xa05', 'this part 5') 726 | line = line.replace('property\xa0-\xa0nonprofit', 'property - nonprofit') 727 | line = line.replace('defend\xa0-\xa0standing', 'defend - standing') 728 | line = line.replace('complaint\xa0-\xa0service', 'complaint - service') 729 | line = line.replace('article\xa064', 'article 64') 730 | line = line.replace('8-17-105.Compliance standard.', '8-17-105.\xa0\xa0Compliance standard.') 731 | # subsections 732 | if line.startswith('(4) '): 733 | line = '(4)\xa0\xa0' + line[5:] 734 | elif line.startswith('(II) '): 735 | line = '(II)\xa0\xa0' + line[5:] 736 | line = line.replace('this part\xa05', 'this part 5') 737 | line = line.replace('BTU/H\xa0FT', 'BTU/H FT') 738 | line = line.replace('by section\xa07-62-1104', 'by section 7-62-1104') 739 | line = line.replace('of subsections\xa0(1) and', 'of subsections (1) and') 740 | line = line.replace('title\xa0shall', 'title shall') 741 | line = line.replace('article\xa060', 'article 60') 742 | line = line.replace('section\xa05-12-102', 'section 5-12-102') 743 | line = line.replace('section\xa07-64-1205', 'section 7-64-1205') 744 | line = line.replace('section\xa07-64-601', 'section 7-64-601') 745 | # can't remember 746 | line = line.replace('article\xa0V', 'article V') 747 | line = line.replace('§§\xa01', '§§ 1') 748 | line = line.replace(' §\xa038-35-106.5', ' § 38-35-106.5') 749 | # ret 750 | if orig is not line: 751 | logger.superdebug('fixup replace: {} {}'.format(repr(orig), repr(line))) 752 | return line 753 | 754 | ## 755 | # Represents a file. 
#
class OOFile():
    ##
    # Process entry point: parse the file, poisoning the output queue
    # on any exception.
    #
    @staticmethod
    def run(connect_string, fn, errq, outq):
        try:
            OOFile.parse(connect_string, fn, errq, outq)
        except BaseException as e:
            # exc_info for parity with OOFileParser.run diagnostics
            logger.critical('OOFile.run exception: {} {}'.format(type(e), e), exc_info=True)
            outq.put(False) # poison queue

    ##
    # Open file using desktop and parse and enqueue messages representing paragraphs.
    #
    # Each enqueued message is a dict with keys 'align', 'adjust',
    # 'lmargin', 'weight', 'style', 'line'.  False poisons the queue on
    # failure; None marks normal end of stream.
    #
    @staticmethod
    def parse(connect_string, fn, errq, outq):
        assert fn is not None and outq is not None
        doc = None
        # get desktop
        desktop = OOFile.connect_soffice(connect_string)
        if not desktop:
            logger.critical('OOFile.parse no desktop')
            outq.put(False)
            return
        # open file
        url = unohelper.systemPathToFileUrl(os.path.abspath(fn))
        try:
            doc = desktop.loadComponentFromURL(url, '_blank', 0, (uno.createUnoStruct('com.sun.star.beans.PropertyValue', 'ReadOnly', 0, True, 0),))
        except uno.getClass('com.sun.star.lang.IllegalArgumentException') as e:
            logger.critical('OOFile.parse file not found: {}'.format(e))
            outq.put(False)
            return
        except uno.getClass('com.sun.star.lang.DisposedException') as e:
            logger.critical('OOFile.parse desktop bridge died: {}'.format(e))
            outq.put(False)
            return
        except uno.getClass('com.sun.star.uno.RuntimeException') as e:
            logger.critical('OOFile.parse desktop exception: {}'.format(e))
            outq.put(False)
            return
        if doc is None:
            logger.critical('OOFile.parse doc is None')
            outq.put(False)
            return
        # get the com.sun.star.text.Text service and get an XEnumeration of com.sun.star.text.Paragraph objects from the XEnumerationAccess
        for para in OOFile.XEnumeration(doc.getText()):
            lmargin = None
            adjust = None
            # BUG FIX: adjustn was never reset per paragraph, so a
            # paragraph lacking ParaAdjust either raised NameError (first
            # iteration) or silently reused the previous paragraph's value
            adjustn = None
            weightn = -1
            style = None
            align = None
            # skip non-paragraphs
            if not para.supportsService('com.sun.star.text.Paragraph'):
                continue
            # get left margin
            if para.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(para, 'ParaLeftMargin'):
                lmargin = para.ParaLeftMargin
            # get adjustment
            if para.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(para, 'ParaAdjust'):
                adjustn = para.ParaAdjust
            # get the first tab stop's alignment; guard against an empty
            # ParaTabStops sequence (previously an IndexError)
            if para.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(para, 'ParaTabStops') and len(para.ParaTabStops) > 0 and hasattr(para.ParaTabStops[0], "Alignment"):
                align = para.ParaTabStops[0].Alignment.value
            ss = []
            # get an XEnumeration of com.sun.star.text.TextPortion objects
            for portion in OOFile.XEnumeration(para):
                # skip non-text portions
                if portion.TextPortionType != 'Text':
                    continue
                # get portion string
                ss.append(portion.getString())
                # get the last portion's weight
                if portion.supportsService('com.sun.star.style.CharacterProperties') and hasattr(portion, 'CharWeight'):
                    weightn = portion.CharWeight
                # get the last portion's style
                if portion.supportsService('com.sun.star.style.ParagraphProperties') and hasattr(portion, 'ParaStyleName'):
                    style = portion.ParaStyleName # XXX need to strip?
            # interpret data
            s = str.join('', ss)
            if adjustn == 3: # com.sun.star.style.ParagraphAdjust
                adjust = 'center'
            elif adjustn == 0:
                adjust = 'left'
            elif adjustn == 2:
                adjust = 'block'
            elif adjustn is None:
                # paragraph had no ParaAdjust property; not an error
                adjust = None
            else:
                logger.warning('OOFile.parse unknown adjust: {}'.format(adjustn))
                adjust = None
            if round(weightn) == 100: # com.sun.star.awt.FontWeight
                weight = 'normal'
            elif round(weightn) == 150:
                weight = 'bold'
            elif weightn == -1:
                weight = None
            else:
                logger.warning('OOFile.parse unknown weight: {}'.format(weightn))
                weight = None
            message = {'align': align, 'adjust': adjust, 'lmargin': lmargin, 'weight': weight, 'style': style, 'line': s}
            # check for error message
            if errq.qsize() > 0:
                try:
                    errq.get(block=False) # value unused; presence signals error
                    OOFile.close(doc)
                    outq.put(False) # poison output queue and exit
                    return
                except queue.Empty as e:
                    logger.warning('OOFile.parse errq size weirdness')
            # enqueue message
            outq.put(message)
        # close desktop
        # NOTE(review): only the desktop is closed on the success path;
        # presumably closing it also disposes the loaded doc -- confirm
        OOFile.close(desktop)
        # poison queue
        outq.put(None)

    ##
    # Close the Desktop connection.
    #
    @staticmethod
    def close(desktop):
        # See <https://www.openoffice.org/api/docs/>.
        logger.debug('Closing desktop')
        try:
            if desktop is not None:
                # XXX we should check for the com.sun.star.util.XCloseable interface first
                desktop.close(True)
        except uno.getClass('com.sun.star.lang.DisposedException') as e:
            # BUG FIX: these handlers referenced the undefined name 'doc',
            # raising NameError inside the except block
            logger.critical('OOFile.close uno.DisposedException: {} {}'.format(desktop, e))
        except uno.getClass('com.sun.star.uno.RuntimeException') as e:
            logger.critical('OOFile.close uno.RuntimeException: {} {}'.format(desktop, e))
        except Exception as e:
            logger.critical('exception: {} {}'.format(type(e), e))

    ##
    # Get an XEnumeration of objects from a given object supporting the
    # XEnumerationAccess interface.
892 | # 893 | @staticmethod 894 | def XEnumeration(obj): 895 | xenum = obj.createEnumeration() 896 | while xenum.hasMoreElements(): 897 | yield xenum.nextElement() 898 | 899 | ## 900 | # Connect to a running soffice instance and return a XDesktop object. 901 | # 902 | @staticmethod 903 | def connect_soffice(connect_string, tries=5, sleep=5): 904 | desktop = None 905 | 906 | try: 907 | ctxLocal = uno.getComponentContext() 908 | smgrLocal = ctxLocal.ServiceManager 909 | resolver = smgrLocal.createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', ctxLocal) 910 | ctx = resolver.resolve(connect_string) 911 | smgr = ctx.ServiceManager 912 | logger.superdebug(f"smgr: {smgr}") 913 | desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx) 914 | 915 | except uno.getClass('com.sun.star.lang.DisposedException') as e: 916 | logger.critical(f"Bridge died: {e}") 917 | except Exception as e: 918 | logger.critical(f"{type(e)} {e}") 919 | except uno.getClass('com.sun.star.connection.NoConnectException') as e: 920 | logger.critical(f"Failed to connect: {e}") 921 | 922 | if desktop is None: 923 | logger.critical(f"Desktop is None") 924 | 925 | return desktop 926 | 927 | # do it 928 | if __name__ == "__main__": 929 | sys.exit(main()) 930 | --------------------------------------------------------------------------------