├── .codeclimate.yml ├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── __main__.py ├── de_decisions_pipeline.py ├── de_decisions_pipeline_steps ├── a_download.py ├── b_clean.py ├── c_hierarchy.py ├── common.py ├── d_reference_areas_parse.py └── e_network.py ├── download_de_gesetze_im_internet_data.py ├── download_us_code_data.py ├── download_us_reg_data.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── statics.py ├── statutes_pipeline_steps ├── __init__.py ├── crossreference_graph.py ├── de_authority_edgelist.py ├── de_crossreference_edgelist.py ├── de_crossreference_lookup.py ├── de_law_names.py ├── de_prepare_input.py ├── de_reference_areas.py ├── de_reference_parse.py ├── de_reference_parse_vso_list.py ├── de_to_xml.py ├── hierarchy_graph.py ├── snapshot_mapping_edgelist.py ├── snapshot_mapping_index.py ├── us_authority_edgelist.py ├── us_crossreference_edgelist.py ├── us_crossreference_lookup.py ├── us_prepare_input.py ├── us_reference_areas.py ├── us_reference_parse.py ├── us_reference_reg.py ├── us_reg_prepare_input.py ├── us_reg_to_xml.py └── us_to_xml.py ├── tests ├── __init__.py ├── test_common.py ├── test_de_reference_parse.py ├── test_de_reference_parse_vso_list.py ├── test_snapshot_mapping_index.py └── test_us_reg_xml.py ├── utils ├── common.py ├── simplify_gii_xml.py └── string_list_contains.py ├── xml-schema-decisions-de.xsd ├── xml-schema.xsd └── xml-styles.css /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | plugins: 3 | pep8: 4 | enabled: true 5 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | run: 8 | env: 9 | PYTHON: '3.7' 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@master 13 | - name: Setup Python 14 | uses: actions/setup-python@master 15 | with: 16 | python-version: 3.7 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements.txt 21 | - name: Generate coverage report 22 | run: | 23 | pip install pytest 24 | pip install pytest-cov 25 | python -m pytest tests --cov=./ --cov-report=xml 26 | - name: Upload coverage to Codecov 27 | uses: codecov/codecov-action@v1 28 | with: 29 | token: ${{ secrets.CODECOV_TOKEN }} 30 | file: ./coverage.xml 31 | directory: ./coverage/reports/ 32 | flags: unittests 33 | env_vars: OS,PYTHON 34 | name: codecov-umbrella 35 | fail_ci_if_error: true 36 | path_to_write_report: ./codecov_report.gz 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Custom 132 | temp/ 133 | .idea/ 134 | venv/ 135 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | extra_standard_library = setuptools,pkg_resources 4 | known_test = pytest 5 | known_first_party = ${root_pkg} 6 | sections = FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^docs/conf.py' 2 | 3 | repos: 4 | - repo: git://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: check-added-large-files 9 | - id: check-ast 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-xml 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: end-of-file-fixer 16 | - id: requirements-txt-fixer 17 | - id: mixed-line-ending 18 | args: ['--fix=no'] 19 | 20 | 21 | - repo: http://github.com/timothycrosley/isort 22 | rev: 5.4.2 23 | hooks: 24 | - id: isort 25 | 26 | - repo: https://github.com/psf/black 27 | rev: stable 28 | hooks: 29 | - id: black 30 | language_version: python3 31 | 32 | - repo: https://gitlab.com/pycqa/flake8 33 | rev: 3.8.3 34 | hooks: 35 | - id: flake8 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2021, QuantLaw 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![codecov](https://codecov.io/gh/QuantLaw/legal-data-preprocessing/branch/master/graph/badge.svg?token=FABCUR680K)](https://codecov.io/gh/QuantLaw/legal-data-preprocessing) 2 | [![Tests](https://github.com/QuantLaw/legal-data-preprocessing/workflows/Tests/badge.svg)](https://github.com/QuantLaw/legal-data-preprocessing/actions) 3 | [![Maintainability](https://api.codeclimate.com/v1/badges/8cffa9a56ce357314456/maintainability)](https://codeclimate.com/repos/5f1bf2a3fccc45014c00c615/maintainability) 4 | [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.4070772.svg)](https://doi.org/10.5281/zenodo.4070772) 5 | 6 | # Legal Data Preprocessing 7 | 8 | This repository contains code to preprocess legal text documents. 9 | It is, inter alia, used to produce the results reported in the following publications: 10 | 11 | - Daniel Martin Katz, Corinna Coupette, Janis Beckedorf, and Dirk Hartung, Complex Societies and the Growth of the Law, *Sci. Rep.* **10** (2020), [https://doi.org/10.1038/s41598-020-73623-x](https://doi.org/10.1038/s41598-020-73623-x) 12 | - Corinna Coupette, Janis Beckedorf, Dirk Hartung, Michael Bommarito, and Daniel Martin Katz, Measuring Law Over Time, to appear (2021) 13 | 14 | Related Repositories: 15 | - [Complex Societies and the Growth of the Law](https://github.com/QuantLaw/Complex-Societies-and-Growth) ([Publication Release](https://doi.org/10.5281/zenodo.4070769)) 16 | - [Measuring Law Over Time](https://github.com/QuantLaw/Measuring-Law-Over-Time) 17 | - [Legal Data Clustering](https://github.com/QuantLaw/legal-data-clustering) ([Latest Publication Release](https://doi.org/10.5281/zenodo.4070774)) 18 | 19 | Related Data: 20 | - [Preprocessed Input Data for *Sci. Rep.* **10** (2020)](https://doi.org/10.5281/zenodo.4070767) 21 | - [Preprocessed Input Data for *Measuring Law Over Time*, to appear (2021)](https://doi.org/10.5281/zenodo.4660133) 22 | 23 | ## Setup 24 | 25 | 1. It is assumed that you have Python 3.7 installed. (Other versions are not tested.) 26 | 2. 
Set up a virtual environment and activate it. (This is not required but recommended.) 27 | 3. Install the required packages `pip install -r requirements.txt`. 28 | 29 | ## Getting Started 30 | 31 | Make sure the following folders do not exist next to the root folder of this repository: 32 | - `legal-networks-data` 33 | - `gesetze-im-internet` 34 | 35 | Download and prepare the data for the United States (US) and Germany. (See the respective "1. Data input" 36 | sections below.) Afterwards, you can run the pipeline. 37 | 38 | For the US statutory data: 39 | 40 | 1. Download the data: `python download_us_code_data.py` 41 | 2. Run all steps of the pipeline: `python . us all` 42 | 43 | For the US statutory & regulatory data: 44 | 45 | 1. Download statutory data: `python download_us_code_data.py` 46 | 2. Download regulatory data: `python download_us_reg_data.py` 47 | 3. Run all steps of the pipeline: `python . us all` and `python . us all -r` 48 | 49 | 50 | For the German statutory data, using a *juris* export: 51 | 52 | 1. Prepare the data (as shown in a separate repository) 53 | 2. Run all steps of the pipeline: `python . de all` 54 | 55 | For the German statutory & regulatory data, using a *juris* export: 56 | 57 | 1. Prepare the data (as shown in a separate repository) 58 | 2. Run all steps of the pipeline: `python . de all -r` 59 | 60 | For the German statutory data, using Gesetze im Internet (GII): 61 | 62 | 1. Prepare the data: `python download_de_gesetze_im_internet_data.py --dates 2019-06-10 2020-01-18`. 63 | You need to specify the dates you want to analyze. 64 | 2. Run all steps of the pipeline except for `prepare_input` for the specified dates: 65 | `python . de xml law_names reference_areas reference_parse hierarchy_graph crossreference_lookup crossreference_edgelist crossreference_graph snapshot_mapping_edgelist --snapshots 2019-06-10 2020-01-18` 66 | 67 | If you need to reduce memory usage, you can deactivate multiprocessing with the argument `--single-process`. 68 | 69 | To download and prepare German judicial decision data from https://www.rechtsprechung-im-internet.de, 70 | run `python de_decisions_pipeline.py all`. 71 | 72 | 73 | ## Statutes 74 | 75 | US and German federal statutes and regulations are converted from official sources (or *juris*) 76 | to multiple clean formats focussing on the structure of the law. 77 | 78 | Output formats are: 79 | 80 | - XML files containing the text, the hierarchical structure of the law, and cross-references. 81 | - Gpickle files for each Title/Gesetz/Rechtsverodnung and version containing the hierarchical structure of the statutes. 82 | - Gpickle files for each snapshot (year in the US or date in Germany) containing the hierarchical structure of the statutes 83 | and the cross-references between different elements of the statutes with reduced granularity and corresponding nodelists 84 | and edgelists. 85 | - Snapshot mapping edgelists: These lists map elements of a network at one snapshot 86 | to a snapshot at another time. They encode, e.g., where a clause of the US Code in 2010 is 87 | located in the US Code of 2011. This mapping is derived from the text and the structure 88 | of the statutes. 
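
The gpickle outputs listed above can be opened directly with networkx (pinned to 2.4 in `requirements.txt`). A minimal sketch for inspecting one of the cross-reference graphs; the file name below is only an illustration, as actual names depend on dataset, snapshot, and granularity:

```python
import networkx as nx

# Illustrative path only; actual file names depend on dataset, snapshot and granularity.
path = "../legal-networks-data/us/4_crossreference_graph/seqitems/2010.gpickle.gz"

# networkx.read_gpickle transparently decompresses .gz files
G = nx.read_gpickle(path)
print(G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# Print a few edges with their attributes to see how containment and
# cross-reference edges are annotated in the data.
for u, v, data in list(G.edges(data=True))[:5]:
    print(u, v, data)
```
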
89 | 90 | The steps of the pipeline are: 91 | 92 | - `prepare_input` 93 | - `xml` 94 | - `law_names` (only for German pipeline) 95 | - `reference_areas` 96 | - `reference_parse` 97 | - `hierarchy_graph` 98 | - `crossreference_lookup` 99 | - `crossreference_edgelist` 100 | - `crossreference_graph` 101 | - `snapshot_mapping_index` 102 | - `snapshot_mapping_edgelist` 103 | 104 | 105 | ### US 106 | 107 | The processing for the US Code is executed in multiple steps: 108 | 109 | 110 | #### 1. Data Input 111 | 112 | Inputs are ZIP files downloaded from the US House of Representatives Office of the Law 113 | Revision Counsel and U.S. Government Publishing Office. We use annual versions in XHTML format that are available on 114 | https://uscode.house.gov/download/annualhistoricalarchives/downloadxhtml.shtml and 115 | https://www.govinfo.gov/bulkdata/CFR. 116 | Files should be located at regarding the statutes `../legal-networks-data/us/1_input` and regarding the regulations `../legal-networks-data/us_reg/1_input`. 117 | This folder should contain unzipped yearly folders. 118 | 119 | You can automatically obtain the required data by running `download_us_code_data.py` and `download_us_reg_data.py`. 120 | 121 | 122 | #### 2. XML Files 123 | 124 | - Files containing titles of the US Code are copied to `temp/us/11_htm`. 125 | Appendices and Stylesheets are filtered. (Result of step: `prepare_input`) 126 | - Simple XML files focusing on the structure are generated from the XHTML files. 127 | Results can be found in `temp/us/12_xml`. (Result of step: `xml`) 128 | - Text segments containing a cross-reference are annotated in the XML files. Results are saved to 129 | `temp/us/13_reference_areas`. (Result of step: `reference_areas`) 130 | - The contents of the annotated cross-references are extracted and added to the XML. 131 | 132 | The results of the XML generation are saved to `../legal-networks-data/us/2_xml`. (Result of step: `reference_parse`) 133 | 134 | CFR data is located at `us_reg` folders next to the `us` folder. 135 | 136 | 137 | #### 3. Hierarchy Graphs 138 | 139 | Graphs containing the hierarchical structure of the statutes are saved to `../legal-networks-data/us/3_hierarchy_graph` 140 | in separate files for each Title and annual version. (Result of step: `hierarchy_graph`) 141 | 142 | CFR data is located at `us_reg` folders next to the `us` folder. 143 | 144 | 145 | #### 4. Crossreference Graphs 146 | 147 | - A list of all sections in the US Code at a specific point in time is generated to obtain a list of possible 148 | destinations of cross-references. This is a preparation step for drawing edges from the cross-reference source to the cross-reference destination. The lists are stored at `temp/us/31_crossreference_lookup`. 149 | (Result of step: `crossreference_lookup`) 150 | - Lists of all cross-references are generated. They contain the ID of the referencing and the referenced element. 151 | The lists are located at `temp/us/32_crossreference_edgelist`. 152 | (Result of step: `crossreference_edgelist`) 153 | - Hierarchy graphs of the individual Titles are combined and edges for cross-references are added within and between 154 | Titles. 155 | 156 | Each annual version of the US Code is stored at `../legal-networks-data/us/4_crossreference_graph` in three files as a nodeslist, an edgelist and networkx graph stored as gpickle.gz-file in the subfolder `seqitems`. The node-list contains all nodes, whereas subseqitems are excluded in the networkx file. 
157 | (Result of step: `crossreference_graph`) 158 | 159 | The combined data regarding the US Code and the CFR is located at `us_reg` folders next to the `us` folder. 160 | 161 | 162 | #### 5. Snapshot Mapping Edgelists 163 | 164 | Snapshot mapping edgelists are stored at `../legal-networks-data/us/5_snapshot_mapping_edgelist` and ``../legal-networks-data/us_reg/5_snapshot_mapping_edgelist``. 165 | 166 | 167 | #### Germany 168 | 169 | #### 1. Data Input 170 | 171 | Inputs are XML files in a format simplified from that of documents available from GII. 172 | These files can be generated from two sources: 173 | 174 | 1. XML files provided by GII. To obtain older versions of this website 175 | use our public archive at https://github.com/legal-networks/gesetze-im-internet. 176 | Downloaded files must be simplified before they are suitable input. 177 | Use `download_de_gesetze_im_internet_data.py` to download, simplify and rename the source files. 178 | This replaces step `prepare_input` in the pipeline. 179 | (Make sure that you do not run this step. It is not possible to run `all` steps.) 180 | 2. An export from the *juris* database can be used to obtain the data. 181 | Whereas this datasource covers a longer time period, we cannot make it publicly available due to licensing restrictions. 182 | 183 | #### 2. XML Files 184 | 185 | - Files in the simplified format of Gesetze im Internet are generated and saved to `temp/de/11_gii_xml` 186 | (Result of step: `prepare_input` or `download_de_gesetze_im_internet_data.py`) 187 | - Simple XML files focusing on the structure are generated from the original XML files. 188 | Results can be found in `temp/de/12_xml`. (Result of step: `xml`) 189 | - A list of the names of all statutes (Gesetze) is saved to 190 | `temp/de/12_xml_law_names.csv` with a mapping to the corresponding files. 191 | This is used to extract cross-references, as statutes are typically referenced by their name. 192 | Names are saved in a stemmed version. (Result of step: `law_names`) 193 | 194 | Furthermore, `temp/de/12_xml_law_names_compiled.pickle` is generated. 195 | It contains the same information as `12_xml_law_names.csv`, 196 | but is optimized to obtain the stemmed names of all valid laws at specific dates. (Result of step: `law_names`) 197 | - Text segments containing a cross-reference are annotated in the XML files. Results are saved to 198 | `temp/de/13_reference_areas`. (Result of step: `reference_areas`) 199 | - The contents of the annotated cross-references are extracted and added to the XML. 200 | 201 | The results of the XML generation are saved to `../legal-networks-data/de/2_xml`. (Result of step: `reference_parse`) 202 | 203 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 204 | 205 | #### 3. Hierarchy Graphs 206 | 207 | Hierarchy Graphs are saved to `../legal-networks-data/de/3_hierarchy_graph`. 208 | See the documentation regarding the US hierarchy graphs for further information. 209 | 210 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 211 | 212 | #### 4. Cross-Reference Graphs 213 | 214 | In general cross-reference graphs are generated in the same manner as for the US dataset 215 | (see above for further information). 216 | (Interim) results are saved to 217 | `temp/us/31_crossreference_lookup`, 218 | `temp/us/32_crossreference_edgelist`, and 219 | `../legal-networks-data/us/4_crossreference_graph`, respectively. 
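
To sanity-check the intermediate outputs before the graphs are assembled, the edgelists written by the `crossreference_edgelist` step can be loaded with pandas. A minimal sketch, assuming a CSV edgelist named after the snapshot date; the path and column layout are assumptions for illustration, so inspect the files the step actually writes to confirm the format:

```python
import pandas as pd

# Path and file name are assumptions for illustration only; check the output
# of the crossreference_edgelist step for the actual location and layout.
edgelist_path = "temp/de/32_crossreference_edgelist/2020-01-01.csv"

edges = pd.read_csv(edgelist_path)
print(len(edges), "cross-reference edges in this snapshot")
print(edges.head())
```
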
220 | 221 | A major difference are the possible dates for which to create cross-reference graphs. 222 | For the US, only annual version are available. 223 | The *juris* export allows one to select any day to create a snapshot. 224 | If you rely on https://github.com/legal-networks/gesetze-im-internet as a data source, you can only select days 225 | for which a snapshot was created. 226 | 227 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 228 | 229 | #### 5. Snapshot Mapping Edgelists 230 | 231 | Snapshot mapping edgelists are stored at `../legal-networks-data/de/5_snapshot_mapping_edgelist`. 232 | 233 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 234 | -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | 5 | from statics import ( 6 | ALL_YEARS, 7 | ALL_YEARS_REG, 8 | DE_CROSSREFERENCE_EDGELIST_PATH, 9 | DE_CROSSREFERENCE_GRAPH_PATH, 10 | DE_HIERARCHY_GRAPH_PATH, 11 | DE_REFERENCE_PARSED_PATH, 12 | DE_REG_AUTHORITY_EDGELIST_PATH, 13 | DE_REG_CROSSREFERENCE_EDGELIST_PATH, 14 | DE_REG_CROSSREFERENCE_GRAPH_PATH, 15 | DE_REG_HIERARCHY_GRAPH_PATH, 16 | DE_REG_REFERENCE_PARSED_PATH, 17 | DE_REG_SNAPSHOT_MAPPING_EDGELIST_PATH, 18 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH, 19 | DE_SNAPSHOT_MAPPING_EDGELIST_PATH, 20 | DE_SNAPSHOT_MAPPING_INDEX_PATH, 21 | US_CROSSREFERENCE_EDGELIST_PATH, 22 | US_CROSSREFERENCE_GRAPH_PATH, 23 | US_HIERARCHY_GRAPH_PATH, 24 | US_REFERENCE_PARSED_PATH, 25 | US_REG_AUTHORITY_EDGELIST_PATH, 26 | US_REG_CROSSREFERENCE_EDGELIST_PATH, 27 | US_REG_CROSSREFERENCE_GRAPH_PATH, 28 | US_REG_HIERARCHY_GRAPH_PATH, 29 | US_REG_REFERENCE_PARSED_PATH, 30 | US_REG_SNAPSHOT_MAPPING_EDGELIST_PATH, 31 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH, 32 | US_SNAPSHOT_MAPPING_EDGELIST_PATH, 33 | US_SNAPSHOT_MAPPING_INDEX_PATH, 34 | ) 35 | from statutes_pipeline_steps.crossreference_graph import CrossreferenceGraphStep 36 | from statutes_pipeline_steps.de_authority_edgelist import DeAuthorityEdgelist 37 | from statutes_pipeline_steps.de_crossreference_edgelist import DeCrossreferenceEdgelist 38 | from statutes_pipeline_steps.de_crossreference_lookup import DeCrossreferenceLookup 39 | from statutes_pipeline_steps.de_law_names import DeLawNamesStep 40 | from statutes_pipeline_steps.de_prepare_input import de_prepare_input 41 | from statutes_pipeline_steps.de_reference_areas import DeReferenceAreasStep 42 | from statutes_pipeline_steps.de_reference_parse import DeReferenceParseStep 43 | from statutes_pipeline_steps.de_to_xml import DeToXmlStep, get_type_for_doknr_dict 44 | from statutes_pipeline_steps.hierarchy_graph import HierarchyGraphStep 45 | from statutes_pipeline_steps.snapshot_mapping_edgelist import ( 46 | SnapshotMappingEdgelistStep, 47 | ) 48 | from statutes_pipeline_steps.snapshot_mapping_index import SnapshotMappingIndexStep 49 | from statutes_pipeline_steps.us_authority_edgelist import UsAuthorityEdgelist 50 | from statutes_pipeline_steps.us_crossreference_edgelist import UsCrossreferenceEdgelist 51 | from statutes_pipeline_steps.us_crossreference_lookup import UsCrossreferenceLookup 52 | from statutes_pipeline_steps.us_prepare_input import us_prepare_input 53 | from statutes_pipeline_steps.us_reference_areas import UsReferenceAreasStep 54 | from statutes_pipeline_steps.us_reference_parse import 
UsReferenceParseStep 55 | from statutes_pipeline_steps.us_reg_prepare_input import us_reg_prepare_input 56 | from statutes_pipeline_steps.us_reg_to_xml import UsRegsToXmlStep 57 | from statutes_pipeline_steps.us_to_xml import UsToXmlStep 58 | from utils.common import load_law_names, load_law_names_compiled, str_to_bool 59 | 60 | 61 | def get_subseqitem_conf(subseqitems): 62 | if subseqitems is None: 63 | return False, True 64 | elif subseqitems is True: 65 | return (True,) 66 | elif subseqitems is False: 67 | return (False,) 68 | 69 | 70 | ALL_STEPS = [ 71 | "prepare_input", 72 | "xml", 73 | "law_names", # DE only 74 | "reference_areas", 75 | "reference_parse", 76 | "hierarchy_graph", 77 | "crossreference_lookup", 78 | "crossreference_edgelist", 79 | "authority_edgelist", 80 | "crossreference_graph", 81 | # creates edgelist to map nodes between snapshots for DYNAMIC graph 82 | "snapshot_mapping_index", 83 | "snapshot_mapping_edgelist", 84 | ] 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("dataset", help="select a dataset: DE or US") 89 | parser.add_argument("steps", nargs="+", help="select a step to perform by name") 90 | parser.add_argument("--filter", nargs="*", help="filter for specific files") 91 | parser.add_argument( 92 | "--single-process", 93 | dest="use_multiprocessing", 94 | action="store_const", 95 | const=False, 96 | default=True, 97 | help="prevent multiprocessing", 98 | ) 99 | parser.add_argument( 100 | "--overwrite", 101 | dest="overwrite", 102 | action="store_const", 103 | const=True, 104 | default=False, 105 | help="overwrite files", 106 | ) 107 | parser.add_argument( 108 | "--subseqitems", 109 | dest="subseqitems", 110 | nargs="?", 111 | const=True, 112 | type=str_to_bool, 113 | default=None, 114 | help="include subseqitems in graphs", 115 | ) 116 | parser.add_argument( 117 | "--snapshots", 118 | dest="snapshots", 119 | nargs="*", 120 | type=str, 121 | default=["all"], 122 | help=( 123 | "snapshots for crossreferences. Eg. 2010-01-01 for de dataset or 2010 for " 124 | "us dataset. To run on whole research window: all or all-new-years" 125 | ), 126 | ) 127 | parser.add_argument( 128 | "--interval", 129 | dest="interval", 130 | nargs="?", 131 | type=int, 132 | default=1, 133 | help=( 134 | "Only for snapshot_mapping_edgelist. Interval for mapped snapshots. " 135 | "Default 1 (snapshot)" 136 | ), 137 | ) 138 | 139 | parser.add_argument( 140 | "-r", 141 | "--regulations", 142 | dest="regulations", 143 | action="store_const", 144 | const=True, 145 | default=False, 146 | help="Include regulations", 147 | ) 148 | 149 | parser.add_argument( 150 | "-dc", 151 | "--detailed-crossreferences", 152 | dest="detailed_crossreferences", 153 | action="store_const", 154 | const=True, 155 | default=False, 156 | help="Resolve cross references on the lowest possible level. " 157 | "Default is to resolve on seqitem level (e.g. sections).", 158 | ) 159 | args = parser.parse_args() 160 | 161 | steps = [step.lower() for step in args.steps] 162 | dataset = args.dataset.lower() 163 | use_multiprocessing = args.use_multiprocessing 164 | processes = None if args.use_multiprocessing else 1 165 | overwrite = args.overwrite 166 | snapshots = args.snapshots 167 | interval = args.interval 168 | selected_items = args.filter or [] 169 | regulations = args.regulations 170 | detailed_crossreferences = args.detailed_crossreferences 171 | 172 | if dataset not in ["de", "us"]: 173 | raise Exception(f"{dataset} unsupported dataset. 
Options: us, de") 174 | 175 | if "all" in snapshots or "all-new-years" in snapshots: 176 | years = ALL_YEARS_REG if regulations else ALL_YEARS 177 | if dataset == "us": 178 | snapshots = [f"{year}" for year in years] 179 | elif dataset == "de": 180 | snapshots = [ 181 | f"{year}-12-31" if "all" in snapshots else f"{year}-01-01" 182 | for year in years 183 | ] 184 | 185 | if "all" in steps: 186 | steps = ALL_STEPS 187 | else: 188 | unknown_steps = [s for s in steps if s not in ALL_STEPS] 189 | assert not unknown_steps, unknown_steps 190 | 191 | if ( 192 | "crossreference_lookup" in steps 193 | or "crossreference_edgelist" in steps 194 | or "crossreference_graph" in steps 195 | ): 196 | if dataset == "de" or snapshots: 197 | for snapshot in snapshots: 198 | if not re.fullmatch(r"\d{4}(-\d{2}-\d{2})?", snapshot): 199 | raise Exception( 200 | "Add --snapshots as argument. " 201 | "E.g. for de --snapshots 2012-01-31 2013-01-31 or for us " 202 | "--snapshot 2001" 203 | ) 204 | 205 | if detailed_crossreferences and regulations: 206 | raise Exception( 207 | "Combining detailed cross-references and regulations is not tested." 208 | ) 209 | 210 | if "prepare_input" in steps: 211 | if dataset == "us": 212 | if regulations: 213 | us_reg_prepare_input() 214 | else: 215 | us_prepare_input() 216 | elif dataset == "de": 217 | de_prepare_input(regulations) 218 | print("Filter input: done") 219 | 220 | if "xml" in steps: 221 | if dataset == "us": 222 | if regulations: 223 | step = UsRegsToXmlStep(processes) 224 | else: 225 | step = UsToXmlStep(processes) 226 | items = step.get_items(overwrite) 227 | step.execute_filtered_items(items, selected_items) 228 | elif dataset == "de": 229 | dok_type_dict = get_type_for_doknr_dict() 230 | step = DeToXmlStep( 231 | regulations=regulations, 232 | processes=processes, 233 | dok_type_dict=dok_type_dict, 234 | ) 235 | items = step.get_items(overwrite) 236 | step.execute_filtered_items(items, selected_items) 237 | print("Convert to xml: done") 238 | 239 | if "law_names" in steps: 240 | if dataset == "de": 241 | step = DeLawNamesStep(regulations=regulations, processes=processes) 242 | items = step.get_items() 243 | step.execute_items(items) 244 | print("Law names: done") 245 | 246 | if "reference_areas" in steps: 247 | if dataset == "us": 248 | step = UsReferenceAreasStep(regulations=regulations, processes=processes) 249 | items = step.get_items(overwrite) 250 | step.execute_filtered_items(items) 251 | 252 | elif dataset == "de": 253 | law_names = load_law_names_compiled(regulations) 254 | step = DeReferenceAreasStep( 255 | law_names=law_names, regulations=regulations, processes=processes 256 | ) 257 | items = step.get_items(overwrite) 258 | step.execute_filtered_items(items) 259 | 260 | print("Extract reference areas: done") 261 | 262 | if "reference_parse" in steps: 263 | if dataset == "us": 264 | step = UsReferenceParseStep(regulations=regulations, processes=processes) 265 | items = step.get_items(overwrite) 266 | step.execute_filtered_items(items) 267 | if dataset == "de": 268 | law_names = load_law_names_compiled(regulations) 269 | step = DeReferenceParseStep( 270 | law_names=law_names, regulations=regulations, processes=processes 271 | ) 272 | items = step.get_items(overwrite) 273 | step.execute_filtered_items(items) 274 | 275 | print("Parse references: done") 276 | 277 | if "hierarchy_graph" in steps: 278 | # for subseqitems_conf in get_subseqitem_conf(args.subseqitems): 279 | for subseqitems_conf in [True]: 280 | if dataset == "us": 281 | source = ( 282 | 
US_REG_REFERENCE_PARSED_PATH 283 | if regulations 284 | else US_REFERENCE_PARSED_PATH 285 | ) 286 | destination = os.path.join( 287 | US_REG_HIERARCHY_GRAPH_PATH 288 | if regulations 289 | else US_HIERARCHY_GRAPH_PATH, 290 | "subseqitems" if subseqitems_conf else "seqitems", 291 | ) 292 | elif dataset == "de": 293 | source = ( 294 | DE_REG_REFERENCE_PARSED_PATH 295 | if regulations 296 | else DE_REFERENCE_PARSED_PATH 297 | ) 298 | destination = ( 299 | ( 300 | DE_REG_HIERARCHY_GRAPH_PATH 301 | if regulations 302 | else DE_HIERARCHY_GRAPH_PATH 303 | ) 304 | + "/" 305 | + ("subseqitems" if subseqitems_conf else "seqitems") 306 | ) 307 | 308 | step = HierarchyGraphStep( 309 | source=source, 310 | destination=destination, 311 | add_subseqitems=subseqitems_conf, 312 | processes=processes, 313 | ) 314 | items = step.get_items(overwrite) 315 | step.execute_filtered_items(items) 316 | print("Make hierarchy graphs: done") 317 | 318 | if "crossreference_lookup" in steps: 319 | if dataset == "us": 320 | step = UsCrossreferenceLookup( 321 | detailed_crossreferences=detailed_crossreferences, 322 | regulations=regulations, 323 | processes=processes, 324 | ) 325 | items = step.get_items(overwrite, snapshots) 326 | step.execute_items(items) 327 | 328 | elif dataset == "de": 329 | assert not detailed_crossreferences 330 | step = DeCrossreferenceLookup(regulations=regulations, processes=processes) 331 | items = step.get_items(snapshots) 332 | step.execute_items(items) 333 | 334 | print("Create crossreference lookup: done") 335 | 336 | if "crossreference_edgelist" in steps: 337 | if dataset == "us": 338 | step = UsCrossreferenceEdgelist( 339 | detailed_crossreferences=detailed_crossreferences, 340 | regulations=regulations, 341 | processes=processes, 342 | ) 343 | items = step.get_items(overwrite, snapshots) 344 | step.execute_items(items) 345 | 346 | elif dataset == "de": 347 | assert not detailed_crossreferences 348 | law_names_data = load_law_names(regulations) 349 | step = DeCrossreferenceEdgelist( 350 | regulations=regulations, 351 | law_names_data=law_names_data, 352 | processes=processes, 353 | ) 354 | items = step.get_items(overwrite, snapshots) 355 | step.execute_items(items) 356 | 357 | print("Create crossreference edgelist: done") 358 | 359 | if "authority_edgelist" in steps: 360 | if dataset == "de" and regulations: 361 | law_names_data = load_law_names(regulations) 362 | step = DeAuthorityEdgelist( 363 | law_names_data=law_names_data, processes=processes 364 | ) 365 | items = step.get_items(overwrite, snapshots) 366 | step.execute_items(items) 367 | elif dataset == "us" and regulations: 368 | assert not detailed_crossreferences 369 | step = UsAuthorityEdgelist( 370 | detailed_crossreferences=detailed_crossreferences, 371 | processes=processes, 372 | regulations=regulations, 373 | ) 374 | items = step.get_items(overwrite, snapshots) 375 | step.execute_items(items) 376 | print("Create authority edgelist: done") 377 | 378 | if "crossreference_graph" in steps: 379 | if dataset == "us": 380 | source = US_HIERARCHY_GRAPH_PATH 381 | source_regulation = US_REG_HIERARCHY_GRAPH_PATH 382 | destination = ( 383 | US_REG_CROSSREFERENCE_GRAPH_PATH 384 | if regulations 385 | else US_CROSSREFERENCE_GRAPH_PATH 386 | ) + ("/detailed" if detailed_crossreferences else "") 387 | edgelist_folder = ( 388 | US_REG_CROSSREFERENCE_EDGELIST_PATH 389 | if regulations 390 | else US_CROSSREFERENCE_EDGELIST_PATH 391 | ) + ("/detailed" if detailed_crossreferences else "") 392 | authority_edgelist_folder = 
US_REG_AUTHORITY_EDGELIST_PATH 393 | elif dataset == "de": 394 | assert not detailed_crossreferences 395 | source = ( 396 | DE_REG_HIERARCHY_GRAPH_PATH if regulations else DE_HIERARCHY_GRAPH_PATH 397 | ) 398 | source_regulation = None 399 | destination = ( 400 | DE_REG_CROSSREFERENCE_GRAPH_PATH 401 | if regulations 402 | else DE_CROSSREFERENCE_GRAPH_PATH 403 | ) 404 | edgelist_folder = ( 405 | DE_REG_CROSSREFERENCE_EDGELIST_PATH 406 | if regulations 407 | else DE_CROSSREFERENCE_EDGELIST_PATH 408 | ) 409 | authority_edgelist_folder = DE_REG_AUTHORITY_EDGELIST_PATH 410 | 411 | step = CrossreferenceGraphStep( 412 | regulations=regulations, 413 | source=source, 414 | source_regulation=source_regulation, 415 | destination=destination, 416 | edgelist_folder=edgelist_folder, 417 | dataset=dataset, 418 | authority_edgelist_folder=authority_edgelist_folder, 419 | processes=processes, 420 | ) 421 | items = step.get_items(overwrite, snapshots) 422 | step.execute_items(items) 423 | 424 | print("Make crossreference graph: done") 425 | 426 | if "snapshot_mapping_index" in steps: 427 | assert not detailed_crossreferences 428 | if dataset == "us": 429 | source_text = ( 430 | [US_REFERENCE_PARSED_PATH, US_REG_REFERENCE_PARSED_PATH] 431 | if regulations 432 | else US_REFERENCE_PARSED_PATH 433 | ) 434 | destination = os.path.join( 435 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH 436 | if regulations 437 | else US_SNAPSHOT_MAPPING_INDEX_PATH, 438 | "subseqitems", 439 | ) 440 | law_names_data = None 441 | elif dataset == "de": 442 | source_text = ( 443 | DE_REG_REFERENCE_PARSED_PATH 444 | if regulations 445 | else DE_REFERENCE_PARSED_PATH 446 | ) 447 | destination = os.path.join( 448 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH 449 | if regulations 450 | else DE_SNAPSHOT_MAPPING_INDEX_PATH, 451 | "subseqitems", 452 | ) 453 | law_names_data = load_law_names(regulations) 454 | 455 | step = SnapshotMappingIndexStep( 456 | source_text, 457 | destination, 458 | dataset, 459 | law_names_data, 460 | processes=processes, 461 | ) 462 | items = step.get_items(overwrite, snapshots) 463 | step.execute_items(items) 464 | 465 | print("Make snapshot mapping: done") 466 | 467 | if "snapshot_mapping_edgelist" in steps: 468 | assert not detailed_crossreferences 469 | if dataset == "us": 470 | source = os.path.join( 471 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH 472 | if regulations 473 | else US_SNAPSHOT_MAPPING_INDEX_PATH, 474 | "subseqitems", 475 | ) 476 | destination = os.path.join( 477 | US_REG_SNAPSHOT_MAPPING_EDGELIST_PATH 478 | if regulations 479 | else US_SNAPSHOT_MAPPING_EDGELIST_PATH, 480 | "subseqitems", 481 | ) 482 | elif dataset == "de": 483 | source = os.path.join( 484 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH 485 | if regulations 486 | else DE_SNAPSHOT_MAPPING_INDEX_PATH, 487 | "subseqitems", 488 | ) 489 | destination = os.path.join( 490 | DE_REG_SNAPSHOT_MAPPING_EDGELIST_PATH 491 | if regulations 492 | else DE_SNAPSHOT_MAPPING_EDGELIST_PATH, 493 | "subseqitems", 494 | ) 495 | 496 | step = SnapshotMappingEdgelistStep( 497 | source, 498 | destination, 499 | interval, 500 | dataset, 501 | processes=processes, 502 | ) 503 | items = step.get_items(overwrite, snapshots) 504 | step.execute_items(items) 505 | 506 | print("Make snapshot mapping: done") 507 | -------------------------------------------------------------------------------- /de_decisions_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from de_decisions_pipeline_steps.a_download import download 4 | from 
de_decisions_pipeline_steps.b_clean import clean 5 | from de_decisions_pipeline_steps.c_hierarchy import hierarchy 6 | from de_decisions_pipeline_steps.d_reference_areas_parse import reference_parse_areas 7 | from de_decisions_pipeline_steps.e_network import network 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument( 12 | "steps", nargs="*", default=["all"], help="select a step to perform by name" 13 | ) 14 | args = parser.parse_args() 15 | 16 | if args.steps == ["all"]: 17 | steps = ["download", "clean", "hierarchy", "references", "network"] 18 | else: 19 | steps = args.steps 20 | 21 | if "download" in steps: 22 | download() 23 | 24 | if "clean" in steps: 25 | clean() 26 | 27 | if "hierarchy" in steps: 28 | hierarchy() 29 | 30 | if "references" in steps: 31 | reference_parse_areas(regulations=False) 32 | 33 | if "network" in steps: 34 | network() 35 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/a_download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | from multiprocessing.pool import Pool 4 | 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from quantlaw.utils.files import ensure_exists 8 | from requests.adapters import HTTPAdapter 9 | from urllib3.util.retry import Retry 10 | 11 | from statics import ( 12 | DE_DECISIONS_DOWNLOAD_TOC, 13 | DE_DECISIONS_DOWNLOAD_XML, 14 | DE_DECISIONS_DOWNLOAD_ZIP, 15 | DE_DECISIONS_TEMP_DATA_PATH, 16 | ) 17 | 18 | 19 | def download_item(link_text): 20 | s = requests.Session() 21 | retries = Retry( 22 | total=10, 23 | backoff_factor=2, 24 | ) 25 | s.mount("https://", HTTPAdapter(max_retries=retries)) 26 | 27 | filename = link_text.split("/")[-1] 28 | if not os.path.isfile(f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}"): 29 | content = s.get(link_text).content 30 | with open(f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}", "wb") as f: 31 | f.write(content) 32 | 33 | 34 | def download(): 35 | ensure_exists(DE_DECISIONS_TEMP_DATA_PATH) 36 | toc = requests.get("https://www.rechtsprechung-im-internet.de/rii-toc.xml").text 37 | with open(DE_DECISIONS_DOWNLOAD_TOC, "w") as f: 38 | f.write(toc) 39 | 40 | with open(DE_DECISIONS_DOWNLOAD_TOC) as f: 41 | toc = f.read() 42 | soup = BeautifulSoup(toc, "lxml-xml") 43 | len(soup.findAll("item")) 44 | 45 | ensure_exists(DE_DECISIONS_DOWNLOAD_ZIP) 46 | items = [i.link.text for i in soup.findAll("item")] 47 | with Pool(4) as p: 48 | p.map(download_item, items) 49 | 50 | ensure_exists(DE_DECISIONS_DOWNLOAD_XML) 51 | 52 | i = 0 53 | for filename in os.listdir(DE_DECISIONS_DOWNLOAD_ZIP): 54 | if os.path.splitext(filename)[1] == ".zip": 55 | zip_ref = zipfile.ZipFile(f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}", "r") 56 | zip_ref.extractall(DE_DECISIONS_DOWNLOAD_XML) 57 | zip_ref.close() 58 | i += 1 59 | print(f"\r{i} entpackt", end="") 60 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/b_clean.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | from quantlaw.utils.beautiful_soup import save_soup 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | 8 | from de_decisions_pipeline_steps.common import get_docparts_with_p 9 | from statics import DE_DECISIONS_DOWNLOAD_XML, DE_DECISIONS_XML 10 | 11 | 12 | def clean_abs(section_tag): 13 | contents = [] 14 | 15 
| for dl in section_tag.findAll("dl"): 16 | number = dl.dt.get_text(" ").strip() 17 | number = number if len(number) else None 18 | text = dl.dd.get_text(" ").strip() 19 | indented = bool(dl.dd.p and "margin-left" in dl.dd.p.attrs.get("style", "")) 20 | if len(text): 21 | contents.append( 22 | dict( 23 | number=number, 24 | content=text, 25 | indented=indented, 26 | ) 27 | ) 28 | 29 | return contents 30 | 31 | 32 | def replace_tag_with_content(tag, contents, soup): 33 | for children in tag.contents: 34 | if type(children) is Tag: 35 | children.decompose() 36 | tag.contents = [] 37 | for content_dict in contents: 38 | p_tag = soup.new_tag("p") 39 | if content_dict["indented"]: 40 | p_tag["indented"] = str(True) 41 | if content_dict["number"]: 42 | p_tag["numbers"] = content_dict["number"] 43 | p_tag.append(soup.new_string(content_dict["content"])) 44 | tag.append(p_tag) 45 | 46 | 47 | def fix_data(decision, text): 48 | if "JURE149015016" in decision: 49 | text = text.replace("Art.l ", "Art. I ") 50 | return text 51 | 52 | 53 | def clean_decision(decision): 54 | if not os.path.exists(f"{DE_DECISIONS_XML}/{decision}"): 55 | with open(f"{DE_DECISIONS_DOWNLOAD_XML}/{decision}", encoding="utf8") as f: 56 | content = f.read() 57 | content = content.replace("\xa0", " ") 58 | soup = BeautifulSoup(content, "lxml-xml") 59 | for doc_parts in get_docparts_with_p(soup): 60 | contents = clean_abs(doc_parts) 61 | replace_tag_with_content(doc_parts, contents, soup) 62 | 63 | soup_str = fix_data(decision, str(soup)) 64 | save_soup(soup_str, f"{DE_DECISIONS_XML}/{decision}") 65 | 66 | 67 | def clean(): 68 | ensure_exists(DE_DECISIONS_XML) 69 | decisions = list_dir(DE_DECISIONS_DOWNLOAD_XML, ".xml") 70 | with multiprocessing.Pool() as p: 71 | p.map(clean_decision, decisions) 72 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/c_hierarchy.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import re 4 | 5 | from bs4 import BeautifulSoup 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | 8 | from de_decisions_pipeline_steps.common import get_docparts_with_p 9 | from statics import DE_DECISIONS_HIERARCHY, DE_DECISIONS_XML 10 | 11 | 12 | def extract_number(text, token_position=0): 13 | if len(text) == 0 or len(text.split()) <= token_position: 14 | return None, None 15 | first_token = text.split()[token_position] 16 | 17 | match = re.fullmatch(r"(([a-h])\2*)\)", first_token) 18 | if match: # a) aa) aaa) 19 | level = len(match[1]) 20 | return f"alpha-lower-bracket-{level}", match[1] 21 | 22 | match = re.fullmatch(r"\((([a-h])\2*)\)", first_token) 23 | if match: # (a) (aa) (aaa) 24 | level = len(match[1]) 25 | return f"alpha-lower-double-bracket-{level}", match[1] 26 | 27 | match = re.fullmatch(r"(([a-h])\2*)\.", first_token) 28 | if match: # a. aa. aaa. 29 | level = len(match[1]) 30 | return f"alpha-lower-dot-{level}", match[1] 31 | 32 | match = re.fullmatch(r"((X[CL]|L?X{0,3})(I[XV]|V?I{0,3}))\.?", first_token) 33 | if match: # I. II III. 34 | return "roman-upper", match[1] 35 | 36 | match = re.fullmatch(r"(\d+)\.", first_token) 37 | if match: # 1. 2. 3. 38 | return "arabic-dot", match[1] 39 | 40 | match = re.fullmatch(r"(\d+)\)", first_token) 41 | if match: 42 | return "arabic-bracket", match[1] 43 | 44 | match = re.fullmatch( 45 | r"([A-H])\.", first_token 46 | ) # Only until "H." Others are mostly false-positives and disambiguation would 47 | # be required. 
48 | if match: 49 | return "alpha-dot", match[1] 50 | 51 | match = re.fullmatch(r"([A-H])\)", first_token) 52 | if match: 53 | return "alpha-bracket", match[1] 54 | 55 | match = re.fullmatch(r"(\d(\.\d)*)\.?", first_token) 56 | if match and len(match[0]) > 1: 57 | level = len(match[1].split(".")) 58 | return f"numeric-{level}", match[1] 59 | 60 | return None, None 61 | 62 | 63 | master_order = [ 64 | "alpha-dot", 65 | "alpha-bracket", 66 | "roman-upper", 67 | "arabic-dot", 68 | "arabic-bracket", 69 | "numeric-2", 70 | "numeric-3", 71 | "numeric-4", 72 | "numeric-5", 73 | "alpha-lower-dot-1", 74 | "alpha-lower-bracket-1", 75 | "alpha-lower-dot-2", 76 | "alpha-lower-bracket-2", 77 | "alpha-lower-dot-3", 78 | "alpha-lower-bracket-3", 79 | "alpha-lower-bracket-4", 80 | "alpha-lower-double-bracket-1", 81 | "alpha-lower-double-bracket-2", 82 | "alpha-lower-double-bracket-3", 83 | "alpha-lower-double-bracket-4", 84 | ] 85 | 86 | 87 | def extract_hierarchy(decision): 88 | if not os.path.exists(f"{DE_DECISIONS_HIERARCHY}/{decision}"): 89 | with open(f"{DE_DECISIONS_XML}/{decision}", encoding="utf8") as f: 90 | soup = BeautifulSoup(f.read(), "lxml-xml") 91 | for doc_parts in get_docparts_with_p(soup): 92 | # has_numbered_ps = bool(doc_parts.find('p', attrs={'numbers': True})) 93 | for p in doc_parts.find_all("p", {"indented": str(True)}): 94 | text = p.get_text().strip() 95 | for token_position in range(3): 96 | match_type, value = extract_number(text, token_position) 97 | if match_type: 98 | if token_position == 0: 99 | p.attrs["hierarchy_num_type"] = match_type 100 | p.attrs["hierarchy_num"] = value 101 | else: 102 | if match_type == "alpha-dot": 103 | break 104 | p.attrs["hierarchy_num_type"] += "," + match_type 105 | p.attrs["hierarchy_num"] += "," + value 106 | else: 107 | break 108 | 109 | # hierarchy_num_types = list() 110 | # for p in doc_parts.find_all( 111 | # "p", attrs={"hierarchy_num_type": True} 112 | # ): 113 | # for hierarchy_num_type in p.attrs["hierarchy_num_type"].split( 114 | # "," 115 | # ): 116 | # if hierarchy_num_type not in hierarchy_num_types: 117 | # hierarchy_num_types.append(hierarchy_num_type) 118 | # 119 | # if hierarchy_num_types: 120 | # unknown_order = len( 121 | # set(hierarchy_num_types) - set(master_order) 122 | # ) 123 | # if not unknown_order: 124 | # hierarchy_num_types_ordered = sorted( 125 | # hierarchy_num_types, key=lambda x: master_order.index( 126 | # x 127 | # ) 128 | # ) 129 | # if unknown_order or tuple(hierarchy_num_types) != tuple( 130 | # hierarchy_num_types_ordered 131 | # ): 132 | # print(decision, doc_parts.name, hierarchy_num_types) 133 | 134 | nested_soup = BeautifulSoup("", "lxml-xml") 135 | assert len(soup.gertyp.get_text()), decision 136 | assert len(soup.find("entsch-datum").get_text()) == 8, decision 137 | assert len(soup.aktenzeichen.get_text()), decision 138 | assert len(soup.doktyp.get_text()), decision 139 | 140 | datum_raw = soup.find("entsch-datum").get_text() 141 | datum = f"{datum_raw[:4]}-{datum_raw[4:6]}-{datum_raw[6:]}" 142 | 143 | nested_soup.append( 144 | nested_soup.new_tag( 145 | "document", 146 | gericht=soup.gertyp.get_text(), 147 | datum=datum, 148 | az=soup.aktenzeichen.get_text(), 149 | doktyp=soup.doktyp.get_text(), 150 | ) 151 | ) 152 | 153 | if len(soup.spruchkoerper.get_text()): 154 | nested_soup.document.attrs["spruchkoerper"] = soup.spruchkoerper.get_text() 155 | 156 | if len(soup.norm.get_text()): 157 | nested_soup.document.append(nested_soup.new_tag("norm")) 158 | 
nested_soup.norm.append(nested_soup.new_string(soup.norm.get_text(" "))) 159 | 160 | for doc_part in [ 161 | soup.tenor, 162 | soup.tatbestand, 163 | soup.entscheidungsgruende, 164 | soup.gruende, 165 | soup.abwmeinung, 166 | soup.sonstlt, 167 | ]: 168 | if not len(doc_part.get_text()): 169 | continue 170 | item = nested_soup.new_tag("item", heading=doc_part.name) 171 | nested_soup.document.append(item) 172 | 173 | open_tags = [dict(tag=item, level=-1)] 174 | text_tag = None 175 | for p in doc_part.find_all("p"): 176 | if text_tag and "indented" in p.attrs: 177 | assert p.attrs["indented"] == "True" 178 | text_tag.append(" " + nested_soup.new_string(p.get_text(" "))) 179 | continue 180 | 181 | if "hierarchy_num_type" in p.attrs: 182 | for num_type, num in zip( 183 | p.attrs["hierarchy_num_type"].split(","), 184 | p.attrs["hierarchy_num"].split(","), 185 | ): 186 | current_level = master_order.index(num_type) 187 | while open_tags[-1]["level"] >= current_level: 188 | open_tags.pop() 189 | 190 | item = nested_soup.new_tag("item", heading=num) 191 | open_tags[-1]["tag"].append(item) 192 | open_tags.append(dict(tag=item, level=current_level)) 193 | 194 | text_tag = nested_soup.new_tag("text") 195 | text_tag.append(nested_soup.new_string(p.get_text(" "))) 196 | seqitem = nested_soup.new_tag("seqitem") 197 | seqitem.append(text_tag) 198 | open_tags[-1]["tag"].append(seqitem) 199 | 200 | decision_id = decision.split(".")[0] 201 | nodeid_counter = 0 202 | for tag in nested_soup.find_all(["document", "item", "seqitem"]): 203 | tag.attrs["key"] = f"{decision_id}_{nodeid_counter:06d}" 204 | nodeid_counter += 1 205 | tag.attrs["level"] = ( 206 | 0 if tag.name == "document" else tag.parent.attrs["level"] + 1 207 | ) 208 | 209 | with open(f"{DE_DECISIONS_HIERARCHY}/{decision}", "w", encoding="utf8") as f: 210 | f.write(str(nested_soup)) 211 | 212 | 213 | def hierarchy(): 214 | ensure_exists(DE_DECISIONS_HIERARCHY) 215 | decisions = list_dir(DE_DECISIONS_XML, ".xml") 216 | 217 | with multiprocessing.Pool() as p: 218 | p.map(extract_hierarchy, decisions) 219 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/common.py: -------------------------------------------------------------------------------- 1 | def get_docparts_with_p(soup): 2 | return [ 3 | soup.titelzeile, 4 | soup.leitsatz, 5 | soup.sonstosatz, 6 | soup.tenor, 7 | soup.tatbestand, 8 | soup.entscheidungsgruende, 9 | soup.gruende, 10 | soup.sonstlt, 11 | soup.abwmeinung, 12 | ] 13 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/d_reference_areas_parse.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | import traceback 5 | 6 | from bs4 import BeautifulSoup 7 | from quantlaw.de_extract.statutes_areas import StatutesExtractor 8 | from quantlaw.de_extract.statutes_parse import StatutesParser 9 | from quantlaw.utils.beautiful_soup import save_soup 10 | from quantlaw.utils.files import ensure_exists, list_dir 11 | 12 | from statics import ( 13 | DE_DECISIONS_HIERARCHY, 14 | DE_DECISIONS_REFERENCE_AREAS, 15 | DE_DECISIONS_REFERENCE_PARSED_XML, 16 | ) 17 | from statutes_pipeline_steps.de_reference_areas import find_references_in_soup 18 | from statutes_pipeline_steps.de_reference_parse import ( 19 | identify_lawreference_law_name_in_soup, 20 | identify_reference_law_name_in_soup, 21 | parse_reference_content_in_soup, 22 | ) 23 | from 
utils.common import get_stemmed_law_names, load_law_names_compiled 24 | 25 | 26 | def get_lawnames_date(requested_date): 27 | requested_date = requested_date.replace("-", "") 28 | lookup_date = None 29 | for date in sorted(law_names): 30 | if date <= requested_date: 31 | lookup_date = date 32 | else: 33 | break 34 | if not lookup_date: 35 | raise Exception(f"No lawnames for {lookup_date} not found.") 36 | return lookup_date 37 | 38 | 39 | def find_references(decision): 40 | try: 41 | logs = [] 42 | areas_exists = os.path.exists(f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}") 43 | parsed_exists = os.path.exists( 44 | f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}" 45 | ) 46 | 47 | if not (areas_exists and parsed_exists): # General preparation 48 | with open(f"{DE_DECISIONS_HIERARCHY}/{decision}", encoding="utf8") as f: 49 | file_content = f.read() 50 | file_content = file_content.replace( 51 | "', 53 | "", 54 | ) 55 | soup = BeautifulSoup(file_content, "lxml-xml") 56 | 57 | # Get Entscheidungsdatum 58 | date = get_lawnames_date(soup.document.attrs["datum"]) 59 | 60 | # Get laws in effect at time of decision 61 | laws_lookup = get_stemmed_law_names(date, law_names) 62 | parser = StatutesParser(laws_lookup) 63 | extractor = StatutesExtractor(laws_lookup) 64 | 65 | if not areas_exists: 66 | logs.append( 67 | find_references_in_soup( 68 | soup, 69 | extractor, 70 | para=0, 71 | art=0, 72 | text_tag_name=["text", "norm"], 73 | ) 74 | # set para and atr to 0 that refernece with naming a law are ignored. 75 | ) 76 | save_soup(soup, f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}") 77 | 78 | if not parsed_exists: 79 | with open( 80 | f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}", encoding="utf8" 81 | ) as f: 82 | soup = BeautifulSoup(f.read(), "lxml-xml") 83 | parse_reference_content_in_soup(soup, parser, decision) 84 | identify_reference_law_name_in_soup( 85 | soup, parser, current_lawid=None, skip_errors=True 86 | ) 87 | identify_lawreference_law_name_in_soup(soup, laws_lookup) 88 | 89 | save_soup(soup, f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}") 90 | except Exception: 91 | print("-----", decision, "-----") 92 | the_type, the_value, the_traceback = sys.exc_info() 93 | traceback.print_exception(the_type, the_value, the_traceback) 94 | raise 95 | 96 | 97 | def reference_parse_areas(regulations): 98 | global law_names 99 | law_names = load_law_names_compiled(regulations) 100 | ensure_exists(DE_DECISIONS_REFERENCE_AREAS) 101 | ensure_exists(DE_DECISIONS_REFERENCE_PARSED_XML) 102 | decisions = list_dir(DE_DECISIONS_HIERARCHY, ".xml") 103 | with multiprocessing.Pool() as p: 104 | p.map(find_references, decisions) 105 | 106 | 107 | # # REgZ extractor 108 | # 109 | # pattern = re.compile( 110 | # r'(?:' 111 | # r'(?:B\s+)?' 112 | # r'(?:\d?/?\d+|I?(?:X|V|I)+I*a?)' 113 | # r'\s+' 114 | # r'(?:B\s+)?' 115 | # r')?' 116 | # r'(' 117 | # r'[A-Za-z\-Ü]+' 118 | # r'(?:\s*\((?:pat|Brfg|B|R|VZ|P|VS|Vs|Ü)\))?' 119 | # r')' 120 | # r'\s*' 121 | # r'(?:\d+\s*(?:,|\-|und)\s*)?' 122 | # r'\d+\/\d+a?' 123 | # r'(?:\s+\(PKH\)|\s+\(E[PU]\b\)?|\s+\(?[BRSKFCAD][LRH]?\)?)?' 124 | # r'(?:\s+\-\s+Vz\s+\d+/\d+)?' 125 | # r'\)?' 126 | # r'(?:\s+\(vormals\s.+)?' 
127 | # ) 128 | # keys = list(azs) 129 | # az_splitted = [ 130 | # [ 131 | # az 132 | # for az in re.split( 133 | # r'\s+hinzuverb\.,\s+|,?\s*\(?\bverb\.\s*mi?t?\b\.?,?\s*|(?:,?\s+und|,?\s+zu|,),?\s(?!\d+/\d)(?:hinzuverb\.\s+)?|\s\((?=(?:\d+|I?(?:X|V|I)+I*a?)+\s+[A-Z]+\s+\d+/\d+)|\szu(?=\d\s)', 134 | # azs[k]) 135 | # ] 136 | # for k in keys 137 | # ] 138 | # 139 | # regZ = [ 140 | # [ 141 | # pattern.fullmatch(az) 142 | # for az in az_list 143 | # ] 144 | # for az_list in az_splitted 145 | # ] 146 | # regZ = [ 147 | # [ 148 | # match and match.group(1) 149 | # for match in match_list 150 | # ] 151 | # for match_list in regZ 152 | # ] 153 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/e_network.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import multiprocessing 4 | import re 5 | 6 | import networkx as nx 7 | from quantlaw.utils.beautiful_soup import create_soup 8 | from quantlaw.utils.files import list_dir 9 | from quantlaw.utils.networkx import multi_to_weighted 10 | 11 | from statics import DE_DECISIONS_NETWORK, DE_DECISIONS_REFERENCE_PARSED_XML 12 | 13 | 14 | def count_characters(text, whites=False): 15 | if whites: 16 | return len(text) 17 | else: 18 | return len(re.sub(r"\s", "", text)) 19 | 20 | 21 | def count_tokens(text, unique=False): 22 | if not unique: 23 | return len(text.split()) 24 | else: 25 | return len(set(text.split())) 26 | 27 | 28 | def get_graph_data_from_decision(decision): 29 | try: 30 | soup = create_soup(f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}") 31 | items = list(soup.find_all(["document", "item", "seqitem"])) 32 | node_dicts = [] 33 | containment_edges = [] 34 | 35 | for item in items: 36 | node_dict = dict( 37 | key=item.attrs["key"], 38 | heading=item.attrs.get("heading", ""), 39 | level=int(item.attrs["level"]), 40 | type=item.name, 41 | ) 42 | 43 | text = item.get_text(" ") 44 | node_dict["chars_n"] = count_characters(text, whites=True) 45 | node_dict["chars_nowhites"] = count_characters(text, whites=False) 46 | node_dict["tokens_n"] = count_tokens(text, unique=False) 47 | node_dict["tokens_unique"] = count_tokens(text, unique=True) 48 | 49 | if item.name == "document": 50 | for key in ["az", "gericht", "datum", "doktyp", "spruchkoerper"]: 51 | node_dict[key] = item.attrs.get(key, "") 52 | parent_key = "root" 53 | else: 54 | node_dict["parent_key"] = item.parent.attrs["key"] 55 | parent_key = item.parent.attrs["key"] 56 | 57 | node_dicts.append(node_dict) 58 | containment_edges.append((parent_key, item.attrs["key"])) 59 | 60 | reference_edges = [] 61 | for item in items: 62 | for node in item.find_all("reference"): 63 | if ( 64 | node.lawname 65 | and "parsed" in node.attrs 66 | and node.lawname.get("type") 67 | in [ 68 | "dict", 69 | "sgb", 70 | ] 71 | ): 72 | refs = json.loads(node.attrs["parsed"]) 73 | for ref in refs: 74 | ref_key = "_".join(ref[:2]) 75 | reference_edges.append((item.attrs["key"], ref_key)) 76 | except Exception: 77 | print(decision) 78 | raise 79 | 80 | return node_dicts, containment_edges, reference_edges 81 | 82 | 83 | def network(): 84 | decisions = list_dir(DE_DECISIONS_REFERENCE_PARSED_XML, ".xml") 85 | with multiprocessing.Pool() as p: 86 | results = p.map(get_graph_data_from_decision, decisions) 87 | 88 | node_dicts = list(itertools.chain.from_iterable([x[0] for x in results])) 89 | containment_edges = list(itertools.chain.from_iterable([x[1] for x in results])) 90 | 
reference_edges = list(itertools.chain.from_iterable([x[2] for x in results])) 91 | 92 | hierarchy_G = nx.DiGraph() 93 | hierarchy_G.add_node("root", level=-1, key="root", bipartite="decision") 94 | hierarchy_G.add_nodes_from( 95 | [(x["key"], x) for x in node_dicts], bipartite="decision" 96 | ) 97 | hierarchy_G.add_edges_from(containment_edges, edge_type="containment") 98 | 99 | reference_G = nx.MultiDiGraph(hierarchy_G) 100 | print("Hierarchy graph created") 101 | reference_G.add_nodes_from( 102 | sorted({x[-1] for x in reference_edges}), bipartite="statute" 103 | ) 104 | print("Statute nodes added") 105 | reference_G.add_edges_from(reference_edges, edge_type="reference") 106 | print("Reference edges added") 107 | 108 | reference_weighted_G = multi_to_weighted(reference_G) 109 | 110 | nx.write_gpickle(reference_weighted_G, DE_DECISIONS_NETWORK) 111 | -------------------------------------------------------------------------------- /download_de_gesetze_im_internet_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing.pool import Pool 4 | 5 | from git import Git, Repo 6 | 7 | from statics import DE_ORIGINAL_PATH 8 | from utils.simplify_gii_xml import simplify_gii_xml 9 | 10 | REPO_PATH = "../gesetze-im-internet" 11 | REPO_PARENT_PATH = "../" 12 | ITEMS_PATH = f"{REPO_PATH}/data/items/" 13 | 14 | GII_REPO_URL = "https://github.com/QuantLaw/gesetze-im-internet.git" 15 | 16 | 17 | def copy_and_simplify_file(xml_file): 18 | doknr = xml_file.split(".")[0] 19 | file_path = os.path.join(ITEMS_PATH, folder, xml_file) 20 | stripped_date = date.replace("-", "") 21 | target_file = os.path.join( 22 | DE_ORIGINAL_PATH, f"{doknr}_{stripped_date}_{stripped_date}.xml" 23 | ) 24 | simplify_gii_xml(file_path, target_file) 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "-d", "--dates", nargs="*", help="List dates in format YYYY-mm-dd" 31 | ) 32 | parser.add_argument( 33 | "-i", 34 | "--ignore-not-found", 35 | help="Ignore that some files are not included in this snapshot", 36 | action="store_true", 37 | ) 38 | 39 | args = parser.parse_args() 40 | 41 | if os.path.exists(REPO_PATH): 42 | print(f"Please remove the folder {REPO_PATH}") 43 | exit(1) 44 | 45 | Git(REPO_PARENT_PATH).clone(GII_REPO_URL) 46 | 47 | repo = Repo(REPO_PATH) 48 | available_dates = [d.name for d in repo.tags] 49 | 50 | if not args.dates: 51 | print("Please choose dates to import\nOptions:\n") 52 | for t in available_dates: 53 | print(t) 54 | exit(1) 55 | 56 | for date in args.dates: 57 | if date not in available_dates: 58 | raise Exception(f"{date} is not available") 59 | 60 | git = Git(REPO_PATH) 61 | os.makedirs(DE_ORIGINAL_PATH) 62 | 63 | for date in args.dates: 64 | git.checkout(date) 65 | 66 | with open(f"{REPO_PATH}/data/not_found.txt") as f: 67 | not_found = f.read() 68 | 69 | if not args.ignore_not_found and len(not_found.strip()): 70 | raise Exception( 71 | f"Some files are not included in snapshot {date}. " 72 | f"Use another snapshot or --ignore-not-found" 73 | ) 74 | 75 | for folder in [ 76 | f for f in os.listdir(ITEMS_PATH) if os.path.isdir(ITEMS_PATH + f) 77 | ]: 78 | folder_path = ITEMS_PATH + folder 79 | xml_files = [f for f in os.listdir(folder_path) if f.endswith(".xml")] 80 | 81 | with Pool() as p: 82 | p.map(copy_and_simplify_file, xml_files) 83 | 84 | print(date, "imported") 85 | 86 | print(f"Done. 
You may now remove `{REPO_PATH}`") 87 | -------------------------------------------------------------------------------- /download_us_code_data.py: -------------------------------------------------------------------------------- 1 | import re 2 | import shutil 3 | from multiprocessing.pool import Pool 4 | from zipfile import ZipFile 5 | 6 | import requests 7 | from bs4 import BeautifulSoup 8 | from quantlaw.utils.files import ensure_exists 9 | 10 | from statics import US_INPUT_PATH 11 | 12 | INDEX_URL = ( 13 | "https://uscode.house.gov/download/annualhistoricalarchives/downloadxhtml.shtml" 14 | ) 15 | 16 | DOWNLOAD_BASE_URL = "https://uscode.house.gov/download/annualhistoricalarchives/" 17 | 18 | 19 | def download(ref): 20 | year = re.match(r"XHTML/(\d+)\.zip", ref)[1] 21 | print("loading", year) 22 | r = requests.get(DOWNLOAD_BASE_URL + ref, stream=True) 23 | if r.status_code == 200: 24 | zip_path = f"{US_INPUT_PATH}/{year}.zip" 25 | with open(zip_path, "wb") as f: 26 | r.raw.decode_content = True 27 | shutil.copyfileobj(r.raw, f) 28 | 29 | with ZipFile(zip_path) as f: 30 | f.extractall(US_INPUT_PATH) 31 | 32 | 33 | if __name__ == "__main__": 34 | response = requests.get(INDEX_URL) 35 | soup = BeautifulSoup(str(response.content), "lxml") 36 | refs = [] 37 | for s_string in soup.find_all(text=" zip file]"): 38 | a_tag = s_string.parent 39 | assert a_tag.name == "a" 40 | refs.append(a_tag.attrs["href"]) 41 | 42 | ensure_exists(US_INPUT_PATH) 43 | 44 | with Pool(4) as p: 45 | p.map(download, sorted(refs)) 46 | -------------------------------------------------------------------------------- /download_us_reg_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from multiprocessing.pool import Pool 4 | 5 | import requests 6 | from quantlaw.utils.files import ensure_exists 7 | 8 | from statics import US_REG_INPUT_PATH 9 | 10 | DOWNLOAD_BASE_URL = "https://www.govinfo.gov/bulkdata/CFR/{}/CFR-{}.zip" 11 | 12 | 13 | def download(year): 14 | zip_path = f"{US_REG_INPUT_PATH}/{year}.zip" 15 | if not os.path.exists(zip_path): 16 | print("loading", year) 17 | r = requests.get(DOWNLOAD_BASE_URL.format(year, year), stream=True) 18 | if r.status_code == 200: 19 | with open(zip_path, "wb") as f: 20 | r.raw.decode_content = True 21 | shutil.copyfileobj(r.raw, f) 22 | print("downloaded", year) 23 | 24 | 25 | if __name__ == "__main__": 26 | 27 | ensure_exists(US_REG_INPUT_PATH) 28 | with Pool(4) as p: 29 | p.map(download, list(range(1996, 2020 + 1))) 30 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | black 3 | coverage 4 | flake8 5 | isort 6 | lxml 7 | networkx==2.4 8 | numpy 9 | pandas 10 | pre-commit 11 | quantlaw 12 | regex 13 | requests 14 | textdistance 15 | tqdm 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | beautifulsoup4==4.9.3 3 | black==20.8b1 4 | certifi==2020.12.5 5 | cfgv==3.2.0 6 | chardet==4.0.0 7 | click==7.1.2 8 | coverage==5.3.1 9 | decorator==4.4.2 10 | distlib==0.3.1 11 | filelock==3.0.12 12 | flake8==3.8.4 13 | identify==1.5.11 14 | idna==2.10 15 | importlib-metadata==3.3.0 16 | isort==5.7.0 17 | lxml==4.6.2 18 | mccabe==0.6.1 19 | mypy-extensions==0.4.3 20 | networkx==2.4 21 | nodeenv==1.5.0 22 | 
numpy==1.19.4 23 | pandas==1.2.0 24 | pathspec==0.8.1 25 | pre-commit==2.9.3 26 | pycodestyle==2.6.0 27 | pyflakes==2.2.0 28 | python-dateutil==2.8.1 29 | pytz==2020.5 30 | PyYAML==5.3.1 31 | quantlaw==0.0.5 32 | regex==2020.11.13 33 | requests==2.25.1 34 | six==1.15.0 35 | soupsieve==2.1 36 | textdistance==4.2.0 37 | toml==0.10.2 38 | tqdm==4.55.1 39 | typed-ast==1.4.2 40 | typing-extensions==3.7.4.3 41 | urllib3==1.26.2 42 | virtualenv==20.2.2 43 | zipp==3.4.0 44 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E203, W503, E131 4 | exclude = .git,__pycache__,build,dist,venv 5 | -------------------------------------------------------------------------------- /statics.py: -------------------------------------------------------------------------------- 1 | ALL_YEARS = list(range(1994, 2020)) 2 | ALL_YEARS_REG = list(range(1998, 2020)) 3 | 4 | DATA_PATH = "../legal-networks-data" 5 | US_DATA_PATH = f"{DATA_PATH}/us" 6 | US_TEMP_DATA_PATH = "temp/us" 7 | 8 | US_INPUT_PATH = f"{US_DATA_PATH}/1_input" 9 | US_ORIGINAL_PATH = f"{US_TEMP_DATA_PATH}/11_htm" 10 | US_XML_PATH = f"{US_TEMP_DATA_PATH}/12_xml" 11 | US_REFERENCE_AREAS_PATH = f"{US_TEMP_DATA_PATH}/13_reference_areas" 12 | US_REFERENCE_PARSED_PATH = f"{US_DATA_PATH}/2_xml" 13 | US_HIERARCHY_GRAPH_PATH = f"{US_DATA_PATH}/3_hierarchy_graph" 14 | US_CROSSREFERENCE_LOOKUP_PATH = f"{US_TEMP_DATA_PATH}/31_crossreference_lookup" 15 | US_CROSSREFERENCE_EDGELIST_PATH = f"{US_TEMP_DATA_PATH}/32_crossreference_edgelist" 16 | US_CROSSREFERENCE_GRAPH_PATH = f"{US_DATA_PATH}/4_crossreference_graph" 17 | US_SNAPSHOT_MAPPING_INDEX_PATH = f"{US_TEMP_DATA_PATH}/41_snapshot_mapping_index" 18 | US_SNAPSHOT_MAPPING_EDGELIST_PATH = f"{US_DATA_PATH}/5_snapshot_mapping_edgelist" 19 | 20 | US_HELPERS_PATH = f"{US_TEMP_DATA_PATH}/helpers" 21 | US_REFERENCE_AREAS_LOG_PATH = f"{US_HELPERS_PATH}/us_extract_reference_areas.log" 22 | US_REFERENCE_PARSED_LOG_PATH = f"{US_HELPERS_PATH}/us_extract_reference_parsed.log" 23 | 24 | US_REG_DATA_PATH = f"{DATA_PATH}/us_reg" 25 | US_REG_TEMP_DATA_PATH = "temp/us_reg" 26 | 27 | US_REG_INPUT_PATH = f"{US_REG_DATA_PATH}/1_input" 28 | US_REG_INPUT_COPY_LOG_PATH = f"{US_REG_DATA_PATH}/1_input_copy_log.csv" 29 | US_REG_ORIGINAL_PATH = f"{US_REG_TEMP_DATA_PATH}/11_htm" 30 | US_REG_XML_PATH = f"{US_REG_TEMP_DATA_PATH}/12_xml" 31 | US_REG_REFERENCE_AREAS_PATH = f"{US_REG_TEMP_DATA_PATH}/13_reference_areas" 32 | US_REG_REFERENCE_PARSED_PATH = f"{US_REG_DATA_PATH}/2_xml" 33 | US_REG_HIERARCHY_GRAPH_PATH = f"{US_REG_DATA_PATH}/3_hierarchy_graph" 34 | US_REG_CROSSREFERENCE_LOOKUP_PATH = f"{US_REG_TEMP_DATA_PATH}/31_crossreference_lookup" 35 | US_REG_CROSSREFERENCE_EDGELIST_PATH = ( 36 | f"{US_REG_TEMP_DATA_PATH}/32_crossreference_edgelist" 37 | ) 38 | US_REG_AUTHORITY_EDGELIST_PATH = f"{US_REG_TEMP_DATA_PATH}/33_authority_edgelist" 39 | US_REG_CROSSREFERENCE_GRAPH_PATH = f"{US_REG_DATA_PATH}/4_crossreference_graph" 40 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH = ( 41 | f"{US_REG_TEMP_DATA_PATH}/41_snapshot_mapping_index" 42 | ) 43 | US_REG_SNAPSHOT_MAPPING_EDGELIST_PATH = ( 44 | f"{US_REG_DATA_PATH}/5_snapshot_mapping_edgelist" 45 | ) 46 | 47 | US_REG_HELPERS_PATH = f"{US_REG_TEMP_DATA_PATH}/helpers" 48 | US_REG_REFERENCE_AREAS_LOG_PATH = ( 49 | f"{US_REG_HELPERS_PATH}/us_extract_reference_areas.log" 50 | ) 51 | US_REG_REFERENCE_PARSED_LOG_PATH = ( 52 | 
f"{US_REG_HELPERS_PATH}/us_extract_reference_parsed.log" 53 | ) 54 | 55 | 56 | DE_DATA_PATH = f"{DATA_PATH}/de" 57 | DE_TEMP_DATA_PATH = "temp/de" 58 | 59 | JURIS_EXPORT_PATH = f"{DE_DATA_PATH}/1_juris_gii_xml" 60 | JURIS_EXPORT_GESETZE_LIST_PATH = f"{DE_DATA_PATH}/1_juris_gii_xml_gesetze.txt" 61 | JURIS_EXPORT_RVO_LIST_PATH = f"{DE_DATA_PATH}/1_juris_gii_xml_rvo.txt" 62 | 63 | DE_ORIGINAL_PATH = f"{DE_TEMP_DATA_PATH}/11_gii_xml" 64 | DE_XML_PATH = f"{DE_TEMP_DATA_PATH}/12_xml" 65 | DE_LAW_NAMES_PATH = f"{DE_TEMP_DATA_PATH}/12_xml_law_names.csv" 66 | DE_LAW_NAMES_COMPILED_PATH = f"{DE_TEMP_DATA_PATH}/12_xml_law_names_compiled.pickle" 67 | DE_REFERENCE_AREAS_PATH = f"{DE_TEMP_DATA_PATH}/13_reference_areas" 68 | DE_REFERENCE_PARSED_PATH = f"{DE_DATA_PATH}/2_xml" 69 | DE_HIERARCHY_GRAPH_PATH = f"{DE_DATA_PATH}/3_hierarchy_graph" 70 | DE_CROSSREFERENCE_LOOKUP_PATH = f"{DE_TEMP_DATA_PATH}/31_crossreference_lookup" 71 | DE_CROSSREFERENCE_EDGELIST_PATH = f"{DE_TEMP_DATA_PATH}/32_crossreference_edgelist" 72 | DE_CROSSREFERENCE_GRAPH_PATH = f"{DE_DATA_PATH}/4_crossreference_graph" 73 | DE_SNAPSHOT_MAPPING_INDEX_PATH = f"{DE_TEMP_DATA_PATH}/41_snapshot_mapping_index" 74 | DE_SNAPSHOT_MAPPING_EDGELIST_PATH = f"{DE_DATA_PATH}/5_snapshot_mapping_edgelist" 75 | 76 | DE_HELPERS_PATH = f"{DE_TEMP_DATA_PATH}/helpers" 77 | DE_REFERENCE_AREAS_LOG_PATH = f"{DE_HELPERS_PATH}/de_extract_reference_areas.log" 78 | DE_REFERENCE_PARSED_LOG_PATH = f"{DE_HELPERS_PATH}/de_extract_reference_parsed.log" 79 | 80 | DE_DECISIONS_DATA_PATH = f"{DATA_PATH}/de_decisions" 81 | DE_DECISIONS_TEMP_DATA_PATH = "temp/de_decisions" 82 | 83 | DE_DECISIONS_DOWNLOAD_TOC = f"{DE_DECISIONS_TEMP_DATA_PATH}/de_rii_toc.xml" 84 | DE_DECISIONS_DOWNLOAD_ZIP = f"{DE_DECISIONS_DATA_PATH}/0_input" 85 | DE_DECISIONS_DOWNLOAD_XML = f"{DE_DECISIONS_TEMP_DATA_PATH}/00_xml" 86 | DE_DECISIONS_XML = f"{DE_DECISIONS_TEMP_DATA_PATH}/01_xml_cleaned" 87 | DE_DECISIONS_HIERARCHY = f"{DE_DECISIONS_TEMP_DATA_PATH}/02_hierarchy" 88 | DE_DECISIONS_REFERENCE_AREAS = f"{DE_DECISIONS_TEMP_DATA_PATH}/03_reference_areas" 89 | DE_DECISIONS_REFERENCE_PARSED_XML = f"{DE_DECISIONS_DATA_PATH}/1_xml" 90 | DE_DECISIONS_NETWORK = f"{DE_DECISIONS_DATA_PATH}/2_network.gpickle.gz" 91 | 92 | DE_REG_DATA_PATH = f"{DATA_PATH}/de_reg" 93 | DE_REG_TEMP_DATA_PATH = "temp/de_reg" 94 | 95 | DE_REG_ORIGINAL_PATH = f"{DE_REG_TEMP_DATA_PATH}/11_gii_xml" 96 | 97 | DE_REG_DATA_PATH = f"{DATA_PATH}/de_reg" 98 | DE_REG_TEMP_DATA_PATH = "temp/de_reg" 99 | 100 | DE_REG_ORIGINAL_PATH = f"{DE_REG_TEMP_DATA_PATH}/11_gii_xml" 101 | DE_REG_XML_PATH = f"{DE_REG_TEMP_DATA_PATH}/12_xml" 102 | DE_REG_LAW_NAMES_COMPILED_PATH = ( 103 | f"{DE_REG_TEMP_DATA_PATH}/12_xml_law_names_compiled.pickle" 104 | ) 105 | DE_REG_LAW_NAMES_PATH = f"{DE_REG_TEMP_DATA_PATH}/12_xml_law_names.csv" 106 | DE_REG_REFERENCE_AREAS_PATH = f"{DE_REG_TEMP_DATA_PATH}/13_reference_areas" 107 | DE_REG_REFERENCE_PARSED_PATH = f"{DE_REG_DATA_PATH}/2_xml" 108 | DE_REG_HIERARCHY_GRAPH_PATH = f"{DE_REG_DATA_PATH}/3_hierarchy_graph" 109 | DE_REG_CROSSREFERENCE_LOOKUP_PATH = f"{DE_REG_TEMP_DATA_PATH}/31_crossreference_lookup" 110 | DE_REG_CROSSREFERENCE_EDGELIST_PATH = ( 111 | f"{DE_REG_TEMP_DATA_PATH}/32_crossreference_edgelist" 112 | ) 113 | DE_REG_AUTHORITY_EDGELIST_PATH = f"{DE_REG_TEMP_DATA_PATH}/33_authority_edgelist" 114 | DE_REG_CROSSREFERENCE_GRAPH_PATH = f"{DE_REG_DATA_PATH}/4_crossreference_graph" 115 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH = ( 116 | f"{DE_REG_TEMP_DATA_PATH}/41_snapshot_mapping_index" 117 | ) 118 | 
DE_REG_SNAPSHOT_MAPPING_EDGELIST_PATH = ( 119 | f"{DE_REG_DATA_PATH}/5_snapshot_mapping_edgelist" 120 | ) 121 | 122 | DE_REG_HELPERS_PATH = f"{DE_REG_TEMP_DATA_PATH}/helpers" 123 | DE_REG_REFERENCE_AREAS_LOG_PATH = ( 124 | f"{DE_REG_HELPERS_PATH}/de_extract_reference_areas.log" 125 | ) 126 | DE_REG_REFERENCE_PARSED_LOG_PATH = ( 127 | f"{DE_REG_HELPERS_PATH}/de_extract_reference_parsed.log" 128 | ) 129 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantLaw/legal-data-preprocessing/4264cd630b13e3d3bb934d4abd73b5b98217873c/statutes_pipeline_steps/__init__.py -------------------------------------------------------------------------------- /statutes_pipeline_steps/crossreference_graph.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | 4 | import networkx as nx 5 | import pandas as pd 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | from quantlaw.utils.networkx import load_graph_from_csv_files 8 | 9 | from utils.common import RegulationsPipelineStep, get_snapshot_law_list, load_law_names 10 | 11 | 12 | class CrossreferenceGraphStep(RegulationsPipelineStep): 13 | max_number_of_processes = min(2, max(multiprocessing.cpu_count() - 2, 1)) 14 | 15 | def __init__( 16 | self, 17 | source, 18 | source_regulation, 19 | destination, 20 | edgelist_folder, 21 | dataset, 22 | authority_edgelist_folder, 23 | *args, 24 | **kwargs, 25 | ): 26 | self.source = source 27 | self.source_regulation = source_regulation 28 | self.destination = destination 29 | self.edgelist_folder = edgelist_folder 30 | self.dataset = dataset 31 | self.authority_edgelist_folder = authority_edgelist_folder 32 | super().__init__(*args, **kwargs) 33 | 34 | def get_items(self, overwrite, snapshots) -> list: 35 | ensure_exists(self.destination + "/seqitems") 36 | if not snapshots: 37 | snapshots = sorted( 38 | set( 39 | [ 40 | os.path.splitext(x)[0] 41 | for x in list_dir(self.edgelist_folder, ".csv") 42 | ] 43 | ) 44 | ) 45 | 46 | if not overwrite: 47 | existing_files = list_dir( 48 | os.path.join(self.destination, "seqitems"), ".gpickle.gz" 49 | ) 50 | snapshots = list( 51 | filter( 52 | lambda year: f"{year}.gpickle.gz" not in existing_files, snapshots 53 | ) 54 | ) 55 | 56 | if not len(snapshots): 57 | return [] 58 | 59 | if self.dataset == "us": 60 | files = [] 61 | for snapshot in snapshots: 62 | statute_files = [ 63 | f"{self.source}/subseqitems/{x}" 64 | for x in os.listdir(os.path.join(self.source, "subseqitems")) 65 | if str(snapshot) in x 66 | ] 67 | regulation_files = ( 68 | [ 69 | f"{self.source_regulation}/subseqitems/{x}" 70 | for x in os.listdir( 71 | os.path.join(self.source_regulation, "subseqitems") 72 | ) 73 | if str(snapshot) in x 74 | ] 75 | if self.regulations 76 | else None 77 | ) 78 | files.append( 79 | ( 80 | snapshot, 81 | statute_files, 82 | regulation_files, 83 | ) 84 | ) 85 | else: # is DE 86 | files = [] 87 | law_names_data = load_law_names(self.regulations) 88 | for snapshot in snapshots: 89 | graph_files = get_snapshot_law_list(snapshot, law_names_data) 90 | files.append( 91 | ( 92 | snapshot, 93 | [ 94 | f'{self.source}/subseqitems/{x.replace(".xml", ".gpickle")}' 95 | for x in graph_files 96 | ], 97 | None, 98 | ) 99 | ) 100 | 101 | return files 102 | 103 | def execute_item(self, item): 104 | year, files, files_regulations = 
item 105 | 106 | if self.regulations and files_regulations: 107 | files += files_regulations 108 | 109 | node_columns = [ 110 | "key", 111 | "level", 112 | "citekey", 113 | "parent_key", 114 | "type", 115 | "document_type", 116 | "heading", 117 | "law_name", 118 | "chars_n", 119 | "chars_nowhites", 120 | "tokens_n", 121 | "tokens_unique", 122 | "abbr_1", 123 | "abbr_2", 124 | "subject_areas", 125 | "legislators", 126 | "contributors", 127 | "texts_tokens_n", 128 | "texts_chars_n", 129 | ] 130 | edge_columns = ["u", "v", "edge_type"] 131 | 132 | nodes_csv_path = f"{self.destination}/{year}.nodes.csv.gz" 133 | edges_csv_path = f"{self.destination}/{year}.edges.csv.gz" 134 | 135 | pd.DataFrame( 136 | [dict(level=-1, key="root", law_name="root")], columns=node_columns 137 | ).to_csv( 138 | nodes_csv_path, 139 | header=True, 140 | index=False, 141 | columns=node_columns, 142 | ) 143 | 144 | pd.DataFrame([], columns=edge_columns).to_csv( 145 | edges_csv_path, 146 | header=True, 147 | index=False, 148 | columns=edge_columns, 149 | ) 150 | 151 | for file in files: 152 | nG = nx.read_gpickle(file) 153 | nx.set_node_attributes(nG, nG.graph.get("name", file), name="law_name") 154 | 155 | nodes_df = pd.DataFrame( 156 | [d for n, d in nG.nodes(data=True)], columns=node_columns 157 | ) 158 | 159 | if self.dataset.lower() == "us": 160 | nodes_df["document_type"] = [ 161 | "regulation" if key.startswith("cfr") else "statute" 162 | for key in nodes_df.key 163 | ] 164 | 165 | nodes_df.to_csv( 166 | nodes_csv_path, 167 | header=False, 168 | index=False, 169 | columns=node_columns, 170 | mode="a", 171 | ) 172 | 173 | edges_df = pd.DataFrame( 174 | [dict(u=u, v=v, edge_type="containment") for u, v in nG.edges()], 175 | columns=edge_columns, 176 | ) 177 | 178 | for idx, row in nodes_df[nodes_df.level == 0].iterrows(): 179 | edges_df = edges_df.append( 180 | [dict(u="root", v=row.key, edge_type="containment")] 181 | ) 182 | 183 | edges_df.to_csv( 184 | edges_csv_path, 185 | header=False, 186 | index=False, 187 | columns=edge_columns, 188 | mode="a", 189 | ) 190 | 191 | # Get reference edges 192 | edge_list = pd.read_csv(f"{self.edgelist_folder}/{year}.csv") 193 | edges_df = pd.DataFrame( 194 | {"u": edge_list.out_node, "v": edge_list.in_node, "edge_type": "reference"}, 195 | columns=edge_columns, 196 | ) 197 | edges_df.to_csv( 198 | edges_csv_path, 199 | header=False, 200 | index=False, 201 | columns=edge_columns, 202 | mode="a", 203 | ) 204 | 205 | # add authority edges 206 | if self.regulations: 207 | edge_list = pd.read_csv(f"{self.authority_edgelist_folder}/{year}.csv") 208 | edges_df = pd.DataFrame( 209 | { 210 | "u": edge_list.out_node, 211 | "v": edge_list.in_node, 212 | "edge_type": "authority", 213 | }, 214 | columns=edge_columns, 215 | ) 216 | edges_df.to_csv( 217 | edges_csv_path, 218 | header=False, 219 | index=False, 220 | columns=edge_columns, 221 | mode="a", 222 | ) 223 | 224 | # Create and save seqitem graph 225 | G = load_graph_from_csv_files( 226 | self.destination, year, filter="exclude_subseqitems" 227 | ) 228 | 229 | nx.write_gpickle(G, f"{self.destination}/seqitems/{year}.gpickle.gz") 230 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_authority_edgelist.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy 5 | import pandas as pd 6 | from quantlaw.utils.beautiful_soup import create_soup 7 | from quantlaw.utils.files import ensure_exists 8 | from 
quantlaw.utils.pipeline import PipelineStep 9 | 10 | from statics import ( 11 | DE_REFERENCE_PARSED_PATH, 12 | DE_REG_AUTHORITY_EDGELIST_PATH, 13 | DE_REG_CROSSREFERENCE_LOOKUP_PATH, 14 | DE_REG_REFERENCE_PARSED_PATH, 15 | ) 16 | from utils.common import get_snapshot_law_list 17 | 18 | 19 | def get_filename(date): 20 | return f"{date}.csv" 21 | 22 | 23 | class DeAuthorityEdgelist(PipelineStep): 24 | def __init__(self, law_names_data, *args, **kwargs): 25 | self.law_names_data = law_names_data 26 | super().__init__(*args, **kwargs) 27 | 28 | def get_items(self, overwrite, snapshots) -> list: 29 | ensure_exists(DE_REG_AUTHORITY_EDGELIST_PATH) 30 | 31 | if not overwrite: 32 | existing_files = os.listdir(DE_REG_AUTHORITY_EDGELIST_PATH) 33 | snapshots = list( 34 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 35 | ) 36 | 37 | return snapshots 38 | 39 | def execute_item(self, item): 40 | files = get_snapshot_law_list(item, self.law_names_data) 41 | source_folder = DE_REG_CROSSREFERENCE_LOOKUP_PATH 42 | target_folder = DE_REG_AUTHORITY_EDGELIST_PATH 43 | key_df = ( 44 | pd.read_csv(f"{source_folder}/{item}.csv").dropna().set_index("citekey") 45 | ) 46 | law_citekeys_dict = { 47 | citekey.split("_")[0]: "_".join(row["key"].split("_")[:-1]) + "_000001" 48 | for citekey, row in key_df.iterrows() 49 | } 50 | 51 | df = None 52 | for file in files: 53 | edge_df = make_edge_list(file, key_df, law_citekeys_dict, regulations=True) 54 | df = edge_df if df is None else df.append(edge_df, ignore_index=True) 55 | df.to_csv(f"{target_folder}/{item}.csv", index=False) 56 | 57 | 58 | def make_edge_list(file, key_df, law_citekeys_dict, regulations): 59 | soup = create_soup( 60 | os.path.join( 61 | DE_REG_REFERENCE_PARSED_PATH if regulations else DE_REFERENCE_PARSED_PATH, 62 | file, 63 | ) 64 | ) 65 | edges = [] 66 | 67 | # FOR DEBUG 68 | problem_matches = set() 69 | problem_keys = set() 70 | 71 | for item in soup.find_all(["document", "seqitem"], attrs={"parsed": True}): 72 | item_parsed_ref_str = item.attrs["parsed"] 73 | if not item_parsed_ref_str or item_parsed_ref_str == "[]": 74 | continue 75 | 76 | node_out = item.get("key") 77 | refs = json.loads(item_parsed_ref_str) 78 | for ref in refs: 79 | # TODO multiple laws with the same bnormabk 80 | if len(ref) > 1: # Ref to seqitem at least 81 | try: 82 | key = "_".join(ref[:2]) 83 | matches = key_df.at[key, "key"] 84 | if type(matches) == numpy.ndarray: 85 | print(f"Multiple matches for {key}") 86 | matches = matches[0] 87 | if type(matches) is not str: 88 | problem_matches.add(tuple(matches)) 89 | node_in = matches if type(matches) == str else matches[0] 90 | edges.append((node_out, node_in)) 91 | except KeyError: 92 | problem_keys.add(key) 93 | else: # ref to document only 94 | node_in = law_citekeys_dict.get(ref[0]) 95 | if node_in: 96 | edges.append((node_out, node_in)) 97 | 98 | # FOR DEBUG 99 | # if len(problem_matches) > 0: 100 | # print(f"{file} Problem Matches:\n", sorted(list(problem_matches))) 101 | # if len(problem_keys) > 0: 102 | # print(f"{file} Problem Matches:\n", sorted(list(problem_keys))) 103 | return pd.DataFrame(edges, columns=["out_node", "in_node"]) 104 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_crossreference_edgelist.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy 5 | import pandas as pd 6 | from quantlaw.utils.beautiful_soup import create_soup 7 | from 
quantlaw.utils.files import ensure_exists 8 | 9 | from statics import ( 10 | DE_CROSSREFERENCE_EDGELIST_PATH, 11 | DE_CROSSREFERENCE_LOOKUP_PATH, 12 | DE_REFERENCE_PARSED_PATH, 13 | DE_REG_CROSSREFERENCE_EDGELIST_PATH, 14 | DE_REG_CROSSREFERENCE_LOOKUP_PATH, 15 | DE_REG_REFERENCE_PARSED_PATH, 16 | ) 17 | from utils.common import RegulationsPipelineStep, get_snapshot_law_list 18 | 19 | 20 | class DeCrossreferenceEdgelist(RegulationsPipelineStep): 21 | def __init__(self, law_names_data, *args, **kwargs): 22 | self.law_names_data = law_names_data 23 | super().__init__(*args, **kwargs) 24 | 25 | def get_items(self, overwrite, snapshots) -> list: 26 | target_folder = ( 27 | DE_REG_CROSSREFERENCE_EDGELIST_PATH 28 | if self.regulations 29 | else DE_CROSSREFERENCE_EDGELIST_PATH 30 | ) 31 | ensure_exists(target_folder) 32 | 33 | if not overwrite: 34 | existing_files = os.listdir(target_folder) 35 | snapshots = list( 36 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 37 | ) 38 | 39 | return snapshots 40 | 41 | def execute_item(self, item): 42 | files = get_snapshot_law_list(item, self.law_names_data) 43 | source_folder = ( 44 | DE_REG_CROSSREFERENCE_LOOKUP_PATH 45 | if self.regulations 46 | else DE_CROSSREFERENCE_LOOKUP_PATH 47 | ) 48 | target_folder = ( 49 | DE_REG_CROSSREFERENCE_EDGELIST_PATH 50 | if self.regulations 51 | else DE_CROSSREFERENCE_EDGELIST_PATH 52 | ) 53 | key_df = ( 54 | pd.read_csv(f"{source_folder}/{item}.csv").dropna().set_index("citekey") 55 | ) 56 | df = None 57 | for file in files: 58 | edge_df = make_edge_list(file, key_df, self.regulations) 59 | df = edge_df if df is None else df.append(edge_df, ignore_index=True) 60 | df.to_csv(f"{target_folder}/{item}.csv", index=False) 61 | 62 | 63 | def get_filename(date): 64 | return f"{date}.csv" 65 | 66 | 67 | def make_edge_list(file, key_df, regulations): 68 | soup = create_soup( 69 | os.path.join( 70 | DE_REG_REFERENCE_PARSED_PATH if regulations else DE_REFERENCE_PARSED_PATH, 71 | file, 72 | ) 73 | ) 74 | edges = [] 75 | 76 | # # FOR DEBUG 77 | # problem_matches = set() 78 | # problem_keys = set() 79 | 80 | for item in soup.find_all("seqitem"): 81 | references = item.find_all("reference") 82 | if references: 83 | node_out = item.get("key") 84 | for node in references: 85 | if node.lawname and node.lawname.get("type") in [ 86 | "dict", 87 | "sgb", 88 | "internal", 89 | ]: 90 | refs = json.loads(node.attrs["parsed"]) 91 | for ref in refs: 92 | try: 93 | key = "_".join(ref[:2]) 94 | matches = key_df.at[key, "key"] 95 | if type(matches) == numpy.ndarray: 96 | print(f"Multiple matches for {key}") 97 | matches = matches[0] 98 | # # FOR DEBUG 99 | # if type(matches) is not str: 100 | # problem_matches.add(tuple(matches)) 101 | node_in = matches if type(matches) == str else matches[0] 102 | edges.append((node_out, node_in)) 103 | assert len(ref) > 1 104 | except KeyError: 105 | pass 106 | # # FOR DEBUG 107 | # problem_keys.add(key) 108 | 109 | # FOR DEBUG 110 | # if len(problem_matches) > 0: 111 | # print(f"{file} Problem Matches:\n", sorted(list(problem_matches))) 112 | # if len(problem_keys) > 0: 113 | # print(f"{file} Problem Matches:\n", sorted(list(problem_keys))) 114 | return pd.DataFrame(edges, columns=["out_node", "in_node"]) 115 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_crossreference_lookup.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from quantlaw.utils.beautiful_soup import 
create_soup 3 | from quantlaw.utils.files import ensure_exists 4 | 5 | from statics import ( 6 | DE_CROSSREFERENCE_LOOKUP_PATH, 7 | DE_REFERENCE_PARSED_PATH, 8 | DE_REG_CROSSREFERENCE_LOOKUP_PATH, 9 | DE_REG_REFERENCE_PARSED_PATH, 10 | ) 11 | from utils.common import RegulationsPipelineStep, get_snapshot_law_list, load_law_names 12 | 13 | 14 | class DeCrossreferenceLookup(RegulationsPipelineStep): 15 | def get_items(self, snapshots) -> list: 16 | ensure_exists( 17 | DE_REG_CROSSREFERENCE_LOOKUP_PATH 18 | if self.regulations 19 | else DE_CROSSREFERENCE_LOOKUP_PATH 20 | ) 21 | files = [] 22 | law_names_data = load_law_names(self.regulations) 23 | for snapshot in snapshots: 24 | files.append((snapshot, get_snapshot_law_list(snapshot, law_names_data))) 25 | return files 26 | 27 | def execute_item(self, item): 28 | date, files = item 29 | data = [] 30 | source_folder = ( 31 | DE_REG_REFERENCE_PARSED_PATH 32 | if self.regulations 33 | else DE_REFERENCE_PARSED_PATH 34 | ) 35 | target_folder = ( 36 | DE_REG_CROSSREFERENCE_LOOKUP_PATH 37 | if self.regulations 38 | else DE_CROSSREFERENCE_LOOKUP_PATH 39 | ) 40 | for file in files: 41 | soup = create_soup(f"{source_folder}/{file}") 42 | for tag in soup.find_all(citekey=True): 43 | data.append([tag.attrs["key"], tag.attrs["citekey"]]) 44 | df = pd.DataFrame(data, columns=["key", "citekey"]) 45 | destination_file = f"{target_folder}/{date}.csv" 46 | df.to_csv(destination_file, index=False) 47 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_law_names.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pandas as pd 4 | from quantlaw.de_extract.stemming import stem_law_name 5 | from quantlaw.utils.beautiful_soup import create_soup 6 | from quantlaw.utils.files import list_dir 7 | 8 | from statics import ( 9 | DE_LAW_NAMES_COMPILED_PATH, 10 | DE_LAW_NAMES_PATH, 11 | DE_REG_LAW_NAMES_COMPILED_PATH, 12 | DE_REG_LAW_NAMES_PATH, 13 | DE_REG_XML_PATH, 14 | DE_XML_PATH, 15 | ) 16 | from utils.common import RegulationsPipelineStep, load_law_names 17 | 18 | 19 | class DeLawNamesStep(RegulationsPipelineStep): 20 | def get_items(self) -> list: 21 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 22 | files = list_dir(src, ".xml") 23 | return files 24 | 25 | def execute_item(self, item): 26 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 27 | soup = create_soup(f"{src}/{item}") 28 | document = soup.find("document", recursive=False) 29 | result = set() 30 | citekey = document.attrs["key"].split("_")[1] 31 | 32 | if "heading" in document.attrs: 33 | law_name = stem_law_name(document.attrs["heading"]) 34 | result.add((law_name, citekey, item)) 35 | 36 | if "heading_short" in document.attrs: 37 | law_name = stem_law_name(document.attrs["heading_short"]) 38 | result.add((law_name, citekey, item)) 39 | 40 | if "abbr_1" in document.attrs: 41 | law_name = stem_law_name(document.attrs["abbr_1"]) 42 | result.add((law_name, citekey, item)) 43 | 44 | if "abbr_2" in document.attrs: 45 | law_name = stem_law_name(document.attrs["abbr_2"]) 46 | result.add((law_name, citekey, item)) 47 | return result 48 | 49 | def finish_execution(self, names_per_file): 50 | dest_compiled = ( 51 | DE_REG_LAW_NAMES_COMPILED_PATH 52 | if self.regulations 53 | else DE_LAW_NAMES_COMPILED_PATH 54 | ) 55 | dest_csv = DE_REG_LAW_NAMES_PATH if self.regulations else DE_LAW_NAMES_PATH 56 | 57 | result = [] 58 | for names_of_file in names_per_file: 59 | 
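# NOTE (editorial comment, not part of the original file): names_per_file holds
# the per-file results of execute_item above, i.e. one set of
# (stemmed law name, citekey, filename) tuples per XML file; the loop flattens
# them into a single list before they are written to CSV and recompiled by date.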
result.extend(names_of_file) 60 | 61 | df = pd.DataFrame(result, columns=["citename", "citekey", "filename"]) 62 | df.to_csv(dest_csv, index=False) 63 | 64 | dated_law_names = compile_law_names(self.regulations) 65 | with open(dest_compiled, "wb") as f: 66 | pickle.dump(dated_law_names, f) 67 | 68 | 69 | def compile_law_names(regulations): 70 | data = load_law_names(regulations) 71 | dates = sorted({r["start"] for r in data}) 72 | 73 | dated_law_names = {} 74 | 75 | date_len = len(dates) 76 | for i, date in enumerate(dates): 77 | if i % 100 == 0: 78 | print(f"\r{i/date_len}", end="") 79 | law_names_list = [d for d in data if d["start"] <= date and d["end"] >= date] 80 | law_names = {} 81 | for row in law_names_list: 82 | law_names[row["citename"]] = row["citekey"] 83 | dated_law_names[date] = law_names 84 | print() 85 | 86 | return dated_law_names 87 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_prepare_input.py: -------------------------------------------------------------------------------- 1 | # Roughly validate the input files 2 | import os 3 | import shutil 4 | 5 | from quantlaw.utils.files import ensure_exists 6 | 7 | from statics import ( 8 | DE_ORIGINAL_PATH, 9 | DE_REG_ORIGINAL_PATH, 10 | JURIS_EXPORT_GESETZE_LIST_PATH, 11 | JURIS_EXPORT_PATH, 12 | JURIS_EXPORT_RVO_LIST_PATH, 13 | ) 14 | 15 | 16 | def copy_selected_doknrs(selection_list, target_dir): 17 | ensure_exists(target_dir) 18 | for doknr in selection_list: 19 | version_filenames = [ 20 | f for f in os.listdir(f"{JURIS_EXPORT_PATH}/{doknr}") if f.endswith(".xml") 21 | ] 22 | for version_filename in version_filenames: 23 | assert len(version_filename.split("_")) == 3 24 | shutil.copy( 25 | f"{JURIS_EXPORT_PATH}/{doknr}/{version_filename}", 26 | f"{target_dir}/{version_filename}", 27 | ) 28 | 29 | 30 | def de_prepare_input(regulations): 31 | 32 | dest = DE_REG_ORIGINAL_PATH if regulations else DE_ORIGINAL_PATH 33 | 34 | with open(JURIS_EXPORT_GESETZE_LIST_PATH) as f: 35 | gesetze_dirs = f.read().strip().split("\n") 36 | copy_selected_doknrs(gesetze_dirs, dest) 37 | 38 | if regulations: 39 | with open(JURIS_EXPORT_RVO_LIST_PATH) as f: 40 | rvo_dirs = f.read().strip().split("\n") 41 | copy_selected_doknrs(rvo_dirs, DE_REG_ORIGINAL_PATH) 42 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_reference_areas.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | 4 | import bs4 5 | from quantlaw.de_extract.statutes_abstract import StatutesMatchWithMainArea 6 | from quantlaw.de_extract.statutes_areas import StatutesExtractor 7 | from quantlaw.utils.beautiful_soup import create_soup 8 | from quantlaw.utils.files import ensure_exists, list_dir 9 | 10 | from statics import ( 11 | DE_HELPERS_PATH, 12 | DE_REFERENCE_AREAS_LOG_PATH, 13 | DE_REFERENCE_AREAS_PATH, 14 | DE_REG_HELPERS_PATH, 15 | DE_REG_REFERENCE_AREAS_LOG_PATH, 16 | DE_REG_REFERENCE_AREAS_PATH, 17 | DE_REG_XML_PATH, 18 | DE_XML_PATH, 19 | ) 20 | from utils.common import RegulationsPipelineStep, get_stemmed_law_names_for_filename 21 | 22 | 23 | class DeReferenceAreasStep(RegulationsPipelineStep): 24 | max_number_of_processes = 2 25 | 26 | def __init__(self, law_names, *args, **kwargs): 27 | self.law_names = law_names 28 | super().__init__(*args, **kwargs) 29 | 30 | def get_items(self, overwrite) -> list: 31 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 32 | dest = ( 33 | 
DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 34 | ) 35 | 36 | ensure_exists(dest) 37 | files = list_dir(src, ".xml") 38 | 39 | if not overwrite: 40 | existing_files = os.listdir(dest) 41 | files = list(filter(lambda f: f not in existing_files, files)) 42 | return files 43 | 44 | def execute_item(self, item): 45 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 46 | dest = ( 47 | DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 48 | ) 49 | 50 | laws_lookup = get_stemmed_law_names_for_filename(item, self.law_names) 51 | extractor = StatutesExtractor(laws_lookup) 52 | result = [] 53 | soup = create_soup(f"{src}/{item}") 54 | para, art, misc = analyze_type_of_headings(soup) 55 | 56 | result.extend(find_references_in_soup(soup, extractor, para, art)) 57 | 58 | # Find references without preceding article or § (currently not implemented) 59 | # long_law_regex_pattern = law_keys_to_regex(laws_lookup_keys, 5) 60 | # short_law_regex_pattern = law_keys_to_regex(laws_lookup_keys, 3, 4) 61 | # for section in soup.find_all("text"): 62 | # find_law_references_in_section( 63 | # section, soup, long_law_regex_pattern, stem_law_name 64 | # ) 65 | # find_law_references_in_section( 66 | # section, soup, short_law_regex_pattern, clean_name 67 | # ) 68 | 69 | save_soup_with_style(soup, f"{dest}/{item}") 70 | 71 | return result 72 | 73 | def finish_execution(self, results): 74 | logs = list(itertools.chain.from_iterable(results)) 75 | ensure_exists(DE_REG_HELPERS_PATH if self.regulations else DE_HELPERS_PATH) 76 | with open( 77 | DE_REG_REFERENCE_AREAS_LOG_PATH 78 | if self.regulations 79 | else DE_REFERENCE_AREAS_LOG_PATH, 80 | mode="w", 81 | ) as f: 82 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 83 | 84 | 85 | ######################################## 86 | # Functions general and normal citations 87 | ######################################## 88 | 89 | 90 | def save_soup_with_style(soup, path): 91 | output_lines = str(soup).replace("\n\n", "\n").split("\n") 92 | output_lines.insert(1, '') 93 | output = "\n".join(output_lines) 94 | 95 | with open(path, "w") as f: 96 | f.write(output) 97 | 98 | 99 | def analyze_type_of_headings(soup): 100 | para = 0 101 | art = 0 102 | misc = 0 103 | for tag in soup.find_all("seqitem"): 104 | if "heading" not in tag.attrs: 105 | misc += 1 106 | elif tag.attrs["heading"].replace("\n", "").startswith("§"): 107 | para += 1 108 | elif tag.attrs["heading"].replace("\n", "").lower().startswith("art"): 109 | art += 1 110 | else: 111 | misc += 1 112 | return para, art, misc 113 | 114 | 115 | def add_tag(string, pos, end, tag): 116 | tag.string = string[pos:end] 117 | return [ 118 | bs4.element.NavigableString(string[:pos]), 119 | tag, 120 | bs4.element.NavigableString(string[end:]), 121 | ] 122 | 123 | 124 | def split_reference(string, len_main, len_suffix, soup): 125 | main_str = string[:len_main] 126 | suffix_str = string[len_main : len_main + len_suffix] 127 | law_str = string[len_main + len_suffix :] 128 | 129 | result = [soup.new_tag("main"), soup.new_tag("suffix"), soup.new_tag("lawname")] 130 | result[0].append(main_str) 131 | result[1].append(suffix_str) 132 | result[2].append(law_str) 133 | 134 | return result 135 | 136 | 137 | def handle_reference_match(match: StatutesMatchWithMainArea, section, soup, para, art): 138 | # Set internal references to ignore if seqitem unit (Art|§) does not match between 139 | # reference and target law 140 | if match.law_match_type == "internal": 141 | if 
(section.contents[-1][match.start :].startswith("§") and para == 0) or ( 142 | section.contents[-1][match.start :].lower().startswith("art") and art == 0 143 | ): 144 | match.law_match_type = "ignore" 145 | 146 | ref_tag = soup.new_tag("reference", pattern="inline") 147 | section.contents[-1:] = add_tag( 148 | section.contents[-1], 149 | match.start, 150 | match.end + match.suffix_len + match.law_len, 151 | ref_tag, 152 | ) 153 | ref_tag.contents = split_reference( 154 | ref_tag.string, match.end - match.start, match.suffix_len, soup 155 | ) 156 | ref_tag.contents[-1]["type"] = match.law_match_type 157 | 158 | 159 | def find_references_in_section(section, soup, extractor: StatutesExtractor, para, art): 160 | logs = [] 161 | match = extractor.search(section.contents[-1]) # Search first match 162 | while match: 163 | if match.has_main_area(): 164 | handle_reference_match(match, section, soup, para, art) 165 | match = extractor.search( 166 | section.contents[-1], pos=(0 if match.has_main_area() else match.end) 167 | ) 168 | return logs 169 | 170 | 171 | def find_references_in_soup(soup, extractor, para, art, text_tag_name="text"): 172 | logs = [] 173 | for text in soup.find_all(text_tag_name): 174 | if text.is_empty_element: 175 | continue 176 | assert text.string 177 | logs.extend(find_references_in_section(text, soup, extractor, para, art)) 178 | return logs 179 | 180 | 181 | ######################################################## 182 | # Functions: references without preceding 'article' or § 183 | ######################################################## 184 | # 185 | # 186 | # 187 | # def pos_in_orig_string(i, stemmed, orig): 188 | # prefix = stemmed[:i] 189 | # stemmed_tokens = regex.findall(r"[\w']+|[\W']+", prefix) 190 | # orig_tokens = regex.findall(r"[\w']+|[\W']+", orig) 191 | # # return (len(''.join(orig_tokens[:len(stemmed_tokens)-1])) + 192 | # # len(stemmed_tokens[-1]) # Precise position 193 | # return len("".join(orig_tokens[: len(stemmed_tokens)])) # Round to next boundary 194 | # 195 | # 196 | # def law_keys_to_regex(keys, min_length, max_length=-1): 197 | # pattern = "" 198 | # for key in keys: 199 | # if len(key) >= min_length and (len(key) <= max_length or max_length == -1): 200 | # pattern += regex.escape(key) + r"|" 201 | # pattern = pattern[:-1] 202 | # full_pattern = r"\b(?>" + pattern + r")\b" 203 | # return regex.compile(full_pattern, flags=regex.IGNORECASE) 204 | # 205 | # 206 | # def find_law_references_in_section(section, soup, law_regex_pattern, sanitizer): 207 | # for item in list(section.contents): 208 | # i_in_section = section.contents.index(item) 209 | # if type(item) is not bs4.element.NavigableString: 210 | # continue 211 | # test_string = sanitizer(item.string) 212 | # matches = law_regex_pattern.finditer(test_string) 213 | # for match in reversed(list(matches)): 214 | # orig_start = pos_in_orig_string(match.start(), test_string, item.string) 215 | # orig_end = pos_in_orig_string(match.end(), test_string, item.string) 216 | # 217 | # ref_tag = soup.new_tag("reference", pattern="generic") 218 | # 219 | # section.contents[i_in_section : i_in_section + 1] = add_tag( 220 | # section.contents[i_in_section], orig_start, orig_end, ref_tag 221 | # ) 222 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_reference_parse.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import os 4 | 5 | from quantlaw.de_extract.statutes_parse 
import StatutesParser, StringCaseException 6 | from quantlaw.de_extract.stemming import stem_law_name 7 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 8 | from quantlaw.utils.files import ensure_exists, list_dir 9 | 10 | from statics import ( 11 | DE_HELPERS_PATH, 12 | DE_REFERENCE_AREAS_PATH, 13 | DE_REFERENCE_PARSED_LOG_PATH, 14 | DE_REFERENCE_PARSED_PATH, 15 | DE_REG_HELPERS_PATH, 16 | DE_REG_REFERENCE_AREAS_PATH, 17 | DE_REG_REFERENCE_PARSED_LOG_PATH, 18 | DE_REG_REFERENCE_PARSED_PATH, 19 | ) 20 | from statutes_pipeline_steps.de_reference_parse_vso_list import ( 21 | identify_reference_in_juris_vso_list, 22 | ) 23 | from utils.common import ( 24 | RegulationsPipelineStep, 25 | copy_xml_schema_to_data_folder, 26 | get_stemmed_law_names_for_filename, 27 | ) 28 | 29 | 30 | class DeReferenceParseStep(RegulationsPipelineStep): 31 | max_number_of_processes = 2 32 | 33 | def __init__(self, law_names, *args, **kwargs): 34 | self.law_names = law_names 35 | super().__init__(*args, **kwargs) 36 | 37 | def get_items(self, overwrite) -> list: 38 | src = ( 39 | DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 40 | ) 41 | dest = ( 42 | DE_REG_REFERENCE_PARSED_PATH 43 | if self.regulations 44 | else DE_REFERENCE_PARSED_PATH 45 | ) 46 | 47 | ensure_exists(dest) 48 | files = list_dir(src, ".xml") 49 | 50 | ensure_exists(dest) 51 | files = list_dir(src, ".xml") 52 | 53 | if not overwrite: 54 | existing_files = os.listdir(dest) 55 | files = list(filter(lambda f: f not in existing_files, files)) 56 | 57 | copy_xml_schema_to_data_folder() 58 | 59 | return files 60 | 61 | def execute_item(self, item): 62 | src = ( 63 | DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 64 | ) 65 | dest = ( 66 | DE_REG_REFERENCE_PARSED_PATH 67 | if self.regulations 68 | else DE_REFERENCE_PARSED_PATH 69 | ) 70 | 71 | laws_lookup = get_stemmed_law_names_for_filename(item, self.law_names) 72 | parser = StatutesParser(laws_lookup) 73 | 74 | logs = list() 75 | 76 | # for debug 77 | logs.append(f"Start file - {item}") 78 | 79 | soup = create_soup(f"{src}/{item}") 80 | parse_reference_content_in_soup(soup, parser, debug_context=item) 81 | current_lawid = soup.document.attrs["key"].split("_")[1] 82 | identify_reference_law_name_in_soup(soup, parser, current_lawid) 83 | identify_lawreference_law_name_in_soup(soup, laws_lookup) 84 | 85 | identify_reference_in_juris_vso_list(soup, parser) 86 | 87 | save_soup(soup, f"{dest}/{item}") 88 | return logs 89 | 90 | def finish_execution(self, results): 91 | logs = list(itertools.chain.from_iterable(results)) 92 | ensure_exists(DE_REG_HELPERS_PATH if self.regulations else DE_HELPERS_PATH) 93 | with open( 94 | DE_REG_REFERENCE_PARSED_LOG_PATH 95 | if self.regulations 96 | else DE_REFERENCE_PARSED_LOG_PATH, 97 | mode="w", 98 | ) as f: 99 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 100 | 101 | 102 | def parse_reference_content(reference, parser): 103 | citation = reference.main.get_text() 104 | reference_paths = parser.parse_main(citation) 105 | 106 | reference["parsed_verbose"] = json.dumps(reference_paths, ensure_ascii=False) 107 | reference_paths_simple = [ 108 | [component[1] for component in path] for path in reference_paths 109 | ] 110 | reference["parsed"] = json.dumps(reference_paths_simple, ensure_ascii=False) 111 | 112 | 113 | def parse_reference_content_in_soup(soup, parser, debug_context=None): 114 | for reference in soup.find_all("reference", {"pattern": "inline"}): 115 | if reference.main: 116 | 
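# NOTE (editorial comment, not part of the original file): parsing the citation
# text may raise a StringCaseException for unexpectedly cased tokens; such
# references are printed together with the file they occur in (debug_context)
# and simply keep no parsed attributes.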
try: 117 | parse_reference_content(reference, parser) 118 | except StringCaseException as error: 119 | print(error, "context", debug_context) 120 | 121 | 122 | def identify_reference_law_name_in_soup(soup, parser, current_lawid, skip_errors=False): 123 | for reference in soup.find_all("reference", {"pattern": "inline"}): 124 | 125 | lawid = parser.parse_law( 126 | reference.lawname.string, reference.lawname["type"], current_lawid 127 | ) 128 | 129 | try: 130 | ref_parts = json.loads(reference["parsed_verbose"]) 131 | 132 | if reference.lawname.attrs["type"] in ["internal", "dict", "sgb"]: 133 | for ref_part in ref_parts: 134 | if not lawid: 135 | print(reference) 136 | ref_part.insert(0, ["Gesetz", lawid]) 137 | reference["parsed_verbose"] = json.dumps(ref_parts, ensure_ascii=False) 138 | 139 | ref_parts = json.loads(reference["parsed"]) 140 | if reference.lawname.attrs["type"] in ["internal", "dict", "sgb"]: 141 | for ref_part in ref_parts: 142 | assert lawid 143 | ref_part.insert(0, lawid) 144 | reference["parsed"] = json.dumps(ref_parts, ensure_ascii=False) 145 | except KeyError: 146 | if skip_errors: 147 | print(reference) 148 | else: 149 | raise 150 | 151 | 152 | def identify_lawreference_law_name_in_soup(soup, laws_lookup): 153 | for reference in soup.find_all("reference", {"pattern": "generic"}): 154 | reference["parsed"] = [[laws_lookup[stem_law_name(reference.string)]]] 155 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_reference_parse_vso_list.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import regex 4 | 5 | # fmt: off 6 | from quantlaw.de_extract.statutes_parse import StatutesParser 7 | from quantlaw.de_extract.stemming import stem_law_name 8 | 9 | reference_trigger_pattern = regex.compile( 10 | r'(' 11 | r'§{1,2}|' 12 | r'\bArt\b\.?|' 13 | r'Artikels?n?' 14 | r')\s*' 15 | ) 16 | # fmt: on 17 | 18 | 19 | def identify_reference_in_juris_vso_list(soup, parser: StatutesParser): 20 | 21 | vso_tags = soup.find_all(["document", "seqitem"], attrs={"verweise": True}) 22 | for vso_tag in vso_tags: 23 | parsed_vso_refs = [] 24 | parsed_vso_refs_simple = [] 25 | verweise = ( 26 | [] 27 | if vso_tag.attrs["verweise"] == "[]" 28 | else json.loads(vso_tag.attrs["verweise"]) 29 | ) 30 | for verweis in verweise: 31 | if not verweis["typ"] in [ 32 | "Ermächtigung", 33 | "Rechtsgrundlage", 34 | "Durchführungsvorschrift", 35 | ]: 36 | # 'Vertragsgesetz', 'Sonderregelung', 'GLIEDERUNG', 'SAMMELVERWEISUNG', 37 | # 'Einführungsvorschrift', 'InnerstaatlDurchfVorschr' will be ignored 38 | continue 39 | if not verweis["normabk"]: 40 | continue 41 | lawname_stem = stem_law_name(verweis["normabk"]) 42 | match = parser.match_law_name(lawname_stem) 43 | print(match) 44 | # if match: 45 | # lawid = parser.laws_lookup[match] 46 | # parsed_vso_ref = [[["Gesetz", lawid]]] 47 | # parsed_vso_ref_simple = [[lawid]] 48 | # 49 | # # Append ref. 
details if present in raw data 50 | # enbez = verweis["enbez"] 51 | # if enbez and reference_trigger_pattern.match(enbez): 52 | # 53 | # try: 54 | # ( 55 | # reference_paths, 56 | # reference_paths_simple, 57 | # ) = parse_reference_string(enbez, debug_context=None) 58 | # 59 | # parsed_vso_ref = [ 60 | # parsed_vso_ref[0] + r for r in reference_paths 61 | # ] 62 | # parsed_vso_ref_simple = [ 63 | # parsed_vso_ref_simple[0] + r 64 | # for r in reference_paths_simple 65 | # ] 66 | # 67 | # except StringCaseException as error: 68 | # print(error, "context", enbez) 69 | # 70 | # parsed_vso_refs.extend(parsed_vso_ref) 71 | # parsed_vso_refs_simple.extend(parsed_vso_ref_simple) 72 | 73 | # Remove duplicates 74 | parsed_vso_refs = remove_duplicate_references(parsed_vso_refs) 75 | parsed_vso_refs_simple = remove_duplicate_references(parsed_vso_refs_simple) 76 | 77 | vso_tag.attrs["parsed_verbose"] = json.dumps( 78 | parsed_vso_refs, ensure_ascii=False 79 | ) 80 | vso_tag.attrs["parsed"] = json.dumps(parsed_vso_refs_simple, ensure_ascii=False) 81 | 82 | 83 | def remove_duplicate_references(references): 84 | res = [] 85 | res_str = [] 86 | for elem in references: 87 | elem_str = str(elem) 88 | if elem_str not in res_str: 89 | res.append(elem) 90 | res_str.append(elem_str) 91 | 92 | return res 93 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/hierarchy_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import networkx as nx 5 | from lxml import etree 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | from quantlaw.utils.pipeline import PipelineStep 8 | 9 | 10 | class HierarchyGraphStep(PipelineStep): 11 | def __init__(self, source, destination, add_subseqitems, *args, **kwargs): 12 | self.source = source 13 | self.destination = destination 14 | self.add_subseqitems = add_subseqitems 15 | super().__init__(*args, **kwargs) 16 | 17 | def get_items(self, overwrite) -> list: 18 | ensure_exists(self.destination) 19 | files = list_dir(self.source, ".xml") 20 | 21 | if not overwrite: 22 | existing_files = list_dir(self.destination, ".gpickle") 23 | files = list( 24 | filter(lambda f: get_gpickle_filename(f) not in existing_files, files) 25 | ) 26 | 27 | return files 28 | 29 | def execute_item(self, item): 30 | G = build_graph(f"{self.source}/{item}", add_subseqitems=self.add_subseqitems) 31 | 32 | destination_path = f"{self.destination}/{get_gpickle_filename(item)}" 33 | nx.write_gpickle(G, destination_path) 34 | 35 | 36 | ########### 37 | # Functions 38 | ########### 39 | 40 | 41 | def get_gpickle_filename(filename): 42 | return f"{os.path.splitext(filename)[0]}.gpickle" 43 | 44 | 45 | def add_juris_attrs(item, node_attrs): 46 | if item.attrib.get("normgeber"): 47 | node_attrs["legislators"] = item.attrib["normgeber"] 48 | if item.attrib.get("mitwirkende"): 49 | node_attrs["contributors"] = item.attrib["mitwirkende"] 50 | if item.attrib.get("sachgebiete"): 51 | node_attrs["subject_areas"] = item.attrib["sachgebiete"] 52 | 53 | 54 | def nest_items(G, items, document_type): 55 | """ 56 | Convert xml soup to graph tree using networkx 57 | """ 58 | for item in items: 59 | if item.tag != "document": 60 | node_attrs = dict( 61 | key=item.attrib["key"], 62 | citekey=item.attrib.get("citekey", ""), 63 | heading=item.attrib.get("heading", ""), 64 | parent_key=item.getparent().attrib["key"], 65 | level=int(item.attrib["level"]), 66 | type=item.tag, 67 | ) 68 | if 
document_type: 69 | node_attrs["document_type"] = document_type 70 | add_juris_attrs(item, node_attrs) 71 | 72 | G.add_node(item.attrib["key"], **node_attrs) 73 | G.add_edge(item.getparent().attrib["key"], item.attrib["key"]) 74 | 75 | else: # handle root node 76 | 77 | node_attrs = dict( 78 | key=item.attrib["key"], 79 | citekey=item.attrib.get("citekey", ""), 80 | heading=item.attrib.get("heading", ""), 81 | parent_key="", 82 | level=0, 83 | type=item.tag, 84 | **(dict(document_type=document_type) if document_type else {}), 85 | ) 86 | if "abbr_1" in item.attrib: 87 | node_attrs["abbr_1"] = item.attrib["abbr_1"] 88 | if "abbr_2" in item.attrib: 89 | node_attrs["abbr_2"] = item.attrib["abbr_2"] 90 | add_juris_attrs(item, node_attrs) 91 | 92 | G.add_node(item.attrib["key"], **node_attrs) 93 | G.graph["name"] = item.attrib.get("heading", "") 94 | 95 | return G 96 | 97 | 98 | def count_characters(text, whites=False): 99 | """ 100 | Get character count of a text 101 | 102 | Args: 103 | whites: If True, whitespace characters are included in the count 104 | """ 105 | if whites: 106 | return len(text) 107 | else: 108 | return len(re.sub(r"\s", "", text)) 109 | 110 | 111 | def count_tokens(text, unique=False): 112 | """ 113 | Get token count of given text. Tokens are delimited by whitespaces. 114 | Args: 115 | unique: If True, only unique tokens are counted. 116 | """ 117 | if not unique: 118 | return len(text.split()) 119 | else: 120 | return len(set(text.split())) 121 | 122 | 123 | def build_graph(filename, add_subseqitems=False): 124 | """ 125 | Builds a hierarchy graph from an XML file. 126 | """ 127 | 128 | # Read input file 129 | tree = etree.parse(filename) 130 | 131 | document_type = ( 132 | tree.xpath("/document")[0].attrib.get("document_type", None) 133 | if tree.xpath("/document") 134 | else None 135 | ) 136 | 137 | # Create target graph 138 | G = nx.DiGraph() 139 | 140 | xpath = ( 141 | "//document | //item | //seqitem | //subseqitem" 142 | if add_subseqitems 143 | else "//document | //item | //seqitem" 144 | ) 145 | 146 | # Create a tree of the elements in the target graph 147 | G = nest_items(G, items=tree.xpath(xpath), document_type=document_type) 148 | 149 | # Add attributes regarding the contained text to the target graph 150 | for item in tree.xpath(xpath): 151 | text = " ".join(item.itertext()) 152 | G.nodes[item.attrib["key"]]["chars_n"] = count_characters(text, whites=True) 153 | G.nodes[item.attrib["key"]]["chars_nowhites"] = count_characters( 154 | text, whites=False 155 | ) 156 | G.nodes[item.attrib["key"]]["tokens_n"] = count_tokens(text, unique=False) 157 | G.nodes[item.attrib["key"]]["tokens_unique"] = count_tokens(text, unique=True) 158 | 159 | items_with_text = {elem.getparent() for elem in tree.xpath("//text")} 160 | for item in items_with_text: 161 | all_elems = item.getchildren() 162 | text_elems = [e for e in all_elems if e.tag == "text"] 163 | if len(all_elems) > 1 and text_elems: 164 | texts_tokens_n = [] 165 | texts_chars_n = [] 166 | for elem in text_elems: 167 | text = " ".join(elem.itertext()) 168 | texts_tokens_n.append(str(count_tokens(text, unique=False))) 169 | texts_chars_n.append(str(count_characters(text, whites=False))) 170 | G.nodes[item.attrib["key"]]["texts_tokens_n"] = ",".join(texts_tokens_n) 171 | G.nodes[item.attrib["key"]]["texts_chars_n"] = ",".join(texts_chars_n) 172 | 173 | return G 174 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/snapshot_mapping_edgelist.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | from collections import Counter, deque 5 | from multiprocessing import Pool 6 | 7 | import networkx as nx 8 | import textdistance 9 | import tqdm 10 | from quantlaw.utils.beautiful_soup import create_soup 11 | from quantlaw.utils.files import ensure_exists, list_dir 12 | from quantlaw.utils.networkx import get_leaves 13 | from quantlaw.utils.pipeline import PipelineStep 14 | from regex import regex 15 | 16 | from utils.common import get_snapshot_law_list, invert_dict_mapping_unique 17 | from utils.string_list_contains import StringContainsAlign 18 | 19 | 20 | class SnapshotMappingEdgelistStep(PipelineStep): 21 | max_number_of_processes = 1 22 | 23 | def __init__( 24 | self, 25 | source, 26 | destination, 27 | interval, 28 | dataset, 29 | min_text_length=50, 30 | radius=5, 31 | distance_threshold=0.9, 32 | *args, 33 | **kwargs, 34 | ): 35 | self.source = source 36 | self.destination = destination 37 | self.interval = interval 38 | self.dataset = dataset 39 | self.min_text_length = min_text_length 40 | self.radius = radius 41 | self.distance_threshold = distance_threshold 42 | super().__init__(*args, **kwargs) 43 | 44 | def get_items(self, overwrite, snapshots) -> list: 45 | ensure_exists(self.destination) 46 | items = sorted(list_dir(self.source, ".pickle")) 47 | items = [i[: -len(".pickle")] for i in items] 48 | 49 | # Create mappings to draw the edges 50 | mappings = [ 51 | (file1, file2) 52 | for file1, file2 in zip(items[: -self.interval], items[self.interval :]) 53 | ] 54 | 55 | if snapshots: 56 | mappings = list(filter(lambda f: f[0] in snapshots, mappings)) 57 | 58 | if not overwrite: 59 | existing_files = list_dir(self.destination, ".json") 60 | mappings = list( 61 | filter(lambda x: mapping_filename(x) not in existing_files, mappings) 62 | ) 63 | 64 | return mappings 65 | 66 | def execute_item(self, item): 67 | filename1, filename2 = item 68 | 69 | data1 = self.load_pickle(filename1) 70 | data2 = self.load_pickle(filename2) 71 | 72 | # STEP 1: perfect matches unique when considering text 73 | new_mappings = map_unique_texts( 74 | data1, data2, min_text_length=self.min_text_length 75 | ) 76 | remaining_keys1, remaining_keys2 = get_remaining( 77 | data1["keys"], data2["keys"], new_mappings, printing=f"{item}/Step 1" 78 | ) 79 | 80 | # STEP 2: perfect matches unique when considering text _and_ citekey 81 | new_mappings_current_step = map_same_citekey_same_text( 82 | data1, data2, remaining_keys1, remaining_keys2 83 | ) 84 | new_mappings = {**new_mappings_current_step, **new_mappings} 85 | del new_mappings_current_step 86 | remaining_keys1, remaining_keys2 = get_remaining( 87 | data1["keys"], data2["keys"], new_mappings, printing=f"{item}/Step 2" 88 | ) 89 | 90 | # STEP 3: text appended/prepended/removed 91 | new_mappings_current_step = map_text_containment( 92 | data1, data2, remaining_keys1, remaining_keys2 93 | ) 94 | new_mappings = {**new_mappings_current_step, **new_mappings} 95 | del new_mappings_current_step 96 | remaining_keys1, remaining_keys2 = get_remaining( 97 | data1["keys"], data2["keys"], new_mappings, printing=f"{item}/Step 3" 98 | ) 99 | 100 | # STEP 4: neighborhood matching 101 | data_keys1 = data1["keys"] 102 | data_keys2 = data2["keys"] 103 | data_texts1 = data1["texts"] 104 | data_texts2 = data2["texts"] 105 | del data1 106 | del data2 107 | 108 | common_neighbor_kwargs = dict( 109 | new_mappings=new_mappings, 110 | 
data_keys1=data_keys1, 111 | data_keys2=data_keys2, 112 | data_texts1=data_texts1, 113 | data_texts2=data_texts2, 114 | remaining_keys1=remaining_keys1, 115 | remaining_keys2=remaining_keys2, 116 | radius=self.radius, 117 | distance_threshold=self.distance_threshold, 118 | ) 119 | 120 | text_distance_cache = map_similar_text_common_neighbors( 121 | **common_neighbor_kwargs, 122 | printing=str(item), 123 | dry_run=True, 124 | ) 125 | text_distance_cache = update_textdistance_cache(text_distance_cache) 126 | map_similar_text_common_neighbors( 127 | **common_neighbor_kwargs, 128 | printing=str(item), 129 | text_distance_cache=text_distance_cache, 130 | ) 131 | 132 | dest_path = f"{self.destination}/{mapping_filename(item)}" 133 | with open(dest_path, "w") as f: 134 | json.dump(new_mappings, f) 135 | 136 | # only called to print stats 137 | get_remaining(data_keys1, data_keys2, new_mappings, printing=f"{item}/DONE") 138 | 139 | def load_pickle(self, snapshot): 140 | with open(os.path.join(self.source, snapshot + ".pickle"), "rb") as f: 141 | raw_data = pickle.load(f) 142 | return raw_data 143 | 144 | 145 | def mapping_filename(mapping): 146 | """ 147 | returns the filename mappings are stored in 148 | """ 149 | filename1, filename2 = mapping 150 | result = f"{filename1}_{filename2}.json" 151 | return result 152 | 153 | 154 | def load_crossref_graph(filename, source): 155 | graph_path = f"{source}/{filename}" 156 | G = nx.read_gpickle(graph_path) 157 | return G 158 | 159 | 160 | def get_remaining(data_keys1, data_keys2, new_mappings, asserting=True, printing=True): 161 | """ 162 | Prints stats and returns keys of both snapshots to be matched 163 | """ 164 | remaining_keys1 = set(data_keys1) - set(new_mappings.keys()) 165 | remaining_keys2 = set(data_keys2) - set(new_mappings.values()) 166 | if asserting: 167 | assert len(set(new_mappings.keys())) == len(set(new_mappings.values())) 168 | if printing: 169 | print( 170 | f"\n{printing}; " 171 | f"Progress {len(new_mappings)/min(len(data_keys1), len(data_keys2))}; " 172 | f"Remaining keys: {len(remaining_keys1)} {len(remaining_keys2)}; " 173 | ) 174 | return remaining_keys1, remaining_keys2 175 | 176 | 177 | def get_leaf_texts_to_compare( 178 | graph_filename, G, source_text, source_text_reg, law_names_data, dataset 179 | ): 180 | """ 181 | get text for leaves of a hierarchy graph. Can be seqitem or supseqitem graph. 182 | Leaves are only seqitems or supseqitems. 
183 | """ 184 | leaf_keys = get_leaves(G) 185 | 186 | snapshot = graph_filename[: -len(".gpickle.gz")] 187 | 188 | if dataset == "us": 189 | files = [ 190 | os.path.join(source_text, x) 191 | for x in list_dir(source_text, ".xml") 192 | if x.split(".")[0].split("_")[-1] == snapshot 193 | ] 194 | if source_text_reg: 195 | files += [ 196 | os.path.join(source_text_reg, x) 197 | for x in list_dir(source_text_reg, ".xml") 198 | if x.split(".")[0].split("_")[-1] == snapshot 199 | ] 200 | files.sort() 201 | else: # is DE 202 | files = get_snapshot_law_list(snapshot, law_names_data) 203 | files = [os.path.join(source_text, f) for f in files] 204 | 205 | whitespace_pattern = regex.compile(r"[\s\n]+") 206 | texts = {} 207 | for file in files: 208 | print(f"\r{files.index(file)} / {len(files)}", end="") 209 | soup = create_soup(file) 210 | tags = soup.find_all(["seqitem", "subseqitem"]) 211 | for tag in tags: 212 | if tag["key"] in leaf_keys: 213 | text = tag.get_text(" ") 214 | text = whitespace_pattern.sub(" ", text).lower().strip() 215 | texts[tag["key"]] = text.lower() 216 | return texts 217 | 218 | 219 | def map_unique_texts(data1, data2, min_text_length=50): 220 | """ 221 | Maps nodes from snapshot t1 to t2 if texts are in each snapshot unique and appear 222 | in the both snapshots 223 | """ 224 | leaf_texts1 = {k: t for k, t in zip(data1["keys"], data1["texts"])} 225 | leaf_texts2 = {k: t for k, t in zip(data2["keys"], data2["texts"])} 226 | 227 | # Create dicts with text as keys 228 | inverted_unique_leaf_texts1 = invert_dict_mapping_unique(leaf_texts1) 229 | inverted_unique_leaf_texts2 = invert_dict_mapping_unique(leaf_texts2) 230 | 231 | # find unique texts in both snapshots 232 | both_unique_texts = set(inverted_unique_leaf_texts1.keys()) & set( 233 | inverted_unique_leaf_texts2.keys() 234 | ) 235 | 236 | # Filter for texts with min length 237 | both_unique_texts = {x for x in both_unique_texts if len(x) >= min_text_length} 238 | 239 | # Create mapping 240 | new_mappings = {} 241 | for text in both_unique_texts: 242 | new_mappings[inverted_unique_leaf_texts1[text]] = inverted_unique_leaf_texts2[ 243 | text 244 | ] 245 | return new_mappings 246 | 247 | 248 | def map_same_citekey_same_text(data1, data2, remaining_keys1, remaining_keys2): 249 | text_and_citekeys1 = { 250 | k: (c.lower(), t) 251 | for k, t, c in zip(data1["keys"], data1["texts"], data1["citekeys"]) 252 | if c and k in remaining_keys1 253 | } 254 | text_and_citekeys2 = { 255 | k: (c.lower(), t) 256 | for k, t, c in zip(data2["keys"], data2["texts"], data2["citekeys"]) 257 | if c and k in remaining_keys2 258 | } 259 | inverted_text_and_citekeys1 = invert_dict_mapping_unique(text_and_citekeys1) 260 | inverted_text_and_citekeys2 = invert_dict_mapping_unique(text_and_citekeys2) 261 | 262 | both_unique_text_and_citekeys = set(inverted_text_and_citekeys1.keys()) & set( 263 | inverted_text_and_citekeys2.keys() 264 | ) 265 | 266 | # Create mapping 267 | new_mappings = {} 268 | for text_and_citekey in both_unique_text_and_citekeys: 269 | new_mappings[ 270 | inverted_text_and_citekeys1[text_and_citekey] 271 | ] = inverted_text_and_citekeys2[text_and_citekey] 272 | return new_mappings 273 | 274 | 275 | def clip_text_for_containment_matching(text): 276 | return text.split(" ", 1)[-1] # get rid of German Absatz numbers (e.g., "(1)") 277 | 278 | 279 | def map_text_containment( 280 | data1, 281 | data2, 282 | remaining_keys1, 283 | remaining_keys2, 284 | min_text_length=50, 285 | ): 286 | remaining_keys1_list = sorted(remaining_keys1) 287 | 
remaining_keys2_list = sorted(remaining_keys2) 288 | leaf_texts1_dict = {k: t for k, t in zip(data1["keys"], data1["texts"])} 289 | leaf_texts2_dict = {k: t for k, t in zip(data2["keys"], data2["texts"])} 290 | 291 | aligner = StringContainsAlign(min_text_length=min_text_length) 292 | aligner.text_list_0 = [ 293 | clip_text_for_containment_matching(leaf_texts1_dict[k]) 294 | for k in remaining_keys1_list 295 | ] 296 | aligner.text_list_1 = [ 297 | clip_text_for_containment_matching(leaf_texts2_dict[k]) 298 | for k in remaining_keys2_list 299 | ] 300 | aligner.create_index() 301 | 302 | containment_idx_forward = aligner.run() 303 | containment_idx_reversed = aligner.run(reversed=True) 304 | aligner.clean_index() 305 | 306 | containment_idx_reversed = [(v, u) for u, v in containment_idx_reversed] 307 | 308 | containment_idx = set(containment_idx_forward + containment_idx_reversed) 309 | 310 | # Filter one to one matches 311 | idx_1_counts = Counter(u for u, v in containment_idx) 312 | idx_2_counts = Counter(v for u, v in containment_idx) 313 | 314 | unique_keys_1 = {idx for idx, cnt in idx_1_counts.items() if cnt == 1} 315 | unique_keys_2 = {idx for idx, cnt in idx_2_counts.items() if cnt == 1} 316 | 317 | new_mappings = {} 318 | for u, v in containment_idx_forward + containment_idx_reversed: 319 | if u in unique_keys_1 and v in unique_keys_2: 320 | u_key = remaining_keys1_list[u] 321 | v_key = remaining_keys2_list[v] 322 | new_mappings[u_key] = v_key 323 | 324 | return new_mappings 325 | 326 | 327 | def get_neighborhood(data_keys, node, radius, keys_len, key_index_dict): 328 | 329 | curr_index = key_index_dict[node] 330 | lower_bound = max(0, curr_index - radius) 331 | upper_bound = min(keys_len, curr_index + radius) 332 | 333 | neighborhood = data_keys[lower_bound : upper_bound + 1] 334 | 335 | # Remove node in radius but of another law/title as their order ist mostly arbitrary 336 | key_prefix = node.split("_")[0] 337 | neighborhood = {n for n in neighborhood if n.startswith(key_prefix)} 338 | 339 | return neighborhood 340 | 341 | 342 | def cached_text_distance(s1, s2, cache, dry_run): 343 | key = (s1, s2) 344 | if dry_run: 345 | distance = None 346 | cache[key] = distance 347 | elif key not in cache: 348 | distance = textdistance.jaro_winkler(s1, s2) 349 | cache[key] = distance 350 | else: 351 | distance = cache[key] 352 | return distance 353 | 354 | 355 | def calc_text_distance(args): 356 | return textdistance.jaro_winkler(*args) 357 | 358 | 359 | def update_textdistance_cache(text_distance_cache): 360 | text_distance_texts = list(text_distance_cache.keys()) 361 | with Pool() as p: 362 | distances = tqdm.tqdm( 363 | p.imap(calc_text_distance, text_distance_texts), 364 | total=len(text_distance_texts), 365 | ) 366 | return {k: v for k, v in zip(text_distance_texts, distances)} 367 | 368 | 369 | def map_similar_text_common_neighbors( 370 | new_mappings, 371 | data_keys1, 372 | data_keys2, 373 | data_texts1, 374 | data_texts2, 375 | remaining_keys1, 376 | remaining_keys2, 377 | radius=5, 378 | distance_threshold=0.9, 379 | printing=None, 380 | dry_run=False, 381 | text_distance_cache=None, 382 | ): 383 | if not text_distance_cache: 384 | text_distance_cache = dict() 385 | 386 | keys_len1 = len(data_keys1) 387 | keys_len2 = len(data_keys2) 388 | key_index_dict1 = {k: idx for idx, k in enumerate(data_keys1)} 389 | key_index_dict2 = {k: idx for idx, k in enumerate(data_keys2)} 390 | 391 | leaf_texts1 = {k: v for k, v in zip(data_keys1, data_texts1)} 392 | leaf_texts2 = {k: v for k, v in 
zip(data_keys2, data_texts2)} 393 | 394 | key_queue = deque(remaining_keys1) 395 | key_queue_set = set(key_queue) 396 | i = -1 # only to print the process 397 | while key_queue: 398 | remaining_key1 = key_queue.popleft() 399 | key_queue_set.remove(remaining_key1) 400 | i += 1 # only to print the process 401 | if i % 100 == 0 and printing: 402 | total = len(key_queue) + i 403 | print( 404 | f"\r{printing} " f"{i/total*100:.2f}% \t ({total} )", 405 | end="", 406 | ) 407 | 408 | remaining_text1 = leaf_texts1[remaining_key1] 409 | 410 | # Get neighborhood of node in G1 411 | # Get mapping to G2 for neighborhood nodes 412 | # Get neighborhood of mapped G2 nodes 413 | neighborhood_nodes1 = get_neighborhood( 414 | data_keys1, remaining_key1, radius, keys_len1, key_index_dict1 415 | ) 416 | neighborhood_nodes2 = set() 417 | 418 | for neighborhood_node1 in neighborhood_nodes1: 419 | if neighborhood_node1 in new_mappings: 420 | neighborhood_nodes2.update( 421 | get_neighborhood( 422 | data_keys2, 423 | new_mappings[neighborhood_node1], 424 | radius, 425 | keys_len2, 426 | key_index_dict2, 427 | ) 428 | ) 429 | 430 | # Remove duplicates in G2 neighborhood 431 | neighborhood_nodes2 = [x for x in neighborhood_nodes2 if x in remaining_keys2] 432 | 433 | # Find most similar text 434 | neighborhood_text2 = [leaf_texts2.get(x) for x in neighborhood_nodes2] 435 | similarity = [ 436 | cached_text_distance(remaining_text1, x, text_distance_cache, dry_run) 437 | if x 438 | else 0 439 | for x in neighborhood_text2 440 | ] 441 | if not dry_run: 442 | max_similarity = max(similarity) if similarity else 0 443 | 444 | if max_similarity > distance_threshold: 445 | # Add to mapping and update remaining_keys 446 | max_index = similarity.index(max_similarity) 447 | id2_to_match_to = neighborhood_nodes2[max_index] 448 | new_mappings[remaining_key1] = id2_to_match_to 449 | remaining_keys2.remove(id2_to_match_to) 450 | remaining_keys1.remove(remaining_key1) 451 | 452 | # Requeue neighborhood of newly mapped element 453 | neighborhood_to_requeue = [ 454 | n 455 | for n in neighborhood_nodes1 456 | if n in remaining_keys1 and n not in key_queue_set 457 | ] 458 | key_queue.extend(neighborhood_to_requeue) 459 | key_queue_set.update(neighborhood_to_requeue) 460 | 461 | print() 462 | return text_distance_cache 463 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/snapshot_mapping_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import networkx as nx 5 | from lxml import etree 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | from quantlaw.utils.pipeline import PipelineStep 8 | from regex import regex 9 | 10 | from utils.common import get_snapshot_law_list 11 | 12 | 13 | class SnapshotMappingIndexStep(PipelineStep): 14 | def __init__( 15 | self, 16 | source_text, 17 | destination, 18 | dataset, 19 | law_names_data=None, 20 | *args, 21 | **kwargs, 22 | ): 23 | self.source_text = source_text 24 | self.destination = destination 25 | self.dataset = dataset 26 | self.law_names_data = law_names_data 27 | super().__init__(*args, **kwargs) 28 | 29 | def get_items(self, overwrite, snapshots) -> list: 30 | ensure_exists(self.destination) 31 | items = snapshots 32 | if not overwrite: 33 | existing_files = list_dir(self.destination, ".pickle") 34 | items = list(filter(lambda x: (x + ".pickle") not in existing_files, items)) 35 | return items 36 | 37 | def execute_item(self, item): 38 | # 
Load texts 39 | item_data = list( 40 | get_texttags_to_compare( 41 | item, 42 | self.source_text, 43 | self.law_names_data, 44 | self.dataset, 45 | ) 46 | ) 47 | 48 | self.save_raw(item, item_data) 49 | 50 | def save_raw(self, item, item_data): 51 | 52 | keys, citekeys, texts = list(zip(*item_data)) 53 | 54 | pickle_path = os.path.join(self.destination, item + ".pickle") 55 | 56 | with open(pickle_path, "wb") as f: 57 | pickle.dump(dict(keys=keys, texts=texts, citekeys=citekeys), f) 58 | 59 | 60 | def load_crossref_graph(item, source): 61 | graph_path = f"{source}/{item}.gpickle.gz" 62 | G = nx.read_gpickle(graph_path) 63 | return G 64 | 65 | 66 | def get_texttags_to_compare(snapshot, source_texts, law_names_data, dataset): 67 | 68 | if dataset == "us": 69 | if type(source_texts) is str: 70 | source_texts = [source_texts] 71 | 72 | files = sorted( 73 | [ 74 | os.path.join(source_text, x) 75 | for source_text in source_texts 76 | for x in list_dir(source_text, ".xml") 77 | if x.split(".")[0].split("_")[-1] == snapshot 78 | ] 79 | ) 80 | else: # is DE 81 | assert type(source_texts) is str 82 | files = get_snapshot_law_list(snapshot, law_names_data) 83 | files = [os.path.join(source_texts, f) for f in files] 84 | 85 | whitespace_pattern = regex.compile(r"[\s\n]+") 86 | 87 | for file in files: 88 | tree = etree.parse(file) 89 | for text_tag in tree.xpath("//text"): 90 | item = text_tag.getparent() 91 | 92 | text_elems = [e for e in item.getchildren() if e.tag == "text"] 93 | pos_in_item = text_elems.index(text_tag) 94 | text_key = item.attrib["key"] + f"_{pos_in_item}" 95 | 96 | seqitem = get_seqitem(item) 97 | if seqitem is not None: 98 | citekey = seqitem.attrib.get("citekey") 99 | else: 100 | citekey = None 101 | 102 | text = etree.tostring(text_tag, method="text", encoding="utf8").decode( 103 | "utf-8" 104 | ) 105 | text = whitespace_pattern.sub(" ", text).lower().strip() 106 | 107 | yield text_key, citekey, text 108 | 109 | 110 | def get_seqitem(elem): 111 | if elem is None: 112 | return None 113 | elif elem.tag == "seqitem": 114 | return elem 115 | return get_seqitem(elem.getparent()) 116 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_authority_edgelist.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | 4 | import lxml.etree 5 | 6 | from statics import US_REG_AUTHORITY_EDGELIST_PATH 7 | from statutes_pipeline_steps.us_crossreference_edgelist import UsCrossreferenceEdgelist 8 | 9 | 10 | class UsAuthorityEdgelist(UsCrossreferenceEdgelist): 11 | @property 12 | def dest(self): 13 | assert self.regulations 14 | return US_REG_AUTHORITY_EDGELIST_PATH 15 | 16 | def make_edge_list(self, yearfile_path, key_dict): 17 | with open(yearfile_path, encoding="utf8") as f: 18 | file_elem = lxml.etree.parse(f) 19 | edge_list = [] 20 | 21 | # for debug 22 | # problem_matches = set() 23 | # problem_keys = set() 24 | 25 | for item in file_elem.xpath("//*[@auth_text_parsed]"): 26 | node_out = item.attrib.get("key") 27 | refs = itertools.chain.from_iterable( 28 | json.loads(item.attrib["auth_text_parsed"]) 29 | ) 30 | for ref in refs: 31 | key = "_".join(ref[:2]) 32 | node_in = key_dict.get(key) 33 | 34 | if node_in: 35 | edge_list.append([node_out, node_in]) 36 | 37 | return edge_list 38 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_crossreference_edgelist.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import lxml.etree 5 | import pandas as pd 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | 8 | from statics import ( 9 | US_CROSSREFERENCE_EDGELIST_PATH, 10 | US_CROSSREFERENCE_LOOKUP_PATH, 11 | US_REFERENCE_PARSED_PATH, 12 | US_REG_CROSSREFERENCE_EDGELIST_PATH, 13 | US_REG_CROSSREFERENCE_LOOKUP_PATH, 14 | US_REG_REFERENCE_PARSED_PATH, 15 | ) 16 | from utils.common import RegulationsPipelineStep 17 | 18 | 19 | class UsCrossreferenceEdgelist(RegulationsPipelineStep): 20 | def __init__(self, detailed_crossreferences, *args, **kwargs): 21 | self.detailed_crossreferences = detailed_crossreferences 22 | super().__init__(*args, **kwargs) 23 | 24 | def get_items(self, overwrite, snapshots) -> list: 25 | ensure_exists(self.dest) 26 | if not snapshots: 27 | snapshots = sorted( 28 | set([os.path.splitext(x)[0] for x in list_dir(self.lookup, ".csv")]) 29 | ) 30 | 31 | if not overwrite: 32 | existing_files = os.listdir(self.dest) 33 | snapshots = list( 34 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 35 | ) 36 | 37 | return snapshots 38 | 39 | @property 40 | def dest(self): 41 | return ( 42 | US_REG_CROSSREFERENCE_EDGELIST_PATH 43 | if self.regulations 44 | else US_CROSSREFERENCE_EDGELIST_PATH 45 | ) + ("/detailed" if self.detailed_crossreferences else "") 46 | 47 | @property 48 | def lookup(self): 49 | return ( 50 | US_REG_CROSSREFERENCE_LOOKUP_PATH 51 | if self.regulations 52 | else US_CROSSREFERENCE_LOOKUP_PATH 53 | ) + ("/detailed" if self.detailed_crossreferences else "") 54 | 55 | def execute_item(self, item): 56 | yearfiles = [ 57 | os.path.join(US_REFERENCE_PARSED_PATH, x) 58 | for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml") 59 | if str(item) in x 60 | ] 61 | if self.regulations: 62 | yearfiles += [ 63 | os.path.join(US_REG_REFERENCE_PARSED_PATH, x) 64 | for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml") 65 | if str(item) in x 66 | ] 67 | 68 | key_df = pd.read_csv(f"{self.lookup}/{item}.csv").dropna().set_index("citekey") 69 | key_dict = {} 70 | for idx, val in key_df.key.iteritems(): 71 | if idx not in key_dict: 72 | key_dict[idx] = val 73 | edge_list = [] 74 | for yearfile_path in yearfiles: 75 | edge_list_file = self.make_edge_list(yearfile_path, key_dict) 76 | edge_list.extend(edge_list_file) 77 | if edge_list: 78 | df = pd.DataFrame(edge_list, columns=["out_node", "in_node"]) 79 | df.to_csv(f"{self.dest}/{item}.csv", index=False) 80 | 81 | def make_edge_list(self, yearfile_path, key_dict): 82 | with open(yearfile_path, encoding="utf8") as f: 83 | file_elem = lxml.etree.parse(f) 84 | edge_list = [] 85 | 86 | if self.detailed_crossreferences: 87 | for ref_elem in file_elem.xpath(".//reference"): 88 | node_out = ref_elem.getparent().getparent().attrib.get("key") 89 | refs = json.loads(ref_elem.attrib["parsed"]) 90 | for ref in refs: 91 | for cutoff in range(len(ref), 1, -1): 92 | key = "_".join(ref[:cutoff]) 93 | node_in = key_dict.get(key) 94 | if node_in: 95 | edge_list.append([node_out, node_in]) 96 | break 97 | else: 98 | for seqitem_elem in file_elem.xpath("//seqitem"): 99 | node_out = seqitem_elem.attrib.get("key") 100 | for ref_elem in seqitem_elem.xpath(".//reference"): 101 | refs = json.loads(ref_elem.attrib["parsed"]) 102 | for ref in refs: 103 | for cutoff in range(len(ref), 1, -1): 104 | key = "_".join(ref[:cutoff]) 105 | node_in = key_dict.get(key) 106 | if node_in: 107 | edge_list.append([node_out, node_in]) 108 | 
break 109 | return edge_list 110 | 111 | 112 | ########### 113 | # Functions 114 | ########### 115 | 116 | 117 | def get_filename(date): 118 | return f"{date}.csv" 119 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_crossreference_lookup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import lxml.etree 4 | import pandas as pd 5 | from quantlaw.utils.files import ensure_exists, list_dir 6 | 7 | from statics import ( 8 | US_CROSSREFERENCE_LOOKUP_PATH, 9 | US_REFERENCE_PARSED_PATH, 10 | US_REG_CROSSREFERENCE_LOOKUP_PATH, 11 | US_REG_REFERENCE_PARSED_PATH, 12 | ) 13 | from utils.common import RegulationsPipelineStep 14 | 15 | 16 | class UsCrossreferenceLookup(RegulationsPipelineStep): 17 | def __init__(self, detailed_crossreferences, *args, **kwargs): 18 | self.detailed_crossreferences = detailed_crossreferences 19 | super().__init__(*args, **kwargs) 20 | 21 | def get_items(self, overwrite, snapshots) -> list: 22 | ensure_exists(self.dest) 23 | 24 | # If snapshots not set, create list of all years 25 | if not snapshots: 26 | snapshots = sorted( 27 | set( 28 | [ 29 | x.split(".")[0].split("_")[-1] 30 | for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml") 31 | ] 32 | ) 33 | ) 34 | 35 | if not overwrite: 36 | existing_files = os.listdir(self.dest) 37 | snapshots = list( 38 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 39 | ) 40 | 41 | return snapshots 42 | 43 | @property 44 | def dest(self): 45 | return ( 46 | US_REG_CROSSREFERENCE_LOOKUP_PATH 47 | if self.regulations 48 | else US_CROSSREFERENCE_LOOKUP_PATH 49 | ) + ("/detailed" if self.detailed_crossreferences else "") 50 | 51 | def execute_item(self, item): 52 | yearfiles = [ 53 | os.path.join(US_REFERENCE_PARSED_PATH, x) 54 | for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml") 55 | if str(item) in x 56 | ] 57 | if self.regulations: 58 | yearfiles += [ 59 | os.path.join(US_REG_REFERENCE_PARSED_PATH, x) 60 | for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml") 61 | if str(item) in x 62 | ] 63 | data = [] 64 | for file in yearfiles: 65 | with open(file, encoding="utf8") as f: 66 | file_elem = lxml.etree.parse(f) 67 | for node in file_elem.xpath("//*[@citekey]"): 68 | data.append([node.attrib["key"], node.attrib["citekey"]]) 69 | if self.detailed_crossreferences: 70 | for node in file_elem.xpath("//*[@citekey_detailed]"): 71 | for citekey in node.attrib["citekey_detailed"].split(","): 72 | data.append([node.attrib["key"], citekey]) 73 | df = pd.DataFrame(data, columns=["key", "citekey"]) 74 | destination_file = f"{self.dest}/{get_filename(item)}" 75 | df.to_csv(destination_file, index=False) 76 | 77 | 78 | def get_filename(year): 79 | return f"{year}.csv" 80 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_prepare_input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | 5 | from quantlaw.utils.files import ensure_exists 6 | 7 | from statics import US_INPUT_PATH, US_ORIGINAL_PATH 8 | 9 | 10 | def us_prepare_input(): 11 | """ 12 | moves source files into main dir and validate files roughly 13 | """ 14 | 15 | ensure_exists(US_ORIGINAL_PATH) 16 | 17 | subfolders = [f.name for f in os.scandir(US_INPUT_PATH) if f.is_dir()] 18 | for subfolder in subfolders: 19 | for item in os.listdir(f"{US_INPUT_PATH}/{subfolder}"): 20 | 21 | # Filter by filename pattern 22 | 
pattern = re.compile(r"(\d+)usc(\d+)(a)?\.html?", flags=re.IGNORECASE) 23 | match = pattern.fullmatch(item) 24 | if not match: 25 | continue 26 | 27 | new_name = f'{match[2]}{"1" if match[3] else "0"}_{match[1]}.htm' 28 | 29 | # Prevent overwriting files 30 | if os.path.exists(f"{US_ORIGINAL_PATH}/{new_name}"): 31 | print(f"{US_ORIGINAL_PATH}/{new_name} already exists") 32 | else: 33 | shutil.copy( 34 | f"{US_INPUT_PATH}/{subfolder}/{item}", 35 | f"{US_ORIGINAL_PATH}/{new_name}", 36 | ) 37 | 38 | files = os.listdir(US_ORIGINAL_PATH) 39 | files = [f for f in files if f.endswith(".htm")] 40 | pattern = re.compile(r"(\d+)_(\d+)\.htm") 41 | years = {} 42 | for file in files: 43 | match = pattern.fullmatch(file) 44 | year = match[2] 45 | title = match[1] 46 | years[year] = years[year] if years.get(year) else [] 47 | years[year].append(title) 48 | 49 | for idx in list(years.keys()): 50 | years[idx] = sorted(years[idx]) 51 | 52 | print(f"{len(files)} files found") 53 | print(f"{len(years)} years found") 54 | 55 | for year in sorted(years.keys()): 56 | titles = years[year] 57 | print(f"{year}: n={len(titles)}, max='{max(titles)}'") 58 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reference_areas.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import multiprocessing 3 | import os 4 | 5 | import bs4 6 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 7 | from quantlaw.utils.files import ensure_exists, list_dir 8 | from regex import regex 9 | 10 | from statics import ( 11 | US_HELPERS_PATH, 12 | US_REFERENCE_AREAS_LOG_PATH, 13 | US_REFERENCE_AREAS_PATH, 14 | US_REG_HELPERS_PATH, 15 | US_REG_REFERENCE_AREAS_LOG_PATH, 16 | US_REG_REFERENCE_AREAS_PATH, 17 | US_REG_XML_PATH, 18 | US_XML_PATH, 19 | ) 20 | from statutes_pipeline_steps.us_reference_reg import find_authority_references 21 | from utils.common import RegulationsPipelineStep 22 | 23 | 24 | class UsReferenceAreasStep(RegulationsPipelineStep): 25 | max_number_of_processes = max(int(multiprocessing.cpu_count() / 2), 1) 26 | 27 | def get_items(self, overwrite) -> list: 28 | src = US_REG_XML_PATH if self.regulations else US_XML_PATH 29 | dest = ( 30 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 31 | ) 32 | ensure_exists(dest) 33 | files = list_dir(src, ".xml") 34 | 35 | if not overwrite: 36 | existing_files = os.listdir(dest) 37 | files = list(filter(lambda f: f not in existing_files, files)) 38 | 39 | return files 40 | 41 | def execute_item(self, item): 42 | src = US_REG_XML_PATH if self.regulations else US_XML_PATH 43 | dest = ( 44 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 45 | ) 46 | soup = create_soup(f"{src}/{item}") 47 | logs = find_references(soup, usc_pattern, {"pattern": "block"}) 48 | logs += find_references(soup, inline_pattern, {"pattern": "inline"}) 49 | 50 | if self.regulations: 51 | logs += find_authority_references(soup, usc_pattern) 52 | 53 | save_soup(soup, f"{dest}/{item}") 54 | return logs 55 | 56 | def finish_execution(self, results): 57 | logs = list(itertools.chain.from_iterable(results)) 58 | ensure_exists(US_REG_HELPERS_PATH if self.regulations else US_HELPERS_PATH) 59 | log_path = ( 60 | US_REG_REFERENCE_AREAS_LOG_PATH 61 | if self.regulations 62 | else US_REFERENCE_AREAS_LOG_PATH 63 | ) 64 | with open(log_path, mode="w") as f: 65 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 66 | 67 | 68 | 
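# ----------------------------------------------------------------------
# Usage sketch (illustrative): the block pattern defined below is intended
# to capture citations such as "42 U.S.C. 1983" or "29 C.F.R. Part 1910",
# while the inline pattern targets forms such as "section 101(a) of this
# title". These example strings are assumptions for illustration only;
# actual coverage is determined by the full regex definitions that follow.
# A minimal sketch, mirroring UsReferenceAreasStep.execute_item above
# (the input file name is hypothetical):
#
#     soup = create_soup("some_title_file.xml")  # hypothetical input file
#     logs = find_references(soup, usc_pattern, {"pattern": "block"})
#     logs += find_references(soup, inline_pattern, {"pattern": "inline"})
# ----------------------------------------------------------------------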
################ 69 | # Regex patterns 70 | ################ 71 | 72 | # fmt: off 73 | 74 | regex_definitions = ( 75 | r'(?(DEFINE)' 76 | r'(?' 77 | r'(\d+([\da-zA-Z\-\–\—\.]*[\da-zA-Z\-\–\—])?)' 78 | r'(\(\d*[a-z]{0,3}i*\))*' 79 | r'(\s+et\.?\s+seq\.?)?' 80 | r'(\s+and\sfollowing)?' 81 | r')' 82 | r'(?' 83 | r'(\(\d*[a-z]{0,2}i?\))+' 84 | r'(\s+et\.?\s+seq\.?)?' 85 | r')' 86 | r'(?' 87 | r',?\s+(and|or|to|through)(\sin)?\s+|' 88 | r'(,|;)\s+' 89 | r')' 90 | r')' 91 | ) 92 | 93 | usc_pattern_string = regex_definitions + ( 94 | r'(' 95 | r'(\d+)\s*' 96 | r'(' 97 | r'U\.?S\.?C\.?' 98 | r'|' 99 | r'C\.?F\.?R\.?' 100 | r')\s*' 101 | r'(Sec(?:tions?|\.)?|§§?|\b(sub)?Parts?)?\s*' 102 | r'(?&sec)' 103 | r'((?&conn)(Sec(?:tions|\.)?|§§?|\b(sub)?Parts?)?\s*(?&sec)|(?&conn)(?&numb))*' 104 | r')' 105 | r'(?!\w*(\sApp\.)?\s(U\.?S\.?C\.?|C\.?F\.?R\.?|Stat\.))' 106 | r'\s*' 107 | r'(' 108 | r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))' 109 | r'|' 110 | r'(of\stitle\s\d+)' 111 | r')?' 112 | r'(' 113 | r'\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations' 114 | r'|' 115 | r'\s+of\s+the\s+Code\s+of\s+the\s+United\s+States' 116 | r')?' 117 | ) 118 | usc_pattern = regex.compile(usc_pattern_string, flags=regex.IGNORECASE) 119 | 120 | inline_pattern_string = regex_definitions + ( 121 | r'(Sec(?:tion|\.)?|§§?|\b(sub)?parts?)\s*' 122 | r'(?&sec)' 123 | r'(' 124 | r'(?&conn)' 125 | r'(Sec(?:tions?|\.)?|§§?)?' 126 | r'\s*' 127 | r'(?&sec)' 128 | r'|' 129 | r'(?&conn)(?&numb)' 130 | r')*' 131 | r'\s*' 132 | r'(' 133 | r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))' 134 | r'|' 135 | r'(of\stitle\s\d+)' 136 | r')?' 137 | r'(' 138 | r'\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations' 139 | r'|' 140 | r'\s+of\s+the\s+Code\s+of\s+the\s+United\s+States' 141 | r')?' 
142 | ) 143 | inline_pattern = regex.compile(inline_pattern_string, flags=regex.IGNORECASE) 144 | 145 | # fmt: on 146 | 147 | ########### 148 | # Functions 149 | ########### 150 | 151 | 152 | def add_tag(string, pos, end, tag): 153 | """ 154 | Wraps part of a string a given tag 155 | """ 156 | tag.string = string[pos:end] 157 | return [ 158 | bs4.element.NavigableString(string[:pos]), 159 | tag, 160 | bs4.element.NavigableString(string[end:]), 161 | ] 162 | 163 | 164 | def find_references(soup, pattern, attrs): 165 | """ 166 | Finds the references in the soup and marks them a tag 167 | """ 168 | logs = [] # For debug 169 | 170 | text_tags = list(soup.find_all("text")) 171 | for text_tag in text_tags: 172 | for text_tag_string in list(text_tag.contents): 173 | if type(text_tag_string) is not bs4.element.NavigableString: 174 | continue 175 | tag_cursor = text_tag_string 176 | last_match_end = 0 177 | matches = pattern.finditer(text_tag_string) 178 | for match in list(matches): 179 | if regex.match(r"\s?,?of\b", text_tag_string[match.end() :]): 180 | continue 181 | ref_tag = soup.new_tag("reference", **attrs) 182 | pre_text, ref_tag, post_text = add_tag( 183 | text_tag_string, match.start(), match.end(), ref_tag 184 | ) 185 | 186 | pre_text = pre_text[last_match_end:] 187 | last_match_end = match.end() 188 | 189 | tag_cursor.replace_with(ref_tag) 190 | ref_tag.insert_before(pre_text) 191 | ref_tag.insert_after(post_text) 192 | tag_cursor = post_text 193 | 194 | logs.append(f"{post_text[:50]} --- {match[0]}") # For debug 195 | 196 | return logs # For debug 197 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reference_parse.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import multiprocessing 4 | import os 5 | from builtins import Exception 6 | 7 | import regex 8 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 9 | from quantlaw.utils.files import ensure_exists, list_dir 10 | 11 | from statics import ( 12 | US_HELPERS_PATH, 13 | US_REFERENCE_AREAS_PATH, 14 | US_REFERENCE_PARSED_LOG_PATH, 15 | US_REFERENCE_PARSED_PATH, 16 | US_REG_HELPERS_PATH, 17 | US_REG_REFERENCE_AREAS_PATH, 18 | US_REG_REFERENCE_PARSED_LOG_PATH, 19 | US_REG_REFERENCE_PARSED_PATH, 20 | ) 21 | from utils.common import RegulationsPipelineStep 22 | 23 | 24 | class UsReferenceParseStep(RegulationsPipelineStep): 25 | max_number_of_processes = max(int(multiprocessing.cpu_count() / 2), 1) 26 | 27 | def get_items(self, overwrite) -> list: 28 | src = ( 29 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 30 | ) 31 | dest = ( 32 | US_REG_REFERENCE_PARSED_PATH 33 | if self.regulations 34 | else US_REFERENCE_PARSED_PATH 35 | ) 36 | 37 | ensure_exists(dest) 38 | files = list_dir(src, ".xml") 39 | 40 | if not overwrite: 41 | existing_files = os.listdir(dest) 42 | files = list(filter(lambda f: f not in existing_files, files)) 43 | return files 44 | 45 | def execute_item(self, item): 46 | from statutes_pipeline_steps.us_reference_reg import parse_authority_references 47 | 48 | src = ( 49 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 50 | ) 51 | dest = ( 52 | US_REG_REFERENCE_PARSED_PATH 53 | if self.regulations 54 | else US_REFERENCE_PARSED_PATH 55 | ) 56 | 57 | soup = create_soup(f"{src}/{item}") 58 | 59 | this_title = self.get_title_from_filename(item) 60 | try: 61 | logs = parse_references(soup, this_title, this_usc=not 
self.regulations) 62 | logs += parse_authority_references(soup) 63 | except Exception: 64 | print(item) 65 | raise 66 | save_soup(soup, f"{dest}/{item}") 67 | return logs 68 | 69 | def finish_execution(self, results): 70 | logs = list(itertools.chain.from_iterable(results)) 71 | ensure_exists(US_REG_HELPERS_PATH if self.regulations else US_HELPERS_PATH) 72 | with open( 73 | US_REG_REFERENCE_PARSED_LOG_PATH 74 | if self.regulations 75 | else US_REFERENCE_PARSED_LOG_PATH, 76 | mode="w", 77 | ) as f: 78 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 79 | 80 | def get_title_from_filename(self, filename): 81 | if self.regulations: 82 | base = os.path.splitext(filename)[0] 83 | assert base.startswith("cfr") 84 | title_key = base.split("_")[0][len("cfr") :] 85 | return int(title_key) 86 | else: 87 | base = os.path.splitext(filename)[0] 88 | title_key = base.split("_")[0] 89 | assert title_key[-1] == "0" 90 | assert len(title_key) == 3 91 | return int(title_key[:-1]) 92 | 93 | 94 | ########### 95 | # Functions 96 | ########### 97 | 98 | 99 | def sortable_paragraph_number(string): 100 | MIN_DIGITS = 4 101 | digits = len(regex.match(r"^\d*", string)[0]) 102 | if not digits: 103 | return string 104 | return "0 " * (MIN_DIGITS - digits) + string 105 | 106 | 107 | split_pattern_short = regex.compile( 108 | r"\s*(?:\b|(?<=\d))(U\.?S\.?C|C\.?F\.?R)(?:\.|\b|(?=\d)|Sec\.)\s*", 109 | flags=regex.IGNORECASE, 110 | ) 111 | split_pattern_inline = regex.compile( 112 | # fmt: off 113 | r"\s*of\s+(?=(?:" 114 | r'(?:this\s(?:sub\-?)?(?:title|chapter|part|section|division|paragraph))' 115 | r'|' 116 | r'(?:title)' 117 | r"))" 118 | # fmt: on 119 | , 120 | flags=regex.IGNORECASE, 121 | ) 122 | sub_split_pattern = regex.compile( 123 | r"\s*,?\s*(?:and|or|,|;|throu?g?h?|to)\s+", flags=regex.IGNORECASE 124 | ) 125 | 126 | 127 | def get_enum_types(string): 128 | return ( 129 | bool(regex.fullmatch(r"[a-z]", string)), 130 | bool(regex.fullmatch(r"\d+", string)), 131 | bool(regex.fullmatch(r"[A-Z]", string)), 132 | bool(regex.fullmatch(r"[xvi]x{0,4}v?i{0,4}", string)), 133 | bool(regex.fullmatch(r"[XVI]X{0,4}V?I{0,4}", string)), 134 | bool(regex.fullmatch(r"([a-z])\1", string)), 135 | ) 136 | 137 | 138 | def enum_types_match(x, y): 139 | for a, b in zip(x, y): 140 | if a and b: 141 | return True 142 | return False 143 | 144 | 145 | # fmt: off 146 | 147 | inline_title_pattern = regex.compile( 148 | r'(?:' 149 | r'(this)\s(?:sub\-?)?(?:title|chapter|part|section|division|paragraph)' 150 | r'|' 151 | r'title\s(\d+)' 152 | r')' 153 | r'(\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations)?' 
154 | r'(\s+of\s+the\s+Code\s+of\s+the\s+United\s+States)?', 155 | flags=regex.IGNORECASE 156 | ) 157 | 158 | # fmt: on 159 | 160 | 161 | def extract_title_inline(text, this_title, this_usc): 162 | match = inline_title_pattern.fullmatch(text) 163 | assert match 164 | 165 | if bool(match[4]): 166 | usc = True 167 | elif bool(match[3]): 168 | usc = False 169 | else: 170 | usc = this_usc 171 | 172 | if match[1]: 173 | return this_title, usc 174 | elif match[2]: 175 | return int(match[2]), usc 176 | else: 177 | raise Exception(text) 178 | 179 | 180 | def split_block_reference(reference_str, debug_context=None): 181 | text_parts = split_pattern_short.split(reference_str) 182 | if not len(text_parts) == 3: 183 | print("ERROR", text_parts, str(debug_context)) 184 | title = int(text_parts[0].strip()) 185 | usc = "u" in text_parts[1].lower() 186 | sub_text = text_parts[2] 187 | return usc, title, sub_text 188 | 189 | 190 | def parse_references(soup, this_title, this_usc): 191 | test_list = [] # For debug 192 | for ref_tag in soup.find_all("reference"): 193 | # Split into title and subtitle 194 | last_usc = None 195 | last_title = None 196 | if ref_tag["pattern"] == "block": 197 | usc, title, sub_text = split_block_reference( 198 | ref_tag.string, debug_context=ref_tag 199 | ) 200 | text_parts = split_pattern_inline.split(sub_text) 201 | if len(text_parts) == 2: 202 | last_title, last_usc = extract_title_inline( 203 | text_parts[1].strip(), this_title, this_usc 204 | ) 205 | sub_text = text_parts[0] 206 | elif len(text_parts) > 2: 207 | raise Exception(str(ref_tag)) 208 | 209 | elif ref_tag["pattern"] == "inline": 210 | text_parts = split_pattern_inline.split(ref_tag.string) 211 | if len(text_parts) == 2: 212 | title, usc = extract_title_inline( 213 | text_parts[1].strip(), this_title, this_usc 214 | ) 215 | sub_text = text_parts[0] 216 | elif len(text_parts) == 1: 217 | title = this_title 218 | sub_text = text_parts[0].strip() 219 | usc = this_usc 220 | else: 221 | raise Exception(str(ref_tag)) 222 | else: 223 | raise Exception(f"{str(ref_tag)} has not matching pattern") 224 | 225 | references = parse_reference_text(sub_text) 226 | add_title_to_reference(references, title, usc, last_title, last_usc) 227 | 228 | ref_tag["parsed"] = json.dumps(references, ensure_ascii=False) 229 | test_list.append(f"{sub_text} -- {json.dumps(references, ensure_ascii=False)}") 230 | return test_list 231 | 232 | 233 | def add_title_to_reference(references, title, usc, last_title=None, last_usc=None): 234 | # Add title to index 0 of reference 235 | for reference in references: 236 | if usc: 237 | title_str = str(title) 238 | else: 239 | title_str = "cfr" + str(title) 240 | reference.insert(0, title_str) 241 | if len(references) > 1 and last_title is not None: 242 | assert last_usc is not None 243 | if last_usc: 244 | title_str = str(last_title) 245 | else: 246 | title_str = "cfr" + str(last_title) 247 | references[-1][0] = title_str 248 | 249 | 250 | def parse_reference_text(sub_text): 251 | # Preformat ranges 252 | for match in regex.finditer( 253 | r"(\d+[a-z]{0,3})[\-\–\—](\d+[a-z]{0,3})", 254 | sub_text, 255 | flags=regex.IGNORECASE, 256 | ): 257 | if sortable_paragraph_number(match[1]) <= sortable_paragraph_number(match[2]): 258 | sub_text = ( 259 | f"{sub_text[:match.start()]}{match[1]} through " 260 | f"{match[2]}{sub_text[match.end():]}" 261 | ) 262 | 263 | sub_text = sub_text.replace(" and following", " et. 
seq.").strip() 264 | 265 | references = [] 266 | text_sub_splitted = sub_split_pattern.split(sub_text) 267 | for test_text in text_sub_splitted: 268 | match = regex.fullmatch( 269 | r"(?:§+|sec\.|sections?\b|(?:sub)?parts?\b)?\s*" 270 | r"(\d+[a-z]{0,3}(?:[\-\–\—\.]\d+[a-z]{0,3})?)" 271 | r"\s?" 272 | r"((?:\((?:\d*[a-z]{0,4})\))*)" 273 | r"(" 274 | r" et\.? seq\.?|" 275 | r" and following" 276 | r")?", 277 | test_text, 278 | flags=regex.IGNORECASE, 279 | ) 280 | if not match: 281 | # test_list.append(f'{test_text} -- {sub_text} -- {file}') 282 | continue 283 | sections = [match[1]] 284 | sub_sections = regex.split(r"[\(\)]+", match[2]) 285 | sub_sections = [o for o in sub_sections if len(o)] 286 | sections.extend(sub_sections) 287 | 288 | if sections[0]: 289 | references.append(sections) 290 | else: 291 | new_reference = None 292 | current_part_types = get_enum_types(sections[1]) 293 | for old_part in reversed(references[-1][1:]): 294 | if enum_types_match(current_part_types, get_enum_types(old_part)): 295 | new_reference = references[-1][: references[-1].index(old_part)] 296 | break 297 | if not new_reference: 298 | new_reference = references[-1][:] 299 | new_reference.extend(sections[1:]) 300 | references.append(new_reference) 301 | return references 302 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reference_reg.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bs4 import BeautifulSoup 4 | from regex.regex import Match 5 | 6 | from statutes_pipeline_steps.us_reference_parse import ( 7 | add_title_to_reference, 8 | parse_reference_text, 9 | split_block_reference, 10 | ) 11 | 12 | 13 | def find_authority_references(soup: BeautifulSoup, pattern: Match): 14 | logs = [] 15 | 16 | for tag in soup.find_all(auth_text=True): 17 | auth_text = tag.attrs["auth_text"] 18 | matches = [m[0] for m in pattern.finditer(auth_text)] 19 | tag.attrs["auth_text_areas"] = json.dumps(matches, ensure_ascii=False) 20 | return logs 21 | 22 | 23 | def parse_authority_references(soup: BeautifulSoup): 24 | logs = [] 25 | for tag in soup.find_all(auth_text_areas=True): 26 | auth_areas = json.loads(tag.attrs["auth_text_areas"]) 27 | auth_parsed = [] 28 | for auth_area in auth_areas: 29 | usc, title, sub_text = split_block_reference( 30 | auth_area, debug_context=tag.attrs["auth_text"] 31 | ) 32 | references = parse_reference_text(sub_text) 33 | add_title_to_reference(references, title, usc) 34 | auth_parsed.append(references) 35 | tag.attrs["auth_text_parsed"] = json.dumps(auth_parsed, ensure_ascii=False) 36 | return logs 37 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reg_prepare_input.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import re 4 | import shutil 5 | from zipfile import ZipFile 6 | 7 | import pandas as pd 8 | 9 | from statics import US_REG_INPUT_COPY_LOG_PATH, US_REG_INPUT_PATH, US_REG_ORIGINAL_PATH 10 | from utils.common import ensure_exists 11 | 12 | pattern = re.compile(r".+/CFR-(?P\d+)-title(?P\d+)-vol(?P\d*).xml") 13 | 14 | 15 | def us_reg_prepare_input(): 16 | """moves files into main dir and validate files roughly""" 17 | 18 | ensure_exists(US_REG_ORIGINAL_PATH) 19 | 20 | year_zips = sorted( 21 | [f.name for f in os.scandir(US_REG_INPUT_PATH) if f.name.endswith(".zip")] 22 | ) 23 | for year_zip in year_zips: 24 | year = 
os.path.splitext(year_zip)[0] 25 | year_folder = os.path.join(US_REG_ORIGINAL_PATH, year) 26 | if os.path.exists(year_folder): 27 | raise Exception(f"{year_folder} already exists") 28 | 29 | with ZipFile(os.path.join(US_REG_INPUT_PATH, year_zip), "r") as zipObj: 30 | # Extract all the contents of zip file in current directory 31 | zipObj.extractall(year_folder) 32 | 33 | # Get all files 34 | vols = [ 35 | pattern.fullmatch(p).groupdict() 36 | for p in glob.glob(os.path.join(US_REG_ORIGINAL_PATH, "*/*/*.xml")) 37 | ] 38 | 39 | print("Dropping") 40 | for vol in vols: 41 | if not vol["v"]: 42 | print(vol) 43 | os.remove( 44 | os.path.join( 45 | US_REG_ORIGINAL_PATH, 46 | vol["y"], 47 | f"title-{vol['t']}", 48 | f"CFR-{vol['y']}-title{vol['t']}-vol.xml", 49 | ) 50 | ) 51 | vols = [v for v in vols if v["v"]] 52 | 53 | df = pd.DataFrame(vols) 54 | df.v = [int(v) if v else None for v in df.v] 55 | df.y = [int(y) for y in df.y] 56 | df.t = [int(t) for t in df.t] 57 | df = df.sort_values(["y", "t", "v"]).reset_index().drop("index", axis=1) 58 | 59 | volumes = sorted({(t, v) for t, v in zip(df.t, df.v)}) 60 | 61 | copy_actions = [] 62 | 63 | for title, volume in volumes: 64 | vol_df = df[(df.t == title) & (df.v == volume)] 65 | existing_years = set(vol_df.y) 66 | last_exisiting_year = None 67 | for year in range(vol_df.y.min(), vol_df.y.max()): 68 | if year in existing_years: 69 | last_exisiting_year = year 70 | else: 71 | assert last_exisiting_year 72 | copy_actions.append( 73 | dict( 74 | title=title, 75 | volume=volume, 76 | from_year=last_exisiting_year, 77 | to_year=year, 78 | ) 79 | ) 80 | for copy_action in copy_actions: 81 | to_dir = os.path.join( 82 | US_REG_ORIGINAL_PATH, 83 | str(copy_action["to_year"]), 84 | f"title-{copy_action['title']}", 85 | ) 86 | os.makedirs(to_dir, exist_ok=True) 87 | shutil.copy( 88 | os.path.join( 89 | US_REG_ORIGINAL_PATH, 90 | str(copy_action["from_year"]), 91 | f"title-{copy_action['title']}", 92 | f"CFR-{copy_action['from_year']}-" 93 | f"title{copy_action['title']}-" 94 | f"vol{copy_action['volume']}.xml", 95 | ), 96 | os.path.join( 97 | to_dir, 98 | f"CFR-{copy_action['to_year']}-" 99 | f"title{copy_action['title']}-" 100 | f"vol{copy_action['volume']}.xml", 101 | ), 102 | ) 103 | pd.DataFrame(copy_actions).to_csv(US_REG_INPUT_COPY_LOG_PATH, index=False) 104 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantLaw/legal-data-preprocessing/4264cd630b13e3d3bb934d4abd73b5b98217873c/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import unittest 3 | 4 | from utils.common import str_to_bool 5 | 6 | 7 | class TestCommon(unittest.TestCase): 8 | def test_str_to_bool(self): 9 | self.assertTrue(str_to_bool("YES")) 10 | self.assertTrue(str_to_bool("true")) 11 | self.assertFalse(str_to_bool("No")) 12 | self.assertTrue(str_to_bool(True)) 13 | with self.assertRaises(argparse.ArgumentTypeError): 14 | str_to_bool("hell!") 15 | -------------------------------------------------------------------------------- /tests/test_de_reference_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from bs4 import BeautifulSoup 4 | from quantlaw.de_extract.statutes_parse import 
StatutesParser 5 | 6 | from statutes_pipeline_steps.de_reference_parse import parse_reference_content 7 | from statutes_pipeline_steps.us_reference_parse import split_block_reference 8 | 9 | 10 | class TestDeReferenceParse(unittest.TestCase): 11 | def test_parse_reference_content_lower_s(self): 12 | reference = BeautifulSoup( 13 | '' 14 | "
§ 6 Absatz 1 Nummer 2 Buchstabe r, s, t und v
" 15 | "
", 16 | "lxml-xml", 17 | ).reference 18 | parser = StatutesParser({}) 19 | parse_reference_content(reference, parser) 20 | self.assertEqual( 21 | '[["6", "1", "2", "r"], ' 22 | '["6", "1", "2", "s"], ' 23 | '["6", "1", "2", "t"], ' 24 | '["6", "1", "2", "v"]]', 25 | reference.attrs["parsed"], 26 | ) 27 | 28 | def test_parse_reference_content_upper_s_ignore(self): 29 | reference = BeautifulSoup( 30 | '' 31 | "
§ 6 Absatz 1 Nummer 2 Buchstabe r, s, t, S
" 32 | "
", 33 | "lxml-xml", 34 | ).reference 35 | parser = StatutesParser({}) 36 | parse_reference_content(reference, parser) 37 | self.assertEqual( 38 | '[["6", "1", "2", "r"], ' '["6", "1", "2", "s"], ' '["6", "1", "2", "t"]]', 39 | reference.attrs["parsed"], 40 | ) 41 | 42 | def test_parse_reference_content_upper_s_for_Satz(self): 43 | reference = BeautifulSoup( 44 | '' 45 | "
§ 6 Absatz 1 Nummer 2 S 4, S 5
" 46 | "
", 47 | "lxml-xml", 48 | ).reference 49 | parser = StatutesParser({}) 50 | parse_reference_content(reference, parser) 51 | self.assertEqual( 52 | '[["6", "1", "2", "4"], ' '["6", "1", "2", "5"]]', reference.attrs["parsed"] 53 | ) 54 | 55 | def test_cfrsec_splitter(self): 56 | split_block_reference("47 CFRSec. 1.1204(b)", debug_context=None) 57 | -------------------------------------------------------------------------------- /tests/test_de_reference_parse_vso_list.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from statutes_pipeline_steps.de_reference_parse_vso_list import ( 4 | remove_duplicate_references, 5 | ) 6 | 7 | 8 | class MyTestCase(unittest.TestCase): 9 | def test_remove_duplicate_references(self): 10 | self.assertEqual( 11 | remove_duplicate_references( 12 | [["SGB-4", "28p", "8"], ["SGB-4", "28p", "8"], ["SGB-4", "28p", "7"]] 13 | ), 14 | [["SGB-4", "28p", "8"], ["SGB-4", "28p", "7"]], 15 | ) 16 | 17 | self.assertEqual( 18 | remove_duplicate_references( 19 | [ 20 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "2"]], 21 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "3"]], 22 | [["Gesetz", "EnWiG"], ["§", "5"]], 23 | [["Gesetz", "EnWiG"], ["§", "5"]], 24 | ] 25 | ), 26 | [ 27 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "2"]], 28 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "3"]], 29 | [["Gesetz", "EnWiG"], ["§", "5"]], 30 | ], 31 | ) 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /tests/test_snapshot_mapping_index.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import string 4 | from unittest import TestCase 5 | 6 | from utils.string_list_contains import StringContainsAlign 7 | 8 | 9 | def get_random_string(length): 10 | letters = string.ascii_lowercase + " " * 8 11 | result_str = "".join(random.choice(letters) for i in range(length)) 12 | return re.sub(r"\s+", " ", result_str) 13 | 14 | 15 | class StringContainsAlignTestCase(TestCase): 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | list_random_part = [ 19 | get_random_string(random.randint(100, 1000)) for _ in range(100) 20 | ] 21 | list_random_part_1 = [ 22 | get_random_string(random.randint(100, 1000)) for _ in range(100) 23 | ] 24 | list_random_part_2 = [ 25 | get_random_string(random.randint(100, 1000)) for _ in range(100) 26 | ] 27 | 28 | cls.test_list_1 = ( 29 | list_random_part * 10 30 | + list_random_part_1 31 | + list_random_part * 25 32 | + ["sdf sdf", "sdfsdf"] 33 | ) 34 | cls.test_list_2 = ( 35 | list_random_part_2 * 15 36 | + list_random_part_1 37 | + list_random_part_2 * 20 38 | + ["sdf sdf", "sdfsdf"] 39 | ) 40 | 41 | def test_align(self): 42 | 43 | aligner = StringContainsAlign() 44 | aligner.text_list_0 = self.__class__.test_list_1 45 | aligner.text_list_1 = self.__class__.test_list_2 46 | 47 | aligner.create_index() 48 | 49 | res = aligner.run(reversed=True) 50 | 51 | self.assertEqual(102, len(res)) 52 | 53 | aligner.min_text_length = 100 54 | res = aligner.run(reversed=True) 55 | 56 | self.assertTrue(0 < len(res) < 102) 57 | -------------------------------------------------------------------------------- /tests/test_us_reg_xml.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from statutes_pipeline_steps.us_reg_to_xml import split_double_units 4 | 5 | 6 | class MyTestCase(unittest.TestCase): 7 | def 
test_split_double_units(self): 8 | self.assertEqual( 9 | [["(a)"], ["(1) sdf"], ["(2) asdasd"], [["x", "y"]]], 10 | list(split_double_units([["(a)(1) sdf"], ["(a)(2) asdasd"], [["x", "y"]]])), 11 | ) 12 | -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import shutil 5 | from collections import Counter 6 | 7 | import pandas as pd 8 | from quantlaw.utils.files import ensure_exists 9 | from quantlaw.utils.pipeline import PipelineStep 10 | from regex import regex 11 | 12 | from statics import ( 13 | DATA_PATH, 14 | DE_LAW_NAMES_COMPILED_PATH, 15 | DE_LAW_NAMES_PATH, 16 | DE_REG_LAW_NAMES_COMPILED_PATH, 17 | DE_REG_LAW_NAMES_PATH, 18 | ) 19 | 20 | ########## 21 | # Pipeline 22 | ########## 23 | 24 | 25 | class RegulationsPipelineStep(PipelineStep): 26 | def __init__(self, regulations, *args, **kwargs): 27 | self.regulations = regulations 28 | super().__init__(*args, **kwargs) 29 | 30 | 31 | def str_to_bool(v): 32 | if isinstance(v, bool): 33 | return v 34 | if v.lower() in ("yes", "true", "t", "y", "1"): 35 | return True 36 | elif v.lower() in ("no", "false", "f", "n", "0"): 37 | return False 38 | else: 39 | raise argparse.ArgumentTypeError("Boolean value expected.") 40 | 41 | 42 | ######################## 43 | # Generic Data Wrangling 44 | ######################## 45 | 46 | 47 | def invert_dict_mapping_all(mapping_dictionary): 48 | """ 49 | Args: 50 | mapping_dictionary: mapping from keys to values which is not necessarily 51 | injective, e.g., node_id to community_id mapping 52 | 53 | Returns: inverted mapping with unique values as keys and lists of former keys as 54 | values, e.g., community_id to node_id mapping 55 | 56 | """ 57 | inverted = {v: [] for v in mapping_dictionary.values()} 58 | for k, v in mapping_dictionary.items(): 59 | inverted[v].append(k) 60 | return inverted 61 | 62 | 63 | def invert_dict_mapping_unique(source_dict): 64 | """ 65 | Inverts keys and values of a dict. Only entries with unique values are inverted. 66 | """ 67 | counter = Counter(source_dict.values()) 68 | unique = set([text for text, cnt in counter.most_common() if cnt == 1]) 69 | return {v: k for k, v in source_dict.items() if v in unique} 70 | 71 | 72 | #################### 73 | # DE Crossreferences 74 | #################### 75 | 76 | 77 | def load_law_names(regulations): 78 | df = pd.read_csv(DE_REG_LAW_NAMES_PATH if regulations else DE_LAW_NAMES_PATH) 79 | data = [ 80 | dict( 81 | citename=row.citename, 82 | citekey=row.citekey, 83 | start=row.filename.split("_")[2], 84 | end=os.path.splitext(row.filename)[0].split("_")[3], 85 | filename=row.filename, 86 | ) 87 | for i, row in df.iterrows() 88 | ] 89 | return data 90 | 91 | 92 | def load_law_names_compiled(regulations): 93 | with open( 94 | DE_REG_LAW_NAMES_COMPILED_PATH if regulations else DE_LAW_NAMES_COMPILED_PATH, 95 | "rb", 96 | ) as f: 97 | return pickle.load(f) 98 | 99 | 100 | def get_stemmed_law_names_for_filename(filename, law_names): 101 | date = os.path.splitext(filename)[0].split("_")[2] 102 | return get_stemmed_law_names(date, law_names) 103 | 104 | 105 | def get_stemmed_law_names(date, law_names): 106 | laws_lookup = law_names[date] 107 | 108 | # Custom law names, stemmed as key. 
109 | laws_lookup["grundgesetz"] = "GG" 110 | 111 | # Add law names without year number if key already used 112 | shortened_keys = {} 113 | for key, value in laws_lookup.items(): 114 | match = regex.fullmatch(r"(.+)\s\d{4}[\-\d]*", key) 115 | if match: 116 | if match[1] not in shortened_keys: 117 | shortened_keys[match[1]] = set() 118 | shortened_keys[match[1]].update([value]) 119 | 120 | for key, values in shortened_keys.items(): 121 | if len(values) == 1 and key not in laws_lookup.keys(): 122 | laws_lookup[key] = list(values)[0] 123 | 124 | return laws_lookup 125 | 126 | 127 | def get_snapshot_law_list(date, law_names_data): 128 | date = date.replace("-", "") 129 | law_names_list = { 130 | d["filename"] for d in law_names_data if d["start"] <= date and d["end"] >= date 131 | } 132 | assert len(law_names_list) == len({x.split("_")[0] for x in law_names_list}) 133 | return law_names_list 134 | 135 | 136 | def copy_xml_schema_to_data_folder(): 137 | ensure_exists(DATA_PATH) 138 | shutil.copyfile("xml-schema.xsd", os.path.join(DATA_PATH, "xml-schema.xsd")) 139 | shutil.copyfile("xml-styles.css", os.path.join(DATA_PATH, "xml-styles.css")) 140 | -------------------------------------------------------------------------------- /utils/simplify_gii_xml.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from bs4 import BeautifulSoup, NavigableString 4 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 5 | 6 | 7 | def simplify_gii_xml(source, destination): 8 | soup = create_soup(source) 9 | simplify(soup) 10 | save_soup(soup, destination) 11 | 12 | 13 | def remove_new_lines(tag, soup): 14 | for descendant in list(tag.descendants): 15 | if type(descendant) is NavigableString: 16 | text = str(descendant) 17 | text = re.sub(r"\s+", " ", text).strip() 18 | if str(descendant) != text: 19 | descendant.replaceWith(soup.new_string(text)) 20 | 21 | 22 | def simplify(soup: BeautifulSoup): 23 | # General 24 | for t in soup.find_all(attrs={"builddate": True}): 25 | del t.attrs["builddate"] 26 | 27 | for t in soup.find_all("FnR"): 28 | t.extract() 29 | 30 | for metadaten in soup.find_all("metadaten"): 31 | for t in metadaten.find_all("titel", recursive=False): 32 | del t.attrs["format"] 33 | for t in metadaten.find_all("enbez", recursive=False): 34 | if t.string == "(XXXX)": 35 | t.string = "XXXX" 36 | 37 | for t in soup.find_all("BR"): 38 | text = " " 39 | if type(t.previous_sibling) is NavigableString: 40 | text = t.previous_sibling.string + " " 41 | t.previous_sibling.extract() 42 | if type(t.next_sibling) is NavigableString: 43 | text += t.next_sibling.string 44 | t.next_sibling.extract() 45 | 46 | t.replaceWith(soup.new_string(text)) 47 | 48 | # Metadaten 49 | for tag_name in ["ausfertigung-datum", "fundstelle", "standangabe"]: 50 | for t in soup.metadaten.find_all(tag_name): 51 | t.extract() 52 | 53 | # Text 54 | for t in soup.find_all("SUP", attrs={"class": "Rec"}): 55 | t.replaceWith(soup.new_string(" ")) 56 | 57 | for t in soup.find_all(["DT", "DD", "entry", "LA"]): 58 | t.insert(0, soup.new_string(" ")) 59 | t.append(soup.new_string(" ")) 60 | 61 | for t in soup.find_all("P"): 62 | new_t = soup.new_tag("P") 63 | text = t.get_text() 64 | text = re.sub(r"\s+", " ", text).strip() 65 | if text: 66 | new_t.string = text 67 | t.replaceWith(new_t) 68 | 69 | for toc in soup.find_all("TOC"): 70 | text = toc.get_text(" ") 71 | text = re.sub(r"\s+", " ", text).strip() 72 | new_toc = soup.new_tag("TOC") 73 | new_toc.string = text 74 | 
toc.replaceWith(new_toc) 75 | 76 | for textdaten in soup.find_all("textdaten"): 77 | if textdaten.Footnotes: 78 | textdaten.Footnotes.extract() 79 | 80 | t = textdaten.find("text", recursive=False) 81 | if t and not t.get_text().strip(): 82 | t.extract() 83 | 84 | for t in soup.find_all("Content"): 85 | if ( 86 | type(t.next_sibling) is NavigableString 87 | and not t.next_sibling.string.strip() 88 | ): 89 | t.next_sibling.extract() 90 | 91 | for t in soup.find_all(["gliederungstitel", "titel", "langue", "kurzue"]): 92 | remove_new_lines(t, soup) 93 | for descendant in list(t.descendants): 94 | if type(descendant) is NavigableString: 95 | text = str(descendant) 96 | text = re.sub(r"\s*\*\)\s*$", "", text).strip() 97 | if str(descendant) != text: 98 | descendant.replaceWith(soup.new_string(text)) 99 | 100 | for t in soup.find_all("fussnoten"): 101 | t.extract() 102 | -------------------------------------------------------------------------------- /utils/string_list_contains.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | class StringContainsAlign: 5 | """ 6 | Maps strings of a list to strings of another list based on the condition 7 | that the string (needle) of the first list is contained in the string 8 | (haystack) in the second list. 9 | This class optimizes performance by splitting the strings to compare into 10 | tokens to preselect possible candidates that might contain the needle 11 | """ 12 | 13 | def __init__( 14 | self, 15 | text_list_0=None, 16 | text_list_1=None, 17 | sep=" ", 18 | fixed_ends=False, 19 | min_text_length=None, 20 | ): 21 | self.sep = sep 22 | self.fixed_ends = fixed_ends 23 | self.min_text_length = min_text_length 24 | self.text_list_0 = text_list_0 25 | self.text_list_1 = text_list_1 26 | self.index_0 = None 27 | self.index_1 = None 28 | 29 | def create_index(self): 30 | assert self.text_list_0 31 | assert self.text_list_1 32 | self.index_0 = self._text_list_to_token_index(self.text_list_0) 33 | self.index_1 = self._text_list_to_token_index(self.text_list_1) 34 | 35 | def clean_index(self): 36 | self.index_0 = None 37 | self.index_1 = None 38 | 39 | def _text_list_to_token_index(self, text_list): 40 | token_index = defaultdict(set) 41 | for i, text in enumerate(text_list): 42 | text_tokens = text.split(self.sep) 43 | for token in text_tokens: 44 | token_index[token].add(i) 45 | return token_index 46 | 47 | def run(self, reversed=False): 48 | text_list_needle = self.text_list_1 if reversed else self.text_list_0 49 | text_list_haystack = self.text_list_0 if reversed else self.text_list_1 50 | 51 | index_haystack = self.index_0 if reversed else self.index_1 52 | 53 | result = [] 54 | 55 | for needle_index, needle in enumerate(text_list_needle): 56 | if self.min_text_length and len(needle) < self.min_text_length: 57 | continue 58 | 59 | needle_tokens = needle.split(self.sep) 60 | 61 | if not self.fixed_ends: 62 | # Remove first and last token so that e.g. 
adding just one letter to the 63 | # last token is possible 64 | needle_tokens = needle_tokens[1:-1] 65 | 66 | if needle_tokens: 67 | candidates = index_haystack[needle_tokens[0]] 68 | for token in needle_tokens[1:]: 69 | candidates = candidates.intersection(index_haystack[token]) 70 | 71 | # For performance: stop narrowing once at most one candidate remains 72 | if len(candidates) <= 1: 73 | break 74 | else: 75 | candidates = range(len(text_list_haystack)) 76 | 77 | for haystack_index in candidates: 78 | target_text = text_list_haystack[haystack_index] 79 | if needle in target_text: 80 | result.append((needle_index, haystack_index)) 81 | 82 | return result 83 | -------------------------------------------------------------------------------- /xml-schema-decisions-de.xsd: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /xml-schema.xsd: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /xml-styles.css: -------------------------------------------------------------------------------- 1 | document { 2 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", "Helvetica Neue", Helvetica, Arial, sans-serif, 3 | "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; 4 | padding: 20px; 5 | line-height: 140%; 6 | } 7 | 8 | seqitem { 9 | margin-top: 20px; 10 | display: block; 11 | } 12 | 13 | subseqitem { 14 | display: block; 15 | } 16 | 17 | item { 18 | border-left: solid #999 2px; 19 | padding-left: 20px; 20 | margin-left: 2px; 21 | display: block; 22 | } 23 | 24 | item::before, seqitem::before, subseqitem::before { 25 | content: attr(heading); 26 | font-weight: 600; 27 | display: block; 28 | } 29 | 30 | 31 | main { 32 | color: #00a; 33 | } 34 | 35 | suffix { 36 | color: #aa0; 37 | } 38 | 39 | lawname { 40 | background-color: #f00; 41 | } 42 | 43 | lawname[type="dict"] { 44 | background-color: rgb(164, 211, 255) 45 | } 46 | 47 | lawname[type="eu"] { 48 | background: repeating-linear-gradient( 49 | 135deg, 50 | rgb(207, 229, 250), 51 | rgb(207, 229, 250) 15px, 52 | rgb(122, 187, 252) 15px, 53 | rgb(122, 187, 252) 30px 54 | ); 55 | } 56 | 57 | lawname[type="sgb"] { 58 | background: repeating-linear-gradient( 59 | 135deg, 60 | rgb(207, 229, 250), 61 | rgb(207, 229, 250) 15px, 62 | rgb(243, 231, 128) 15px, 63 | rgb(243, 231, 128) 30px 64 | ); 65 | } 66 | 67 | lawname[type="ignore"] { 68 | background: repeating-linear-gradient( 69 | 135deg, 70 | rgb(207, 229, 250), 71 | rgb(207, 229, 250) 15px, 72 |
rgb(255, 162, 134) 15px, 73 | rgb(255, 162, 134) 30px 74 | ); 75 | } 76 | 77 | 78 | reference[nomatch=""] { 79 | background-color: #f00; 80 | } 81 | 82 | lawreference { 83 | background-color: #fca0f9; 84 | } 85 | 86 | reference:before { 87 | content:"\a"; 88 | white-space: pre; 89 | } 90 | 91 | reference:after, lawreference:after { 92 | font-family: SFMono-Regular, Consolas, "Liberation Mono", Menlo, Courier, monospace; 93 | color: rgb(143, 38, 22); 94 | background-color: rgb(251, 229, 225); 95 | content: attr(parsed) ' ' attr(lawid) ' (' attr(target) ')'; 96 | } 97 | 98 | reference[target="match"]:after { 99 | color: rgb(36, 143, 22); 100 | background-color: rgb(229, 255, 226); 101 | } 102 | 103 | reference[target="skipped"]:after { 104 | color: rgb(117, 119, 19); 105 | background-color: rgb(255, 255, 230); 106 | } 107 | --------------------------------------------------------------------------------
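
Usage sketch (not a file of the repository): the two invert_dict_mapping_* helpers in utils/common.py treat non-injective mappings differently, which the short example below illustrates. It assumes the repository root is on the Python import path; the toy node-to-community mapping is invented for illustration.

# Hedged usage sketch for the dict-inversion helpers in utils/common.py.
# Assumption: executed from the repository root so that "utils" is importable.
from utils.common import invert_dict_mapping_all, invert_dict_mapping_unique

# Toy node_id -> community_id mapping; community 1 is shared by two nodes.
node_to_community = {"n1": 1, "n2": 1, "n3": 2}

# Every key is kept: community_id -> list of node_ids.
print(invert_dict_mapping_all(node_to_community))    # {1: ['n1', 'n2'], 2: ['n3']}

# Only values that occur exactly once are inverted, so community 1 is dropped.
print(invert_dict_mapping_unique(node_to_community))  # {2: 'n3'}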
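
The StringContainsAlign docstring in utils/string_list_contains.py describes token-based preselection of candidate haystack strings; the sketch below shows how the class might be used. It assumes the repository root is on the import path, and the example sentences are invented.

# Hedged usage sketch for StringContainsAlign (utils/string_list_contains.py).
from utils.string_list_contains import StringContainsAlign

needles = ["quick brown fox", "lazy dog"]
haystacks = [
    "the quick brown fox jumps",
    "over the lazy dog",
    "an unrelated sentence",
]

aligner = StringContainsAlign(text_list_0=needles, text_list_1=haystacks)
aligner.create_index()  # build token -> position indices for both lists

# Each pair (needle_index, haystack_index) states that the full needle string
# occurs as a substring of the haystack string; the inner tokens only narrow
# down which haystacks are checked.
print(aligner.run())  # [(0, 0), (1, 1)]

aligner.clean_index()  # release the token indices once they are no longer needed

With the default fixed_ends=False, the first and last token of each needle are ignored during candidate preselection, so a haystack that extends a boundary word (for example a plural form at the end) is still considered; the final substring check always uses the complete needle.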