├── .codeclimate.yml ├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── __main__.py ├── de_decisions_pipeline.py ├── de_decisions_pipeline_steps ├── a_download.py ├── b_clean.py ├── c_hierarchy.py ├── common.py ├── d_reference_areas_parse.py └── e_network.py ├── download_de_gesetze_im_internet_data.py ├── download_us_code_data.py ├── download_us_reg_data.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── statics.py ├── statutes_pipeline_steps ├── __init__.py ├── crossreference_graph.py ├── de_authority_edgelist.py ├── de_crossreference_edgelist.py ├── de_crossreference_lookup.py ├── de_law_names.py ├── de_prepare_input.py ├── de_reference_areas.py ├── de_reference_parse.py ├── de_reference_parse_vso_list.py ├── de_to_xml.py ├── hierarchy_graph.py ├── snapshot_mapping_edgelist.py ├── snapshot_mapping_index.py ├── us_authority_edgelist.py ├── us_crossreference_edgelist.py ├── us_crossreference_lookup.py ├── us_prepare_input.py ├── us_reference_areas.py ├── us_reference_parse.py ├── us_reference_reg.py ├── us_reg_prepare_input.py ├── us_reg_to_xml.py └── us_to_xml.py ├── tests ├── __init__.py ├── test_common.py ├── test_de_reference_parse.py ├── test_de_reference_parse_vso_list.py ├── test_snapshot_mapping_index.py └── test_us_reg_xml.py ├── utils ├── common.py ├── simplify_gii_xml.py └── string_list_contains.py ├── xml-schema-decisions-de.xsd ├── xml-schema.xsd └── xml-styles.css /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | plugins: 3 | pep8: 4 | enabled: true 5 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | run: 8 | env: 9 | PYTHON: '3.7' 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@master 13 | - name: Setup Python 14 | uses: actions/setup-python@master 15 | with: 16 | python-version: 3.7 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements.txt 21 | - name: Generate coverage report 22 | run: | 23 | pip install pytest 24 | pip install pytest-cov 25 | python -m pytest tests --cov=./ --cov-report=xml 26 | - name: Upload coverage to Codecov 27 | uses: codecov/codecov-action@v1 28 | with: 29 | token: ${{ secrets.CODECOV_TOKEN }} 30 | file: ./coverage.xml 31 | directory: ./coverage/reports/ 32 | flags: unittests 33 | env_vars: OS,PYTHON 34 | name: codecov-umbrella 35 | fail_ci_if_error: true 36 | path_to_write_report: ./codecov_report.gz 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Custom 132 | temp/ 133 | .idea/ 134 | venv/ 135 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | extra_standard_library = setuptools,pkg_resources 4 | known_test = pytest 5 | known_first_party = ${root_pkg} 6 | sections = FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^docs/conf.py' 2 | 3 | repos: 4 | - repo: git://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: check-added-large-files 9 | - id: check-ast 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-xml 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: end-of-file-fixer 16 | - id: requirements-txt-fixer 17 | - id: mixed-line-ending 18 | args: ['--fix=no'] 19 | 20 | 21 | - repo: http://github.com/timothycrosley/isort 22 | rev: 5.4.2 23 | hooks: 24 | - id: isort 25 | 26 | - repo: https://github.com/psf/black 27 | rev: stable 28 | hooks: 29 | - id: black 30 | language_version: python3 31 | 32 | - repo: https://gitlab.com/pycqa/flake8 33 | rev: 3.8.3 34 | hooks: 35 | - id: flake8 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2021, QuantLaw 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![codecov](https://codecov.io/gh/QuantLaw/legal-data-preprocessing/branch/master/graph/badge.svg?token=FABCUR680K)](https://codecov.io/gh/QuantLaw/legal-data-preprocessing) 2 | [![Tests](https://github.com/QuantLaw/legal-data-preprocessing/workflows/Tests/badge.svg)](https://github.com/QuantLaw/legal-data-preprocessing/actions) 3 | [![Maintainability](https://api.codeclimate.com/v1/badges/8cffa9a56ce357314456/maintainability)](https://codeclimate.com/repos/5f1bf2a3fccc45014c00c615/maintainability) 4 | [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.4070772.svg)](https://doi.org/10.5281/zenodo.4070772) 5 | 6 | # Legal Data Preprocessing 7 | 8 | This repository contains code to preprocess legal text documents. 9 | It is, inter alia, used to produce the results reported in the following publications: 10 | 11 | - Daniel Martin Katz, Corinna Coupette, Janis Beckedorf, and Dirk Hartung, Complex Societies and the Growth of the Law, *Sci. Rep.* **10** (2020), [https://doi.org/10.1038/s41598-020-73623-x](https://doi.org/10.1038/s41598-020-73623-x) 12 | - Corinna Coupette, Janis Beckedorf, Dirk Hartung, Michael Bommarito, and Daniel Martin Katz, Measuring Law Over Time, to appear (2021) 13 | 14 | Related Repositories: 15 | - [Complex Societies and the Growth of the Law](https://github.com/QuantLaw/Complex-Societies-and-Growth) ([Publication Release](https://doi.org/10.5281/zenodo.4070769)) 16 | - [Measuring Law Over Time](https://github.com/QuantLaw/Measuring-Law-Over-Time) 17 | - [Legal Data Clustering](https://github.com/QuantLaw/legal-data-clustering) ([Latest Publication Release](https://doi.org/10.5281/zenodo.4070774)) 18 | 19 | Related Data: 20 | - [Preprocessed Input Data for *Sci. Rep.* **10** (2020)](https://doi.org/10.5281/zenodo.4070767) 21 | - [Preprocessed Input Data for *Measuring Law Over Time*, to appear (2021)](https://doi.org/10.5281/zenodo.4660133) 22 | 23 | ## Setup 24 | 25 | 1. It is assumed that you have Python 3.7 installed. (Other versions are not tested.) 26 | 2. 
Set up a virtual environment and activate it. (This is not required but recommended.) 27 | 3. Install the required packages `pip install -r requirements.txt`. 28 | 29 | ## Getting Started 30 | 31 | Make sure the following folders do not exist next to the root folder of this repository: 32 | - `legal-networks-data` 33 | - `gesetze-im-internet` 34 | 35 | Download and prepare the data for the United States (US) and Germany. (See the respective "1. Data input" 36 | sections below.) Afterwards, you can run the pipeline. 37 | 38 | For the US statutory data: 39 | 40 | 1. Download the data: `python download_us_code_data.py` 41 | 2. Run all steps of the pipeline: `python . us all` 42 | 43 | For the US statutory & regulatory data: 44 | 45 | 1. Download statutory data: `python download_us_code_data.py` 46 | 2. Download regulatory data: `python download_us_reg_data.py` 47 | 3. Run all steps of the pipeline: `python . us all` and `python . us all -r` 48 | 49 | 50 | For the German statutory data, using a *juris* export: 51 | 52 | 1. Prepare the data (as shown in a separate repository) 53 | 2. Run all steps of the pipeline: `python . de all` 54 | 55 | For the German statutory & regulatory data, using a *juris* export: 56 | 57 | 1. Prepare the data (as shown in a separate repository) 58 | 2. Run all steps of the pipeline: `python . de all -r` 59 | 60 | For the German statutory data, using Gesetze im Internet (GII): 61 | 62 | 1. Prepare the data: `python download_de_gesetze_im_internet_data.py --dates 2019-06-10 2020-01-18`. 63 | You need to specify the dates you want to analyze. 64 | 2. Run all steps of the pipeline except for `prepare_input` for the specified dates: 65 | `python . de xml law_names reference_areas reference_parse hierarchy_graph crossreference_lookup crossreference_edgelist crossreference_graph snapshot_mapping_edgelist --snapshots 2019-06-10 2020-01-18` 66 | 67 | If you need to reduce memory usage, you can deactivate multiprocessing with the argument `--single-process`. 68 | 69 | To download and prepare German judicial decision data from https://www.rechtsprechung-im-internet.de, 70 | run `python de_decisions_pipeline.py all`. 71 | 72 | 73 | ## Statutes 74 | 75 | US and German federal statutes and regulations are converted from official sources (or *juris*) 76 | to multiple clean formats focussing on the structure of the law. 77 | 78 | Output formats are: 79 | 80 | - XML files containing the text, the hierarchical structure of the law, and cross-references. 81 | - Gpickle files for each Title/Gesetz/Rechtsverodnung and version containing the hierarchical structure of the statutes. 82 | - Gpickle files for each snapshot (year in the US or date in Germany) containing the hierarchical structure of the statutes 83 | and the cross-references between different elements of the statutes with reduced granularity and corresponding nodelists 84 | and edgelists. 85 | - Snapshot mapping edgelists: These lists map elements of a network at one snapshot 86 | to a snapshot at another time. They encode, e.g., where a clause of the US Code in 2010 is 87 | located in the US Code of 2011. This mapping is derived from the text and the structure 88 | of the statutes. 
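
The gpickle outputs listed above can be opened directly with networkx (pinned to 2.4 in `requirements.txt`). A minimal sketch for inspecting one of the cross-reference graphs; the file name below is only an illustration, as actual names depend on dataset, snapshot, and granularity:

```python
import networkx as nx

# Illustrative path only; actual file names depend on dataset, snapshot and granularity.
path = "../legal-networks-data/us/4_crossreference_graph/seqitems/2010.gpickle.gz"

# networkx.read_gpickle transparently decompresses .gz files
G = nx.read_gpickle(path)
print(G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# Print a few edges with their attributes to see how containment and
# cross-reference edges are annotated in the data.
for u, v, data in list(G.edges(data=True))[:5]:
    print(u, v, data)
```
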
89 | 90 | The steps of the pipeline are: 91 | 92 | - `prepare_input` 93 | - `xml` 94 | - `law_names` (only for German pipeline) 95 | - `reference_areas` 96 | - `reference_parse` 97 | - `hierarchy_graph` 98 | - `crossreference_lookup` 99 | - `crossreference_edgelist` 100 | - `crossreference_graph` 101 | - `snapshot_mapping_index` 102 | - `snapshot_mapping_edgelist` 103 | 104 | 105 | ### US 106 | 107 | The processing for the US Code is executed in multiple steps: 108 | 109 | 110 | #### 1. Data Input 111 | 112 | Inputs are ZIP files downloaded from the US House of Representatives Office of the Law 113 | Revision Counsel and U.S. Government Publishing Office. We use annual versions in XHTML format that are available on 114 | https://uscode.house.gov/download/annualhistoricalarchives/downloadxhtml.shtml and 115 | https://www.govinfo.gov/bulkdata/CFR. 116 | Files should be located at regarding the statutes `../legal-networks-data/us/1_input` and regarding the regulations `../legal-networks-data/us_reg/1_input`. 117 | This folder should contain unzipped yearly folders. 118 | 119 | You can automatically obtain the required data by running `download_us_code_data.py` and `download_us_reg_data.py`. 120 | 121 | 122 | #### 2. XML Files 123 | 124 | - Files containing titles of the US Code are copied to `temp/us/11_htm`. 125 | Appendices and Stylesheets are filtered. (Result of step: `prepare_input`) 126 | - Simple XML files focusing on the structure are generated from the XHTML files. 127 | Results can be found in `temp/us/12_xml`. (Result of step: `xml`) 128 | - Text segments containing a cross-reference are annotated in the XML files. Results are saved to 129 | `temp/us/13_reference_areas`. (Result of step: `reference_areas`) 130 | - The contents of the annotated cross-references are extracted and added to the XML. 131 | 132 | The results of the XML generation are saved to `../legal-networks-data/us/2_xml`. (Result of step: `reference_parse`) 133 | 134 | CFR data is located at `us_reg` folders next to the `us` folder. 135 | 136 | 137 | #### 3. Hierarchy Graphs 138 | 139 | Graphs containing the hierarchical structure of the statutes are saved to `../legal-networks-data/us/3_hierarchy_graph` 140 | in separate files for each Title and annual version. (Result of step: `hierarchy_graph`) 141 | 142 | CFR data is located at `us_reg` folders next to the `us` folder. 143 | 144 | 145 | #### 4. Crossreference Graphs 146 | 147 | - A list of all sections in the US Code at a specific point in time is generated to obtain a list of possible 148 | destinations of cross-references. This is a preparation step for drawing edges from the cross-reference source to the cross-reference destination. The lists are stored at `temp/us/31_crossreference_lookup`. 149 | (Result of step: `crossreference_lookup`) 150 | - Lists of all cross-references are generated. They contain the ID of the referencing and the referenced element. 151 | The lists are located at `temp/us/32_crossreference_edgelist`. 152 | (Result of step: `crossreference_edgelist`) 153 | - Hierarchy graphs of the individual Titles are combined and edges for cross-references are added within and between 154 | Titles. 155 | 156 | Each annual version of the US Code is stored at `../legal-networks-data/us/4_crossreference_graph` in three files as a nodeslist, an edgelist and networkx graph stored as gpickle.gz-file in the subfolder `seqitems`. The node-list contains all nodes, whereas subseqitems are excluded in the networkx file. 
157 | (Result of step: `crossreference_graph`) 158 | 159 | The combined data regarding the US Code and the CFR is located at `us_reg` folders next to the `us` folder. 160 | 161 | 162 | #### 5. Snapshot Mapping Edgelists 163 | 164 | Snapshot mapping edgelists are stored at `../legal-networks-data/us/5_snapshot_mapping_edgelist` and ``../legal-networks-data/us_reg/5_snapshot_mapping_edgelist``. 165 | 166 | 167 | #### Germany 168 | 169 | #### 1. Data Input 170 | 171 | Inputs are XML files in a format simplified from that of documents available from GII. 172 | These files can be generated from two sources: 173 | 174 | 1. XML files provided by GII. To obtain older versions of this website 175 | use our public archive at https://github.com/legal-networks/gesetze-im-internet. 176 | Downloaded files must be simplified before they are suitable input. 177 | Use `download_de_gesetze_im_internet_data.py` to download, simplify and rename the source files. 178 | This replaces step `prepare_input` in the pipeline. 179 | (Make sure that you do not run this step. It is not possible to run `all` steps.) 180 | 2. An export from the *juris* database can be used to obtain the data. 181 | Whereas this datasource covers a longer time period, we cannot make it publicly available due to licensing restrictions. 182 | 183 | #### 2. XML Files 184 | 185 | - Files in the simplified format of Gesetze im Internet are generated and saved to `temp/de/11_gii_xml` 186 | (Result of step: `prepare_input` or `download_de_gesetze_im_internet_data.py`) 187 | - Simple XML files focusing on the structure are generated from the original XML files. 188 | Results can be found in `temp/de/12_xml`. (Result of step: `xml`) 189 | - A list of the names of all statutes (Gesetze) is saved to 190 | `temp/de/12_xml_law_names.csv` with a mapping to the corresponding files. 191 | This is used to extract cross-references, as statutes are typically referenced by their name. 192 | Names are saved in a stemmed version. (Result of step: `law_names`) 193 | 194 | Furthermore, `temp/de/12_xml_law_names_compiled.pickle` is generated. 195 | It contains the same information as `12_xml_law_names.csv`, 196 | but is optimized to obtain the stemmed names of all valid laws at specific dates. (Result of step: `law_names`) 197 | - Text segments containing a cross-reference are annotated in the XML files. Results are saved to 198 | `temp/de/13_reference_areas`. (Result of step: `reference_areas`) 199 | - The contents of the annotated cross-references are extracted and added to the XML. 200 | 201 | The results of the XML generation are saved to `../legal-networks-data/de/2_xml`. (Result of step: `reference_parse`) 202 | 203 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 204 | 205 | #### 3. Hierarchy Graphs 206 | 207 | Hierarchy Graphs are saved to `../legal-networks-data/de/3_hierarchy_graph`. 208 | See the documentation regarding the US hierarchy graphs for further information. 209 | 210 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 211 | 212 | #### 4. Cross-Reference Graphs 213 | 214 | In general cross-reference graphs are generated in the same manner as for the US dataset 215 | (see above for further information). 216 | (Interim) results are saved to 217 | `temp/us/31_crossreference_lookup`, 218 | `temp/us/32_crossreference_edgelist`, and 219 | `../legal-networks-data/us/4_crossreference_graph`, respectively. 
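
To sanity-check the intermediate outputs before the graphs are assembled, the edgelists written by the `crossreference_edgelist` step can be loaded with pandas. A minimal sketch, assuming a CSV edgelist named after the snapshot date; the path and column layout are assumptions for illustration, so inspect the files the step actually writes to confirm the format:

```python
import pandas as pd

# Path and file name are assumptions for illustration only; check the output
# of the crossreference_edgelist step for the actual location and layout.
edgelist_path = "temp/de/32_crossreference_edgelist/2020-01-01.csv"

edges = pd.read_csv(edgelist_path)
print(len(edges), "cross-reference edges in this snapshot")
print(edges.head())
```
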
220 | 221 | A major difference are the possible dates for which to create cross-reference graphs. 222 | For the US, only annual version are available. 223 | The *juris* export allows one to select any day to create a snapshot. 224 | If you rely on https://github.com/legal-networks/gesetze-im-internet as a data source, you can only select days 225 | for which a snapshot was created. 226 | 227 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 228 | 229 | #### 5. Snapshot Mapping Edgelists 230 | 231 | Snapshot mapping edgelists are stored at `../legal-networks-data/de/5_snapshot_mapping_edgelist`. 232 | 233 | The combined data of statutes and regulations is located at `de_reg` folders next to the `de` folder. 234 | -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | 5 | from statics import ( 6 | ALL_YEARS, 7 | ALL_YEARS_REG, 8 | DE_CROSSREFERENCE_EDGELIST_PATH, 9 | DE_CROSSREFERENCE_GRAPH_PATH, 10 | DE_HIERARCHY_GRAPH_PATH, 11 | DE_REFERENCE_PARSED_PATH, 12 | DE_REG_AUTHORITY_EDGELIST_PATH, 13 | DE_REG_CROSSREFERENCE_EDGELIST_PATH, 14 | DE_REG_CROSSREFERENCE_GRAPH_PATH, 15 | DE_REG_HIERARCHY_GRAPH_PATH, 16 | DE_REG_REFERENCE_PARSED_PATH, 17 | DE_REG_SNAPSHOT_MAPPING_EDGELIST_PATH, 18 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH, 19 | DE_SNAPSHOT_MAPPING_EDGELIST_PATH, 20 | DE_SNAPSHOT_MAPPING_INDEX_PATH, 21 | US_CROSSREFERENCE_EDGELIST_PATH, 22 | US_CROSSREFERENCE_GRAPH_PATH, 23 | US_HIERARCHY_GRAPH_PATH, 24 | US_REFERENCE_PARSED_PATH, 25 | US_REG_AUTHORITY_EDGELIST_PATH, 26 | US_REG_CROSSREFERENCE_EDGELIST_PATH, 27 | US_REG_CROSSREFERENCE_GRAPH_PATH, 28 | US_REG_HIERARCHY_GRAPH_PATH, 29 | US_REG_REFERENCE_PARSED_PATH, 30 | US_REG_SNAPSHOT_MAPPING_EDGELIST_PATH, 31 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH, 32 | US_SNAPSHOT_MAPPING_EDGELIST_PATH, 33 | US_SNAPSHOT_MAPPING_INDEX_PATH, 34 | ) 35 | from statutes_pipeline_steps.crossreference_graph import CrossreferenceGraphStep 36 | from statutes_pipeline_steps.de_authority_edgelist import DeAuthorityEdgelist 37 | from statutes_pipeline_steps.de_crossreference_edgelist import DeCrossreferenceEdgelist 38 | from statutes_pipeline_steps.de_crossreference_lookup import DeCrossreferenceLookup 39 | from statutes_pipeline_steps.de_law_names import DeLawNamesStep 40 | from statutes_pipeline_steps.de_prepare_input import de_prepare_input 41 | from statutes_pipeline_steps.de_reference_areas import DeReferenceAreasStep 42 | from statutes_pipeline_steps.de_reference_parse import DeReferenceParseStep 43 | from statutes_pipeline_steps.de_to_xml import DeToXmlStep, get_type_for_doknr_dict 44 | from statutes_pipeline_steps.hierarchy_graph import HierarchyGraphStep 45 | from statutes_pipeline_steps.snapshot_mapping_edgelist import ( 46 | SnapshotMappingEdgelistStep, 47 | ) 48 | from statutes_pipeline_steps.snapshot_mapping_index import SnapshotMappingIndexStep 49 | from statutes_pipeline_steps.us_authority_edgelist import UsAuthorityEdgelist 50 | from statutes_pipeline_steps.us_crossreference_edgelist import UsCrossreferenceEdgelist 51 | from statutes_pipeline_steps.us_crossreference_lookup import UsCrossreferenceLookup 52 | from statutes_pipeline_steps.us_prepare_input import us_prepare_input 53 | from statutes_pipeline_steps.us_reference_areas import UsReferenceAreasStep 54 | from statutes_pipeline_steps.us_reference_parse import 
UsReferenceParseStep 55 | from statutes_pipeline_steps.us_reg_prepare_input import us_reg_prepare_input 56 | from statutes_pipeline_steps.us_reg_to_xml import UsRegsToXmlStep 57 | from statutes_pipeline_steps.us_to_xml import UsToXmlStep 58 | from utils.common import load_law_names, load_law_names_compiled, str_to_bool 59 | 60 | 61 | def get_subseqitem_conf(subseqitems): 62 | if subseqitems is None: 63 | return False, True 64 | elif subseqitems is True: 65 | return (True,) 66 | elif subseqitems is False: 67 | return (False,) 68 | 69 | 70 | ALL_STEPS = [ 71 | "prepare_input", 72 | "xml", 73 | "law_names", # DE only 74 | "reference_areas", 75 | "reference_parse", 76 | "hierarchy_graph", 77 | "crossreference_lookup", 78 | "crossreference_edgelist", 79 | "authority_edgelist", 80 | "crossreference_graph", 81 | # creates edgelist to map nodes between snapshots for DYNAMIC graph 82 | "snapshot_mapping_index", 83 | "snapshot_mapping_edgelist", 84 | ] 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("dataset", help="select a dataset: DE or US") 89 | parser.add_argument("steps", nargs="+", help="select a step to perform by name") 90 | parser.add_argument("--filter", nargs="*", help="filter for specific files") 91 | parser.add_argument( 92 | "--single-process", 93 | dest="use_multiprocessing", 94 | action="store_const", 95 | const=False, 96 | default=True, 97 | help="prevent multiprocessing", 98 | ) 99 | parser.add_argument( 100 | "--overwrite", 101 | dest="overwrite", 102 | action="store_const", 103 | const=True, 104 | default=False, 105 | help="overwrite files", 106 | ) 107 | parser.add_argument( 108 | "--subseqitems", 109 | dest="subseqitems", 110 | nargs="?", 111 | const=True, 112 | type=str_to_bool, 113 | default=None, 114 | help="include subseqitems in graphs", 115 | ) 116 | parser.add_argument( 117 | "--snapshots", 118 | dest="snapshots", 119 | nargs="*", 120 | type=str, 121 | default=["all"], 122 | help=( 123 | "snapshots for crossreferences. Eg. 2010-01-01 for de dataset or 2010 for " 124 | "us dataset. To run on whole research window: all or all-new-years" 125 | ), 126 | ) 127 | parser.add_argument( 128 | "--interval", 129 | dest="interval", 130 | nargs="?", 131 | type=int, 132 | default=1, 133 | help=( 134 | "Only for snapshot_mapping_edgelist. Interval for mapped snapshots. " 135 | "Default 1 (snapshot)" 136 | ), 137 | ) 138 | 139 | parser.add_argument( 140 | "-r", 141 | "--regulations", 142 | dest="regulations", 143 | action="store_const", 144 | const=True, 145 | default=False, 146 | help="Include regulations", 147 | ) 148 | 149 | parser.add_argument( 150 | "-dc", 151 | "--detailed-crossreferences", 152 | dest="detailed_crossreferences", 153 | action="store_const", 154 | const=True, 155 | default=False, 156 | help="Resolve cross references on the lowest possible level. " 157 | "Default is to resolve on seqitem level (e.g. sections).", 158 | ) 159 | args = parser.parse_args() 160 | 161 | steps = [step.lower() for step in args.steps] 162 | dataset = args.dataset.lower() 163 | use_multiprocessing = args.use_multiprocessing 164 | processes = None if args.use_multiprocessing else 1 165 | overwrite = args.overwrite 166 | snapshots = args.snapshots 167 | interval = args.interval 168 | selected_items = args.filter or [] 169 | regulations = args.regulations 170 | detailed_crossreferences = args.detailed_crossreferences 171 | 172 | if dataset not in ["de", "us"]: 173 | raise Exception(f"{dataset} unsupported dataset. 
Options: us, de") 174 | 175 | if "all" in snapshots or "all-new-years" in snapshots: 176 | years = ALL_YEARS_REG if regulations else ALL_YEARS 177 | if dataset == "us": 178 | snapshots = [f"{year}" for year in years] 179 | elif dataset == "de": 180 | snapshots = [ 181 | f"{year}-12-31" if "all" in snapshots else f"{year}-01-01" 182 | for year in years 183 | ] 184 | 185 | if "all" in steps: 186 | steps = ALL_STEPS 187 | else: 188 | unknown_steps = [s for s in steps if s not in ALL_STEPS] 189 | assert not unknown_steps, unknown_steps 190 | 191 | if ( 192 | "crossreference_lookup" in steps 193 | or "crossreference_edgelist" in steps 194 | or "crossreference_graph" in steps 195 | ): 196 | if dataset == "de" or snapshots: 197 | for snapshot in snapshots: 198 | if not re.fullmatch(r"\d{4}(-\d{2}-\d{2})?", snapshot): 199 | raise Exception( 200 | "Add --snapshots as argument. " 201 | "E.g. for de --snapshots 2012-01-31 2013-01-31 or for us " 202 | "--snapshot 2001" 203 | ) 204 | 205 | if detailed_crossreferences and regulations: 206 | raise Exception( 207 | "Combining detailed cross-references and regulations is not tested." 208 | ) 209 | 210 | if "prepare_input" in steps: 211 | if dataset == "us": 212 | if regulations: 213 | us_reg_prepare_input() 214 | else: 215 | us_prepare_input() 216 | elif dataset == "de": 217 | de_prepare_input(regulations) 218 | print("Filter input: done") 219 | 220 | if "xml" in steps: 221 | if dataset == "us": 222 | if regulations: 223 | step = UsRegsToXmlStep(processes) 224 | else: 225 | step = UsToXmlStep(processes) 226 | items = step.get_items(overwrite) 227 | step.execute_filtered_items(items, selected_items) 228 | elif dataset == "de": 229 | dok_type_dict = get_type_for_doknr_dict() 230 | step = DeToXmlStep( 231 | regulations=regulations, 232 | processes=processes, 233 | dok_type_dict=dok_type_dict, 234 | ) 235 | items = step.get_items(overwrite) 236 | step.execute_filtered_items(items, selected_items) 237 | print("Convert to xml: done") 238 | 239 | if "law_names" in steps: 240 | if dataset == "de": 241 | step = DeLawNamesStep(regulations=regulations, processes=processes) 242 | items = step.get_items() 243 | step.execute_items(items) 244 | print("Law names: done") 245 | 246 | if "reference_areas" in steps: 247 | if dataset == "us": 248 | step = UsReferenceAreasStep(regulations=regulations, processes=processes) 249 | items = step.get_items(overwrite) 250 | step.execute_filtered_items(items) 251 | 252 | elif dataset == "de": 253 | law_names = load_law_names_compiled(regulations) 254 | step = DeReferenceAreasStep( 255 | law_names=law_names, regulations=regulations, processes=processes 256 | ) 257 | items = step.get_items(overwrite) 258 | step.execute_filtered_items(items) 259 | 260 | print("Extract reference areas: done") 261 | 262 | if "reference_parse" in steps: 263 | if dataset == "us": 264 | step = UsReferenceParseStep(regulations=regulations, processes=processes) 265 | items = step.get_items(overwrite) 266 | step.execute_filtered_items(items) 267 | if dataset == "de": 268 | law_names = load_law_names_compiled(regulations) 269 | step = DeReferenceParseStep( 270 | law_names=law_names, regulations=regulations, processes=processes 271 | ) 272 | items = step.get_items(overwrite) 273 | step.execute_filtered_items(items) 274 | 275 | print("Parse references: done") 276 | 277 | if "hierarchy_graph" in steps: 278 | # for subseqitems_conf in get_subseqitem_conf(args.subseqitems): 279 | for subseqitems_conf in [True]: 280 | if dataset == "us": 281 | source = ( 282 | 
US_REG_REFERENCE_PARSED_PATH 283 | if regulations 284 | else US_REFERENCE_PARSED_PATH 285 | ) 286 | destination = os.path.join( 287 | US_REG_HIERARCHY_GRAPH_PATH 288 | if regulations 289 | else US_HIERARCHY_GRAPH_PATH, 290 | "subseqitems" if subseqitems_conf else "seqitems", 291 | ) 292 | elif dataset == "de": 293 | source = ( 294 | DE_REG_REFERENCE_PARSED_PATH 295 | if regulations 296 | else DE_REFERENCE_PARSED_PATH 297 | ) 298 | destination = ( 299 | ( 300 | DE_REG_HIERARCHY_GRAPH_PATH 301 | if regulations 302 | else DE_HIERARCHY_GRAPH_PATH 303 | ) 304 | + "/" 305 | + ("subseqitems" if subseqitems_conf else "seqitems") 306 | ) 307 | 308 | step = HierarchyGraphStep( 309 | source=source, 310 | destination=destination, 311 | add_subseqitems=subseqitems_conf, 312 | processes=processes, 313 | ) 314 | items = step.get_items(overwrite) 315 | step.execute_filtered_items(items) 316 | print("Make hierarchy graphs: done") 317 | 318 | if "crossreference_lookup" in steps: 319 | if dataset == "us": 320 | step = UsCrossreferenceLookup( 321 | detailed_crossreferences=detailed_crossreferences, 322 | regulations=regulations, 323 | processes=processes, 324 | ) 325 | items = step.get_items(overwrite, snapshots) 326 | step.execute_items(items) 327 | 328 | elif dataset == "de": 329 | assert not detailed_crossreferences 330 | step = DeCrossreferenceLookup(regulations=regulations, processes=processes) 331 | items = step.get_items(snapshots) 332 | step.execute_items(items) 333 | 334 | print("Create crossreference lookup: done") 335 | 336 | if "crossreference_edgelist" in steps: 337 | if dataset == "us": 338 | step = UsCrossreferenceEdgelist( 339 | detailed_crossreferences=detailed_crossreferences, 340 | regulations=regulations, 341 | processes=processes, 342 | ) 343 | items = step.get_items(overwrite, snapshots) 344 | step.execute_items(items) 345 | 346 | elif dataset == "de": 347 | assert not detailed_crossreferences 348 | law_names_data = load_law_names(regulations) 349 | step = DeCrossreferenceEdgelist( 350 | regulations=regulations, 351 | law_names_data=law_names_data, 352 | processes=processes, 353 | ) 354 | items = step.get_items(overwrite, snapshots) 355 | step.execute_items(items) 356 | 357 | print("Create crossreference edgelist: done") 358 | 359 | if "authority_edgelist" in steps: 360 | if dataset == "de" and regulations: 361 | law_names_data = load_law_names(regulations) 362 | step = DeAuthorityEdgelist( 363 | law_names_data=law_names_data, processes=processes 364 | ) 365 | items = step.get_items(overwrite, snapshots) 366 | step.execute_items(items) 367 | elif dataset == "us" and regulations: 368 | assert not detailed_crossreferences 369 | step = UsAuthorityEdgelist( 370 | detailed_crossreferences=detailed_crossreferences, 371 | processes=processes, 372 | regulations=regulations, 373 | ) 374 | items = step.get_items(overwrite, snapshots) 375 | step.execute_items(items) 376 | print("Create authority edgelist: done") 377 | 378 | if "crossreference_graph" in steps: 379 | if dataset == "us": 380 | source = US_HIERARCHY_GRAPH_PATH 381 | source_regulation = US_REG_HIERARCHY_GRAPH_PATH 382 | destination = ( 383 | US_REG_CROSSREFERENCE_GRAPH_PATH 384 | if regulations 385 | else US_CROSSREFERENCE_GRAPH_PATH 386 | ) + ("/detailed" if detailed_crossreferences else "") 387 | edgelist_folder = ( 388 | US_REG_CROSSREFERENCE_EDGELIST_PATH 389 | if regulations 390 | else US_CROSSREFERENCE_EDGELIST_PATH 391 | ) + ("/detailed" if detailed_crossreferences else "") 392 | authority_edgelist_folder = 
US_REG_AUTHORITY_EDGELIST_PATH 393 | elif dataset == "de": 394 | assert not detailed_crossreferences 395 | source = ( 396 | DE_REG_HIERARCHY_GRAPH_PATH if regulations else DE_HIERARCHY_GRAPH_PATH 397 | ) 398 | source_regulation = None 399 | destination = ( 400 | DE_REG_CROSSREFERENCE_GRAPH_PATH 401 | if regulations 402 | else DE_CROSSREFERENCE_GRAPH_PATH 403 | ) 404 | edgelist_folder = ( 405 | DE_REG_CROSSREFERENCE_EDGELIST_PATH 406 | if regulations 407 | else DE_CROSSREFERENCE_EDGELIST_PATH 408 | ) 409 | authority_edgelist_folder = DE_REG_AUTHORITY_EDGELIST_PATH 410 | 411 | step = CrossreferenceGraphStep( 412 | regulations=regulations, 413 | source=source, 414 | source_regulation=source_regulation, 415 | destination=destination, 416 | edgelist_folder=edgelist_folder, 417 | dataset=dataset, 418 | authority_edgelist_folder=authority_edgelist_folder, 419 | processes=processes, 420 | ) 421 | items = step.get_items(overwrite, snapshots) 422 | step.execute_items(items) 423 | 424 | print("Make crossreference graph: done") 425 | 426 | if "snapshot_mapping_index" in steps: 427 | assert not detailed_crossreferences 428 | if dataset == "us": 429 | source_text = ( 430 | [US_REFERENCE_PARSED_PATH, US_REG_REFERENCE_PARSED_PATH] 431 | if regulations 432 | else US_REFERENCE_PARSED_PATH 433 | ) 434 | destination = os.path.join( 435 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH 436 | if regulations 437 | else US_SNAPSHOT_MAPPING_INDEX_PATH, 438 | "subseqitems", 439 | ) 440 | law_names_data = None 441 | elif dataset == "de": 442 | source_text = ( 443 | DE_REG_REFERENCE_PARSED_PATH 444 | if regulations 445 | else DE_REFERENCE_PARSED_PATH 446 | ) 447 | destination = os.path.join( 448 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH 449 | if regulations 450 | else DE_SNAPSHOT_MAPPING_INDEX_PATH, 451 | "subseqitems", 452 | ) 453 | law_names_data = load_law_names(regulations) 454 | 455 | step = SnapshotMappingIndexStep( 456 | source_text, 457 | destination, 458 | dataset, 459 | law_names_data, 460 | processes=processes, 461 | ) 462 | items = step.get_items(overwrite, snapshots) 463 | step.execute_items(items) 464 | 465 | print("Make snapshot mapping: done") 466 | 467 | if "snapshot_mapping_edgelist" in steps: 468 | assert not detailed_crossreferences 469 | if dataset == "us": 470 | source = os.path.join( 471 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH 472 | if regulations 473 | else US_SNAPSHOT_MAPPING_INDEX_PATH, 474 | "subseqitems", 475 | ) 476 | destination = os.path.join( 477 | US_REG_SNAPSHOT_MAPPING_EDGELIST_PATH 478 | if regulations 479 | else US_SNAPSHOT_MAPPING_EDGELIST_PATH, 480 | "subseqitems", 481 | ) 482 | elif dataset == "de": 483 | source = os.path.join( 484 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH 485 | if regulations 486 | else DE_SNAPSHOT_MAPPING_INDEX_PATH, 487 | "subseqitems", 488 | ) 489 | destination = os.path.join( 490 | DE_REG_SNAPSHOT_MAPPING_EDGELIST_PATH 491 | if regulations 492 | else DE_SNAPSHOT_MAPPING_EDGELIST_PATH, 493 | "subseqitems", 494 | ) 495 | 496 | step = SnapshotMappingEdgelistStep( 497 | source, 498 | destination, 499 | interval, 500 | dataset, 501 | processes=processes, 502 | ) 503 | items = step.get_items(overwrite, snapshots) 504 | step.execute_items(items) 505 | 506 | print("Make snapshot mapping: done") 507 | -------------------------------------------------------------------------------- /de_decisions_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from de_decisions_pipeline_steps.a_download import download 4 | from 
de_decisions_pipeline_steps.b_clean import clean 5 | from de_decisions_pipeline_steps.c_hierarchy import hierarchy 6 | from de_decisions_pipeline_steps.d_reference_areas_parse import reference_parse_areas 7 | from de_decisions_pipeline_steps.e_network import network 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument( 12 | "steps", nargs="*", default=["all"], help="select a step to perform by name" 13 | ) 14 | args = parser.parse_args() 15 | 16 | if args.steps == ["all"]: 17 | steps = ["download", "clean", "hierarchy", "references", "network"] 18 | else: 19 | steps = args.steps 20 | 21 | if "download" in steps: 22 | download() 23 | 24 | if "clean" in steps: 25 | clean() 26 | 27 | if "hierarchy" in steps: 28 | hierarchy() 29 | 30 | if "references" in steps: 31 | reference_parse_areas(regulations=False) 32 | 33 | if "network" in steps: 34 | network() 35 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/a_download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | from multiprocessing.pool import Pool 4 | 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from quantlaw.utils.files import ensure_exists 8 | from requests.adapters import HTTPAdapter 9 | from urllib3.util.retry import Retry 10 | 11 | from statics import ( 12 | DE_DECISIONS_DOWNLOAD_TOC, 13 | DE_DECISIONS_DOWNLOAD_XML, 14 | DE_DECISIONS_DOWNLOAD_ZIP, 15 | DE_DECISIONS_TEMP_DATA_PATH, 16 | ) 17 | 18 | 19 | def download_item(link_text): 20 | s = requests.Session() 21 | retries = Retry( 22 | total=10, 23 | backoff_factor=2, 24 | ) 25 | s.mount("https://", HTTPAdapter(max_retries=retries)) 26 | 27 | filename = link_text.split("/")[-1] 28 | if not os.path.isfile(f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}"): 29 | content = s.get(link_text).content 30 | with open(f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}", "wb") as f: 31 | f.write(content) 32 | 33 | 34 | def download(): 35 | ensure_exists(DE_DECISIONS_TEMP_DATA_PATH) 36 | toc = requests.get("https://www.rechtsprechung-im-internet.de/rii-toc.xml").text 37 | with open(DE_DECISIONS_DOWNLOAD_TOC, "w") as f: 38 | f.write(toc) 39 | 40 | with open(DE_DECISIONS_DOWNLOAD_TOC) as f: 41 | toc = f.read() 42 | soup = BeautifulSoup(toc, "lxml-xml") 43 | len(soup.findAll("item")) 44 | 45 | ensure_exists(DE_DECISIONS_DOWNLOAD_ZIP) 46 | items = [i.link.text for i in soup.findAll("item")] 47 | with Pool(4) as p: 48 | p.map(download_item, items) 49 | 50 | ensure_exists(DE_DECISIONS_DOWNLOAD_XML) 51 | 52 | i = 0 53 | for filename in os.listdir(DE_DECISIONS_DOWNLOAD_ZIP): 54 | if os.path.splitext(filename)[1] == ".zip": 55 | zip_ref = zipfile.ZipFile(f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}", "r") 56 | zip_ref.extractall(DE_DECISIONS_DOWNLOAD_XML) 57 | zip_ref.close() 58 | i += 1 59 | print(f"\r{i} entpackt", end="") 60 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/b_clean.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | from quantlaw.utils.beautiful_soup import save_soup 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | 8 | from de_decisions_pipeline_steps.common import get_docparts_with_p 9 | from statics import DE_DECISIONS_DOWNLOAD_XML, DE_DECISIONS_XML 10 | 11 | 12 | def clean_abs(section_tag): 13 | contents = [] 14 | 15 
| for dl in section_tag.findAll("dl"): 16 | number = dl.dt.get_text(" ").strip() 17 | number = number if len(number) else None 18 | text = dl.dd.get_text(" ").strip() 19 | indented = bool(dl.dd.p and "margin-left" in dl.dd.p.attrs.get("style", "")) 20 | if len(text): 21 | contents.append( 22 | dict( 23 | number=number, 24 | content=text, 25 | indented=indented, 26 | ) 27 | ) 28 | 29 | return contents 30 | 31 | 32 | def replace_tag_with_content(tag, contents, soup): 33 | for children in tag.contents: 34 | if type(children) is Tag: 35 | children.decompose() 36 | tag.contents = [] 37 | for content_dict in contents: 38 | p_tag = soup.new_tag("p") 39 | if content_dict["indented"]: 40 | p_tag["indented"] = str(True) 41 | if content_dict["number"]: 42 | p_tag["numbers"] = content_dict["number"] 43 | p_tag.append(soup.new_string(content_dict["content"])) 44 | tag.append(p_tag) 45 | 46 | 47 | def fix_data(decision, text): 48 | if "JURE149015016" in decision: 49 | text = text.replace("Art.l ", "Art. I ") 50 | return text 51 | 52 | 53 | def clean_decision(decision): 54 | if not os.path.exists(f"{DE_DECISIONS_XML}/{decision}"): 55 | with open(f"{DE_DECISIONS_DOWNLOAD_XML}/{decision}", encoding="utf8") as f: 56 | content = f.read() 57 | content = content.replace("\xa0", " ") 58 | soup = BeautifulSoup(content, "lxml-xml") 59 | for doc_parts in get_docparts_with_p(soup): 60 | contents = clean_abs(doc_parts) 61 | replace_tag_with_content(doc_parts, contents, soup) 62 | 63 | soup_str = fix_data(decision, str(soup)) 64 | save_soup(soup_str, f"{DE_DECISIONS_XML}/{decision}") 65 | 66 | 67 | def clean(): 68 | ensure_exists(DE_DECISIONS_XML) 69 | decisions = list_dir(DE_DECISIONS_DOWNLOAD_XML, ".xml") 70 | with multiprocessing.Pool() as p: 71 | p.map(clean_decision, decisions) 72 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/c_hierarchy.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import re 4 | 5 | from bs4 import BeautifulSoup 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | 8 | from de_decisions_pipeline_steps.common import get_docparts_with_p 9 | from statics import DE_DECISIONS_HIERARCHY, DE_DECISIONS_XML 10 | 11 | 12 | def extract_number(text, token_position=0): 13 | if len(text) == 0 or len(text.split()) <= token_position: 14 | return None, None 15 | first_token = text.split()[token_position] 16 | 17 | match = re.fullmatch(r"(([a-h])\2*)\)", first_token) 18 | if match: # a) aa) aaa) 19 | level = len(match[1]) 20 | return f"alpha-lower-bracket-{level}", match[1] 21 | 22 | match = re.fullmatch(r"\((([a-h])\2*)\)", first_token) 23 | if match: # (a) (aa) (aaa) 24 | level = len(match[1]) 25 | return f"alpha-lower-double-bracket-{level}", match[1] 26 | 27 | match = re.fullmatch(r"(([a-h])\2*)\.", first_token) 28 | if match: # a. aa. aaa. 29 | level = len(match[1]) 30 | return f"alpha-lower-dot-{level}", match[1] 31 | 32 | match = re.fullmatch(r"((X[CL]|L?X{0,3})(I[XV]|V?I{0,3}))\.?", first_token) 33 | if match: # I. II III. 34 | return "roman-upper", match[1] 35 | 36 | match = re.fullmatch(r"(\d+)\.", first_token) 37 | if match: # 1. 2. 3. 38 | return "arabic-dot", match[1] 39 | 40 | match = re.fullmatch(r"(\d+)\)", first_token) 41 | if match: 42 | return "arabic-bracket", match[1] 43 | 44 | match = re.fullmatch( 45 | r"([A-H])\.", first_token 46 | ) # Only until "H." Others are mostly false-positives and disambiguation would 47 | # be required. 
48 | if match: 49 | return "alpha-dot", match[1] 50 | 51 | match = re.fullmatch(r"([A-H])\)", first_token) 52 | if match: 53 | return "alpha-bracket", match[1] 54 | 55 | match = re.fullmatch(r"(\d(\.\d)*)\.?", first_token) 56 | if match and len(match[0]) > 1: 57 | level = len(match[1].split(".")) 58 | return f"numeric-{level}", match[1] 59 | 60 | return None, None 61 | 62 | 63 | master_order = [ 64 | "alpha-dot", 65 | "alpha-bracket", 66 | "roman-upper", 67 | "arabic-dot", 68 | "arabic-bracket", 69 | "numeric-2", 70 | "numeric-3", 71 | "numeric-4", 72 | "numeric-5", 73 | "alpha-lower-dot-1", 74 | "alpha-lower-bracket-1", 75 | "alpha-lower-dot-2", 76 | "alpha-lower-bracket-2", 77 | "alpha-lower-dot-3", 78 | "alpha-lower-bracket-3", 79 | "alpha-lower-bracket-4", 80 | "alpha-lower-double-bracket-1", 81 | "alpha-lower-double-bracket-2", 82 | "alpha-lower-double-bracket-3", 83 | "alpha-lower-double-bracket-4", 84 | ] 85 | 86 | 87 | def extract_hierarchy(decision): 88 | if not os.path.exists(f"{DE_DECISIONS_HIERARCHY}/{decision}"): 89 | with open(f"{DE_DECISIONS_XML}/{decision}", encoding="utf8") as f: 90 | soup = BeautifulSoup(f.read(), "lxml-xml") 91 | for doc_parts in get_docparts_with_p(soup): 92 | # has_numbered_ps = bool(doc_parts.find('p', attrs={'numbers': True})) 93 | for p in doc_parts.find_all("p", {"indented": str(True)}): 94 | text = p.get_text().strip() 95 | for token_position in range(3): 96 | match_type, value = extract_number(text, token_position) 97 | if match_type: 98 | if token_position == 0: 99 | p.attrs["hierarchy_num_type"] = match_type 100 | p.attrs["hierarchy_num"] = value 101 | else: 102 | if match_type == "alpha-dot": 103 | break 104 | p.attrs["hierarchy_num_type"] += "," + match_type 105 | p.attrs["hierarchy_num"] += "," + value 106 | else: 107 | break 108 | 109 | # hierarchy_num_types = list() 110 | # for p in doc_parts.find_all( 111 | # "p", attrs={"hierarchy_num_type": True} 112 | # ): 113 | # for hierarchy_num_type in p.attrs["hierarchy_num_type"].split( 114 | # "," 115 | # ): 116 | # if hierarchy_num_type not in hierarchy_num_types: 117 | # hierarchy_num_types.append(hierarchy_num_type) 118 | # 119 | # if hierarchy_num_types: 120 | # unknown_order = len( 121 | # set(hierarchy_num_types) - set(master_order) 122 | # ) 123 | # if not unknown_order: 124 | # hierarchy_num_types_ordered = sorted( 125 | # hierarchy_num_types, key=lambda x: master_order.index( 126 | # x 127 | # ) 128 | # ) 129 | # if unknown_order or tuple(hierarchy_num_types) != tuple( 130 | # hierarchy_num_types_ordered 131 | # ): 132 | # print(decision, doc_parts.name, hierarchy_num_types) 133 | 134 | nested_soup = BeautifulSoup("", "lxml-xml") 135 | assert len(soup.gertyp.get_text()), decision 136 | assert len(soup.find("entsch-datum").get_text()) == 8, decision 137 | assert len(soup.aktenzeichen.get_text()), decision 138 | assert len(soup.doktyp.get_text()), decision 139 | 140 | datum_raw = soup.find("entsch-datum").get_text() 141 | datum = f"{datum_raw[:4]}-{datum_raw[4:6]}-{datum_raw[6:]}" 142 | 143 | nested_soup.append( 144 | nested_soup.new_tag( 145 | "document", 146 | gericht=soup.gertyp.get_text(), 147 | datum=datum, 148 | az=soup.aktenzeichen.get_text(), 149 | doktyp=soup.doktyp.get_text(), 150 | ) 151 | ) 152 | 153 | if len(soup.spruchkoerper.get_text()): 154 | nested_soup.document.attrs["spruchkoerper"] = soup.spruchkoerper.get_text() 155 | 156 | if len(soup.norm.get_text()): 157 | nested_soup.document.append(nested_soup.new_tag("norm")) 158 | 
nested_soup.norm.append(nested_soup.new_string(soup.norm.get_text(" "))) 159 | 160 | for doc_part in [ 161 | soup.tenor, 162 | soup.tatbestand, 163 | soup.entscheidungsgruende, 164 | soup.gruende, 165 | soup.abwmeinung, 166 | soup.sonstlt, 167 | ]: 168 | if not len(doc_part.get_text()): 169 | continue 170 | item = nested_soup.new_tag("item", heading=doc_part.name) 171 | nested_soup.document.append(item) 172 | 173 | open_tags = [dict(tag=item, level=-1)] 174 | text_tag = None 175 | for p in doc_part.find_all("p"): 176 | if text_tag and "indented" in p.attrs: 177 | assert p.attrs["indented"] == "True" 178 | text_tag.append(" " + nested_soup.new_string(p.get_text(" "))) 179 | continue 180 | 181 | if "hierarchy_num_type" in p.attrs: 182 | for num_type, num in zip( 183 | p.attrs["hierarchy_num_type"].split(","), 184 | p.attrs["hierarchy_num"].split(","), 185 | ): 186 | current_level = master_order.index(num_type) 187 | while open_tags[-1]["level"] >= current_level: 188 | open_tags.pop() 189 | 190 | item = nested_soup.new_tag("item", heading=num) 191 | open_tags[-1]["tag"].append(item) 192 | open_tags.append(dict(tag=item, level=current_level)) 193 | 194 | text_tag = nested_soup.new_tag("text") 195 | text_tag.append(nested_soup.new_string(p.get_text(" "))) 196 | seqitem = nested_soup.new_tag("seqitem") 197 | seqitem.append(text_tag) 198 | open_tags[-1]["tag"].append(seqitem) 199 | 200 | decision_id = decision.split(".")[0] 201 | nodeid_counter = 0 202 | for tag in nested_soup.find_all(["document", "item", "seqitem"]): 203 | tag.attrs["key"] = f"{decision_id}_{nodeid_counter:06d}" 204 | nodeid_counter += 1 205 | tag.attrs["level"] = ( 206 | 0 if tag.name == "document" else tag.parent.attrs["level"] + 1 207 | ) 208 | 209 | with open(f"{DE_DECISIONS_HIERARCHY}/{decision}", "w", encoding="utf8") as f: 210 | f.write(str(nested_soup)) 211 | 212 | 213 | def hierarchy(): 214 | ensure_exists(DE_DECISIONS_HIERARCHY) 215 | decisions = list_dir(DE_DECISIONS_XML, ".xml") 216 | 217 | with multiprocessing.Pool() as p: 218 | p.map(extract_hierarchy, decisions) 219 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/common.py: -------------------------------------------------------------------------------- 1 | def get_docparts_with_p(soup): 2 | return [ 3 | soup.titelzeile, 4 | soup.leitsatz, 5 | soup.sonstosatz, 6 | soup.tenor, 7 | soup.tatbestand, 8 | soup.entscheidungsgruende, 9 | soup.gruende, 10 | soup.sonstlt, 11 | soup.abwmeinung, 12 | ] 13 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/d_reference_areas_parse.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | import traceback 5 | 6 | from bs4 import BeautifulSoup 7 | from quantlaw.de_extract.statutes_areas import StatutesExtractor 8 | from quantlaw.de_extract.statutes_parse import StatutesParser 9 | from quantlaw.utils.beautiful_soup import save_soup 10 | from quantlaw.utils.files import ensure_exists, list_dir 11 | 12 | from statics import ( 13 | DE_DECISIONS_HIERARCHY, 14 | DE_DECISIONS_REFERENCE_AREAS, 15 | DE_DECISIONS_REFERENCE_PARSED_XML, 16 | ) 17 | from statutes_pipeline_steps.de_reference_areas import find_references_in_soup 18 | from statutes_pipeline_steps.de_reference_parse import ( 19 | identify_lawreference_law_name_in_soup, 20 | identify_reference_law_name_in_soup, 21 | parse_reference_content_in_soup, 22 | ) 23 | from 
utils.common import get_stemmed_law_names, load_law_names_compiled 24 | 25 | 26 | def get_lawnames_date(requested_date): 27 | requested_date = requested_date.replace("-", "") 28 | lookup_date = None 29 | for date in sorted(law_names): 30 | if date <= requested_date: 31 | lookup_date = date 32 | else: 33 | break 34 | if not lookup_date: 35 | raise Exception(f"No lawnames for {lookup_date} not found.") 36 | return lookup_date 37 | 38 | 39 | def find_references(decision): 40 | try: 41 | logs = [] 42 | areas_exists = os.path.exists(f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}") 43 | parsed_exists = os.path.exists( 44 | f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}" 45 | ) 46 | 47 | if not (areas_exists and parsed_exists): # General preparation 48 | with open(f"{DE_DECISIONS_HIERARCHY}/{decision}", encoding="utf8") as f: 49 | file_content = f.read() 50 | file_content = file_content.replace( 51 | "', 53 | "", 54 | ) 55 | soup = BeautifulSoup(file_content, "lxml-xml") 56 | 57 | # Get Entscheidungsdatum 58 | date = get_lawnames_date(soup.document.attrs["datum"]) 59 | 60 | # Get laws in effect at time of decision 61 | laws_lookup = get_stemmed_law_names(date, law_names) 62 | parser = StatutesParser(laws_lookup) 63 | extractor = StatutesExtractor(laws_lookup) 64 | 65 | if not areas_exists: 66 | logs.append( 67 | find_references_in_soup( 68 | soup, 69 | extractor, 70 | para=0, 71 | art=0, 72 | text_tag_name=["text", "norm"], 73 | ) 74 | # set para and atr to 0 that refernece with naming a law are ignored. 75 | ) 76 | save_soup(soup, f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}") 77 | 78 | if not parsed_exists: 79 | with open( 80 | f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}", encoding="utf8" 81 | ) as f: 82 | soup = BeautifulSoup(f.read(), "lxml-xml") 83 | parse_reference_content_in_soup(soup, parser, decision) 84 | identify_reference_law_name_in_soup( 85 | soup, parser, current_lawid=None, skip_errors=True 86 | ) 87 | identify_lawreference_law_name_in_soup(soup, laws_lookup) 88 | 89 | save_soup(soup, f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}") 90 | except Exception: 91 | print("-----", decision, "-----") 92 | the_type, the_value, the_traceback = sys.exc_info() 93 | traceback.print_exception(the_type, the_value, the_traceback) 94 | raise 95 | 96 | 97 | def reference_parse_areas(regulations): 98 | global law_names 99 | law_names = load_law_names_compiled(regulations) 100 | ensure_exists(DE_DECISIONS_REFERENCE_AREAS) 101 | ensure_exists(DE_DECISIONS_REFERENCE_PARSED_XML) 102 | decisions = list_dir(DE_DECISIONS_HIERARCHY, ".xml") 103 | with multiprocessing.Pool() as p: 104 | p.map(find_references, decisions) 105 | 106 | 107 | # # REgZ extractor 108 | # 109 | # pattern = re.compile( 110 | # r'(?:' 111 | # r'(?:B\s+)?' 112 | # r'(?:\d?/?\d+|I?(?:X|V|I)+I*a?)' 113 | # r'\s+' 114 | # r'(?:B\s+)?' 115 | # r')?' 116 | # r'(' 117 | # r'[A-Za-z\-Ü]+' 118 | # r'(?:\s*\((?:pat|Brfg|B|R|VZ|P|VS|Vs|Ü)\))?' 119 | # r')' 120 | # r'\s*' 121 | # r'(?:\d+\s*(?:,|\-|und)\s*)?' 122 | # r'\d+\/\d+a?' 123 | # r'(?:\s+\(PKH\)|\s+\(E[PU]\b\)?|\s+\(?[BRSKFCAD][LRH]?\)?)?' 124 | # r'(?:\s+\-\s+Vz\s+\d+/\d+)?' 125 | # r'\)?' 126 | # r'(?:\s+\(vormals\s.+)?' 
127 | # ) 128 | # keys = list(azs) 129 | # az_splitted = [ 130 | # [ 131 | # az 132 | # for az in re.split( 133 | # r'\s+hinzuverb\.,\s+|,?\s*\(?\bverb\.\s*mi?t?\b\.?,?\s*|(?:,?\s+und|,?\s+zu|,),?\s(?!\d+/\d)(?:hinzuverb\.\s+)?|\s\((?=(?:\d+|I?(?:X|V|I)+I*a?)+\s+[A-Z]+\s+\d+/\d+)|\szu(?=\d\s)', 134 | # azs[k]) 135 | # ] 136 | # for k in keys 137 | # ] 138 | # 139 | # regZ = [ 140 | # [ 141 | # pattern.fullmatch(az) 142 | # for az in az_list 143 | # ] 144 | # for az_list in az_splitted 145 | # ] 146 | # regZ = [ 147 | # [ 148 | # match and match.group(1) 149 | # for match in match_list 150 | # ] 151 | # for match_list in regZ 152 | # ] 153 | -------------------------------------------------------------------------------- /de_decisions_pipeline_steps/e_network.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import multiprocessing 4 | import re 5 | 6 | import networkx as nx 7 | from quantlaw.utils.beautiful_soup import create_soup 8 | from quantlaw.utils.files import list_dir 9 | from quantlaw.utils.networkx import multi_to_weighted 10 | 11 | from statics import DE_DECISIONS_NETWORK, DE_DECISIONS_REFERENCE_PARSED_XML 12 | 13 | 14 | def count_characters(text, whites=False): 15 | if whites: 16 | return len(text) 17 | else: 18 | return len(re.sub(r"\s", "", text)) 19 | 20 | 21 | def count_tokens(text, unique=False): 22 | if not unique: 23 | return len(text.split()) 24 | else: 25 | return len(set(text.split())) 26 | 27 | 28 | def get_graph_data_from_decision(decision): 29 | try: 30 | soup = create_soup(f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}") 31 | items = list(soup.find_all(["document", "item", "seqitem"])) 32 | node_dicts = [] 33 | containment_edges = [] 34 | 35 | for item in items: 36 | node_dict = dict( 37 | key=item.attrs["key"], 38 | heading=item.attrs.get("heading", ""), 39 | level=int(item.attrs["level"]), 40 | type=item.name, 41 | ) 42 | 43 | text = item.get_text(" ") 44 | node_dict["chars_n"] = count_characters(text, whites=True) 45 | node_dict["chars_nowhites"] = count_characters(text, whites=False) 46 | node_dict["tokens_n"] = count_tokens(text, unique=False) 47 | node_dict["tokens_unique"] = count_tokens(text, unique=True) 48 | 49 | if item.name == "document": 50 | for key in ["az", "gericht", "datum", "doktyp", "spruchkoerper"]: 51 | node_dict[key] = item.attrs.get(key, "") 52 | parent_key = "root" 53 | else: 54 | node_dict["parent_key"] = item.parent.attrs["key"] 55 | parent_key = item.parent.attrs["key"] 56 | 57 | node_dicts.append(node_dict) 58 | containment_edges.append((parent_key, item.attrs["key"])) 59 | 60 | reference_edges = [] 61 | for item in items: 62 | for node in item.find_all("reference"): 63 | if ( 64 | node.lawname 65 | and "parsed" in node.attrs 66 | and node.lawname.get("type") 67 | in [ 68 | "dict", 69 | "sgb", 70 | ] 71 | ): 72 | refs = json.loads(node.attrs["parsed"]) 73 | for ref in refs: 74 | ref_key = "_".join(ref[:2]) 75 | reference_edges.append((item.attrs["key"], ref_key)) 76 | except Exception: 77 | print(decision) 78 | raise 79 | 80 | return node_dicts, containment_edges, reference_edges 81 | 82 | 83 | def network(): 84 | decisions = list_dir(DE_DECISIONS_REFERENCE_PARSED_XML, ".xml") 85 | with multiprocessing.Pool() as p: 86 | results = p.map(get_graph_data_from_decision, decisions) 87 | 88 | node_dicts = list(itertools.chain.from_iterable([x[0] for x in results])) 89 | containment_edges = list(itertools.chain.from_iterable([x[1] for x in results])) 90 | 
reference_edges = list(itertools.chain.from_iterable([x[2] for x in results])) 91 | 92 | hierarchy_G = nx.DiGraph() 93 | hierarchy_G.add_node("root", level=-1, key="root", bipartite="decision") 94 | hierarchy_G.add_nodes_from( 95 | [(x["key"], x) for x in node_dicts], bipartite="decision" 96 | ) 97 | hierarchy_G.add_edges_from(containment_edges, edge_type="containment") 98 | 99 | reference_G = nx.MultiDiGraph(hierarchy_G) 100 | print("Hierarchy graph created") 101 | reference_G.add_nodes_from( 102 | sorted({x[-1] for x in reference_edges}), bipartite="statute" 103 | ) 104 | print("Statute nodes added") 105 | reference_G.add_edges_from(reference_edges, edge_type="reference") 106 | print("Reference edges added") 107 | 108 | reference_weighted_G = multi_to_weighted(reference_G) 109 | 110 | nx.write_gpickle(reference_weighted_G, DE_DECISIONS_NETWORK) 111 | -------------------------------------------------------------------------------- /download_de_gesetze_im_internet_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing.pool import Pool 4 | 5 | from git import Git, Repo 6 | 7 | from statics import DE_ORIGINAL_PATH 8 | from utils.simplify_gii_xml import simplify_gii_xml 9 | 10 | REPO_PATH = "../gesetze-im-internet" 11 | REPO_PARENT_PATH = "../" 12 | ITEMS_PATH = f"{REPO_PATH}/data/items/" 13 | 14 | GII_REPO_URL = "https://github.com/QuantLaw/gesetze-im-internet.git" 15 | 16 | 17 | def copy_and_simplify_file(xml_file): 18 | doknr = xml_file.split(".")[0] 19 | file_path = os.path.join(ITEMS_PATH, folder, xml_file) 20 | stripped_date = date.replace("-", "") 21 | target_file = os.path.join( 22 | DE_ORIGINAL_PATH, f"{doknr}_{stripped_date}_{stripped_date}.xml" 23 | ) 24 | simplify_gii_xml(file_path, target_file) 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "-d", "--dates", nargs="*", help="List dates in format YYYY-mm-dd" 31 | ) 32 | parser.add_argument( 33 | "-i", 34 | "--ignore-not-found", 35 | help="Ignore that some files are not included in this snapshot", 36 | action="store_true", 37 | ) 38 | 39 | args = parser.parse_args() 40 | 41 | if os.path.exists(REPO_PATH): 42 | print(f"Please remove the folder {REPO_PATH}") 43 | exit(1) 44 | 45 | Git(REPO_PARENT_PATH).clone(GII_REPO_URL) 46 | 47 | repo = Repo(REPO_PATH) 48 | available_dates = [d.name for d in repo.tags] 49 | 50 | if not args.dates: 51 | print("Please choose dates to import\nOptions:\n") 52 | for t in available_dates: 53 | print(t) 54 | exit(1) 55 | 56 | for date in args.dates: 57 | if date not in available_dates: 58 | raise Exception(f"{date} is not available") 59 | 60 | git = Git(REPO_PATH) 61 | os.makedirs(DE_ORIGINAL_PATH) 62 | 63 | for date in args.dates: 64 | git.checkout(date) 65 | 66 | with open(f"{REPO_PATH}/data/not_found.txt") as f: 67 | not_found = f.read() 68 | 69 | if not args.ignore_not_found and len(not_found.strip()): 70 | raise Exception( 71 | f"Some files are not included in snapshot {date}. " 72 | f"Use another snapshot or --ignore-not-found" 73 | ) 74 | 75 | for folder in [ 76 | f for f in os.listdir(ITEMS_PATH) if os.path.isdir(ITEMS_PATH + f) 77 | ]: 78 | folder_path = ITEMS_PATH + folder 79 | xml_files = [f for f in os.listdir(folder_path) if f.endswith(".xml")] 80 | 81 | with Pool() as p: 82 | p.map(copy_and_simplify_file, xml_files) 83 | 84 | print(date, "imported") 85 | 86 | print(f"Done. 
You may now remove `{REPO_PATH}`") 87 | -------------------------------------------------------------------------------- /download_us_code_data.py: -------------------------------------------------------------------------------- 1 | import re 2 | import shutil 3 | from multiprocessing.pool import Pool 4 | from zipfile import ZipFile 5 | 6 | import requests 7 | from bs4 import BeautifulSoup 8 | from quantlaw.utils.files import ensure_exists 9 | 10 | from statics import US_INPUT_PATH 11 | 12 | INDEX_URL = ( 13 | "https://uscode.house.gov/download/annualhistoricalarchives/downloadxhtml.shtml" 14 | ) 15 | 16 | DOWNLOAD_BASE_URL = "https://uscode.house.gov/download/annualhistoricalarchives/" 17 | 18 | 19 | def download(ref): 20 | year = re.match(r"XHTML/(\d+)\.zip", ref)[1] 21 | print("loading", year) 22 | r = requests.get(DOWNLOAD_BASE_URL + ref, stream=True) 23 | if r.status_code == 200: 24 | zip_path = f"{US_INPUT_PATH}/{year}.zip" 25 | with open(zip_path, "wb") as f: 26 | r.raw.decode_content = True 27 | shutil.copyfileobj(r.raw, f) 28 | 29 | with ZipFile(zip_path) as f: 30 | f.extractall(US_INPUT_PATH) 31 | 32 | 33 | if __name__ == "__main__": 34 | response = requests.get(INDEX_URL) 35 | soup = BeautifulSoup(str(response.content), "lxml") 36 | refs = [] 37 | for s_string in soup.find_all(text=" zip file]"): 38 | a_tag = s_string.parent 39 | assert a_tag.name == "a" 40 | refs.append(a_tag.attrs["href"]) 41 | 42 | ensure_exists(US_INPUT_PATH) 43 | 44 | with Pool(4) as p: 45 | p.map(download, sorted(refs)) 46 | -------------------------------------------------------------------------------- /download_us_reg_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from multiprocessing.pool import Pool 4 | 5 | import requests 6 | from quantlaw.utils.files import ensure_exists 7 | 8 | from statics import US_REG_INPUT_PATH 9 | 10 | DOWNLOAD_BASE_URL = "https://www.govinfo.gov/bulkdata/CFR/{}/CFR-{}.zip" 11 | 12 | 13 | def download(year): 14 | zip_path = f"{US_REG_INPUT_PATH}/{year}.zip" 15 | if not os.path.exists(zip_path): 16 | print("loading", year) 17 | r = requests.get(DOWNLOAD_BASE_URL.format(year, year), stream=True) 18 | if r.status_code == 200: 19 | with open(zip_path, "wb") as f: 20 | r.raw.decode_content = True 21 | shutil.copyfileobj(r.raw, f) 22 | print("downloaded", year) 23 | 24 | 25 | if __name__ == "__main__": 26 | 27 | ensure_exists(US_REG_INPUT_PATH) 28 | with Pool(4) as p: 29 | p.map(download, list(range(1996, 2020 + 1))) 30 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | black 3 | coverage 4 | flake8 5 | isort 6 | lxml 7 | networkx==2.4 8 | numpy 9 | pandas 10 | pre-commit 11 | quantlaw 12 | regex 13 | requests 14 | textdistance 15 | tqdm 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | beautifulsoup4==4.9.3 3 | black==20.8b1 4 | certifi==2020.12.5 5 | cfgv==3.2.0 6 | chardet==4.0.0 7 | click==7.1.2 8 | coverage==5.3.1 9 | decorator==4.4.2 10 | distlib==0.3.1 11 | filelock==3.0.12 12 | flake8==3.8.4 13 | identify==1.5.11 14 | idna==2.10 15 | importlib-metadata==3.3.0 16 | isort==5.7.0 17 | lxml==4.6.2 18 | mccabe==0.6.1 19 | mypy-extensions==0.4.3 20 | networkx==2.4 21 | nodeenv==1.5.0 22 | 
numpy==1.19.4 23 | pandas==1.2.0 24 | pathspec==0.8.1 25 | pre-commit==2.9.3 26 | pycodestyle==2.6.0 27 | pyflakes==2.2.0 28 | python-dateutil==2.8.1 29 | pytz==2020.5 30 | PyYAML==5.3.1 31 | quantlaw==0.0.5 32 | regex==2020.11.13 33 | requests==2.25.1 34 | six==1.15.0 35 | soupsieve==2.1 36 | textdistance==4.2.0 37 | toml==0.10.2 38 | tqdm==4.55.1 39 | typed-ast==1.4.2 40 | typing-extensions==3.7.4.3 41 | urllib3==1.26.2 42 | virtualenv==20.2.2 43 | zipp==3.4.0 44 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E203, W503, E131 4 | exclude = .git,__pycache__,build,dist,venv 5 | -------------------------------------------------------------------------------- /statics.py: -------------------------------------------------------------------------------- 1 | ALL_YEARS = list(range(1994, 2020)) 2 | ALL_YEARS_REG = list(range(1998, 2020)) 3 | 4 | DATA_PATH = "../legal-networks-data" 5 | US_DATA_PATH = f"{DATA_PATH}/us" 6 | US_TEMP_DATA_PATH = "temp/us" 7 | 8 | US_INPUT_PATH = f"{US_DATA_PATH}/1_input" 9 | US_ORIGINAL_PATH = f"{US_TEMP_DATA_PATH}/11_htm" 10 | US_XML_PATH = f"{US_TEMP_DATA_PATH}/12_xml" 11 | US_REFERENCE_AREAS_PATH = f"{US_TEMP_DATA_PATH}/13_reference_areas" 12 | US_REFERENCE_PARSED_PATH = f"{US_DATA_PATH}/2_xml" 13 | US_HIERARCHY_GRAPH_PATH = f"{US_DATA_PATH}/3_hierarchy_graph" 14 | US_CROSSREFERENCE_LOOKUP_PATH = f"{US_TEMP_DATA_PATH}/31_crossreference_lookup" 15 | US_CROSSREFERENCE_EDGELIST_PATH = f"{US_TEMP_DATA_PATH}/32_crossreference_edgelist" 16 | US_CROSSREFERENCE_GRAPH_PATH = f"{US_DATA_PATH}/4_crossreference_graph" 17 | US_SNAPSHOT_MAPPING_INDEX_PATH = f"{US_TEMP_DATA_PATH}/41_snapshot_mapping_index" 18 | US_SNAPSHOT_MAPPING_EDGELIST_PATH = f"{US_DATA_PATH}/5_snapshot_mapping_edgelist" 19 | 20 | US_HELPERS_PATH = f"{US_TEMP_DATA_PATH}/helpers" 21 | US_REFERENCE_AREAS_LOG_PATH = f"{US_HELPERS_PATH}/us_extract_reference_areas.log" 22 | US_REFERENCE_PARSED_LOG_PATH = f"{US_HELPERS_PATH}/us_extract_reference_parsed.log" 23 | 24 | US_REG_DATA_PATH = f"{DATA_PATH}/us_reg" 25 | US_REG_TEMP_DATA_PATH = "temp/us_reg" 26 | 27 | US_REG_INPUT_PATH = f"{US_REG_DATA_PATH}/1_input" 28 | US_REG_INPUT_COPY_LOG_PATH = f"{US_REG_DATA_PATH}/1_input_copy_log.csv" 29 | US_REG_ORIGINAL_PATH = f"{US_REG_TEMP_DATA_PATH}/11_htm" 30 | US_REG_XML_PATH = f"{US_REG_TEMP_DATA_PATH}/12_xml" 31 | US_REG_REFERENCE_AREAS_PATH = f"{US_REG_TEMP_DATA_PATH}/13_reference_areas" 32 | US_REG_REFERENCE_PARSED_PATH = f"{US_REG_DATA_PATH}/2_xml" 33 | US_REG_HIERARCHY_GRAPH_PATH = f"{US_REG_DATA_PATH}/3_hierarchy_graph" 34 | US_REG_CROSSREFERENCE_LOOKUP_PATH = f"{US_REG_TEMP_DATA_PATH}/31_crossreference_lookup" 35 | US_REG_CROSSREFERENCE_EDGELIST_PATH = ( 36 | f"{US_REG_TEMP_DATA_PATH}/32_crossreference_edgelist" 37 | ) 38 | US_REG_AUTHORITY_EDGELIST_PATH = f"{US_REG_TEMP_DATA_PATH}/33_authority_edgelist" 39 | US_REG_CROSSREFERENCE_GRAPH_PATH = f"{US_REG_DATA_PATH}/4_crossreference_graph" 40 | US_REG_SNAPSHOT_MAPPING_INDEX_PATH = ( 41 | f"{US_REG_TEMP_DATA_PATH}/41_snapshot_mapping_index" 42 | ) 43 | US_REG_SNAPSHOT_MAPPING_EDGELIST_PATH = ( 44 | f"{US_REG_DATA_PATH}/5_snapshot_mapping_edgelist" 45 | ) 46 | 47 | US_REG_HELPERS_PATH = f"{US_REG_TEMP_DATA_PATH}/helpers" 48 | US_REG_REFERENCE_AREAS_LOG_PATH = ( 49 | f"{US_REG_HELPERS_PATH}/us_extract_reference_areas.log" 50 | ) 51 | US_REG_REFERENCE_PARSED_LOG_PATH = ( 52 | 
f"{US_REG_HELPERS_PATH}/us_extract_reference_parsed.log" 53 | ) 54 | 55 | 56 | DE_DATA_PATH = f"{DATA_PATH}/de" 57 | DE_TEMP_DATA_PATH = "temp/de" 58 | 59 | JURIS_EXPORT_PATH = f"{DE_DATA_PATH}/1_juris_gii_xml" 60 | JURIS_EXPORT_GESETZE_LIST_PATH = f"{DE_DATA_PATH}/1_juris_gii_xml_gesetze.txt" 61 | JURIS_EXPORT_RVO_LIST_PATH = f"{DE_DATA_PATH}/1_juris_gii_xml_rvo.txt" 62 | 63 | DE_ORIGINAL_PATH = f"{DE_TEMP_DATA_PATH}/11_gii_xml" 64 | DE_XML_PATH = f"{DE_TEMP_DATA_PATH}/12_xml" 65 | DE_LAW_NAMES_PATH = f"{DE_TEMP_DATA_PATH}/12_xml_law_names.csv" 66 | DE_LAW_NAMES_COMPILED_PATH = f"{DE_TEMP_DATA_PATH}/12_xml_law_names_compiled.pickle" 67 | DE_REFERENCE_AREAS_PATH = f"{DE_TEMP_DATA_PATH}/13_reference_areas" 68 | DE_REFERENCE_PARSED_PATH = f"{DE_DATA_PATH}/2_xml" 69 | DE_HIERARCHY_GRAPH_PATH = f"{DE_DATA_PATH}/3_hierarchy_graph" 70 | DE_CROSSREFERENCE_LOOKUP_PATH = f"{DE_TEMP_DATA_PATH}/31_crossreference_lookup" 71 | DE_CROSSREFERENCE_EDGELIST_PATH = f"{DE_TEMP_DATA_PATH}/32_crossreference_edgelist" 72 | DE_CROSSREFERENCE_GRAPH_PATH = f"{DE_DATA_PATH}/4_crossreference_graph" 73 | DE_SNAPSHOT_MAPPING_INDEX_PATH = f"{DE_TEMP_DATA_PATH}/41_snapshot_mapping_index" 74 | DE_SNAPSHOT_MAPPING_EDGELIST_PATH = f"{DE_DATA_PATH}/5_snapshot_mapping_edgelist" 75 | 76 | DE_HELPERS_PATH = f"{DE_TEMP_DATA_PATH}/helpers" 77 | DE_REFERENCE_AREAS_LOG_PATH = f"{DE_HELPERS_PATH}/de_extract_reference_areas.log" 78 | DE_REFERENCE_PARSED_LOG_PATH = f"{DE_HELPERS_PATH}/de_extract_reference_parsed.log" 79 | 80 | DE_DECISIONS_DATA_PATH = f"{DATA_PATH}/de_decisions" 81 | DE_DECISIONS_TEMP_DATA_PATH = "temp/de_decisions" 82 | 83 | DE_DECISIONS_DOWNLOAD_TOC = f"{DE_DECISIONS_TEMP_DATA_PATH}/de_rii_toc.xml" 84 | DE_DECISIONS_DOWNLOAD_ZIP = f"{DE_DECISIONS_DATA_PATH}/0_input" 85 | DE_DECISIONS_DOWNLOAD_XML = f"{DE_DECISIONS_TEMP_DATA_PATH}/00_xml" 86 | DE_DECISIONS_XML = f"{DE_DECISIONS_TEMP_DATA_PATH}/01_xml_cleaned" 87 | DE_DECISIONS_HIERARCHY = f"{DE_DECISIONS_TEMP_DATA_PATH}/02_hierarchy" 88 | DE_DECISIONS_REFERENCE_AREAS = f"{DE_DECISIONS_TEMP_DATA_PATH}/03_reference_areas" 89 | DE_DECISIONS_REFERENCE_PARSED_XML = f"{DE_DECISIONS_DATA_PATH}/1_xml" 90 | DE_DECISIONS_NETWORK = f"{DE_DECISIONS_DATA_PATH}/2_network.gpickle.gz" 91 | 92 | DE_REG_DATA_PATH = f"{DATA_PATH}/de_reg" 93 | DE_REG_TEMP_DATA_PATH = "temp/de_reg" 94 | 95 | DE_REG_ORIGINAL_PATH = f"{DE_REG_TEMP_DATA_PATH}/11_gii_xml" 96 | 97 | DE_REG_DATA_PATH = f"{DATA_PATH}/de_reg" 98 | DE_REG_TEMP_DATA_PATH = "temp/de_reg" 99 | 100 | DE_REG_ORIGINAL_PATH = f"{DE_REG_TEMP_DATA_PATH}/11_gii_xml" 101 | DE_REG_XML_PATH = f"{DE_REG_TEMP_DATA_PATH}/12_xml" 102 | DE_REG_LAW_NAMES_COMPILED_PATH = ( 103 | f"{DE_REG_TEMP_DATA_PATH}/12_xml_law_names_compiled.pickle" 104 | ) 105 | DE_REG_LAW_NAMES_PATH = f"{DE_REG_TEMP_DATA_PATH}/12_xml_law_names.csv" 106 | DE_REG_REFERENCE_AREAS_PATH = f"{DE_REG_TEMP_DATA_PATH}/13_reference_areas" 107 | DE_REG_REFERENCE_PARSED_PATH = f"{DE_REG_DATA_PATH}/2_xml" 108 | DE_REG_HIERARCHY_GRAPH_PATH = f"{DE_REG_DATA_PATH}/3_hierarchy_graph" 109 | DE_REG_CROSSREFERENCE_LOOKUP_PATH = f"{DE_REG_TEMP_DATA_PATH}/31_crossreference_lookup" 110 | DE_REG_CROSSREFERENCE_EDGELIST_PATH = ( 111 | f"{DE_REG_TEMP_DATA_PATH}/32_crossreference_edgelist" 112 | ) 113 | DE_REG_AUTHORITY_EDGELIST_PATH = f"{DE_REG_TEMP_DATA_PATH}/33_authority_edgelist" 114 | DE_REG_CROSSREFERENCE_GRAPH_PATH = f"{DE_REG_DATA_PATH}/4_crossreference_graph" 115 | DE_REG_SNAPSHOT_MAPPING_INDEX_PATH = ( 116 | f"{DE_REG_TEMP_DATA_PATH}/41_snapshot_mapping_index" 117 | ) 118 | 
DE_REG_SNAPSHOT_MAPPING_EDGELIST_PATH = ( 119 | f"{DE_REG_DATA_PATH}/5_snapshot_mapping_edgelist" 120 | ) 121 | 122 | DE_REG_HELPERS_PATH = f"{DE_REG_TEMP_DATA_PATH}/helpers" 123 | DE_REG_REFERENCE_AREAS_LOG_PATH = ( 124 | f"{DE_REG_HELPERS_PATH}/de_extract_reference_areas.log" 125 | ) 126 | DE_REG_REFERENCE_PARSED_LOG_PATH = ( 127 | f"{DE_REG_HELPERS_PATH}/de_extract_reference_parsed.log" 128 | ) 129 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantLaw/legal-data-preprocessing/4264cd630b13e3d3bb934d4abd73b5b98217873c/statutes_pipeline_steps/__init__.py -------------------------------------------------------------------------------- /statutes_pipeline_steps/crossreference_graph.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | 4 | import networkx as nx 5 | import pandas as pd 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | from quantlaw.utils.networkx import load_graph_from_csv_files 8 | 9 | from utils.common import RegulationsPipelineStep, get_snapshot_law_list, load_law_names 10 | 11 | 12 | class CrossreferenceGraphStep(RegulationsPipelineStep): 13 | max_number_of_processes = min(2, max(multiprocessing.cpu_count() - 2, 1)) 14 | 15 | def __init__( 16 | self, 17 | source, 18 | source_regulation, 19 | destination, 20 | edgelist_folder, 21 | dataset, 22 | authority_edgelist_folder, 23 | *args, 24 | **kwargs, 25 | ): 26 | self.source = source 27 | self.source_regulation = source_regulation 28 | self.destination = destination 29 | self.edgelist_folder = edgelist_folder 30 | self.dataset = dataset 31 | self.authority_edgelist_folder = authority_edgelist_folder 32 | super().__init__(*args, **kwargs) 33 | 34 | def get_items(self, overwrite, snapshots) -> list: 35 | ensure_exists(self.destination + "/seqitems") 36 | if not snapshots: 37 | snapshots = sorted( 38 | set( 39 | [ 40 | os.path.splitext(x)[0] 41 | for x in list_dir(self.edgelist_folder, ".csv") 42 | ] 43 | ) 44 | ) 45 | 46 | if not overwrite: 47 | existing_files = list_dir( 48 | os.path.join(self.destination, "seqitems"), ".gpickle.gz" 49 | ) 50 | snapshots = list( 51 | filter( 52 | lambda year: f"{year}.gpickle.gz" not in existing_files, snapshots 53 | ) 54 | ) 55 | 56 | if not len(snapshots): 57 | return [] 58 | 59 | if self.dataset == "us": 60 | files = [] 61 | for snapshot in snapshots: 62 | statute_files = [ 63 | f"{self.source}/subseqitems/{x}" 64 | for x in os.listdir(os.path.join(self.source, "subseqitems")) 65 | if str(snapshot) in x 66 | ] 67 | regulation_files = ( 68 | [ 69 | f"{self.source_regulation}/subseqitems/{x}" 70 | for x in os.listdir( 71 | os.path.join(self.source_regulation, "subseqitems") 72 | ) 73 | if str(snapshot) in x 74 | ] 75 | if self.regulations 76 | else None 77 | ) 78 | files.append( 79 | ( 80 | snapshot, 81 | statute_files, 82 | regulation_files, 83 | ) 84 | ) 85 | else: # is DE 86 | files = [] 87 | law_names_data = load_law_names(self.regulations) 88 | for snapshot in snapshots: 89 | graph_files = get_snapshot_law_list(snapshot, law_names_data) 90 | files.append( 91 | ( 92 | snapshot, 93 | [ 94 | f'{self.source}/subseqitems/{x.replace(".xml", ".gpickle")}' 95 | for x in graph_files 96 | ], 97 | None, 98 | ) 99 | ) 100 | 101 | return files 102 | 103 | def execute_item(self, item): 104 | year, files, files_regulations = 
item 105 | 106 | if self.regulations and files_regulations: 107 | files += files_regulations 108 | 109 | node_columns = [ 110 | "key", 111 | "level", 112 | "citekey", 113 | "parent_key", 114 | "type", 115 | "document_type", 116 | "heading", 117 | "law_name", 118 | "chars_n", 119 | "chars_nowhites", 120 | "tokens_n", 121 | "tokens_unique", 122 | "abbr_1", 123 | "abbr_2", 124 | "subject_areas", 125 | "legislators", 126 | "contributors", 127 | "texts_tokens_n", 128 | "texts_chars_n", 129 | ] 130 | edge_columns = ["u", "v", "edge_type"] 131 | 132 | nodes_csv_path = f"{self.destination}/{year}.nodes.csv.gz" 133 | edges_csv_path = f"{self.destination}/{year}.edges.csv.gz" 134 | 135 | pd.DataFrame( 136 | [dict(level=-1, key="root", law_name="root")], columns=node_columns 137 | ).to_csv( 138 | nodes_csv_path, 139 | header=True, 140 | index=False, 141 | columns=node_columns, 142 | ) 143 | 144 | pd.DataFrame([], columns=edge_columns).to_csv( 145 | edges_csv_path, 146 | header=True, 147 | index=False, 148 | columns=edge_columns, 149 | ) 150 | 151 | for file in files: 152 | nG = nx.read_gpickle(file) 153 | nx.set_node_attributes(nG, nG.graph.get("name", file), name="law_name") 154 | 155 | nodes_df = pd.DataFrame( 156 | [d for n, d in nG.nodes(data=True)], columns=node_columns 157 | ) 158 | 159 | if self.dataset.lower() == "us": 160 | nodes_df["document_type"] = [ 161 | "regulation" if key.startswith("cfr") else "statute" 162 | for key in nodes_df.key 163 | ] 164 | 165 | nodes_df.to_csv( 166 | nodes_csv_path, 167 | header=False, 168 | index=False, 169 | columns=node_columns, 170 | mode="a", 171 | ) 172 | 173 | edges_df = pd.DataFrame( 174 | [dict(u=u, v=v, edge_type="containment") for u, v in nG.edges()], 175 | columns=edge_columns, 176 | ) 177 | 178 | for idx, row in nodes_df[nodes_df.level == 0].iterrows(): 179 | edges_df = edges_df.append( 180 | [dict(u="root", v=row.key, edge_type="containment")] 181 | ) 182 | 183 | edges_df.to_csv( 184 | edges_csv_path, 185 | header=False, 186 | index=False, 187 | columns=edge_columns, 188 | mode="a", 189 | ) 190 | 191 | # Get reference edges 192 | edge_list = pd.read_csv(f"{self.edgelist_folder}/{year}.csv") 193 | edges_df = pd.DataFrame( 194 | {"u": edge_list.out_node, "v": edge_list.in_node, "edge_type": "reference"}, 195 | columns=edge_columns, 196 | ) 197 | edges_df.to_csv( 198 | edges_csv_path, 199 | header=False, 200 | index=False, 201 | columns=edge_columns, 202 | mode="a", 203 | ) 204 | 205 | # add authority edges 206 | if self.regulations: 207 | edge_list = pd.read_csv(f"{self.authority_edgelist_folder}/{year}.csv") 208 | edges_df = pd.DataFrame( 209 | { 210 | "u": edge_list.out_node, 211 | "v": edge_list.in_node, 212 | "edge_type": "authority", 213 | }, 214 | columns=edge_columns, 215 | ) 216 | edges_df.to_csv( 217 | edges_csv_path, 218 | header=False, 219 | index=False, 220 | columns=edge_columns, 221 | mode="a", 222 | ) 223 | 224 | # Create and save seqitem graph 225 | G = load_graph_from_csv_files( 226 | self.destination, year, filter="exclude_subseqitems" 227 | ) 228 | 229 | nx.write_gpickle(G, f"{self.destination}/seqitems/{year}.gpickle.gz") 230 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_authority_edgelist.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy 5 | import pandas as pd 6 | from quantlaw.utils.beautiful_soup import create_soup 7 | from quantlaw.utils.files import ensure_exists 8 | from 
quantlaw.utils.pipeline import PipelineStep 9 | 10 | from statics import ( 11 | DE_REFERENCE_PARSED_PATH, 12 | DE_REG_AUTHORITY_EDGELIST_PATH, 13 | DE_REG_CROSSREFERENCE_LOOKUP_PATH, 14 | DE_REG_REFERENCE_PARSED_PATH, 15 | ) 16 | from utils.common import get_snapshot_law_list 17 | 18 | 19 | def get_filename(date): 20 | return f"{date}.csv" 21 | 22 | 23 | class DeAuthorityEdgelist(PipelineStep): 24 | def __init__(self, law_names_data, *args, **kwargs): 25 | self.law_names_data = law_names_data 26 | super().__init__(*args, **kwargs) 27 | 28 | def get_items(self, overwrite, snapshots) -> list: 29 | ensure_exists(DE_REG_AUTHORITY_EDGELIST_PATH) 30 | 31 | if not overwrite: 32 | existing_files = os.listdir(DE_REG_AUTHORITY_EDGELIST_PATH) 33 | snapshots = list( 34 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 35 | ) 36 | 37 | return snapshots 38 | 39 | def execute_item(self, item): 40 | files = get_snapshot_law_list(item, self.law_names_data) 41 | source_folder = DE_REG_CROSSREFERENCE_LOOKUP_PATH 42 | target_folder = DE_REG_AUTHORITY_EDGELIST_PATH 43 | key_df = ( 44 | pd.read_csv(f"{source_folder}/{item}.csv").dropna().set_index("citekey") 45 | ) 46 | law_citekeys_dict = { 47 | citekey.split("_")[0]: "_".join(row["key"].split("_")[:-1]) + "_000001" 48 | for citekey, row in key_df.iterrows() 49 | } 50 | 51 | df = None 52 | for file in files: 53 | edge_df = make_edge_list(file, key_df, law_citekeys_dict, regulations=True) 54 | df = edge_df if df is None else df.append(edge_df, ignore_index=True) 55 | df.to_csv(f"{target_folder}/{item}.csv", index=False) 56 | 57 | 58 | def make_edge_list(file, key_df, law_citekeys_dict, regulations): 59 | soup = create_soup( 60 | os.path.join( 61 | DE_REG_REFERENCE_PARSED_PATH if regulations else DE_REFERENCE_PARSED_PATH, 62 | file, 63 | ) 64 | ) 65 | edges = [] 66 | 67 | # FOR DEBUG 68 | problem_matches = set() 69 | problem_keys = set() 70 | 71 | for item in soup.find_all(["document", "seqitem"], attrs={"parsed": True}): 72 | item_parsed_ref_str = item.attrs["parsed"] 73 | if not item_parsed_ref_str or item_parsed_ref_str == "[]": 74 | continue 75 | 76 | node_out = item.get("key") 77 | refs = json.loads(item_parsed_ref_str) 78 | for ref in refs: 79 | # TODO multiple laws with the same bnormabk 80 | if len(ref) > 1: # Ref to seqitem at least 81 | try: 82 | key = "_".join(ref[:2]) 83 | matches = key_df.at[key, "key"] 84 | if type(matches) == numpy.ndarray: 85 | print(f"Multiple matches for {key}") 86 | matches = matches[0] 87 | if type(matches) is not str: 88 | problem_matches.add(tuple(matches)) 89 | node_in = matches if type(matches) == str else matches[0] 90 | edges.append((node_out, node_in)) 91 | except KeyError: 92 | problem_keys.add(key) 93 | else: # ref to document only 94 | node_in = law_citekeys_dict.get(ref[0]) 95 | if node_in: 96 | edges.append((node_out, node_in)) 97 | 98 | # FOR DEBUG 99 | # if len(problem_matches) > 0: 100 | # print(f"{file} Problem Matches:\n", sorted(list(problem_matches))) 101 | # if len(problem_keys) > 0: 102 | # print(f"{file} Problem Matches:\n", sorted(list(problem_keys))) 103 | return pd.DataFrame(edges, columns=["out_node", "in_node"]) 104 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_crossreference_edgelist.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy 5 | import pandas as pd 6 | from quantlaw.utils.beautiful_soup import create_soup 7 | from 
quantlaw.utils.files import ensure_exists 8 | 9 | from statics import ( 10 | DE_CROSSREFERENCE_EDGELIST_PATH, 11 | DE_CROSSREFERENCE_LOOKUP_PATH, 12 | DE_REFERENCE_PARSED_PATH, 13 | DE_REG_CROSSREFERENCE_EDGELIST_PATH, 14 | DE_REG_CROSSREFERENCE_LOOKUP_PATH, 15 | DE_REG_REFERENCE_PARSED_PATH, 16 | ) 17 | from utils.common import RegulationsPipelineStep, get_snapshot_law_list 18 | 19 | 20 | class DeCrossreferenceEdgelist(RegulationsPipelineStep): 21 | def __init__(self, law_names_data, *args, **kwargs): 22 | self.law_names_data = law_names_data 23 | super().__init__(*args, **kwargs) 24 | 25 | def get_items(self, overwrite, snapshots) -> list: 26 | target_folder = ( 27 | DE_REG_CROSSREFERENCE_EDGELIST_PATH 28 | if self.regulations 29 | else DE_CROSSREFERENCE_EDGELIST_PATH 30 | ) 31 | ensure_exists(target_folder) 32 | 33 | if not overwrite: 34 | existing_files = os.listdir(target_folder) 35 | snapshots = list( 36 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 37 | ) 38 | 39 | return snapshots 40 | 41 | def execute_item(self, item): 42 | files = get_snapshot_law_list(item, self.law_names_data) 43 | source_folder = ( 44 | DE_REG_CROSSREFERENCE_LOOKUP_PATH 45 | if self.regulations 46 | else DE_CROSSREFERENCE_LOOKUP_PATH 47 | ) 48 | target_folder = ( 49 | DE_REG_CROSSREFERENCE_EDGELIST_PATH 50 | if self.regulations 51 | else DE_CROSSREFERENCE_EDGELIST_PATH 52 | ) 53 | key_df = ( 54 | pd.read_csv(f"{source_folder}/{item}.csv").dropna().set_index("citekey") 55 | ) 56 | df = None 57 | for file in files: 58 | edge_df = make_edge_list(file, key_df, self.regulations) 59 | df = edge_df if df is None else df.append(edge_df, ignore_index=True) 60 | df.to_csv(f"{target_folder}/{item}.csv", index=False) 61 | 62 | 63 | def get_filename(date): 64 | return f"{date}.csv" 65 | 66 | 67 | def make_edge_list(file, key_df, regulations): 68 | soup = create_soup( 69 | os.path.join( 70 | DE_REG_REFERENCE_PARSED_PATH if regulations else DE_REFERENCE_PARSED_PATH, 71 | file, 72 | ) 73 | ) 74 | edges = [] 75 | 76 | # # FOR DEBUG 77 | # problem_matches = set() 78 | # problem_keys = set() 79 | 80 | for item in soup.find_all("seqitem"): 81 | references = item.find_all("reference") 82 | if references: 83 | node_out = item.get("key") 84 | for node in references: 85 | if node.lawname and node.lawname.get("type") in [ 86 | "dict", 87 | "sgb", 88 | "internal", 89 | ]: 90 | refs = json.loads(node.attrs["parsed"]) 91 | for ref in refs: 92 | try: 93 | key = "_".join(ref[:2]) 94 | matches = key_df.at[key, "key"] 95 | if type(matches) == numpy.ndarray: 96 | print(f"Multiple matches for {key}") 97 | matches = matches[0] 98 | # # FOR DEBUG 99 | # if type(matches) is not str: 100 | # problem_matches.add(tuple(matches)) 101 | node_in = matches if type(matches) == str else matches[0] 102 | edges.append((node_out, node_in)) 103 | assert len(ref) > 1 104 | except KeyError: 105 | pass 106 | # # FOR DEBUG 107 | # problem_keys.add(key) 108 | 109 | # FOR DEBUG 110 | # if len(problem_matches) > 0: 111 | # print(f"{file} Problem Matches:\n", sorted(list(problem_matches))) 112 | # if len(problem_keys) > 0: 113 | # print(f"{file} Problem Matches:\n", sorted(list(problem_keys))) 114 | return pd.DataFrame(edges, columns=["out_node", "in_node"]) 115 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_crossreference_lookup.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from quantlaw.utils.beautiful_soup import 
create_soup 3 | from quantlaw.utils.files import ensure_exists 4 | 5 | from statics import ( 6 | DE_CROSSREFERENCE_LOOKUP_PATH, 7 | DE_REFERENCE_PARSED_PATH, 8 | DE_REG_CROSSREFERENCE_LOOKUP_PATH, 9 | DE_REG_REFERENCE_PARSED_PATH, 10 | ) 11 | from utils.common import RegulationsPipelineStep, get_snapshot_law_list, load_law_names 12 | 13 | 14 | class DeCrossreferenceLookup(RegulationsPipelineStep): 15 | def get_items(self, snapshots) -> list: 16 | ensure_exists( 17 | DE_REG_CROSSREFERENCE_LOOKUP_PATH 18 | if self.regulations 19 | else DE_CROSSREFERENCE_LOOKUP_PATH 20 | ) 21 | files = [] 22 | law_names_data = load_law_names(self.regulations) 23 | for snapshot in snapshots: 24 | files.append((snapshot, get_snapshot_law_list(snapshot, law_names_data))) 25 | return files 26 | 27 | def execute_item(self, item): 28 | date, files = item 29 | data = [] 30 | source_folder = ( 31 | DE_REG_REFERENCE_PARSED_PATH 32 | if self.regulations 33 | else DE_REFERENCE_PARSED_PATH 34 | ) 35 | target_folder = ( 36 | DE_REG_CROSSREFERENCE_LOOKUP_PATH 37 | if self.regulations 38 | else DE_CROSSREFERENCE_LOOKUP_PATH 39 | ) 40 | for file in files: 41 | soup = create_soup(f"{source_folder}/{file}") 42 | for tag in soup.find_all(citekey=True): 43 | data.append([tag.attrs["key"], tag.attrs["citekey"]]) 44 | df = pd.DataFrame(data, columns=["key", "citekey"]) 45 | destination_file = f"{target_folder}/{date}.csv" 46 | df.to_csv(destination_file, index=False) 47 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_law_names.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pandas as pd 4 | from quantlaw.de_extract.stemming import stem_law_name 5 | from quantlaw.utils.beautiful_soup import create_soup 6 | from quantlaw.utils.files import list_dir 7 | 8 | from statics import ( 9 | DE_LAW_NAMES_COMPILED_PATH, 10 | DE_LAW_NAMES_PATH, 11 | DE_REG_LAW_NAMES_COMPILED_PATH, 12 | DE_REG_LAW_NAMES_PATH, 13 | DE_REG_XML_PATH, 14 | DE_XML_PATH, 15 | ) 16 | from utils.common import RegulationsPipelineStep, load_law_names 17 | 18 | 19 | class DeLawNamesStep(RegulationsPipelineStep): 20 | def get_items(self) -> list: 21 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 22 | files = list_dir(src, ".xml") 23 | return files 24 | 25 | def execute_item(self, item): 26 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 27 | soup = create_soup(f"{src}/{item}") 28 | document = soup.find("document", recursive=False) 29 | result = set() 30 | citekey = document.attrs["key"].split("_")[1] 31 | 32 | if "heading" in document.attrs: 33 | law_name = stem_law_name(document.attrs["heading"]) 34 | result.add((law_name, citekey, item)) 35 | 36 | if "heading_short" in document.attrs: 37 | law_name = stem_law_name(document.attrs["heading_short"]) 38 | result.add((law_name, citekey, item)) 39 | 40 | if "abbr_1" in document.attrs: 41 | law_name = stem_law_name(document.attrs["abbr_1"]) 42 | result.add((law_name, citekey, item)) 43 | 44 | if "abbr_2" in document.attrs: 45 | law_name = stem_law_name(document.attrs["abbr_2"]) 46 | result.add((law_name, citekey, item)) 47 | return result 48 | 49 | def finish_execution(self, names_per_file): 50 | dest_compiled = ( 51 | DE_REG_LAW_NAMES_COMPILED_PATH 52 | if self.regulations 53 | else DE_LAW_NAMES_COMPILED_PATH 54 | ) 55 | dest_csv = DE_REG_LAW_NAMES_PATH if self.regulations else DE_LAW_NAMES_PATH 56 | 57 | result = [] 58 | for names_of_file in names_per_file: 59 | 
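# NOTE (editorial comment, not part of the original file): names_per_file holds
# the per-file results of execute_item above, i.e. one set of
# (stemmed law name, citekey, filename) tuples per XML file; the loop flattens
# them into a single list before they are written to CSV and recompiled by date.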
result.extend(names_of_file) 60 | 61 | df = pd.DataFrame(result, columns=["citename", "citekey", "filename"]) 62 | df.to_csv(dest_csv, index=False) 63 | 64 | dated_law_names = compile_law_names(self.regulations) 65 | with open(dest_compiled, "wb") as f: 66 | pickle.dump(dated_law_names, f) 67 | 68 | 69 | def compile_law_names(regulations): 70 | data = load_law_names(regulations) 71 | dates = sorted({r["start"] for r in data}) 72 | 73 | dated_law_names = {} 74 | 75 | date_len = len(dates) 76 | for i, date in enumerate(dates): 77 | if i % 100 == 0: 78 | print(f"\r{i/date_len}", end="") 79 | law_names_list = [d for d in data if d["start"] <= date and d["end"] >= date] 80 | law_names = {} 81 | for row in law_names_list: 82 | law_names[row["citename"]] = row["citekey"] 83 | dated_law_names[date] = law_names 84 | print() 85 | 86 | return dated_law_names 87 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_prepare_input.py: -------------------------------------------------------------------------------- 1 | # Roughly validate the input files 2 | import os 3 | import shutil 4 | 5 | from quantlaw.utils.files import ensure_exists 6 | 7 | from statics import ( 8 | DE_ORIGINAL_PATH, 9 | DE_REG_ORIGINAL_PATH, 10 | JURIS_EXPORT_GESETZE_LIST_PATH, 11 | JURIS_EXPORT_PATH, 12 | JURIS_EXPORT_RVO_LIST_PATH, 13 | ) 14 | 15 | 16 | def copy_selected_doknrs(selection_list, target_dir): 17 | ensure_exists(target_dir) 18 | for doknr in selection_list: 19 | version_filenames = [ 20 | f for f in os.listdir(f"{JURIS_EXPORT_PATH}/{doknr}") if f.endswith(".xml") 21 | ] 22 | for version_filename in version_filenames: 23 | assert len(version_filename.split("_")) == 3 24 | shutil.copy( 25 | f"{JURIS_EXPORT_PATH}/{doknr}/{version_filename}", 26 | f"{target_dir}/{version_filename}", 27 | ) 28 | 29 | 30 | def de_prepare_input(regulations): 31 | 32 | dest = DE_REG_ORIGINAL_PATH if regulations else DE_ORIGINAL_PATH 33 | 34 | with open(JURIS_EXPORT_GESETZE_LIST_PATH) as f: 35 | gesetze_dirs = f.read().strip().split("\n") 36 | copy_selected_doknrs(gesetze_dirs, dest) 37 | 38 | if regulations: 39 | with open(JURIS_EXPORT_RVO_LIST_PATH) as f: 40 | rvo_dirs = f.read().strip().split("\n") 41 | copy_selected_doknrs(rvo_dirs, DE_REG_ORIGINAL_PATH) 42 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_reference_areas.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | 4 | import bs4 5 | from quantlaw.de_extract.statutes_abstract import StatutesMatchWithMainArea 6 | from quantlaw.de_extract.statutes_areas import StatutesExtractor 7 | from quantlaw.utils.beautiful_soup import create_soup 8 | from quantlaw.utils.files import ensure_exists, list_dir 9 | 10 | from statics import ( 11 | DE_HELPERS_PATH, 12 | DE_REFERENCE_AREAS_LOG_PATH, 13 | DE_REFERENCE_AREAS_PATH, 14 | DE_REG_HELPERS_PATH, 15 | DE_REG_REFERENCE_AREAS_LOG_PATH, 16 | DE_REG_REFERENCE_AREAS_PATH, 17 | DE_REG_XML_PATH, 18 | DE_XML_PATH, 19 | ) 20 | from utils.common import RegulationsPipelineStep, get_stemmed_law_names_for_filename 21 | 22 | 23 | class DeReferenceAreasStep(RegulationsPipelineStep): 24 | max_number_of_processes = 2 25 | 26 | def __init__(self, law_names, *args, **kwargs): 27 | self.law_names = law_names 28 | super().__init__(*args, **kwargs) 29 | 30 | def get_items(self, overwrite) -> list: 31 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 32 | dest = ( 33 | 
DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 34 | ) 35 | 36 | ensure_exists(dest) 37 | files = list_dir(src, ".xml") 38 | 39 | if not overwrite: 40 | existing_files = os.listdir(dest) 41 | files = list(filter(lambda f: f not in existing_files, files)) 42 | return files 43 | 44 | def execute_item(self, item): 45 | src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH 46 | dest = ( 47 | DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 48 | ) 49 | 50 | laws_lookup = get_stemmed_law_names_for_filename(item, self.law_names) 51 | extractor = StatutesExtractor(laws_lookup) 52 | result = [] 53 | soup = create_soup(f"{src}/{item}") 54 | para, art, misc = analyze_type_of_headings(soup) 55 | 56 | result.extend(find_references_in_soup(soup, extractor, para, art)) 57 | 58 | # Find references without preceding article or § (currently not implemented) 59 | # long_law_regex_pattern = law_keys_to_regex(laws_lookup_keys, 5) 60 | # short_law_regex_pattern = law_keys_to_regex(laws_lookup_keys, 3, 4) 61 | # for section in soup.find_all("text"): 62 | # find_law_references_in_section( 63 | # section, soup, long_law_regex_pattern, stem_law_name 64 | # ) 65 | # find_law_references_in_section( 66 | # section, soup, short_law_regex_pattern, clean_name 67 | # ) 68 | 69 | save_soup_with_style(soup, f"{dest}/{item}") 70 | 71 | return result 72 | 73 | def finish_execution(self, results): 74 | logs = list(itertools.chain.from_iterable(results)) 75 | ensure_exists(DE_REG_HELPERS_PATH if self.regulations else DE_HELPERS_PATH) 76 | with open( 77 | DE_REG_REFERENCE_AREAS_LOG_PATH 78 | if self.regulations 79 | else DE_REFERENCE_AREAS_LOG_PATH, 80 | mode="w", 81 | ) as f: 82 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 83 | 84 | 85 | ######################################## 86 | # Functions general and normal citations 87 | ######################################## 88 | 89 | 90 | def save_soup_with_style(soup, path): 91 | output_lines = str(soup).replace("\n\n", "\n").split("\n") 92 | output_lines.insert(1, '') 93 | output = "\n".join(output_lines) 94 | 95 | with open(path, "w") as f: 96 | f.write(output) 97 | 98 | 99 | def analyze_type_of_headings(soup): 100 | para = 0 101 | art = 0 102 | misc = 0 103 | for tag in soup.find_all("seqitem"): 104 | if "heading" not in tag.attrs: 105 | misc += 1 106 | elif tag.attrs["heading"].replace("\n", "").startswith("§"): 107 | para += 1 108 | elif tag.attrs["heading"].replace("\n", "").lower().startswith("art"): 109 | art += 1 110 | else: 111 | misc += 1 112 | return para, art, misc 113 | 114 | 115 | def add_tag(string, pos, end, tag): 116 | tag.string = string[pos:end] 117 | return [ 118 | bs4.element.NavigableString(string[:pos]), 119 | tag, 120 | bs4.element.NavigableString(string[end:]), 121 | ] 122 | 123 | 124 | def split_reference(string, len_main, len_suffix, soup): 125 | main_str = string[:len_main] 126 | suffix_str = string[len_main : len_main + len_suffix] 127 | law_str = string[len_main + len_suffix :] 128 | 129 | result = [soup.new_tag("main"), soup.new_tag("suffix"), soup.new_tag("lawname")] 130 | result[0].append(main_str) 131 | result[1].append(suffix_str) 132 | result[2].append(law_str) 133 | 134 | return result 135 | 136 | 137 | def handle_reference_match(match: StatutesMatchWithMainArea, section, soup, para, art): 138 | # Set internal references to ignore if seqitem unit (Art|§) does not match between 139 | # reference and target law 140 | if match.law_match_type == "internal": 141 | if 
(section.contents[-1][match.start :].startswith("§") and para == 0) or ( 142 | section.contents[-1][match.start :].lower().startswith("art") and art == 0 143 | ): 144 | match.law_match_type = "ignore" 145 | 146 | ref_tag = soup.new_tag("reference", pattern="inline") 147 | section.contents[-1:] = add_tag( 148 | section.contents[-1], 149 | match.start, 150 | match.end + match.suffix_len + match.law_len, 151 | ref_tag, 152 | ) 153 | ref_tag.contents = split_reference( 154 | ref_tag.string, match.end - match.start, match.suffix_len, soup 155 | ) 156 | ref_tag.contents[-1]["type"] = match.law_match_type 157 | 158 | 159 | def find_references_in_section(section, soup, extractor: StatutesExtractor, para, art): 160 | logs = [] 161 | match = extractor.search(section.contents[-1]) # Search first match 162 | while match: 163 | if match.has_main_area(): 164 | handle_reference_match(match, section, soup, para, art) 165 | match = extractor.search( 166 | section.contents[-1], pos=(0 if match.has_main_area() else match.end) 167 | ) 168 | return logs 169 | 170 | 171 | def find_references_in_soup(soup, extractor, para, art, text_tag_name="text"): 172 | logs = [] 173 | for text in soup.find_all(text_tag_name): 174 | if text.is_empty_element: 175 | continue 176 | assert text.string 177 | logs.extend(find_references_in_section(text, soup, extractor, para, art)) 178 | return logs 179 | 180 | 181 | ######################################################## 182 | # Functions: references without preceding 'article' or § 183 | ######################################################## 184 | # 185 | # 186 | # 187 | # def pos_in_orig_string(i, stemmed, orig): 188 | # prefix = stemmed[:i] 189 | # stemmed_tokens = regex.findall(r"[\w']+|[\W']+", prefix) 190 | # orig_tokens = regex.findall(r"[\w']+|[\W']+", orig) 191 | # # return (len(''.join(orig_tokens[:len(stemmed_tokens)-1])) + 192 | # # len(stemmed_tokens[-1]) # Precise position 193 | # return len("".join(orig_tokens[: len(stemmed_tokens)])) # Round to next boundary 194 | # 195 | # 196 | # def law_keys_to_regex(keys, min_length, max_length=-1): 197 | # pattern = "" 198 | # for key in keys: 199 | # if len(key) >= min_length and (len(key) <= max_length or max_length == -1): 200 | # pattern += regex.escape(key) + r"|" 201 | # pattern = pattern[:-1] 202 | # full_pattern = r"\b(?>" + pattern + r")\b" 203 | # return regex.compile(full_pattern, flags=regex.IGNORECASE) 204 | # 205 | # 206 | # def find_law_references_in_section(section, soup, law_regex_pattern, sanitizer): 207 | # for item in list(section.contents): 208 | # i_in_section = section.contents.index(item) 209 | # if type(item) is not bs4.element.NavigableString: 210 | # continue 211 | # test_string = sanitizer(item.string) 212 | # matches = law_regex_pattern.finditer(test_string) 213 | # for match in reversed(list(matches)): 214 | # orig_start = pos_in_orig_string(match.start(), test_string, item.string) 215 | # orig_end = pos_in_orig_string(match.end(), test_string, item.string) 216 | # 217 | # ref_tag = soup.new_tag("reference", pattern="generic") 218 | # 219 | # section.contents[i_in_section : i_in_section + 1] = add_tag( 220 | # section.contents[i_in_section], orig_start, orig_end, ref_tag 221 | # ) 222 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_reference_parse.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import os 4 | 5 | from quantlaw.de_extract.statutes_parse 
import StatutesParser, StringCaseException 6 | from quantlaw.de_extract.stemming import stem_law_name 7 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 8 | from quantlaw.utils.files import ensure_exists, list_dir 9 | 10 | from statics import ( 11 | DE_HELPERS_PATH, 12 | DE_REFERENCE_AREAS_PATH, 13 | DE_REFERENCE_PARSED_LOG_PATH, 14 | DE_REFERENCE_PARSED_PATH, 15 | DE_REG_HELPERS_PATH, 16 | DE_REG_REFERENCE_AREAS_PATH, 17 | DE_REG_REFERENCE_PARSED_LOG_PATH, 18 | DE_REG_REFERENCE_PARSED_PATH, 19 | ) 20 | from statutes_pipeline_steps.de_reference_parse_vso_list import ( 21 | identify_reference_in_juris_vso_list, 22 | ) 23 | from utils.common import ( 24 | RegulationsPipelineStep, 25 | copy_xml_schema_to_data_folder, 26 | get_stemmed_law_names_for_filename, 27 | ) 28 | 29 | 30 | class DeReferenceParseStep(RegulationsPipelineStep): 31 | max_number_of_processes = 2 32 | 33 | def __init__(self, law_names, *args, **kwargs): 34 | self.law_names = law_names 35 | super().__init__(*args, **kwargs) 36 | 37 | def get_items(self, overwrite) -> list: 38 | src = ( 39 | DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 40 | ) 41 | dest = ( 42 | DE_REG_REFERENCE_PARSED_PATH 43 | if self.regulations 44 | else DE_REFERENCE_PARSED_PATH 45 | ) 46 | 47 | ensure_exists(dest) 48 | files = list_dir(src, ".xml") 49 | 50 | ensure_exists(dest) 51 | files = list_dir(src, ".xml") 52 | 53 | if not overwrite: 54 | existing_files = os.listdir(dest) 55 | files = list(filter(lambda f: f not in existing_files, files)) 56 | 57 | copy_xml_schema_to_data_folder() 58 | 59 | return files 60 | 61 | def execute_item(self, item): 62 | src = ( 63 | DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH 64 | ) 65 | dest = ( 66 | DE_REG_REFERENCE_PARSED_PATH 67 | if self.regulations 68 | else DE_REFERENCE_PARSED_PATH 69 | ) 70 | 71 | laws_lookup = get_stemmed_law_names_for_filename(item, self.law_names) 72 | parser = StatutesParser(laws_lookup) 73 | 74 | logs = list() 75 | 76 | # for debug 77 | logs.append(f"Start file - {item}") 78 | 79 | soup = create_soup(f"{src}/{item}") 80 | parse_reference_content_in_soup(soup, parser, debug_context=item) 81 | current_lawid = soup.document.attrs["key"].split("_")[1] 82 | identify_reference_law_name_in_soup(soup, parser, current_lawid) 83 | identify_lawreference_law_name_in_soup(soup, laws_lookup) 84 | 85 | identify_reference_in_juris_vso_list(soup, parser) 86 | 87 | save_soup(soup, f"{dest}/{item}") 88 | return logs 89 | 90 | def finish_execution(self, results): 91 | logs = list(itertools.chain.from_iterable(results)) 92 | ensure_exists(DE_REG_HELPERS_PATH if self.regulations else DE_HELPERS_PATH) 93 | with open( 94 | DE_REG_REFERENCE_PARSED_LOG_PATH 95 | if self.regulations 96 | else DE_REFERENCE_PARSED_LOG_PATH, 97 | mode="w", 98 | ) as f: 99 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 100 | 101 | 102 | def parse_reference_content(reference, parser): 103 | citation = reference.main.get_text() 104 | reference_paths = parser.parse_main(citation) 105 | 106 | reference["parsed_verbose"] = json.dumps(reference_paths, ensure_ascii=False) 107 | reference_paths_simple = [ 108 | [component[1] for component in path] for path in reference_paths 109 | ] 110 | reference["parsed"] = json.dumps(reference_paths_simple, ensure_ascii=False) 111 | 112 | 113 | def parse_reference_content_in_soup(soup, parser, debug_context=None): 114 | for reference in soup.find_all("reference", {"pattern": "inline"}): 115 | if reference.main: 116 | 
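# NOTE (editorial comment, not part of the original file): parsing the citation
# text may raise a StringCaseException for unexpectedly cased tokens; such
# references are printed together with the file they occur in (debug_context)
# and simply keep no parsed attributes.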
try: 117 | parse_reference_content(reference, parser) 118 | except StringCaseException as error: 119 | print(error, "context", debug_context) 120 | 121 | 122 | def identify_reference_law_name_in_soup(soup, parser, current_lawid, skip_errors=False): 123 | for reference in soup.find_all("reference", {"pattern": "inline"}): 124 | 125 | lawid = parser.parse_law( 126 | reference.lawname.string, reference.lawname["type"], current_lawid 127 | ) 128 | 129 | try: 130 | ref_parts = json.loads(reference["parsed_verbose"]) 131 | 132 | if reference.lawname.attrs["type"] in ["internal", "dict", "sgb"]: 133 | for ref_part in ref_parts: 134 | if not lawid: 135 | print(reference) 136 | ref_part.insert(0, ["Gesetz", lawid]) 137 | reference["parsed_verbose"] = json.dumps(ref_parts, ensure_ascii=False) 138 | 139 | ref_parts = json.loads(reference["parsed"]) 140 | if reference.lawname.attrs["type"] in ["internal", "dict", "sgb"]: 141 | for ref_part in ref_parts: 142 | assert lawid 143 | ref_part.insert(0, lawid) 144 | reference["parsed"] = json.dumps(ref_parts, ensure_ascii=False) 145 | except KeyError: 146 | if skip_errors: 147 | print(reference) 148 | else: 149 | raise 150 | 151 | 152 | def identify_lawreference_law_name_in_soup(soup, laws_lookup): 153 | for reference in soup.find_all("reference", {"pattern": "generic"}): 154 | reference["parsed"] = [[laws_lookup[stem_law_name(reference.string)]]] 155 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/de_reference_parse_vso_list.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import regex 4 | 5 | # fmt: off 6 | from quantlaw.de_extract.statutes_parse import StatutesParser 7 | from quantlaw.de_extract.stemming import stem_law_name 8 | 9 | reference_trigger_pattern = regex.compile( 10 | r'(' 11 | r'§{1,2}|' 12 | r'\bArt\b\.?|' 13 | r'Artikels?n?' 14 | r')\s*' 15 | ) 16 | # fmt: on 17 | 18 | 19 | def identify_reference_in_juris_vso_list(soup, parser: StatutesParser): 20 | 21 | vso_tags = soup.find_all(["document", "seqitem"], attrs={"verweise": True}) 22 | for vso_tag in vso_tags: 23 | parsed_vso_refs = [] 24 | parsed_vso_refs_simple = [] 25 | verweise = ( 26 | [] 27 | if vso_tag.attrs["verweise"] == "[]" 28 | else json.loads(vso_tag.attrs["verweise"]) 29 | ) 30 | for verweis in verweise: 31 | if not verweis["typ"] in [ 32 | "Ermächtigung", 33 | "Rechtsgrundlage", 34 | "Durchführungsvorschrift", 35 | ]: 36 | # 'Vertragsgesetz', 'Sonderregelung', 'GLIEDERUNG', 'SAMMELVERWEISUNG', 37 | # 'Einführungsvorschrift', 'InnerstaatlDurchfVorschr' will be ignored 38 | continue 39 | if not verweis["normabk"]: 40 | continue 41 | lawname_stem = stem_law_name(verweis["normabk"]) 42 | match = parser.match_law_name(lawname_stem) 43 | print(match) 44 | # if match: 45 | # lawid = parser.laws_lookup[match] 46 | # parsed_vso_ref = [[["Gesetz", lawid]]] 47 | # parsed_vso_ref_simple = [[lawid]] 48 | # 49 | # # Append ref. 
details if present in raw data 50 | # enbez = verweis["enbez"] 51 | # if enbez and reference_trigger_pattern.match(enbez): 52 | # 53 | # try: 54 | # ( 55 | # reference_paths, 56 | # reference_paths_simple, 57 | # ) = parse_reference_string(enbez, debug_context=None) 58 | # 59 | # parsed_vso_ref = [ 60 | # parsed_vso_ref[0] + r for r in reference_paths 61 | # ] 62 | # parsed_vso_ref_simple = [ 63 | # parsed_vso_ref_simple[0] + r 64 | # for r in reference_paths_simple 65 | # ] 66 | # 67 | # except StringCaseException as error: 68 | # print(error, "context", enbez) 69 | # 70 | # parsed_vso_refs.extend(parsed_vso_ref) 71 | # parsed_vso_refs_simple.extend(parsed_vso_ref_simple) 72 | 73 | # Remove duplicates 74 | parsed_vso_refs = remove_duplicate_references(parsed_vso_refs) 75 | parsed_vso_refs_simple = remove_duplicate_references(parsed_vso_refs_simple) 76 | 77 | vso_tag.attrs["parsed_verbose"] = json.dumps( 78 | parsed_vso_refs, ensure_ascii=False 79 | ) 80 | vso_tag.attrs["parsed"] = json.dumps(parsed_vso_refs_simple, ensure_ascii=False) 81 | 82 | 83 | def remove_duplicate_references(references): 84 | res = [] 85 | res_str = [] 86 | for elem in references: 87 | elem_str = str(elem) 88 | if elem_str not in res_str: 89 | res.append(elem) 90 | res_str.append(elem_str) 91 | 92 | return res 93 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/hierarchy_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import networkx as nx 5 | from lxml import etree 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | from quantlaw.utils.pipeline import PipelineStep 8 | 9 | 10 | class HierarchyGraphStep(PipelineStep): 11 | def __init__(self, source, destination, add_subseqitems, *args, **kwargs): 12 | self.source = source 13 | self.destination = destination 14 | self.add_subseqitems = add_subseqitems 15 | super().__init__(*args, **kwargs) 16 | 17 | def get_items(self, overwrite) -> list: 18 | ensure_exists(self.destination) 19 | files = list_dir(self.source, ".xml") 20 | 21 | if not overwrite: 22 | existing_files = list_dir(self.destination, ".gpickle") 23 | files = list( 24 | filter(lambda f: get_gpickle_filename(f) not in existing_files, files) 25 | ) 26 | 27 | return files 28 | 29 | def execute_item(self, item): 30 | G = build_graph(f"{self.source}/{item}", add_subseqitems=self.add_subseqitems) 31 | 32 | destination_path = f"{self.destination}/{get_gpickle_filename(item)}" 33 | nx.write_gpickle(G, destination_path) 34 | 35 | 36 | ########### 37 | # Functions 38 | ########### 39 | 40 | 41 | def get_gpickle_filename(filename): 42 | return f"{os.path.splitext(filename)[0]}.gpickle" 43 | 44 | 45 | def add_juris_attrs(item, node_attrs): 46 | if item.attrib.get("normgeber"): 47 | node_attrs["legislators"] = item.attrib["normgeber"] 48 | if item.attrib.get("mitwirkende"): 49 | node_attrs["contributors"] = item.attrib["mitwirkende"] 50 | if item.attrib.get("sachgebiete"): 51 | node_attrs["subject_areas"] = item.attrib["sachgebiete"] 52 | 53 | 54 | def nest_items(G, items, document_type): 55 | """ 56 | Convert xml soup to graph tree using networkx 57 | """ 58 | for item in items: 59 | if item.tag != "document": 60 | node_attrs = dict( 61 | key=item.attrib["key"], 62 | citekey=item.attrib.get("citekey", ""), 63 | heading=item.attrib.get("heading", ""), 64 | parent_key=item.getparent().attrib["key"], 65 | level=int(item.attrib["level"]), 66 | type=item.tag, 67 | ) 68 | if 
document_type: 69 | node_attrs["document_type"] = document_type 70 | add_juris_attrs(item, node_attrs) 71 | 72 | G.add_node(item.attrib["key"], **node_attrs) 73 | G.add_edge(item.getparent().attrib["key"], item.attrib["key"]) 74 | 75 | else: # handle root node 76 | 77 | node_attrs = dict( 78 | key=item.attrib["key"], 79 | citekey=item.attrib.get("citekey", ""), 80 | heading=item.attrib.get("heading", ""), 81 | parent_key="", 82 | level=0, 83 | type=item.tag, 84 | **(dict(document_type=document_type) if document_type else {}), 85 | ) 86 | if "abbr_1" in item.attrib: 87 | node_attrs["abbr_1"] = item.attrib["abbr_1"] 88 | if "abbr_2" in item.attrib: 89 | node_attrs["abbr_2"] = item.attrib["abbr_2"] 90 | add_juris_attrs(item, node_attrs) 91 | 92 | G.add_node(item.attrib["key"], **node_attrs) 93 | G.graph["name"] = item.attrib.get("heading", "") 94 | 95 | return G 96 | 97 | 98 | def count_characters(text, whites=False): 99 | """ 100 | Get character count of a text 101 | 102 | Args: 103 | whites: If True, whitespace characters are included in the count 104 | """ 105 | if whites: 106 | return len(text) 107 | else: 108 | return len(re.sub(r"\s", "", text)) 109 | 110 | 111 | def count_tokens(text, unique=False): 112 | """ 113 | Get token count of given text. Tokens are delimited by whitespaces. 114 | Args: 115 | unique: If True, only unique tokens are counted. 116 | """ 117 | if not unique: 118 | return len(text.split()) 119 | else: 120 | return len(set(text.split())) 121 | 122 | 123 | def build_graph(filename, add_subseqitems=False): 124 | """ 125 | Builds a hierarchy graph from an XML file. 126 | """ 127 | 128 | # Read input file 129 | tree = etree.parse(filename) 130 | 131 | document_type = ( 132 | tree.xpath("/document")[0].attrib.get("document_type", None) 133 | if tree.xpath("/document") 134 | else None 135 | ) 136 | 137 | # Create target graph 138 | G = nx.DiGraph() 139 | 140 | xpath = ( 141 | "//document | //item | //seqitem | //subseqitem" 142 | if add_subseqitems 143 | else "//document | //item | //seqitem" 144 | ) 145 | 146 | # Create a tree of the elements in the target graph 147 | G = nest_items(G, items=tree.xpath(xpath), document_type=document_type) 148 | 149 | # Add attributes regarding the contained text to the target graph 150 | for item in tree.xpath(xpath): 151 | text = " ".join(item.itertext()) 152 | G.nodes[item.attrib["key"]]["chars_n"] = count_characters(text, whites=True) 153 | G.nodes[item.attrib["key"]]["chars_nowhites"] = count_characters( 154 | text, whites=False 155 | ) 156 | G.nodes[item.attrib["key"]]["tokens_n"] = count_tokens(text, unique=False) 157 | G.nodes[item.attrib["key"]]["tokens_unique"] = count_tokens(text, unique=True) 158 | 159 | items_with_text = {elem.getparent() for elem in tree.xpath("//text")} 160 | for item in items_with_text: 161 | all_elems = item.getchildren() 162 | text_elems = [e for e in all_elems if e.tag == "text"] 163 | if len(all_elems) > 1 and text_elems: 164 | texts_tokens_n = [] 165 | texts_chars_n = [] 166 | for elem in text_elems: 167 | text = " ".join(elem.itertext()) 168 | texts_tokens_n.append(str(count_tokens(text, unique=False))) 169 | texts_chars_n.append(str(count_characters(text, whites=False))) 170 | G.nodes[item.attrib["key"]]["texts_tokens_n"] = ",".join(texts_tokens_n) 171 | G.nodes[item.attrib["key"]]["texts_chars_n"] = ",".join(texts_chars_n) 172 | 173 | return G 174 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/snapshot_mapping_edgelist.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | from collections import Counter, deque 5 | from multiprocessing import Pool 6 | 7 | import networkx as nx 8 | import textdistance 9 | import tqdm 10 | from quantlaw.utils.beautiful_soup import create_soup 11 | from quantlaw.utils.files import ensure_exists, list_dir 12 | from quantlaw.utils.networkx import get_leaves 13 | from quantlaw.utils.pipeline import PipelineStep 14 | from regex import regex 15 | 16 | from utils.common import get_snapshot_law_list, invert_dict_mapping_unique 17 | from utils.string_list_contains import StringContainsAlign 18 | 19 | 20 | class SnapshotMappingEdgelistStep(PipelineStep): 21 | max_number_of_processes = 1 22 | 23 | def __init__( 24 | self, 25 | source, 26 | destination, 27 | interval, 28 | dataset, 29 | min_text_length=50, 30 | radius=5, 31 | distance_threshold=0.9, 32 | *args, 33 | **kwargs, 34 | ): 35 | self.source = source 36 | self.destination = destination 37 | self.interval = interval 38 | self.dataset = dataset 39 | self.min_text_length = min_text_length 40 | self.radius = radius 41 | self.distance_threshold = distance_threshold 42 | super().__init__(*args, **kwargs) 43 | 44 | def get_items(self, overwrite, snapshots) -> list: 45 | ensure_exists(self.destination) 46 | items = sorted(list_dir(self.source, ".pickle")) 47 | items = [i[: -len(".pickle")] for i in items] 48 | 49 | # Create mappings to draw the edges 50 | mappings = [ 51 | (file1, file2) 52 | for file1, file2 in zip(items[: -self.interval], items[self.interval :]) 53 | ] 54 | 55 | if snapshots: 56 | mappings = list(filter(lambda f: f[0] in snapshots, mappings)) 57 | 58 | if not overwrite: 59 | existing_files = list_dir(self.destination, ".json") 60 | mappings = list( 61 | filter(lambda x: mapping_filename(x) not in existing_files, mappings) 62 | ) 63 | 64 | return mappings 65 | 66 | def execute_item(self, item): 67 | filename1, filename2 = item 68 | 69 | data1 = self.load_pickle(filename1) 70 | data2 = self.load_pickle(filename2) 71 | 72 | # STEP 1: perfect matches unique when considering text 73 | new_mappings = map_unique_texts( 74 | data1, data2, min_text_length=self.min_text_length 75 | ) 76 | remaining_keys1, remaining_keys2 = get_remaining( 77 | data1["keys"], data2["keys"], new_mappings, printing=f"{item}/Step 1" 78 | ) 79 | 80 | # STEP 2: perfect matches unique when considering text _and_ citekey 81 | new_mappings_current_step = map_same_citekey_same_text( 82 | data1, data2, remaining_keys1, remaining_keys2 83 | ) 84 | new_mappings = {**new_mappings_current_step, **new_mappings} 85 | del new_mappings_current_step 86 | remaining_keys1, remaining_keys2 = get_remaining( 87 | data1["keys"], data2["keys"], new_mappings, printing=f"{item}/Step 2" 88 | ) 89 | 90 | # STEP 3: text appended/prepended/removed 91 | new_mappings_current_step = map_text_containment( 92 | data1, data2, remaining_keys1, remaining_keys2 93 | ) 94 | new_mappings = {**new_mappings_current_step, **new_mappings} 95 | del new_mappings_current_step 96 | remaining_keys1, remaining_keys2 = get_remaining( 97 | data1["keys"], data2["keys"], new_mappings, printing=f"{item}/Step 3" 98 | ) 99 | 100 | # STEP 4: neighborhood matching 101 | data_keys1 = data1["keys"] 102 | data_keys2 = data2["keys"] 103 | data_texts1 = data1["texts"] 104 | data_texts2 = data2["texts"] 105 | del data1 106 | del data2 107 | 108 | common_neighbor_kwargs = dict( 109 | new_mappings=new_mappings, 110 | 
data_keys1=data_keys1, 111 | data_keys2=data_keys2, 112 | data_texts1=data_texts1, 113 | data_texts2=data_texts2, 114 | remaining_keys1=remaining_keys1, 115 | remaining_keys2=remaining_keys2, 116 | radius=self.radius, 117 | distance_threshold=self.distance_threshold, 118 | ) 119 | 120 | text_distance_cache = map_similar_text_common_neighbors( 121 | **common_neighbor_kwargs, 122 | printing=str(item), 123 | dry_run=True, 124 | ) 125 | text_distance_cache = update_textdistance_cache(text_distance_cache) 126 | map_similar_text_common_neighbors( 127 | **common_neighbor_kwargs, 128 | printing=str(item), 129 | text_distance_cache=text_distance_cache, 130 | ) 131 | 132 | dest_path = f"{self.destination}/{mapping_filename(item)}" 133 | with open(dest_path, "w") as f: 134 | json.dump(new_mappings, f) 135 | 136 | # only called to print stats 137 | get_remaining(data_keys1, data_keys2, new_mappings, printing=f"{item}/DONE") 138 | 139 | def load_pickle(self, snapshot): 140 | with open(os.path.join(self.source, snapshot + ".pickle"), "rb") as f: 141 | raw_data = pickle.load(f) 142 | return raw_data 143 | 144 | 145 | def mapping_filename(mapping): 146 | """ 147 | returns the filename mappings are stored in 148 | """ 149 | filename1, filename2 = mapping 150 | result = f"{filename1}_{filename2}.json" 151 | return result 152 | 153 | 154 | def load_crossref_graph(filename, source): 155 | graph_path = f"{source}/{filename}" 156 | G = nx.read_gpickle(graph_path) 157 | return G 158 | 159 | 160 | def get_remaining(data_keys1, data_keys2, new_mappings, asserting=True, printing=True): 161 | """ 162 | Prints stats and returns keys of both snapshots to be matched 163 | """ 164 | remaining_keys1 = set(data_keys1) - set(new_mappings.keys()) 165 | remaining_keys2 = set(data_keys2) - set(new_mappings.values()) 166 | if asserting: 167 | assert len(set(new_mappings.keys())) == len(set(new_mappings.values())) 168 | if printing: 169 | print( 170 | f"\n{printing}; " 171 | f"Progress {len(new_mappings)/min(len(data_keys1), len(data_keys2))}; " 172 | f"Remaining keys: {len(remaining_keys1)} {len(remaining_keys2)}; " 173 | ) 174 | return remaining_keys1, remaining_keys2 175 | 176 | 177 | def get_leaf_texts_to_compare( 178 | graph_filename, G, source_text, source_text_reg, law_names_data, dataset 179 | ): 180 | """ 181 | get text for leaves of a hierarchy graph. Can be seqitem or supseqitem graph. 182 | Leaves are only seqitems or supseqitems. 
183 | """ 184 | leaf_keys = get_leaves(G) 185 | 186 | snapshot = graph_filename[: -len(".gpickle.gz")] 187 | 188 | if dataset == "us": 189 | files = [ 190 | os.path.join(source_text, x) 191 | for x in list_dir(source_text, ".xml") 192 | if x.split(".")[0].split("_")[-1] == snapshot 193 | ] 194 | if source_text_reg: 195 | files += [ 196 | os.path.join(source_text_reg, x) 197 | for x in list_dir(source_text_reg, ".xml") 198 | if x.split(".")[0].split("_")[-1] == snapshot 199 | ] 200 | files.sort() 201 | else: # is DE 202 | files = get_snapshot_law_list(snapshot, law_names_data) 203 | files = [os.path.join(source_text, f) for f in files] 204 | 205 | whitespace_pattern = regex.compile(r"[\s\n]+") 206 | texts = {} 207 | for file in files: 208 | print(f"\r{files.index(file)} / {len(files)}", end="") 209 | soup = create_soup(file) 210 | tags = soup.find_all(["seqitem", "subseqitem"]) 211 | for tag in tags: 212 | if tag["key"] in leaf_keys: 213 | text = tag.get_text(" ") 214 | text = whitespace_pattern.sub(" ", text).lower().strip() 215 | texts[tag["key"]] = text.lower() 216 | return texts 217 | 218 | 219 | def map_unique_texts(data1, data2, min_text_length=50): 220 | """ 221 | Maps nodes from snapshot t1 to t2 if texts are in each snapshot unique and appear 222 | in the both snapshots 223 | """ 224 | leaf_texts1 = {k: t for k, t in zip(data1["keys"], data1["texts"])} 225 | leaf_texts2 = {k: t for k, t in zip(data2["keys"], data2["texts"])} 226 | 227 | # Create dicts with text as keys 228 | inverted_unique_leaf_texts1 = invert_dict_mapping_unique(leaf_texts1) 229 | inverted_unique_leaf_texts2 = invert_dict_mapping_unique(leaf_texts2) 230 | 231 | # find unique texts in both snapshots 232 | both_unique_texts = set(inverted_unique_leaf_texts1.keys()) & set( 233 | inverted_unique_leaf_texts2.keys() 234 | ) 235 | 236 | # Filter for texts with min length 237 | both_unique_texts = {x for x in both_unique_texts if len(x) >= min_text_length} 238 | 239 | # Create mapping 240 | new_mappings = {} 241 | for text in both_unique_texts: 242 | new_mappings[inverted_unique_leaf_texts1[text]] = inverted_unique_leaf_texts2[ 243 | text 244 | ] 245 | return new_mappings 246 | 247 | 248 | def map_same_citekey_same_text(data1, data2, remaining_keys1, remaining_keys2): 249 | text_and_citekeys1 = { 250 | k: (c.lower(), t) 251 | for k, t, c in zip(data1["keys"], data1["texts"], data1["citekeys"]) 252 | if c and k in remaining_keys1 253 | } 254 | text_and_citekeys2 = { 255 | k: (c.lower(), t) 256 | for k, t, c in zip(data2["keys"], data2["texts"], data2["citekeys"]) 257 | if c and k in remaining_keys2 258 | } 259 | inverted_text_and_citekeys1 = invert_dict_mapping_unique(text_and_citekeys1) 260 | inverted_text_and_citekeys2 = invert_dict_mapping_unique(text_and_citekeys2) 261 | 262 | both_unique_text_and_citekeys = set(inverted_text_and_citekeys1.keys()) & set( 263 | inverted_text_and_citekeys2.keys() 264 | ) 265 | 266 | # Create mapping 267 | new_mappings = {} 268 | for text_and_citekey in both_unique_text_and_citekeys: 269 | new_mappings[ 270 | inverted_text_and_citekeys1[text_and_citekey] 271 | ] = inverted_text_and_citekeys2[text_and_citekey] 272 | return new_mappings 273 | 274 | 275 | def clip_text_for_containment_matching(text): 276 | return text.split(" ", 1)[-1] # get rid of German Absatz numbers (e.g., "(1)") 277 | 278 | 279 | def map_text_containment( 280 | data1, 281 | data2, 282 | remaining_keys1, 283 | remaining_keys2, 284 | min_text_length=50, 285 | ): 286 | remaining_keys1_list = sorted(remaining_keys1) 287 | 
remaining_keys2_list = sorted(remaining_keys2) 288 | leaf_texts1_dict = {k: t for k, t in zip(data1["keys"], data1["texts"])} 289 | leaf_texts2_dict = {k: t for k, t in zip(data2["keys"], data2["texts"])} 290 | 291 | aligner = StringContainsAlign(min_text_length=min_text_length) 292 | aligner.text_list_0 = [ 293 | clip_text_for_containment_matching(leaf_texts1_dict[k]) 294 | for k in remaining_keys1_list 295 | ] 296 | aligner.text_list_1 = [ 297 | clip_text_for_containment_matching(leaf_texts2_dict[k]) 298 | for k in remaining_keys2_list 299 | ] 300 | aligner.create_index() 301 | 302 | containment_idx_forward = aligner.run() 303 | containment_idx_reversed = aligner.run(reversed=True) 304 | aligner.clean_index() 305 | 306 | containment_idx_reversed = [(v, u) for u, v in containment_idx_reversed] 307 | 308 | containment_idx = set(containment_idx_forward + containment_idx_reversed) 309 | 310 | # Filter one to one matches 311 | idx_1_counts = Counter(u for u, v in containment_idx) 312 | idx_2_counts = Counter(v for u, v in containment_idx) 313 | 314 | unique_keys_1 = {idx for idx, cnt in idx_1_counts.items() if cnt == 1} 315 | unique_keys_2 = {idx for idx, cnt in idx_2_counts.items() if cnt == 1} 316 | 317 | new_mappings = {} 318 | for u, v in containment_idx_forward + containment_idx_reversed: 319 | if u in unique_keys_1 and v in unique_keys_2: 320 | u_key = remaining_keys1_list[u] 321 | v_key = remaining_keys2_list[v] 322 | new_mappings[u_key] = v_key 323 | 324 | return new_mappings 325 | 326 | 327 | def get_neighborhood(data_keys, node, radius, keys_len, key_index_dict): 328 | 329 | curr_index = key_index_dict[node] 330 | lower_bound = max(0, curr_index - radius) 331 | upper_bound = min(keys_len, curr_index + radius) 332 | 333 | neighborhood = data_keys[lower_bound : upper_bound + 1] 334 | 335 | # Remove node in radius but of another law/title as their order ist mostly arbitrary 336 | key_prefix = node.split("_")[0] 337 | neighborhood = {n for n in neighborhood if n.startswith(key_prefix)} 338 | 339 | return neighborhood 340 | 341 | 342 | def cached_text_distance(s1, s2, cache, dry_run): 343 | key = (s1, s2) 344 | if dry_run: 345 | distance = None 346 | cache[key] = distance 347 | elif key not in cache: 348 | distance = textdistance.jaro_winkler(s1, s2) 349 | cache[key] = distance 350 | else: 351 | distance = cache[key] 352 | return distance 353 | 354 | 355 | def calc_text_distance(args): 356 | return textdistance.jaro_winkler(*args) 357 | 358 | 359 | def update_textdistance_cache(text_distance_cache): 360 | text_distance_texts = list(text_distance_cache.keys()) 361 | with Pool() as p: 362 | distances = tqdm.tqdm( 363 | p.imap(calc_text_distance, text_distance_texts), 364 | total=len(text_distance_texts), 365 | ) 366 | return {k: v for k, v in zip(text_distance_texts, distances)} 367 | 368 | 369 | def map_similar_text_common_neighbors( 370 | new_mappings, 371 | data_keys1, 372 | data_keys2, 373 | data_texts1, 374 | data_texts2, 375 | remaining_keys1, 376 | remaining_keys2, 377 | radius=5, 378 | distance_threshold=0.9, 379 | printing=None, 380 | dry_run=False, 381 | text_distance_cache=None, 382 | ): 383 | if not text_distance_cache: 384 | text_distance_cache = dict() 385 | 386 | keys_len1 = len(data_keys1) 387 | keys_len2 = len(data_keys2) 388 | key_index_dict1 = {k: idx for idx, k in enumerate(data_keys1)} 389 | key_index_dict2 = {k: idx for idx, k in enumerate(data_keys2)} 390 | 391 | leaf_texts1 = {k: v for k, v in zip(data_keys1, data_texts1)} 392 | leaf_texts2 = {k: v for k, v in 
zip(data_keys2, data_texts2)} 393 | 394 | key_queue = deque(remaining_keys1) 395 | key_queue_set = set(key_queue) 396 | i = -1 # only to print the process 397 | while key_queue: 398 | remaining_key1 = key_queue.popleft() 399 | key_queue_set.remove(remaining_key1) 400 | i += 1 # only to print the process 401 | if i % 100 == 0 and printing: 402 | total = len(key_queue) + i 403 | print( 404 | f"\r{printing} " f"{i/total*100:.2f}% \t ({total} )", 405 | end="", 406 | ) 407 | 408 | remaining_text1 = leaf_texts1[remaining_key1] 409 | 410 | # Get neighborhood of node in G1 411 | # Get mapping to G2 for neighborhood nodes 412 | # Get neighborhood of mapped G2 nodes 413 | neighborhood_nodes1 = get_neighborhood( 414 | data_keys1, remaining_key1, radius, keys_len1, key_index_dict1 415 | ) 416 | neighborhood_nodes2 = set() 417 | 418 | for neighborhood_node1 in neighborhood_nodes1: 419 | if neighborhood_node1 in new_mappings: 420 | neighborhood_nodes2.update( 421 | get_neighborhood( 422 | data_keys2, 423 | new_mappings[neighborhood_node1], 424 | radius, 425 | keys_len2, 426 | key_index_dict2, 427 | ) 428 | ) 429 | 430 | # Remove duplicates in G2 neighborhood 431 | neighborhood_nodes2 = [x for x in neighborhood_nodes2 if x in remaining_keys2] 432 | 433 | # Find most similar text 434 | neighborhood_text2 = [leaf_texts2.get(x) for x in neighborhood_nodes2] 435 | similarity = [ 436 | cached_text_distance(remaining_text1, x, text_distance_cache, dry_run) 437 | if x 438 | else 0 439 | for x in neighborhood_text2 440 | ] 441 | if not dry_run: 442 | max_similarity = max(similarity) if similarity else 0 443 | 444 | if max_similarity > distance_threshold: 445 | # Add to mapping and update remaining_keys 446 | max_index = similarity.index(max_similarity) 447 | id2_to_match_to = neighborhood_nodes2[max_index] 448 | new_mappings[remaining_key1] = id2_to_match_to 449 | remaining_keys2.remove(id2_to_match_to) 450 | remaining_keys1.remove(remaining_key1) 451 | 452 | # Requeue neighborhood of newly mapped element 453 | neighborhood_to_requeue = [ 454 | n 455 | for n in neighborhood_nodes1 456 | if n in remaining_keys1 and n not in key_queue_set 457 | ] 458 | key_queue.extend(neighborhood_to_requeue) 459 | key_queue_set.update(neighborhood_to_requeue) 460 | 461 | print() 462 | return text_distance_cache 463 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/snapshot_mapping_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import networkx as nx 5 | from lxml import etree 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | from quantlaw.utils.pipeline import PipelineStep 8 | from regex import regex 9 | 10 | from utils.common import get_snapshot_law_list 11 | 12 | 13 | class SnapshotMappingIndexStep(PipelineStep): 14 | def __init__( 15 | self, 16 | source_text, 17 | destination, 18 | dataset, 19 | law_names_data=None, 20 | *args, 21 | **kwargs, 22 | ): 23 | self.source_text = source_text 24 | self.destination = destination 25 | self.dataset = dataset 26 | self.law_names_data = law_names_data 27 | super().__init__(*args, **kwargs) 28 | 29 | def get_items(self, overwrite, snapshots) -> list: 30 | ensure_exists(self.destination) 31 | items = snapshots 32 | if not overwrite: 33 | existing_files = list_dir(self.destination, ".pickle") 34 | items = list(filter(lambda x: (x + ".pickle") not in existing_files, items)) 35 | return items 36 | 37 | def execute_item(self, item): 38 | # 
Load texts 39 | item_data = list( 40 | get_texttags_to_compare( 41 | item, 42 | self.source_text, 43 | self.law_names_data, 44 | self.dataset, 45 | ) 46 | ) 47 | 48 | self.save_raw(item, item_data) 49 | 50 | def save_raw(self, item, item_data): 51 | 52 | keys, citekeys, texts = list(zip(*item_data)) 53 | 54 | pickle_path = os.path.join(self.destination, item + ".pickle") 55 | 56 | with open(pickle_path, "wb") as f: 57 | pickle.dump(dict(keys=keys, texts=texts, citekeys=citekeys), f) 58 | 59 | 60 | def load_crossref_graph(item, source): 61 | graph_path = f"{source}/{item}.gpickle.gz" 62 | G = nx.read_gpickle(graph_path) 63 | return G 64 | 65 | 66 | def get_texttags_to_compare(snapshot, source_texts, law_names_data, dataset): 67 | 68 | if dataset == "us": 69 | if type(source_texts) is str: 70 | source_texts = [source_texts] 71 | 72 | files = sorted( 73 | [ 74 | os.path.join(source_text, x) 75 | for source_text in source_texts 76 | for x in list_dir(source_text, ".xml") 77 | if x.split(".")[0].split("_")[-1] == snapshot 78 | ] 79 | ) 80 | else: # is DE 81 | assert type(source_texts) is str 82 | files = get_snapshot_law_list(snapshot, law_names_data) 83 | files = [os.path.join(source_texts, f) for f in files] 84 | 85 | whitespace_pattern = regex.compile(r"[\s\n]+") 86 | 87 | for file in files: 88 | tree = etree.parse(file) 89 | for text_tag in tree.xpath("//text"): 90 | item = text_tag.getparent() 91 | 92 | text_elems = [e for e in item.getchildren() if e.tag == "text"] 93 | pos_in_item = text_elems.index(text_tag) 94 | text_key = item.attrib["key"] + f"_{pos_in_item}" 95 | 96 | seqitem = get_seqitem(item) 97 | if seqitem is not None: 98 | citekey = seqitem.attrib.get("citekey") 99 | else: 100 | citekey = None 101 | 102 | text = etree.tostring(text_tag, method="text", encoding="utf8").decode( 103 | "utf-8" 104 | ) 105 | text = whitespace_pattern.sub(" ", text).lower().strip() 106 | 107 | yield text_key, citekey, text 108 | 109 | 110 | def get_seqitem(elem): 111 | if elem is None: 112 | return None 113 | elif elem.tag == "seqitem": 114 | return elem 115 | return get_seqitem(elem.getparent()) 116 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_authority_edgelist.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | 4 | import lxml.etree 5 | 6 | from statics import US_REG_AUTHORITY_EDGELIST_PATH 7 | from statutes_pipeline_steps.us_crossreference_edgelist import UsCrossreferenceEdgelist 8 | 9 | 10 | class UsAuthorityEdgelist(UsCrossreferenceEdgelist): 11 | @property 12 | def dest(self): 13 | assert self.regulations 14 | return US_REG_AUTHORITY_EDGELIST_PATH 15 | 16 | def make_edge_list(self, yearfile_path, key_dict): 17 | with open(yearfile_path, encoding="utf8") as f: 18 | file_elem = lxml.etree.parse(f) 19 | edge_list = [] 20 | 21 | # for debug 22 | # problem_matches = set() 23 | # problem_keys = set() 24 | 25 | for item in file_elem.xpath("//*[@auth_text_parsed]"): 26 | node_out = item.attrib.get("key") 27 | refs = itertools.chain.from_iterable( 28 | json.loads(item.attrib["auth_text_parsed"]) 29 | ) 30 | for ref in refs: 31 | key = "_".join(ref[:2]) 32 | node_in = key_dict.get(key) 33 | 34 | if node_in: 35 | edge_list.append([node_out, node_in]) 36 | 37 | return edge_list 38 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_crossreference_edgelist.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import lxml.etree 5 | import pandas as pd 6 | from quantlaw.utils.files import ensure_exists, list_dir 7 | 8 | from statics import ( 9 | US_CROSSREFERENCE_EDGELIST_PATH, 10 | US_CROSSREFERENCE_LOOKUP_PATH, 11 | US_REFERENCE_PARSED_PATH, 12 | US_REG_CROSSREFERENCE_EDGELIST_PATH, 13 | US_REG_CROSSREFERENCE_LOOKUP_PATH, 14 | US_REG_REFERENCE_PARSED_PATH, 15 | ) 16 | from utils.common import RegulationsPipelineStep 17 | 18 | 19 | class UsCrossreferenceEdgelist(RegulationsPipelineStep): 20 | def __init__(self, detailed_crossreferences, *args, **kwargs): 21 | self.detailed_crossreferences = detailed_crossreferences 22 | super().__init__(*args, **kwargs) 23 | 24 | def get_items(self, overwrite, snapshots) -> list: 25 | ensure_exists(self.dest) 26 | if not snapshots: 27 | snapshots = sorted( 28 | set([os.path.splitext(x)[0] for x in list_dir(self.lookup, ".csv")]) 29 | ) 30 | 31 | if not overwrite: 32 | existing_files = os.listdir(self.dest) 33 | snapshots = list( 34 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 35 | ) 36 | 37 | return snapshots 38 | 39 | @property 40 | def dest(self): 41 | return ( 42 | US_REG_CROSSREFERENCE_EDGELIST_PATH 43 | if self.regulations 44 | else US_CROSSREFERENCE_EDGELIST_PATH 45 | ) + ("/detailed" if self.detailed_crossreferences else "") 46 | 47 | @property 48 | def lookup(self): 49 | return ( 50 | US_REG_CROSSREFERENCE_LOOKUP_PATH 51 | if self.regulations 52 | else US_CROSSREFERENCE_LOOKUP_PATH 53 | ) + ("/detailed" if self.detailed_crossreferences else "") 54 | 55 | def execute_item(self, item): 56 | yearfiles = [ 57 | os.path.join(US_REFERENCE_PARSED_PATH, x) 58 | for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml") 59 | if str(item) in x 60 | ] 61 | if self.regulations: 62 | yearfiles += [ 63 | os.path.join(US_REG_REFERENCE_PARSED_PATH, x) 64 | for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml") 65 | if str(item) in x 66 | ] 67 | 68 | key_df = pd.read_csv(f"{self.lookup}/{item}.csv").dropna().set_index("citekey") 69 | key_dict = {} 70 | for idx, val in key_df.key.iteritems(): 71 | if idx not in key_dict: 72 | key_dict[idx] = val 73 | edge_list = [] 74 | for yearfile_path in yearfiles: 75 | edge_list_file = self.make_edge_list(yearfile_path, key_dict) 76 | edge_list.extend(edge_list_file) 77 | if edge_list: 78 | df = pd.DataFrame(edge_list, columns=["out_node", "in_node"]) 79 | df.to_csv(f"{self.dest}/{item}.csv", index=False) 80 | 81 | def make_edge_list(self, yearfile_path, key_dict): 82 | with open(yearfile_path, encoding="utf8") as f: 83 | file_elem = lxml.etree.parse(f) 84 | edge_list = [] 85 | 86 | if self.detailed_crossreferences: 87 | for ref_elem in file_elem.xpath(".//reference"): 88 | node_out = ref_elem.getparent().getparent().attrib.get("key") 89 | refs = json.loads(ref_elem.attrib["parsed"]) 90 | for ref in refs: 91 | for cutoff in range(len(ref), 1, -1): 92 | key = "_".join(ref[:cutoff]) 93 | node_in = key_dict.get(key) 94 | if node_in: 95 | edge_list.append([node_out, node_in]) 96 | break 97 | else: 98 | for seqitem_elem in file_elem.xpath("//seqitem"): 99 | node_out = seqitem_elem.attrib.get("key") 100 | for ref_elem in seqitem_elem.xpath(".//reference"): 101 | refs = json.loads(ref_elem.attrib["parsed"]) 102 | for ref in refs: 103 | for cutoff in range(len(ref), 1, -1): 104 | key = "_".join(ref[:cutoff]) 105 | node_in = key_dict.get(key) 106 | if node_in: 107 | edge_list.append([node_out, node_in]) 108 | 
break 109 | return edge_list 110 | 111 | 112 | ########### 113 | # Functions 114 | ########### 115 | 116 | 117 | def get_filename(date): 118 | return f"{date}.csv" 119 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_crossreference_lookup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import lxml.etree 4 | import pandas as pd 5 | from quantlaw.utils.files import ensure_exists, list_dir 6 | 7 | from statics import ( 8 | US_CROSSREFERENCE_LOOKUP_PATH, 9 | US_REFERENCE_PARSED_PATH, 10 | US_REG_CROSSREFERENCE_LOOKUP_PATH, 11 | US_REG_REFERENCE_PARSED_PATH, 12 | ) 13 | from utils.common import RegulationsPipelineStep 14 | 15 | 16 | class UsCrossreferenceLookup(RegulationsPipelineStep): 17 | def __init__(self, detailed_crossreferences, *args, **kwargs): 18 | self.detailed_crossreferences = detailed_crossreferences 19 | super().__init__(*args, **kwargs) 20 | 21 | def get_items(self, overwrite, snapshots) -> list: 22 | ensure_exists(self.dest) 23 | 24 | # If snapshots not set, create list of all years 25 | if not snapshots: 26 | snapshots = sorted( 27 | set( 28 | [ 29 | x.split(".")[0].split("_")[-1] 30 | for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml") 31 | ] 32 | ) 33 | ) 34 | 35 | if not overwrite: 36 | existing_files = os.listdir(self.dest) 37 | snapshots = list( 38 | filter(lambda f: get_filename(f) not in existing_files, snapshots) 39 | ) 40 | 41 | return snapshots 42 | 43 | @property 44 | def dest(self): 45 | return ( 46 | US_REG_CROSSREFERENCE_LOOKUP_PATH 47 | if self.regulations 48 | else US_CROSSREFERENCE_LOOKUP_PATH 49 | ) + ("/detailed" if self.detailed_crossreferences else "") 50 | 51 | def execute_item(self, item): 52 | yearfiles = [ 53 | os.path.join(US_REFERENCE_PARSED_PATH, x) 54 | for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml") 55 | if str(item) in x 56 | ] 57 | if self.regulations: 58 | yearfiles += [ 59 | os.path.join(US_REG_REFERENCE_PARSED_PATH, x) 60 | for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml") 61 | if str(item) in x 62 | ] 63 | data = [] 64 | for file in yearfiles: 65 | with open(file, encoding="utf8") as f: 66 | file_elem = lxml.etree.parse(f) 67 | for node in file_elem.xpath("//*[@citekey]"): 68 | data.append([node.attrib["key"], node.attrib["citekey"]]) 69 | if self.detailed_crossreferences: 70 | for node in file_elem.xpath("//*[@citekey_detailed]"): 71 | for citekey in node.attrib["citekey_detailed"].split(","): 72 | data.append([node.attrib["key"], citekey]) 73 | df = pd.DataFrame(data, columns=["key", "citekey"]) 74 | destination_file = f"{self.dest}/{get_filename(item)}" 75 | df.to_csv(destination_file, index=False) 76 | 77 | 78 | def get_filename(year): 79 | return f"{year}.csv" 80 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_prepare_input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | 5 | from quantlaw.utils.files import ensure_exists 6 | 7 | from statics import US_INPUT_PATH, US_ORIGINAL_PATH 8 | 9 | 10 | def us_prepare_input(): 11 | """ 12 | moves source files into main dir and validate files roughly 13 | """ 14 | 15 | ensure_exists(US_ORIGINAL_PATH) 16 | 17 | subfolders = [f.name for f in os.scandir(US_INPUT_PATH) if f.is_dir()] 18 | for subfolder in subfolders: 19 | for item in os.listdir(f"{US_INPUT_PATH}/{subfolder}"): 20 | 21 | # Filter by filename pattern 22 | 
pattern = re.compile(r"(\d+)usc(\d+)(a)?\.html?", flags=re.IGNORECASE) 23 | match = pattern.fullmatch(item) 24 | if not match: 25 | continue 26 | 27 | new_name = f'{match[2]}{"1" if match[3] else "0"}_{match[1]}.htm' 28 | 29 | # Prevent overwriting files 30 | if os.path.exists(f"{US_ORIGINAL_PATH}/{new_name}"): 31 | print(f"{US_ORIGINAL_PATH}/{new_name} already exists") 32 | else: 33 | shutil.copy( 34 | f"{US_INPUT_PATH}/{subfolder}/{item}", 35 | f"{US_ORIGINAL_PATH}/{new_name}", 36 | ) 37 | 38 | files = os.listdir(US_ORIGINAL_PATH) 39 | files = [f for f in files if f.endswith(".htm")] 40 | pattern = re.compile(r"(\d+)_(\d+)\.htm") 41 | years = {} 42 | for file in files: 43 | match = pattern.fullmatch(file) 44 | year = match[2] 45 | title = match[1] 46 | years[year] = years[year] if years.get(year) else [] 47 | years[year].append(title) 48 | 49 | for idx in list(years.keys()): 50 | years[idx] = sorted(years[idx]) 51 | 52 | print(f"{len(files)} files found") 53 | print(f"{len(years)} years found") 54 | 55 | for year in sorted(years.keys()): 56 | titles = years[year] 57 | print(f"{year}: n={len(titles)}, max='{max(titles)}'") 58 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reference_areas.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import multiprocessing 3 | import os 4 | 5 | import bs4 6 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 7 | from quantlaw.utils.files import ensure_exists, list_dir 8 | from regex import regex 9 | 10 | from statics import ( 11 | US_HELPERS_PATH, 12 | US_REFERENCE_AREAS_LOG_PATH, 13 | US_REFERENCE_AREAS_PATH, 14 | US_REG_HELPERS_PATH, 15 | US_REG_REFERENCE_AREAS_LOG_PATH, 16 | US_REG_REFERENCE_AREAS_PATH, 17 | US_REG_XML_PATH, 18 | US_XML_PATH, 19 | ) 20 | from statutes_pipeline_steps.us_reference_reg import find_authority_references 21 | from utils.common import RegulationsPipelineStep 22 | 23 | 24 | class UsReferenceAreasStep(RegulationsPipelineStep): 25 | max_number_of_processes = max(int(multiprocessing.cpu_count() / 2), 1) 26 | 27 | def get_items(self, overwrite) -> list: 28 | src = US_REG_XML_PATH if self.regulations else US_XML_PATH 29 | dest = ( 30 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 31 | ) 32 | ensure_exists(dest) 33 | files = list_dir(src, ".xml") 34 | 35 | if not overwrite: 36 | existing_files = os.listdir(dest) 37 | files = list(filter(lambda f: f not in existing_files, files)) 38 | 39 | return files 40 | 41 | def execute_item(self, item): 42 | src = US_REG_XML_PATH if self.regulations else US_XML_PATH 43 | dest = ( 44 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 45 | ) 46 | soup = create_soup(f"{src}/{item}") 47 | logs = find_references(soup, usc_pattern, {"pattern": "block"}) 48 | logs += find_references(soup, inline_pattern, {"pattern": "inline"}) 49 | 50 | if self.regulations: 51 | logs += find_authority_references(soup, usc_pattern) 52 | 53 | save_soup(soup, f"{dest}/{item}") 54 | return logs 55 | 56 | def finish_execution(self, results): 57 | logs = list(itertools.chain.from_iterable(results)) 58 | ensure_exists(US_REG_HELPERS_PATH if self.regulations else US_HELPERS_PATH) 59 | log_path = ( 60 | US_REG_REFERENCE_AREAS_LOG_PATH 61 | if self.regulations 62 | else US_REFERENCE_AREAS_LOG_PATH 63 | ) 64 | with open(log_path, mode="w") as f: 65 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 66 | 67 | 68 | 
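# ----------------------------------------------------------------------
# Usage sketch (illustrative): the block pattern defined below is intended
# to capture citations such as "42 U.S.C. 1983" or "29 C.F.R. Part 1910",
# while the inline pattern targets forms such as "section 101(a) of this
# title". These example strings are assumptions for illustration only;
# actual coverage is determined by the full regex definitions that follow.
# A minimal sketch, mirroring UsReferenceAreasStep.execute_item above
# (the input file name is hypothetical):
#
#     soup = create_soup("some_title_file.xml")  # hypothetical input file
#     logs = find_references(soup, usc_pattern, {"pattern": "block"})
#     logs += find_references(soup, inline_pattern, {"pattern": "inline"})
# ----------------------------------------------------------------------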
################ 69 | # Regex patterns 70 | ################ 71 | 72 | # fmt: off 73 | 74 | regex_definitions = ( 75 | r'(?(DEFINE)' 76 | r'(?' 77 | r'(\d+([\da-zA-Z\-\–\—\.]*[\da-zA-Z\-\–\—])?)' 78 | r'(\(\d*[a-z]{0,3}i*\))*' 79 | r'(\s+et\.?\s+seq\.?)?' 80 | r'(\s+and\sfollowing)?' 81 | r')' 82 | r'(?' 83 | r'(\(\d*[a-z]{0,2}i?\))+' 84 | r'(\s+et\.?\s+seq\.?)?' 85 | r')' 86 | r'(?' 87 | r',?\s+(and|or|to|through)(\sin)?\s+|' 88 | r'(,|;)\s+' 89 | r')' 90 | r')' 91 | ) 92 | 93 | usc_pattern_string = regex_definitions + ( 94 | r'(' 95 | r'(\d+)\s*' 96 | r'(' 97 | r'U\.?S\.?C\.?' 98 | r'|' 99 | r'C\.?F\.?R\.?' 100 | r')\s*' 101 | r'(Sec(?:tions?|\.)?|§§?|\b(sub)?Parts?)?\s*' 102 | r'(?&sec)' 103 | r'((?&conn)(Sec(?:tions|\.)?|§§?|\b(sub)?Parts?)?\s*(?&sec)|(?&conn)(?&numb))*' 104 | r')' 105 | r'(?!\w*(\sApp\.)?\s(U\.?S\.?C\.?|C\.?F\.?R\.?|Stat\.))' 106 | r'\s*' 107 | r'(' 108 | r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))' 109 | r'|' 110 | r'(of\stitle\s\d+)' 111 | r')?' 112 | r'(' 113 | r'\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations' 114 | r'|' 115 | r'\s+of\s+the\s+Code\s+of\s+the\s+United\s+States' 116 | r')?' 117 | ) 118 | usc_pattern = regex.compile(usc_pattern_string, flags=regex.IGNORECASE) 119 | 120 | inline_pattern_string = regex_definitions + ( 121 | r'(Sec(?:tion|\.)?|§§?|\b(sub)?parts?)\s*' 122 | r'(?&sec)' 123 | r'(' 124 | r'(?&conn)' 125 | r'(Sec(?:tions?|\.)?|§§?)?' 126 | r'\s*' 127 | r'(?&sec)' 128 | r'|' 129 | r'(?&conn)(?&numb)' 130 | r')*' 131 | r'\s*' 132 | r'(' 133 | r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))' 134 | r'|' 135 | r'(of\stitle\s\d+)' 136 | r')?' 137 | r'(' 138 | r'\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations' 139 | r'|' 140 | r'\s+of\s+the\s+Code\s+of\s+the\s+United\s+States' 141 | r')?' 
142 | ) 143 | inline_pattern = regex.compile(inline_pattern_string, flags=regex.IGNORECASE) 144 | 145 | # fmt: on 146 | 147 | ########### 148 | # Functions 149 | ########### 150 | 151 | 152 | def add_tag(string, pos, end, tag): 153 | """ 154 | Wraps part of a string a given tag 155 | """ 156 | tag.string = string[pos:end] 157 | return [ 158 | bs4.element.NavigableString(string[:pos]), 159 | tag, 160 | bs4.element.NavigableString(string[end:]), 161 | ] 162 | 163 | 164 | def find_references(soup, pattern, attrs): 165 | """ 166 | Finds the references in the soup and marks them a tag 167 | """ 168 | logs = [] # For debug 169 | 170 | text_tags = list(soup.find_all("text")) 171 | for text_tag in text_tags: 172 | for text_tag_string in list(text_tag.contents): 173 | if type(text_tag_string) is not bs4.element.NavigableString: 174 | continue 175 | tag_cursor = text_tag_string 176 | last_match_end = 0 177 | matches = pattern.finditer(text_tag_string) 178 | for match in list(matches): 179 | if regex.match(r"\s?,?of\b", text_tag_string[match.end() :]): 180 | continue 181 | ref_tag = soup.new_tag("reference", **attrs) 182 | pre_text, ref_tag, post_text = add_tag( 183 | text_tag_string, match.start(), match.end(), ref_tag 184 | ) 185 | 186 | pre_text = pre_text[last_match_end:] 187 | last_match_end = match.end() 188 | 189 | tag_cursor.replace_with(ref_tag) 190 | ref_tag.insert_before(pre_text) 191 | ref_tag.insert_after(post_text) 192 | tag_cursor = post_text 193 | 194 | logs.append(f"{post_text[:50]} --- {match[0]}") # For debug 195 | 196 | return logs # For debug 197 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reference_parse.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import multiprocessing 4 | import os 5 | from builtins import Exception 6 | 7 | import regex 8 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 9 | from quantlaw.utils.files import ensure_exists, list_dir 10 | 11 | from statics import ( 12 | US_HELPERS_PATH, 13 | US_REFERENCE_AREAS_PATH, 14 | US_REFERENCE_PARSED_LOG_PATH, 15 | US_REFERENCE_PARSED_PATH, 16 | US_REG_HELPERS_PATH, 17 | US_REG_REFERENCE_AREAS_PATH, 18 | US_REG_REFERENCE_PARSED_LOG_PATH, 19 | US_REG_REFERENCE_PARSED_PATH, 20 | ) 21 | from utils.common import RegulationsPipelineStep 22 | 23 | 24 | class UsReferenceParseStep(RegulationsPipelineStep): 25 | max_number_of_processes = max(int(multiprocessing.cpu_count() / 2), 1) 26 | 27 | def get_items(self, overwrite) -> list: 28 | src = ( 29 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 30 | ) 31 | dest = ( 32 | US_REG_REFERENCE_PARSED_PATH 33 | if self.regulations 34 | else US_REFERENCE_PARSED_PATH 35 | ) 36 | 37 | ensure_exists(dest) 38 | files = list_dir(src, ".xml") 39 | 40 | if not overwrite: 41 | existing_files = os.listdir(dest) 42 | files = list(filter(lambda f: f not in existing_files, files)) 43 | return files 44 | 45 | def execute_item(self, item): 46 | from statutes_pipeline_steps.us_reference_reg import parse_authority_references 47 | 48 | src = ( 49 | US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH 50 | ) 51 | dest = ( 52 | US_REG_REFERENCE_PARSED_PATH 53 | if self.regulations 54 | else US_REFERENCE_PARSED_PATH 55 | ) 56 | 57 | soup = create_soup(f"{src}/{item}") 58 | 59 | this_title = self.get_title_from_filename(item) 60 | try: 61 | logs = parse_references(soup, this_title, this_usc=not 
self.regulations) 62 | logs += parse_authority_references(soup) 63 | except Exception: 64 | print(item) 65 | raise 66 | save_soup(soup, f"{dest}/{item}") 67 | return logs 68 | 69 | def finish_execution(self, results): 70 | logs = list(itertools.chain.from_iterable(results)) 71 | ensure_exists(US_REG_HELPERS_PATH if self.regulations else US_HELPERS_PATH) 72 | with open( 73 | US_REG_REFERENCE_PARSED_LOG_PATH 74 | if self.regulations 75 | else US_REFERENCE_PARSED_LOG_PATH, 76 | mode="w", 77 | ) as f: 78 | f.write("\n".join(sorted(logs, key=lambda x: x.lower()))) 79 | 80 | def get_title_from_filename(self, filename): 81 | if self.regulations: 82 | base = os.path.splitext(filename)[0] 83 | assert base.startswith("cfr") 84 | title_key = base.split("_")[0][len("cfr") :] 85 | return int(title_key) 86 | else: 87 | base = os.path.splitext(filename)[0] 88 | title_key = base.split("_")[0] 89 | assert title_key[-1] == "0" 90 | assert len(title_key) == 3 91 | return int(title_key[:-1]) 92 | 93 | 94 | ########### 95 | # Functions 96 | ########### 97 | 98 | 99 | def sortable_paragraph_number(string): 100 | MIN_DIGITS = 4 101 | digits = len(regex.match(r"^\d*", string)[0]) 102 | if not digits: 103 | return string 104 | return "0 " * (MIN_DIGITS - digits) + string 105 | 106 | 107 | split_pattern_short = regex.compile( 108 | r"\s*(?:\b|(?<=\d))(U\.?S\.?C|C\.?F\.?R)(?:\.|\b|(?=\d)|Sec\.)\s*", 109 | flags=regex.IGNORECASE, 110 | ) 111 | split_pattern_inline = regex.compile( 112 | # fmt: off 113 | r"\s*of\s+(?=(?:" 114 | r'(?:this\s(?:sub\-?)?(?:title|chapter|part|section|division|paragraph))' 115 | r'|' 116 | r'(?:title)' 117 | r"))" 118 | # fmt: on 119 | , 120 | flags=regex.IGNORECASE, 121 | ) 122 | sub_split_pattern = regex.compile( 123 | r"\s*,?\s*(?:and|or|,|;|throu?g?h?|to)\s+", flags=regex.IGNORECASE 124 | ) 125 | 126 | 127 | def get_enum_types(string): 128 | return ( 129 | bool(regex.fullmatch(r"[a-z]", string)), 130 | bool(regex.fullmatch(r"\d+", string)), 131 | bool(regex.fullmatch(r"[A-Z]", string)), 132 | bool(regex.fullmatch(r"[xvi]x{0,4}v?i{0,4}", string)), 133 | bool(regex.fullmatch(r"[XVI]X{0,4}V?I{0,4}", string)), 134 | bool(regex.fullmatch(r"([a-z])\1", string)), 135 | ) 136 | 137 | 138 | def enum_types_match(x, y): 139 | for a, b in zip(x, y): 140 | if a and b: 141 | return True 142 | return False 143 | 144 | 145 | # fmt: off 146 | 147 | inline_title_pattern = regex.compile( 148 | r'(?:' 149 | r'(this)\s(?:sub\-?)?(?:title|chapter|part|section|division|paragraph)' 150 | r'|' 151 | r'title\s(\d+)' 152 | r')' 153 | r'(\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations)?' 
154 | r'(\s+of\s+the\s+Code\s+of\s+the\s+United\s+States)?', 155 | flags=regex.IGNORECASE 156 | ) 157 | 158 | # fmt: on 159 | 160 | 161 | def extract_title_inline(text, this_title, this_usc): 162 | match = inline_title_pattern.fullmatch(text) 163 | assert match 164 | 165 | if bool(match[4]): 166 | usc = True 167 | elif bool(match[3]): 168 | usc = False 169 | else: 170 | usc = this_usc 171 | 172 | if match[1]: 173 | return this_title, usc 174 | elif match[2]: 175 | return int(match[2]), usc 176 | else: 177 | raise Exception(text) 178 | 179 | 180 | def split_block_reference(reference_str, debug_context=None): 181 | text_parts = split_pattern_short.split(reference_str) 182 | if not len(text_parts) == 3: 183 | print("ERROR", text_parts, str(debug_context)) 184 | title = int(text_parts[0].strip()) 185 | usc = "u" in text_parts[1].lower() 186 | sub_text = text_parts[2] 187 | return usc, title, sub_text 188 | 189 | 190 | def parse_references(soup, this_title, this_usc): 191 | test_list = [] # For debug 192 | for ref_tag in soup.find_all("reference"): 193 | # Split into title and subtitle 194 | last_usc = None 195 | last_title = None 196 | if ref_tag["pattern"] == "block": 197 | usc, title, sub_text = split_block_reference( 198 | ref_tag.string, debug_context=ref_tag 199 | ) 200 | text_parts = split_pattern_inline.split(sub_text) 201 | if len(text_parts) == 2: 202 | last_title, last_usc = extract_title_inline( 203 | text_parts[1].strip(), this_title, this_usc 204 | ) 205 | sub_text = text_parts[0] 206 | elif len(text_parts) > 2: 207 | raise Exception(str(ref_tag)) 208 | 209 | elif ref_tag["pattern"] == "inline": 210 | text_parts = split_pattern_inline.split(ref_tag.string) 211 | if len(text_parts) == 2: 212 | title, usc = extract_title_inline( 213 | text_parts[1].strip(), this_title, this_usc 214 | ) 215 | sub_text = text_parts[0] 216 | elif len(text_parts) == 1: 217 | title = this_title 218 | sub_text = text_parts[0].strip() 219 | usc = this_usc 220 | else: 221 | raise Exception(str(ref_tag)) 222 | else: 223 | raise Exception(f"{str(ref_tag)} has not matching pattern") 224 | 225 | references = parse_reference_text(sub_text) 226 | add_title_to_reference(references, title, usc, last_title, last_usc) 227 | 228 | ref_tag["parsed"] = json.dumps(references, ensure_ascii=False) 229 | test_list.append(f"{sub_text} -- {json.dumps(references, ensure_ascii=False)}") 230 | return test_list 231 | 232 | 233 | def add_title_to_reference(references, title, usc, last_title=None, last_usc=None): 234 | # Add title to index 0 of reference 235 | for reference in references: 236 | if usc: 237 | title_str = str(title) 238 | else: 239 | title_str = "cfr" + str(title) 240 | reference.insert(0, title_str) 241 | if len(references) > 1 and last_title is not None: 242 | assert last_usc is not None 243 | if last_usc: 244 | title_str = str(last_title) 245 | else: 246 | title_str = "cfr" + str(last_title) 247 | references[-1][0] = title_str 248 | 249 | 250 | def parse_reference_text(sub_text): 251 | # Preformat ranges 252 | for match in regex.finditer( 253 | r"(\d+[a-z]{0,3})[\-\–\—](\d+[a-z]{0,3})", 254 | sub_text, 255 | flags=regex.IGNORECASE, 256 | ): 257 | if sortable_paragraph_number(match[1]) <= sortable_paragraph_number(match[2]): 258 | sub_text = ( 259 | f"{sub_text[:match.start()]}{match[1]} through " 260 | f"{match[2]}{sub_text[match.end():]}" 261 | ) 262 | 263 | sub_text = sub_text.replace(" and following", " et. 
seq.").strip() 264 | 265 | references = [] 266 | text_sub_splitted = sub_split_pattern.split(sub_text) 267 | for test_text in text_sub_splitted: 268 | match = regex.fullmatch( 269 | r"(?:§+|sec\.|sections?\b|(?:sub)?parts?\b)?\s*" 270 | r"(\d+[a-z]{0,3}(?:[\-\–\—\.]\d+[a-z]{0,3})?)" 271 | r"\s?" 272 | r"((?:\((?:\d*[a-z]{0,4})\))*)" 273 | r"(" 274 | r" et\.? seq\.?|" 275 | r" and following" 276 | r")?", 277 | test_text, 278 | flags=regex.IGNORECASE, 279 | ) 280 | if not match: 281 | # test_list.append(f'{test_text} -- {sub_text} -- {file}') 282 | continue 283 | sections = [match[1]] 284 | sub_sections = regex.split(r"[\(\)]+", match[2]) 285 | sub_sections = [o for o in sub_sections if len(o)] 286 | sections.extend(sub_sections) 287 | 288 | if sections[0]: 289 | references.append(sections) 290 | else: 291 | new_reference = None 292 | current_part_types = get_enum_types(sections[1]) 293 | for old_part in reversed(references[-1][1:]): 294 | if enum_types_match(current_part_types, get_enum_types(old_part)): 295 | new_reference = references[-1][: references[-1].index(old_part)] 296 | break 297 | if not new_reference: 298 | new_reference = references[-1][:] 299 | new_reference.extend(sections[1:]) 300 | references.append(new_reference) 301 | return references 302 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reference_reg.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bs4 import BeautifulSoup 4 | from regex.regex import Match 5 | 6 | from statutes_pipeline_steps.us_reference_parse import ( 7 | add_title_to_reference, 8 | parse_reference_text, 9 | split_block_reference, 10 | ) 11 | 12 | 13 | def find_authority_references(soup: BeautifulSoup, pattern: Match): 14 | logs = [] 15 | 16 | for tag in soup.find_all(auth_text=True): 17 | auth_text = tag.attrs["auth_text"] 18 | matches = [m[0] for m in pattern.finditer(auth_text)] 19 | tag.attrs["auth_text_areas"] = json.dumps(matches, ensure_ascii=False) 20 | return logs 21 | 22 | 23 | def parse_authority_references(soup: BeautifulSoup): 24 | logs = [] 25 | for tag in soup.find_all(auth_text_areas=True): 26 | auth_areas = json.loads(tag.attrs["auth_text_areas"]) 27 | auth_parsed = [] 28 | for auth_area in auth_areas: 29 | usc, title, sub_text = split_block_reference( 30 | auth_area, debug_context=tag.attrs["auth_text"] 31 | ) 32 | references = parse_reference_text(sub_text) 33 | add_title_to_reference(references, title, usc) 34 | auth_parsed.append(references) 35 | tag.attrs["auth_text_parsed"] = json.dumps(auth_parsed, ensure_ascii=False) 36 | return logs 37 | -------------------------------------------------------------------------------- /statutes_pipeline_steps/us_reg_prepare_input.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import re 4 | import shutil 5 | from zipfile import ZipFile 6 | 7 | import pandas as pd 8 | 9 | from statics import US_REG_INPUT_COPY_LOG_PATH, US_REG_INPUT_PATH, US_REG_ORIGINAL_PATH 10 | from utils.common import ensure_exists 11 | 12 | pattern = re.compile(r".+/CFR-(?P\d+)-title(?P\d+)-vol(?P\d*).xml") 13 | 14 | 15 | def us_reg_prepare_input(): 16 | """moves files into main dir and validate files roughly""" 17 | 18 | ensure_exists(US_REG_ORIGINAL_PATH) 19 | 20 | year_zips = sorted( 21 | [f.name for f in os.scandir(US_REG_INPUT_PATH) if f.name.endswith(".zip")] 22 | ) 23 | for year_zip in year_zips: 24 | year = 
os.path.splitext(year_zip)[0] 25 | year_folder = os.path.join(US_REG_ORIGINAL_PATH, year) 26 | if os.path.exists(year_folder): 27 | raise Exception(f"{year_folder} already exists") 28 | 29 | with ZipFile(os.path.join(US_REG_INPUT_PATH, year_zip), "r") as zipObj: 30 | # Extract all the contents of zip file in current directory 31 | zipObj.extractall(year_folder) 32 | 33 | # Get all files 34 | vols = [ 35 | pattern.fullmatch(p).groupdict() 36 | for p in glob.glob(os.path.join(US_REG_ORIGINAL_PATH, "*/*/*.xml")) 37 | ] 38 | 39 | print("Dropping") 40 | for vol in vols: 41 | if not vol["v"]: 42 | print(vol) 43 | os.remove( 44 | os.path.join( 45 | US_REG_ORIGINAL_PATH, 46 | vol["y"], 47 | f"title-{vol['t']}", 48 | f"CFR-{vol['y']}-title{vol['t']}-vol.xml", 49 | ) 50 | ) 51 | vols = [v for v in vols if v["v"]] 52 | 53 | df = pd.DataFrame(vols) 54 | df.v = [int(v) if v else None for v in df.v] 55 | df.y = [int(y) for y in df.y] 56 | df.t = [int(t) for t in df.t] 57 | df = df.sort_values(["y", "t", "v"]).reset_index().drop("index", axis=1) 58 | 59 | volumes = sorted({(t, v) for t, v in zip(df.t, df.v)}) 60 | 61 | copy_actions = [] 62 | 63 | for title, volume in volumes: 64 | vol_df = df[(df.t == title) & (df.v == volume)] 65 | existing_years = set(vol_df.y) 66 | last_exisiting_year = None 67 | for year in range(vol_df.y.min(), vol_df.y.max()): 68 | if year in existing_years: 69 | last_exisiting_year = year 70 | else: 71 | assert last_exisiting_year 72 | copy_actions.append( 73 | dict( 74 | title=title, 75 | volume=volume, 76 | from_year=last_exisiting_year, 77 | to_year=year, 78 | ) 79 | ) 80 | for copy_action in copy_actions: 81 | to_dir = os.path.join( 82 | US_REG_ORIGINAL_PATH, 83 | str(copy_action["to_year"]), 84 | f"title-{copy_action['title']}", 85 | ) 86 | os.makedirs(to_dir, exist_ok=True) 87 | shutil.copy( 88 | os.path.join( 89 | US_REG_ORIGINAL_PATH, 90 | str(copy_action["from_year"]), 91 | f"title-{copy_action['title']}", 92 | f"CFR-{copy_action['from_year']}-" 93 | f"title{copy_action['title']}-" 94 | f"vol{copy_action['volume']}.xml", 95 | ), 96 | os.path.join( 97 | to_dir, 98 | f"CFR-{copy_action['to_year']}-" 99 | f"title{copy_action['title']}-" 100 | f"vol{copy_action['volume']}.xml", 101 | ), 102 | ) 103 | pd.DataFrame(copy_actions).to_csv(US_REG_INPUT_COPY_LOG_PATH, index=False) 104 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantLaw/legal-data-preprocessing/4264cd630b13e3d3bb934d4abd73b5b98217873c/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import unittest 3 | 4 | from utils.common import str_to_bool 5 | 6 | 7 | class TestCommon(unittest.TestCase): 8 | def test_str_to_bool(self): 9 | self.assertTrue(str_to_bool("YES")) 10 | self.assertTrue(str_to_bool("true")) 11 | self.assertFalse(str_to_bool("No")) 12 | self.assertTrue(str_to_bool(True)) 13 | with self.assertRaises(argparse.ArgumentTypeError): 14 | str_to_bool("hell!") 15 | -------------------------------------------------------------------------------- /tests/test_de_reference_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from bs4 import BeautifulSoup 4 | from quantlaw.de_extract.statutes_parse import 
StatutesParser 5 | 6 | from statutes_pipeline_steps.de_reference_parse import parse_reference_content 7 | from statutes_pipeline_steps.us_reference_parse import split_block_reference 8 | 9 | 10 | class TestDeReferenceParse(unittest.TestCase): 11 | def test_parse_reference_content_lower_s(self): 12 | reference = BeautifulSoup( 13 | '' 14 | "
§ 6 Absatz 1 Nummer 2 Buchstabe r, s, t und v
" 15 | "
", 16 | "lxml-xml", 17 | ).reference 18 | parser = StatutesParser({}) 19 | parse_reference_content(reference, parser) 20 | self.assertEqual( 21 | '[["6", "1", "2", "r"], ' 22 | '["6", "1", "2", "s"], ' 23 | '["6", "1", "2", "t"], ' 24 | '["6", "1", "2", "v"]]', 25 | reference.attrs["parsed"], 26 | ) 27 | 28 | def test_parse_reference_content_upper_s_ignore(self): 29 | reference = BeautifulSoup( 30 | '' 31 | "
§ 6 Absatz 1 Nummer 2 Buchstabe r, s, t, S
" 32 | "
", 33 | "lxml-xml", 34 | ).reference 35 | parser = StatutesParser({}) 36 | parse_reference_content(reference, parser) 37 | self.assertEqual( 38 | '[["6", "1", "2", "r"], ' '["6", "1", "2", "s"], ' '["6", "1", "2", "t"]]', 39 | reference.attrs["parsed"], 40 | ) 41 | 42 | def test_parse_reference_content_upper_s_for_Satz(self): 43 | reference = BeautifulSoup( 44 | '' 45 | "
§ 6 Absatz 1 Nummer 2 S 4, S 5
" 46 | "
", 47 | "lxml-xml", 48 | ).reference 49 | parser = StatutesParser({}) 50 | parse_reference_content(reference, parser) 51 | self.assertEqual( 52 | '[["6", "1", "2", "4"], ' '["6", "1", "2", "5"]]', reference.attrs["parsed"] 53 | ) 54 | 55 | def test_cfrsec_splitter(self): 56 | split_block_reference("47 CFRSec. 1.1204(b)", debug_context=None) 57 | -------------------------------------------------------------------------------- /tests/test_de_reference_parse_vso_list.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from statutes_pipeline_steps.de_reference_parse_vso_list import ( 4 | remove_duplicate_references, 5 | ) 6 | 7 | 8 | class MyTestCase(unittest.TestCase): 9 | def test_remove_duplicate_references(self): 10 | self.assertEqual( 11 | remove_duplicate_references( 12 | [["SGB-4", "28p", "8"], ["SGB-4", "28p", "8"], ["SGB-4", "28p", "7"]] 13 | ), 14 | [["SGB-4", "28p", "8"], ["SGB-4", "28p", "7"]], 15 | ) 16 | 17 | self.assertEqual( 18 | remove_duplicate_references( 19 | [ 20 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "2"]], 21 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "3"]], 22 | [["Gesetz", "EnWiG"], ["§", "5"]], 23 | [["Gesetz", "EnWiG"], ["§", "5"]], 24 | ] 25 | ), 26 | [ 27 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "2"]], 28 | [["Gesetz", "EnWiG"], ["§", "2"], ["Abs", "3"]], 29 | [["Gesetz", "EnWiG"], ["§", "5"]], 30 | ], 31 | ) 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /tests/test_snapshot_mapping_index.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import string 4 | from unittest import TestCase 5 | 6 | from utils.string_list_contains import StringContainsAlign 7 | 8 | 9 | def get_random_string(length): 10 | letters = string.ascii_lowercase + " " * 8 11 | result_str = "".join(random.choice(letters) for i in range(length)) 12 | return re.sub(r"\s+", " ", result_str) 13 | 14 | 15 | class StringContainsAlignTestCase(TestCase): 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | list_random_part = [ 19 | get_random_string(random.randint(100, 1000)) for _ in range(100) 20 | ] 21 | list_random_part_1 = [ 22 | get_random_string(random.randint(100, 1000)) for _ in range(100) 23 | ] 24 | list_random_part_2 = [ 25 | get_random_string(random.randint(100, 1000)) for _ in range(100) 26 | ] 27 | 28 | cls.test_list_1 = ( 29 | list_random_part * 10 30 | + list_random_part_1 31 | + list_random_part * 25 32 | + ["sdf sdf", "sdfsdf"] 33 | ) 34 | cls.test_list_2 = ( 35 | list_random_part_2 * 15 36 | + list_random_part_1 37 | + list_random_part_2 * 20 38 | + ["sdf sdf", "sdfsdf"] 39 | ) 40 | 41 | def test_align(self): 42 | 43 | aligner = StringContainsAlign() 44 | aligner.text_list_0 = self.__class__.test_list_1 45 | aligner.text_list_1 = self.__class__.test_list_2 46 | 47 | aligner.create_index() 48 | 49 | res = aligner.run(reversed=True) 50 | 51 | self.assertEqual(102, len(res)) 52 | 53 | aligner.min_text_length = 100 54 | res = aligner.run(reversed=True) 55 | 56 | self.assertTrue(0 < len(res) < 102) 57 | -------------------------------------------------------------------------------- /tests/test_us_reg_xml.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from statutes_pipeline_steps.us_reg_to_xml import split_double_units 4 | 5 | 6 | class MyTestCase(unittest.TestCase): 7 | def 
test_split_double_units(self): 8 | self.assertEqual( 9 | [["(a)"], ["(1) sdf"], ["(2) asdasd"], [["x", "y"]]], 10 | list(split_double_units([["(a)(1) sdf"], ["(a)(2) asdasd"], [["x", "y"]]])), 11 | ) 12 | -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import shutil 5 | from collections import Counter 6 | 7 | import pandas as pd 8 | from quantlaw.utils.files import ensure_exists 9 | from quantlaw.utils.pipeline import PipelineStep 10 | from regex import regex 11 | 12 | from statics import ( 13 | DATA_PATH, 14 | DE_LAW_NAMES_COMPILED_PATH, 15 | DE_LAW_NAMES_PATH, 16 | DE_REG_LAW_NAMES_COMPILED_PATH, 17 | DE_REG_LAW_NAMES_PATH, 18 | ) 19 | 20 | ########## 21 | # Pipeline 22 | ########## 23 | 24 | 25 | class RegulationsPipelineStep(PipelineStep): 26 | def __init__(self, regulations, *args, **kwargs): 27 | self.regulations = regulations 28 | super().__init__(*args, **kwargs) 29 | 30 | 31 | def str_to_bool(v): 32 | if isinstance(v, bool): 33 | return v 34 | if v.lower() in ("yes", "true", "t", "y", "1"): 35 | return True 36 | elif v.lower() in ("no", "false", "f", "n", "0"): 37 | return False 38 | else: 39 | raise argparse.ArgumentTypeError("Boolean value expected.") 40 | 41 | 42 | ######################## 43 | # Generic Data Wrangling 44 | ######################## 45 | 46 | 47 | def invert_dict_mapping_all(mapping_dictionary): 48 | """ 49 | Args: 50 | mapping_dictionary: mapping from keys to values which is not necessarily 51 | injective, e.g., node_id to community_id mapping 52 | 53 | Returns: inverted mapping with unique values as keys and lists of former keys as 54 | values, e.g., community_id to node_id mapping 55 | 56 | """ 57 | inverted = {v: [] for v in mapping_dictionary.values()} 58 | for k, v in mapping_dictionary.items(): 59 | inverted[v].append(k) 60 | return inverted 61 | 62 | 63 | def invert_dict_mapping_unique(source_dict): 64 | """ 65 | Inverts keys and values of a dict. Only entries with unique values are inverted. 66 | """ 67 | counter = Counter(source_dict.values()) 68 | unique = set([text for text, cnt in counter.most_common() if cnt == 1]) 69 | return {v: k for k, v in source_dict.items() if v in unique} 70 | 71 | 72 | #################### 73 | # DE Crossreferences 74 | #################### 75 | 76 | 77 | def load_law_names(regulations): 78 | df = pd.read_csv(DE_REG_LAW_NAMES_PATH if regulations else DE_LAW_NAMES_PATH) 79 | data = [ 80 | dict( 81 | citename=row.citename, 82 | citekey=row.citekey, 83 | start=row.filename.split("_")[2], 84 | end=os.path.splitext(row.filename)[0].split("_")[3], 85 | filename=row.filename, 86 | ) 87 | for i, row in df.iterrows() 88 | ] 89 | return data 90 | 91 | 92 | def load_law_names_compiled(regulations): 93 | with open( 94 | DE_REG_LAW_NAMES_COMPILED_PATH if regulations else DE_LAW_NAMES_COMPILED_PATH, 95 | "rb", 96 | ) as f: 97 | return pickle.load(f) 98 | 99 | 100 | def get_stemmed_law_names_for_filename(filename, law_names): 101 | date = os.path.splitext(filename)[0].split("_")[2] 102 | return get_stemmed_law_names(date, law_names) 103 | 104 | 105 | def get_stemmed_law_names(date, law_names): 106 | laws_lookup = law_names[date] 107 | 108 | # Custom law names, stemmed as key. 
109 | laws_lookup["grundgesetz"] = "GG" 110 | 111 | # Add law names without year number if key already used 112 | shortened_keys = {} 113 | for key, value in laws_lookup.items(): 114 | match = regex.fullmatch(r"(.+)\s\d{4}[\-\d]*", key) 115 | if match: 116 | if match[1] not in shortened_keys: 117 | shortened_keys[match[1]] = set() 118 | shortened_keys[match[1]].update([value]) 119 | 120 | for key, values in shortened_keys.items(): 121 | if len(values) == 1 and key not in laws_lookup.keys(): 122 | laws_lookup[key] = list(values)[0] 123 | 124 | return laws_lookup 125 | 126 | 127 | def get_snapshot_law_list(date, law_names_data): 128 | date = date.replace("-", "") 129 | law_names_list = { 130 | d["filename"] for d in law_names_data if d["start"] <= date and d["end"] >= date 131 | } 132 | assert len(law_names_list) == len({x.split("_")[0] for x in law_names_list}) 133 | return law_names_list 134 | 135 | 136 | def copy_xml_schema_to_data_folder(): 137 | ensure_exists(DATA_PATH) 138 | shutil.copyfile("xml-schema.xsd", os.path.join(DATA_PATH, "xml-schema.xsd")) 139 | shutil.copyfile("xml-styles.css", os.path.join(DATA_PATH, "xml-styles.css")) 140 | -------------------------------------------------------------------------------- /utils/simplify_gii_xml.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from bs4 import BeautifulSoup, NavigableString 4 | from quantlaw.utils.beautiful_soup import create_soup, save_soup 5 | 6 | 7 | def simplify_gii_xml(source, destination): 8 | soup = create_soup(source) 9 | simplify(soup) 10 | save_soup(soup, destination) 11 | 12 | 13 | def remove_new_lines(tag, soup): 14 | for descendant in list(tag.descendants): 15 | if type(descendant) is NavigableString: 16 | text = str(descendant) 17 | text = re.sub(r"\s+", " ", text).strip() 18 | if str(descendant) != text: 19 | descendant.replaceWith(soup.new_string(text)) 20 | 21 | 22 | def simplify(soup: BeautifulSoup): 23 | # General 24 | for t in soup.find_all(attrs={"builddate": True}): 25 | del t.attrs["builddate"] 26 | 27 | for t in soup.find_all("FnR"): 28 | t.extract() 29 | 30 | for metadaten in soup.find_all("metadaten"): 31 | for t in metadaten.find_all("titel", recursive=False): 32 | del t.attrs["format"] 33 | for t in metadaten.find_all("enbez", recursive=False): 34 | if t.string == "(XXXX)": 35 | t.string = "XXXX" 36 | 37 | for t in soup.find_all("BR"): 38 | text = " " 39 | if type(t.previous_sibling) is NavigableString: 40 | text = t.previous_sibling.string + " " 41 | t.previous_sibling.extract() 42 | if type(t.next_sibling) is NavigableString: 43 | text += t.next_sibling.string 44 | t.next_sibling.extract() 45 | 46 | t.replaceWith(soup.new_string(text)) 47 | 48 | # Metadaten 49 | for tag_name in ["ausfertigung-datum", "fundstelle", "standangabe"]: 50 | for t in soup.metadaten.find_all(tag_name): 51 | t.extract() 52 | 53 | # Text 54 | for t in soup.find_all("SUP", attrs={"class": "Rec"}): 55 | t.replaceWith(soup.new_string(" ")) 56 | 57 | for t in soup.find_all(["DT", "DD", "entry", "LA"]): 58 | t.insert(0, soup.new_string(" ")) 59 | t.append(soup.new_string(" ")) 60 | 61 | for t in soup.find_all("P"): 62 | new_t = soup.new_tag("P") 63 | text = t.get_text() 64 | text = re.sub(r"\s+", " ", text).strip() 65 | if text: 66 | new_t.string = text 67 | t.replaceWith(new_t) 68 | 69 | for toc in soup.find_all("TOC"): 70 | text = toc.get_text(" ") 71 | text = re.sub(r"\s+", " ", text).strip() 72 | new_toc = soup.new_tag("TOC") 73 | new_toc.string = text 74 | 
toc.replaceWith(new_toc) 75 | 76 | for textdaten in soup.find_all("textdaten"): 77 | if textdaten.Footnotes: 78 | textdaten.Footnotes.extract() 79 | 80 | t = textdaten.find("text", recursive=False) 81 | if t and not t.get_text().strip(): 82 | t.extract() 83 | 84 | for t in soup.find_all("Content"): 85 | if ( 86 | type(t.next_sibling) is NavigableString 87 | and not t.next_sibling.string.strip() 88 | ): 89 | t.next_sibling.extract() 90 | 91 | for t in soup.find_all(["gliederungstitel", "titel", "langue", "kurzue"]): 92 | remove_new_lines(t, soup) 93 | for descendant in list(t.descendants): 94 | if type(descendant) is NavigableString: 95 | text = str(descendant) 96 | text = re.sub(r"\s*\*\)\s*$", "", text).strip() 97 | if str(descendant) != text: 98 | descendant.replaceWith(soup.new_string(text)) 99 | 100 | for t in soup.find_all("fussnoten"): 101 | t.extract() 102 | -------------------------------------------------------------------------------- /utils/string_list_contains.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | class StringContainsAlign: 5 | """ 6 | Maps strings of a list to strings of another list based on the condition 7 | that the string (needle) of the first list is contained in the string 8 | (haystack) in the second list. 9 | This class optimizes performance by splitting the strings to compare into 10 | tokens to preselect possible candidates that might contain the needle 11 | """ 12 | 13 | def __init__( 14 | self, 15 | text_list_0=None, 16 | text_list_1=None, 17 | sep=" ", 18 | fixed_ends=False, 19 | min_text_length=None, 20 | ): 21 | self.sep = sep 22 | self.fixed_ends = fixed_ends 23 | self.min_text_length = min_text_length 24 | self.text_list_0 = text_list_0 25 | self.text_list_1 = text_list_1 26 | self.index_0 = None 27 | self.index_1 = None 28 | 29 | def create_index(self): 30 | assert self.text_list_0 31 | assert self.text_list_1 32 | self.index_0 = self._text_list_to_token_index(self.text_list_0) 33 | self.index_1 = self._text_list_to_token_index(self.text_list_1) 34 | 35 | def clean_index(self): 36 | self.index_0 = None 37 | self.index_1 = None 38 | 39 | def _text_list_to_token_index(self, text_list): 40 | token_index = defaultdict(set) 41 | for i, text in enumerate(text_list): 42 | text_tokens = text.split(self.sep) 43 | for token in text_tokens: 44 | token_index[token].add(i) 45 | return token_index 46 | 47 | def run(self, reversed=False): 48 | text_list_needle = self.text_list_1 if reversed else self.text_list_0 49 | text_list_haystack = self.text_list_0 if reversed else self.text_list_1 50 | 51 | index_haystack = self.index_0 if reversed else self.index_1 52 | 53 | result = [] 54 | 55 | for needle_index, needle in enumerate(text_list_needle): 56 | if self.min_text_length and len(needle) < self.min_text_length: 57 | continue 58 | 59 | needle_tokens = needle.split(self.sep) 60 | 61 | if not self.fixed_ends: 62 | # Remove first and last token so that e.g. 
adding just one letter to the 63 | # last token is possible 64 | needle_tokens = needle_tokens[1:-1] 65 | 66 | if needle_tokens: 67 | candidates = index_haystack[needle_tokens[0]] 68 | for token in needle_tokens[1:]: 69 | candidates = candidates.intersection(index_haystack[token]) 70 | 71 | # For performance: stop narrowing once at most one candidate remains 72 | if len(candidates) <= 1: 73 | break 74 | else: 75 | candidates = range(len(text_list_haystack)) 76 | 77 | for haystack_index in candidates: 78 | target_text = text_list_haystack[haystack_index] 79 | if needle in target_text: 80 | result.append((needle_index, haystack_index)) 81 | 82 | return result 83 | -------------------------------------------------------------------------------- /xml-schema-decisions-de.xsd: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /xml-schema.xsd: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /xml-styles.css: -------------------------------------------------------------------------------- 1 | document { 2 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", "Helvetica Neue", Helvetica, Arial, sans-serif, 3 | "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; 4 | padding: 20px; 5 | line-height: 140%; 6 | } 7 | 8 | seqitem { 9 | margin-top: 20px; 10 | display: block; 11 | } 12 | 13 | subseqitem { 14 | display: block; 15 | } 16 | 17 | item { 18 | border-left: solid #999 2px; 19 | padding-left: 20px; 20 | margin-left: 2px; 21 | display: block; 22 | } 23 | 24 | item::before, seqitem::before, subseqitem::before { 25 | content: attr(heading); 26 | font-weight: 600; 27 | display: block; 28 | } 29 | 30 | 31 | main { 32 | color: #00a; 33 | } 34 | 35 | suffix { 36 | color: #aa0; 37 | } 38 | 39 | lawname { 40 | background-color: #f00; 41 | } 42 | 43 | lawname[type="dict"] { 44 | background-color: rgb(164, 211, 255) 45 | } 46 | 47 | lawname[type="eu"] { 48 | background: repeating-linear-gradient( 49 | 135deg, 50 | rgb(207, 229, 250), 51 | rgb(207, 229, 250) 15px, 52 | rgb(122, 187, 252) 15px, 53 | rgb(122, 187, 252) 30px 54 | ); 55 | } 56 | 57 | lawname[type="sgb"] { 58 | background: repeating-linear-gradient( 59 | 135deg, 60 | rgb(207, 229, 250), 61 | rgb(207, 229, 250) 15px, 62 | rgb(243, 231, 128) 15px, 63 | rgb(243, 231, 128) 30px 64 | ); 65 | } 66 | 67 | lawname[type="ignore"] { 68 | background: repeating-linear-gradient( 69 | 135deg, 70 | rgb(207, 229, 250), 71 | rgb(207, 229, 250) 15px, 72 |
rgb(255, 162, 134) 15px, 73 | rgb(255, 162, 134) 30px 74 | ); 75 | } 76 | 77 | 78 | reference[nomatch=""] { 79 | background-color: #f00; 80 | } 81 | 82 | lawreference { 83 | background-color: #fca0f9; 84 | } 85 | 86 | reference:before { 87 | content:"\a"; 88 | white-space: pre; 89 | } 90 | 91 | reference:after, lawreference:after { 92 | font-family: SFMono-Regular, Consolas, "Liberation Mono", Menlo, Courier, monospace; 93 | color: rgb(143, 38, 22); 94 | background-color: rgb(251, 229, 225); 95 | content: attr(parsed) ' ' attr(lawid) ' (' attr(target) ')'; 96 | } 97 | 98 | reference[target="match"]:after { 99 | color: rgb(36, 143, 22); 100 | background-color: rgb(229, 255, 226); 101 | } 102 | 103 | reference[target="skipped"]:after { 104 | color: rgb(117, 119, 19); 105 | background-color: rgb(255, 255, 230); 106 | } 107 | --------------------------------------------------------------------------------
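
Usage sketch (not a file of the repository): the two invert_dict_mapping_* helpers in utils/common.py treat non-injective mappings differently, which the short example below illustrates. It assumes the repository root is on the Python import path; the toy node-to-community mapping is invented for illustration.

# Hedged usage sketch for the dict-inversion helpers in utils/common.py.
# Assumption: executed from the repository root so that "utils" is importable.
from utils.common import invert_dict_mapping_all, invert_dict_mapping_unique

# Toy node_id -> community_id mapping; community 1 is shared by two nodes.
node_to_community = {"n1": 1, "n2": 1, "n3": 2}

# Every key is kept: community_id -> list of node_ids.
print(invert_dict_mapping_all(node_to_community))    # {1: ['n1', 'n2'], 2: ['n3']}

# Only values that occur exactly once are inverted, so community 1 is dropped.
print(invert_dict_mapping_unique(node_to_community))  # {2: 'n3'}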
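
The StringContainsAlign docstring in utils/string_list_contains.py describes token-based preselection of candidate haystack strings; the sketch below shows how the class might be used. It assumes the repository root is on the import path, and the example sentences are invented.

# Hedged usage sketch for StringContainsAlign (utils/string_list_contains.py).
from utils.string_list_contains import StringContainsAlign

needles = ["quick brown fox", "lazy dog"]
haystacks = [
    "the quick brown fox jumps",
    "over the lazy dog",
    "an unrelated sentence",
]

aligner = StringContainsAlign(text_list_0=needles, text_list_1=haystacks)
aligner.create_index()  # build token -> position indices for both lists

# Each pair (needle_index, haystack_index) states that the full needle string
# occurs as a substring of the haystack string; the inner tokens only narrow
# down which haystacks are checked.
print(aligner.run())  # [(0, 0), (1, 1)]

aligner.clean_index()  # release the token indices once they are no longer needed

With the default fixed_ends=False, the first and last token of each needle are ignored during candidate preselection, so a haystack that extends a boundary word (for example a plural form at the end) is still considered; the final substring check always uses the complete needle.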