├── .coveragerc
├── .github
│   └── workflows
│       └── building.yaml
├── .gitignore
├── .pypirc
├── Pipfile
├── README.md
├── eurlex
│   └── __init__.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    └── test_parsing.py

/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = setup.py
--------------------------------------------------------------------------------
/.github/workflows/building.yaml:
--------------------------------------------------------------------------------
1 | name: Building
2 | on: [push]
3 |
4 | jobs:
5 | build:
6 | name: Run Python Tests and upload to PyPI
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - uses: actions/checkout@v2
11 |
12 | - name: Set up Python 3.8
13 | uses: actions/setup-python@v2
14 | with:
15 | python-version: 3.8
16 |
17 | - name: Install pytest
18 | run: |
19 | pip3 install pytest==6.2.5 pytest-cov==3.0.0
20 |
21 | - name: Install Python dependencies
22 | run: |
23 | python3 -m pip install --upgrade pip
24 | pip3 install -r requirements.txt
25 |
26 | - name: Test with pytest
27 | run: |
28 | pytest . --doctest-modules --exitfirst --verbose --failed-first \
29 | --cov=. --cov-report html
30 |
31 | - name: Build and Upload to PyPI
32 | continue-on-error: true
33 | run: |
34 | pip3 install wheel twine
35 | python3 setup.py sdist bdist_wheel
36 | python3 -m twine upload dist/*
37 | env:
38 | TWINE_USERNAME: __token__
39 | TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }}
40 | TWINE_REPOSITORY: pypi
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
3 |
4 | ### Python ###
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | 166 | # End of https://www.toptal.com/developers/gitignore/api/python 167 | 168 | # Custom additions: 169 | # Mac OS X 170 | .DS_Store 171 | # Output data 172 | *.json 173 | *.html -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | testpypi 5 | 6 | [pypi] 7 | repository = https://upload.pypi.org/legacy/ 8 | 9 | [testpypi] 10 | repository = https://test.pypi.org/legacy/ -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | argparse = "==1.4.0" 8 | sparqlwrapper = "==1.8.5" 9 | pandas = "==1.2.4" 10 | rdflib = "==6.0.2" 11 | requests = "==2.25.1" 12 | bs4 = "==0.0.1" 13 | beautifulsoup4 = "==4.9.3" 14 | 15 | [dev-packages] 16 | 17 | [requires] 18 | python_version = "3.12" 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EUR-Lex Parser 2 | 3 |
<div align="center">
4 | <img src="https://github.com/kevin91nl/eurlex/actions/workflows/building.yaml/badge.svg" alt="Building">
5 | <img src="https://img.shields.io/pypi/v/eurlex" alt="PyPI version">
6 | </div>
7 |
8 | 9 | An EUR-Lex parser for Python. 10 | 11 | ## Usage 12 | 13 | You can install this package as follows: 14 | 15 | ```bash 16 | pip install -U eurlex 17 | ``` 18 | 19 | After installing this package, you can download and parse any document from EUR-Lex. For example, the [32019R0947 regulation](https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32019R0947): 20 | 21 | ```python 22 | from eurlex import get_html_by_celex_id, parse_html 23 | 24 | # Retrieve and parse the document with CELEX ID "32019R0947" into a Pandas DataFrame 25 | celex_id = "32019R0947" 26 | html = get_html_by_celex_id(celex_id) 27 | df = parse_html(html) 28 | 29 | # Get the first line of Article 1 30 | df_article_1 = df[df.article == "1"] 31 | df_article_1_line_1 = df_article_1.iloc[0] 32 | 33 | # Display the subtitle of Article 1 34 | print(df_article_1_line_1.article_subtitle) 35 | >>> "Subject matter" 36 | 37 | # Display the corresponding text 38 | print(df_article_1_line_1.text) 39 | >>> "This Regulation lays down detailed provisions for the operation of unmanned aircraft systems as well as for personnel, including remote pilots and organisations involved in those operations." 40 | ``` 41 | 42 | Every document on EUR-Lex displays a CELEX number at the top of the page. More information on CELEX numbers can be found on the [EUR-Lex website](https://eur-lex.europa.eu/content/tools/eur-lex-celex-infographic-A3.pdf). 43 | 44 | For more information about the methods in this package, see the [unit tests](https://github.com/kevin91nl/eurlex/tree/main/tests) and [doctests](https://github.com/kevin91nl/eurlex/blob/main/eurlex/__init__.py). 45 | 46 | ### Data Structure 47 | 48 | The following columns are available in the parsed dataframe: 49 | 50 | - `text`: The text 51 | - `type`: The type of the data 52 | - `document`: The document in which the text is found 53 | - `article`: The article in which the text is found 54 | - `article_subtitle`: The subtitle of the article (when available) 55 | - `ref`: The indentation level of the text within the article (e.g. `["(1)", "(a)"]` when the text is found under paragraph `(1)`, subparagraph `(a)`) 56 | 57 | In some cases, additional fields are available. For example, the `group` field which contains the bold text under which a text is found. 58 | 59 | ## Code Contribution 60 | 61 | Feel free to send any issues, ideas or pull requests. -------------------------------------------------------------------------------- /eurlex/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import rdflib 3 | import requests 4 | from SPARQLWrapper import SPARQLWrapper, JSON 5 | import pandas as pd 6 | import datetime 7 | from xml.etree import ElementTree as ETree 8 | from typing import List, Dict 9 | 10 | 11 | def get_prefixes() -> dict: 12 | """Get a mapping from prefixes to URLs. 13 | 14 | Returns 15 | ------- 16 | dict 17 | A mapping from prefixes to URLs. 18 | """ 19 | return { 20 | "cdm": "http://publications.europa.eu/ontology/cdm#", 21 | "celex": "http://publications.europa.eu/resource/celex/", 22 | "owl": "http://www.w3.org/2002/07/owl#", 23 | "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 24 | "cellar": "http://publications.europa.eu/resource/cellar/", 25 | "skos": "http://www.w3.org/2004/02/skos/core#", 26 | } 27 | 28 | 29 | def parse_article_paragraphs(article: str) -> dict: 30 | """Convert an article found on EUR-Lex to paragraphs. 31 | 32 | Parameters 33 | ---------- 34 | article : str 35 | The article to parse. 
36 |
37 | Returns
38 | -------
39 | dict
40 | Mapping from paragraph identifier to paragraph.
41 |
42 | Examples
43 | --------
44 | The following example parses an article into paragraphs. Make sure to use newlines instead of the 5 spaces which
45 | are used in the example.
46 | >>> parse_article_paragraphs("This is a test with a few paragraphs:     1. The first one     2. The second one")
47 | {None: 'This is a test with a few paragraphs:', '1.': 'The first one', '2.': 'The second one'}
48 | >>> parse_article_paragraphs("This is a test with a few paragraphs:     (1) The first one     (2) The second one")
49 | {None: 'This is a test with a few paragraphs:', '(1)': 'The first one', '(2)': 'The second one'}
50 | """
51 | paragraphs = dict()
52 | paragraph = None
53 | article = article.replace("     ", "\n")
54 | for line in article.split("\n"):
55 | match = re.match(r"^([0-9]+)[.]", line)
56 | if match:
57 | paragraph = match.group(0)
58 | line = ".".join(line.split(".")[1:]).strip()
59 | else:
60 | match = re.match(r"^[(]([0-9]+)[)]", line)
61 | if match:
62 | paragraph = match.group(0)
63 | line = ")".join(line.split(")")[1:]).strip()
64 | if paragraph not in paragraphs:
65 | paragraphs[paragraph] = []
66 | paragraphs[paragraph].append(line)
67 | paragraphs = {
68 | paragraph: "\n".join(paragraphs[paragraph]).strip() for paragraph in paragraphs
69 | }
70 | return paragraphs
71 |
72 |
73 | def prepend_prefixes(query: str) -> str:
74 | """Prepend a query with prefixes.
75 |
76 | Parameters
77 | ----------
78 | query : str
79 | The query to prepend.
80 |
81 | Returns
82 | -------
83 | str
84 | Query prepended with the prefixes.
85 |
86 | Examples
87 | --------
88 | >>> 'prefix rdf' in prepend_prefixes("SELECT ?name WHERE { ?person rdf:name ?name }")
89 | True
90 | """
91 | return (
92 | "\n".join(
93 | [
94 | "prefix {}: <{}>".format(prefix, url)
95 | for prefix, url in get_prefixes().items()
96 | ]
97 | )
98 | + " "
99 | + query
100 | )
101 |
102 |
103 | def run_query(query: str) -> dict:
104 | """Run the SPARQL query on EUR-Lex.
105 |
106 | Parameters
107 | ----------
108 | query : str
109 | The SPARQL query to run.
110 |
111 | Returns
112 | -------
113 | dict
114 | A dictionary containing the results.
115 | """
116 | sparql = SPARQLWrapper(
117 | "http://publications.europa.eu/webapi/rdf/sparql"
118 | ) # pragma: no cover
119 | sparql.setQuery(query) # pragma: no cover
120 | sparql.setReturnFormat(JSON) # pragma: no cover
121 | results = sparql.query().convert() # pragma: no cover
122 | return results # pragma: no cover
123 |
124 |
125 | def convert_sparql_output_to_dataframe(sparql_results: dict) -> pd.DataFrame:
126 | """Convert SPARQL output to a DataFrame.
127 |
128 | Parameters
129 | ----------
130 | sparql_results : dict
131 | A dictionary containing the SPARQL results.
132 |
133 | Returns
134 | -------
135 | pd.DataFrame
136 | The DataFrame representation of the SPARQL results.
137 |
138 | Examples
139 | --------
140 | >>> convert_sparql_output_to_dataframe({'results': {'bindings': [{'subject': {'value': 'cdm:test'}}]}}).to_dict()
141 | {'subject': {0: 'cdm:test'}}
142 | """
143 | items = [
144 | {key: simplify_iri(item[key]["value"]) for key in item.keys()}
145 | for item in sparql_results["results"]["bindings"]
146 | ]
147 | return pd.DataFrame(items)
148 |
149 |
150 | def get_celex_dataframe(celex_id: str) -> pd.DataFrame:
151 | """Get CELEX data delivered in a DataFrame.
152 |
153 | Parameters
154 | ----------
155 | celex_id : str
156 | The CELEX ID to get the data for.
157 | 158 | Returns 159 | ------- 160 | pd.DataFrame 161 | A DataFrame containing the results. 162 | """ 163 | graph = rdflib.Graph() # pragma: no cover 164 | results = graph.parse( 165 | f"http://publications.europa.eu/resource/" f"celex/{str(celex_id)}?language=eng" 166 | ) # pragma: no cover 167 | items = [ 168 | {key: simplify_iri(item[key]) for key in range(len(item))} for item in results 169 | ] # pragma: no cover 170 | df = pd.DataFrame(items) # pragma: no cover 171 | df.columns = ["s", "o", "p"] # pragma: no cover 172 | return df # pragma: no cover 173 | 174 | 175 | def get_celex_id( 176 | slash_notation: str, document_type: str = "R", sector_id: str = "3" 177 | ) -> str: 178 | """Derive the CELEX ID from a slash notation like 2019/947. 179 | 180 | Parameters 181 | ---------- 182 | slash_notation : str 183 | The slash notation of the document (like 2019/947). 184 | document_type : str 185 | The type of the document (e.g. "R" for regulations). 186 | sector_id : str 187 | The sector ID (e.g. 3). 188 | 189 | Returns 190 | ------- 191 | str 192 | The CELEX ID 193 | 194 | Examples 195 | -------- 196 | >>> get_celex_id('2019/947') 197 | '32019R0947' 198 | >>> get_celex_id('947/2019') 199 | '32019R0947' 200 | """ 201 | term1, term2 = slash_notation.split("/") 202 | current_year = datetime.datetime.now().year 203 | term1 = int(term1) 204 | term2 = int(term2) 205 | term1_is_year = 1800 <= term1 <= current_year 206 | term2_is_year = 1800 <= term2 <= current_year 207 | year = term2 208 | document_id = term1 209 | if term1_is_year and not term2_is_year: 210 | year = term1 211 | document_id = term2 212 | if term2_is_year and not term1_is_year: 213 | year = term2 214 | document_id = term1 215 | return "{}{}{}{}".format( 216 | str(sector_id), year, document_type, str(document_id).zfill(4) 217 | ) 218 | 219 | 220 | def get_possible_celex_ids( 221 | slash_notation: str, document_type: str = None, sector_id: str = None 222 | ) -> list: 223 | """Get a list of possible CELEX IDs (given a slash notation like 2019/947). 224 | 225 | Parameters 226 | ---------- 227 | slash_notation : str 228 | The slash notation of the document (like 2019/947). 229 | document_type : str 230 | The type of the document (e.g. "R" for regulations). 231 | sector_id : str 232 | The sector ID (e.g. 3). 233 | 234 | Returns 235 | ------- 236 | list 237 | A list of possible CELEX IDs. 238 | 239 | Examples 240 | -------- 241 | >>> '32019R0947' in get_possible_celex_ids("2019/947") 242 | True 243 | """ 244 | sector_ids = ( 245 | [str(i) for i in range(10)] + ["C", "E"] 246 | if sector_id is None 247 | else [str(sector_id)] 248 | ) 249 | document_types = ( 250 | ["L", "R", "E", "PC", "DC", "SC", "JC", "CJ", "CC", "CO"] 251 | if document_type is None 252 | else [document_type] 253 | ) 254 | possible_ids = [] 255 | for sector_id in sector_ids: 256 | for document_type in document_types: 257 | guess = get_celex_id(slash_notation, document_type, sector_id) 258 | possible_ids.append(guess) 259 | return possible_ids 260 | 261 | 262 | def guess_celex_ids_via_eurlex( 263 | slash_notation: str, document_type: str = None, sector_id: str = None 264 | ) -> list: 265 | """Guess CELEX IDs for a slash notation by looking it up via EUR-Lex. 266 | 267 | Parameters 268 | ---------- 269 | slash_notation : str 270 | The slash notation of the document (like 2019/947). 271 | document_type : str 272 | The type of the document (e.g. "R" for regulations). 273 | sector_id : str 274 | The sector ID (e.g. 3). 
275 | 276 | Returns 277 | ------- 278 | list 279 | A list of possible CELEX IDs. 280 | """ 281 | slash_notation = "/".join(slash_notation.split("/")[:2]) # pragma: no cover 282 | queries = [ 283 | "{ ?s owl:sameAs celex:" + celex_id + " . ?s owl:sameAs ?o }" 284 | for celex_id in get_possible_celex_ids(slash_notation, document_type, sector_id) 285 | ] # pragma: no cover 286 | query = "SELECT * WHERE {" + " UNION ".join(queries) + "}" # pragma: no cover 287 | query = prepend_prefixes(query) # pragma: no cover 288 | results = run_query(query.strip()) # pragma: no cover 289 | celex_ids = [] # pragma: no cover 290 | for binding in results["results"]["bindings"]: # pragma: no cover 291 | if "/celex/" in binding["o"]["value"]: # pragma: no cover 292 | celex_id = binding["o"]["value"].split("/")[-1] # pragma: no cover 293 | celex_ids.append(celex_id) # pragma: no cover 294 | celex_ids = list(set(celex_ids)) # pragma: no cover 295 | return celex_ids # pragma: no cover 296 | 297 | 298 | def simplify_iri(iri: str) -> str: 299 | """Simplify prefixes in an IRI. 300 | 301 | Parameters 302 | ---------- 303 | iri : str 304 | IRI to simplify. 305 | 306 | Returns 307 | ------- 308 | str 309 | Simplified version where all prefixes are replaced by their shortcuts. 310 | 311 | Examples 312 | -------- 313 | >>> simplify_iri("http://publications.europa.eu/ontology/cdm#test") 314 | 'cdm:test' 315 | >>> simplify_iri("cdm:test") 316 | 'cdm:test' 317 | """ 318 | for prefix, url in get_prefixes().items(): 319 | if iri.startswith(url): 320 | return prefix + ":" + iri[len(url) :] 321 | return iri 322 | 323 | 324 | def get_html_by_cellar_id(cellar_id: str, language: str = "en") -> str: 325 | """Retrieve HTML by CELLAR ID. 326 | 327 | Parameters 328 | ---------- 329 | cellar_id : str 330 | The CELLAR ID to find HTML for. 331 | language : str 332 | The language to retrieve the HTML in (default: "en"). 333 | 334 | Returns 335 | ------- 336 | str 337 | HTML found using the CELLAR ID. 338 | """ 339 | url = "http://publications.europa.eu/resource/cellar/" + str( # pragma: no cover 340 | cellar_id.split(":")[1] if ":" in cellar_id else cellar_id # pragma: no cover 341 | ) # pragma: no cover 342 | response = requests.get( 343 | url, 344 | allow_redirects=True, 345 | headers={ # pragma: no cover 346 | "Accept": "text/html,application/xhtml+xml,application/xml", # pragma: no cover 347 | "Accept-Language": f"{language}", # pragma: no cover 348 | }, 349 | ) # pragma: no cover 350 | html = response.content.decode("utf-8") # pragma: no cover 351 | return html # pragma: no cover 352 | 353 | 354 | def get_html_by_celex_id(celex_id: str, language: str = "en") -> str: 355 | """Retrieve HTML by CELEX ID. 356 | 357 | Parameters 358 | ---------- 359 | celex_id : str 360 | The CELEX ID to find HTML for. 361 | language : str 362 | The language to retrieve the HTML in (default: "en"). 363 | 364 | 365 | Returns 366 | ------- 367 | str 368 | HTML found using the CELEX ID. 
369 | """ 370 | url = "http://publications.europa.eu/resource/celex/" + str( 371 | celex_id 372 | ) # pragma: no cover 373 | response = requests.get( 374 | url, 375 | allow_redirects=True, 376 | headers={ # pragma: no cover 377 | "Accept": "text/html,application/xhtml+xml,application/xml", # pragma: no cover 378 | "Accept-Language": f"{language}", # pragma: no cover 379 | }, 380 | ) # pragma: no cover 381 | html = response.content.decode("utf-8") # pragma: no cover 382 | return html # pragma: no cover 383 | 384 | 385 | def get_tag_name(raw_tag_name: str) -> str: 386 | """Get the tag name. 387 | 388 | Parameters 389 | ---------- 390 | raw_tag_name : str 391 | The original tag name. 392 | 393 | Returns 394 | ------- 395 | str 396 | The parsed tag name. 397 | 398 | Examples 399 | -------- 400 | >>> get_tag_name('tag}test') 401 | 'test' 402 | """ 403 | return raw_tag_name.split("}")[1] if "}" in raw_tag_name else raw_tag_name 404 | 405 | 406 | def parse_modifiers( 407 | child: ETree.Element, ref: list = None, context: dict = None 408 | ) -> list: 409 | """Parse modifiers. 410 | 411 | Parameters 412 | ---------- 413 | child : xml.etree.ElementTree.Element 414 | XML tree. 415 | ref : list 416 | References. 417 | context : dict 418 | Context. 419 | 420 | Returns 421 | ------- 422 | list 423 | Results. 424 | 425 | Examples 426 | -------- 427 | >>> parse_modifiers(ETree.fromstring('
<p class="italic">Text</p>
')) 428 | [{'text': 'Text', 'type': 'text', 'modifier': 'italic', 'ref': [], 'context': {}}] 429 | >>> parse_modifiers(ETree.fromstring('
<p class="signatory">Text</p>
')) 430 | [{'text': 'Text', 'type': 'text', 'modifier': 'signatory', 'ref': [], 'context': {}}] 431 | >>> parse_modifiers(ETree.fromstring('
<p class="note">Text</p>
')) 432 | [{'text': 'Text', 'type': 'text', 'modifier': 'note', 'ref': [], 'context': {}}] 433 | >>> parse_modifiers(ETree.fromstring('
<p class="other">Text</p>
')) 434 | [] 435 | """ 436 | ref = [] if ref is None else ref 437 | context = {} if context is None else context 438 | output = [] 439 | new_context = context.copy() 440 | if child.attrib["class"] == "italic": 441 | output.append( 442 | { 443 | "text": _get_text(child), 444 | "type": "text", 445 | "modifier": "italic", 446 | "ref": ref, 447 | "context": new_context.copy(), 448 | } 449 | ) 450 | elif child.attrib["class"] == "signatory": 451 | output.append( 452 | { 453 | "text": _get_text(child), 454 | "type": "text", 455 | "modifier": "signatory", 456 | "ref": ref, 457 | "context": new_context.copy(), 458 | } 459 | ) 460 | elif child.attrib["class"] == "note": 461 | output.append( 462 | { 463 | "text": _get_text(child), 464 | "type": "text", 465 | "modifier": "note", 466 | "ref": ref, 467 | "context": new_context.copy(), 468 | } 469 | ) 470 | return output 471 | 472 | 473 | def _get_text(child: ETree.Element) -> str: 474 | """Get text. 475 | 476 | Parameters 477 | ---------- 478 | child : xml.etree.ElementTree.Element 479 | XML tree. 480 | 481 | Returns 482 | ------- 483 | str 484 | Text. 485 | 486 | Examples 487 | -------- 488 | >>> _get_text(ETree.fromstring('
<p>Text</p>
')) 489 | 'Text' 490 | >>> _get_text(ETree.fromstring('
<div><p>Text</p></div>
'))
491 | 'Text'
492 | """
493 | if len(child) == 1:
494 | return _get_text(child[0])
495 | if child.text is not None:
496 | return child.text.strip()
497 | return ""
498 |
499 |
500 | def parse_span(child: ETree.Element, ref: list = None, context: dict = None) -> list:
501 | """Parse a <p> or <span> tag.
502 |
503 | Parameters
504 | ----------
505 | child : xml.etree.ElementTree.Element
506 | XML tree.
507 | ref : list
508 | References.
509 | context : dict
510 | Context.
511 |
512 | Returns
513 | -------
514 | list
515 | Results.
516 |
517 | Examples
518 | --------
519 | >>> parse_span(ETree.fromstring('
<p class="doc-ti">Text</p>
')) 520 | [{'text': 'Text', 'type': 'doc-title', 'ref': [], 'context': {'document': 'Text'}}] 521 | >>> parse_span(ETree.fromstring('
<p class="sti-art">Text</p>
')) 522 | [{'text': 'Text', 'type': 'art-subtitle', 'ref': [], 'context': {'article_subtitle': 'Text'}}] 523 | >>> parse_span(ETree.fromstring('
<p class="ti-art">Text</p>
')) 524 | [{'text': 'Text', 'type': 'art-title', 'ref': [], 'context': {'article': 'Text'}}] 525 | >>> parse_span(ETree.fromstring('
<p class="ti-grseq-1">Text</p>
')) 526 | [{'text': 'Text', 'type': 'group-title', 'ref': [], 'context': {}}] 527 | >>> parse_span(ETree.fromstring('
<span class="ti-grseq-2">Text</span>
')) 528 | [{'text': 'Text', 'type': 'group-title', 'ref': [], 'context': {}}] 529 | >>> parse_span(ETree.fromstring('
<p class="ti-section-1">Text</p>
')) 530 | [{'text': 'Text', 'type': 'section-title', 'ref': [], 'context': {}}] 531 | >>> parse_span(ETree.fromstring('
<p class="normal">1. Text</p>
')) 532 | [{'text': 'Text', 'type': 'text', 'ref': [], 'context': {'paragraph': '1'}}] 533 | >>> parse_span(ETree.fromstring('
<p class="italic">Text</p>
')) 534 | [{'text': 'Text', 'type': 'text', 'modifier': 'italic', 'ref': [], 'context': {}}] 535 | >>> parse_span(ETree.fromstring('
<p class="other">Text</p>
'))
536 | []
537 | """
538 | ref = [] if ref is None else ref
539 | context = {} if context is None else context
540 | output = []
541 | if "class" not in child.attrib:
542 | return output
543 | if child.attrib["class"] == "doc-ti":
544 | if "document" not in context:
545 | context["document"] = ""
546 | context["document"] += _get_text(child)
547 | output.append(
548 | {
549 | "text": _get_text(child),
550 | "type": "doc-title",
551 | "ref": ref,
552 | "context": context.copy(),
553 | }
554 | )
555 | elif child.attrib["class"] == "sti-art":
556 | context["article_subtitle"] = _get_text(child)
557 | output.append(
558 | {
559 | "text": _get_text(child),
560 | "type": "art-subtitle",
561 | "ref": ref,
562 | "context": context.copy(),
563 | }
564 | )
565 | elif child.attrib["class"] == "ti-art":
566 | context["article"] = _get_text(child).replace("Article", "").strip()
567 | output.append(
568 | {
569 | "text": _get_text(child),
570 | "type": "art-title",
571 | "ref": ref,
572 | "context": context.copy(),
573 | }
574 | )
575 | elif child.attrib["class"].startswith("ti-grseq-"):
576 | output.append(
577 | {
578 | "text": _get_text(child),
579 | "type": "group-title",
580 | "ref": ref,
581 | "context": context.copy(),
582 | }
583 | )
584 | context["group"] = _get_text(child)
585 | elif child.attrib["class"].startswith("ti-section-"):
586 | output.append(
587 | {
588 | "text": _get_text(child),
589 | "type": "section-title",
590 | "ref": ref,
591 | "context": context.copy(),
592 | }
593 | )
594 | context["section"] = _get_text(child)
595 | elif child.attrib["class"] == "normal":
596 | text = _get_text(child)
597 | if re.match("^[0-9]+[.]", text):
598 | context["paragraph"] = text.split(".")[0]
599 | text = ".".join(text.split(".")[1:]).strip()
600 | output.append(
601 | {"text": text, "type": "text", "ref": ref, "context": context.copy()}
602 | )
603 | else:
604 | output.extend(parse_modifiers(child, ref, context))
605 | return output
606 |
607 |
608 | def parse_article(tree: ETree.Element, ref: list = None, context: dict = None) -> list:
609 | """Parse an article.
610 |
611 | Parameters
612 | ----------
613 | tree : xml.etree.ElementTree.Element
614 | XML tree.
615 | ref : list
616 | References.
617 | context : dict
618 | Context.
619 |
620 | Returns
621 | -------
622 | list
623 | Results.
624 |
625 | Examples
626 | --------
627 | >>> parse_article(ETree.fromstring('<div><a>Link</a></div>'))
628 | [{'text': 'Link', 'type': 'link', 'ref': [], 'context': {}}]
629 | >>> parse_article(ETree.fromstring('
<div><p class="doc-ti">Text</p></div>
')) 630 | [{'text': 'Text', 'type': 'doc-title', 'ref': [], 'context': {'document': 'Text'}}] 631 | >>> parse_article(ETree.fromstring('
<div><table><tbody><tr><td>1</td><td>2</td></tr></tbody></table></div>
')) 632 | [] 633 | >>> parse_article(ETree.fromstring('
<div><head>Text</head></div>
'))
634 | []
635 | >>> parse_article(ETree.fromstring('<head>Text</head>'))
636 | []
637 | >>> parse_article(ETree.fromstring('<hr>Text</hr>'))
638 | []
639 | """
640 | namespaces = {"html": "http://www.w3.org/1999/xhtml"}
641 | ref = [] if ref is None else ref
642 | context = {} if context is None else context
643 | output = []
644 | new_context = context.copy()
645 | for child in tree:
646 | if get_tag_name(child.tag) in ["a"]:
647 | output.append(
648 | {
649 | "text": _get_text(child),
650 | "type": "link",
651 | "ref": ref,
652 | "context": new_context.copy(),
653 | }
654 | )
655 | elif get_tag_name(child.tag) in ["p", "span"]:
656 | output.extend(parse_span(child, ref, new_context))
657 | elif get_tag_name(child.tag) == "table":
658 | results = child.findall(
659 | "html:tbody/html:tr/html:td", namespaces=namespaces
660 | ) + child.findall("tbody/tr/td", namespaces=namespaces)
661 | if (
662 | len(results) == 2
663 | and len(results[0]) == 1
664 | and get_tag_name(results[0][0].tag) == "p"
665 | ):
666 | key = None
667 | for subchild in results[0]:
668 | key = _get_text(subchild)
669 | output.extend(parse_article(results[1], ref + [key], new_context))
670 | else:
671 | pass
672 | elif get_tag_name(child.tag) == "div":
673 | output.extend(parse_article(child, ref, new_context))
674 | elif get_tag_name(child.tag) in ["head", "hr"]:
675 | pass
676 | elif get_tag_name(child.tag) == "body":
677 | output.extend(parse_article(child, ref, context))
678 | return output
679 |
680 |
681 | def parse_html(html: str) -> pd.DataFrame:
682 | """Parse EUR-Lex HTML into a DataFrame.
683 |
684 | Parameters
685 | ----------
686 | html : str
687 | The HTML to parse.
688 |
689 | Returns
690 | -------
691 | pd.DataFrame
692 | The parsed DataFrame.
693 |
694 | Examples
695 | --------
696 | >>> parse_html('
<div><p class="normal">Text</p></div>
').to_dict(orient='records') 697 | [{'text': 'Text', 'type': 'text', 'ref': [], 'context': {}}] 698 | >>> parse_html('
<div><p class="doc-ti">Text</p></div>
')).to_dict(orient='records')
699 | []
700 | >>> parse_html('<malformed').to_dict(orient='records')
701 | []
702 | >>> parse_html('
<div><p class="doc-ti">ANNEX</p><p class="ti-grseq-1">Group</p><p class="normal">Text</p></div>
').to_dict(orient='records')
703 | [{'text': 'Text', 'type': 'text', 'ref': [], 'context': {'document': 'ANNEX', 'group': 'Group'}, 'document': 'ANNEX', 'group': 'Group'}]
704 | """
705 | try:
706 | tree = ETree.fromstring(html)
707 | except ETree.ParseError:
708 | return pd.DataFrame()
709 | records = []
710 | for item in parse_article(tree):
711 | for key, value in item["context"].items():
712 | item[key] = value
713 | records.append(item)
714 | df = pd.DataFrame.from_records(records)
715 | df = df[df.type == "text"] if "type" in df.columns else df
716 | return df
717 |
718 |
719 | def get_regulations(limit: int = -1, shuffle: bool = False) -> list:
720 | """Retrieve regulations from EUR-Lex.
721 |
722 | Parameters
723 | ----------
724 | limit : int
725 | The maximum number of regulations to retrieve (default: no limit).
726 | shuffle : bool
727 | Whether to shuffle the retrieved regulations (default: False).
728 |
729 | Returns
730 | -------
731 | list
732 | A list of CELLAR IDs.
733 | """
734 | query = (
735 | "select ?doc where {?doc cdm:work_has_resource-type "
736 | "<http://publications.europa.eu/resource/authority/resource-type/REG> . }"
737 | + (" order by rand()" if shuffle else "")
738 | + (" limit " + str(limit) if limit > 0 else "")
739 | ) # pragma: no cover
740 | results = run_query(prepend_prefixes(query)) # pragma: no cover
741 | cellar_ids = [] # pragma: no cover
742 | for result in results["results"]["bindings"]: # pragma: no cover
743 | cellar_ids.append(result["doc"]["value"].split("/")[-1]) # pragma: no cover
744 | return cellar_ids # pragma: no cover
745 |
746 |
747 | def get_documents(types: List[str] = ["REG"], limit: int = -1) -> List[Dict[str, str]]:
748 | """Retrieve a list of documents of the specified types from EUR-Lex that have a CELEX number, as a list of dicts.
749 |
750 | Parameters
751 | ----------
752 | types : List[str]
753 | The document types to return, as recognized by the SPARQL API.
754 | Examples: ["DIR", "DIR_IMPL", "DIR_DEL", "REG", "REG_IMPL", "REG_FINANC", "REG_DEL"]
755 | limit : int
756 | The maximum number of documents to retrieve (default: no limit).
757 |
758 | Returns
759 | -------
760 | List[dict]
761 | A list of dicts containing the publication date, publication URL, CELEX number and type of each document.
762 | """
763 | query = "select distinct ?doc ?type ?celex ?date\n"
764 | query += "where{ ?doc cdm:work_has_resource-type ?type.\n"
765 | query += " FILTER(\n "
766 | query += " ||\n ".join(
767 | map(
768 | lambda type: f"?type=<http://publications.europa.eu/resource/authority/resource-type/{type}>",
769 | types,
770 | )
771 | )
772 | query += "\n )\n"
773 | query += " FILTER(BOUND(?celex))\n"
774 | query += " OPTIONAL{?doc cdm:resource_legal_id_celex ?celex.}\n"
775 | query += " OPTIONAL{?doc cdm:work_date_document ?date.}\n"
776 | query += "}\n"
777 | if limit > 0:
778 | query += "limit " + str(limit)
779 |
780 | results = []
781 | query_results = run_query(prepend_prefixes(query))
782 |
783 | for result in query_results["results"]["bindings"]:
784 | results.append(
785 | {
786 | "celex": result["celex"]["value"],
787 | "date": result["date"]["value"],
788 | "link": result["doc"]["value"],
789 | "type": result["type"]["value"].split("/")[-1],
790 | }
791 | )
792 |
793 | return results
794 |
795 |
796 | def process_paragraphs(paragraphs: list) -> pd.DataFrame:
797 | """Process the paragraphs.
798 |
799 | Parameters
800 | ----------
801 | paragraphs : list
802 | The list of currently downloaded paragraphs.
803 |
804 | Returns
805 | -------
806 | pd.DataFrame
807 | A DataFrame containing the processed paragraphs.
808 | 809 | Examples 810 | -------- 811 | >>> process_paragraphs([]).to_dict(orient='records') 812 | [] 813 | >>> process_paragraphs([{'celex_id': '1', 'paragraph': 'Done at 2021-11-25.'}]).to_dict(orient='records') 814 | [] 815 | """ 816 | df_paragraphs = pd.DataFrame.from_records(paragraphs) 817 | if "paragraph" not in df_paragraphs.columns: 818 | return df_paragraphs 819 | df_paragraphs = ( 820 | df_paragraphs[~df_paragraphs.paragraph.str.startswith("Done at")] 821 | if len(df_paragraphs) 822 | else df_paragraphs 823 | ) 824 | df_paragraphs = ( 825 | df_paragraphs[~df_paragraphs.paragraph.str.startswith("It shall apply from")] 826 | if len(df_paragraphs) 827 | else df_paragraphs 828 | ) 829 | df_paragraphs = ( 830 | df_paragraphs[~df_paragraphs.paragraph.str.contains("is replaced by")] 831 | if len(df_paragraphs) 832 | else df_paragraphs 833 | ) 834 | df_paragraphs = ( 835 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("is updated.")] 836 | if len(df_paragraphs) 837 | else df_paragraphs 838 | ) 839 | df_paragraphs = ( 840 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("is deleted.")] 841 | if len(df_paragraphs) 842 | else df_paragraphs 843 | ) 844 | df_paragraphs = ( 845 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("is removed.")] 846 | if len(df_paragraphs) 847 | else df_paragraphs 848 | ) 849 | df_paragraphs = ( 850 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("is hereby repealed.")] 851 | if len(df_paragraphs) 852 | else df_paragraphs 853 | ) 854 | df_paragraphs = ( 855 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("are updated.")] 856 | if len(df_paragraphs) 857 | else df_paragraphs 858 | ) 859 | df_paragraphs = ( 860 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("are deleted.")] 861 | if len(df_paragraphs) 862 | else df_paragraphs 863 | ) 864 | df_paragraphs = ( 865 | df_paragraphs[~df_paragraphs.paragraph.str.endswith("are removed.")] 866 | if len(df_paragraphs) 867 | else df_paragraphs 868 | ) 869 | df_paragraphs = ( 870 | df_paragraphs[~df_paragraphs.paragraph.str.contains("is amended ")] 871 | if len(df_paragraphs) 872 | else df_paragraphs 873 | ) 874 | df_paragraphs = ( 875 | df_paragraphs[~df_paragraphs.paragraph.str.contains("is repealed with")] 876 | if len(df_paragraphs) 877 | else df_paragraphs 878 | ) 879 | df_paragraphs = ( 880 | df_paragraphs[df_paragraphs.paragraph.str.endswith(".")] 881 | if len(df_paragraphs) 882 | else df_paragraphs 883 | ) 884 | df_paragraphs = ( 885 | df_paragraphs[ 886 | df_paragraphs.paragraph.apply(lambda text: text[0].upper() == text[0]) 887 | ] 888 | if len(df_paragraphs) 889 | else df_paragraphs 890 | ) 891 | df_paragraphs = ( 892 | df_paragraphs[~df_paragraphs.paragraph.str.contains("‘")] 893 | if len(df_paragraphs) 894 | else df_paragraphs 895 | ) 896 | df_paragraphs = ( 897 | df_paragraphs[~df_paragraphs.paragraph.str.contains("’")] 898 | if len(df_paragraphs) 899 | else df_paragraphs 900 | ) 901 | df_paragraphs = ( 902 | df_paragraphs[df_paragraphs.paragraph.apply(len) >= 100] 903 | if len(df_paragraphs) 904 | else df_paragraphs 905 | ) 906 | df_paragraphs = ( 907 | df_paragraphs.drop_duplicates("paragraph") 908 | if len(df_paragraphs) 909 | else df_paragraphs 910 | ) 911 | return df_paragraphs 912 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.4.0 2 | sparqlwrapper==1.8.5 3 | pandas==1.2.4 4 | rdflib==6.0.2 5 | requests==2.25.1 6 | 
bs4==0.0.1 7 | beautifulsoup4==4.9.3 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from pathlib import Path 3 | 4 | this_directory = Path(__file__).parent 5 | long_description = (this_directory / "README.md").read_text() 6 | 7 | setuptools.setup( 8 | name="eurlex", 9 | version="0.1.4", 10 | author="K.M.J. Jacobs", 11 | author_email="kmj.jacobs@maastrichtuniversity.nl", 12 | description="An EUR-Lex parser for Python.", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/kevin91nl/eurlex", 16 | packages=setuptools.find_packages(), 17 | include_package_data=True, 18 | classifiers=[], 19 | python_requires=">=3.6", 20 | # Load the requirements.txt 21 | install_requires=[ 22 | line.strip() 23 | for line in (this_directory / "requirements.txt").read_text().split("\n") 24 | if line.strip() 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | def _merge_dicts(a, b, path=None): 2 | if path is None: 3 | path = [] 4 | for key in b: 5 | if key in a: 6 | if isinstance(a[key], dict) and isinstance(b[key], dict): 7 | _merge_dicts(a[key], b[key], path + [str(key)]) 8 | elif a[key] == b[key]: 9 | pass 10 | else: 11 | if a[key] is None: 12 | a[key] = b[key] 13 | if b[key] is None: 14 | b[key] = a[key] 15 | else: 16 | a[key] = b[key] 17 | return a 18 | 19 | 20 | def _convert_outline_item(outline_item): 21 | """Convert an outline item. 22 | 23 | The following input: 24 | ``` 25 | ["1", "a", "i."] 26 | ``` 27 | 28 | Is converted into the following output: 29 | ``` 30 | { 31 | "1": { 32 | "a": { 33 | "i.": None 34 | } 35 | } 36 | } 37 | ``` 38 | """ 39 | if len(outline_item) == 1: 40 | return {outline_item[0]: None} 41 | else: 42 | node, remainder = outline_item[0], outline_item[1:] 43 | return {node: _convert_outline_item(remainder)} 44 | 45 | 46 | def _convert_outline(outline_as_tuples): 47 | """Convert an outline as tuples into a tree format. 
48 |
49 | Given the following input:
50 |
51 | ```
52 | [
53 | ["1", "a", "i."],
54 | ["1", "a", "ii."],
55 | ["1", "b", "i."],
56 | ["2"],
57 | ["3", "a"]
58 | ]
59 | ```
60 |
61 | Generate the following output (all leaves are encoded as None):
62 |
63 | ```
64 | {"1": {"a": {"i.": None, "ii.": None}, "b": {"i.": None}}, "2": None, "3": {"a": None}}
65 | ```
66 | """
67 | tree = dict()
68 | for item in outline_as_tuples:
69 | tree = _merge_dicts(tree, _convert_outline_item(item))
70 | return tree
71 |
--------------------------------------------------------------------------------
/tests/test_parsing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from eurlex import get_html_by_celex_id, parse_html
3 | import pandas as pd
4 |
5 | from tests import _convert_outline
6 |
7 |
8 | @pytest.mark.parametrize(
9 | "celex_id,header,ref,expected_text_or_text_count",
10 | [
11 | [
12 | "32019R0945",
13 | "Requirements for a class C0 Unmanned aircraft system",
14 | ["(8)", "(a)"],
15 | 6,
16 | ],
17 | [
18 | "32019R0945",
19 | "Requirements for a class C0 Unmanned aircraft system",
20 | ["(1)"],
21 | "have an MTOM of less than 250 g, including payload;",
22 | ],
23 | [
24 | "32019R0945",
25 | "Definitions",
26 | ["(1)"],
27 | "‘unmanned aircraft’ (‘UA’) means any aircraft operating or designed to operate autonomously or to be piloted remotely without a pilot on board;",
28 | ],
29 | [
30 | "32019R0947",
31 | "UAS.OPEN.060 Responsibilities of the remote pilot",
32 | ["(2)", "(f)"],
33 | "comply with the operator's procedures when available.",
34 | ],
35 | ],
36 | )
37 | def test_paragraph_content(celex_id, header, ref, expected_text_or_text_count):
38 | html = get_html_by_celex_id(celex_id)
39 | df = parse_html(html)
40 | expected_text = None
41 | expected_text_count = 1
42 | if type(expected_text_or_text_count) == int:
43 | expected_text_count = expected_text_or_text_count
44 | if type(expected_text_or_text_count) == str:
45 | expected_text = expected_text_or_text_count
46 | assert df.shape[0] > 0, "No rows found for CELEX ID {}".format(celex_id)
47 | if "article_subtitle" not in df:
48 | df["article_subtitle"] = ""
49 | df = df[(df.group == header) | (df.article_subtitle == header)]
50 | assert df.shape[0] > 0, "No rows found for header {}".format(header)
51 | df = df[df.ref.apply("".join).str.startswith("".join(ref))]
52 | assert df.shape[0] > 0, "No rows found for reference {}".format(ref)
53 | assert df.shape[0] == expected_text_count, "Expected {} texts, but found {}".format(
54 | expected_text_count, df.shape[0]
55 | )
56 | assert (
57 | expected_text is None or df.text.values[0] == expected_text
58 | ), "Text is not as expected"
59 |
60 |
61 | @pytest.mark.parametrize(
62 | "celex_id,header,expected_outline",
63 | [
64 | (
65 | "32019R0947",
66 | "UAS.SPEC.020 Operational declaration",
67 | {
68 | "(1)": {
69 | "(a)": {"i.": None, "ii.": None, "iii.": None, "iv.": None},
70 | "(b)": {"i.": None, "ii.": None},
71 | },
72 | "(2)": {"(a)": None, "(b)": None, "(c)": None, "(d)": None},
73 | "(3)": None,
74 | "(4)": None,
75 | "(5)": None,
76 | "(6)": None,
77 | },
78 | )
79 | ],
80 | )
81 | def test_outline(celex_id, header, expected_outline):
82 | html = get_html_by_celex_id(celex_id)
83 | df = parse_html(html)
84 | df = df[(df.group == header) | (df.article_subtitle == header)]
85 | assert (
86 | _convert_outline(df.ref.tolist()) == expected_outline
87 | ), "Outline is not as expected"
88 |
89 |
90 | @pytest.mark.parametrize(
91 |
"celex_id,expected_articles", [("32015R0220", 16), ("32019R0947", 23)] 92 | ) 93 | def test_article_count(celex_id, expected_articles): 94 | html = get_html_by_celex_id(celex_id) 95 | df = parse_html(html) 96 | num_unique_articles = df[~pd.isna(df.article)].article.unique().shape[0] 97 | assert ( 98 | num_unique_articles == expected_articles 99 | ), f"Wrong number of articles (found: {num_unique_articles}, expected: {expected_articles})" 100 | --------------------------------------------------------------------------------