├── .coveragerc
├── .github
    └── workflows
    │   └── building.yaml
├── .gitignore
├── .pypirc
├── Pipfile
├── README.md
├── eurlex
    └── __init__.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    └── test_parsing.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = setup.py


--------------------------------------------------------------------------------
/.github/workflows/building.yaml:
--------------------------------------------------------------------------------
 1 | name: Building
 2 | on: [push]
 3 | 
 4 | jobs:
 5 |   build:
 6 |     name: Run Python Tests and upload to PyPI
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     steps:
10 |       - uses: actions/checkout@v2
11 | 
12 |       - name: Set up Python 3.8
13 |         uses: actions/setup-python@v2
14 |         with:
15 |           python-version: 3.8
16 | 
17 |       - name: Install pytest
18 |         run: |
19 |           pip3 install pytest==6.2.5 pytest-cov==3.0.0
20 | 
21 |       - name: Install Python dependencies
22 |         run: |
23 |           python3 -m pip install --upgrade pip
24 |           pip3 install -r requirements.txt
25 | 
26 |       - name: Test with pytest
27 |         run: |
28 |           pytest . --doctest-modules --exitfirst --verbose --failed-first \
29 |           --cov=. --cov-report html
30 | 
31 |       - name: Build and Upload to TestPyPI
32 |         continue-on-error: true
33 |         run: |
34 |           pip3 install wheel twine
35 |           python3 setup.py sdist bdist_wheel
36 |           python3 -m twine upload dist/*
37 |         env:
38 |           TWINE_USERNAME: __token__
39 |           TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }}
40 |           TWINE_REPOSITORY: pypi
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by https://www.toptal.com/developers/gitignore/api/python
  2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
  3 | 
  4 | ### Python ###
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | cover/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | .pybuilder/
 80 | target/
 81 | 
 82 | # Jupyter Notebook
 83 | .ipynb_checkpoints
 84 | 
 85 | # IPython
 86 | profile_default/
 87 | ipython_config.py
 88 | 
 89 | # pyenv
 90 | #   For a library or package, you might want to ignore these files since the code is
 91 | #   intended to run in multiple environments; otherwise, check them in:
 92 | # .python-version
 93 | 
 94 | # pipenv
 95 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 96 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 98 | #   install all needed dependencies.
 99 | #Pipfile.lock
100 | 
101 | # poetry
102 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
104 | #   commonly ignored for libraries.
105 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 | 
108 | # pdm
109 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | #   in version control.
113 | #   https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 
166 | # End of https://www.toptal.com/developers/gitignore/api/python
167 | 
168 | # Custom additions:
169 | # Mac OS X
170 | .DS_Store
171 | # Output data
172 | *.json
173 | *.html


--------------------------------------------------------------------------------
/.pypirc:
--------------------------------------------------------------------------------
 1 | [distutils]
 2 | index-servers =
 3 |     pypi
 4 |     testpypi
 5 | 
 6 | [pypi]
 7 | repository = https://upload.pypi.org/legacy/
 8 | 
 9 | [testpypi]
10 | repository = https://test.pypi.org/legacy/


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | argparse = "==1.4.0"
 8 | sparqlwrapper = "==1.8.5"
 9 | pandas = "==1.2.4"
10 | rdflib = "==6.0.2"
11 | requests = "==2.25.1"
12 | bs4 = "==0.0.1"
13 | beautifulsoup4 = "==4.9.3"
14 | 
15 | [dev-packages]
16 | 
17 | [requires]
18 | python_version = "3.12"
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # EUR-Lex Parser
 2 | 
 3 | <p>
 4 |     <a href="https://github.com/kevin91nl/eurlex/actions/workflows/building.yaml"><img src="https://github.com/kevin91nl/eurlex/actions/workflows/building.yaml/badge.svg" alt="Building" height="18"></a>
 5 |     <a href="https://badge.fury.io/py/eurlex"><img src="https://badge.fury.io/py/eurlex.svg" alt="PyPI version" height="18"></a>
 6 |     <a href=https://github.com/ambv/black><img src="https://img.shields.io/badge/code%20style-black-000000.svg" height="18"></a>
 7 | </p>
 8 | 
 9 | An EUR-Lex parser for Python.
10 | 
11 | ## Usage
12 | 
13 | You can install this package as follows:
14 | 
15 | ```bash
16 | pip install -U eurlex
17 | ```
18 | 
19 | After installing this package, you can download and parse any document from EUR-Lex. For example, the [32019R0947 regulation](https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32019R0947):
20 | 
21 | ```python
22 | from eurlex import get_html_by_celex_id, parse_html
23 | 
24 | # Retrieve and parse the document with CELEX ID "32019R0947" into a Pandas DataFrame
25 | celex_id = "32019R0947"
26 | html = get_html_by_celex_id(celex_id)
27 | df = parse_html(html)
28 | 
29 | # Get the first line of Article 1
30 | df_article_1 = df[df.article == "1"]
31 | df_article_1_line_1 = df_article_1.iloc[0]
32 | 
33 | # Display the subtitle of Article 1
34 | print(df_article_1_line_1.article_subtitle)
35 | >>> "Subject matter"
36 | 
37 | # Display the corresponding text
38 | print(df_article_1_line_1.text)
39 | >>> "This Regulation lays down detailed provisions for the operation of unmanned aircraft systems as well as for personnel, including remote pilots and organisations involved in those operations."
40 | ```
41 | 
42 | Every document on EUR-Lex displays a CELEX number at the top of the page. More information on CELEX numbers can be found on the [EUR-Lex website](https://eur-lex.europa.eu/content/tools/eur-lex-celex-infographic-A3.pdf).
43 | 
44 | For more information about the methods in this package, see the [unit tests](https://github.com/kevin91nl/eurlex/tree/main/tests) and [doctests](https://github.com/kevin91nl/eurlex/blob/main/eurlex/__init__.py).
45 | 
46 | ### Data Structure
47 | 
48 | The following columns are available in the parsed dataframe:
49 | 
50 | - `text`: The text
51 | - `type`: The type of the data
52 | - `document`: The document in which the text is found
53 | - `article`: The article in which the text is found
54 | - `article_subtitle`: The subtitle of the article (when available)
55 | - `ref`: The indentation level of the text within the article (e.g. `["(1)", "(a)"]` when the text is found under paragraph `(1)`, subparagraph `(a)`)
56 | 
57 | In some cases, additional fields are available. For example, the `group` field which contains the bold text under which a text is found.
58 | 
59 | ## Code Contribution
60 | 
61 | Feel free to send any issues, ideas or pull requests.


--------------------------------------------------------------------------------
/eurlex/__init__.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import rdflib
  3 | import requests
  4 | from SPARQLWrapper import SPARQLWrapper, JSON
  5 | import pandas as pd
  6 | import datetime
  7 | from xml.etree import ElementTree as ETree
  8 | from typing import List, Dict
  9 | 
 10 | 
 11 | def get_prefixes() -> dict:
 12 |     """Get a mapping from prefixes to URLs.
 13 | 
 14 |     Returns
 15 |     -------
 16 |     dict
 17 |         A mapping from prefixes to URLs.
 18 |     """
 19 |     return {
 20 |         "cdm": "http://publications.europa.eu/ontology/cdm#",
 21 |         "celex": "http://publications.europa.eu/resource/celex/",
 22 |         "owl": "http://www.w3.org/2002/07/owl#",
 23 |         "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
 24 |         "cellar": "http://publications.europa.eu/resource/cellar/",
 25 |         "skos": "http://www.w3.org/2004/02/skos/core#",
 26 |     }
 27 | 
 28 | 
 29 | def parse_article_paragraphs(article: str) -> dict:
 30 |     """Convert an article found on EUR-Lex to paragraphs.
 31 | 
 32 |     Parameters
 33 |     ----------
 34 |     article : str
 35 |         The article to parse.
 36 | 
 37 |     Returns
 38 |     -------
 39 |     dict
 40 |         Mapping from paragraph identifier to paragraph.
 41 | 
 42 |     Examples
 43 |     --------
 44 |     The following example parses an article into paragraphs. Make sure to use newlines instead of the 5 spaces which
 45 |     are used in the example.
 46 |     >>> parse_article_paragraphs("This is a test with a few paragraphs:     1. The first one     2. The second one")
 47 |     {None: 'This is a test with a few paragraphs:', '1.': 'The first one', '2.': 'The second one'}
 48 |     >>> parse_article_paragraphs("This is a test with a few paragraphs:     (1) The first one     (2) The second one")
 49 |     {None: 'This is a test with a few paragraphs:', '(1)': 'The first one', '(2)': 'The second one'}
 50 |     """
 51 |     paragraphs = dict()
 52 |     paragraph = None
 53 |     article = article.replace("     ", "\n")
 54 |     for line in article.split("\n"):
 55 |         match = re.match(r"^([0-9]+)[.]", line)
 56 |         if match:
 57 |             paragraph = match.group(0)
 58 |             line = ".".join(line.split(".")[1:]).strip()
 59 |         else:
 60 |             match = re.match(r"^[(]([0-9]+)[)]", line)
 61 |             if match:
 62 |                 paragraph = match.group(0)
 63 |                 line = ")".join(line.split(")")[1:]).strip()
 64 |         if paragraph not in paragraphs:
 65 |             paragraphs[paragraph] = []
 66 |         paragraphs[paragraph].append(line)
 67 |     paragraphs = {
 68 |         paragraph: "\n".join(paragraphs[paragraph]).strip() for paragraph in paragraphs
 69 |     }
 70 |     return paragraphs
 71 | 
 72 | 
 73 | def prepend_prefixes(query: str) -> str:
 74 |     """Prepend a query with prefixes.
 75 | 
 76 |     Parameters
 77 |     ----------
 78 |     query : str
 79 |         The query to prepend.
 80 | 
 81 |     Returns
 82 |     -------
 83 |     str
 84 |         Query prepended with the prefixes.
 85 | 
 86 |     Examples
 87 |     --------
 88 |     >>> 'prefix rdf' in prepend_prefixes("SELECT ?name WHERE { ?person rdf:name ?name }")
 89 |     True
 90 |     """
 91 |     return (
 92 |         "\n".join(
 93 |             [
 94 |                 "prefix {}: <{}>".format(prefix, url)
 95 |                 for prefix, url in get_prefixes().items()
 96 |             ]
 97 |         )
 98 |         + " "
 99 |         + query
100 |     )
101 | 
102 | 
103 | def run_query(query: str) -> dict:
104 |     """Run the SPARQL query on EUR-Lex.
105 | 
106 |     Parameters
107 |     ----------
108 |     query : str
109 |         The SPARQL query to run.
110 | 
111 |     Returns
112 |     -------
113 |     dict
114 |         A dictionary containing the results.
115 |     """
116 |     sparql = SPARQLWrapper(
117 |         "http://publications.europa.eu/webapi/rdf/sparql"
118 |     )  # pragma: no cover
119 |     sparql.setQuery(query)  # pragma: no cover
120 |     sparql.setReturnFormat(JSON)  # pragma: no cover
121 |     results = sparql.query().convert()  # pragma: no cover
122 |     return results  # pragma: no cover
123 | 
124 | 
125 | def convert_sparql_output_to_dataframe(sparql_results: dict) -> pd.DataFrame:
126 |     """Convert SPARQL output to a DataFrame.
127 | 
128 |     Parameters
129 |     ----------
130 |     sparql_results : dict
131 |         A dictionary containing the SPARQL results.
132 | 
133 |     Returns
134 |     -------
135 |     pd.DataFrame
136 |         The DataFrame representation of the SPARQL results.
137 | 
138 |     Examples
139 |     --------
140 |     >>> convert_sparql_output_to_dataframe({'results': {'bindings': [{'subject': {'value': 'cdm:test'}}]}}).to_dict()
141 |     {'subject': {0: 'cdm:test'}}
142 |     """
143 |     items = [
144 |         {key: simplify_iri(item[key]["value"]) for key in item.keys()}
145 |         for item in sparql_results["results"]["bindings"]
146 |     ]
147 |     return pd.DataFrame(items)
148 | 
149 | 
150 | def get_celex_dataframe(celex_id: str) -> pd.DataFrame:
151 |     """Get CELEX data delivered in a DataFrame.
152 | 
153 |     Parameters
154 |     ----------
155 |     celex_id : str
156 |         The CELEX ID to get the data for.
157 | 
158 |     Returns
159 |     -------
160 |     pd.DataFrame
161 |         A DataFrame containing the results.
162 |     """
163 |     graph = rdflib.Graph()  # pragma: no cover
164 |     results = graph.parse(
165 |         f"http://publications.europa.eu/resource/" f"celex/{str(celex_id)}?language=eng"
166 |     )  # pragma: no cover
167 |     items = [
168 |         {key: simplify_iri(item[key]) for key in range(len(item))} for item in results
169 |     ]  # pragma: no cover
170 |     df = pd.DataFrame(items)  # pragma: no cover
171 |     df.columns = ["s", "o", "p"]  # pragma: no cover
172 |     return df  # pragma: no cover
173 | 
174 | 
175 | def get_celex_id(
176 |     slash_notation: str, document_type: str = "R", sector_id: str = "3"
177 | ) -> str:
178 |     """Derive the CELEX ID from a slash notation like 2019/947.
179 | 
180 |     Parameters
181 |     ----------
182 |     slash_notation : str
183 |         The slash notation of the document (like 2019/947).
184 |     document_type : str
185 |         The type of the document (e.g. "R" for regulations).
186 |     sector_id : str
187 |         The sector ID (e.g. 3).
188 | 
189 |     Returns
190 |     -------
191 |     str
192 |         The CELEX ID
193 | 
194 |     Examples
195 |     --------
196 |     >>> get_celex_id('2019/947')
197 |     '32019R0947'
198 |     >>> get_celex_id('947/2019')
199 |     '32019R0947'
200 |     """
201 |     term1, term2 = slash_notation.split("/")
202 |     current_year = datetime.datetime.now().year
203 |     term1 = int(term1)
204 |     term2 = int(term2)
205 |     term1_is_year = 1800 <= term1 <= current_year
206 |     term2_is_year = 1800 <= term2 <= current_year
207 |     year = term2
208 |     document_id = term1
209 |     if term1_is_year and not term2_is_year:
210 |         year = term1
211 |         document_id = term2
212 |     if term2_is_year and not term1_is_year:
213 |         year = term2
214 |         document_id = term1
215 |     return "{}{}{}{}".format(
216 |         str(sector_id), year, document_type, str(document_id).zfill(4)
217 |     )
218 | 
219 | 
220 | def get_possible_celex_ids(
221 |     slash_notation: str, document_type: str = None, sector_id: str = None
222 | ) -> list:
223 |     """Get a list of possible CELEX IDs (given a slash notation like 2019/947).
224 | 
225 |     Parameters
226 |     ----------
227 |     slash_notation : str
228 |         The slash notation of the document (like 2019/947).
229 |     document_type : str
230 |         The type of the document (e.g. "R" for regulations).
231 |     sector_id : str
232 |         The sector ID (e.g. 3).
233 | 
234 |     Returns
235 |     -------
236 |     list
237 |         A list of possible CELEX IDs.
238 | 
239 |     Examples
240 |     --------
241 |     >>> '32019R0947' in get_possible_celex_ids("2019/947")
242 |     True
243 |     """
244 |     sector_ids = (
245 |         [str(i) for i in range(10)] + ["C", "E"]
246 |         if sector_id is None
247 |         else [str(sector_id)]
248 |     )
249 |     document_types = (
250 |         ["L", "R", "E", "PC", "DC", "SC", "JC", "CJ", "CC", "CO"]
251 |         if document_type is None
252 |         else [document_type]
253 |     )
254 |     possible_ids = []
255 |     for sector_id in sector_ids:
256 |         for document_type in document_types:
257 |             guess = get_celex_id(slash_notation, document_type, sector_id)
258 |             possible_ids.append(guess)
259 |     return possible_ids
260 | 
261 | 
262 | def guess_celex_ids_via_eurlex(
263 |     slash_notation: str, document_type: str = None, sector_id: str = None
264 | ) -> list:
265 |     """Guess CELEX IDs for a slash notation by looking it up via EUR-Lex.
266 | 
267 |     Parameters
268 |     ----------
269 |     slash_notation : str
270 |         The slash notation of the document (like 2019/947).
271 |     document_type : str
272 |         The type of the document (e.g. "R" for regulations).
273 |     sector_id : str
274 |         The sector ID (e.g. 3).
275 | 
276 |     Returns
277 |     -------
278 |     list
279 |         A list of possible CELEX IDs.
280 |     """
281 |     slash_notation = "/".join(slash_notation.split("/")[:2])  # pragma: no cover
282 |     queries = [
283 |         "{ ?s owl:sameAs celex:" + celex_id + " . ?s owl:sameAs ?o }"
284 |         for celex_id in get_possible_celex_ids(slash_notation, document_type, sector_id)
285 |     ]  # pragma: no cover
286 |     query = "SELECT * WHERE {" + " UNION ".join(queries) + "}"  # pragma: no cover
287 |     query = prepend_prefixes(query)  # pragma: no cover
288 |     results = run_query(query.strip())  # pragma: no cover
289 |     celex_ids = []  # pragma: no cover
290 |     for binding in results["results"]["bindings"]:  # pragma: no cover
291 |         if "/celex/" in binding["o"]["value"]:  # pragma: no cover
292 |             celex_id = binding["o"]["value"].split("/")[-1]  # pragma: no cover
293 |             celex_ids.append(celex_id)  # pragma: no cover
294 |     celex_ids = list(set(celex_ids))  # pragma: no cover
295 |     return celex_ids  # pragma: no cover
296 | 
297 | 
298 | def simplify_iri(iri: str) -> str:
299 |     """Simplify prefixes in an IRI.
300 | 
301 |     Parameters
302 |     ----------
303 |     iri : str
304 |         IRI to simplify.
305 | 
306 |     Returns
307 |     -------
308 |     str
309 |         Simplified version where all prefixes are replaced by their shortcuts.
310 | 
311 |     Examples
312 |     --------
313 |     >>> simplify_iri("http://publications.europa.eu/ontology/cdm#test")
314 |     'cdm:test'
315 |     >>> simplify_iri("cdm:test")
316 |     'cdm:test'
317 |     """
318 |     for prefix, url in get_prefixes().items():
319 |         if iri.startswith(url):
320 |             return prefix + ":" + iri[len(url) :]
321 |     return iri
322 | 
323 | 
324 | def get_html_by_cellar_id(cellar_id: str, language: str = "en") -> str:
325 |     """Retrieve HTML by CELLAR ID.
326 | 
327 |     Parameters
328 |     ----------
329 |     cellar_id : str
330 |         The CELLAR ID to find HTML for.
331 |     language : str
332 |         The language to retrieve the HTML in (default: "en").
333 | 
334 |     Returns
335 |     -------
336 |     str
337 |         HTML found using the CELLAR ID.
338 |     """
339 |     url = "http://publications.europa.eu/resource/cellar/" + str(  # pragma: no cover
340 |         cellar_id.split(":")[1] if ":" in cellar_id else cellar_id  # pragma: no cover
341 |     )  # pragma: no cover
342 |     response = requests.get(
343 |         url,
344 |         allow_redirects=True,
345 |         headers={  # pragma: no cover
346 |             "Accept": "text/html,application/xhtml+xml,application/xml",  # pragma: no cover
347 |             "Accept-Language": f"{language}",  # pragma: no cover
348 |         },
349 |     )  # pragma: no cover
350 |     html = response.content.decode("utf-8")  # pragma: no cover
351 |     return html  # pragma: no cover
352 | 
353 | 
354 | def get_html_by_celex_id(celex_id: str, language: str = "en") -> str:
355 |     """Retrieve HTML by CELEX ID.
356 | 
357 |     Parameters
358 |     ----------
359 |     celex_id : str
360 |         The CELEX ID to find HTML for.
361 |     language : str
362 |         The language to retrieve the HTML in (default: "en").
363 | 
364 | 
365 |     Returns
366 |     -------
367 |     str
368 |         HTML found using the CELEX ID.
369 |     """
370 |     url = "http://publications.europa.eu/resource/celex/" + str(
371 |         celex_id
372 |     )  # pragma: no cover
373 |     response = requests.get(
374 |         url,
375 |         allow_redirects=True,
376 |         headers={  # pragma: no cover
377 |             "Accept": "text/html,application/xhtml+xml,application/xml",  # pragma: no cover
378 |             "Accept-Language": f"{language}",  # pragma: no cover
379 |         },
380 |     )  # pragma: no cover
381 |     html = response.content.decode("utf-8")  # pragma: no cover
382 |     return html  # pragma: no cover
383 | 
384 | 
385 | def get_tag_name(raw_tag_name: str) -> str:
386 |     """Get the tag name.
387 | 
388 |     Parameters
389 |     ----------
390 |     raw_tag_name : str
391 |         The original tag name.
392 | 
393 |     Returns
394 |     -------
395 |     str
396 |         The parsed tag name.
397 | 
398 |     Examples
399 |     --------
400 |     >>> get_tag_name('tag}test')
401 |     'test'
402 |     """
403 |     return raw_tag_name.split("}")[1] if "}" in raw_tag_name else raw_tag_name
404 | 
405 | 
406 | def parse_modifiers(
407 |     child: ETree.Element, ref: list = None, context: dict = None
408 | ) -> list:
409 |     """Parse modifiers.
410 | 
411 |     Parameters
412 |     ----------
413 |     child : xml.etree.ElementTree.Element
414 |         XML tree.
415 |     ref : list
416 |         References.
417 |     context : dict
418 |         Context.
419 | 
420 |     Returns
421 |     -------
422 |     list
423 |         Results.
424 | 
425 |     Examples
426 |     --------
427 |     >>> parse_modifiers(ETree.fromstring('<p class="italic">Text</p>'))
428 |     [{'text': 'Text', 'type': 'text', 'modifier': 'italic', 'ref': [], 'context': {}}]
429 |     >>> parse_modifiers(ETree.fromstring('<p class="signatory">Text</p>'))
430 |     [{'text': 'Text', 'type': 'text', 'modifier': 'signatory', 'ref': [], 'context': {}}]
431 |     >>> parse_modifiers(ETree.fromstring('<p class="note">Text</p>'))
432 |     [{'text': 'Text', 'type': 'text', 'modifier': 'note', 'ref': [], 'context': {}}]
433 |     >>> parse_modifiers(ETree.fromstring('<p class="separator"></p>'))
434 |     []
435 |     """
436 |     ref = [] if ref is None else ref
437 |     context = {} if context is None else context
438 |     output = []
439 |     new_context = context.copy()
440 |     if child.attrib["class"] == "italic":
441 |         output.append(
442 |             {
443 |                 "text": _get_text(child),
444 |                 "type": "text",
445 |                 "modifier": "italic",
446 |                 "ref": ref,
447 |                 "context": new_context.copy(),
448 |             }
449 |         )
450 |     elif child.attrib["class"] == "signatory":
451 |         output.append(
452 |             {
453 |                 "text": _get_text(child),
454 |                 "type": "text",
455 |                 "modifier": "signatory",
456 |                 "ref": ref,
457 |                 "context": new_context.copy(),
458 |             }
459 |         )
460 |     elif child.attrib["class"] == "note":
461 |         output.append(
462 |             {
463 |                 "text": _get_text(child),
464 |                 "type": "text",
465 |                 "modifier": "note",
466 |                 "ref": ref,
467 |                 "context": new_context.copy(),
468 |             }
469 |         )
470 |     return output
471 | 
472 | 
473 | def _get_text(child: ETree.Element) -> str:
474 |     """Get text.
475 | 
476 |     Parameters
477 |     ----------
478 |     child : xml.etree.ElementTree.Element
479 |         XML tree.
480 | 
481 |     Returns
482 |     -------
483 |     str
484 |         Text.
485 | 
486 |     Examples
487 |     --------
488 |     >>> _get_text(ETree.fromstring('<p>Text</p>'))
489 |     'Text'
490 |     >>> _get_text(ETree.fromstring('<p><span>Text</span></p>'))
491 |     'Text'
492 |     """
493 |     if len(child) == 1:
494 |         return _get_text(child[0])
495 |     if child.text is not None:
496 |         return child.text.strip()
497 |     return ""
498 | 
499 | 
500 | def parse_span(child: ETree.Element, ref: list = None, context: dict = None) -> list:
501 |     """Parse a <span> or <p> tag.
502 | 
503 |     Parameters
504 |     ----------
505 |     child : xml.etree.ElementTree.Element
506 |         XML tree.
507 |     ref : list
508 |         References.
509 |     context : dict
510 |         Context.
511 | 
512 |     Returns
513 |     -------
514 |     list
515 |         Results.
516 | 
517 |     Examples
518 |     --------
519 |     >>> parse_span(ETree.fromstring('<p class="doc-ti">Text</p>'))
520 |     [{'text': 'Text', 'type': 'doc-title', 'ref': [], 'context': {'document': 'Text'}}]
521 |     >>> parse_span(ETree.fromstring('<p class="sti-art">Text</p>'))
522 |     [{'text': 'Text', 'type': 'art-subtitle', 'ref': [], 'context': {'article_subtitle': 'Text'}}]
523 |     >>> parse_span(ETree.fromstring('<p class="ti-art">Text</p>'))
524 |     [{'text': 'Text', 'type': 'art-title', 'ref': [], 'context': {'article': 'Text'}}]
525 |     >>> parse_span(ETree.fromstring('<p class="ti-grseq-1">Text</p>'))
526 |     [{'text': 'Text', 'type': 'group-title', 'ref': [], 'context': {}}]
527 |     >>> parse_span(ETree.fromstring('<p class="ti-grseq-1"><span class="bold">Text</span></p>'))
528 |     [{'text': 'Text', 'type': 'group-title', 'ref': [], 'context': {}}]
529 |     >>> parse_span(ETree.fromstring('<p class="ti-section-1">Text</p>'))
530 |     [{'text': 'Text', 'type': 'section-title', 'ref': [], 'context': {}}]
531 |     >>> parse_span(ETree.fromstring('<p class="normal">1. Text</p>'))
532 |     [{'text': 'Text', 'type': 'text', 'ref': [], 'context': {'paragraph': '1'}}]
533 |     >>> parse_span(ETree.fromstring('<p class="italic">Text</p>'))
534 |     [{'text': 'Text', 'type': 'text', 'modifier': 'italic', 'ref': [], 'context': {}}]
535 |     >>> parse_span(ETree.fromstring('<p>Text</p>'))
536 |     []
537 |     """
538 |     ref = [] if ref is None else ref
539 |     context = {} if context is None else context
540 |     output = []
541 |     if "class" not in child.attrib:
542 |         return output
543 |     if child.attrib["class"] == "doc-ti":
544 |         if "document" not in context:
545 |             context["document"] = ""
546 |         context["document"] += _get_text(child)
547 |         output.append(
548 |             {
549 |                 "text": _get_text(child),
550 |                 "type": "doc-title",
551 |                 "ref": ref,
552 |                 "context": context.copy(),
553 |             }
554 |         )
555 |     elif child.attrib["class"] == "sti-art":
556 |         context["article_subtitle"] = _get_text(child)
557 |         output.append(
558 |             {
559 |                 "text": _get_text(child),
560 |                 "type": "art-subtitle",
561 |                 "ref": ref,
562 |                 "context": context.copy(),
563 |             }
564 |         )
565 |     elif child.attrib["class"] == "ti-art":
566 |         context["article"] = _get_text(child).replace("Article", "").strip()
567 |         output.append(
568 |             {
569 |                 "text": _get_text(child),
570 |                 "type": "art-title",
571 |                 "ref": ref,
572 |                 "context": context.copy(),
573 |             }
574 |         )
575 |     elif child.attrib["class"].startswith("ti-grseq-"):
576 |         output.append(
577 |             {
578 |                 "text": _get_text(child),
579 |                 "type": "group-title",
580 |                 "ref": ref,
581 |                 "context": context.copy(),
582 |             }
583 |         )
584 |         context["group"] = _get_text(child)
585 |     elif child.attrib["class"].startswith("ti-section-"):
586 |         output.append(
587 |             {
588 |                 "text": _get_text(child),
589 |                 "type": "section-title",
590 |                 "ref": ref,
591 |                 "context": context.copy(),
592 |             }
593 |         )
594 |         context["section"] = _get_text(child)
595 |     elif child.attrib["class"] == "normal":
596 |         text = _get_text(child)
597 |         if re.match("^[0-9]+[.]", text):
598 |             context["paragraph"] = text.split(".")[0]
599 |             text = ".".join(text.split(".")[1:]).strip()
600 |         output.append(
601 |             {"text": text, "type": "text", "ref": ref, "context": context.copy()}
602 |         )
603 |     else:
604 |         output.extend(parse_modifiers(child, ref, context))
605 |     return output
606 | 
607 | 
608 | def parse_article(tree: ETree.Element, ref: list = None, context: dict = None) -> list:
609 |     """Parse an article.
610 | 
611 |     Parameters
612 |     ----------
613 |     tree : xml.etree.ElementTree.Element
614 |         XML tree.
615 |     ref : list
616 |         References.
617 |     context : dict
618 |         Context.
619 | 
620 |     Returns
621 |     -------
622 |     list
623 |         Results.
624 | 
625 |     Examples
626 |     --------
627 |     >>> parse_article(ETree.fromstring('<html><a>Link</a></html>'))
628 |     [{'text': 'Link', 'type': 'link', 'ref': [], 'context': {}}]
629 |     >>> parse_article(ETree.fromstring('<html><p class="doc-ti">Text</p></html>'))
630 |     [{'text': 'Text', 'type': 'doc-title', 'ref': [], 'context': {'document': 'Text'}}]
631 |     >>> parse_article(ETree.fromstring('<p><table><tbody><tr><td><p>1</p></td><td>2</td></tr></tbody></table></p>'))
632 |     []
633 |     >>> parse_article(ETree.fromstring('<html><div>Text</div></html>'))
634 |     []
635 |     >>> parse_article(ETree.fromstring('<html><head>Text</head></html>'))
636 |     []
637 |     >>> parse_article(ETree.fromstring('<html><body>Text</body></html>'))
638 |     []
639 |     """
640 |     namespaces = {"html": "http://www.w3.org/1999/xhtml"}
641 |     ref = [] if ref is None else ref
642 |     context = {} if context is None else context
643 |     output = []
644 |     new_context = context.copy()
645 |     for child in tree:
646 |         if get_tag_name(child.tag) in ["a"]:
647 |             output.append(
648 |                 {
649 |                     "text": _get_text(child),
650 |                     "type": "link",
651 |                     "ref": ref,
652 |                     "context": new_context.copy(),
653 |                 }
654 |             )
655 |         elif get_tag_name(child.tag) in ["p", "span"]:
656 |             output.extend(parse_span(child, ref, new_context))
657 |         elif get_tag_name(child.tag) == "table":
658 |             results = child.findall(
659 |                 "html:tbody/html:tr/html:td", namespaces=namespaces
660 |             ) + child.findall("tbody/tr/td", namespaces=namespaces)
661 |             if (
662 |                 len(results) == 2
663 |                 and len(results[0]) == 1
664 |                 and get_tag_name(results[0][0].tag) == "p"
665 |             ):
666 |                 key = None
667 |                 for subchild in results[0]:
668 |                     key = _get_text(subchild)
669 |                 output.extend(parse_article(results[1], ref + [key], new_context))
670 |             else:
671 |                 pass
672 |         elif get_tag_name(child.tag) == "div":
673 |             output.extend(parse_article(child, ref, new_context))
674 |         elif get_tag_name(child.tag) in ["head", "hr"]:
675 |             pass
676 |         elif get_tag_name(child.tag) == "body":
677 |             output.extend(parse_article(child, ref, context))
678 |     return output
679 | 
680 | 
681 | def parse_html(html: str) -> pd.DataFrame:
682 |     """Parse EUR-Lex HTML into a DataFrame.
683 | 
684 |     Parameters
685 |     ----------
686 |     html : str
687 |         The HTML to parse.
688 | 
689 |     Returns
690 |     -------
691 |     pd.DataFrame
692 |         The parsed DataFrame
693 | 
694 |     Examples
695 |     --------
696 |     >>> parse_html('<html><body><p class="normal">Text</p></body></html>').to_dict(orient='records')
697 |     [{'text': 'Text', 'type': 'text', 'ref': [], 'context': {}}]
698 |     >>> parse_html('<html><p class="doc-ti">Text</p></html>').to_dict(orient='records')
699 |     []
700 |     >>> parse_html('<html').to_dict(orient='records')
701 |     []
702 |     >>> parse_html('<html><p class="doc-ti">ANNEX</p><p class="ti-grseq-1"><span>Group</span></p><p class="normal">Text</p></html>').to_dict(orient='records')
703 |     [{'text': 'Text', 'type': 'text', 'ref': [], 'context': {'document': 'ANNEX', 'group': 'Group'}, 'document': 'ANNEX', 'group': 'Group'}]
704 |     """
705 |     try:
706 |         tree = ETree.fromstring(html)
707 |     except ETree.ParseError:
708 |         return pd.DataFrame()
709 |     records = []
710 |     for item in parse_article(tree):
711 |         for key, value in item["context"].items():
712 |             item[key] = value
713 |         records.append(item)
714 |     df = pd.DataFrame.from_records(records)
715 |     df = df[df.type == "text"] if "type" in df.columns else df
716 |     return df
717 | 
718 | 
719 | def get_regulations(limit: int = -1, shuffle: bool = False) -> list:
720 |     """Retrieve regulations from EUR-Lex.
721 | 
722 |     Parameters
723 |     ----------
724 |     limit : int
725 |         The maximum number of regulations to retrieve (default: no limit).
726 |     shuffle : bool
727 |         Whether to shuffle the retrieved regulations (default: False).
728 | 
729 |     Returns
730 |     -------
731 |     list
732 |         A list of CELLAR IDs.
733 |     """
734 |     query = "select ?doc where {?doc cdm:work_has_resource-type <http://publications.europa.eu/"  # pragma: no cover
735 |     query += (
736 |         "resource/authority/resource-type/REG_IMPL> . }"
737 |         + (" order by rand()" if shuffle else "")
738 |         + (" limit " + str(limit) if limit > 0 else "")
739 |     )  # pragma: no cover
740 |     results = run_query(prepend_prefixes(query))  # pragma: no cover
741 |     cellar_ids = []  # pragma: no cover
742 |     for result in results["results"]["bindings"]:  # pragma: no cover
743 |         cellar_ids.append(result["doc"]["value"].split("/")[-1])  # pragma: no cover
744 |     return cellar_ids  # pragma: no cover
745 | 
746 | 
747 | def get_documents(types: List[str] = ["REG"], limit: int = -1) -> List[Dict[str, str]]:
748 |     """Retrieve a list of of documents of specified types from EUR-Lex that have a CELEX-number, as a list of dicts.
749 | 
750 |     Parameters
751 |     ----------
752 |     types : List[str]
753 |         The by the SparQL-API recognized type of documents to return
754 |         Examples: ["DIR", "DIR_IMPL", "DIR_DEL", "REG", "REG_IMPL", "REG_FINANC", "REG_DEL"]
755 |     limit : int
756 |         The maximum number of regulations to retrieve. (default: no limit).
757 | 
758 |     Returns
759 |     -------
760 |     List[dict]
761 |         A list of dicts, containing publication date, publication url, celex number and type of document.
762 |     """
763 |     query = "select distinct ?doc ?type ?celex ?date\n"
764 |     query += "where{ ?doc cdm:work_has_resource-type ?type.\n"
765 |     query += "  FILTER(\n    "
766 |     query += " ||\n    ".join(
767 |         map(
768 |             lambda type: f"?type=<http://publications.europa.eu/resource/authority/resource-type/{type}>",
769 |             types,
770 |         )
771 |     )
772 |     query += "\n  )\n"
773 |     query += "  FILTER(BOUND(?celex))\n"
774 |     query += "  OPTIONAL{?doc cdm:resource_legal_id_celex ?celex.}\n"
775 |     query += "  OPTIONAL{?doc cdm:work_date_document ?date.}\n"
776 |     query += "}\n"
777 |     if limit > 0:
778 |         query += "limit " + str(limit)
779 | 
780 |     results = []
781 |     query_results = run_query(prepend_prefixes(query))
782 | 
783 |     for result in query_results["results"]["bindings"]:
784 |         results.append(
785 |             {
786 |                 "celex": result["celex"]["value"],
787 |                 "date": result["date"]["value"],
788 |                 "link": result["doc"]["value"],
789 |                 "type": result["type"]["value"].split("/")[-1],
790 |             }
791 |         )
792 | 
793 |     return results
794 | 
795 | 
796 | def process_paragraphs(paragraphs: list) -> pd.DataFrame:
797 |     """Process the paragraphs.
798 | 
799 |     Parameters
800 |     ----------
801 |     paragraphs : list
802 |         The list of currently downloaded paragraphs.
803 | 
804 |     Returns
805 |     -------
806 |     pd.DataFrame
807 |         A DataFrame containing the processed paragraphs.
808 | 
809 |     Examples
810 |     --------
811 |     >>> process_paragraphs([]).to_dict(orient='records')
812 |     []
813 |     >>> process_paragraphs([{'celex_id': '1', 'paragraph': 'Done at 2021-11-25.'}]).to_dict(orient='records')
814 |     []
815 |     """
816 |     df_paragraphs = pd.DataFrame.from_records(paragraphs)
817 |     if "paragraph" not in df_paragraphs.columns:
818 |         return df_paragraphs
819 |     df_paragraphs = (
820 |         df_paragraphs[~df_paragraphs.paragraph.str.startswith("Done at")]
821 |         if len(df_paragraphs)
822 |         else df_paragraphs
823 |     )
824 |     df_paragraphs = (
825 |         df_paragraphs[~df_paragraphs.paragraph.str.startswith("It shall apply from")]
826 |         if len(df_paragraphs)
827 |         else df_paragraphs
828 |     )
829 |     df_paragraphs = (
830 |         df_paragraphs[~df_paragraphs.paragraph.str.contains("is replaced by")]
831 |         if len(df_paragraphs)
832 |         else df_paragraphs
833 |     )
834 |     df_paragraphs = (
835 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("is updated.")]
836 |         if len(df_paragraphs)
837 |         else df_paragraphs
838 |     )
839 |     df_paragraphs = (
840 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("is deleted.")]
841 |         if len(df_paragraphs)
842 |         else df_paragraphs
843 |     )
844 |     df_paragraphs = (
845 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("is removed.")]
846 |         if len(df_paragraphs)
847 |         else df_paragraphs
848 |     )
849 |     df_paragraphs = (
850 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("is hereby repealed.")]
851 |         if len(df_paragraphs)
852 |         else df_paragraphs
853 |     )
854 |     df_paragraphs = (
855 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("are updated.")]
856 |         if len(df_paragraphs)
857 |         else df_paragraphs
858 |     )
859 |     df_paragraphs = (
860 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("are deleted.")]
861 |         if len(df_paragraphs)
862 |         else df_paragraphs
863 |     )
864 |     df_paragraphs = (
865 |         df_paragraphs[~df_paragraphs.paragraph.str.endswith("are removed.")]
866 |         if len(df_paragraphs)
867 |         else df_paragraphs
868 |     )
869 |     df_paragraphs = (
870 |         df_paragraphs[~df_paragraphs.paragraph.str.contains("is amended ")]
871 |         if len(df_paragraphs)
872 |         else df_paragraphs
873 |     )
874 |     df_paragraphs = (
875 |         df_paragraphs[~df_paragraphs.paragraph.str.contains("is repealed with")]
876 |         if len(df_paragraphs)
877 |         else df_paragraphs
878 |     )
879 |     df_paragraphs = (
880 |         df_paragraphs[df_paragraphs.paragraph.str.endswith(".")]
881 |         if len(df_paragraphs)
882 |         else df_paragraphs
883 |     )
884 |     df_paragraphs = (
885 |         df_paragraphs[
886 |             df_paragraphs.paragraph.apply(lambda text: text[0].upper() == text[0])
887 |         ]
888 |         if len(df_paragraphs)
889 |         else df_paragraphs
890 |     )
891 |     df_paragraphs = (
892 |         df_paragraphs[~df_paragraphs.paragraph.str.contains("‘")]
893 |         if len(df_paragraphs)
894 |         else df_paragraphs
895 |     )
896 |     df_paragraphs = (
897 |         df_paragraphs[~df_paragraphs.paragraph.str.contains("’")]
898 |         if len(df_paragraphs)
899 |         else df_paragraphs
900 |     )
901 |     df_paragraphs = (
902 |         df_paragraphs[df_paragraphs.paragraph.apply(len) >= 100]
903 |         if len(df_paragraphs)
904 |         else df_paragraphs
905 |     )
906 |     df_paragraphs = (
907 |         df_paragraphs.drop_duplicates("paragraph")
908 |         if len(df_paragraphs)
909 |         else df_paragraphs
910 |     )
911 |     return df_paragraphs
912 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | argparse==1.4.0
2 | sparqlwrapper==1.8.5
3 | pandas==1.2.4
4 | rdflib==6.0.2
5 | requests==2.25.1
6 | bs4==0.0.1
7 | beautifulsoup4==4.9.3


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | from pathlib import Path
 3 | 
 4 | this_directory = Path(__file__).parent
 5 | long_description = (this_directory / "README.md").read_text()
 6 | 
 7 | setuptools.setup(
 8 |     name="eurlex",
 9 |     version="0.1.4",
10 |     author="K.M.J. Jacobs",
11 |     author_email="kmj.jacobs@maastrichtuniversity.nl",
12 |     description="An EUR-Lex parser for Python.",
13 |     long_description=long_description,
14 |     long_description_content_type="text/markdown",
15 |     url="https://github.com/kevin91nl/eurlex",
16 |     packages=setuptools.find_packages(),
17 |     include_package_data=True,
18 |     classifiers=[],
19 |     python_requires=">=3.6",
20 |     # Load the requirements.txt
21 |     install_requires=[
22 |         line.strip()
23 |         for line in (this_directory / "requirements.txt").read_text().split("\n")
24 |         if line.strip()
25 |     ],
26 | )
27 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | def _merge_dicts(a, b, path=None):
 2 |     if path is None:
 3 |         path = []
 4 |     for key in b:
 5 |         if key in a:
 6 |             if isinstance(a[key], dict) and isinstance(b[key], dict):
 7 |                 _merge_dicts(a[key], b[key], path + [str(key)])
 8 |             elif a[key] == b[key]:
 9 |                 pass
10 |             else:
11 |                 if a[key] is None:
12 |                     a[key] = b[key]
13 |                 if b[key] is None:
14 |                     b[key] = a[key]
15 |         else:
16 |             a[key] = b[key]
17 |     return a
18 | 
19 | 
20 | def _convert_outline_item(outline_item):
21 |     """Convert an outline item.
22 | 
23 |     The following input:
24 |     ```
25 |     ["1", "a", "i."]
26 |     ```
27 | 
28 |     Is converted into the following output:
29 |     ```
30 |     {
31 |         "1": {
32 |             "a": {
33 |                 "i.": None
34 |             }
35 |         }
36 |     }
37 |     ```
38 |     """
39 |     if len(outline_item) == 1:
40 |         return {outline_item[0]: None}
41 |     else:
42 |         node, remainder = outline_item[0], outline_item[1:]
43 |         return {node: _convert_outline_item(remainder)}
44 | 
45 | 
46 | def _convert_outline(outline_as_tuples):
47 |     """Convert an outline as tuples into a tree format.
48 | 
49 |     Given the following input:
50 | 
51 |     ```
52 |     [
53 |         ["1", "a", "i."],
54 |         ["1", "a", "ii."],
55 |         ["1", "b", "i."],
56 |         ["2"],
57 |         ["3", "a"]
58 |     ]
59 |     ```
60 | 
61 |     Generate the following output (all leaves are encoded as None):
62 | 
63 |     ```
64 |     {"1": {"a": {"i.": None, "ii.": None}, "b": {"i.": None}}, "2": None, "3": None}
65 |     ```
66 |     """
67 |     tree = dict()
68 |     for item in outline_as_tuples:
69 |         tree = _merge_dicts(tree, _convert_outline_item(item))
70 |     return tree
71 | 


--------------------------------------------------------------------------------
/tests/test_parsing.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from eurlex import get_html_by_celex_id, parse_html
  3 | import pandas as pd
  4 | 
  5 | from tests import _convert_outline
  6 | 
  7 | 
  8 | @pytest.mark.parametrize(
  9 |     "celex_id,header,ref,expected_text_or_text_count",
 10 |     [
 11 |         [
 12 |             "32019R0945",
 13 |             "Requirements for a class C0 Unmanned aircraft system",
 14 |             ["(8)", "(a)"],
 15 |             6,
 16 |         ],
 17 |         [
 18 |             "32019R0945",
 19 |             "Requirements for a class C0 Unmanned aircraft system",
 20 |             ["(1)"],
 21 |             "have an MTOM of less than 250 g, including payload;",
 22 |         ],
 23 |         [
 24 |             "32019R0945",
 25 |             "Definitions",
 26 |             ["(1)"],
 27 |             "‘unmanned aircraft’ (‘UA’) means any aircraft operating or designed to operate autonomously or to be piloted remotely without a pilot on board;",
 28 |         ],
 29 |         [
 30 |             "32019R0947",
 31 |             "UAS.OPEN.060 Responsibilities of the remote pilot",
 32 |             ["(2)", "(f)"],
 33 |             "comply with the operator's procedures when available.",
 34 |         ],
 35 |     ],
 36 | )
 37 | def test_paragraph_content(celex_id, header, ref, expected_text_or_text_count):
 38 |     html = get_html_by_celex_id(celex_id)
 39 |     df = parse_html(html)
 40 |     expected_text = None
 41 |     expected_text_count = 1
 42 |     if type(expected_text_or_text_count) == int:
 43 |         expected_text_count = expected_text_or_text_count
 44 |     if type(expected_text_or_text_count) == str:
 45 |         expected_text = expected_text_or_text_count
 46 |     assert df.shape[0] > 0, "No rows found for CELEX ID {}".format(celex_id)
 47 |     if "article_subtitle" not in df:
 48 |         df["article_subtitle"] = ""
 49 |     df = df[(df.group == header) | (df.article_subtitle == header)]
 50 |     assert df.shape[0] > 0, "No rows found for header {}".format(header)
 51 |     df = df[df.ref.apply("".join).str.startswith("".join(ref))]
 52 |     assert df.shape[0] > 0, "No rows found for reference {}".format(ref)
 53 |     assert df.shape[0] == expected_text_count, "Expected {} texts, but found {}".format(
 54 |         expected_text_count, df.shape[0]
 55 |     )
 56 |     assert (
 57 |         expected_text is None or df.text.values[0] == expected_text
 58 |     ), "Text is not as expected"
 59 | 
 60 | 
 61 | @pytest.mark.parametrize(
 62 |     "celex_id,header,expected_outline",
 63 |     [
 64 |         (
 65 |             "32019R0947",
 66 |             "UAS.SPEC.020 Operational declaration",
 67 |             {
 68 |                 "(1)": {
 69 |                     "(a)": {"i.": None, "ii.": None, "iii.": None, "iv.": None},
 70 |                     "(b)": {"i.": None, "ii.": None},
 71 |                 },
 72 |                 "(2)": {"(a)": None, "(b)": None, "(c)": None, "(d)": None},
 73 |                 "(3)": None,
 74 |                 "(4)": None,
 75 |                 "(5)": None,
 76 |                 "(6)": None,
 77 |             },
 78 |         )
 79 |     ],
 80 | )
 81 | def test_outline(celex_id, header, expected_outline):
 82 |     html = get_html_by_celex_id(celex_id)
 83 |     df = parse_html(html)
 84 |     df = df[(df.group == header) | (df.article_subtitle == header)]
 85 |     assert (
 86 |         _convert_outline(df.ref.tolist()) == expected_outline
 87 |     ), "Outline is not as expected"
 88 | 
 89 | 
 90 | @pytest.mark.parametrize(
 91 |     "celex_id,expected_articles", [("32015R0220", 16), ("32019R0947", 23)]
 92 | )
 93 | def test_article_count(celex_id, expected_articles):
 94 |     html = get_html_by_celex_id(celex_id)
 95 |     df = parse_html(html)
 96 |     num_unique_articles = df[~pd.isna(df.article)].article.unique().shape[0]
 97 |     assert (
 98 |         num_unique_articles == expected_articles
 99 |     ), f"Wrong number of articles (found: {num_unique_articles}, expected: {expected_articles})"
100 | 


--------------------------------------------------------------------------------