├── requirements_dev.txt
├── .dockerignore
├── docker
    ├── crontab
    ├── ssh
    │   └── config
    └── cron.sh
├── deploy.sh
├── requirements.txt
├── docker-compose.yml
├── Dockerfile
├── scrape.sh
├── LICENSE.txt
├── README.md
└── scrape.py


/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | lxml
4 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .github
2 | venv
3 | .git
4 | data_branch
5 | .idea
6 | 


--------------------------------------------------------------------------------
/docker/crontab:
--------------------------------------------------------------------------------
1 | 00 04 * * * root /scrape.sh >> /var/log/cron.log 2>&1
2 | 


--------------------------------------------------------------------------------
/docker/ssh/config:
--------------------------------------------------------------------------------
1 | Host *
2 |   AddKeysToAgent yes
3 |   IdentityFile /root/.ssh/key
4 |   StrictHostKeyChecking no
5 | 


--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | docker build . --tag scraper
4 | docker stack deploy gii_scraper -c docker-compose.yml
5 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.0
2 | certifi==2020.4.5.1
3 | chardet==3.0.4
4 | idna==2.9
5 | lxml==4.6.2
6 | requests==2.23.0
7 | soupsieve==2.0
8 | urllib3==1.25.9
9 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.9"
 2 | 
 3 | services:
 4 |   scraper:
 5 |     image: scraper
 6 |     deploy:
 7 |       resources:
 8 |         limits:
 9 |           cpus: '0.50'
10 |           memory: 512M
11 |       restart_policy:
12 |         condition: on-failure
13 |     secrets:
14 |       - gii_scraper_github_key
15 | 
16 | secrets:
17 |   gii_scraper_github_key:
18 |     external: true
19 | 


--------------------------------------------------------------------------------
/docker/cron.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Container entrypoint: install the deploy key, hand the environment to cron, then run cron.
 3 | cp /run/secrets/gii_scraper_github_key /root/.ssh/key
 4 | printf '\n' >> /root/.ssh/key  # append a newline to the key; printf is portable, echo '\n' is not
 5 | chmod 600 /root/.ssh/key
 6 | ssh-agent sh -c 'ssh-add /root/.ssh/key'
 7 | # NOTE: that agent exits right after ssh-add; later SSH use relies on IdentityFile in /root/.ssh/config.
 8 | # Prepend the container environment to the cron file so the scheduled job sees the same variables.
 9 | printenv | cat - /etc/cron.d/cron-jobs > ~/crontab.tmp \
10 |     && mv ~/crontab.tmp /etc/cron.d/cron-jobs
11 | 
12 | chmod 644 /etc/cron.d/cron-jobs
13 | # Stream the job log to the container's stdout in the background.
14 | tail -f /var/log/cron.log &
15 | # Keep cron in the foreground as the container's main process.
16 | cron -f
17 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.7-bullseye
 2 | # cron schedules the scrape; remove the apt lists afterwards so they do not stay in the layer.
 3 | RUN apt-get update && apt-get -y install cron && rm -rf /var/lib/apt/lists/*
 4 | 
 5 | COPY requirements.txt requirements.txt
 6 | RUN pip install --no-cache-dir -r requirements.txt
 7 | 
 8 | COPY docker/ssh /root/.ssh/
 9 | COPY scrape.py scrape.py
10 | 
11 | COPY scrape.sh /scrape.sh
12 | RUN chmod +x /scrape.sh
13 | 
14 | COPY docker/cron.sh /usr/bin/cron.sh
15 | RUN chmod +x /usr/bin/cron.sh
16 | 
17 | COPY docker/crontab /etc/cron.d/cron-jobs
18 | RUN chmod 0644 /etc/cron.d/cron-jobs
19 | 
20 | RUN touch /var/log/cron.log
21 | 
22 | ENTRYPOINT ["/bin/sh", "/usr/bin/cron.sh"]
23 | #ENTRYPOINT ["/bin/sh", "/scrape.sh"]
24 | 


--------------------------------------------------------------------------------
/scrape.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | set -e
 3 | # Clone the data branch, run the scraper into it, then commit, tag and push the result.
 4 | rm -rf /data_branch
 5 | 
 6 | git clone \
 7 |   --branch data \
 8 |   --single-branch \
 9 |   --depth 1 \
10 |   -q \
11 |   git@github.com:QuantLaw/gesetze-im-internet.git \
12 |   /data_branch
13 | 
14 | SCRAPE_DATETIME=$(date +'%Y-%m-%dT%T')
15 | SCRAPE_DATE=${SCRAPE_DATETIME%%T*}  # derive the date from the same timestamp so both always agree
16 | 
17 | git config --global user.email "scraper@github.com"
18 | git config --global user.name "Scraper"
19 | 
20 | cd /
21 | python scrape.py /data_branch "$SCRAPE_DATETIME"
22 | 
23 | cd /data_branch
24 | git add .
25 | git commit -m "scrape $SCRAPE_DATETIME" --date "$SCRAPE_DATETIME"
26 | git tag -f "$SCRAPE_DATE"  # -f: a rerun on the same day replaces the tag, like the forced push below
27 | git push
28 | git push git@github.com:QuantLaw/gesetze-im-internet.git "$SCRAPE_DATE" -f
29 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2020, Janis Beckedorf, Corinna Coupette, Dirk Hartung
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of quantlaw nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Gesetze im Internet
 2 | 
 3 | Es werden die auf https://www.gesetze-im-internet.de veröffentlichten Gesetze, Rechtsverordnungen, etc. täglich archiviert. 
 4 | Das Archiv beschränkt sich auf XML-Dateien nebst Anhängen. (Die inhaltsgleichen PDF-, EPUB- und HTML-Dateien werden nicht gesichert.)
 5 | 
 6 | Die Daten sind im [Branch 'Data' dieses Repositories](https://github.com/QuantLaw/gesetze-im-internet/tree/data) 
 7 | abrufbar.
 8 | 
 9 | Unter [Releases](https://github.com/QuantLaw/gesetze-im-internet/releases) 
10 | kann der Stand zu einem Tag ausgewählt, eingesehen und separat heruntergeladen werden.
11 | 
12 | 
13 | ## Nutzung
14 | 
15 | Dieses Archiv enthält die jeweils aktuellen Gesetze seit dem 10. Juni 2019 in strukturiertem Format. 
16 | Dieser historische Datensatz eignet sich insbesondere für die maschinelle Weiterverarbeitung 
17 | und kann beispielsweise für quantitative Analysen des Rechts genutzt werden. 
18 | Daher wird auf eine Weiterverarbeitung der archivierten Daten an dieser Stelle verzichtet.
19 | 
20 | 
21 | ## Hintergrundinformationen
22 | 
23 | Das [Log](https://github.com/QuantLaw/gesetze-im-internet/blob/data/data/log.md) 
24 | enthält eine Liste aller archivierten Versionen.
25 | Ebenfalls können die Commit-Messages im Branch 'Data' genutzt werden.
26 | 
27 | Ab Mai 2020 geschieht die Archivierung grundsätzlich täglich.
28 | Das Archiv reicht bis zum 10. Juni 2019 zurück. 
29 | Für diesen Zeitraum stehen wöchentliche Versionen bereit.
30 | Die Archivierung geschieht transparent mittels Docker. 
31 | Das genutzte Skript ist in diesem Repository im Master-Branch enthalten.
32 | 
33 | ### Archivierungsprozess
34 | 
35 | Die Archivierung basiert auf dem Inhaltsverzeichnis von Gesetze im Internet, das als XML-Datei bereitgestellt wird. 
36 | (Siehe https://www.gesetze-im-internet.de/hinweise.html für nähere Informationen.)
37 | Es werden alle genannten Gesetze heruntergeladen und entpackt. 
38 | Sofern sich ihr Inhalt geändert hat, wird die neue Version zum Repository hinzugefügt.
39 | 
40 | In seltenen Fällen ist eine im Inhaltsverzeichnis aufgeführte Datei auf dem Server nicht verfügbar. 
41 | Eine solche Datei wird ausgelassen und unter `data/not_found.txt` im jeweiligen Commit dokumentiert. 
42 | Typischerweise ist die Datei leer, da dieser Fehler bei der betreffenden Archivierung nicht aufgetreten ist.
43 | 
44 | Finden die Betreiber von gesetze-im-internet.de Fehler in den Daten (beispielsweise einen Tippfehler), 
45 | werden diese auf der Webseite nachträglich korrigiert.
46 | Entsprechend wird bei der nächsten Archivierung die Fehlerkorrektur als neue Gesetzesversion in das Archiv übernommen.
47 | Im Archiv wird der Fehler jedoch nicht in bereits archivierten Versionen nachträglich korrigiert. 
48 | Daher kann von einer neuen Dateiversion nicht zwingend auf eine Änderung der Rechtslage geschlossen werden,
49 | ohne die Änderung inhaltlich zu untersuchen.
50 | Neben einer Fehlerkorrektur wird eine neue Version häufig durch eine Aktualisierung des `builddate` 
51 | (ein Attribut in der XML-Datei) verursacht.
52 | 


--------------------------------------------------------------------------------
/scrape.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import shutil
  4 | from multiprocessing.pool import Pool
  5 | from zipfile import ZipFile, BadZipFile
  6 | import time
  7 | 
  8 | import requests
  9 | from bs4 import BeautifulSoup
 10 | from requests.adapters import HTTPAdapter
 11 | from requests.packages.urllib3.util.retry import Retry
 12 | 
 13 | 
 14 | def requests_retry_session(
 15 |     retries=5, backoff_factor=10, status_forcelist=(500, 502, 504), session=None,
 16 | ):
 17 |     session = session or requests.Session()
 18 |     retry = Retry(
 19 |         total=retries,
 20 |         read=retries,
 21 |         connect=retries,
 22 |         backoff_factor=backoff_factor,
 23 |         status_forcelist=status_forcelist,
 24 |     )
 25 |     adapter = HTTPAdapter(max_retries=retry)
 26 |     session.mount("http://", adapter)
 27 |     session.mount("https://", adapter)
 28 |     return session
 29 | 
 30 | 
 31 | def ensure_exists(path):
 32 |     if not os.path.exists(path):
 33 |         os.makedirs(path)
 34 |     return path
 35 | 
 36 | 
 37 | def handle_links(link, TEMP_PATH, ITEMS_PATH):
 38 |     time.sleep(0.25)
 39 |     error = None
 40 |     # print("Loading", link)
 41 | 
 42 |     link_parts = link.split("/")
 43 |     assert link_parts[-1] == "xml.zip"
 44 |     item_id = link_parts[-2]
 45 | 
 46 |     r = requests_retry_session().get(link)
 47 |     zip_path = TEMP_PATH + item_id + ".zip"
 48 |     with open(zip_path, "wb") as f:
 49 |         f.write(r.content)
 50 |     try:
 51 |         with ZipFile(zip_path) as f:
 52 |             f.extractall(ITEMS_PATH + item_id)
 53 |     except BadZipFile:
 54 |         with open(zip_path, "rb") as f:
 55 |             contents = f.read()
 56 |         if b"<title>404 Not Found</title>" in contents:
 57 |             error = item_id
 58 |         else:
 59 |             raise
 60 |     os.remove(zip_path)
 61 |     return error
 62 | 
 63 | 
 64 | def scrape(TEMP_PATH, ITEMS_PATH, TOC_PATH, NOT_FOUND_PATH):
 65 |     toc = requests_retry_session().get("https://www.gesetze-im-internet.de/gii-toc.xml")
 66 |     with open(TOC_PATH, "wb") as f:
 67 |         f.write(toc.content)
 68 | 
 69 |     soup = BeautifulSoup(toc.text, "lxml-xml")
 70 | 
 71 |     links = [item.link.get_text() for item in list(soup.find_all("item"))]
 72 | 
 73 |     with Pool(2) as p:
 74 |         errors = p.starmap(
 75 |             handle_links,
 76 |             [(l, TEMP_PATH, ITEMS_PATH) for l in links]
 77 |         )
 78 |     errors = [e for e in errors if e is not None]
 79 | 
 80 |     with open(NOT_FOUND_PATH, "w") as f:
 81 |         for e in errors:
 82 |             f.write(e + "\n")
 83 | 
 84 |     shutil.rmtree(TEMP_PATH)
 85 | 
 86 | 
 87 | if __name__ == "__main__":
 88 |     parser = argparse.ArgumentParser()
 89 |     parser.add_argument("data_repo_path", type=str)
 90 |     parser.add_argument("datetime", type=str)
 91 |     args = parser.parse_args()
 92 | 
 93 |     BASE_PATH = os.path.join(args.data_repo_path, "data/")
 94 |     LOG_PATH = os.path.join(BASE_PATH, "log.md")
 95 |     TOC_PATH = os.path.join(BASE_PATH, "toc.xml")
 96 |     NOT_FOUND_PATH = os.path.join(BASE_PATH, "not_found.txt")
 97 |     ITEMS_PATH = os.path.join(BASE_PATH, "items/")
 98 |     TEMP_PATH = os.path.join(BASE_PATH, "temp/")
 99 | 
100 |     if os.path.exists(TOC_PATH):
101 |         os.remove(TOC_PATH)
102 |     if os.path.exists(NOT_FOUND_PATH):
103 |         os.remove(NOT_FOUND_PATH)
104 |     shutil.rmtree(ITEMS_PATH, ignore_errors=True)
105 |     shutil.rmtree(TEMP_PATH, ignore_errors=True)
106 | 
107 |     ensure_exists(BASE_PATH)
108 |     ensure_exists(ITEMS_PATH)
109 |     ensure_exists(TEMP_PATH)
110 |     scrape(TEMP_PATH, ITEMS_PATH, TOC_PATH, NOT_FOUND_PATH)
111 | 
112 |     with open(LOG_PATH, "a+") as file:
113 |         file.writelines(f"- {args.datetime}\n")
114 |     print("DONE", args.datetime)
115 | 


--------------------------------------------------------------------------------