├── scrapers
│   ├── __init__.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── test_district_data.py
│   │   ├── test_test_data.py
│   │   └── test_dates.py
│   ├── .gitignore
│   ├── scrape_nw_common.py
│   ├── scrape_gl_common.py
│   ├── scrape_ag_common.py
│   ├── scrape_vd_common.py
│   ├── scrape_fl_tests.py
│   ├── scrape_zh_tests.py
│   ├── scrape_be_tests.py
│   ├── scrape_fr_common.py
│   ├── validate_scraper_output.sh
│   ├── scrape_so_common.py
│   ├── db_common.py
│   ├── scrape_fr_tests.py
│   ├── scrape_nw_tests.py
│   ├── scrape_zh.py
│   ├── scrape_tg_districts.py
│   ├── scrape_ti_tests.py
│   ├── scrape_zg_tests.py
│   ├── scrape_sh_tests.py
│   ├── scrape_gl_tests.py
│   ├── scrape_nw.py
│   ├── run_district_scraper.sh
│   ├── scrape_vs_common.py
│   ├── run_tests_scraper.sh
│   ├── meta_scrape.sh
│   ├── scrape_sg_tests.py
│   ├── scrape_vs_tests.py
│   ├── test_tests_scraper.sh
│   ├── scrape_tg.py
│   ├── run_scraper.sh
│   ├── test_district_scraper.sh
│   ├── scrape_ne.py
│   ├── scrape_ag_tests.py
│   ├── scrape_ju_tests.py
│   ├── scrape_ai.py
│   ├── scrape_bl_common.py
│   ├── scrape_fr.py
│   ├── scrape_be_districts.py
│   ├── scrape_ge_tests.py
│   ├── test_scraper.sh
│   ├── scrape_lu.py
│   ├── scrape_sg_districts.py
│   ├── scrape_bs.py
│   ├── download.sh
│   ├── scrape_tests.py
│   ├── validate_scrapers.py
│   ├── scrape_ge_common.py
│   ├── populate_district_database.py
│   ├── scrape_so_districts.py
│   ├── scrape_sh.py
│   ├── scrape_bs_tests.py
│   ├── scrape_be.py
│   ├── scrape_sz_districts.py
│   ├── populate_database.py
│   ├── scrape_gr_districts.py
│   ├── populate_tests_database.py
│   ├── scrape_tg_tests.py
│   ├── scrape_ti.py
│   ├── scrape_sz.py
│   ├── certificate.pem
│   ├── scrape_ag.py
│   ├── scrape_vd_tests.py
│   ├── scrape_so.py
│   ├── scrape_fl.py
│   ├── scrape_ow.py
│   ├── scrape_vs.py
│   ├── scrape_sh_common.py
│   ├── scrape_fr_districts.py
│   ├── scrape_bl_tests.py
│   ├── scrape_ur.py
│   ├── scrape_gl.py
│   ├── scrape_so_tests.py
│   ├── add_district_db_entry.py
│   ├── convert_parsed_to_csv.py
│   ├── scrape_gr.py
│   ├── scrape_bl_districts.py
│   ├── scrape_ag_districts.py
│   ├── scrape_vs_districts.py
│   └── scrape_vd.py
├── gd.png
├── logos.png
├── requirements-ocr.txt
├── dashboard
│   └── dashboard.png
├── binder
│   └── environment.yml
├── statistisches_amt_kt_zh.png
├── .gitignore
├── requirements.txt
├── setup.py
├── fallzahlen_bezirke
│   ├── Readme.md
│   └── fallzahlen_kanton_AG_bezirk.csv
├── fallzahlen_plz
│   └── Readme.md
├── fallzahlen_kanton_total_csv_v2
│   └── README.md
├── COVID19_Fallzahlen_Kanton_ZH_isolated_quarantined.csv
├── COVID19_Fallzahlen_Kanton_ZH_Beispiel_alter_geschlecht_.csv
├── fallzahlen_kanton_alter_geschlecht_csv
│   ├── COVID19_Fallzahlen_Kanton_AI_alter_geschlecht.csv
│   ├── COVID19_Fallzahlen_Kanton_alter_geschlecht_BEISPIEL.csv
│   ├── Readme.md
│   ├── COVID19_Fallzahlen_Kanton_AR_alter_geschlecht.csv
│   ├── COVID19_Fallzahlen_Kanton_AG_alter_geschlecht.csv
│   └── COVID19_Einwohner_Kanton_ZH_altersklassen_geschlecht.csv
├── fallzahlen_kanton_zh
│   ├── README.md
│   └── COVID19_VOC_Kanton_ZH.csv
├── scripts
│   ├── latest_total.sh
│   ├── transform_all_new2old.sh
│   ├── transform_all_add_columns.sh
│   ├── transform_all_old2new.sh
│   ├── check_for_empty_lines.sh
│   ├── merge_canton_csvs.rb
│   ├── update_dates_in_readme.sh
│   ├── validate-schema.js
│   ├── latest_per_canton.sh
│   ├── new2oldcsv.py
│   ├── old2newcsv.py
│   ├── add_new_columns.py
│   ├── remove_older_entries.py
│   ├── check_for_outliers.py
│   └── validate-csv.js
├── mappingCanton_BFS.csv
├── package.json
├── COVID19_Fallzahlen_Beispiel.csv
├── CONTRIBUTING.md
├── fallzahlen_kanton_total_csv
│   └── README.md
├── .github
│   └── workflows
│       ├── rebase.yml
│       ├── lint_python.yml
│       ├── test_scraper.yml
│       ├── activate_scraper.yml
│       ├── deactivate_scraper.yml
│       ├── test_tests_scraper.yml
│       ├── test_district_scraper.yml
│       ├── validate-csv.yml
│       ├── run_district_scrapers.yml
│       └── run_tests_scraper.yml
├── correction_status.csv
└── fallzahlen_tests
    └── fallzahlen_kanton_JU_tests.csv
/scrapers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapers/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapers/.gitignore:
--------------------------------------------------------------------------------
1 | webarchiveorg.log
2 | __pycache__
3 |
--------------------------------------------------------------------------------
/gd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/gd.png
--------------------------------------------------------------------------------
/logos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/logos.png
--------------------------------------------------------------------------------
/requirements-ocr.txt:
--------------------------------------------------------------------------------
1 | opencv-python==4.4.0.44
2 | numpy
3 | pytesseract
4 |
--------------------------------------------------------------------------------
/dashboard/dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/dashboard/dashboard.png
--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | dependencies:
4 | - matplotlib
5 | - pandas
6 |
--------------------------------------------------------------------------------
/statistisches_amt_kt_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/statistisches_amt_kt_zh.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | node_modules
3 | scrapers/data.sqlite
4 | *.pyc
5 | boxplot.png
6 | geckodriver.log
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | requests
3 | dateparser
4 | xlrd==1.2.0
5 | pytest
6 | pandas
7 | selenium
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(name="scrapers", packages=find_packages())
4 |
--------------------------------------------------------------------------------
/fallzahlen_bezirke/Readme.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 |
3 | See https://github.com/openZH/covid_19/tree/master#canton-zurich-districts-bezirk.
4 |
--------------------------------------------------------------------------------
/fallzahlen_plz/Readme.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 | See: https://github.com/openZH/covid_19/tree/master#canton-zurich-postal-codes-postleitzahl.
3 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_total_csv_v2/README.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 | Siehe: https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-unified-dataset.
3 |
--------------------------------------------------------------------------------
/COVID19_Fallzahlen_Kanton_ZH_isolated_quarantined.csv:
--------------------------------------------------------------------------------
1 | date,abbreviation_canton_and_fl,current_isolated,current_quarantined
2 | 2020-05-26,ZH,14,58
3 | 2020-05-29,ZH,22,67
4 | 2020-06-02,ZH,18,47
5 |
--------------------------------------------------------------------------------
/COVID19_Fallzahlen_Kanton_ZH_Beispiel_alter_geschlecht_.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewDeaths,PreExistingCond
2 | 2020-03-01,Canton_ZH,30,F,1,0,0
3 | 2020-03-01,Canton_ZH,32,M,0,1,1
4 |
5 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AI_alter_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths
2 | 14.03.2020,Canton_AI,59,m,,1,,
3 | 14.03.2020,Canton_AI,57,f,,1,,
4 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_alter_geschlecht_BEISPIEL.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewDeaths,PreExistingCond
2 | 2020-03-01,Canton_ZH,30,F,1,0,0
3 | 2020-03-01,Canton_ZH,32,M,0,1,1
4 |
5 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_zh/README.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 | See:
3 | - https://github.com/openZH/covid_19/tree/master#canton-z%C3%BCrich-unified-dataset
4 | - https://github.com/openZH/covid_19/blob/master/README.md#canton-z%C3%BCrich-more-detailed-dataset.
5 |
--------------------------------------------------------------------------------
/scripts/latest_total.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | for f in *.csv; do
 4 |   # Output last row with non-zero cumulative number of cases
5 | awk -F , '{if ($5) { print $1, $3, $5; }}' "$f" | tail -1
6 | done | awk 'BEGIN { sum = 0; } { sum += $3; } END { print sum; }'
7 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/Readme.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 |
3 | See:
4 | - https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-more-detailed-dataset
5 | - https://github.com/openZH/covid_19/tree/master#canton-zurich-more-detailed-dataset
6 |
--------------------------------------------------------------------------------
/scripts/transform_all_new2old.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | for f in $DIR/../fallzahlen_kanton_total_csv_v2/*.csv;
6 | do
7 | filename="$(basename "$f")"
8 | $DIR/new2oldcsv.py $f > $DIR/../fallzahlen_kanton_total_csv/$filename
9 | done
10 |
--------------------------------------------------------------------------------
/mappingCanton_BFS.csv:
--------------------------------------------------------------------------------
1 | abk,bfs
2 | ZH,01
3 | BE,02
4 | LU,03
5 | UR,04
6 | SZ,05
7 | OW,06
8 | NW,07
9 | GL,08
10 | ZG,09
11 | FR,10
12 | SO,11
13 | BS,12
14 | BL,13
15 | SH,14
16 | AR,15
17 | AI,16
18 | SG,17
19 | GR,18
20 | AG,19
21 | TG,20
22 | TI,21
23 | VD,22
24 | VS,23
25 | NE,24
26 | GE,25
27 | JU,26
28 | FL,99
29 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AR_alter_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths
2 | 05.03.2020,Canton_AR,50,f,,1,,
3 | 09.03.2020,Canton_AR,,f,,1,,
4 | 12.03.2020,Canton_AR,69,f,,1,,
5 | 12.03.2020,Canton_AR,38,f,,1,,
6 | 12.03.2020,Canton_AR,42,f,,1,,
7 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "covid_19",
3 | "version": "1.0.0",
4 | "repository": "git@github.com:openZH/covid_19.git",
5 | "license": "MIT",
6 | "dependencies": {
7 | "csv-validator": "0.0.3"
8 | },
9 | "scripts": {
10 | "test": "node scripts/validate-csv.js fallzahlen_kanton_total_csv_v2/*.csv"
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/scripts/transform_all_add_columns.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | for f in $DIR/../fallzahlen_kanton_total_csv_v2/*.csv;
6 | do
7 | filename="$(basename "$f")"
8 | $DIR/add_new_columns.py $f > /tmp/columnfile
9 | cat /tmp/columnfile > $DIR/../fallzahlen_kanton_total_csv_v2/$filename
10 | done
11 |
--------------------------------------------------------------------------------
/scripts/transform_all_old2new.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | mkdir -p $DIR/../fallzahlen_kanton_total_csv_v2
6 |
7 | for f in $DIR/../fallzahlen_kanton_total_csv/*.csv;
8 | do
9 | filename="$(basename "$f")"
10 | $DIR/old2newcsv.py $f > $DIR/../fallzahlen_kanton_total_csv_v2/$filename
11 | done
12 |
--------------------------------------------------------------------------------
/scripts/check_for_empty_lines.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | path="$*"
4 | output=$(grep --line-number --with-filename '^\s*$' $path)
5 | grep_exit=$?
6 |
7 | if [ $grep_exit -eq 0 ] ; then
 8 |   echo "× Found empty lines in the following files/line numbers:"
9 | echo $output
10 | exit 1
11 | else
12 | echo "✓ No empty lines found"
13 | exit 0
14 | fi
15 |
16 |
--------------------------------------------------------------------------------
/COVID19_Fallzahlen_Beispiel.csv:
--------------------------------------------------------------------------------
1 | date,time,abbreviation_canton_and_fl,ncumul_tested,ncumul_conf,new_hosp,current_hosp,current_icu,current_vent,ncumul_released,ncumul_deceased,source,current_isolated,current_quarantined,current_quarantined_riskareatravel
2 | 2020-02-27,17:40,AG,10000,1000,10,100,10,10,100,10,https://ag.ch/...,37,88,112
3 | 2020-02-28,11:00,AG,11000,1010,5,80,5,5,120,15,https://ag.ch/...,35,67,132
4 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributors to data collection & cleaning: please check https://github.com/openZH/covid_19/issues for open issues, and use this to flag any problems.
2 |
3 | The best way to get started right now is to join the discussion at https://github.com/openZH/covid_19/discussions?discussions_q=sort%3Atop
4 |
5 | Users of the data: please share links to your projects in https://github.com/openZH/covid_19#community-contributions
6 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AG_alter_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths,source
2 | 01.03.2020,Canton_AG,31,m,1,,,,https://www.ag.ch/de/aktuelles/medienportal/medienmitteilung/medienmitteilungen/mediendetails_138717.jsp
3 | 01.03.2020,Canton_AG,74,f,,,,1,https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/lagebulletins/200305_KFS_Coronavirus_Lagebulletin_5.pdf
4 |
--------------------------------------------------------------------------------
/scrapers/scrape_nw_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def get_nw_page():
10 | url = 'https://www.nw.ch/gesundheitsamtdienste/6044'
11 | content = sc.download(url, silent=True)
12 | content = content.replace(" ", " ")
13 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
14 | soup = BeautifulSoup(content, 'html.parser')
15 | return url, soup
16 |
--------------------------------------------------------------------------------
/scrapers/scrape_gl_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def get_gl_pdf_url():
10 | d = sc.download('https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817', silent=True)
11 | soup = BeautifulSoup(d, 'html.parser')
12 |
13 | # weekly pdf
14 | elem = soup.find(href=re.compile(r'Sentinella.*\.pdf'))
15 | if elem is None:
16 | return None
17 | return elem.get('href')
18 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from bs4 import BeautifulSoup
4 | import re
5 | import scrape_common as sc
6 |
7 |
8 | def get_ag_xls_url():
9 | data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
10 | d = sc.download(data_url, silent=True)
11 | soup = BeautifulSoup(d, 'html.parser')
12 | xls_url = soup.find('a', href=re.compile(r'\.xlsx$'))['href']
13 | if not xls_url.startswith('http'):
14 | xls_url = f'https://www.ag.ch{xls_url}'
15 | return xls_url
16 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_total_csv/README.md:
--------------------------------------------------------------------------------
1 | # Warning: Do not manually update files in this directory
2 |
3 | This directory contains all cantonal files in the "old" structure (before 2020-04-09).
4 | All CSV files in this directory will be **updated automatically** every 15min based on the corresponding file in the "fallzahlen_kanton_total_csv_v2" directory.
5 |
6 | All manual changes to these files will be overwritten.
7 |
8 | # Metadata
9 | See: https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-unified-dataset.
10 |
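11 | # Regenerating locally
12 | The old-format files can be rebuilt from the v2 files with the transform script in `scripts/` (a sketch of the manual invocation; the scheduled automation may use a different entry point):
13 |
14 | ```sh
15 | # Rewrites every file in fallzahlen_kanton_total_csv/ from its counterpart
16 | # in fallzahlen_kanton_total_csv_v2/ using scripts/new2oldcsv.py.
17 | ./scripts/transform_all_new2old.sh
18 | ```
19 |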
--------------------------------------------------------------------------------
/.github/workflows/rebase.yml:
--------------------------------------------------------------------------------
1 | on:
2 | issue_comment:
3 | types: [created]
4 | name: Automatic Rebase
5 | jobs:
6 | rebase:
7 | name: Rebase
8 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/rebase')
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout the latest code
12 | uses: actions/checkout@v3
13 | with:
14 | fetch-depth: 0
15 | - name: Automatic Rebase
16 | uses: cirrus-actions/rebase@1.3.1
17 | env:
18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
19 |
--------------------------------------------------------------------------------
/scrapers/scrape_vd_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import scrape_common as sc
6 |
7 |
8 | def get_weekly_pdf_url():
9 | return get_all_weekly_pdf_urls()[0]
10 |
11 |
12 | def get_all_weekly_pdf_urls():
13 | base_url = 'https://www.infosan.vd.ch'
14 | d = sc.download(base_url, silent=True)
15 |
16 | urls = re.findall(r"window.open\('(.*_epidemio\.pdf)'", d)
17 | result = []
18 | for url in urls:
19 | if not url.startswith('http'):
20 | url = f'{base_url}/{url}'
21 | result.append(url)
22 | return result
23 |
--------------------------------------------------------------------------------
/scrapers/scrape_fl_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 |
5 | url = 'https://www.llv.li/files/as/grafik_covid19_tests_pro_kw.xlsx'
6 | xls = sc.xlsdownload(url, silent=True)
7 | rows = sc.parse_xls(xls, header_row=74, sheet_name='gTests_AG')
8 | year = '2020'
9 | for row in rows:
10 | if row['C'] is None:
11 | # skip the footer line
12 | continue
13 | td = sc.TestData(canton='FL', url=url)
14 | td.week = int(sc.find(r'KW (\d+)', row['C']))
15 | if td.week == 1:
16 | year = '2021'
17 | td.year = year
18 | td.negative_tests = row['Negativ']
19 | td.positive_tests = row['Positiv']
20 | print(td)
21 |
--------------------------------------------------------------------------------
/.github/workflows/lint_python.yml:
--------------------------------------------------------------------------------
1 | name: Tests + Linting Python
2 | on:
3 | pull_request:
4 | push:
5 | branches: [master]
6 | workflow_dispatch: ~
7 | jobs:
8 | lint_python:
9 | runs-on: ubuntu-20.04
10 | timeout-minutes: 10
11 | steps:
12 | - uses: actions/checkout@v3
13 | - name: Set up Python 3.7
14 | uses: actions/setup-python@v4
15 | with:
16 | python-version: 3.7
17 | - run: python -m pip install --upgrade pip
18 | - run: pip install flake8 pytest
19 | - run: pip install -r requirements.txt
20 | - run: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
21 | - run: PYTHONPATH=scrapers pytest
22 |
--------------------------------------------------------------------------------
/scrapers/scrape_zh_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 |
8 | url = 'https://raw.githubusercontent.com/openZH/covid_19/master/fallzahlen_kanton_zh/COVID19_Anteil_positiver_Test_pro_KW.csv'
9 | data = sc.download(url, silent=True)
10 |
11 | reader = csv.DictReader(StringIO(data), delimiter=',')
12 | for row in reader:
13 | td = sc.TestData(canton='ZH', url=url)
14 | td.start_date = row['Woche_von']
15 | td.end_date = row['Woche_bis']
16 | td.week = row['Kalenderwoche']
17 | td.positive_tests = int(row['Anzahl_positiv'])
18 | td.negative_tests = int(row['Anzahl_negativ'])
19 | td.positivity_rate = float(row['Anteil_positiv'])
20 | print(td)
21 |
--------------------------------------------------------------------------------
/scrapers/scrape_be_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
8 |
9 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/vortag_tests.csv'
10 | d = sc.download(csv_url, silent=True)
11 | reader = csv.DictReader(StringIO(d), delimiter=',')
12 | for row in reader:
13 | td = sc.TestData(canton='BE', url=url)
14 | date = sc.date_from_text(row['datum']).isoformat()
15 | td.start_date = date
16 | td.end_date = date
17 | td.total_tests = row['durchgefuehrte_tests']
18 | td.positive_tests = row['positive_tests']
19 | td.positivity_rate = row['positivitaetsrate']
20 | print(td)
21 |
--------------------------------------------------------------------------------
/scripts/merge_canton_csvs.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'csv'
4 |
5 | # get files
6 | files = Dir["fallzahlen_kanton_total_csv_v2/*.csv"]
7 |
8 | # output array
9 | rows = []
10 |
11 | # read headers
12 | header = CSV.read(files.first).first
13 |
14 | # read all csv files
15 | files.each do |fn|
16 | CSV.foreach(fn, headers: true) do |row|
17 | # make sure time is formatted with leading zeroes
18 | if row[1] =~ /(\d{1,2}):(\d{1,2})/
19 | row[1] = sprintf "%02d:%02d", $1.to_i, $2.to_i
20 | end
21 | rows << row[0..14]
22 | end
23 | end
24 |
25 | # sort records by date
26 | rows.sort_by! { |x| "#{x[0]}-#{x[1]}-#{x[2]}" }
27 |
28 |
29 | # output
30 | puts header.to_csv
31 | rows.each{ |row| puts row.to_csv }
32 |
33 |
34 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def get_fr_csv():
10 | main_url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton'
11 | d = sc.download(main_url, silent=True)
12 |
13 | soup = BeautifulSoup(d, 'html.parser')
14 | item = soup.find('a', title=re.compile(r"Statistik .ber die Entwicklungen im Kanton.*"))
15 | csv_url = item.get('href')
16 | assert csv_url, "URL is empty"
17 | if not csv_url.startswith('http'):
18 | csv_url = f'https://www.fr.ch{csv_url}'
19 |
20 | csv = sc.download(csv_url, silent=True)
21 | csv = re.sub(r'(\d+)\'(\d+)', r'\1\2', csv)
22 | return csv_url, csv, main_url
23 |
--------------------------------------------------------------------------------
/scrapers/validate_scraper_output.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # Script to validate the output of a single scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | area="Kanton_${SCRAPER_KEY}"
23 | if [ "$SCRAPER_KEY" = "FL" ] ; then
24 | area="${SCRAPER_KEY}"
25 | fi
26 |
27 | # 1. Validate the result
28 | node $DIR/../scripts/validate-csv.js $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
29 |
30 | # 2. Check for outliers
31 | python $DIR/../scripts/check_for_outliers.py $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
32 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Einwohner_Kanton_ZH_altersklassen_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Year,Area,AgeYearCat,Gender,Inhabitants
2 | 2019,Canton_ZH,0-9,M,82878
3 | 2019,Canton_ZH,0-9,F,78735
4 | 2019,Canton_ZH,10-19,M,72994
5 | 2019,Canton_ZH,10-19,F,68488
6 | 2019,Canton_ZH,100+,M,45
7 | 2019,Canton_ZH,100+,F,200
8 | 2019,Canton_ZH,20-29,M,95172
9 | 2019,Canton_ZH,20-29,F,91194
10 | 2019,Canton_ZH,30-39,M,127998
11 | 2019,Canton_ZH,30-39,F,125184
12 | 2019,Canton_ZH,40-49,M,116400
13 | 2019,Canton_ZH,40-49,F,111604
14 | 2019,Canton_ZH,50-59,M,112667
15 | 2019,Canton_ZH,50-59,F,107919
16 | 2019,Canton_ZH,60-69,M,73383
17 | 2019,Canton_ZH,60-69,F,78006
18 | 2019,Canton_ZH,70-79,M,54372
19 | 2019,Canton_ZH,70-79,F,63877
20 | 2019,Canton_ZH,80-89,M,24989
21 | 2019,Canton_ZH,80-89,F,36988
22 | 2019,Canton_ZH,90-99,M,4020
23 | 2019,Canton_ZH,90-99,F,9293
24 |
--------------------------------------------------------------------------------
/scrapers/scrape_so_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def strip_value(value):
10 | return value.replace('\'', '')
11 |
12 |
13 | def get_latest_weekly_pdf_url():
14 | return get_all_weekly_pdf_urls()[0]
15 |
16 |
17 | def get_all_weekly_pdf_urls():
18 | base_url = 'https://corona.so.ch'
19 | url = f'{base_url}/bevoelkerung/daten/woechentlicher-situationsbericht/'
20 | d = sc.download(url, silent=True)
21 | soup = BeautifulSoup(d, 'html.parser')
22 | links = soup.find_all(href=re.compile(r'\.pdf$'))
23 | result = []
24 | for link in links:
25 | file_ref = link.get('href')
26 | url = f'{base_url}{file_ref}'
27 | if url not in result:
28 | result.append(url)
29 | return result
30 |
--------------------------------------------------------------------------------
/scripts/update_dates_in_readme.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | today=$(date +%s)
6 |
7 | areas="FL AG AI AR BE BL BS FR GE GL GR JU LU NE NW OW SG SH SO SZ TG TI UR VD VS ZG ZH"
8 | for area in $areas
9 | do
10 | update_date_str=`grep $area $DIR/../COVID19_Fallzahlen_CH_total_v2.csv | tail -n 1 | awk -F, '{print $1}'`
11 | update_date=$(date --date="$update_date_str" +%s)
12 | diff=$(($today-$update_date))
13 |
14 | if [ $diff -lt 84000 ]; then
15 | color='4d9221'
16 | elif [ $diff -lt 144000 ]; then
17 | color='b8e186'
18 | else
19 | color='de77ae'
20 | fi
21 | sed -i -e "/\[$area\]/s#update on [^|]*|#update on $update_date_str](https://placehold.jp/$color/000000/200x50.png?text=$update_date_str 'Last update on $update_date_str')|#" $DIR/../README.md
22 | echo "Update README for ${area} (date: ${update_date_str}, color: ${color})"
23 | done
24 |
--------------------------------------------------------------------------------
/scrapers/db_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | import os
5 |
6 |
7 | def get_location():
8 | location = os.path.realpath(
9 | os.path.join(
10 | os.getcwd(),
11 | os.path.dirname(__file__)
12 | )
13 | )
14 | return location
15 |
16 |
17 | def load_csv(filename):
18 | columns = []
19 | with open(filename, 'r') as f:
20 | dr = csv.DictReader(f)
21 | if not columns:
22 | columns = dr.fieldnames
23 | to_db = []
24 | for r in dr:
25 | db_row = []
26 | for col in columns:
27 | db_row.append(r[col])
28 | to_db.append(db_row)
29 | return columns, to_db
30 |
31 |
32 | def insert_db_query(columns):
33 | query = 'INSERT INTO data (\n'
34 | query += ",\n".join(columns)
35 | query += ') VALUES ('
36 | query += ",".join(['?'] * len(columns))
37 | query += ');'
38 | return query
39 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr_tests.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 |
 4 | import csv
 5 | from io import StringIO
 6 |
 7 | import scrape_common as sc
 8 | from scrape_fr_common import get_fr_csv
 9 |
10 | """
11 | csv_url, csv_data, main_url = get_fr_csv()
12 | reader = csv.DictReader(StringIO(csv_data), delimiter=';')
13 |
14 | year = '2020'
15 |
16 | for row in reader:
17 |     week = row['semaine /Woche']
18 |     if not week:
19 |         continue
20 |
21 |     if int(week) == 1:
22 |         year = '2021'
23 |
24 |     td = sc.TestData(canton='FR', url=main_url)
25 |     td.week = int(week)
26 |     td.year = year
27 |     td.pcr_total_tests = int(row['Tests PCR'])
28 |     if row['Taux/Rate PCR']:
29 |         td.pcr_positivity_rate = round(float(row['Taux/Rate PCR']) * 100)
30 |     td.ag_total_tests = int(row['Tests AG'])
31 |     if row['Taux/Rate AG']:
32 |         td.ag_positivity_rate = round(float(row['Taux/Rate AG']) * 100)
33 |     td.total_tests = td.pcr_total_tests + td.ag_total_tests
34 |     print(td)
35 | """
36 |
--------------------------------------------------------------------------------
/scrapers/scrape_nw_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import scrape_common as sc
6 | import scrape_nw_common as snc
7 |
8 | url, soup = snc.get_nw_page()
9 |
10 | td = sc.TestData(canton='NW', url=url)
11 |
12 | item = soup.find(text=re.compile('Anzahl F.lle')).find_parent('p')
13 | assert item, f"Could not find title item in {url}"
14 |
15 | date = sc.find(r'Stand: (\d+\. .* 20\d{2})', item.text)
16 | date = sc.date_from_text(date)
17 | td.start_date = date.isoformat()
18 | td.end_date = date.isoformat()
19 |
20 | rows = item.find_next('table').findChildren('tr')
21 | for row in rows:
22 | cols = row.findChildren('td')
23 | item = cols[0].text
24 | if re.match(r'Covid-19-Tests innert 24h.*', item, re.I):
25 | res = re.match(r'(\d+)\s+(\d+\.?\d?)%', cols[1].text)
26 | if res is not None:
27 | td.total_tests = res[1]
28 | td.positivity_rate = res[2]
29 |
30 | if td:
31 | print(td)
32 |
--------------------------------------------------------------------------------
/scrapers/scrape_zh.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | import re
5 | from io import StringIO
6 | import scrape_common as sc
7 |
8 | url = "https://www.zh.ch/de/gesundheit/coronavirus.html"
9 | csv_url = 'https://raw.githubusercontent.com/openzh/covid_19/master/fallzahlen_kanton_zh/COVID19_Fallzahlen_Kanton_ZH_total.csv'
10 | d_csv = sc.download(csv_url, silent=True)
11 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
12 |
13 | is_first = True
14 | for row in reader:
15 | if not is_first:
16 | print('-' * 10)
17 | is_first = False
18 |
19 | dd = sc.DayData(canton='ZH', url=url)
20 | dd.datetime = f"{row['date']} {row['time']}"
21 | dd.cases = row['ncumul_conf']
22 | dd.deaths = row['ncumul_deceased']
23 | dd.hospitalized = row['current_hosp']
24 | dd.vent = row['current_vent']
25 | dd.icu = row['current_icu']
26 | dd.isolated = row['current_isolated']
27 | dd.quarantined = row['current_quarantined']
28 | print(dd)
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/scrapers/scrape_tg_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import requests
6 | import scrape_common as sc
7 |
8 | # perma link to TG COVID dataset on opendata.swiss
9 | r = requests.get(
10 | 'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier',
11 |     params={'identifier': 'dfs-ga-3@kanton-thurgau'}
12 | )
13 | dataset = r.json()['result']
14 | resource = next(r for r in dataset['resources'] if r['mimetype'] == 'text/csv')
15 |
16 | assert resource['download_url'], "Download URL not found"
17 |
18 | d_csv = sc.download(resource['download_url'], silent=True, encoding='latin1')
19 |
20 | reader = csv.DictReader(StringIO(d_csv), delimiter=';')
21 | for row in reader:
22 | dd = sc.DistrictData(canton='TG')
23 | dd.district_id = row['districtid']
24 | dd.district = row['district']
25 | dd.population = row['population']
26 | dd.week = row['week']
27 | dd.year = row['year']
28 | dd.new_cases = row['newconfcases']
29 | dd.url = resource['download_url']
30 | print(dd)
31 |
--------------------------------------------------------------------------------
/scrapers/scrape_ti_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import re
6 | import scrape_common as sc
7 |
8 | main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/'
9 | d = sc.download(main_url, silent=True)
10 | soup = BeautifulSoup(d, 'html.parser')
11 |
12 | td = sc.TestData(canton='TI', url=main_url)
13 |
14 | container = soup.find('h2', string=re.compile(r'Test PCR')).find_next('div')
15 | for item in container.find_all('div'):
16 | divs = item.find_all('div')
17 | if len(divs) == 3:
18 | if divs[2].string:
19 | date = sc.find(r'.*?(\d+\.\d+\.\d{2})', divs[2].string)
20 | date = sc.date_from_text(date)
21 | td.start_date = date.isoformat()
22 | td.end_date = date.isoformat()
23 | if sc.find(r'^(Totale test).*', divs[1].string):
24 | td.total_tests = divs[0].string
25 | if sc.find(r'^(% test).*', divs[1].string):
26 | td.positivity_rate = divs[0].string
27 |
28 | if td:
29 | assert td.start_date and td.end_date, 'failed to extract date'
30 | print(td)
31 |
--------------------------------------------------------------------------------
/scrapers/scrape_zg_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import collections
4 | import csv
5 | import datetime
6 | from io import StringIO
7 | import scrape_common as sc
8 |
9 |
10 | csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv'
11 | d_csv = sc.download(csv_url, silent=True)
12 | """
13 | "Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content"
14 | 2020-05-25,"männlich","151",NA,NA,NA
15 | 2020-06-01,"männlich","117",NA,NA,NA
16 | """
17 |
18 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
19 | data = collections.defaultdict(dict)
20 | for row in reader:
21 | if row['Woche'] == 'NA':
22 | continue
23 | date = sc.date_from_text(row['Woche'])
24 | if date not in data:
25 | data[date] = 0
26 | data[date] += int(row['Anzahl Fälle'])
27 |
28 | days = list(data.keys())
29 | for day in days:
30 | td = sc.TestData(canton='ZG', url=csv_url)
31 | td.start_date = day.isoformat()
32 | td.end_date = (day + datetime.timedelta(days=6)).isoformat()
33 | td.total_tests = data[day]
34 | print(td)
35 |
--------------------------------------------------------------------------------
/scrapers/scrape_sh_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 | import scrape_sh_common as shc
9 |
10 | main_url, xls = shc.get_sh_xlsx()
11 |
12 | rows = sc.parse_xls(xls, sheet_name='Datensatz_Tests', header_row=0)
13 | for row in rows:
14 | if not (row['Jahr'] or row['Kalenderwoche']):
15 | continue
16 |
17 | td = sc.TestData(canton='SH', url=main_url)
18 | td.year = row['Jahr']
19 | td.week = row['Kalenderwoche']
20 |
21 | td.pcr_total_tests = 0
22 | pcr_cols = ['Tests KAZ', 'Tests Apotheken', 'Tests KSSH', 'Test Praxen']
23 | for col in pcr_cols:
24 | if sc.represents_int(row[col]):
25 | td.pcr_total_tests += row[col]
26 |
27 | td.ag_total_tests = 0
28 | ag_cols = ['Schnelltests KAZ', 'Schnelltests Apotheken', 'Schnelltests KSSH', 'Schnelltest Praxen']
29 | for col in ag_cols:
30 | if sc.represents_int(row[col]):
31 | td.ag_total_tests += row[col]
32 | td.total_tests = td.pcr_total_tests + td.ag_total_tests
33 | print(td)
34 |
--------------------------------------------------------------------------------
/scrapers/scrape_gl_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 | import scrape_gl_common as sgc
8 |
9 | pdf_url = sgc.get_gl_pdf_url()
10 | if pdf_url is not None:
11 | pdf = sc.download_content(pdf_url, silent=True)
12 | content = sc.pdftotext(pdf, page=1, layout=True)
13 | # remove 1k separators
14 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content)
15 |
16 | year = sc.find(r'Stand: \d{2}\.\d{2}.(\d{4})', content)
17 | week = sc.find(r'KW(\d+)\.pdf', pdf_url)
18 |
19 | # Insgesamt Anzahl, 100k, 14 Tage Anzahl, 100k, 7 Tage Anzahl, 100k
20 | number_of_tests = sc.find(r'PCR-Tests/Schnelltests\sKanton Glarus\s+\d+\s+\d+\.?\d+?\s+\d+\s+\d+\.?\d+?\s+(\d+)\s+\d+', content)
21 | # Insgesamt, 14 Tage, 7 Tage
22 | positivity_rate = sc.find(r'Positivit.tsrate GL\s?\*+?\s+\d+\.\d%\s+\d+\.\d%\s+(\d+\.\d)%\s+', content)
23 |
24 | td = sc.TestData(canton='GL', url=pdf_url)
25 | td.week = week
26 | td.year = year
27 | td.total_tests = number_of_tests
28 | td.positivity_rate = positivity_rate
29 | print(td)
30 |
--------------------------------------------------------------------------------
/scrapers/scrape_nw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 | import scrape_nw_common as snc
8 |
9 | is_first = True
10 | xls_url = 'http://www.nw.ch/coronastatistik'
11 | xls = sc.xlsdownload(xls_url, silent=True)
12 | rows = sc.parse_xls(xls, header_row=2)
13 | for row in rows:
14 | dd = sc.DayData(canton='NW', url=xls_url)
15 | dd.datetime = row['A'].date().isoformat()
16 | dd.cases = row['Positiv getestete Personen (kumuliert)']
17 | dd.icu = row['Davon auf der Intensivstation']
18 |
19 | try:
20 | dd.hospitalized = row['Aktuell hospitalisierte Personen']
21 | except KeyError:
22 | dd.hospitalized = row['Hospitalisierte Personen']
23 |
24 | try:
25 | dd.deaths = row['Personen verstorben']
26 | except KeyError:
27 | dd.deaths = row['Verstorbene Personen']
28 |
29 | # skip empty rows
30 | if dd.cases is None and dd.icu is None and dd.hospitalized is None and dd.deaths is None:
31 | continue
32 |
33 | if not is_first:
34 | print('-' * 10)
35 | is_first = False
36 | print(dd)
37 |
--------------------------------------------------------------------------------
/scripts/validate-schema.js:
--------------------------------------------------------------------------------
1 | const csval = require("csval");
2 | const fs = require("fs").promises;
3 | const path = require("path");
4 |
5 | const DIR = path.resolve(process.argv[2] || process.cwd());
6 |
7 | const validateSequentially = async csvFiles => {
8 | const rules = await csval.readRules(path.join(DIR, "schema.json"));
9 |
10 | let failedChecks = 0;
11 |
12 | for (let csvFile of csvFiles) {
13 | const csv = await csval.readCsv(path.join(DIR, csvFile));
14 | const parsed = await csval.parseCsv(csv);
15 | let valid = false;
16 | try {
17 | valid = await csval.validate(parsed, rules);
18 | } catch (e) {
19 | failedChecks++;
20 | console.log(`× ${csvFile} failed the following checks:${e.message}\n`);
21 | }
22 | if (valid) {
23 | console.log(`✓ ${csvFile} is valid.`);
24 | }
25 | }
26 |
27 | return failedChecks;
28 | };
29 |
30 | const run = async () => {
31 | const csvFiles = (await fs.readdir(DIR)).filter(f => f.match(/\.csv$/));
32 | const failedChecks = await validateSequentially(csvFiles);
33 |
34 | if (failedChecks > 0) {
35 | process.exit(1);
36 | }
37 | };
38 |
39 | run().catch(e => console.error(e));
40 |
--------------------------------------------------------------------------------
/scrapers/run_district_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run a single district scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | # 1. populate the database with the current CSV
23 | echo "Populating database from CSV fallzahlen_kanton_${SCRAPER_KEY}_bezirk..."
24 | $DIR/populate_district_database.py $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv
25 |
26 | # 2. run the scraper, update the db
27 | echo "Run the district scraper..."
28 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}_districts.py"
29 | $scrape_script | $DIR/add_district_db_entry.py
30 |
31 | # 3. Export the database as csv
32 | echo "Export database to CSV..."
33 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by DistrictId, District, Canton, Date, Year, Week+0 asc;" > $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv
34 | sed -i 's/""//g' $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv
35 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import datetime
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | import scrape_common as sc
9 |
10 |
11 | def get_vs_latest_weekly_pdf_url():
12 | pdfs = get_vs_weekly_pdf_urls()
13 | assert pdfs, "Could not find weekly PDFs"
14 | return pdfs[0]
15 |
16 |
17 | def get_vs_weekly_pdf_urls():
18 | base_url = 'https://www.vs.ch'
19 | url = base_url + '/de/web/coronavirus/statistiques-hebdomadaires'
20 | content = sc.download(url, silent=True)
21 | soup = BeautifulSoup(content, 'html.parser')
22 | links = soup.find_all(href=re.compile(r'Synthese.*Woche'))
23 | result = []
24 | for link in links:
25 | url = base_url + link['href'].replace(' ', '%20')
26 | result.append(url)
27 | return result
28 |
29 |
30 | def get_vs_weekly_general_data(pdf):
31 | content = sc.pdftotext(pdf, page=1)
32 | week = int(sc.find(r'Epidemiologische Situation Woche (\d+)', content))
33 | end_date = sc.find(r'bis\s+(\d+\.\d+\.\d{4})', content)
34 | end_date = sc.date_from_text(end_date)
35 | start_date = end_date - datetime.timedelta(days=7)
36 | year = start_date.year
37 | return week, year
38 |
--------------------------------------------------------------------------------
/scrapers/test/test_district_data.py:
--------------------------------------------------------------------------------
1 | from scrapers.scrape_common import DistrictData
2 |
3 | def test_district_data():
4 | dd = DistrictData()
5 | dd.date = '1'
6 | dd.week = 2
7 | dd.year = 3
8 | dd.canton = '4'
9 | dd.district = '5'
10 | dd.district_id = 6
11 | dd.population = 7
12 | dd.total_cases = 8
13 | dd.new_cases = 9
14 | dd.total_deceased = 10
15 | dd.new_deceased = 11
16 | dd.url = '12'
17 |
18 | string = str(dd)
19 |
20 | dd_parsed = DistrictData()
21 | assert dd_parsed.parse(string)
22 | assert dd.date == dd_parsed.date
23 | assert dd.week == dd_parsed.week
24 | assert dd.year == dd_parsed.year
25 | assert dd.canton == dd_parsed.canton
26 | assert dd.district == dd_parsed.district
27 | assert dd.district_id == dd_parsed.district_id
28 | assert dd.population == dd_parsed.population
29 | assert dd.total_cases == dd_parsed.total_cases
30 | assert dd.new_cases == dd_parsed.new_cases
31 | assert dd.total_deceased == dd_parsed.total_deceased
32 | assert dd.new_deceased == dd_parsed.new_deceased
33 | assert dd.url == dd_parsed.url
34 |
35 |
36 | if __name__ == "__main__":
37 | test_district_data()
38 |
--------------------------------------------------------------------------------
/scrapers/run_tests_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run a single tests scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | area="kanton_${SCRAPER_KEY}"
23 | if [ "$SCRAPER_KEY" = "FL" ] ; then
24 | area="${SCRAPER_KEY}"
25 | fi
26 |
27 | # 1. populate the database with the current CSV
28 | echo "Populating database from CSV fallzahlen_${area}_tests..."
29 | $DIR/populate_tests_database.py $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv
30 |
31 | # 2. run the scraper, update the db
32 | echo "Run the tests scraper..."
33 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}_tests.py"
34 | $scrape_script | $DIR/add_tests_db_entry.py
35 |
36 | # 3. Export the database as csv
37 | echo "Export database to CSV..."
38 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by canton, start_date, end_date, year, week+0 asc;" > $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv
39 | sed -i 's/""//g' $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv
40 |
--------------------------------------------------------------------------------
/scrapers/meta_scrape.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Scrapers are expected to output data on standard output in the following
4 | # format:
5 | #
6 | # GR
7 | # Scraped at: 2020-03-21T19:22:10+01:00
8 | # Date and time: 20.03.2020
9 | # Confirmed cases: 213
10 | # Deaths: 3
11 | #
12 | # Abbreviation of the canton first.
13 | #
14 | # Then scraped timestamp. Current time in ISO-8601 format. Implicitly in Swiss
15 | # timezone (TZ=Europe/Zurich), CET, or CEST.
16 | #
17 | # The information about when the data was published / gathered.
18 | # The date and time, or just time, can be omitted if not available.
19 | # Any date / time format is ok; the more accurate, the better. It is advised to strip
20 | # the name of the weekday. Add time parser to the parse_scrape_output.py script
21 | # if needed.
22 | #
23 | # Number of cases.
24 | #
25 | # Number of deaths can be omitted, if not available.
26 |
27 | for s in ./scrape_??.py;
28 | do
29 | L=$(./$s | ./parse_scrape_output.py)
30 | if ! echo "${L}" | egrep ' (OK|FAILED)' >/dev/null; then
31 | a=$(echo "$s" | sed -E -e 's/^.*scrape_(..)\..*$/\1/' | tr a-z A-Z) # ' # To make my editor happy.
32 | echo "$a" - - - FAILED "$(date --iso-8601=seconds)"
33 | else
34 | echo "${L}"
35 | fi
36 | done
37 |
--------------------------------------------------------------------------------
/scrapers/scrape_sg_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 |
8 | url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist_729873930/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Tests_download.csv'
9 | data = sc.download(url, silent=True)
10 |
11 | # strip the "header" / description lines
12 | data = "\n".join(data.split("\n")[9:])
13 |
14 | reader = csv.DictReader(StringIO(data), delimiter=';')
15 | for row in reader:
16 | td = sc.TestData(canton='SG', url=url)
17 | td.start_date = row['Datum']
18 | td.end_date = row['Datum']
19 | td.pcr_positive_tests = row['Positiv (PCR)']
20 | td.pcr_negative_tests = row['Negativ (PCR)']
21 | td.ag_positive_tests = row['Positiv (Schnelltest)']
22 | td.ag_negative_tests = row['Negativ (Schnelltest)']
23 | td.positive_tests = row['Total positive Tests']
24 | td.negative_tests = row['Total negative Tests']
25 | td.total_tests = row['Total Tests']
26 | if row['Positiv in % vom Total']:
27 | td.positivity_rate = float(row['Positiv in % vom Total']) * 100
28 | td.positivity_rate = round(10 * td.positivity_rate) / 10
29 | print(td)
30 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 |
5 | import scrape_common as sc
6 | import scrape_vs_common as svc
7 |
8 |
9 | # get all PDFs
10 | for url in svc.get_vs_weekly_pdf_urls():
11 | td = sc.TestData(canton='VS', url=url)
12 |
13 | pdf = sc.download_content(url, silent=True)
14 | td.week, td.year = svc.get_vs_weekly_general_data(pdf)
15 |
16 | for page in range(4, 6):
17 | content = sc.pdftotext(pdf, page=page, raw=True)
18 | content = re.sub(r'(\d)\‘(\d)', r'\1\2', content)
19 | content = re.sub(r'(\d)\’(\d)', r'\1\2', content)
20 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content)
21 |
22 | td.total_tests = sc.find(r'Alle\s+Arten\s+von\s+Tests\s+(\d+)', content)
23 | td.positivity_rate = sc.find(r'Alle\s+Arten\s+von\s+Tests\s+\d+\s+(\d+\.\d+)%', content)
24 | td.pcr_total_tests = sc.find(r'PCR\s+(\d+)', content)
25 | td.pcr_positivity_rate = sc.find(r'PCR\s+\d+\s+(\d+\.\d+)%', content)
26 | td.ag_total_tests = sc.find(r'Antigentests\s+(\d+)', content)
27 | td.ag_positivity_rate = sc.find(r'Antigentests\s+\d+\s+(\d+\.\d+)%', content)
28 |
29 | if not td.total_tests:
30 | continue
31 |
32 | print(td)
33 |
--------------------------------------------------------------------------------
/scrapers/test_tests_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # Script to run all tests scrapers
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 | NEWLINE=$'\n'
15 |
16 | echo "Run all tests scrapers..."
17 |
18 | exit_code=0
19 | errors=''
20 | for scrape_script in $DIR/scrape_??_tests.py
21 | do
22 | if [ -f $scrape_script -a -x $scrape_script ]
23 | then
24 | name=`basename $scrape_script`
25 | canton=${name:7:2}
26 | export SCRAPER_KEY=${canton^^}
27 | echo ""
28 | echo "Running ${SCRAPER_KEY} tests scraper..."
29 | echo "=========================================="
30 |
31 | set +e
32 | $DIR/run_tests_scraper.sh
33 | ret=$?
34 | if [ $ret -ne 0 ]
35 | then
36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2
37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret"
38 | exit_code=1
39 | fi
40 | set -e
41 |
42 | echo "=========================================="
43 | echo ""
44 | fi
45 | done
46 |
47 |
48 | echo "$errors"
49 | exit $exit_code
50 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_zh/COVID19_VOC_Kanton_ZH.csv:
--------------------------------------------------------------------------------
1 | date,new_pcr_pos,new_voc
2 | 2021-02-10,168,35
3 | 2021-02-09,247,54
4 | 2021-02-08,134,44
5 | 2021-02-07,82,29
6 | 2021-02-06,188,62
7 | 2021-02-05,194,41
8 | 2021-02-04,209,38
9 | 2021-02-03,215,43
10 | 2021-02-02,272,67
11 | 2021-02-01,143,37
12 | 2021-01-31,65,12
13 | 2021-01-30,193,34
14 | 2021-01-29,208,32
15 | 2021-01-28,287,34
16 | 2021-01-27,273,32
17 | 2021-01-26,316,41
18 | 2021-01-25,152,25
19 | 2021-01-24,115,16
20 | 2021-01-23,245,18
21 | 2021-01-22,390,23
22 | 2021-01-21,197,17
23 | 2021-01-20,301,14
24 | 2021-01-19,336,10
25 | 2021-01-18,217,6
26 | 2021-01-17,103,5
27 | 2021-01-16,251,8
28 | 2021-01-15,277,10
29 | 2021-01-14,273,5
30 | 2021-01-13,352,4
31 | 2021-01-12,392,8
32 | 2021-01-11,291,3
33 | 2021-01-10,163,0
34 | 2021-01-09,347,0
35 | 2021-01-08,446,6
36 | 2021-01-07,449,2
37 | 2021-01-06,616,4
38 | 2021-01-05,658,6
39 | 2021-01-04,494,2
40 | 2021-01-03,280,1
41 | 2021-01-02,388,2
42 | 2021-01-01,204,0
43 | 2020-12-31,638,0
44 | 2020-12-30,595,2
45 | 2020-12-29,731,4
46 | 2020-12-28,368,1
47 | 2020-12-27,284,0
48 | 2020-12-26,429,2
49 | 2020-12-25,229,0
50 | 2020-12-24,793,0
51 | 2020-12-23,855,1
52 | 2020-12-22,736,0
53 | 2020-12-21,414,1
54 | 2020-12-20,312,0
55 | 2020-12-19,494,2
56 | 2020-12-18,723,0
57 |
--------------------------------------------------------------------------------
/scrapers/scrape_tg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import requests
6 | import scrape_common as sc
7 |
8 | # perma link to TG COVID dataset on opendata.swiss
9 | r = requests.get(
10 | 'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier',
11 | params={'identifier': 'dfs-ga-1@kanton-thurgau'}
12 | )
13 | dataset = r.json()['result']
14 | resource = next(r for r in dataset['resources'] if r['mimetype'] == 'text/csv')
15 |
16 | assert resource['download_url'], "Download URL not found"
17 |
18 | d_csv = sc.download(resource['download_url'], silent=True)
19 |
20 | reader = csv.DictReader(StringIO(d_csv), delimiter=';')
21 | is_first = True
22 | for row in reader:
23 | if not row['date']:
24 | continue
25 | if not is_first:
26 | print('-' * 10)
27 | is_first = False
28 | dd = sc.DayData(canton='TG', url=row['source'])
29 | dd.datetime = f"{row['date']} {row['time']}"
30 | dd.cases = row['ncumul_conf']
31 | dd.deaths = row['ncumul_deceased']
32 | dd.hospitalized = row['current_hosp']
33 | dd.new_hosp = row['new_hosp']
34 | dd.recovered = row['ncumul_released']
35 | dd.icu = row['current_icu']
36 | dd.isolated = row['num_isolated']
37 | print(dd)
38 |
--------------------------------------------------------------------------------
/scrapers/run_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run a single scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | area="Kanton_${SCRAPER_KEY}"
23 | if [ "$SCRAPER_KEY" = "FL" ] ; then
24 | area="${SCRAPER_KEY}"
25 | fi
26 |
27 | # 1. populate the database with the current CSV
28 | echo "Populating database from CSV COVID19_Fallzahlen_${area}_total.csv..."
29 | $DIR/populate_database.py $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
30 |
31 | # 2. run the scraper, update the db
32 | echo "Run the scraper..."
33 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}.py"
34 | $scrape_script | $DIR/parse_scrape_output.py | $DIR/add_db_entry.py
35 |
36 | # 3. Export the database as csv
37 | echo "Export database to CSV..."
38 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by date asc;" > $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
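   | # sqlite3's CSV export quotes empty values as ""; strip those so empty fields stay empty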
39 | sed -i 's/""//g' $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
40 |
--------------------------------------------------------------------------------
/scrapers/test_district_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to test-run all district scrapers
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 | NEWLINE=$'\n'
15 |
16 | echo "Run all district scrapers..."
17 |
18 | exit_code=0
19 | errors=''
20 | for scrape_script in $DIR/scrape_??_districts.py
21 | do
22 | if [ -f $scrape_script -a -x $scrape_script ]
23 | then
24 | name=`basename $scrape_script`
25 | canton=${name:7:2}
26 | export SCRAPER_KEY=${canton^^}
27 | echo ""
28 | echo "Running ${SCRAPER_KEY} district scraper..."
29 | echo "=========================================="
30 |
31 | set +e
32 | $DIR/run_district_scraper.sh
33 | ret=$?
34 | if [ $ret -ne 0 ]
35 | then
36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2
37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret"
38 | exit_code=1
39 | fi
40 | set -e
41 |
42 | echo "=========================================="
43 | echo ""
44 | fi
45 | done
46 |
47 | echo "$errors"
48 | exit $exit_code
49 |
--------------------------------------------------------------------------------
/scrapers/scrape_ne.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | import datetime
6 | import scrape_common as sc
7 |
8 | xls_url = 'https://www.ne.ch/autorites/DFS/SCSP/medecin-cantonal/maladies-vaccinations/Documents/Covid-19-Statistiques/COVID19_PublicationInternet.xlsx'
9 | xls = sc.xlsdownload(xls_url, silent=True)
10 | rows = sc.parse_xls(xls)
11 | is_first = True
12 | for row in rows[:3000]:
13 | if row['A'] is None:
14 | continue
15 | if not isinstance(row['A'], datetime.datetime):
16 | print(f"WARNING: {row['A']} is not a valid date, skipping.", file=sys.stderr)
17 | continue
18 |
19 | if not is_first:
20 | print('-' * 10)
21 | is_first = False
22 |
23 | dd = sc.DayData(canton='NE', url=xls_url)
24 | dd.datetime = row['A'].date().isoformat()
25 | dd.cases = row['Cumul']
26 | dd.hospitalized = row['Total des cas hospitalisés']
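   |     # ICU total = intubated + non-intubated intensive-care patients; only the intubated count as ventilated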
27 | if row['Soins intensifs (intubés)'] is not None and row['Soins intensifs (non intubés)'] is not None:
28 | ICU = row['Soins intensifs (intubés)']
29 | ICU2 = row['Soins intensifs (non intubés)']
30 | dd.icu = int(ICU)+int(ICU2)
31 | dd.vent = row['Soins intensifs (intubés)']
32 | dd.deaths = row['Cumul des décès']
33 | print(dd)
34 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 | import scrape_ag_common as sac
5 |
6 |
7 | def get_value_int(value):
8 | if value is not None and value != '':
9 | return int(value)
10 | return None
11 |
12 |
13 | def get_value_float(value):
14 | if value is not None and value != '':
15 | return float(value)
16 | return None
17 |
18 |
19 | xls_url = sac.get_ag_xls_url()
20 | xls = sc.xlsdownload(xls_url, silent=True)
21 |
22 | year = '2020'
23 | rows = sc.parse_xls(xls, sheet_name='1.4 Labortests', header_row=1, enable_float=True)
24 | for row in rows:
25 | if not row['Anzahl Tests']:
26 | continue
27 | if row['Anzahl Tests'] == 'Anzahl Tests':
28 | break
29 |
30 | td = sc.TestData(canton='AG', url=xls_url)
31 | td.week = int(row['Kalenderwoche'])
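   |     # the year is tracked manually: it starts at 2020 and switches to 2021 once calendar week 1 appears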
32 | if td.week == 1:
33 | year = '2021'
34 | td.year = year
35 | td.positive_tests = get_value_int(row['Positive Tests'])
36 | td.negative_tests = get_value_int(row['Negative Tests'])
37 | td.total_tests = int(row['Anzahl Tests'])
38 | td.positivity_rate = get_value_float(row['Positivitätsrate'])
39 | td.pcr_positivity_rate = get_value_float(row['F'])
40 | td.ag_positivity_rate = get_value_float(row['G'])
41 | if td:
42 | print(td)
43 |
--------------------------------------------------------------------------------
/scrapers/scrape_ju_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 | base_url = 'https://www.jura.ch'
9 | url = f'{base_url}/fr/Autorites/Coronavirus/Infos-Actualite/Statistiques-COVID/Evolution-des-cas-COVID-19-dans-le-Jura.html'
10 | d = sc.download(url, silent=True)
11 | d = d.replace(' ', ' ')
12 | soup = BeautifulSoup(d, 'html.parser')
13 |
14 | pdf_url = soup.find('a', title=re.compile(r'Situation.*PDF.*')).get('href')
15 | if not pdf_url.startswith('http'):
16 | pdf_url = f'{base_url}{pdf_url}'
17 | pdf_url = pdf_url.replace('?download=1', '')
18 |
19 | pdf = sc.download_content(pdf_url, silent=True)
20 |
21 | td = sc.TestData(canton='JU', url=pdf_url)
22 |
23 | content = sc.pdftotext(pdf, page=1)
24 | td.week = sc.find(r'Situation semaine épidémiologique (\d+)', content)
25 | td.year = sc.find(r'Du \d+.* (\d{4})', content)
26 |
27 | content = sc.pdftotext(pdf, page=2)
28 | td.total_tests = sc.find(r'Nombre de tests\d?\s+(\d+)', content)
29 | res = re.match(r'.*Nombre de tests positifs .*\s+(\d+)\s+\((\d+\.?\d?)%\s?\d?\)', content, re.DOTALL | re.MULTILINE)
30 | assert res, 'failed to find number of positive tests and positivity rate'
31 | td.positive_tests = res[1]
32 | td.positivity_rate = res[2]
33 |
34 | print(td)
35 |
--------------------------------------------------------------------------------
/scripts/latest_per_canton.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | echo "| Canton | Confirmed cases | Deceased | Last update |"
4 | echo "|:------:| ---------------:| --------:|:---------------------- |"
5 | # | BL | 282 | 0 | 2020-03-21 |
6 |
7 | # PER CANTON / FL
8 |
9 | # 1 2 3 4 5 6 7 8 9 10
10 | # date,time,abbreviation_canton_and_fl,ncumul_tested,ncumul_conf,ncumul_hosp,ncumul_ICU,ncumul_vent,ncumul_released,ncumul_deceased,source
11 |
12 | for f in *.csv; do
13 |   # Output the latest row with a non-zero cumulative number of cases (and deaths). Then sort by number of cases, and print the date.
14 | awk -F , '{if ($5) { printf("| %2s | %15d | %8d | %-21s |\n", $3, $5, $10, $2 != "\"\"" ? $1 "T" $2 : $1); }}' "$f" | tail -1
15 | done | sort -r -n -k 4
16 |
17 | # TOTAL
18 |
19 | DATE=$(TZ="Europe/Zurich" date --iso-8601=minutes)
20 |
21 | for f in *.csv; do
22 |   # Output the last row with a non-zero cumulative number of cases (and deaths)
23 | awk -F , '{if ($5) { print $1, $3, $5, $10; }}' "$f" | tail -1
24 |   # Then do the sums.
25 | done | awk "BEGIN { sum_cases = 0; sum_deceased = 0; } { sum_cases += \$3; sum_deceased += \$4; } END { printf(\"| TOTAL | %15d | %8d | %-22s |\n\", sum_cases, sum_deceased, \"${DATE}\"); }"
26 |
--------------------------------------------------------------------------------
/.github/workflows/test_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Test run of scrapers
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'scrapers/**'
8 | - '!scrapers/*districts*'
9 | - '!scrapers/*tests*'
10 | - '.github/workflows/**'
11 | pull_request:
12 | branches: [ master ]
13 | paths:
14 | - 'scrapers/**'
15 | - '!scrapers/*districts*'
16 | - '!scrapers/*tests*'
17 | - '.github/workflows/**'
18 | workflow_dispatch: ~
19 |
20 | jobs:
21 | test_run:
22 | runs-on: ubuntu-20.04
23 | timeout-minutes: 10
24 |
25 | steps:
26 | - uses: actions/checkout@v3
27 |
28 | - name: Set up Python 3.7
29 | uses: actions/setup-python@v4
30 | with:
31 | python-version: 3.7
32 |
33 | - name: Remove broken apt repos
34 | run: |
35 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
36 |
37 | - name: Install dependencies
38 | run: |
39 | npm ci
40 | python -m pip install --upgrade pip setuptools wheel
41 | pip install -r requirements.txt
42 | sudo apt update || true # do not fail if update does not work
43 | sudo apt-get install poppler-utils
44 | sudo apt-get install chromium-browser
45 |
46 | - name: Test run of all scrapers
47 | run: ./scrapers/test_scraper.sh
48 |
49 |
--------------------------------------------------------------------------------
/scrapers/scrape_ai.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import scrape_common as sc
5 |
6 | url = 'https://www.ai.ch/themen/gesundheit-alter-und-soziales/gesundheitsfoerderung-und-praevention/uebertragbare-krankheiten/coronavirus'
7 | d = sc.download(url, silent=True)
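   | # drop apostrophe-style thousands separators (e.g. 1'234 -> 1234) before matching numbers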
8 | d = re.sub(r'(\d+)\'(\d+)', r'\1\2', d)
9 |
10 | """
11 | no separate date for hospitalizations on 2020-11-19
12 | # Hospitalisations
13 | dd_hosp = sc.DayData(canton='AI', url=url)
14 | dd_hosp.datetime = sc.find('>.*Hospitalisationen\s+\(Stand\s+(.*\d{4})\)', d)
15 | dd_hosp.hospitalized = sc.find('.*?([0-9]+)\s*Hospitalisationen.*<\/li>', d)
16 | print(dd_hosp)
17 | print('-' * 10)
18 | """
19 |
20 | # cases
21 | dd = sc.DayData(canton='AI', url=url)
22 | dd.datetime = sc.find('>.*Stand (.+ Uhr).*', d)
23 | dd.cases = sc.find('.*?([0-9]+)\s*(infizierte Person(en)?|(labor)?bestätigte Fälle).*<\/li>', d)
24 | dd.deaths = sc.find('.*?([0-9]+)\s*Todesf.+?lle.*<\/li>', d)
25 | dd.isolated = sc.find('.*?([0-9]+)\s*Personen\s+in\s*Isolation.*<\/li>', d)
26 | dd.quarantined = sc.find('.*?([0-9]+)\+?\s*enge\s+Kontaktpersonen\s+in\s+Quarant.ne.*<\/li>', d)
27 | dd.quarantine_riskareatravel = sc.find('.*?([0-9]+)\+?\s*Personen\s+in\s*Quarant.+ne.*Einreise\s+Risikoland.*<\/li>', d)
28 | dd.hospitalized = sc.find(r'.*?([0-9]+)\s*Person\sim\sSpital.*<\/li>', d)
29 | print(dd)
30 |
--------------------------------------------------------------------------------
/scrapers/scrape_bl_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | from bs4 import BeautifulSoup
6 | import re
7 | import scrape_common as sc
8 |
9 |
10 | def get_latest_bl_bulletin_url():
11 | return get_all_bl_bulletin_urls()[0]
12 |
13 |
14 | def get_all_bl_bulletin_urls():
15 | news_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/medienmitteilungen-1'
16 | news_content = sc.download(news_url, silent=True)
17 | soup = BeautifulSoup(news_content, 'html.parser')
18 |
19 | bulletins = soup.find_all('a', href=re.compile(r'.*/coronavirus-wochenbulletin.*'))
20 | bulletin_urls = []
21 | for bulletin in bulletins:
22 | bulletin_urls.append(bulletin.get('href'))
23 | return bulletin_urls
24 |
25 |
26 | def strip_bl_bulletin_numbers(content):
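   |     # drop thousands separators written as ’ or ' (e.g. 12’345 -> 12345)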
27 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
28 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
29 | return content
30 |
31 |
32 | def parse_bl_date(s):
33 | row_date = s.replace('-', '.')
34 |     row_date = row_date.replace('/', '.')
35 | parts = row_date.split('.')
36 | s_date = datetime.datetime(day=int(parts[0]), month=int(parts[1]), year=int(parts[2]))
37 | key = s_date.date().isoformat()
38 | return (key, row_date)
39 |
--------------------------------------------------------------------------------
/correction_status.csv:
--------------------------------------------------------------------------------
1 | date,abbreviation_canton_and_fl,column
2 | 2020-12-25,FL,ncumul_conf
3 | 2021-06-08,ZG,ncumul_released
4 | 2021-06-22,NW,ncumul_conf
5 | 2021-06-29,BS,ncumul_conf
6 | 2021-06-27,ZG,ncumul_released
7 | 2021-06-30,NW,ncumul_conf
8 | 2021-07-02,ZG,ncumul_released
9 | 2021-07-05,ZG,ncumul_released
10 | 2021-07-05,BS,ncumul_released
11 | 2021-07-08,BS,ncumul_released
12 | 2021-07-14,BS,ncumul_released
13 | 2021-07-30,BS,ncumul_released
14 | 2021-08-19,ZG,ncumul_released
15 | 2021-08-20,ZG,ncumul_released
16 | 2021-09-03,ZG,ncumul_released
17 | 2021-10-01,ZG,ncumul_released
18 | 2021-10-04,SG,ncumul_deceased
19 | 2021-10-04,SG,ncumul_released
20 | 2021-10-22,ZG,ncumul_released
21 | 2021-10-24,ZG,ncumul_released
22 | 2021-11-05,ZG,ncumul_released
23 | 2021-11-07,ZG,ncumul_released
24 | 2021-11-12,ZG,ncumul_released
25 | 2022-02-17,UR,ncumul_deceased
26 | 2022-03-07,TI,ncumul_conf
27 | 2022-03-07,TI,ncumul_deceased
28 | 2022-04-10,FL,ncumul_released
29 | 2022-04-16,FL,ncumul_released
30 | 2022-05-30,FR,ncumul_released
31 | 2022-07-11,FR,ncumul_released
32 | 2022-08-16,NW,ncumul_conf
33 | 2022-09-05,NW,ncumul_conf
34 | 2022-11-16,NW,ncumul_conf
35 | 2023-01-25,BS,ncumul_released
36 | 2023-02-02,GE,ncumul_conf
37 | 2023-02-02,GE,ncumul_released
38 | 2023-02-02,GE,ncumul_deceased
39 | 2023-02-08,GE,ncumul_released
40 | 2023-02-08,GE,ncumul_deceased
41 | 2023-03-21,FL,ncumul_released
42 | 2023-03-29,FL,ncumul_released
43 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | import re
6 | from typing import Optional
7 | from io import StringIO
8 | import datetime
9 | import sys
10 | import scrape_common as sc
11 | from scrape_fr_common import get_fr_csv
12 |
13 | def trim_val(val: str) -> Optional[int]:
14 | if len(val) > 0:
15 | return int(re.sub(r'(\d+)\s+(\d+)', r'\1\2', val))
16 | return None
17 |
18 | csv_url, csv_data, main_url = get_fr_csv()
19 | reader = csv.DictReader(StringIO(csv_data), delimiter=';')
20 | is_first = True
21 |
22 | for row in reader:
23 | if not is_first:
24 | print('-' * 10)
25 | is_first = False
26 |
27 | dd = sc.DayData(canton='FR', url=main_url)
28 | for key, val in row.items():
29 | if sc.find(r'(Date).*', key):
30 | dd.datetime = val
31 | if sc.find(r'(Total cas av.r.s).*', key):
32 | dd.cases = trim_val(val)
33 | elif sc.find(r'(Personnes hospitalis.es).*', key):
34 | dd.hospitalized = trim_val(val)
35 | elif sc.find(r'(aux soins intensifs).*', key):
36 | dd.icu = trim_val(val)
37 | elif sc.find(r'(Total d.c.s).*', key):
38 | dd.deaths = trim_val(val)
39 | elif sc.find(r'(Total Sorties de l\'h.pital).*', key):
40 | dd.recovered = trim_val(val)
41 |
42 | assert dd
43 | assert dd.datetime
44 | print(dd)
45 |
--------------------------------------------------------------------------------
/scrapers/scrape_be_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | from io import StringIO
6 | import scrape_common as sc
7 |
8 |
9 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html
10 | district_ids = {
11 | 241: 'Jura bernois',
12 | 242: 'Biel/Bienne',
13 | 243: 'Seeland',
14 | 244: 'Oberaargau',
15 | 245: 'Emmental',
16 | 246: 'Bern-Mittelland',
17 | 247: 'Thun',
18 | 248: 'Obersimmental-Saanen',
19 | 249: 'Frutigen-Niedersimmental',
20 | 250: 'Interlaken-Oberhasli',
21 | }
22 |
23 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
24 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/7_d_inzidenz_verwaltungskreis.csv'
25 | d = sc.download(csv_url, silent=True)
26 | reader = csv.DictReader(StringIO(d), delimiter=',')
27 | for row in reader:
28 | #dd = sc.DistrictData(district=district, canton='BE')
29 | district_id = int(row['bfs_nummer'])
30 | dd = sc.DistrictData(district=district_ids[district_id], canton='BE')
31 | dd.url = url
32 | dd.district_id = district_id
33 | dd.population = row['einwohnerzahl']
34 | date = sc.date_from_text(row['datum'])
35 | week = date.isocalendar()[1]
36 | dd.week = week
37 | dd.year = date.year
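   |     # 7_d_inzidenz is cases per 100,000 inhabitants over 7 days; convert it back to an absolute case count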
38 | dd.new_cases = round(float(row['7_d_inzidenz']) / 100e3 * int(row['einwohnerzahl']))
39 | print(dd)
40 |
--------------------------------------------------------------------------------
/.github/workflows/activate_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Activate a scraper
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | canton:
7 | description: 'Abbreviation of Canton'
8 | required: true
9 |
10 | jobs:
11 | activate_scraper:
12 | runs-on: ubuntu-20.04
13 | timeout-minutes: 10
14 |
15 | steps:
16 | - uses: actions/checkout@v3
17 |
18 | - name: Activate scraper
19 | env:
20 | CANTON: ${{ github.event.inputs.canton }}
21 | run: |
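   |           # strip the leading '#' from the matching "- <CANTON>" line in run_scrapers.yml to re-enable that scraper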
22 | sed -e "/- $CANTON/I s/^#*//" -i ./.github/workflows/run_scrapers.yml
23 |
24 | - name: Commit and push to repo
25 | env:
26 | GHA_DEPLOY_KEY: ${{ secrets.GHA_DEPLOY_KEY }}
27 | CANTON: ${{ github.event.inputs.canton }}
28 | run: |
29 | if ! git diff --no-ext-diff --quiet --exit-code; then
30 | git add .
31 | git config --local user.email "scraper@open.zh.ch"
32 | git config --local user.name "GitHub Action Scraper"
33 | git commit -a -m "Activate $CANTON scraper"
34 | git remote set-url origin "$(git config --get remote.origin.url | sed 's#http.*com/#git@github.com:#g')"
35 | eval `ssh-agent -t 60 -s`
36 | echo "$GHA_DEPLOY_KEY" | ssh-add -
37 | mkdir -p ~/.ssh/
38 | ssh-keyscan github.com >> ~/.ssh/known_hosts
39 | git push
40 | ssh-agent -k
41 | else
42 | echo "Nothing to commit."
43 | fi
44 |
--------------------------------------------------------------------------------
/.github/workflows/deactivate_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Deactivate a scraper
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | canton:
7 | description: 'Abbreviation of Canton'
8 | required: true
9 |
10 | jobs:
11 | deactivate_scraper:
12 | runs-on: ubuntu-20.04
13 | timeout-minutes: 10
14 |
15 | steps:
16 | - uses: actions/checkout@v3
17 |
18 | - name: Deactivate scraper
19 | env:
20 | CANTON: ${{ github.event.inputs.canton }}
21 | run: |
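   |           # prefix the matching "- <CANTON>" line in run_scrapers.yml with '#' to disable that scraper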
22 | sed -e "/- $CANTON/I s/^#*/#/" -i ./.github/workflows/run_scrapers.yml
23 |
24 | - name: Commit and push to repo
25 | env:
26 | GHA_DEPLOY_KEY: ${{ secrets.GHA_DEPLOY_KEY }}
27 | CANTON: ${{ github.event.inputs.canton }}
28 | run: |
29 | if ! git diff --no-ext-diff --quiet --exit-code; then
30 | git add .
31 | git config --local user.email "scraper@open.zh.ch"
32 | git config --local user.name "GitHub Action Scraper"
33 | git commit -a -m "Deactivate $CANTON scraper"
34 | git remote set-url origin "$(git config --get remote.origin.url | sed 's#http.*com/#git@github.com:#g')"
35 | eval `ssh-agent -t 60 -s`
36 | echo "$GHA_DEPLOY_KEY" | ssh-add -
37 | mkdir -p ~/.ssh/
38 | ssh-keyscan github.com >> ~/.ssh/known_hosts
39 | git push
40 | ssh-agent -k
41 | else
42 | echo "Nothing to commit."
43 | fi
44 |
--------------------------------------------------------------------------------
/scrapers/scrape_ge_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 |
6 | from selenium import webdriver
7 | from selenium.webdriver.chrome.options import Options
8 |
9 | import scrape_common as sc
10 | import scrape_ge_common as sgc
11 |
12 |
13 | chrome_options = Options()
14 | chrome_options.add_argument("--headless")
15 | driver = webdriver.Chrome(options=chrome_options)
16 | driver.implicitly_wait(5)
17 |
18 | url = 'https://infocovid.smc.unige.ch/'
19 | driver.get(url)
20 | elem = driver.find_element_by_link_text('Graphiques')
21 | elem.click()
22 | elem = driver.find_element_by_partial_link_text('Tests')
23 | elem.click()
24 | xls_url = sgc.get_link_from_element(driver, 'save_plot_nombre_tests_data')
25 | assert xls_url, "Couldn't find tests XLS url"
26 |
27 | xls = sc.xlsdownload(xls_url, silent=True)
28 | rows = sc.parse_xls(xls, header_row=0, enable_float=True)
29 | for row in rows:
30 | td = sc.TestData(canton='GE', url=url)
31 | res = re.search(r'(\d{2})-(\d{2})', row['week_res'])
32 | assert res, f"failed to extract year and week from {row['week_res']}"
33 | td.week = int(res[2])
34 | td.year = f'20{res[1]}'
35 | td.positive_tests = int(row['positifs'])
36 | td.negative_tests = int(row['négatifs'])
37 | td.total_tests = int(row['total'])
38 | # 2020-02/03 values are empty
39 | td.positivity_rate = 0
40 | if row['ratio']:
41 | td.positivity_rate = float(row['ratio'])
42 | print(td)
43 |
--------------------------------------------------------------------------------
/scrapers/test_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run all scrapers
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 | NEWLINE=$'\n'
15 |
16 | echo "Run all scrapers..."
17 |
18 | exit_code=0
19 | errors=''
20 | for scrape_script in $DIR/scrape_??.py
21 | do
22 | if [ -f $scrape_script -a -x $scrape_script ]
23 | then
24 | name=`basename $scrape_script`
25 | canton=${name:7:2}
26 | export SCRAPER_KEY=${canton^^}
27 | echo ""
28 | echo "Running ${SCRAPER_KEY} scraper..."
29 | echo "=========================================="
30 |
31 | set +e
32 | $DIR/run_scraper.sh
33 | ret=$?
34 | if [ $ret -ne 0 ]
35 | then
36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2
37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret"
38 | exit_code=1
39 | fi
40 | $DIR/validate_scraper_output.sh
41 | ret=$?
42 | if [ $ret -ne 0 ]
43 | then
44 | echo "ERROR: Validation for ${SCRAPER_KEY} failed with exit code $ret. continue." >&2
45 | errors=$"${errors}${NEWLINE}ERROR: Validation for ${SCRAPER_KEY} failed with exit code $ret"
46 | exit_code=1
47 | fi
48 | set -e
49 |
50 | echo "=========================================="
51 | echo ""
52 | fi
53 | done
54 |
55 | echo "$errors"
56 | exit $exit_code
57 |
--------------------------------------------------------------------------------
/scrapers/scrape_lu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 |
7 |
8 | base_url = 'https://www.lustat.ch'
9 | url = f'{base_url}/daten?id=28177'
10 | d = sc.download(url, silent=True)
11 | soup = BeautifulSoup(d, 'html.parser')
12 |
13 | xls_url = soup.find('a', href=re.compile(r'.*\.xlsx')).get('href')
14 | if not xls_url.startswith('http'):
15 | xls_url = f'{base_url}{xls_url}'
16 | xls = sc.xlsdownload(xls_url, silent=True)
17 | rows = sc.parse_xls(xls, header_row=5)
18 | total_cases = 0
19 | total_deaths = 0
20 | is_first = True
21 | for row in rows:
22 | dd = sc.DayData(canton='LU', url=xls_url)
23 | dd.datetime = row['Datum']
24 | dd.cases = sc.int_or_word(row.search(r'Neue\s+Fälle'))
25 | if dd.cases:
26 | total_cases += dd.cases
27 | dd.cases = total_cases
28 | dd.deaths = sc.int_or_word(row['Verstorbene'])
29 | if dd.deaths:
30 | total_deaths += dd.deaths
31 | dd.deaths = total_deaths
32 | dd.hospitalized = sc.int_or_word(row['Total'])
33 | dd.vent = sc.int_or_word(row.search(r'davon\s+beatmet'))
34 | dd.isolated = sc.int_or_word(row.search(r'in\s+Isolation'))
35 | dd.quarantined = sc.int_or_word(row.search(r'in\s+Quarantäne'))
36 | dd.quarantine_riskareatravel = sc.int_or_word(row.search(r'Reiserückkehrer\s+in\s+Quarantäne'))
37 | if dd.cases is None and dd.datetime == '31.12.2022':
38 | continue
39 | if dd:
40 | if not is_first:
41 | print('-' * 10)
42 | is_first = False
43 | print(dd)
44 |
--------------------------------------------------------------------------------
/scrapers/scrape_sg_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 | inhabitants = {
8 | 'St.Gallen': 127198,
9 | 'Rorschach': 44110,
10 | 'Rheintal': 74580,
11 | 'Werdenberg': 40239,
12 | 'Sarganserland': 41736,
13 | 'See-Gaster': 76913,
14 | 'Toggenburg': 47272,
15 | 'Wil': 77018,
16 | }
17 |
18 | district_ids = {
19 | 'St.Gallen': 1721,
20 | 'Rorschach': 1722,
21 | 'Rheintal': 1723,
22 | 'Werdenberg': 1724,
23 | 'Sarganserland': 1725,
24 | 'See-Gaster': 1726,
25 | 'Toggenburg': 1727,
26 | 'Wil': 1728,
27 | }
28 |
29 | url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Faelle_download.csv'
30 | d = sc.download(url, silent=True)
31 |
32 | # strip the "header" / description lines
33 | d = "\n".join(d.split("\n")[5:])
34 |
35 | reader = csv.DictReader(StringIO(d), delimiter=';')
36 | for row in reader:
37 | week = sc.find(r'W(\d+)', row['Kalenderwoche'])
38 | date = sc.date_from_text(row['Falldatum'])
39 |
40 | for key, value in inhabitants.items():
41 | dd = sc.DistrictData(canton='SG', district=key)
42 | dd.url = url
43 | dd.week = week
44 | dd.year = date.year
45 | dd.date = date.isoformat()
46 | dd.district_id = district_ids[key]
47 | dd.new_cases = row['Wahlkreis ' + key]
48 | dd.total_cases = row['Wahlkreis ' + key + ' (kumuliert)']
49 | dd.population = value
50 | print(dd)
51 |
--------------------------------------------------------------------------------
/scrapers/scrape_bs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | from io import StringIO
6 | import scrape_common as sc
7 |
8 | d_csv = sc.download('https://data.bs.ch/explore/dataset/100073/download/?format=csv&timezone=Europe/Zurich&lang=en&use_labels_for_header=false&csv_separator=,', silent=True)
9 |
10 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
11 | is_first = True
12 | for row in reader:
13 | if not row['ncumul_conf']:
14 | continue
15 | if not is_first:
16 | print('-' * 10)
17 | is_first = False
18 | dd = sc.DayData(canton='BS', url=row['source'])
19 | dd.datetime = f"{row['date']} {row['time']}"
20 | dd.cases = sc.safeint(row['ncumul_conf'])
21 | dd.new_hosp = row['new_hosp']
22 | dd.hospitalized = row['current_hosp']
23 | dd.icu = row['current_icu']
24 | dd.vent = row['current_vent']
25 | dd.recovered = row['ncumul_released']
26 | dd.deaths = row['ncumul_deceased']
27 | dd.isolated = row['current_isolated']
28 | dd.quarantined = row['current_quarantined']
29 | dd.confirmed_non_resident = row['ncumul_confirmed_non_resident']
30 | dd.hosp_non_resident = row['current_hosp_non_resident']
31 | dd.quarantine_riskareatravel = row['current_quarantined_riskareatravel']
32 | dd.quarantine_total = row['current_quarantined_total']
33 | dd.hosp_resident = row['current_hosp_resident']
34 |
35 | # TODO: remove if source is fixed
36 | # BS corrected data on 2021-03-01 without adapting their time series
37 | if row['date'] in ('2021-02-27', '2021-02-28'):
38 | dd.cases = ''
39 | dd.recovered = ''
40 | print(dd)
41 |
--------------------------------------------------------------------------------
/scrapers/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # This is a simple wrapper around curl or wget that can also be used to
4 | # save downloaded pages for archival purposes, as well as for feeding fake
5 | # (test) data to the scrapers.
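   | # Usage: ./download.sh <URL> [extra curl/wget options]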
6 |
7 | # echo "DOWNLOADING:" "$@" >&2
8 |
9 | #WEBARCHIVE_SNAPSHOT=1
10 |
11 | if [ "x${WEBARCHIVE_SNAPSHOT}" != "x" ]; then
12 | # Note: JSON only allows strings in double quotes.
13 | (
14 |     echo "$(date --iso-8601=seconds)" "Snapshotting: $1"
15 | W=$(curl -X POST -H "Content-Type: application/json" --data-raw "{\"url\": \"$1\", \"annotation\": {\"id\": \"lst-ib\", \"message\": \"openZH covid_19 github archiving\"}}" "https://pragma.archivelab.org/" 2>&1)
16 | echo "Response:"
17 | echo "${W}"
18 | ) >> webarchiveorg.log
19 | fi
20 |
21 | if which curl >/dev/null; then
22 |     # A few sites, like GL, JU, SZ, don't like curl and return 403, or block the site completely per IP.
23 |     # --output -, because curl sometimes doesn't like to pipe binary files.
24 | exec curl -k --silent --output - --user-agent "Mozilla Firefox Mozilla/5.0; openZH covid_19 at github" "$@"
25 | exit 1
26 | fi
27 |
28 | if which wget >/dev/null; then
29 |     # A few sites, like GL, JU, SZ, don't like curl and return 403, or block the site completely per IP.
30 | exec wget --output-document=- --quiet --user-agent="Mozilla Firefox Mozilla/5.0; openZH covid_19 at github" "$@"
31 | exit 1
32 | fi
33 |
34 | if which GET >/dev/null; then
35 |     # A few sites, like GL, JU, SZ, don't like curl and return 403, or block the site completely per IP.
36 | exec GET "$@"
37 | exit 1
38 | fi
39 |
40 | echo "$0: No curl, wget or GET found. Install curl (recommended), or wget." >&2
41 | exit 2
42 |
--------------------------------------------------------------------------------
/scrapers/scrape_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 | import sys
5 | import re
6 |
7 |
8 | # download latest PDF
9 | pdf_url = 'https://www.bag.admin.ch/dam/bag/de/dokumente/mt/k-und-i/aktuelle-ausbrueche-pandemien/2019-nCoV/covid-19-woechentlicher-lagebericht.pdf.download.pdf/BAG_COVID-19_Woechentliche_Lage.pdf'
10 | d = sc.pdfdownload(pdf_url, raw=True, silent=True)
11 |
12 | """
13 | Coronavirus-Krankheit-2019 (COVID-19)
14 | Eidgenössisches Departement des Innern EDI
15 | Bundesamt für Gesundheit BAG
16 | Direktionsbereich Öffentliche Gesundheit
17 | Situationsbericht zur epidemiologischen Lage in der Schweiz
18 | und im Fürstentum Liechtenstein - Woche 28 (06.-12.07.2020)
19 | """
20 |
21 | datetime = sc.find(r'Liechtenstein - Woche .*(\d{2}\.\d{2}\.\d{4})\)', d)
22 |
23 | """
24 | Canton, tests of previous-week then current-week
25 |
26 | AG 5478 3588 808 529 1.3 1.8
27 | AI 96 55 595 341 0.0 0.0
28 | AR 391 249 708 451 0.5 1.2
29 | BE 6924 4652 669 449 0.4 0.9
30 | ...
31 | """
32 | start = d.find('Anzahl PCR-Tests in der Schweiz')
33 | if start > 0:
34 | start = d.find('\nAG ', start)
35 | else:
36 | start = 0
37 | end = d.find('Tabelle 4. Durchgeführte Tests nach Kalenderwoche', start)
38 | if start > 0 and end > start:
39 | tests_table = d[start:end]
40 | for line in tests_table.splitlines():
41 | canton = sc.find(r'^([A-Z][A-Z]) ', line)
42 | if canton is not None:
43 | dd = sc.DayData(canton=canton, url=pdf_url)
44 | dd.datetime = datetime
45 | dd.tested = sc.find(r'^[A-Z][A-Z] \d+ (\d+)', line)
46 | print('-' * 10)
47 | print(dd)
48 |
49 |
--------------------------------------------------------------------------------
/scripts/new2oldcsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script converts CSV files from the new structure to the old structure
4 |
5 | import csv
6 | import sys
7 | import traceback
8 |
9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
10 |
11 | try:
12 | filename = sys.argv[1]
13 | rows = []
14 | with open(filename, 'r') as f:
15 | dr = csv.DictReader(f)
16 | for r in dr:
17 |             # map new-structure columns to the old structure
18 | data = {
19 | 'date': r['date'],
20 | 'time': r['time'],
21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
22 | 'ncumul_tested': r['ncumul_tested'],
23 | 'ncumul_conf': r['ncumul_conf'],
24 | 'ncumul_hosp': r['current_hosp'],
25 | 'ncumul_ICU': r['current_icu'],
26 | 'ncumul_vent': r['current_vent'],
27 | 'ncumul_released': r['ncumul_released'],
28 | 'ncumul_deceased': r['ncumul_deceased'],
29 | 'source': r['source'],
30 | }
31 | # re-add extra columns
32 | for col in dr.fieldnames[15:]:
33 | data[col] = r[col]
34 | rows.append(data)
35 |
36 | writer = csv.DictWriter(
37 | sys.stdout,
38 | rows[0].keys(),
39 | delimiter=',',
40 | quotechar='"',
41 | lineterminator='\n',
42 | quoting=csv.QUOTE_MINIMAL
43 | )
44 | writer.writeheader()
45 | writer.writerows(rows)
46 | except Exception as e:
47 | print("Error: %s" % e, file=sys.stderr)
48 | print(traceback.format_exc(), file=sys.stderr)
49 | sys.exit(1)
50 | finally:
51 | sys.stdout.flush()
52 |
--------------------------------------------------------------------------------
/scrapers/validate_scrapers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import subprocess
5 | import sys
6 | import os
7 | from scrape_matrix import matrix
8 |
9 | __location__ = os.path.realpath(
10 | os.path.join(
11 | os.getcwd(),
12 | os.path.dirname(__file__)
13 | )
14 | )
15 |
16 |
17 | if __name__ == '__main__':
18 | all_features = ['Confirmed cases', 'Deaths', 'Released', 'Hospitalized', 'ICU', 'Vent']
19 | has_issue = False
20 | for canton, features in matrix.items():
21 | print(canton)
22 | scraper = f'{__location__}/scrape_{canton.lower()}.py'
23 | if not os.access(scraper, os.X_OK):
24 | print(f"{scraper} is not executable; skipping")
25 | continue
26 | result = subprocess.run([scraper], stdout=subprocess.PIPE)
27 | output = re.sub('----------\n$', '', result.stdout.decode('utf-8')).split('----------\n')[-1]
28 | for feature in features:
29 | if feature == 'Released':
30 |                 feature = r'(?:Released|Recovered)'
31 | matches = re.search(f'{feature}: (.+)', output)
32 | if matches is None or matches[1].startswith('None'):
33 | has_issue = True
34 | print(f"missing {feature} for {canton}")
35 | for feature in all_features:
36 | if feature not in features:
37 | if feature == 'Released':
38 |                     feature = r'(?:Released|Recovered)'
39 | if re.search(f'{feature}:', output) is not None:
40 | has_issue = True
41 | print(f"{feature} is present for {canton} but not listed in feature matrix")
42 |
43 | if has_issue:
44 | sys.exit(1)
45 |
--------------------------------------------------------------------------------
/scrapers/scrape_ge_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import time
6 | from bs4 import BeautifulSoup
7 | from selenium.webdriver.common.by import By
8 | from selenium.webdriver.support.ui import WebDriverWait
9 | from selenium.webdriver.support import expected_conditions as EC
10 | import scrape_common as sc
11 |
12 |
13 | def get_latest_ge_weekly_pdf_url():
14 | return get_ge_weekly_pdf_urls()[0]
15 |
16 |
17 | def get_ge_weekly_pdf_urls():
18 | d = sc.download('https://www.ge.ch/document/covid-19-bilan-epidemiologique-hebdomadaire', silent=True)
19 | soup = BeautifulSoup(d, 'html.parser')
20 | links = soup.find_all('a', title=re.compile(r"\.pdf$"))
21 | result = []
22 | for link in links:
23 | pdf_url = link.get('href')
24 | assert pdf_url, "pdf URL is empty"
25 | if not pdf_url.startswith('http'):
26 | pdf_url = f'https://www.ge.ch{pdf_url}'
27 | if pdf_url not in result:
28 | result.append(pdf_url)
29 | return result
30 |
31 |
32 | class element_has_link(object):
33 | def __init__(self, locator):
34 | self.locator = locator
35 |
36 | def __call__(self, driver):
37 | element = driver.find_element(*self.locator) # Finding the referenced element
38 | if element.get_attribute('href'):
39 | return element
40 | else:
41 | return False
42 |
43 |
44 | def get_link_from_element(driver, element_id):
45 | # the xls download links do not appear immediately for some reason
46 | # add some delay to get it.
47 | wait = WebDriverWait(driver, 30)
48 | elem = wait.until(element_has_link((By.ID, element_id)))
49 | url = elem.get_attribute('href')
50 |
51 | return url
52 |
--------------------------------------------------------------------------------
/scrapers/populate_district_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script creates a new sqlite database based on the CSV it receives as an argument
4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs
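   | # Usage: ./populate_district_database.py <existing district CSV>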
5 |
6 | import sqlite3
7 | import traceback
8 | import os
9 | import sys
10 | import db_common as dc
11 |
12 |
13 | __location__ = dc.get_location()
14 |
15 | try:
16 | # load the csv to sqlite db
17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
18 | columns, to_db = dc.load_csv(sys.argv[1])
19 |
20 | # create db
21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
22 | conn = sqlite3.connect(DATABASE_NAME)
23 | c = conn.cursor()
24 | c.execute('DROP TABLE IF EXISTS data')
25 | c.execute(
26 | '''
27 | CREATE TABLE IF NOT EXISTS data (
28 | DistrictId integer NOT NULL,
29 | District text NOT NULL,
30 | Canton text NOT NULL,
31 | Date text NOT NULL,
32 | Week text NOT NULL,
33 | Year text NOT NULL,
34 | Population integer,
35 | TotalConfCases integer,
36 | NewConfCases integer,
37 | TotalDeaths integer,
38 | NewDeaths integer,
39 | SourceUrl text,
40 | UNIQUE(DistrictId, District, Canton, Date, Week, Year)
41 | )
42 | '''
43 | )
44 |
45 | # add entries
46 | query = dc.insert_db_query(columns)
47 | c.executemany(query, to_db)
48 | conn.commit()
49 | except Exception as e:
50 | print("Error: %s" % e, file=sys.stderr)
51 | print(traceback.format_exc(), file=sys.stderr)
52 | sys.exit(1)
53 | finally:
54 | conn.close()
55 |
--------------------------------------------------------------------------------
/scrapers/scrape_so_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 |
7 | url = 'https://corona.so.ch/bevoelkerung/daten/fallzahlen-nach-gemeinden/'
8 | d = sc.download(url, silent=True)
9 |
10 | date = sc.find(r'Stand (\d+\.\d+\.20\d{2})', d)
11 | date = sc.date_from_text(date)
12 |
13 | population = {
14 | 'Solothurn': 16933,
15 | 'Bucheggberg': 7954,
16 | 'Dorneck': 20678,
17 | 'Gäu': 21605,
18 | 'Gösgen': 24536,
19 | 'Lebern': 24536,
20 | 'Olten': 55686,
21 | 'Thal': 14785,
22 | 'Thierstein': 14747,
23 | 'Wasseramt': 52134,
24 | }
25 |
26 | district_ids = {
27 | 'Solothurn': 1109,
28 | 'Bucheggberg': 1103,
29 | 'Dorneck': 1104,
30 | 'Gäu': 1101,
31 | 'Gösgen': 1105,
32 | 'Lebern': 1107,
33 | 'Olten': 1108,
34 | 'Thal': 1102,
35 | 'Thierstein': 1110,
36 | 'Wasseramt': 1106,
37 | }
38 |
39 |
40 | def strip_so_number(value):
41 | value = value.replace('\'', '')
42 | value = value.replace('^', '')
43 | return int(value)
44 |
45 |
46 | soup = BeautifulSoup(d, 'html.parser')
47 | for district, d_id in district_ids.items():
48 | table = soup.find(text=district).find_next('table')
49 | tr = table.find('strong', text='Total').find_parent('tr')
50 | tds = tr.find_all('td')
51 | assert tds[0].text == 'Total', f'Expected "Total" row, got {tds[0].text}'
52 | dd = sc.DistrictData(canton='SO', district=district)
53 | dd.url = url
54 | dd.date = date.isoformat()
55 | dd.population = strip_so_number(tds[1].text)
56 | dd.district_id = d_id
57 | dd.total_cases = strip_so_number(tds[2].text)
58 | dd.new_cases = int(tds[3].text)
59 | print(dd)
60 |
--------------------------------------------------------------------------------
/scrapers/scrape_sh.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 | import scrape_sh_common as shc
9 |
10 | main_url, xls = shc.get_sh_xlsx()
11 |
12 | rows = sc.parse_xls(xls, header_row=0)
13 | is_first = True
14 | for row in rows:
15 | if not isinstance(row['Datum'], datetime.datetime):
16 | continue
17 | if not (row['Positiv'] or row.search(r'Hospitalisation isoliert\s+bestätigt') or row.search(r'Hospitalisation\s+intensiv.*$') or row['Verstorben']):
18 | continue
19 |
20 | if not is_first:
21 | print('-' * 10)
22 | is_first = False
23 |
24 | dd = sc.DayData(canton='SH', url=main_url)
25 | dd.datetime = row['Datum'].date().isoformat()
26 | dd.cases = row['Positiv']
27 |
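   |     # hospitalised total = isolated (non-ICU) hospitalisations + intensive-care hospitalisations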
28 | if sc.represents_int(row.search(r'Hospitalisation isoliert\s+bestätigt')) and sc.represents_int(row.search(r'Hospitalisation\s+intensiv.*$')):
29 | dd.hospitalized = row.search(r'Hospitalisation isoliert\s+bestätigt') + row.search(r'Hospitalisation\s+intensiv.*$')
30 | dd.icu = row.search(r'Hospitalisation\s+intensiv.*$')
31 | if row['Verstorben'] is not None:
32 | dd.deaths = row['Verstorben']
33 |
34 | isolated = row.search(r'Anzahl Personen\s+in Isolation.*')
35 | if isolated is not None:
36 | dd.isolated = isolated
37 | quarantined = row.search(r'Anzahl Personen\s+in Quarantäne\s+.*Kontaktpersonen.*')
38 | if quarantined is not None:
39 | dd.quarantined = quarantined
40 | quarantined_risk = row.search(r'Anzahl Personen\s+in Quarantäne\s+.*Rückkehr.*Risikoländer.*')
41 | if quarantined_risk is not None:
42 | dd.quarantine_riskareatravel = quarantined_risk
43 |
44 | print(dd)
45 |
--------------------------------------------------------------------------------
/scripts/old2newcsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script converts CSV files from the old structure to the new structure
4 |
5 | import csv
6 | import sys
7 | import traceback
8 |
9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
10 |
11 | try:
12 | filename = sys.argv[1]
13 | rows = []
14 | with open(filename, 'r') as f:
15 | dr = csv.DictReader(f)
16 | for r in dr:
17 | # map old to new structure
18 | data = {
19 | 'date': r['date'],
20 | 'time': r['time'],
21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
22 | 'ncumul_tested': r['ncumul_tested'],
23 | 'ncumul_conf': r['ncumul_conf'],
24 | 'new_hosp': '',
25 | 'current_hosp': r['ncumul_hosp'],
26 | 'current_icu': r['ncumul_ICU'],
27 | 'current_vent': r['ncumul_vent'],
28 | 'ncumul_released': r['ncumul_released'],
29 | 'ncumul_deceased': r['ncumul_deceased'],
30 | 'source': r['source'],
31 | 'current_isolated': '',
32 | 'current_quarantined': '',
33 | }
34 | # re-add extra columns
35 | for col in dr.fieldnames[11:]:
36 | data[col] = r[col]
37 | rows.append(data)
38 |
39 | writer = csv.DictWriter(
40 | sys.stdout,
41 | rows[0].keys(),
42 | delimiter=',',
43 | quotechar='"',
44 | lineterminator='\n',
45 | quoting=csv.QUOTE_MINIMAL
46 | )
47 | writer.writeheader()
48 | writer.writerows(rows)
49 | except Exception as e:
50 | print("Error: %s" % e, file=sys.stderr)
51 | print(traceback.format_exc(), file=sys.stderr)
52 | sys.exit(1)
53 | finally:
54 | sys.stdout.flush()
55 |
--------------------------------------------------------------------------------
/scrapers/scrape_bs_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 |
8 | def prettify_positivity_rate(positivity_rate):
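   |     # round the rate to one decimal place; empty values become None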
9 | if not positivity_rate:
10 | return None
11 | return round(10 * float(positivity_rate)) / 10
12 |
13 |
14 | url = 'https://data.bs.ch/explore/dataset/100094/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B'
15 | data = sc.download(url, silent=True)
16 |
17 | reader = csv.DictReader(StringIO(data), delimiter=';')
18 | for row in reader:
19 | td = sc.TestData(canton='BS', url=url)
20 | td.start_date = row['Datum']
21 | td.end_date = row['Datum']
22 | td.positive_tests = row['Positive Tests'] or None
23 | td.negative_tests = row['Negative Tests'] or None
24 | td.total_tests = row['Total Tests'] or None
25 | td.positivity_rate = row['Anteil positive Tests in Prozent'] or None
26 |
27 | td.pcr_positive_tests = row['Positive PCR Tests'] or None
28 | td.pcr_negative_tests = row['Negative PCR Tests'] or None
29 | td.pcr_total_tests = row['Total PCR Tests'] or None
30 | td.pcr_positivity_rate = row['Anteil positive PCR Tests in Prozent'] or None
31 |
32 | td.ag_positive_tests = row['Positive Antigen Schnelltests'] or None
33 | td.ag_negative_tests = row['Negative Antigen Schnelltests'] or None
34 | td.ag_total_tests = row['Total Antigen Schnelltests'] or None
35 | td.ag_positivity_rate = row['Anteil positive Antigen Schnelltests in Prozent'] or None
36 |
37 | if td:
38 | td.positivity_rate = prettify_positivity_rate(td.positivity_rate)
39 | td.pcr_positivity_rate = prettify_positivity_rate(td.pcr_positivity_rate)
40 | td.ag_positivity_rate = prettify_positivity_rate(td.ag_positivity_rate)
41 | print(td)
42 |
--------------------------------------------------------------------------------
/scrapers/scrape_be.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import re
6 | import scrape_common as sc
7 |
8 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
9 |
10 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/total_faelle.csv'
11 | d = sc.download(csv_url, silent=True)
12 | reader = csv.DictReader(StringIO(d), delimiter=',')
13 | is_first = True
14 | for row in reader:
15 | if not is_first:
16 | print('-' * 10)
17 | is_first = False
18 |
19 | dd = sc.DayData(canton='BE', url=url)
20 | dd.datetime = row['datum']
21 | dd.cases = row['total_laborbestaetigte_faelle']
22 | dd.deaths = row['total_todesfaelle']
23 | print(dd)
24 |
25 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/spa_auslastung.csv'
26 | d = sc.download(csv_url, silent=True)
27 | reader = csv.DictReader(StringIO(d), delimiter=',')
28 | is_first = True
29 | for row in reader:
30 | if not is_first:
31 | print('-' * 10)
32 | is_first = False
33 |
34 | dd = sc.DayData(canton='BE', url=url)
35 | dd.datetime = row['datum']
36 | dd.hospitalized = row['personen_hospitalisiert']
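   |     # ICU count = ventilated + non-ventilated intensive-care patients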
37 | dd.vent = int(row['auf_intensivpflegestation_beatmet'])
38 | dd.icu = int(row['auf_intensivpflegestation_unbeatmet']) + dd.vent
39 | print(dd)
40 |
41 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/contact_tracing.csv'
42 | d = sc.download(csv_url, silent=True)
43 | reader = csv.DictReader(StringIO(d), delimiter=',')
44 | is_first = True
45 | for row in reader:
46 | if not is_first:
47 | print('-' * 10)
48 | is_first = False
49 |
50 | dd = sc.DayData(canton='BE', url=url)
51 | dd.datetime = row['datum']
52 | dd.quarantined = row['personen_in_quarantaene']
53 | dd.isolated = row['personen_in_isolation']
54 | print(dd)
55 |
--------------------------------------------------------------------------------
/.github/workflows/test_tests_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Test run of tests scrapers
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'scrapers/*tests*'
8 | - 'scrapers/parse_scrape_output.py'
9 | - 'scrapers/populate_tests_database.py'
10 | - 'scrapers/run_tests_scraper.sh'
11 | - 'scrapers/scrape_dates.py'
12 | - 'scrapers/scrape_matrix.py'
13 | - 'scrapers/validate_scraper*'
14 | - 'scrapers/*_common.py'
15 | - '!scrapers/*_districts.py'
16 | - '.github/workflows/**'
17 | pull_request:
18 | branches: [ master ]
19 | paths:
20 | - 'scrapers/*tests*'
21 | - 'scrapers/parse_scrape_output.py'
22 | - 'scrapers/populate_tests_database.py'
23 | - 'scrapers/run_tests_scraper.sh'
24 | - 'scrapers/scrape_dates.py'
25 | - 'scrapers/scrape_matrix.py'
26 | - 'scrapers/validate_scraper*'
27 | - 'scrapers/*_common.py'
28 | - '!scrapers/*_districts.py'
29 | - '.github/workflows/**'
30 | workflow_dispatch: ~
31 |
32 | jobs:
33 | test_run:
34 | runs-on: ubuntu-20.04
35 | timeout-minutes: 10
36 |
37 | steps:
38 | - uses: actions/checkout@v3
39 |
40 | - name: Set up Python 3.7
41 | uses: actions/setup-python@v4
42 | with:
43 | python-version: 3.7
44 |
45 | - name: Remove broken apt repos
46 | run: |
47 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
48 |
49 | - name: Install dependencies
50 | run: |
51 | npm ci
52 | python -m pip install --upgrade pip setuptools wheel
53 | pip install -r requirements.txt
54 | sudo apt update || true # do not fail if update does not work
55 | sudo apt-get install poppler-utils
56 | sudo apt-get install chromium-browser
57 |
58 | - name: Test run of all tests scrapers
59 | run: ./scrapers/test_tests_scraper.sh
60 |
61 |
--------------------------------------------------------------------------------
/scrapers/scrape_sz_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | import scrape_common as sc
9 |
10 | url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948'
11 | content = sc.download(url, silent=True)
12 | soup = BeautifulSoup(content, 'html.parser')
13 | pdf_url = soup.find('a', text=re.compile(r'Coronafälle pro Gemeinde')).get('href')
14 |
15 | content = sc.pdfdownload(pdf_url, layout=True, silent=True)
16 | date = sc.find(r'Stand\W+(\d+\.\d+\.20\d{2})', content)
17 | date = sc.date_from_text(date).isoformat()
18 | district_data = re.findall(r'^Bezirk\W+(\w+)\s+(≤?\s?\d+)', content, re.MULTILINE)
19 |
20 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html
21 | district_ids = {
22 | 'Einsiedeln': 501,
23 | 'Gersau': 502,
24 | 'Höfe': 503,
25 | 'Küssnacht': 504,
26 | 'March': 505,
27 | 'Schwyz': 506,
28 | }
29 |
30 | # https://www.sz.ch/kanton/bezirke/schwyz.html/72-210-112-106
31 | population = {
32 | 'Einsiedeln': 16027,
33 | 'Gersau': 2314,
34 | 'Höfe': 29123,
35 | 'Küssnacht': 13270,
36 | 'March': 43528,
37 | 'Schwyz': 55390,
38 | }
39 |
40 | assert len(district_data) == len(district_ids), f'expected {len(district_ids)} districts available, but got {len(district_data)}: {district_data}'
41 |
42 | for district, total_cases in district_data:
43 | assert district in district_ids, f'District {district} is unknown'
44 |
45 | dd = sc.DistrictData(canton='SZ', district=district)
46 | dd.url = pdf_url
47 | dd.district_id = district_ids[district]
48 | dd.population = population[district]
49 | dd.date = date
50 | # skip total_cases for ≤ entries
51 | if not sc.find(r'(≤)', total_cases):
52 | dd.total_cases = total_cases
53 | print(dd)
54 |
--------------------------------------------------------------------------------
/.github/workflows/test_district_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Test run of district scrapers
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'scrapers/*_districts*'
8 | - 'scrapers/parse_scrape_output.py'
9 | - 'scrapers/populate_district_database.py'
10 | - 'scrapers/run_district_scraper.sh'
11 | - 'scrapers/scrape_dates.py'
12 | - 'scrapers/scrape_matrix.py'
13 | - 'scrapers/validate_scraper*'
14 | - 'scrapers/*_common.py'
15 | - '!scrapers/*_tests.py'
16 | - '.github/workflows/**'
17 | pull_request:
18 | branches: [ master ]
19 | paths:
20 | - 'scrapers/*_districts*'
21 | - 'scrapers/parse_scrape_output.py'
22 | - 'scrapers/populate_district_database.py'
23 | - 'scrapers/run_district_scraper.sh'
24 | - 'scrapers/scrape_dates.py'
25 | - 'scrapers/scrape_matrix.py'
26 | - 'scrapers/validate_scraper*'
27 | - 'scrapers/*_common.py'
28 | - '!scrapers/*_tests.py'
29 | - '.github/workflows/**'
30 | workflow_dispatch: ~
31 |
32 | jobs:
33 | test_run:
34 | runs-on: ubuntu-20.04
35 | timeout-minutes: 10
36 |
37 | steps:
38 | - uses: actions/checkout@v3
39 |
40 | - name: Set up Python 3.7
41 | uses: actions/setup-python@v4
42 | with:
43 | python-version: 3.7
44 |
45 | - name: Remove broken apt repos
46 | run: |
47 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
48 |
49 | - name: Install dependencies
50 | run: |
51 | npm ci
52 | python -m pip install --upgrade pip setuptools wheel
53 | pip install -r requirements.txt
54 | pip install -r requirements-ocr.txt
55 | sudo apt update || true # do not fail if update does not work
56 | sudo apt-get install poppler-utils
57 |
58 | - name: Test run of all district scrapers
59 | run: ./scrapers/test_district_scraper.sh
60 |
61 |
--------------------------------------------------------------------------------
/scrapers/populate_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script creates a new sqlite database based on the CSV it receives as an argument
4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs
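   | # Usage: ./populate_database.py ../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_Kanton_<XX>_total.csv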
5 |
6 | import sqlite3
7 | import traceback
8 | import os
9 | import sys
10 | import db_common as dc
11 |
12 |
13 | __location__ = dc.get_location()
14 |
15 | try:
16 | # load the csv to sqlite db
17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
18 | columns, to_db = dc.load_csv(sys.argv[1])
19 |
20 | # create db
21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
22 | conn = sqlite3.connect(DATABASE_NAME)
23 | c = conn.cursor()
24 | c.execute('DROP TABLE IF EXISTS data')
25 | c.execute(
26 | '''
27 | CREATE TABLE IF NOT EXISTS data (
28 | date text,
29 | time text,
30 | abbreviation_canton_and_fl text,
31 | ncumul_tested integer,
32 | ncumul_conf integer,
33 | new_hosp integer,
34 | current_hosp integer,
35 | current_icu integer,
36 | current_vent integer,
37 | ncumul_released integer,
38 | ncumul_deceased integer,
39 | source text,
40 | current_isolated integer,
41 | current_quarantined integer,
42 | UNIQUE(date, abbreviation_canton_and_fl)
43 | )
44 | '''
45 | )
46 | # check if there are extra columns
47 | for col in columns[14:]:
48 | c.execute(f'ALTER TABLE data ADD COLUMN {col} integer;')
49 |
50 | # add entries
51 | query = dc.insert_db_query(columns)
52 | c.executemany(query, to_db)
53 | conn.commit()
54 | except Exception as e:
55 | print("Error: %s" % e, file=sys.stderr)
56 | print(traceback.format_exc(), file=sys.stderr)
57 | sys.exit(1)
58 | finally:
59 | conn.close()
60 |
--------------------------------------------------------------------------------
/scrapers/scrape_gr_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import datetime
4 | import requests
5 |
6 | import scrape_common as sc
7 |
8 | inhabitants = {
9 | 'Albula': 8054,
10 | 'Bernina': 4613,
11 | 'Engiadina Bassa/Val Müstair': 9197,
12 | 'Imboden': 21293,
13 | 'Landquart': 25402,
14 | 'Maloja': 18184,
15 | 'Moesa': 8671,
16 | 'Plessur': 42446,
17 | 'Prättigau/Davos': 26089,
18 | 'Surselva': 21289,
19 | 'Viamala': 13783,
20 | }
21 |
22 | district_ids = {
23 | 'Albula': 1841,
24 | 'Bernina': 1842,
25 | 'Engiadina Bassa/Val Müstair': 1843,
26 | 'Imboden': 1844,
27 | 'Landquart': 1845,
28 | 'Maloja': 1846,
29 | 'Moesa': 1847,
30 | 'Plessur': 1848,
31 | 'Prättigau/Davos': 1849,
32 | 'Surselva': 1850,
33 | 'Viamala': 1851,
34 | }
35 |
36 |
37 | limit = '100'
38 | url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Pro_Region/FeatureServer/0/query?f=json&where=Datum%3E%3Dtimestamp%20%272020-02-01%2000%3A00%3A00%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Region%20asc&resultOffset=0&resultRecordCount=10000&resultType=standard&cacheHint=true'
39 |
40 |
41 | resp = requests.get(url=url)
42 | json_data = resp.json()
43 |
44 | for attributes in json_data['features']:
45 | element = attributes['attributes']
46 |
47 | if element['Region'] in district_ids:
48 | dd = sc.DistrictData(canton='GR', district=element['Region'])
49 | dd.url = url
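   |         # 'Datum' is a millisecond epoch timestamp, as returned by the ArcGIS feature service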
50 | date = datetime.datetime.utcfromtimestamp(element['Datum'] / 1000)
51 | dd.date = date.date().isoformat()
52 | dd.total_cases = element['Faelle__kumuliert_']
53 | dd.new_cases = element['Neue_Faelle']
54 | dd.total_deceased = element['Verstorbene__kumuliert_']
55 | dd.new_deceased = element['Verstorbene']
56 | dd.population = inhabitants[dd.district]
57 | dd.district_id = district_ids[dd.district]
58 | print(dd)
59 |
--------------------------------------------------------------------------------
/scrapers/populate_tests_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script creates a new sqlite database based on the CSV it receives as an argument
4 | # The sqlite database is used as an intermediate step to merge new data into existing CSVs
5 |
6 | import sqlite3
7 | import traceback
8 | import os
9 | import sys
10 | import db_common as dc
11 |
12 |
13 | __location__ = dc.get_location()
14 |
15 | try:
16 | # load the csv to sqlite db
17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
18 | columns, to_db = dc.load_csv(sys.argv[1])
19 |
20 | # create db
21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
22 | conn = sqlite3.connect(DATABASE_NAME)
23 | c = conn.cursor()
24 | c.execute('DROP TABLE IF EXISTS data')
25 | c.execute(
26 | '''
27 | CREATE TABLE IF NOT EXISTS data (
28 | canton text NOT NULL,
29 | start_date text NOT NULL,
30 | end_date text NOT NULL,
31 | week text NOT NULL,
32 | year text NOT NULL,
33 | positive_tests integer,
34 | negative_tests integer,
35 | total_tests integer,
36 | positivity_rate float,
37 | source text,
38 | pcr_positive_tests integer,
39 | pcr_negative_tests integer,
40 | pcr_total_tests integer,
41 | pcr_positivity_rate float,
42 | ag_positive_tests integer,
43 | ag_negative_tests integer,
44 | ag_total_tests integer,
45 | ag_positivity_rate float,
46 | UNIQUE(canton, start_date, end_date, week, year)
47 | )
48 | '''
49 | )
50 |
51 | # add entries
52 | query = dc.insert_db_query(columns)
53 | c.executemany(query, to_db)
54 | conn.commit()
55 | except Exception as e:
56 | print("Error: %s" % e, file=sys.stderr)
57 | print(traceback.format_exc(), file=sys.stderr)
58 | sys.exit(1)
59 | finally:
60 | conn.close()
61 |
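Both populate scripts rely on db_common.py, which is not reproduced in this listing. The sketch below only illustrates what its three helpers would have to do for the call sites above to work; it is inferred from usage and is not the repository's actual implementation:

import csv
import os
import sys


def get_location():
    # directory containing the running script (assumed behaviour)
    return os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(sys.argv[0])))


def load_csv(filename):
    # return the header row plus all data rows, matching the
    # `columns, to_db = dc.load_csv(...)` unpacking above
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        columns = next(reader)
        to_db = [tuple(row) for row in reader]
    return columns, to_db


def insert_db_query(columns):
    # build a parameterized INSERT suitable for cursor.executemany()
    placeholders = ', '.join('?' for _ in columns)
    return f"INSERT INTO data ({', '.join(columns)}) VALUES ({placeholders})"
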
--------------------------------------------------------------------------------
/scripts/add_new_columns.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script adds the newly introduced columns to a CSV file and prints the result to stdout
4 |
5 | import csv
6 | import sys
7 | import traceback
8 |
9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
10 |
11 | try:
12 | filename = sys.argv[1]
13 | rows = []
14 | with open(filename, 'r') as f:
15 | dr = csv.DictReader(f)
16 | for r in dr:
17 | # copy the existing fields and add the new column
18 | data = {
19 | 'date': r['date'],
20 | 'time': r['time'],
21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
22 | 'ncumul_tested': r['ncumul_tested'],
23 | 'ncumul_conf': r['ncumul_conf'],
24 | 'new_hosp': r['new_hosp'],
25 | 'current_hosp': r['current_hosp'],
26 | 'current_icu': r['current_icu'],
27 | 'current_vent': r['current_vent'],
28 | 'ncumul_released': r['ncumul_released'],
29 | 'ncumul_deceased': r['ncumul_deceased'],
30 | 'source': r['source'],
31 | 'current_isolated': r.get('current_isolated', ''),
32 | 'current_quarantined': r.get('current_quarantined', ''),
33 | 'current_quarantined_riskareatravel': r.get('current_quarantined_riskareatravel', ''), # new field
34 | }
35 | # re-add extra columns
36 | for col in dr.fieldnames[12:]:
37 | data[col] = r[col]
38 | rows.append(data)
39 |
40 | writer = csv.DictWriter(
41 | sys.stdout,
42 | rows[0].keys(),
43 | delimiter=',',
44 | quotechar='"',
45 | lineterminator='\n',
46 | quoting=csv.QUOTE_MINIMAL
47 | )
48 | writer.writeheader()
49 | writer.writerows(rows)
50 | except Exception as e:
51 | print("Error: %s" % e, file=sys.stderr)
52 | print(traceback.format_exc(), file=sys.stderr)
53 | sys.exit(1)
54 | finally:
55 | sys.stdout.flush()
56 |
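To make the DictReader/DictWriter round trip above concrete, the same add-a-column step on a tiny in-memory CSV (toy data only):

import csv
import io

src = io.StringIO(
    "date,abbreviation_canton_and_fl,ncumul_conf\n"
    "2020-11-01,ZH,1000\n"
    "2020-11-02,ZH,1050\n"
)

rows = []
for r in csv.DictReader(src):
    data = dict(r)
    # the new column is appended with an empty value when the input lacks it
    data['current_quarantined_riskareatravel'] = r.get('current_quarantined_riskareatravel', '')
    rows.append(data)

out = io.StringIO()
writer = csv.DictWriter(out, rows[0].keys(), lineterminator='\n')
writer.writeheader()
writer.writerows(rows)
print(out.getvalue())
# date,abbreviation_canton_and_fl,ncumul_conf,current_quarantined_riskareatravel
# 2020-11-01,ZH,1000,
# 2020-11-02,ZH,1050,
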
--------------------------------------------------------------------------------
/.github/workflows/validate-csv.yml:
--------------------------------------------------------------------------------
1 | name: Validate CSV
2 |
3 | on:
4 | schedule:
5 | - cron: '15 */4 * * *'
6 | workflow_dispatch: ~
7 | push:
8 | branches: [ master ]
9 | paths:
10 | - '**.csv'
11 | pull_request:
12 | branches: [ master ]
13 | paths:
14 | - '**.csv'
15 |
16 | jobs:
17 | validate:
18 | runs-on: ubuntu-20.04
19 | timeout-minutes: 10
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 |
24 | - name: Set up Python 3.7
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: 3.7
28 |
29 | - name: Install dependencies
30 | run: |
31 | npm ci
32 | python -m pip install --upgrade pip
33 | pip install -r requirements.txt
34 |
35 | - name: Validate structure and content of CSVs
36 | run: node scripts/validate-csv.js fallzahlen_kanton_total_csv_v2/*.csv
37 |
38 | - name: Check if there are empty lines
39 | run: scripts/check_for_empty_lines.sh fallzahlen_kanton_total_csv_v2/*.csv
40 |
41 | - name: Check for outliers in CSVs
42 | run: python scripts/check_for_outliers.py fallzahlen_kanton_total_csv_v2/*.csv
43 |
44 | - name: Get current unix timestamp
45 | if: always()
46 | id: date
47 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT
48 |
49 | # notify slack if a CSV validation failed
50 | - name: Notify slack failure
51 | if: ${{ failure() }}
52 | env:
53 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
54 | uses: pullreminders/slack-action@master
55 | with:
56 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Validate CSV\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: CSV validation failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
57 |
--------------------------------------------------------------------------------
/scrapers/scrape_tg_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import scrape_common as sc
5 |
6 | url = 'https://statistik.tg.ch/themen-und-daten/covid-19.html/10816'
7 | content = sc.download(url, silent=True)
8 |
9 | res = re.search(r".*name: '2020',\s+categories: \[\'(.*)\]\s+}", content)
10 | assert res, f'failed to extract 2020 weeks, got {res}'
11 | weeks_2020 = res[1].split(',')
12 |
13 | res = re.search(r".*name: '2021',\s+categories: \[\'(.*)\]\s+}", content)
14 | assert res, f'failed to extract 2021 weeks, got {res}'
15 | weeks_2021 = res[1].split(',')
16 |
17 | res = re.search(r".*name: '2022',\s+categories: \[\'(.*)\]\s+}", content)
18 | assert res, f'failed to extract 2022 weeks, got {res}'
19 | weeks_2022 = res[1].split(',')
20 |
21 | res = re.search(r".*name: '2023',\s+categories: \[\'(.*)\]\s+}", content)
22 | assert res, f'failed to extract 2023 weeks, got {res}'
23 | weeks_2023 = res[1].split(',')
24 |
25 | weeks = weeks_2020 + weeks_2021 + weeks_2022 + weeks_2023
26 | years = ['2020'] * len(weeks_2020) + ['2021'] * len(weeks_2021) + ['2022'] * len(weeks_2022) + ['2023'] * len(weeks_2023)
27 |
28 | res = re.search(r".*name: 'Anzahl negativer Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content)
29 | assert res, f'failed to extract negative tests, got {res}'
30 | negative_tests = res[1].split(',')
31 |
32 | res = re.search(r".*name: 'Anzahl positiver Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content)
33 | assert res, f'failed to extract positive tests, got {res}'
34 | positive_tests = res[1].split(',')
35 |
36 | res = re.search(r".*name: 'Positivitätsrate',\s+color: '.*',\s+data: \[(.*)\],", content)
37 | assert res, f'failed to extract positivity rate, got {res}'
38 | positivity_rate = res[1].split(',')
39 |
40 | assert len(weeks) == len(negative_tests) == len(positive_tests) == len(positivity_rate), f'Expected same length for weeks {len(weeks)}, neg. tests {len(negative_tests)}, pos. tests {len(positive_tests)}, pos. rate {len(positivity_rate)}'
41 |
42 | for week, year, neg, pos, rate in zip(weeks, years, negative_tests, positive_tests, positivity_rate):
43 | td = sc.TestData(canton='TG', url=url)
44 | td.week = sc.find(r'KW (\d+)', week)
45 | td.year = year
46 | td.positive_tests = int(pos)
47 | td.negative_tests = int(neg)
48 | td.positivity_rate = float(rate)
49 | print(td)
50 |
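The patterns above pull week labels and data series out of a Highcharts configuration embedded in the page. A small self-contained illustration of the snippet shape they expect, and of why the loop later re-extracts the week number with r'KW (\d+)'; the snippet is synthetic, not actual content from statistik.tg.ch:

import re

snippet = """
        name: '2020',
        categories: ['KW 10','KW 11','KW 12']
      }
"""

res = re.search(r".*name: '2020',\s+categories: \[\'(.*)\]\s+}", snippet)
assert res
weeks_2020 = res[1].split(',')
print(weeks_2020)   # ["KW 10'", "'KW 11'", "'KW 12'"] - the quotes survive the split
print([re.search(r'KW (\d+)', w)[1] for w in weeks_2020])   # ['10', '11', '12']
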
--------------------------------------------------------------------------------
/scripts/remove_older_entries.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script removes (i.e. sets to an empty string) older entries from a CSV;
4 | # in this example it clears current_hosp for dates prior to 2020-05-19
5 |
6 | import csv
7 | import sys
8 | import traceback
9 | import datetime
10 |
11 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
12 |
13 | try:
14 | filename = sys.argv[1]
15 | rows = []
16 | with open(filename, 'r') as f:
17 | dr = csv.DictReader(f)
18 | for r in dr:
19 | # copy the existing fields and add the new columns
20 | data = {
21 | 'date': r['date'],
22 | 'time': r['time'],
23 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
24 | 'ncumul_tested': r['ncumul_tested'],
25 | 'ncumul_conf': r['ncumul_conf'],
26 | 'new_hosp': r['new_hosp'],
27 | 'current_hosp': r['current_hosp'],
28 | 'current_icu': r['current_icu'],
29 | 'current_vent': r['current_vent'],
30 | 'ncumul_released': r['ncumul_released'],
31 | 'ncumul_deceased': r['ncumul_deceased'],
32 | 'source': r['source'],
33 | 'current_isolated': r.get('current_isolated', ''),
34 | 'current_quarantined': r.get('current_quarantined', ''),
35 | 'current_quarantined_riskareatravel': r.get('current_quarantined_riskareatravel', ''), # new field
36 | 'current_quarantined_total': r.get('current_quarantined_total', ''), # new field
37 | }
38 | if datetime.datetime.strptime(data['date'], '%Y-%m-%d') < datetime.datetime(2020, 5, 19):
39 | data['current_hosp'] = ''
40 | # re-add extra columns
41 | for col in dr.fieldnames[12:]:
42 | data[col] = r[col]
43 | rows.append(data)
44 |
45 | writer = csv.DictWriter(
46 | sys.stdout,
47 | rows[0].keys(),
48 | delimiter=',',
49 | quotechar='"',
50 | lineterminator='\n',
51 | quoting=csv.QUOTE_MINIMAL
52 | )
53 | writer.writeheader()
54 | writer.writerows(rows)
55 | except Exception as e:
56 | print("Error: %s" % e, file=sys.stderr)
57 | print(traceback.format_exc(), file=sys.stderr)
58 | sys.exit(1)
59 | finally:
60 | sys.stdout.flush()
61 |
--------------------------------------------------------------------------------
/scrapers/scrape_ti.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import re
6 | import datetime
7 | import scrape_common as sc
8 |
9 | # get pdf and xlsx URL from covid19 page of TI
10 | main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/'
11 | d = sc.download(main_url, silent=True)
12 | soup = BeautifulSoup(d, 'html.parser')
13 |
14 | is_first = True
15 |
16 | """
17 | container = soup.find('h2', string=re.compile(r'Isolamento e quarantena')).find_next('div')
18 | for item in container.find_all('div'):
19 | divs = item.find_all('div')
20 | if len(divs) == 3:
21 | dd = sc.DayData(canton='TI', url=main_url)
22 | dd.datetime = sc.find(r'.*?(\d+\.\d+\.\d{2})', divs[2].string)
23 | if sc.find(r'.*(quarantena)', divs[1].string):
24 | dd.quarantined = divs[0].string
25 | if sc.find(r'.*(isolamento)', divs[1].string):
26 | dd.isolated = divs[0].string
27 | if dd:
28 | if not is_first:
29 | print('-' * 10)
30 | is_first = False
31 | print(dd)
32 | """
33 |
34 | xls_url = soup.find(href=re.compile("\.xlsx$")).get('href')
35 | assert xls_url, "URL is empty"
36 |
37 | if not xls_url.startswith('http'):
38 | xls_url = f'https://www4.ti.ch/{xls_url}'
39 |
40 | xls = sc.xlsdownload(xls_url, silent=True)
41 | rows = sc.parse_xls(xls, header_row=0)
42 | prev_date = None
43 | for row in rows:
44 | if row is None:
45 | continue
46 | if 'Data' not in row:
47 | continue
48 | if row['Data'] is None:
49 | continue
50 |
51 | if not is_first:
52 | print('-' * 10)
53 | is_first = False
54 |
55 | dd = sc.DayData(canton='TI', url=xls_url)
56 | dd.datetime = f"{row['Data'].date().isoformat()}"
57 | if dd.datetime == "2023-08-09" and prev_date == "2023-03-08":
58 | dd.datetime = "2023-03-09"
59 | prev_date = dd.datetime
60 | if row.get('Ora'):
61 | dd.datetime += f"T{row['Ora'].time().isoformat()}"
62 | dd.cases = row['Totale casi confermati']
63 | dd.hospitalized = row['Totale giornaliero pazienti ricoverati']
64 | dd.icu = row['Totale giornaliero pazienti cure intense']
65 | dd.vent = row['Totale giornaliero pazienti ventilati']
66 | dd.recovered = row['Totale pazienti dimessi da ospedali']
67 | dd.deaths = row['Totale decessi']
68 | print(dd)
69 |
--------------------------------------------------------------------------------
/scrapers/scrape_sz.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import sys
6 | import datetime
7 | from bs4 import BeautifulSoup
8 | import scrape_common as sc
9 |
10 | url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948'
11 | d = sc.download(url, silent=True)
12 | soup = BeautifulSoup(d, 'html.parser')
13 |
14 | is_first = True
15 |
16 | """
17 | Disabled for now; the PDFs from October 2020 contained hospitalized and quarantined data
18 |
19 | pdfs = soup.find_all('a', string=re.compile(r'Medienmitteilung vom'))
20 | for pdf in pdfs:
21 | pdf_url = pdf['href']
22 | pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True)
23 | date = sc.find(r'Stand:\s(\d+\.\s.*\s20\d{2})', pdf_content)
24 | res = re.search(r'.*\s+(?P<hosp>\d+)\s+\d+\s+\d+\s+(?P<iso>\d+)\s+(?P<quar>\d+)\s+(?P<qtravel>\d+)\s+', pdf_content)
25 | if not date or not res:
26 | continue
27 |
28 | if not is_first:
29 | print('-' * 10)
30 | is_first = False
31 | dd = sc.DayData(canton='SZ', url=pdf_url)
32 | dd.datetime = date.replace('\n', ' ')
33 | dd.isolated = res['iso']
34 | dd.hospitalized = res['hosp']
35 | dd.quarantined = res['quar']
36 | dd.quarantine_riskareatravel = res['qtravel']
37 | print(dd)
38 | is_first = False
39 | """
40 |
41 | try:
42 | xls_url = soup.find('a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))['href']
43 | except TypeError:
44 | print("Unable to determine xls url", file=sys.stderr)
45 | sys.exit(1)
46 | xls = sc.xlsdownload(xls_url, silent=True)
47 |
48 | rows = sc.parse_xls(xls)
49 | for row in rows:
50 | if not isinstance(row['Datum'], datetime.datetime):
51 | continue
52 |
53 | if not is_first:
54 | print('-' * 10)
55 | is_first = False
56 |
57 | # TODO: remove when source is fixed
58 | # handle wrong value on 2020-03-25, see issue #631
59 | if row['Datum'].date().isoformat() == '2020-03-25':
60 | row['Bestätigte Fälle (kumuliert)'] = ''
61 |
62 | dd = sc.DayData(canton='SZ', url=url)
63 | dd.datetime = row['Datum'].date().isoformat()
64 | if row['Zeit']:
65 | dd.datetime += ' ' + row['Zeit'].time().isoformat()
66 | dd.cases = row['Bestätigte Fälle (kumuliert)']
67 | dd.deaths = row['Todesfälle (kumuliert)']
68 | dd.recovered = row['Genesene (kumuliert)']
69 | print(dd)
70 |
--------------------------------------------------------------------------------
/scrapers/certificate.pem:
--------------------------------------------------------------------------------
1 | # SwissSign EV Gold CA 2014 - G22
2 | -----BEGIN CERTIFICATE-----
3 | MIIGuTCCBKGgAwIBAgIQAIEIODzAB3XEDG1za+MwizANBgkqhkiG9w0BAQsFADBF
4 | MQswCQYDVQQGEwJDSDEVMBMGA1UEChMMU3dpc3NTaWduIEFHMR8wHQYDVQQDExZT
5 | d2lzc1NpZ24gR29sZCBDQSAtIEcyMB4XDTE0MDkxNTE2MTYzN1oXDTM1MDMwNDE2
6 | MTYzN1owTjELMAkGA1UEBhMCQ0gxFTATBgNVBAoTDFN3aXNzU2lnbiBBRzEoMCYG
7 | A1UEAxMfU3dpc3NTaWduIEVWIEdvbGQgQ0EgMjAxNCAtIEcyMjCCASIwDQYJKoZI
8 | hvcNAQEBBQADggEPADCCAQoCggEBAL+MVu10kh055MUIkpRaC7sfiuFQ4gAYFv4B
9 | 5LfsK6NSpTaJybYvrA/lr0JBE/xTsQl3Jrka60FgprSh9pXgE94UVoE2Qb4LiHEo
10 | AIYyBQY0aA3nL9GEkT436uXs0tV2Veg6+6CgGRzgaoQtDu3hXWV5GOyNOAtlmzR4
11 | md1JH6oFap9d3kVwJLExUI930Cwjzwt0XAcvjy8+fLheBanG5VFGnRrntRSWiRzY
12 | QIjjAkBDTi+lj552h9aKzFvFEQ5NSiBmrGVk2wIlrh+AZe8NYnXrRBzv0Z5SODD4
13 | jxyPkTAX7f9zkJ9s0yMVEmalWnfwXn4K4Rz3x7fmWeyxipUOhSkCAwEAAaOCApow
14 | ggKWMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/AgEAMB0GA1UdDgQW
15 | BBTu/UbK9ydekbxatueHzQr6VQomQjAfBgNVHSMEGDAWgBRbJXuWpGVRfrg588B4
16 | Zl7oOufw7jCB/wYDVR0fBIH3MIH0MEegRaBDhkFodHRwOi8vY3JsLnN3aXNzc2ln
17 | bi5uZXQvNUIyNTdCOTZBNDY1NTE3RUI4MzlGM0MwNzg2NjVFRTgzQUU3RjBFRTCB
18 | qKCBpaCBooaBn2xkYXA6Ly9kaXJlY3Rvcnkuc3dpc3NzaWduLm5ldC9DTj01QjI1
19 | N0I5NkE0NjU1MTdFQjgzOUYzQzA3ODY2NUVFODNBRTdGMEVFJTJDTz1Td2lzc1Np
20 | Z24lMkNDPUNIP2NlcnRpZmljYXRlUmV2b2NhdGlvbkxpc3Q/YmFzZT9vYmplY3RD
21 | bGFzcz1jUkxEaXN0cmlidXRpb25Qb2ludDBaBgNVHSAEUzBRME8GBFUdIAAwRzBF
22 | BggrBgEFBQcCARY5aHR0cDovL3JlcG9zaXRvcnkuc3dpc3NzaWduLmNvbS9Td2lz
23 | c1NpZ24tR29sZC1DUC1DUFMucGRmMIHRBggrBgEFBQcBAQSBxDCBwTBkBggrBgEF
24 | BQcwAoZYaHR0cDovL3N3aXNzc2lnbi5uZXQvY2dpLWJpbi9hdXRob3JpdHkvZG93
25 | bmxvYWQvNUIyNTdCOTZBNDY1NTE3RUI4MzlGM0MwNzg2NjVFRTgzQUU3RjBFRTBZ
26 | BggrBgEFBQcwAYZNaHR0cDovL2dvbGQtZXYtZzIub2NzcC5zd2lzc3NpZ24ubmV0
27 | LzVCMjU3Qjk2QTQ2NTUxN0VCODM5RjNDMDc4NjY1RUU4M0FFN0YwRUUwDQYJKoZI
28 | hvcNAQELBQADggIBACVxhUgwnsFZgEmC50cCMExcmvY9OQkPxcQbMMFCYvfvBFNz
29 | 65iu0MkXTo0jhaIe8wOOsv230q/zYJbTZOGbMpvUg5MRRIK9DCq3bDwAqN9bIjFw
30 | wK1bODt260m9+4gLxJJdt2MH5LAglQ2J0123+RodYxvv3b+5k6/DZ19dJUgXrjbD
31 | +0PWuO5+5DRangp3VELIRWjHAAnpmq3guORiLuVDS+PoinFp/CKEFRhgWIhp6sZd
32 | yA/9egO+ZH+U7KzLaMuYRNHfJr2UrgQUEufsOM0WUqQXS8RzO7ZGW/argfyc4NdS
33 | CivO97xZBroON0XaLOlTAAbubomhzz/K/Uv2S5T+I/AfYWCme7Vx/KyeA9if/eLA
34 | jQNn5lIb1cXhompM2M+kLAGjNhdpQvUSkjAhKOkzoeezJEN+RXU4P5tOJxw03LtJ
35 | VxmdQxQwgXOR0rBZT+9aFJSX1nIj7zWRnMwFu5w+gBaX1/5MuLP/ThJCckoVgb0o
36 | nbFLRn6siH6dNE+gZ5VgiMWeDOkwlR1UMWGMNwoKNExoTKYwKnpuMfv4q7Fx4uI9
37 | qVzGTL6yfW8+SRdxVFQa6K9hekBr2kZyAKBCqz+jpQq1EPCcvn4HiNx81Na++iqe
38 | K+d2mfZxdEuAwFoZIcyk1aTWHHT1Cqzys00wlukvSmnXUBbGU5Vpwzjlj3N4
39 | -----END CERTIFICATE-----
40 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import datetime
4 | import scrape_common as sc
5 | import scrape_ag_common as sac
6 |
7 |
8 | xls_url = sac.get_ag_xls_url()
9 | xls = sc.xlsdownload(xls_url, silent=True)
10 | is_first = True
11 |
12 | # quarantine_riskareatravel
13 | """
14 | rows = sc.parse_xls(xls, sheet_name='5. Quarantäne nach Einreise', header_row=2)
15 | for row in rows:
16 | if not isinstance(row['A'], datetime.datetime):
17 | continue
18 |
19 |
20 | dd = sc.DayData(canton='AG', url=xls_url)
21 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
22 | dd.quarantine_riskareatravel = row['Gesamtzahl aktuell betreuter Personen']
23 | if dd:
24 | if not is_first:
25 | print('-' * 10)
26 | is_first = False
27 | print(dd)
28 | """
29 |
30 | # quarantine + isolation
31 | rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2)
32 | for row in rows:
33 | if not isinstance(row['A'], datetime.datetime):
34 | continue
35 |
36 | dd = sc.DayData(canton='AG', url=xls_url)
37 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
38 | isolated = row['Gesamtzahl aktuell betreuter Personen']
39 | if sc.represents_int(isolated):
40 | dd.isolated = isolated
41 | #dd.quarantined = row['Gesamtzahl aktuell betreuter Personen5']
42 | if dd:
43 | if not is_first:
44 | print('-' * 10)
45 | is_first = False
46 | print(dd)
47 |
48 | # cases + hospitalization
49 | rows = sc.parse_xls(xls, sheet_name='1. Covid-19-Daten', header_row=2)
50 | for row in rows:
51 | if not isinstance(row['A'], datetime.datetime):
52 | continue
53 |
54 | dd = sc.DayData(canton='AG', url=xls_url)
55 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
56 | if 'Gesamtzahl' in row:
57 | dd.cases = row['Gesamtzahl']
58 |
59 | non_icu = row['Bestätigte Fälle Bettenstation (ohne IPS/IMC)']
60 | icu = row['Bestätigte Fälle Intensivpflegestation (IPS)']
61 | icf = row['Bestätigte Fälle Intermediate Care (IMC)']
62 | if sc.represents_int(non_icu) and sc.represents_int(icu) and sc.represents_int(icf):
63 | dd.hospitalized = int(non_icu) + int(icu) + int(icf)
64 | dd.icu = icu
65 | dd.icf = icf
66 | if 'Gesamtzahl21' in row:
67 | dd.deaths = row['Gesamtzahl21']
68 | if 'Gesamtzahl25' in row:
69 | dd.recovered = row['Gesamtzahl25']
70 |
71 | if dd:
72 | if not is_first:
73 | print('-' * 10)
74 | is_first = False
75 | print(dd)
76 |
--------------------------------------------------------------------------------
/scrapers/scrape_vd_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | import scrape_common as sc
7 | import scrape_vd_common as svc
8 |
9 |
10 | pdf_urls = svc.get_all_weekly_pdf_urls()
11 | for pdf_url in pdf_urls:
12 | pdf = sc.pdfdownload(pdf_url, silent=True, page=1)
13 | pdf = re.sub(r'(\d+)\'(\d+)', r'\1\2', pdf)
14 | pdf = re.sub(r'(\d+)’(\d+)', r'\1\2', pdf)
15 | pdf = re.sub(r'(\d)er', r'\1', pdf)
16 |
17 | td = sc.TestData(canton='VD', url=pdf_url)
18 |
19 | year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
20 | date = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+20\d{2})', pdf)
21 | if date is None:
22 | date = sc.find(r'Point .pid.miologique au (\d+\.\d+\.20\d{2})', pdf)
23 | res = re.search(r'Entre\s+(?:et\s+)?le\s+(?P<start>\d+\s+\w+)\s+et\s+le\s+(?P<end>\d+\s+\w+)(?:\s+\d{4})?,', pdf, flags=re.I|re.UNICODE)
24 | res_with_year = re.search(r'Entre\s+le\s+(?P<start>\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),', pdf, flags=re.I|re.UNICODE)
25 | res_no_month = re.search(r'Entre\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+),', pdf, flags=re.I|re.UNICODE)
26 | res_no_month_with_year = re.search(r'Entre(?:\s+et)?\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),', pdf, flags=re.I|re.UNICODE)
27 |
28 | if res:
29 | start_date = sc.date_from_text(f"{res['start']} {year}")
30 | end_date = sc.date_from_text(f"{res['end']} {year}")
31 | elif res_with_year:
32 | start_date = sc.date_from_text(res_with_year['start'])
33 | end_date = sc.date_from_text(res_with_year['end'])
34 | elif res_no_month:
35 | end_date = sc.date_from_text(f"{res_no_month['end']} {year}")
36 | start_date = sc.date_from_text(f"{res_no_month['start']}.{end_date.month}.{year}")
37 | elif res_no_month_with_year:
38 | end_date = sc.date_from_text(res_no_month_with_year['end'])
39 | start_date = sc.date_from_text(f"{res_no_month_with_year['start']}.{end_date.month}.{end_date.year}")
40 | elif date:
41 | end_date = sc.date_from_text(date)
42 | start_date = end_date - datetime.timedelta(days=6)
43 |
44 | assert start_date and end_date, f'failed to extract start and end dates from {pdf_url}'
45 | td.start_date = start_date
46 | td.end_date = end_date
47 |
48 | res = re.search(r'une\s+moyenne\s+de\s+(\d+)\s+frottis\s+SARS-CoV(-)?2', pdf)
49 | if res:
50 | days = (end_date - start_date).days
51 | td.total_tests = days * int(res[1])
52 |
53 | res = re.search(r'dont\s+(\d+\.?\d?)\s?%\s+étaient\s+positifs', pdf)
54 | if res:
55 | td.positivity_rate = res[1]
56 |
57 | if td:
58 | print(td)
59 |
--------------------------------------------------------------------------------
/scrapers/scrape_so.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 | import scrape_so_common as soc
8 |
9 |
10 | base_url = 'https://corona.so.ch'
11 | pdf_url = soc.get_latest_weekly_pdf_url()
12 | content = sc.pdfdownload(pdf_url, layout=True, silent=True, page=1)
13 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
14 |
15 | """
16 | Hospitalisationen im Kanton Anzahl Personen in Isolation davon Kontakte in Quarantäne Anzahl zusätzlicher Personen in Quarantäne nach Rückkehr aus Risikoland Re- Wert***
17 | 6 (6) 120 (71) 280 (189) 388 (280) 1.46 (1.1)
18 | """
19 |
20 | rows = []
21 |
22 | date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content)
23 | number_of_tests = sc.find(r'Gem\s?eldete\s+Tes\s?ts\s+\(Total\)\*+?\s+(\d+)\s', content, flags=re.DOTALL)
24 | res = re.search(r'Hospitalisationen im Kanton.*\d+ \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+', content, re.DOTALL)
25 | if res is not None:
26 | data = sc.DayData(canton='SO', url=pdf_url)
27 | data.datetime = date
28 | data.tested = number_of_tests
29 | data.isolated = soc.strip_value(res[1])
30 | data.quarantined = soc.strip_value(res[2])
31 | data.quarantine_riskareatravel = soc.strip_value(res[3])
32 | rows.append(data)
33 |
34 |
35 | # scrape the main page as well
36 | url = "https://corona.so.ch/bevoelkerung/daten/"
37 | d = sc.download(url, silent=True)
38 | soup = BeautifulSoup(d, 'html.parser')
39 | title = soup.find('h3', text=re.compile("Stand"))
40 | data = sc.DayData(canton='SO', url=url)
41 | data.datetime = sc.find(r'Stand\s*(\d+\.\d+\.\d{4})\s*', title.string)
42 | table = title.find_next('table')
43 | for table_row in table.find_all('tr'):
44 | title = table_row.find_all('th')
45 | items = table_row.find_all('td')
46 | if len(items) == 0:
47 | continue
48 | name = title[0].text
49 | value = items[0].text.replace("'", "")
50 | if sc.find(r'(Laborbestätigte Infektionen).*?:', name):
51 | data.cases = value
52 | continue
53 | if name == 'Verstorbene Personen (kumuliert seit 06.03.2020):':
54 | data.deaths = value
55 | continue
56 | if name == 'Im Kanton hospitalisierte Covid-19-positive Patientinnen und Patienten:':
57 | data.hospitalized = value
58 | continue
59 | if name.strip() == 'Davon befinden sich auf Intensivstationen:':
60 | data.icu = value
61 | continue
62 | if data:
63 | rows.append(data)
64 |
65 |
66 | is_first = True
67 | # print a separator before every row except the first
68 | for row in rows:
69 | if not is_first:
70 | print('-' * 10)
71 | is_first = False
72 | print(row)
73 |
--------------------------------------------------------------------------------
/scrapers/scrape_fl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 | import sys
5 | import re
6 | import datetime
7 | from bs4 import BeautifulSoup
8 |
9 |
10 | # get the daily bulletin
11 | base_url = 'https://www.regierung.li'
12 | d = sc.download(base_url, silent=True)
13 | soup = BeautifulSoup(d, 'html.parser')
14 |
15 | is_first = True
16 | bulletin = soup.find('h1', text=re.compile(r'COVID-19: Situationsbericht.*'))
17 | if bulletin:
18 | bulletin = bulletin.find_next('a')
19 | if bulletin:
20 | url = f"{base_url}{bulletin.get('href')}"
21 | bulletin_d = sc.download(url, silent=True)
22 | bulletin_soup = BeautifulSoup(bulletin_d, 'html.parser')
23 |
24 | dd = sc.DayData(canton='FL', url=url)
25 |
26 | title = bulletin_soup.find('h1', text=re.compile(r'.*Situationsbericht.*'))
27 | dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', title.text)
28 |
29 | content = title.find_next('div').text
30 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
31 |
32 | dd.cases = sc.find(r"insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle", content)
33 | dd.deaths = sc.find(r'(Damit\s+traten\s+)?(?:bisher|bislang)\s+(traten\s+)?(?P\d+)\s+(Todesfall|Todesfälle)', content, flags=re.I, group='death')
34 |
35 | if re.search(r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen', content):
36 | dd.recovered = int(dd.cases) - int(dd.deaths)
37 |
38 | m = re.search(r'(\S+)\s+Erkrankte\s+sind\s+derzeit\s+hospitalisiert', content)
39 | if m:
40 | dd.hospitalized = sc.int_or_word(m[1].lower())
41 |
42 | m = re.search(r'Gegenwärtig\s+befinden\s+sich\s+(\w+)\s+enge\s+Kontaktpersonen\s+in\s+Quarantäne.', content)
43 | if m:
44 | dd.quarantined = sc.int_or_word(m[1])
45 |
46 | if dd:
47 | if not is_first:
48 | print('-' * 10)
49 | print(dd)
50 | is_first = False
51 |
52 |
53 | # get the data from XLS file containing full history
54 | history_url='https://www.llv.li/files/ag/aktuelle-fallzahlen.xlsx'
55 | xls = sc.xlsdownload(history_url, silent=True)
56 | rows = sc.parse_xls(xls, header_row=3)
57 | for row in rows:
58 | dd_full_list = sc.DayData(canton='FL', url=history_url)
59 | if isinstance(row['Datenstand'], datetime.datetime):
60 | dd_full_list.datetime = row['Datenstand']
61 | else:
62 | dd_full_list.datetime = str(row['Datenstand']).replace(':', '.')
63 |
64 | dd_full_list.cases = str(row['Anzahl pos. Fälle kumuliert']).replace("'","")
65 | dd_full_list.recovered = row['Genesene kumuliert']
66 | dd_full_list.hospitalized = row['Hospitalisierte Personen*']
67 | dd_full_list.deaths = row['Todesfälle kumuliert']
68 | if dd_full_list:
69 | if not is_first:
70 | print('-' * 10)
71 | is_first = False
72 | print(dd_full_list)
73 |
--------------------------------------------------------------------------------
/scrapers/scrape_ow.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import datetime
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | base_url = 'https://www.ow.ch'
10 | url = f'{base_url}/de/verwaltung/dienstleistungen/?dienst_id=5962'
11 | """
12 | d = sc.download(url, silent=True, encoding='windows-1252')
13 | d = d.replace(' ', ' ')
14 | soup = BeautifulSoup(d, 'html.parser')
15 |
16 | dd = sc.DayData(canton='OW', url=url)
17 | date = sc.find(r'Stand (\d+\.\s+\w+\s+20\d{2})', d)
18 | time = sc.find(r'Stand .*,\s?([\d\.:]+).*Uhr', d)
19 | dd.datetime = f'{date}, {time} Uhr'
20 | dd.isolated = soup.find(text=re.compile(r'In Isolation \(aktuell\)')).find_next('td').string
21 | dd.quarantined = soup.find(text=re.compile(r'In Quarant.ne \(aktuell\)')).find_next('td').string
22 | dd.quarantine_riskareatravel = soup.find(text=re.compile(r'Reiser.ckkehrer in Quarant.ne')).find_next('td').string
23 |
24 | is_first = True
25 | if dd:
26 | print(dd)
27 | is_first = False
28 | """
29 |
30 | is_first = True
31 |
32 |
33 | d = sc.download(f'{base_url}/de/kanton/publired/publikationen/?action=info&pubid=20318',
34 | encoding='windows-1252', silent=True)
35 | soup = BeautifulSoup(d, 'html.parser')
36 | xls_url = soup.find('a', string=re.compile("Download")).get('href')
37 | assert xls_url, "URL is empty"
38 | xls_url = f'{base_url}{xls_url}'
39 |
40 | for row in soup.find_all('dl'):
41 | cells = row.find_all('dd')
42 | if cells[0].string:
43 | file_date = cells[0].string
44 |
45 | xls = sc.xlsdownload(xls_url, silent=True)
46 | rows = sc.parse_xls(xls, header_row=4)
47 | for row in rows:
48 | if isinstance(row['A'], datetime.datetime):
49 | dd = sc.DayData(canton='OW', url=url)
50 | dd.datetime = row['A']
51 | data_found = False
52 | if isinstance(row['Infizierte Personen (kumuliert)'], int) and row['Infizierte Personen (kumuliert)'] > 0:
53 | dd.cases = row['Infizierte Personen (kumuliert)']
54 | data_found = True
55 | hosp_key = """Hospitalisierte Personen im KSOW /
56 | Eintritte Covid-Station; Alle Einwohner OW alle Spitäler CH***"""
57 | if isinstance(row[hosp_key], int):
58 | dd.hospitalized = row[hosp_key]
59 | if isinstance(row['Gestorbene Personen (kumuliert)'], int):
60 | dd.deaths = row['Gestorbene Personen (kumuliert)']
61 | if isinstance(row['Isolation'], int):
62 | dd.isolated = row['Isolation']
63 | if isinstance(row['Quarantäne'], int):
64 | dd.quarantined = row['Quarantäne']
65 | if isinstance(row['Quarantäne Reiserückkehrer'], int):
66 | dd.quarantine_riskareatravel = row['Quarantäne Reiserückkehrer']
67 | if data_found:
68 | if not is_first:
69 | print('-' * 10)
70 | else:
71 | is_first = False
72 | print(dd)
73 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 |
9 |
10 | def strip_value(value):
11 | if value:
12 | return re.sub(r'[^0-9]', '', value)
13 | return None
14 |
15 |
16 | base_url = 'https://www.vs.ch'
17 | url = f'{base_url}/web/coronavirus/statistiques'
18 | content = sc.download(url, silent=True)
19 | soup = BeautifulSoup(content, 'html.parser')
20 | pdf_url = soup.find('a', string=re.compile(r'20\d{2}.*Sit Epid.*')).get('href')
21 | pdf_url = f'{base_url}{pdf_url}'
22 |
23 | content = sc.pdfdownload(pdf_url, silent=True, layout=True, page=1)
24 |
25 | dd = sc.DayData(canton='VS', url=pdf_url)
26 | dd.datetime = sc.find(r'(\d{2}/\d{2}/20\d{2})', content)
27 | dd.datetime = re.sub(r'/', '.', dd.datetime)
28 | dd.cases = strip_value(sc.find(r'.*Cumul cas positifs.*\s+(\d+.\d+)\s+', content))
29 | dd.deaths = strip_value(sc.find(r'.*Cumul d.c.s.*\s+(\d+.\d+)\s+', content))
30 | dd.hospitalized = strip_value(sc.find(r'.*Hospitalisations en cours de cas COVID-19.*\s+(\d+)\s+', content))
31 | dd.icu = strip_value(sc.find(r'.*SI en cours.*\s+(\d+)\s+', content))
32 | dd.vent = strip_value(sc.find(r'.*Intubation en cours.*\s+(\d+)\s+', content))
33 |
34 | is_first = True
35 | if dd:
36 | is_first = False
37 | print(dd)
38 |
39 |
40 | xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20COVID-19%20Valais.xlsx'
41 | main_url = 'https://www.vs.ch/de/web/coronavirus'
42 | xls = sc.xlsdownload(xls_url, silent=True)
43 | rows = sc.parse_xls(xls, header_row=1)
44 | for i, row in enumerate(rows):
45 | if not isinstance(row['Date'], datetime.datetime):
46 | continue
47 | if not sc.represents_int(row['Cumul cas positifs']):
48 | continue
49 | if row['Nb nouveaux cas positifs'] is None and row["Nb nouvelles admissions à l'hôpital"] is None:
50 | continue
51 |
52 | dd = sc.DayData(canton='VS', url=main_url)
53 | dd.datetime = row['Date'].date().isoformat()
54 | dd.cases = row['Cumul cas positifs']
55 | dd.hospitalized = row['Total hospitalisations COVID-19']
56 | dd.new_hosp = row['Nb nouvelles admissions à l\'hôpital']
57 | dd.icu = row['Patients COVID-19 aux SI total (y.c. intubés)']
58 | dd.vent = row['Patients COVID-19 intubés']
59 | dd.deaths = row['Cumul décès COVID-19']
60 | # Since 2020-10-19 VS no longer publishes data on isolation/quarantine
61 | #dd.isolated = row['Nombre de cas en cours d\'isolement']
62 | #dd.quarantined = row['Nombre de contacts en cours de quarantaine']
63 | #dd.quarantine_riskareatravel = row['Nombre de voyageurs en cours de quarantaine']
64 |
65 | if row['Nb de nouvelles sorties'] is not None:
66 | dd.recovered = sum(r['Nb de nouvelles sorties'] for r in rows[:i+1])
67 | if not is_first:
68 | print('-' * 10)
69 | is_first = False
70 | print(dd)
71 |
--------------------------------------------------------------------------------
/scrapers/scrape_sh_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import json
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 |
9 |
10 | def get_sh_url_from_json(url):
11 | m = sc.jsondownload(url, silent=True)
12 |
13 | # 2020-04-24
14 | """
15 | {
16 | data_filetype: "xlsx",
17 | data_shareInAreaPage: "[]",
18 | data_kachellabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
19 | data_areaPage_repositoryid: "3275",
20 | data_custom_author: "Gesundheitsamt Kanton Schaffhausen",
21 | data_tagarea: "[]",
22 | data_shareInDomain: "[]",
23 | data_zielgruppen: "",
24 | data_publication_date: "23.04.2020",
25 | data_idpath: "/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465",
26 | data_custom_publication_date_date: "23.04.2020",
27 | data_shareArticleProfileId: "",
28 | data_file_name: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
29 | data_author: "MWETT",
30 | data_file_copyrights: "",
31 | data_custom_publication_timed: "[]",
32 | data_published: "published",
33 | data_addmodules: "",
34 | data_listlabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
35 | data_tags: "",
36 | data_widget_data: "[]",
37 | data_filemeta: "{"uploaded":1,"fileName":"d4ffb019-a2ef-4782-87be-0aafb4b43558","key":"TEMPUPLOADFILES","url":"/CMS/get/file/d4ffb019-a2ef-4782-87be-0aafb4b43558","originalname":"Fallzahlen Corona Kanton Schaffhausen.xlsx","fileid":"d4ffb019-a2ef-4782-87be-0aafb4b43558","category":"null","title":"null","filesize":12286}",
38 | data_shareInGlobal: "[]",
39 | data_verbande: "",
40 | data_file_description: "",
41 | data_custom_publication_date_time: "09:31",
42 | data_galleries: "[]",
43 | data_sharepaths: "",
44 | data_permalink: "/Webseite/Kanton-Schaffhausen/Beh-rde/Verwaltung/Departement-des-Innern/Gesundheitsamt-3666465-DE.html",
45 | data_schlagworte: "",
46 | data_approvedpaths: "["/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465"]",
47 | contentid: "3666465",
48 | domainid: "1753",
49 | contenttypeid: "101",
50 | transactiontime: "23.04 09:09",
51 | author: "dande",
52 | language: "DE",
53 | activated_languages: [
54 | "DE"
55 | ],
56 | sliderimages: [ ],
57 | genericimages: { }
58 | }
59 | """
60 |
61 | meta = json.loads(m['data_filemeta'])
62 | url = f"https://sh.ch{meta['url']}"
63 | return url
64 |
65 | def get_sh_xlsx():
66 | main_url = 'https://coviddashboard.sh.ch/'
67 | content = sc.download(main_url, silent=True)
68 | soup = BeautifulSoup(content, 'html.parser')
69 | link = soup.find('a', href=re.compile(r'.*\.xlsx'))
70 | xls = sc.xlsdownload(link.get('href'), silent=True)
71 | return main_url, xls
72 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | from io import StringIO
6 | import re
7 | from bs4 import BeautifulSoup
8 | import scrape_common as sc
9 | from scrape_fr_common import get_fr_csv
10 |
11 | inhabitants = {
12 | 'Broye': 32894,
13 | 'Glane': 24337,
14 | 'Greyerz': 55726,
15 | 'Saane': 106136,
16 | 'See': 36800,
17 | 'Sense': 43990,
18 | 'Vivisbach': 18831,
19 | }
20 |
21 | district_ids = {
22 | 'Broye': 1001,
23 | 'Glane': 1002,
24 | 'Greyerz': 1003,
25 | 'Saane': 1004,
26 | 'See': 1005,
27 | 'Sense': 1006,
28 | 'Vivisbach': 1007,
29 | }
30 |
31 | district_xls = {
32 | 'Broye': 'Broye',
33 | 'Glane': 'Gl.ne',
34 | 'Greyerz': 'Gruy.re',
35 | 'Saane': 'Sarine',
36 | 'See': 'Lac',
37 | 'Sense': 'Singine',
38 | 'Vivisbach': 'Veveyse',
39 | }
40 |
41 | # weekly data
42 | url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton'
43 | """
44 | d = sc.download(url, silent=True)
45 | d = d.replace(' ', ' ')
46 |
47 | soup = BeautifulSoup(d, 'html.parser')
48 | table = soup.find(string=re.compile(r'Anzahl positive F.lle nach Bezirk')).find_next('table')
49 |
50 | weeks = []
51 | years = []
52 | week_regex = re.compile(r'Woche \d+')
53 | trs = table.find_all('tr')
54 | for header in trs[0]:
55 | week = sc.find(r'Woche (\d+)', header.string)
56 | if week is not None:
57 | weeks.append(week)
58 | years.append('2021')
59 |
60 | for tr in trs[1:]:
61 | tds = tr.find_all('td')
62 |
63 | for i in range(len(weeks)):
64 | district = tds[0].string
65 | if district in inhabitants:
66 | dd = sc.DistrictData(canton='FR', district=district)
67 | dd.url = url
68 | dd.week = weeks[i]
69 | # TODO restore once all weeks are in 2021
70 | # dd.year = '20' + year
71 | dd.year = years[i]
72 | dd.new_cases = tds[i + 1].string
73 | dd.population = inhabitants[district]
74 | dd.district_id = district_ids[district]
75 | print(dd)
76 | """
77 |
78 | # daily data from csv
79 | csv_url, csv_data, main_url = get_fr_csv()
80 | reader = csv.DictReader(StringIO(csv_data), delimiter=';')
81 |
82 | for row in reader:
83 | row_date = None
84 | for key, val in row.items():
85 | if sc.find(r'(Date).*', key):
86 | row_date = val
87 | assert row_date
88 | row_date = sc.date_from_text(row_date)
89 | for district, xls_district in district_xls.items():
90 | for key, val in row.items():
91 | if sc.find(r'.*(' + xls_district + ').*', key):
92 | dd = sc.DistrictData(canton='FR', district=district)
93 | dd.url = url
94 | dd.date = row_date.isoformat()
95 | dd.new_cases = val
96 | dd.population = inhabitants[district]
97 | dd.district_id = district_ids[district]
98 | print(dd)
99 |
--------------------------------------------------------------------------------
/scrapers/scrape_bl_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import re
6 | import scrape_common as sc
7 | import scrape_bl_common as sbc
8 | from datetime import timedelta
9 |
10 |
11 | # weekly data
12 | bulletin_urls = sbc.get_all_bl_bulletin_urls()
13 | for bulletin_url in bulletin_urls:
14 | bulletin_content = sc.download(bulletin_url, silent=True)
15 | soup = BeautifulSoup(bulletin_content, 'html.parser')
16 | content = soup.find(string=re.compile(r'Per heute .*')).string
17 | content = sbc.strip_bl_bulletin_numbers(content)
18 |
19 | date = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
20 | date = sc.date_from_text(date)
21 | # previous week
22 | date = date - timedelta(days=7)
23 |
24 | td = sc.TestData(canton='BL', url=bulletin_url)
25 | td.week = date.isocalendar()[1]
26 | td.year = date.year
27 | td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests', content)
28 | td.positivity_rate = sc.find(r'von diesen waren (\d+\.?,?\d?) Prozent positiv', content)
29 | if td.total_tests and td.positivity_rate:
30 | td.positivity_rate = td.positivity_rate.replace(',', '.')
31 | print(td)
32 |
33 |
34 | # daily data
35 | main_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft/covid-19-bl-tests'
36 | main_content = sc.download(main_url, silent=True)
37 | soup = BeautifulSoup(main_content, 'html.parser')
38 |
39 | def create_bs_test_data(date):
40 | td = sc.TestData(canton='BL', url=main_url)
41 | td.start_date = date
42 | td.end_date = date
43 | return td
44 |
45 | tests_data = {}
46 |
47 | for iframe in soup.find_all('iframe'):
48 | iframe_url = iframe['src']
49 | d = sc.download(iframe_url, silent=True)
50 | d = d.replace('\n', ' ')
51 |
52 | # Taegliche PCR-Tests BL
53 | data = sc.find(r' ?Datum,"Negative Tests","Positive Tests"\s*([^<]+)', d)
54 | if data:
55 | for row in data.split(" "):
56 | c = row.split(',')
57 | date = sbc.parse_bl_date(c[0])[0]
58 | if date not in tests_data:
59 | tests_data[date] = create_bs_test_data(date)
60 | tests_data[date].negative_tests = round(float(c[1]))
61 | tests_data[date].positive_tests = round(float(c[2]))
62 | continue
63 |
64 | # Taegliche Positivitaetsrate BL
65 | data = sc.find(r' ?Datum,"T.gliche Positivit.tsrate BL"\s*([^<]+)', d)
66 | if data:
67 | for row in data.split(" "):
68 | c = row.split(',')
69 | date = sbc.parse_bl_date(c[0])[0]
70 | if date not in tests_data:
71 | tests_data[date] = create_bs_test_data(date)
72 | tests_data[date].positivity_rate = c[1]
73 | continue
74 |
75 | for date, td in tests_data.items():
76 | print(td)
77 |
--------------------------------------------------------------------------------
/scrapers/scrape_ur.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 |
7 | url = 'https://www.ur.ch/themen/2962'
8 | d = sc.download(url, silent=True)
9 | d = d.replace(' ', ' ')
10 | d = d.replace('\n', ' ')
11 | d = re.sub(r'(\d+)\'(\d+)', r'\1\2', d)
12 |
13 | # 2020-03-26 (and possibly earlier) from https://www.ur.ch/themen/2962
14 | # 2020-07-07 they changed the title, so we're using the table header to find the table
15 | # 2020-07-24 column "Genesen" was removed
16 | """
17 | Stand: 24.07.2020, 11.00 Uhr
18 |
19 | | Positiv getestete Erkrankungsfälle | Hospitalisiert | Verstorben |
20 | | 115                                | 1              | 7          |
33 | """
34 |
35 | # 2020-08-03 new table layout with 6 columns
36 | """
37 | Stand: 03.08.2020, 16.00 Uhr
38 |
39 | | Aktive Fälle | Positiv getestete Erkrankungsfälle | Hospitalisiert | Quarantäne | Verstorben |   |
40 | | 4            | 117                                | 0              | 47         | 7          |   |
59 | """
60 |
61 | soup = BeautifulSoup(d, 'html.parser')
62 | data_table = soup.find(string=re.compile(r'Positive\s+Fälle\s+total')).find_parent('table')
63 |
64 | assert data_table, "Can't find data table"
65 |
66 | dd = sc.DayData(canton='UR', url=url)
67 | dd.datetime = sc.find(r'Stand: (.* Uhr)', d)
68 |
69 | rows = data_table.find_all('tr')
70 | assert len(rows) == 2, f"Number of rows changed, {len(rows)} != 2"
71 |
72 | headers = rows[0].find_all('td') or rows[0].find_all('th')
73 | assert len(headers) == 5, f"Number of header columns changed, {len(headers)} != 5"
74 | assert re.search(r'(aktive\s+fälle)', headers[0].text, flags=re.I) is not None
75 | assert re.search(r"(positive\s+fälle\s+total\s+seit\s+märz\s+2020)", headers[1].text, flags=re.I) is not None
76 | assert headers[2].text.lower() == "hospitalisiert"
77 | assert re.search(r"(total\s+verstorbene)", headers[3].text, flags=re.I) is not None
78 |
79 | cells = rows[1].find_all('td')
80 | assert len(cells) == 4, f"Number of columns changed, {len(cells)} != 4"
81 |
82 | ur_number_regex = r'(\d+)\s*(\(.+?\))?'
83 | dd.cases = sc.find(ur_number_regex, cells[1].text)
84 | dd.hospitalized = sc.find(ur_number_regex, cells[2].text)
85 | dd.deaths = sc.find(ur_number_regex, cells[3].text)
86 |
87 | print(dd)
88 |
--------------------------------------------------------------------------------
/scrapers/scrape_gl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import sys
6 | from bs4 import BeautifulSoup
7 | import csv
8 | from io import StringIO
9 | import scrape_common as sc
10 | import scrape_gl_common as sgc
11 |
12 | def split_whitespace(text):
13 | if not text:
14 | return []
15 | text = re.sub(r'\s\s+', ' ', text)
16 | return text.split(' ')
17 |
18 | is_first = True
19 |
20 | # weekly pdf
21 | pdf_url = sgc.get_gl_pdf_url()
22 | if pdf_url is not None:
23 | pdf = sc.download_content(pdf_url, silent=True)
24 | content = sc.pdftotext(pdf, page=1)
25 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
26 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
27 |
28 | pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
29 | pdf_date = sc.date_from_text(pdf_date)
30 |
31 | number_of_tests = sc.find(r'PCR-Tests/Schnelltests\sKanton Glarus\s(\d+)\s', content)
32 | if number_of_tests:
33 | dd = sc.DayData(canton='GL', url=pdf_url)
34 | dd.datetime = pdf_date
35 | dd.tested = number_of_tests
36 | is_first = False
37 | print(dd)
38 |
39 |
40 | content = sc.pdftotext(pdf, page=2, raw=True)
41 | dates = split_whitespace(sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nAnzahl\s+in\s+Isolation', content))
42 | isolation = split_whitespace(sc.find(r'\nAnzahl\s+in\s+Isolation\s+(\d.*)\n', content))
43 | quarantined = split_whitespace(sc.find(r'\nKontaktpersonen\s+in\s+Quarant.ne\s+(\d.*)\n', content))
44 |
45 | if len(dates) == len(isolation) == len(quarantined):
46 | for date, iso, qua in zip(dates, isolation, quarantined):
47 | if sc.find(r'(\d{2}\.12)', date):
48 | year = '2020'
49 | else:
50 | year = pdf_date.year
51 | dd = sc.DayData(canton='GL', url=pdf_url)
52 | dd.datetime = f'{date}.{year}'
53 | dd.isolated = iso
54 | dd.quarantined = qua
55 | if not is_first:
56 | print('-' * 10)
57 | is_first = False
58 | print(dd)
59 | else:
60 | print('PDF data is inconsistent!', file=sys.stderr)
61 | print(f'dates: {len(dates)}, isolation: {len(isolation)}, quarantined: {len(quarantined)}', file=sys.stderr)
62 |
63 |
64 | # CSV from Google Spreadsheets
65 | main_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/edit#gid=0'
66 | csv_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/export?format=csv&id=1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k&gid=0'
67 | d_csv = sc.download(csv_url, silent=True)
68 |
69 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
70 | for row in reader:
71 | if row['Datum'] == '':
72 | continue
73 | if not is_first:
74 | print('-' * 10)
75 | is_first = False
76 | dd = sc.DayData(canton='GL', url=main_url)
77 | dd.datetime = row['Datum']
78 | dd.cases = row['Fallzahlen Total']
79 | dd.hospitalized = row['Personen in Spitalpflege']
80 | dd.deaths = row['Todesfälle (kumuliert)']
81 | print(dd)
82 |
--------------------------------------------------------------------------------
/scrapers/scrape_so_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | import scrape_common as sc
7 | import scrape_so_common as soc
8 |
9 |
10 | pdf_urls = soc.get_all_weekly_pdf_urls()
11 | # start with the oldest PDF to have the most recent ones last
12 | pdf_urls.reverse()
13 | for pdf_url in pdf_urls:
14 | content = sc.pdfdownload(pdf_url, layout=True, silent=True, page=1)
16 | # remove the ' thousands separator to simplify pattern matching
16 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content)
17 |
18 | date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content)
19 | date = sc.date_from_text(date)
20 | year1 = (date - datetime.timedelta(weeks=2)).year
21 | year2 = (date - datetime.timedelta(weeks=1)).year
22 | res = re.match(r'.*Woche (?P\d+)(\s+\(\d+\.\d+-\d+\.\d+\))?\s+Woche (?P\d+)\s+', content, re.DOTALL)
23 | assert res, 'Weeks could not be extracted'
24 | week1 = res['w1']
25 | week2 = res['w2']
26 |
27 | res = re.match(r'.*PCR-Tes\s?ts\s+(\d.*\n)?Total\s+\d+\s+\d+\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL)
28 | if not res:
29 | res = re.match(r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+\d+\.?\d?\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL)
30 | if not res:
31 | res = re.match(r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL)
32 | if res:
33 | total_tests1 = res[2]
34 | total_tests2 = res[3]
35 |
36 | if not res:
37 | res = re.match(r'.*\s+PCR\s+(\d+\s+)?(\d+)\s+(\d+)\s', content, re.DOTALL)
38 | assert res, f'PCR tests for week {week1} or {week2} could not be extracted!'
39 | if res:
40 | total_tests1 = int(res[2])
41 | total_tests2 = int(res[3])
42 |
43 | res = re.match(r'.*\s+Antigen-Schnelltests\s+(\d+\s+)?(\d+)\s+(\d+)', content, re.DOTALL)
44 | assert res, f'Antigen tests for week {week1} or {week2} could not be extracted!'
45 | if res:
46 | total_tests1 += int(res[2])
47 | total_tests2 += int(res[3])
48 |
49 | assert res, f'PCR tests for week {week1} or {week2} could not be extracted!'
50 |
51 | res = re.match(r'.*Positivit.tsrate\s+\*+?\s+\d+\.?\d?%?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?', content, re.DOTALL)
52 | pos_rate1 = None
53 | pos_rate2 = None
54 | if res:
55 | pos_rate1 = res[1]
56 | pos_rate2 = res[2]
57 | else:
58 | res = re.match(r'.*Anteil\s+pos\s?itiv\s?er\s+Tes\s?ts\s+\(%\)\s+(\d+\w+)?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?', content, re.DOTALL)
59 | if res:
60 | pos_rate1 = res[2]
61 | pos_rate2 = res[3]
62 |
63 | data = sc.TestData(canton='SO', url=pdf_url)
64 | data.week = week1
65 | data.year = year1
66 | data.total_tests = total_tests1
67 | data.positivity_rate = pos_rate1
68 | print(data)
69 |
70 | data = sc.TestData(canton='SO', url=pdf_url)
71 | data.week = week2
72 | data.year = year2
73 | data.total_tests = total_tests2
74 | data.positivity_rate = pos_rate2
75 | print(data)
76 |
--------------------------------------------------------------------------------
/scrapers/test/test_test_data.py:
--------------------------------------------------------------------------------
1 | from scrapers.scrape_common import TestData
2 |
3 | def test_test_data():
4 | dd = TestData()
5 | dd.start_date = '1'
6 | dd.end_date = '2'
7 | dd.week = 3
8 | dd.year = 4
9 | dd.canton = '5'
10 | dd.positive_tests = 6
11 | dd.negative_tests = 7
12 | dd.total_tests = 8
13 | dd.positivity_rate = 9
14 | dd.url = '10'
15 |
16 | string = str(dd)
17 |
18 | dd_parsed = TestData()
19 | assert dd_parsed.parse(string)
20 | assert dd.start_date == dd_parsed.start_date
21 | assert dd.end_date == dd_parsed.end_date
22 | assert dd.week == dd_parsed.week
23 | assert dd.year == dd_parsed.year
24 | assert dd.canton == dd_parsed.canton
25 |
26 | assert dd.positive_tests == dd_parsed.positive_tests
27 | assert dd.negative_tests == dd_parsed.negative_tests
28 | assert dd.positivity_rate == dd_parsed.positivity_rate
29 |
30 | assert dd.positive_tests == dd_parsed.positive_tests
31 | assert dd.negative_tests == dd_parsed.negative_tests
32 | assert dd.positivity_rate == dd_parsed.positivity_rate
33 |
34 | assert dd.pcr_positive_tests == dd_parsed.pcr_positive_tests
35 | assert dd.pcr_negative_tests == dd_parsed.pcr_negative_tests
36 | assert dd.pcr_positivity_rate == dd_parsed.pcr_positivity_rate
37 |
38 | assert dd.ag_positive_tests == dd_parsed.ag_positive_tests
39 | assert dd.ag_negative_tests == dd_parsed.ag_negative_tests
40 | assert dd.ag_positivity_rate == dd_parsed.ag_positivity_rate
41 |
42 | assert dd.url == dd_parsed.url
43 |
44 |
45 | def test_test_data_with_PCR_antigen():
46 | dd = TestData()
47 | dd.start_date = '1'
48 | dd.end_date = '2'
49 | dd.week = 3
50 | dd.year = 4
51 | dd.canton = '5'
52 |
53 | dd.positive_tests = 6
54 | dd.negative_tests = 7
55 | dd.total_tests = 8
56 | dd.positivity_rate = 9
57 |
58 | dd.pcr_positive_tests = 10
59 | dd.pcr_negative_tests = 11
60 | dd.pcr_total_tests = 12
61 | dd.pcr_positivity_rate = 13
62 |
63 | dd.ag_positive_tests = 14
64 | dd.ag_negative_tests = 15
65 | dd.ag_total_tests = 16
66 | dd.ag_positivity_rate = 17
67 |
68 | dd.url = '18'
69 |
70 | string = str(dd)
71 |
72 | dd_parsed = TestData()
73 | assert dd_parsed.parse(string)
74 | assert dd.start_date == dd_parsed.start_date
75 | assert dd.end_date == dd_parsed.end_date
76 | assert dd.week == dd_parsed.week
77 | assert dd.year == dd_parsed.year
78 | assert dd.canton == dd_parsed.canton
79 |
80 | assert dd.positive_tests == dd_parsed.positive_tests
81 | assert dd.negative_tests == dd_parsed.negative_tests
82 | assert dd.positivity_rate == dd_parsed.positivity_rate
83 |
84 | assert dd.pcr_positive_tests == dd_parsed.pcr_positive_tests
85 | assert dd.pcr_negative_tests == dd_parsed.pcr_negative_tests
86 | assert dd.pcr_positivity_rate == dd_parsed.pcr_positivity_rate
87 |
88 | assert dd.ag_positive_tests == dd_parsed.ag_positive_tests
89 | assert dd.ag_negative_tests == dd_parsed.ag_negative_tests
90 | assert dd.ag_positivity_rate == dd_parsed.ag_positivity_rate
91 |
92 | assert dd.url == dd_parsed.url
93 |
94 |
95 | if __name__ == "__main__":
96 | test_test_data()
97 |
--------------------------------------------------------------------------------
/scripts/check_for_outliers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | import os
6 | import pandas as pd
7 | import math
8 |
9 | __location__ = os.path.realpath(
10 | os.path.join(
11 | os.getcwd(),
12 | os.path.dirname(__file__)
13 | )
14 | )
15 |
16 | # only values above this MIN_VALUE are considered outliers
17 | # this prevents the check from failing while the absolute values are still small
18 | # the outlier detection mainly guards against human error (wrong data being added)
19 | MIN_VALUE = 20
20 |
21 | # only check the last x days
22 | LAG_PERIODS = 10
23 |
24 | # periods considered "recent"
25 | RECENT_PERIODS = 5
26 |
27 | # IQR factor, determines how many times the IQR is the limit for an outlier
28 | FACTOR = 1.5
29 |
30 | assert len(sys.argv) >= 2, "Error: Call this script with the path(s) to CSV file(s)"
31 |
32 | fail = False
33 |
34 | args = sys.argv[1:]
35 | for csv_file in args:
36 |
37 | # load canton file from covid_19 repo
38 | df = pd.read_csv(csv_file, parse_dates=[0])
39 | df_ignore = pd.read_csv(os.path.join(__location__, '..', 'outlier_status.csv'), parse_dates=[0])
40 | df = pd.merge(df, df_ignore, left_on=['date', 'abbreviation_canton_and_fl'], right_on=['date', 'abbreviation_canton_and_fl'], how='left')
41 |
42 | # create new column for current cases
43 | df_conf = df[['date', 'ncumul_conf', 'ncumul_conf_outlier']].reset_index(drop=True)
44 | df_conf['current_conf'] = df['ncumul_conf'] - df['ncumul_conf'].shift(1)
45 |
46 | # only use the last LAG_PERIODS rows
47 | df_conf = df_conf.tail(LAG_PERIODS).reset_index(drop=True)
48 |
49 | # calculate the IQR for confirmed cases
50 | q1 = df_conf['current_conf'].quantile(0.25)
51 | q3 = df_conf['current_conf'].quantile(0.75)
52 | iqr = q3 - q1
53 |
54 | if pd.isna(q1) or pd.isna(q3) or pd.isna(iqr):
55 | print(f"⚠️ {csv_file} has too many missing/NaN values (Q1: {q1}, Q3: {q3}, IQR: {iqr}) to calculate outliers, skipping.")
56 | continue
57 |
58 | lower_limit = q1 - (iqr * FACTOR)
59 | upper_limit = math.ceil(q3 + (iqr * FACTOR))
60 |
61 | upper_limit = max(upper_limit, MIN_VALUE)
62 | lower_limit = 0 # always use 0 as lower limit
63 | df_conf['q1'] = q1
64 | df_conf['q3'] = q3
65 | df_conf['iqr'] = iqr
66 | df_conf['factor'] = FACTOR
67 | df_conf['upper_limit'] = upper_limit
68 | df_conf['lower_limit'] = lower_limit
69 |
70 | # use IQR*factor to get outliers
71 | outliers = df_conf.query('(current_conf < @lower_limit) or (current_conf > @upper_limit)')
72 | recent_outliers = df_conf.tail(RECENT_PERIODS).query("((current_conf < @lower_limit) or (current_conf > @upper_limit)) and (ncumul_conf_outlier != 'ignore')")
73 | if outliers.empty:
74 | print(f"✅ {csv_file} has no outliers.")
75 | else:
76 | if not recent_outliers.empty:
77 | fail = True
78 | print(f"❌ {csv_file} has recent outliers, please check if this is an error.")
79 | else:
80 | print(f"⚠️ {csv_file} has older or ignored outliers.")
81 | print(outliers[['date', 'ncumul_conf', 'current_conf', 'iqr', 'factor', 'upper_limit']])
82 | print('')
83 |
84 | if fail:
85 | sys.exit(1)
86 |
87 |
--------------------------------------------------------------------------------
/scrapers/test/test_dates.py:
--------------------------------------------------------------------------------
1 | from scrapers.scrape_dates import parse_date
2 |
3 | def test_dates():
4 | date_tests = [
5 | ('20. März 2020 15.00 Uhr', '2020-03-20T15:00'),
6 | ('21. März 2020, 10 Uhr', '2020-03-21T10:00'),
7 | ('21. März 2020, 11:00 Uhr', '2020-03-21T11:00'),
8 | ('21.03.2020, 15h30', '2020-03-21T15:30'),
9 | ('21. März 2020, 8.00 Uhr', '2020-03-21T08:00'),
10 | ('21. März 2020, 18.15 Uhr', '2020-03-21T18:15'),
11 | ('21. März 2020, 18.15 Uhr', '2020-03-21T18:15'),
12 | ('21. März 2020, 14.00 Uhr', '2020-03-21T14:00'),
13 | ('23. März 2020, 15 Uhr', '2020-03-23T15:00'),
14 | ('18. April 2020,16.00 Uhr', '2020-04-18T16:00'),
15 | ('21. März 2020', '2020-03-21T'),
16 | ('21.3.20', '2020-03-21T'),
17 | ('20.3.2020, 16.30', '2020-03-20T16:30'),
18 | ('21.03.2020, 15h30', '2020-03-21T15:30'),
19 | ('23.03.2020, 12:00', '2020-03-23T12:00'),
20 | ('23.03.2020 12:00', '2020-03-23T12:00'),
21 | ('08.04.2020: 09.30 Uhr', '2020-04-08T09:30'),
22 | ('07.04.2020 15.00h', '2020-04-07T15:00'),
23 | ('31.03.20, 08.00 h', '2020-03-31T08:00'),
24 | ('20.03.2020', '2020-03-20T'),
25 | ('21 mars 2020 (18h)', '2020-03-21T18:00'),
26 | ('1er avril 2020 (16h)', '2020-04-01T16:00'),
27 | ('21 mars 2020', '2020-03-21T'),
28 | ('6avril2020', '2020-04-06T'),
29 | ('20.03 à 8h00', '2020-03-20T08:00'),
30 | ('23.03 à 12h', '2020-03-23T12:00'),
31 | ('21 marzo 2020, ore 8.00', '2020-03-21T08:00'),
32 | ('27.03.2020 ore 08:00', '2020-03-27T08:00'),
33 | ('2020-03-23', '2020-03-23T'),
34 | ('24.3. / 10h', '2020-03-24T10:00'),
35 | ('2020-03-23T15:00:00', '2020-03-23T15:00'),
36 | ('2020-03-23 15:00:00', '2020-03-23T15:00'),
37 | ('2020-03-23 15:00', '2020-03-23T15:00'),
38 | ('30.04.2020,13.30 Uhr', '2020-04-30T13:30'),
39 | ('1.Mai 2020', '2020-05-01T'),
40 | ('05-05-2020 00:00', '2020-05-05T00:00'),
41 | ('07.05.2020, 00;00 Uhr', '2020-05-07T00:00'),
42 | ('17.06.2020 um 8 Uhr', '2020-06-17T08:00'),
43 | ('08.07.2020, um 8 Uhr', '2020-07-08T08:00'),
44 | ('8. Juli 2020 um 14:30 Uhr', '2020-07-08T14:30'),
45 | ('17.07.20 08:00', '2020-07-17T08:00'),
46 | ('12. 8. 2020', '2020-08-12T'),
47 | ('1er septembre 2020', '2020-09-01T'),
48 | ]
49 | for text, date in date_tests:
50 | assert parse_date(text) == date, f"parse_date('{text}') = '{parse_date(text)}', but expected '{date}'"
51 |
52 | if __name__ == "__main__":
53 | test_dates()
54 |
--------------------------------------------------------------------------------
/.github/workflows/run_district_scrapers.yml:
--------------------------------------------------------------------------------
1 | name: Run district scrapers
2 |
3 | on:
4 | schedule:
5 | - cron: '10 * * * *' # run every hour at xx:10
6 | workflow_dispatch: ~
7 | jobs:
8 | run_scraper:
9 | runs-on: ubuntu-20.04
10 | continue-on-error: false
11 | timeout-minutes: 10
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | canton:
16 | #- AG
17 | - BE
18 | #- BL
19 | #- FR
20 | #- GR
21 | - SG
22 | #- SO
23 | #- SZ
24 | - TG
25 | #- VS
26 |
27 | steps:
28 | - uses: actions/checkout@v3
29 |
30 | - name: Set up Python 3.7
31 | uses: actions/setup-python@v4
32 | with:
33 | python-version: 3.7
34 | - run: npm ci
35 | - name: Remove broken apt repos
36 | run: |
37 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
38 | - name: Install dependencies
39 | env:
40 | SCRAPER_KEY: ${{ matrix.canton }}
41 | run: |
42 | python -m pip install --upgrade pip setuptools wheel
43 | pip install -r requirements.txt
44 | sudo apt update || true # do not fail if update does not work
45 | sudo apt-get install sqlite3 poppler-utils
46 | if [ "$SCRAPER_KEY" = "AG" ] ; then
47 | pip install -r requirements-ocr.txt
48 | sudo apt-get install tesseract-ocr=3.04.01-4
49 | fi
50 |
51 | - name: Scrape new data
52 | env:
53 | SCRAPER_KEY: ${{ matrix.canton }}
54 | run: |
55 | ./scrapers/run_district_scraper.sh
56 |
57 | - name: Check if there are changes in the repo
58 | run: |
59 | if git diff -w --no-ext-diff --quiet
60 | then
61 | echo "changed=0" >> $GITHUB_OUTPUT
62 | else
63 | echo "changed=1" >> $GITHUB_OUTPUT
64 | fi
65 | id: changes
66 |
67 | - name: Set commit message
68 | env:
69 | SCRAPER_KEY: ${{ matrix.canton }}
70 | run: |
71 | echo "commit_msg=Update fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv from scraper" >> $GITHUB_ENV
72 |
73 | - name: Commit and push to repo
74 | if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
75 | uses: github-actions-x/commit@v2.9
76 | with:
77 | github-token: ${{ secrets.GITHUB_TOKEN }}
78 | push-branch: master
79 | name: GitHub Action Scraper
80 | email: scraper@open.zh.ch
81 | commit-message: ${{ env.commit_msg }}
82 | rebase: 'true'
83 |
84 | - name: Get current unix timestamp
85 | if: always()
86 | id: date
87 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT
88 |
89 | - name: Notify slack failure
90 | if: ${{ failure() || cancelled() }}
91 | env:
92 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
93 | uses: pullreminders/slack-action@master
94 | with:
95 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run district scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: District scraper failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
96 |
97 |
--------------------------------------------------------------------------------
/.github/workflows/run_tests_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Run tests scrapers
2 |
3 | on:
4 | schedule:
5 | - cron: '20 * * * *' # run every hour at xx:20
6 | workflow_dispatch: ~
7 | jobs:
8 | run_scraper:
9 | runs-on: ubuntu-20.04
10 | continue-on-error: false
11 | timeout-minutes: 10
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | canton:
16 | # - AG
17 | - BE
18 | - BL
19 | - BS
20 | - FL
21 | # - FR # data no longer published
22 | # - GE
23 | - GL
24 | # - JU # disabled until the PDF is fixed
25 | # - NW
26 | - SG
27 | - SH
28 | # - SO
29 | - TG
30 | # - TI # data no longer published
31 | # - VD
32 | - VS
33 | - ZG
34 | - ZH
35 |
36 | steps:
37 | - uses: actions/checkout@v3
38 |
39 | - name: Set up Python 3.7
40 | uses: actions/setup-python@v4
41 | with:
42 | python-version: 3.7
43 | - run: npm ci
44 | - name: Remove broken apt repos
45 | run: |
46 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
47 | - name: Install dependencies
48 | env:
49 | SCRAPER_KEY: ${{ matrix.canton }}
50 | run: |
51 | python -m pip install --upgrade pip setuptools wheel
52 | pip install -r requirements.txt
53 | sudo apt update || true # do not fail if update does not work
54 | sudo apt-get install sqlite3 poppler-utils
55 | if [ "$SCRAPER_KEY" = "GE" ] ; then
56 | sudo apt-get install chromium-browser
57 | fi
58 |
59 | - name: Scrape new data
60 | env:
61 | SCRAPER_KEY: ${{ matrix.canton }}
62 | run: |
63 | ./scrapers/run_tests_scraper.sh
64 |
65 | - name: Check if there are changes in the repo
66 | run: |
67 | if git diff -w --no-ext-diff --quiet
68 | then
69 | echo "changed=0" >> $GITHUB_OUTPUT
70 | else
71 | echo "changed=1" >> $GITHUB_OUTPUT
72 | fi
73 | id: changes
74 |
75 | - name: Set commit message
76 | env:
77 | SCRAPER_KEY: ${{ matrix.canton }}
78 | run: |
79 | if [ "$SCRAPER_KEY" = "FL" ] ; then
80 | echo "commit_msg=Update fallzahlen_${SCRAPER_KEY}_tests.csv from scraper" >> $GITHUB_ENV
81 | else
82 | echo "commit_msg=Update fallzahlen_kanton_${SCRAPER_KEY}_tests.csv from scraper" >> $GITHUB_ENV
83 | fi
84 |
85 | - name: Commit and push to repo
86 | if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
87 | uses: github-actions-x/commit@v2.9
88 | with:
89 | github-token: ${{ secrets.GITHUB_TOKEN }}
90 | push-branch: master
91 | name: GitHub Action Scraper
92 | email: scraper@open.zh.ch
93 | commit-message: ${{ env.commit_msg }}
94 | rebase: 'true'
95 |
96 | - name: Get current unix timestamp
97 | if: always()
98 | id: date
99 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT
100 |
101 | - name: Notify slack failure
102 | if: ${{ failure() || cancelled() }}
103 | env:
104 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
105 | uses: pullreminders/slack-action@master
106 | with:
107 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run tests scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: Tests scraper failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
108 |
109 |
--------------------------------------------------------------------------------
/fallzahlen_bezirke/fallzahlen_kanton_AG_bezirk.csv:
--------------------------------------------------------------------------------
1 | DistrictId,District,Canton,Date,Week,Year,Population,TotalConfCases,NewConfCases,TotalDeaths,NewDeaths,SourceUrl
2 | 1901,Aarau,AG,2020-10-26,,,79702,353,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
3 | 1901,Aarau,AG,2020-11-04,,,79702,527,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
4 | 1901,Aarau,AG,2020-11-13,,,79702,527,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
5 | 1902,Baden,AG,2020-10-26,,,145696,735,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
6 | 1902,Baden,AG,2020-11-04,,,145696,1079,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
7 | 1902,Baden,AG,2020-11-13,,,145696,1079,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
8 | 1903,Bremgarten,AG,2020-10-26,,,78745,277,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
9 | 1903,Bremgarten,AG,2020-11-04,,,78745,430,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
10 | 1903,Bremgarten,AG,2020-11-13,,,78745,430,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
11 | 1904,Brugg,AG,2020-10-26,,,51814,179,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
12 | 1904,Brugg,AG,2020-11-04,,,51814,270,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
13 | 1904,Brugg,AG,2020-11-13,,,51814,270,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
14 | 1905,Kulm,AG,2020-10-26,,,42412,153,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
15 | 1905,Kulm,AG,2020-11-04,,,42412,232,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
16 | 1905,Kulm,AG,2020-11-13,,,42412,232,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
17 | 1906,Laufenburg,AG,2020-10-26,,,33035,96,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
18 | 1906,Laufenburg,AG,2020-11-04,,,33035,130,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
19 | 1906,Laufenburg,AG,2020-11-13,,,33035,130,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
20 | 1907,Lenzburg,AG,2020-10-26,,,64792,261,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
21 | 1907,Lenzburg,AG,2020-11-04,,,64792,378,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
22 | 1907,Lenzburg,AG,2020-11-13,,,64792,378,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
23 | 1908,Muri,AG,2020-10-26,,,37170,152,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
24 | 1908,Muri,AG,2020-11-04,,,37170,213,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
25 | 1908,Muri,AG,2020-11-13,,,37170,213,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
26 | 1909,Rheinfelden,AG,2020-10-26,,,47926,158,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
27 | 1909,Rheinfelden,AG,2020-11-04,,,47926,235,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
28 | 1909,Rheinfelden,AG,2020-11-13,,,47926,235,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
29 | 1910,Zofingen,AG,2020-10-26,,,73136,271,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
30 | 1910,Zofingen,AG,2020-11-04,,,73136,408,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
31 | 1910,Zofingen,AG,2020-11-13,,,73136,408,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
32 | 1911,Zurzach,AG,2020-10-26,,,34650,127,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
33 | 1911,Zurzach,AG,2020-11-04,,,34650,206,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
34 | 1911,Zurzach,AG,2020-11-13,,,34650,206,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
35 |
--------------------------------------------------------------------------------
/scrapers/add_district_db_entry.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
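# Reads DistrictData lines (as printed by the scrape_*_districts.py scrapers) from standard
# input and inserts or updates them in the local data.sqlite database, e.g. (illustrative):
#   python scrape_tg_districts.py | python add_district_db_entry.py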
3 | import sys
4 | import sqlite3
5 | import traceback
6 | import os
7 |
8 | import db_common as dc
9 | import scrape_common as sc
10 |
11 | __location__ = dc.get_location()
12 |
13 | input_failures = 0
14 |
15 | try:
16 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
17 | conn = sqlite3.connect(DATABASE_NAME)
18 |
19 | i = 0
20 | for line in sys.stdin:
21 | dd = sc.DistrictData()
22 | if dd.parse(line.strip()):
23 | c = conn.cursor()
24 | try:
25 | print(dd)
26 |
27 | c.execute(
28 | '''
29 | INSERT INTO data (
30 | DistrictId,
31 | District,
32 | Canton,
33 | Date,
34 | Week,
35 | Year,
36 | Population,
37 | TotalConfCases,
38 | NewConfCases,
39 | TotalDeaths,
40 | NewDeaths,
41 | SourceUrl
42 | )
43 | VALUES
44 | (?,?,?,?,?,?,?,?,?,?,?,?)
45 | ;
46 |
47 | ''',
48 | [
49 | dd.district_id,
50 | dd.district,
51 | dd.canton,
52 | dd.date or '',
53 | dd.week or '',
54 | dd.year or '',
55 | dd.population,
56 | dd.total_cases,
57 | dd.new_cases,
58 | dd.total_deceased,
59 | dd.new_deceased,
60 | dd.url,
61 | ]
62 | )
63 |
64 | print("Successfully added new entry.")
65 | except sqlite3.IntegrityError as e:
66 | # try UPDATE if INSERT didn't work (i.e. constraint violation)
67 | try:
68 | c.execute(
69 | '''
70 | UPDATE data SET
71 | Population = ?,
72 | TotalConfCases = ?,
73 | NewConfCases = ?,
74 | TotalDeaths = ?,
75 | NewDeaths = ?,
76 | SourceUrl = ?
77 | WHERE DistrictId = ?
78 | AND District = ?
79 | AND Canton = ?
80 | AND Date = ?
81 | AND Week = ?
82 | AND Year = ?
83 | ;
84 | ''',
85 | [
86 | dd.population,
87 | dd.total_cases,
88 | dd.new_cases,
89 | dd.total_deceased,
90 | dd.new_deceased,
91 | dd.url,
92 | dd.district_id,
93 | dd.district,
94 | dd.canton,
95 | dd.date or '',
96 | dd.week or '',
97 | dd.year or '',
98 | ]
99 | )
100 | print("Successfully updated entry.")
101 | except sqlite3.Error as e:
102 | print("Error: an error occured in sqlite3: ", e.args[0], file=sys.stderr)
103 | conn.rollback()
104 | input_failures += 1
105 | finally:
106 | conn.commit()
107 | except Exception as e:
108 | print("Error: %s" % e, file=sys.stderr)
109 | print(traceback.format_exc(), file=sys.stderr)
110 | sys.exit(1)
111 | finally:
112 | conn.close()
113 |
114 | if input_failures:
115 | print(f'input_failures: {input_failures}')
116 | sys.exit(1)
117 |
--------------------------------------------------------------------------------
/scrapers/convert_parsed_to_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Reads data in the format produced by ./parse_scrape_output.py
4 | # from standard input and converts it into a CSV file on standard output.
5 | #
6 | # Example usage:
7 | # ./meta_scrape.sh | ./convert_parsed_to_csv.py > latest.csv
8 | # ./scrape_vd.sh | ./parse_scrape_output.py | ./convert_parsed_to_csv.py > vd.csv
9 | # cat *0.txt | ./convert_parsed_to_csv.py > full_history.csv
10 | #
11 | # See README.md for details about columns defined in CSV format.
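# Illustrative example: the AR input line shown further below would roughly become the row
#   2020-03-23,10:00,AR,,30,,,,,1,Scraper for AR at 2020-03-23T19:12:09+01:00 using https://www.ai.ch/...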
12 |
13 | import csv
14 | import re
15 | import sys
16 |
17 | # See README.md for more details about these fields.
18 | field_names = [
19 | 'date',
20 | 'time',
21 | 'abbreviation_canton_and_fl',
22 | 'ncumul_tested',
23 | 'ncumul_conf',
24 | 'ncumul_hosp', # Actually not cumulative.
25 | 'ncumul_ICU', # Actually not cumulative.
26 | 'ncumul_vent', # Actually not cumulative.
27 | 'ncumul_released',
28 | 'ncumul_deceased',
29 | 'source',
30 | ]
31 |
32 | writer = csv.DictWriter(sys.stdout, field_names,
33 | delimiter=',',
34 | quotechar='"',
35 | lineterminator='\n',
36 | quoting=csv.QUOTE_MINIMAL)
37 |
38 | writer.writeheader()
39 |
40 | input_failures = 0
41 | for line in sys.stdin:
42 | l = line.strip()
43 |
44 | # AR 2020-03-23T10:00 30 1 OK 2020-03-23T19:12:09+01:00 https://www.ai.ch/themen/gesundheit-alter-und-soziales/gesundheitsfoerderung-und-praevention/uebertragbare-krankheiten/coronavirus
45 | # GE 2020-03-27T 1924 23 OK 2020-03-28T18:57:34+01:00 # Extras: ncumul_hosp=313,ncumul_ICU=54 # URLs: https://www.ge.ch/document/point-coronavirus-maladie-covid-19/telecharger
46 |
47 | # Groups: 1 2 3 4 5 6 7 8
48 | match = re.search(r'^([A-Z][A-Z])\s+((?:\d\d\d\d-\d\d-\d\d)T(?:\d\d:\d\d)?)\s+(\d+)\s+(\d+|-)\s+OK\s+([0-9:\+\-\.T]+)(?:\s+# Extras: ([^#]+))?(?:\s+(?:(# URLs: )?(h.+)))?(?:\s+(http.+))?$', l)
49 | if not match:
50 | input_failures += 1
51 | print(f"Failed to parse line: {l}", file=sys.stderr)
52 | continue
53 |
54 | abbr = match.group(1)
55 |
56 | date_part = match.group(2).split('T', 2)
57 |
58 | data = {
59 | 'date': date_part[0],
60 | 'time': None,
61 | 'abbreviation_canton_and_fl': abbr,
62 | 'ncumul_tested': None,
63 | 'ncumul_conf': int(match.group(3)),
64 | 'ncumul_hosp': None,
65 | 'ncumul_ICU': None,
66 | 'ncumul_vent': None,
67 | 'ncumul_released': None,
68 | 'ncumul_deceased': None,
69 | 'source': '',
70 | }
71 |
72 | if len(date_part) == 2:
73 | data['time'] = date_part[1]
74 |
75 | if match.group(4) != '-':
76 | data['ncumul_deceased'] = int(match.group(4))
77 |
78 | scrape_time = match.group(5)
79 |
80 | url_sources = match.group(7)
81 | if match.group(8):
82 | url_sources = match.group(8)
83 | if url_sources:
84 | data['source'] = f'Scraper for {abbr} at {scrape_time} using {url_sources}'
85 | else:
86 | data['source'] = f'Scraper for {abbr} at {scrape_time}'
87 |
88 | # Parse optional data.
89 | extras_list = match.group(6)
90 | if extras_list:
91 | try:
92 | extras = extras_list.strip()
93 | extras = extras.split(',')
94 | extras = { kv.split('=', 2)[0]: int(kv.split('=', 2)[1]) for kv in extras }
95 | # data.update(extras)
96 | for k in ['ncumul_hosp', 'ncumul_ICU', 'ncumul_vent', 'ncumul_released', 'new_hosp', 'current_hosp']:
97 | if k in extras:
98 | data[k] = extras[k]
99 | except Exception as e:
100 | input_failures += 1
101 | print(f'Error: Parsing optional data failed, ignoring: {extras_list}', file=sys.stderr)
102 |
103 | # print(data)
104 | writer.writerow(data)
105 |
106 | sys.stdout.flush()
107 |
108 | if input_failures:
109 | sys.exit(1)
110 |
--------------------------------------------------------------------------------
/scrapers/scrape_gr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import datetime
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | is_first = True
10 |
11 | url = 'https://www.gr.ch/DE/institutionen/verwaltung/djsg/ga/coronavirus/info/Seiten/Start.aspx'
12 | data = sc.download(url, silent=True)
13 | data = re.sub(r"(\d+)'(\d+)", r'\1\2', data)  # strip thousands separators between digits (e.g. 5'000 -> 5000)
14 | soup = BeautifulSoup(data, 'html.parser')
15 | elem = soup.find('h2', text=re.compile(r'Fallzahlen\s+Kanton.*'))
16 | if elem is not None:
17 | table = elem.find_next('table')
18 | body = table.find('tbody')
19 | for row in body.find_all('tr'):
20 | tds = row.find_all('td')
21 |
22 | if not is_first:
23 | print('-' * 10)
24 | is_first = False
25 |
26 | dd = sc.DayData(canton='GR', url=url)
27 | dd.datetime = tds[0].text
28 | dd.cases = tds[1].text
29 | dd.isolated = tds[3].text
30 | dd.quarantined = tds[4].text
31 | dd.deaths = tds[6].text
32 | dd.hospitalized = tds[8].text
33 | dd.icu = tds[10].text
34 | dd.vent = tds[11].text
35 | print(dd)
36 |
37 |
38 | json_url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Total_Kanton/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=none&outFields=*&returnHiddenFields=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=Eingangs_Datum&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=standard&f=pjson'
39 | data = sc.jsondownload(json_url, silent=True)
40 |
41 | # 2020-04-02
42 | """
43 | features: [
44 | {
45 | attributes: {
46 | Eingangs_Datum: 1582675200000,
47 | Anzahl_Fälle_total__kumuliert_: 2,
48 | Neue_Faelle: 2,
49 | Neue_aktive_Fälle: 2,
50 | Anzahl_aktive_Fälle_total: 2,
51 | Anzahl_Personen_in_Isolation: 0,
52 | Anzahl_Personen_in_Quarantäne: 0,
53 | Verstorbene: 0,
54 | Verstorbene__kumuliert_: 0,
55 | Neue_Hospitalisierungen: 0,
56 | Hospitalisiert_Total: 0,
57 | Neu_Pflege: 0,
58 | Hospitalisiert_Pflege: 0,
59 | Neu_IPS: 0,
60 | Hospialisiert_IPS: 0,
61 | Neu_IPS_beatmet: 0,
62 | Hospitalisiert_IPS_beatmet: 0,
63 | FID: 1
64 | }
65 | },
66 | {
67 | attributes: {
68 | Eingangs_Datum: 1582761600000,
69 | Anzahl_Fälle_total__kumuliert_: 2,
70 | Neue_Faelle: 0,
71 | Neue_aktive_Fälle: 0,
72 | Anzahl_aktive_Fälle_total: 2,
73 | Anzahl_Personen_in_Isolation: 0,
74 | Anzahl_Personen_in_Quarantäne: 0,
75 | Verstorbene: 0,
76 | Verstorbene__kumuliert_: 0,
77 | Neue_Hospitalisierungen: 0,
78 | Hospitalisiert_Total: 0,
79 | Neu_Pflege: 0,
80 | Hospitalisiert_Pflege: 0,
81 | Neu_IPS: 0,
82 | Hospialisiert_IPS: 0,
83 | Neu_IPS_beatmet: 0,
84 | Hospitalisiert_IPS_beatmet: 0,
85 | FID: 2
86 | }
87 | },
88 | """
89 |
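# Eingangs_Datum is an epoch timestamp in milliseconds; for example,
# 1582675200000 / 1000 corresponds to 2020-02-26, the date the conversion below yields.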
90 | assert 'features' in data, "JSON did not contain `features` key"
91 |
92 | for feature in data['features']:
93 | row = feature['attributes']
94 | if not is_first:
95 | print('-' * 10)
96 | is_first = False
97 |
98 | dd = sc.DayData(canton='GR', url=json_url)
99 | dd.datetime = datetime.datetime.fromtimestamp(row['Eingangs_Datum'] / 1000).date().isoformat()
100 | dd.cases = row['Anzahl_Fälle_total__kumuliert_']
101 | dd.hospitalized = row['Hospitalisiert_Total']
102 | dd.icu = row['Hospialisiert_IPS']
103 | dd.vent = row['Hospitalisiert_IPS_beatmet']
104 | # Neue_Hospitalisierungen currently does not match our definition of new_hosp:
105 | # GR provides it as a calculated field, the difference between
106 | # yesterday's and today's hospitalized counts
107 | #dd.new_hosp = row['Neue_Hospitalisierungen']
108 | dd.deaths = row['Verstorbene__kumuliert_']
109 | dd.isolated = row['Anzahl_Personen_in_Isolation']
110 | dd.quarantined = row['Anzahl_Personen_in_Quarantäne']
111 | print(dd)
112 |
--------------------------------------------------------------------------------
/fallzahlen_tests/fallzahlen_kanton_JU_tests.csv:
--------------------------------------------------------------------------------
1 | canton,start_date,end_date,week,year,positive_tests,negative_tests,total_tests,positivity_rate,source,pcr_positive_tests,pcr_negative_tests,pcr_total_tests,pcr_positivity_rate,ag_positive_tests,ag_negative_tests,ag_total_tests,ag_positivity_rate
2 | JU,,,43,2020,179,,719,25.0,https://www.jura.ch/Htdocs/Files/v/35815.pdf,,,,,,,,
3 | JU,,,44,2020,219,,1064,23.0,https://www.jura.ch/Htdocs/Files/v/35911.pdf,,,,,,,,
4 | JU,,,45,2020,418,,1590,27.0,https://www.jura.ch/Htdocs/Files/v/35986.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem45_vf.pdf?download=1,,,,,,,,
5 | JU,,,46,2020,252,,1130,24.0,https://www.jura.ch/Htdocs/Files/v/36049.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem46_vf.pdf?download=1,,,,,,,,
6 | JU,,,47,2020,203,,853,25.0,https://www.jura.ch/Htdocs/Files/v/36126.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/rapport_hebdo_COVID_JU_sem47_vf.pdf?download=1,,,,,,,,
7 | JU,,,48,2020,158,,736,22.0,https://www.jura.ch/Htdocs/Files/v/36196.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem48.pdf,,,,,,,,
8 | JU,,,49,2020,136,,882,15.0,https://www.jura.ch/Htdocs/Files/v/36338.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem49_vf.pdf,,,,,,,,
9 | JU,,,50,2020,145,,1125,13.0,https://www.jura.ch/Htdocs/Files/v/36416.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem50.pdf,,,,,,,,
10 | JU,,,51,2020,242,,1552,16.0,https://www.jura.ch/Htdocs/Files/v/36492.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem51.pdf,,,,,,,,
11 | JU,,,52,2020,144,,1072,13.0,https://www.jura.ch/Htdocs/Files/v/36498.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem52.pdf,,,,,,,,
12 | JU,,,53,2020,244,,1235,20.0,https://www.jura.ch/Htdocs/Files/v/36536.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem53.pdf,,,,,,,,
13 | JU,,,1,2021,246,,1143,22.0,https://www.jura.ch/Htdocs/Files/v/36563.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem1_2021.pdf,,,,,,,,
14 | JU,,,2,2021,215,,1231,17.0,https://www.jura.ch/Htdocs/Files/v/36660.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem2_2021_corr.pdf,,,,,,,,
15 | JU,,,3,2021,179,,1117,16.0,https://www.jura.ch/Htdocs/Files/v/36720.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem3_2021_vf.pdf,,,,,,,,
16 | JU,,,4,2021,207,,1448,14.0,https://www.jura.ch/Htdocs/Files/v/36790.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem4_2021.pdf,,,,,,,,
17 | JU,,,5,2021,127,,1877,7.0,https://www.jura.ch/Htdocs/Files/v/36821.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem5_2021.pdf,,,,,,,,
18 | JU,,,6,2021,127,,1342,9.0,https://www.jura.ch/Htdocs/Files/v/36872.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem6_2021.pdf,,,,,,,,
19 | JU,,,7,2021,143,,1500,10.0,https://www.jura.ch/Htdocs/Files/v/36918.pdf/Departements/CHA/SIC/Communiques/2021/rapport_hebdo_COVID_JU_sem7_2021.pdf,,,,,,,,
20 | JU,,,8,2021,151,,969,13.0,https://www.jura.ch/Htdocs/Files/v/36986.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem8_2021.pdf,,,,,,,,
21 | JU,,,9,2021,154,,927,14.0,https://www.jura.ch/Htdocs/Files/v/37064.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem9_2021.pdf,,,,,,,,
22 | JU,,,10,2021,80,,1099,7.0,https://www.jura.ch/Htdocs/Files/v/37125.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem10_2021.pdf,,,,,,,,
23 | JU,,,11,2021,97,,1383,7.0,https://www.jura.ch/Htdocs/Files/v/37180.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem11_2021.pdf,,,,,,,,
24 | JU,,,12,2021,104,,1715,6.0,https://www.jura.ch/Htdocs/Files/v/37241.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport-hebdomadaire.pdf,,,,,,,,
25 | JU,,,13,2021,148,,2116,7.0,https://www.jura.ch/Htdocs/Files/v/37276.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/Rapport_hebdo_COVID_sem13.pdf,,,,,,,,
26 | JU,,,14,2021,110,,1205,8.0,https://www.jura.ch/Htdocs/Files/v/37332.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport-hebdo-COVID-sem14.pdf,,,,,,,,
27 |
--------------------------------------------------------------------------------
/scrapers/scrape_bl_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 | import scrape_bl_common as sbc
7 | from collections import defaultdict, OrderedDict
8 | from datetime import datetime
9 |
10 | main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft"
11 | main_site = sc.download(main_url, silent=True)
12 |
13 | # 2020-04-08, two iframes
14 | """
15 |
16 |
17 | """
18 |
19 |
20 | def parse_row_date(s):
21 | return sbc.parse_bl_date(s)[0]
22 |
23 |
24 | rows = defaultdict(dict)
25 | soup = BeautifulSoup(main_site, 'html.parser')
26 | for iframe in soup.find_all('iframe'):
27 | iframe_url = (iframe['src'])
28 |
29 | if iframe_url.find('/dbw/360') <= 0:
30 | continue
31 |
32 | d = sc.download(iframe_url, silent=True)
33 |
34 | # 2020-07-29
35 | """
36 |
37 | Datum,"Personen in Isolation","Personen in Quarantäne (Tracing)","Personen in Quarantäne (Rückreise Risikoländer)"
38 | 11-05-2020,0.0,0.0,
39 | """
40 |
41 | d = d.replace('\n', ' ')
42 |
43 | # district data!
44 | data = sc.find(r' ?Datum,"Bezirk Arlesheim","Bezirk Laufen","Bezirk Liestal","Bezirk Sissach","Bezirk Waldenburg"\s*([^<]+)', d)
45 | if data:
46 | # take "Fallzahlen Bezirke BL ab Juni 2020", but not the 14d averaged one
47 | for row in data.split(" "):
48 | c = row.split(',')
49 | assert len(c) == 6, f"Number of fields changed, {len(c)} != 6"
50 | row_date = parse_row_date(c[0])
51 | rows[row_date]['date'] = row_date
52 | rows[row_date]['Arlesheim'] = sc.safeint(c[1])
53 | rows[row_date]['Laufen'] = sc.safeint(c[2])
54 | rows[row_date]['Liestal'] = sc.safeint(c[3])
55 | rows[row_date]['Sissach'] = sc.safeint(c[4])
56 | rows[row_date]['Waldenburg'] = sc.safeint(c[5])
57 | break
58 |
59 | assert rows, "Couldn't find district data in iframes"
60 |
61 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html
62 | district_ids = {
63 | 'Arlesheim': 1301,
64 | 'Laufen': 1302,
65 | 'Liestal': 1303,
66 | 'Sissach': 1304,
67 | 'Waldenburg': 1305,
68 | }
69 |
70 | # https://www.statistik.bl.ch/web_portal/1
71 | population = {
72 | 'Arlesheim': 157253,
73 | 'Laufen': 20141,
74 | 'Liestal': 61201,
75 | 'Sissach': 36051,
76 | 'Waldenburg': 16119,
77 | }
78 |
79 | # based on https://github.com/openZH/covid_19/issues/1185#issuecomment-709952315
80 | initial_cases = {
81 | 'Arlesheim': 0,
82 | 'Laufen': 0,
83 | 'Liestal': 0,
84 | 'Sissach': 0,
85 | 'Waldenburg': 0,
86 | }
87 |
88 | # order dict by key to ensure the most recent entry is last
89 | ordered_rows = OrderedDict(sorted(rows.items()))
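# new_cases is derived below as the difference between consecutive cumulative totals per district,
# e.g. (hypothetical values) a total of 10 on one day and 14 on the next yields new_cases = 4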
90 |
91 | #for row_date, row in ordered_rows.items():
92 | # for district, district_id in district_ids.items():
93 |
94 | for district, district_id in district_ids.items():
95 | last_total_cases_val = initial_cases[district]
96 | if district == 'Arlesheim':
97 | # 2020-05-31 is 527
98 | last_total_cases_val = 0
99 |
100 | for row_date, row in ordered_rows.items():
101 | dd = sc.DistrictData(canton='BL', district=district)
102 | dd.district_id = district_id
103 | dd.population = population[district]
104 | dd.url = main_url
105 | dd.date = row['date']
106 | dd.total_cases = row[district] + initial_cases[district]
107 | dd.new_cases = dd.total_cases - last_total_cases_val
108 | last_total_cases_val = dd.total_cases
109 | print(dd)
110 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from bs4 import BeautifulSoup
4 | import re
5 | import datetime
6 | import scrape_common as sc
7 | from scrape_dates import parse_date
8 | import cv2
9 | import pytesseract
10 | import numpy as np
11 | import tempfile
12 | import os
13 |
14 |
15 | districts = {
16 | 'Baden': {
17 | 'pattern': r'^Baden.*',
18 | 'district_id': '1902',
19 | 'population': 145696,
20 | },
21 | 'Muri': {
22 | 'pattern': r'^Muri.*',
23 | 'district_id': '1908',
24 | 'population': 37170,
25 | },
26 | 'Lenzburg': {
27 | 'pattern': r'^Lenzburg.*',
28 | 'district_id': '1907',
29 | 'population': 64792,
30 | },
31 | 'Zofingen': {
32 | 'pattern': r'^Zo.+ngen.*',
33 | 'district_id': '1910',
34 | 'population': 73136,
35 | },
36 | 'Aarau': {
37 | 'pattern': r'^Aarau.*',
38 | 'district_id': '1901',
39 | 'population': 79702,
40 | },
41 | 'Bremgarten': {
42 | 'pattern': r'^Bremga.+en.*',
43 | 'district_id': '1903',
44 | 'population': 78745,
45 | },
46 | 'Brugg': {
47 | 'pattern': r'^Brugg.*',
48 | 'district_id': '1904',
49 | 'population': 51814,
50 | },
51 | 'Kulm': {
52 | 'pattern': r'^Kulm.*',
53 | 'district_id': '1905',
54 | 'population': 42412,
55 | },
56 | 'Laufenburg': {
57 | 'pattern': r'^Laufen.*burg.*',
58 | 'district_id': '1906',
59 | 'population': 33035,
60 | },
61 | 'Rheinfelden': {
62 | 'pattern': r'^Rheinfelden.*',
63 | 'district_id': '1909',
64 | 'population': 47926,
65 | },
66 | 'Zurzach': {
67 | 'pattern': r'^Z.+zach.*',
68 | 'district_id': '1911',
69 | 'population': 34650,
70 | },
71 | }
72 |
73 | data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
74 | d = sc.download(data_url, silent=True)
75 | soup = BeautifulSoup(d, 'html.parser')
76 | img_caption = soup.find(string=re.compile(r".*Inzidenz pro 100'000 Einwohner nach Bezirke.*"))
77 | img_date = sc.find(r'\(Stand:?\s+(.*\d{4})', img_caption.string)
78 | img_date = datetime.datetime.fromisoformat(parse_date(img_date).split('T', 1)[0])
79 | img_url = img_caption.find_previous('img')['src']
80 | img_url = 'https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/bilder_11/daten/Inzidenz_pro_100K_Einwohner_content_large.jpg'
81 | if not img_url.startswith('http'):
82 | img_url = f'https://www.ag.ch{img_url}'
83 |
84 | # download the image to a temporary file
85 | _, path = tempfile.mkstemp(suffix='.jpg')
86 | sc.download_file(img_url, path)
87 |
88 | # convert to binary image
89 | img = cv2.imread(path)
90 | gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
91 | gray, img_bin = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
92 | gray = cv2.bitwise_not(img_bin)
93 |
94 | # improve image and extract text
95 | kernel = np.ones((2, 1), np.uint8)
96 | img = cv2.erode(gray, kernel, iterations=1)
97 | img = cv2.dilate(img, kernel, iterations=1)
98 | #cv2.imshow('img', img)
99 | #cv2.waitKey(0)
100 | custom_config = '--oem 3 --psm 6'
101 | text_in_img = pytesseract.image_to_string(img, config=custom_config)
102 |
103 | # delete the temp img file
104 | os.remove(path)
105 |
106 | def parse_line(line):
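# translate characters that the OCR step presumably misreads in the number columns
# (O -> 0, B -> 8, F -> 7, T -> 7) before converting them to integers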
107 | in_str = "OBFT"
108 | out_str = "0877"
109 | tab = str.maketrans(in_str, out_str)
110 | match = re.match(r'^(.*)\s+(?:[_-]\s+)?(\S+)\s+(\S+)\s+(\S+)$', line)
111 | if match:
112 | return (int(match[3].replace("'", "").translate(tab)), int(match[4].replace("'", "").translate(tab)))
113 | return (None, None)
114 |
115 | for name, config in districts.items():
116 | for line in text_in_img.split('\n'):
117 | dd = sc.DistrictData(canton='AG', district=name)
118 | dd.district_id = config['district_id']
119 | dd.url = data_url
120 | if re.search(config['pattern'], line, flags=re.I):
121 | population, total_cases = parse_line(line)
122 | assert population == config['population'], f"Population number for {name} does not match, {population} != {config['population']}"
123 | dd.date = img_date.date().isoformat()
124 | dd.population = population
125 | dd.total_cases = total_cases
126 | break
127 | assert dd, f"No data found for district {name}, Text: {text_in_img}"
128 | print(dd)
129 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 |
5 | import scrape_common as sc
6 | import scrape_vs_common as svc
7 |
8 | # get the latest weekly PDF
9 | url = svc.get_vs_latest_weekly_pdf_url()
10 |
11 | # fetch the PDF
12 | pdf = sc.download_content(url, silent=True)
13 | week, year = svc.get_vs_weekly_general_data(pdf)
14 |
15 | # second last page contains the district data
16 | pages = int(sc.pdfinfo(pdf))
17 | page = None
18 | for p in range(1, pages):
19 | content = sc.pdftotext(pdf, page=p, layout=True)
20 | if sc.find(r'(Geografische)\s+.*', content):
21 | page = p
22 | break
23 |
24 | assert page is not None, "no page with district data found"
25 | content = sc.pdftotext(pdf, page=page, layout=True, rect=[0, 443, 420, 50], fixed=2)
26 |
27 | # strip everything including the "Anzahl Faelle" column + values
28 | def strip_left_number(content):
29 | lines = content.split('\n')
30 | pos = None
31 | for line in lines:
32 | res = re.search(r'\s+(\d+) ', line)
33 | if res is not None:
34 | if pos is None:
35 | pos = res.end()
36 | else:
37 | pos = min(pos, res.end())
38 | new_content = []
39 | for line in lines:
40 | new_content.append(line[pos:])
41 | return '\n'.join(new_content)
42 |
43 |
44 | # strip from the right the "Inzidenz pro 100k Einwohner" column / description
45 | def strip_right_items(content):
46 | lines = content.split('\n')
47 | pos = None
48 | for line in lines:
49 | res = re.search(r'(\d+|\d+\.\d+)\s?$', line)
50 | if res is not None:
51 | if pos is None:
52 | pos = res.start()
53 | else:
54 | pos = max(pos, res.start())
55 | new_content = []
56 | for line in lines:
57 | new_content.append(line[:pos])
58 | return '\n'.join(new_content)
59 |
60 | # kill the left and right axis
61 | content = strip_left_number(content)
62 | # content = strip_right_items(content)
63 |
64 | # remove strange characters at the end of the string
65 | #content = content.rstrip()
66 |
67 | """
68 | this results in something like this (13 columns expected for the districts)
69 |
70 | 6.6
71 |
72 | 9 6 7 2 5 8 15 1 6 16
73 | """
74 |
75 | # approximate the width of each "column" in the table
76 | # take the maximum line length and divide it to approximate the width of the 13 district columns
77 | length = None
78 | for line in content.split('\n'):
79 | line_length = len(line)
80 | if length is None:
81 | length = line_length
82 | else:
83 | length = max(line_length, length)
84 | length = round(length / 14.5)
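# e.g. a longest line of 145 characters gives a column width of round(145 / 14.5) = 10,
# so district i is then read from the character slice [i * 10, (i + 1) * 10)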
85 |
86 | # split up all lines by the length and use the "lowest line" value
87 | district_values = []
88 | for i in range(0, 13):
89 | value = ''
90 | for line in content.split('\n'):
91 | val = line[i * length:(i + 1) * length].strip()
92 | if val != '':
93 | value = val
94 | if value == '':
95 | value = 0
96 | district_values.append(int(value))
97 |
98 |
99 | # this is the order in the PDF
100 | districts = [
101 | 'Goms',
102 | 'Raron',
103 | 'Brig',
104 | 'Visp',
105 | 'Leuk',
106 | 'Sierre',
107 | 'Herens',
108 | 'Sion',
109 | 'Conthey',
110 | 'Martigny',
111 | 'Entremont',
112 | 'St-Maurice',
113 | 'Monthey',
114 | ]
115 |
116 | district_ids = [
117 | 2304,
118 | 2309,
119 | 2301,
120 | 2313,
121 | 2306,
122 | 2311,
123 | 2305,
124 | 2312,
125 | 2302,
126 | 2307,
127 | 2303,
128 | 2310,
129 | 2308,
130 | ]
131 |
132 | population = [
133 | 4440,
134 | 10930,
135 | 26910,
136 | 28650,
137 | 12360,
138 | 49230,
139 | 10860,
140 | 47750,
141 | 28910,
142 | 47980,
143 | 15260,
144 | 13830,
145 | 46840,
146 | ]
147 |
148 |
149 | assert len(district_values) == 13, f'expected 13 district values, but got {len(district_values)} for {url}'
150 | i = 0
151 | for value in district_values:
152 | dd = sc.DistrictData(canton='VS', district=districts[i])
153 | dd.url = url
154 | dd.district_id = district_ids[i]
155 | dd.population = population[i]
156 | dd.week = week
157 | dd.year = year
158 | dd.new_cases = value
159 | print(dd)
160 | i += 1
161 |
--------------------------------------------------------------------------------
/scrapers/scrape_vd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | import sys
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import scrape_common as sc
10 | import scrape_vd_common as svc
11 |
12 |
13 | def parse_html():
14 | # https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/
15 | # includes content from datawrapper ( https://datawrapper.dwcdn.net/tr5bJ/14/ ),
16 | # which provides the actual data and table rendering.
17 | # Here we instead use the datawrapper API directly to fetch the data.
18 | main_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
19 | url = 'https://api.datawrapper.de/v3/charts/tr5bJ/data'
20 | print('Downloading:', main_url)
21 | # The bearer authentication token was provided by Alex Robert ( https://github.com/AlexBobAlex )
22 | data = requests.get(url,
23 | headers={'accept': 'text/csv',
24 | 'Authorization': 'Bearer 6868e7b3be4d7a69eff00b1a434ea37af3dac1e76f32d9087fc544dbb3f4e229'})
25 | d = data.text
26 |
27 | # Date Hospitalisations en cours Dont soins intensifs Sortis de l'hôpital Décès Total cas confirmés
28 | # 10.03.2020 36 8 5 1 130
29 | # 11.03.2020 38 7 5 3 200
30 |
31 | rows = d.split('\n')
32 |
33 | # Remove empty rows
34 | rows = [row for row in rows if len(row.strip())]
35 |
36 | headers = rows[0].split('\t')
37 | assert headers[0:6] == ["Date", "Hospitalisations en cours", "Dont soins intensifs", "Sortis de l'hôpital", "Décès", "Total cas confirmés"], f"Table header mismatch: Got: {headers}"
38 |
39 | is_first = True
40 | for row in rows:
41 | if not is_first:
42 | print('-' * 10)
43 | is_first = False
44 |
45 | cells = row.split('\t')
46 | print('VD')
47 | sc.timestamp()
48 | print('Downloading:', main_url)
49 | print('Date and time:', cells[0])
50 | print('Confirmed cases:', cells[5])
51 | print('Deaths:', cells[4])
52 | print('Hospitalized:', cells[1])
53 | print('ICU:', cells[2])
54 | if cells[3].isnumeric():
55 | print('Recovered:', cells[3])
56 |
57 |
58 | def parse_xlsx():
59 | html_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
60 | d = sc.download(html_url, silent=True)
61 | soup = BeautifulSoup(d, 'html.parser')
62 | xls_url = soup.find('a', string=re.compile("les donn.*es", flags=re.I)).get('href')
63 | assert xls_url, "URL is empty"
64 | xls = sc.xlsdownload(xls_url, silent=True)
65 | rows = [row for row in sc.parse_xls(xls, header_row=2) if isinstance(row['Date'], datetime.datetime)]
66 | is_first = True
67 | for row in sorted(rows, key=lambda row: row['Date'].date().isoformat()):
68 | if not is_first:
69 | print('-' * 10)
70 | is_first = False
71 |
72 | print('VD')
73 | sc.timestamp()
74 | print('Downloading:', html_url)
75 | print('Date and time:', row['Date'].date().isoformat())
76 | print('Confirmed cases:', row['Nombre total de cas confirmés positifs'])
77 | print('Hospitalized:', row['Hospitalisation en cours'])
78 | print('ICU:', row['Dont soins intensifs'])
79 | print('Deaths:', row['Décès parmi cas confirmés'])
80 |
81 |
82 | def text_to_int(text):
83 | return int(re.sub('[^0-9]', '', text))
84 |
85 |
86 | def parse_weekly_pdf():
87 | pdf_url = svc.get_weekly_pdf_url()
88 | pdf = sc.pdfdownload(pdf_url, silent=True)
89 |
90 | dd = sc.DayData(canton='VD', url=pdf_url)
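# the weekly bulletins typically contain a date line like "Situation au 12 avril 2021" (illustrative)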
91 | res = re.findall(r'Situation\s+au\s+(\d+\s+\w+\s+\d{4})', pdf, re.MULTILINE | re.DOTALL)
92 | if len(res) == 1:
93 | dd.datetime = res[0]
94 | dd.datetime = dd.datetime.replace('\n', ' ')
95 | if dd.datetime is None:
96 | dd.datetime = sc.find(r'Point .pid.miologique au (\d+\.\d+\.\d{4})', pdf)
97 | #dd.cases = text_to_int(sc.find(r'\s(\d+.\d+)\s+personnes ont .t. d.clar.es positives au SARS-CoV-2.', pdf))
98 | dd.hospitalized = sc.find(r'(\d+)\s+patients\s+(COVID-19\s+)?sont\s+(actuellement\s+)?hospitalis.s', pdf)
99 | dd.icu = sc.find(r'dont\s+(\d+)\s+en\s+soins\s+intensifs', pdf)
100 | assert dd
101 | print(dd)
102 | print('-' * 10)
103 |
104 |
105 | if __name__ == '__main__':
106 | parse_weekly_pdf()
107 | # parse_xlsx()
108 |
--------------------------------------------------------------------------------
/scripts/validate-csv.js:
--------------------------------------------------------------------------------
1 | const csv = require('csv-validator');
2 | const fs = require("fs").promises;
3 | const path = require("path");
4 |
5 | const csvFiles = process.argv.slice(2);
6 |
7 | const validateSequentially = async csvFiles => {
8 | //field names starting with `_` are optional
9 | const headers = {
10 | date: /^\d{4}-\d{2}-\d{2}$/,
11 | _time: /^(([0-1]?[0-9]|2[0-3]):[0-5][0-9])?$/,
12 | abbreviation_canton_and_fl: /^[A-Z]{2}$/,
13 | _ncumul_tested: /^(\d+)?$/,
14 | _ncumul_conf: /^(\d+)?$/,
15 | _new_hosp: /^(\d+)?$/,
16 | _current_hosp: /^(\d+)?$/,
17 | _current_icu: /^(\d+)?$/,
18 | _current_vent: /^(\d+)?$/,
19 | _ncumul_released: /^(\d+)?$/,
20 | _ncumul_deceased: /^(\d+)?$/,
21 | _source: '',
22 | _current_isolated: /^(\d+)?$/,
23 | _current_quarantined: /^(\d+)?$/
24 | };
25 | const requiredKeys = [
26 | "date",
27 | "time",
28 | "abbreviation_canton_and_fl",
29 | "ncumul_tested",
30 | "ncumul_conf",
31 | "new_hosp",
32 | "current_hosp",
33 | "current_icu",
34 | "current_vent",
35 | "ncumul_released",
36 | "ncumul_deceased",
37 | "source",
38 | "current_isolated",
39 | "current_quarantined"
40 | ];
41 |
42 | const cumulativeFields = [
43 | "ncumul_tested",
44 | "ncumul_conf",
45 | "ncumul_released",
46 | "ncumul_deceased"
47 | ];
48 |
49 |
50 | const csvCorrectionFilePath = path.resolve('correction_status.csv');
51 | const parsedCorrection = await csv(csvCorrectionFilePath, headers);
52 | let correction = {};
53 | parsedCorrection.forEach(function (item, index) {
54 | if (correction[item['date']] === undefined) {
55 | correction[item['date']] = {};
56 | }
57 | if (correction[item['date']][item['abbreviation_canton_and_fl']] === undefined) {
58 | correction[item['date']][item['abbreviation_canton_and_fl']] = {};
59 | }
60 | correction[item['date']][item['abbreviation_canton_and_fl']][item['column']] = 1;
61 | });
62 |
63 | let failedChecks = 0;
64 |
65 | for (let csvFile of csvFiles) {
66 | const csvFilePath = path.resolve(csvFile);
67 |
68 | try {
69 | // check if file can be parsed
70 | const parsed = await csv(csvFilePath, headers);
71 |
72 | //make sure all keys are present
73 | const hasAllKeys = requiredKeys.every(key => parsed[0].hasOwnProperty(key));
74 | if (!hasAllKeys) {
75 | throw new Error(`Required field missing`);
76 | }
77 |
78 | var last = {};
79 | var errors = [];
80 | var unique = {};
81 | var today = new Date();
82 | parsed.forEach(function (item, index) {
83 | // check if date is in the future
84 | var abbr = item['abbreviation_canton_and_fl'];
85 | var date = item['date'];
86 | var dateObj = new Date(date);
87 | if (dateObj.getTime() > today.getTime()) {
88 | errors.push(`Row ${index+2}: date ${date} is in the future.`);
89 | }
90 |
91 | // check if cumulative field only increase
92 | cumulativeFields.forEach(function(col, col_idx) {
93 | const skip = correction[date] !== undefined && correction[date][abbr] !== undefined && correction[date][abbr][col] !== undefined;
94 | if (col in last && last[col] && item[col] && parseInt(item[col]) < parseInt(last[col]) && !skip) {
95 | errors.push(`Row ${index+2}: cumulative field ${col}: ${item[col]} < ${last[col]}`);
96 | }
97 | if (item[col]) {
98 | last[col] = item[col];
99 | }
100 | });
101 |
102 |
103 | // check if there is only one entry per area and date
104 | if (!(date in unique)) {
105 | unique[date] = {};
106 | }
107 | if (abbr in unique[date]) {
108 | unique[date][abbr] += 1;
109 | errors.push(`Row ${index+2}: duplicate entry for date ${date}`);
110 | } else {
111 | unique[date][abbr] = 1;
112 | }
113 | });
114 | if (errors.length > 0) {
115 | throw new Error(errors.join("\n"));
116 | }
117 | } catch (e) {
118 | failedChecks++;
119 | console.log(`× ${csvFile} failed the following checks:\n${e}`);
120 | continue;
121 | }
122 | console.log(`✓ ${csvFile} is valid.`);
123 | }
124 |
125 | return failedChecks;
126 | };
127 |
128 | const run = async () => {
129 | const failedChecks = await validateSequentially(csvFiles);
130 |
131 | if (failedChecks > 0) {
132 | process.exit(1);
133 | }
134 | };
135 |
136 | run().catch(e => console.error(e));
137 |
--------------------------------------------------------------------------------