├── scrapers ├── __init__.py ├── test │ ├── __init__.py │ ├── test_district_data.py │ ├── test_test_data.py │ └── test_dates.py ├── .gitignore ├── scrape_nw_common.py ├── scrape_gl_common.py ├── scrape_ag_common.py ├── scrape_vd_common.py ├── scrape_fl_tests.py ├── scrape_zh_tests.py ├── scrape_be_tests.py ├── scrape_fr_common.py ├── validate_scraper_output.sh ├── scrape_so_common.py ├── db_common.py ├── scrape_fr_tests.py ├── scrape_nw_tests.py ├── scrape_zh.py ├── scrape_tg_districts.py ├── scrape_ti_tests.py ├── scrape_zg_tests.py ├── scrape_sh_tests.py ├── scrape_gl_tests.py ├── scrape_nw.py ├── run_district_scraper.sh ├── scrape_vs_common.py ├── run_tests_scraper.sh ├── meta_scrape.sh ├── scrape_sg_tests.py ├── scrape_vs_tests.py ├── test_tests_scraper.sh ├── scrape_tg.py ├── run_scraper.sh ├── test_district_scraper.sh ├── scrape_ne.py ├── scrape_ag_tests.py ├── scrape_ju_tests.py ├── scrape_ai.py ├── scrape_bl_common.py ├── scrape_fr.py ├── scrape_be_districts.py ├── scrape_ge_tests.py ├── test_scraper.sh ├── scrape_lu.py ├── scrape_sg_districts.py ├── scrape_bs.py ├── download.sh ├── scrape_tests.py ├── validate_scrapers.py ├── scrape_ge_common.py ├── populate_district_database.py ├── scrape_so_districts.py ├── scrape_sh.py ├── scrape_bs_tests.py ├── scrape_be.py ├── scrape_sz_districts.py ├── populate_database.py ├── scrape_gr_districts.py ├── populate_tests_database.py ├── scrape_tg_tests.py ├── scrape_ti.py ├── scrape_sz.py ├── certificate.pem ├── scrape_ag.py ├── scrape_vd_tests.py ├── scrape_so.py ├── scrape_fl.py ├── scrape_ow.py ├── scrape_vs.py ├── scrape_sh_common.py ├── scrape_fr_districts.py ├── scrape_bl_tests.py ├── scrape_ur.py ├── scrape_gl.py ├── scrape_so_tests.py ├── add_district_db_entry.py ├── convert_parsed_to_csv.py ├── scrape_gr.py ├── scrape_bl_districts.py ├── scrape_ag_districts.py ├── scrape_vs_districts.py └── scrape_vd.py ├── gd.png ├── logos.png ├── requirements-ocr.txt ├── dashboard └── dashboard.png ├── binder └── environment.yml ├── statistisches_amt_kt_zh.png ├── .gitignore ├── requirements.txt ├── setup.py ├── fallzahlen_bezirke ├── Readme.md └── fallzahlen_kanton_AG_bezirk.csv ├── fallzahlen_plz └── Readme.md ├── fallzahlen_kanton_total_csv_v2 └── README.md ├── COVID19_Fallzahlen_Kanton_ZH_isolated_quarantined.csv ├── COVID19_Fallzahlen_Kanton_ZH_Beispiel_alter_geschlecht_.csv ├── fallzahlen_kanton_alter_geschlecht_csv ├── COVID19_Fallzahlen_Kanton_AI_alter_geschlecht.csv ├── COVID19_Fallzahlen_Kanton_alter_geschlecht_BEISPIEL.csv ├── Readme.md ├── COVID19_Fallzahlen_Kanton_AR_alter_geschlecht.csv ├── COVID19_Fallzahlen_Kanton_AG_alter_geschlecht.csv └── COVID19_Einwohner_Kanton_ZH_altersklassen_geschlecht.csv ├── fallzahlen_kanton_zh ├── README.md └── COVID19_VOC_Kanton_ZH.csv ├── scripts ├── latest_total.sh ├── transform_all_new2old.sh ├── transform_all_add_columns.sh ├── transform_all_old2new.sh ├── check_for_empty_lines.sh ├── merge_canton_csvs.rb ├── update_dates_in_readme.sh ├── validate-schema.js ├── latest_per_canton.sh ├── new2oldcsv.py ├── old2newcsv.py ├── add_new_columns.py ├── remove_older_entries.py ├── check_for_outliers.py └── validate-csv.js ├── mappingCanton_BFS.csv ├── package.json ├── COVID19_Fallzahlen_Beispiel.csv ├── CONTRIBUTING.md ├── fallzahlen_kanton_total_csv └── README.md ├── .github └── workflows │ ├── rebase.yml │ ├── lint_python.yml │ ├── test_scraper.yml │ ├── activate_scraper.yml │ ├── deactivate_scraper.yml │ ├── test_tests_scraper.yml │ ├── test_district_scraper.yml │ ├── validate-csv.yml │ ├── 
run_district_scrapers.yml │ └── run_tests_scraper.yml ├── correction_status.csv └── fallzahlen_tests └── fallzahlen_kanton_JU_tests.csv /scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapers/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapers/.gitignore: -------------------------------------------------------------------------------- 1 | webarchiveorg.log 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /gd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openZH/covid_19/HEAD/gd.png -------------------------------------------------------------------------------- /logos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openZH/covid_19/HEAD/logos.png -------------------------------------------------------------------------------- /requirements-ocr.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.4.0.44 2 | numpy 3 | pytesseract 4 | -------------------------------------------------------------------------------- /dashboard/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openZH/covid_19/HEAD/dashboard/dashboard.png -------------------------------------------------------------------------------- /binder/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - matplotlib 5 | - pandas 6 | -------------------------------------------------------------------------------- /statistisches_amt_kt_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openZH/covid_19/HEAD/statistisches_amt_kt_zh.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | node_modules 3 | scrapers/data.sqlite 4 | *.pyc 5 | boxplot.png 6 | geckodriver.log 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | requests 3 | dateparser 4 | xlrd==1.2.0 5 | pytest 6 | pandas 7 | selenium 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name="scrapers", packages=find_packages()) 4 | -------------------------------------------------------------------------------- /fallzahlen_bezirke/Readme.md: -------------------------------------------------------------------------------- 1 | # Metadata 2 | 3 | See https://github.com/openZH/covid_19/tree/master#canton-zurich-districts-bezirk. 
4 | -------------------------------------------------------------------------------- /fallzahlen_plz/Readme.md: -------------------------------------------------------------------------------- 1 | # Metadata 2 | See: https://github.com/openZH/covid_19/tree/master#canton-zurich-postal-codes-postleitzahl. 3 | -------------------------------------------------------------------------------- /fallzahlen_kanton_total_csv_v2/README.md: -------------------------------------------------------------------------------- 1 | # Metadata 2 | Siehe: https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-unified-dataset. 3 | -------------------------------------------------------------------------------- /COVID19_Fallzahlen_Kanton_ZH_isolated_quarantined.csv: -------------------------------------------------------------------------------- 1 | date,abbreviation_canton_and_fl,current_isolated,current_quarantined 2 | 2020-05-26,ZH,14,58 3 | 2020-05-29,ZH,22,67 4 | 2020-06-02,ZH,18,47 5 | -------------------------------------------------------------------------------- /COVID19_Fallzahlen_Kanton_ZH_Beispiel_alter_geschlecht_.csv: -------------------------------------------------------------------------------- 1 | Date,Area,AgeYear,Gender,NewConfCases,NewDeaths,PreExistingCond 2 | 2020-03-01,Canton_ZH,30,F,1,0,0 3 | 2020-03-01,Canton_ZH,32,M,0,1,1 4 | 5 | -------------------------------------------------------------------------------- /fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AI_alter_geschlecht.csv: -------------------------------------------------------------------------------- 1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths 2 | 14.03.2020,Canton_AI,59,m,,1,, 3 | 14.03.2020,Canton_AI,57,f,,1,, 4 | -------------------------------------------------------------------------------- /fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_alter_geschlecht_BEISPIEL.csv: -------------------------------------------------------------------------------- 1 | Date,Area,AgeYear,Gender,NewConfCases,NewDeaths,PreExistingCond 2 | 2020-03-01,Canton_ZH,30,F,1,0,0 3 | 2020-03-01,Canton_ZH,32,M,0,1,1 4 | 5 | -------------------------------------------------------------------------------- /fallzahlen_kanton_zh/README.md: -------------------------------------------------------------------------------- 1 | # Metadata 2 | See: 3 | - https://github.com/openZH/covid_19/tree/master#canton-z%C3%BCrich-unified-dataset 4 | - https://github.com/openZH/covid_19/blob/master/README.md#canton-z%C3%BCrich-more-detailed-dataset. 
5 | -------------------------------------------------------------------------------- /scripts/latest_total.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | for f in *.csv; do 4 | # Output last row with non-zero commulative number of cases 5 | awk -F , '{if ($5) { print $1, $3, $5; }}' "$f" | tail -1 6 | done | awk 'BEGIN { sum = 0; } { sum += $3; } END { print sum; }' 7 | -------------------------------------------------------------------------------- /fallzahlen_kanton_alter_geschlecht_csv/Readme.md: -------------------------------------------------------------------------------- 1 | # Metadata 2 | 3 | See: 4 | - https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-more-detailed-dataset 5 | - https://github.com/openZH/covid_19/tree/master#canton-zurich-more-detailed-dataset 6 | -------------------------------------------------------------------------------- /scripts/transform_all_new2old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | DIR="$(cd "$(dirname "$0")" && pwd)" 4 | 5 | for f in $DIR/../fallzahlen_kanton_total_csv_v2/*.csv; 6 | do 7 | filename="$(basename "$f")" 8 | $DIR/new2oldcsv.py $f > $DIR/../fallzahlen_kanton_total_csv/$filename 9 | done 10 | -------------------------------------------------------------------------------- /mappingCanton_BFS.csv: -------------------------------------------------------------------------------- 1 | abk,bfs 2 | ZH,01 3 | BE,02 4 | LU,03 5 | UR,04 6 | SZ,05 7 | OW,06 8 | NW,07 9 | GL,08 10 | ZG,09 11 | FR,10 12 | SO,11 13 | BS,12 14 | BL,13 15 | SH,14 16 | AR,15 17 | AI,16 18 | SG,17 19 | GR,18 20 | AG,19 21 | TG,20 22 | TI,21 23 | VD,22 24 | VS,23 25 | NE,24 26 | GE,25 27 | JU,26 28 | FL,99 29 | -------------------------------------------------------------------------------- /fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AR_alter_geschlecht.csv: -------------------------------------------------------------------------------- 1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths 2 | 05.03.2020,Canton_AR,50,f,,1,, 3 | 09.03.2020,Canton_AR,,f,,1,, 4 | 12.03.2020,Canton_AR,69,f,,1,, 5 | 12.03.2020,Canton_AR,38,f,,1,, 6 | 12.03.2020,Canton_AR,42,f,,1,, 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "covid_19", 3 | "version": "1.0.0", 4 | "repository": "git@github.com:openZH/covid_19.git", 5 | "license": "MIT", 6 | "dependencies": { 7 | "csv-validator": "0.0.3" 8 | }, 9 | "scripts": { 10 | "test": "node scripts/validate-csv.js fallzahlen_kanton_total_csv_v2/*.csv" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /scripts/transform_all_add_columns.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | DIR="$(cd "$(dirname "$0")" && pwd)" 4 | 5 | for f in $DIR/../fallzahlen_kanton_total_csv_v2/*.csv; 6 | do 7 | filename="$(basename "$f")" 8 | $DIR/add_new_columns.py $f > /tmp/columnfile 9 | cat /tmp/columnfile > $DIR/../fallzahlen_kanton_total_csv_v2/$filename 10 | done 11 | -------------------------------------------------------------------------------- /scripts/transform_all_old2new.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | DIR="$(cd 
"$(dirname "$0")" && pwd)" 4 | 5 | mkdir -p $DIR/../fallzahlen_kanton_total_csv_v2 6 | 7 | for f in $DIR/../fallzahlen_kanton_total_csv/*.csv; 8 | do 9 | filename="$(basename "$f")" 10 | $DIR/old2newcsv.py $f > $DIR/../fallzahlen_kanton_total_csv_v2/$filename 11 | done 12 | -------------------------------------------------------------------------------- /scripts/check_for_empty_lines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="$*" 4 | output=$(grep --line-number --with-filename '^\s*$' $path) 5 | grep_exit=$? 6 | 7 | if [ $grep_exit -eq 0 ] ; then 8 | echo "× Found empty lines in the following files/line number:" 9 | echo $output 10 | exit 1 11 | else 12 | echo "✓ No empty lines found" 13 | exit 0 14 | fi 15 | 16 | -------------------------------------------------------------------------------- /COVID19_Fallzahlen_Beispiel.csv: -------------------------------------------------------------------------------- 1 | date,time,abbreviation_canton_and_fl,ncumul_tested,ncumul_conf,new_hosp,current_hosp,current_icu,current_vent,ncumul_released,ncumul_deceased,source,current_isolated,current_quarantined,current_quarantined_riskareatravel 2 | 2020-02-27,17:40,AG,10000,1000,10,100,10,10,100,10,https://ag.ch/...,37,88,112 3 | 2020-02-28,11:00,AG,11000,1010,5,80,5,5,120,15,https://ag.ch/...,35,67,132 4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributors to data collection & cleaning: please check https://github.com/openZH/covid_19/issues for open issues, and use this to flag any problems. 2 | 3 | The best way to get started right now is to join the discussion at https://github.com/openZH/covid_19/discussions?discussions_q=sort%3Atop 4 | 5 | Users of the data: please share links to your projects in https://github.com/openZH/covid_19#community-contributions 6 | -------------------------------------------------------------------------------- /fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AG_alter_geschlecht.csv: -------------------------------------------------------------------------------- 1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths,source 2 | 01.03.2020,Canton_AG,31,m,1,,,,https://www.ag.ch/de/aktuelles/medienportal/medienmitteilung/medienmitteilungen/mediendetails_138717.jsp 3 | 01.03.2020,Canton_AG,74,f,,,,1,https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/lagebulletins/200305_KFS_Coronavirus_Lagebulletin_5.pdf 4 | -------------------------------------------------------------------------------- /scrapers/scrape_nw_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | 8 | 9 | def get_nw_page(): 10 | url = 'https://www.nw.ch/gesundheitsamtdienste/6044' 11 | content = sc.download(url, silent=True) 12 | content = content.replace(" ", " ") 13 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content) 14 | soup = BeautifulSoup(content, 'html.parser') 15 | return url, soup 16 | -------------------------------------------------------------------------------- /scrapers/scrape_gl_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import 
BeautifulSoup 6 | import scrape_common as sc 7 | 8 | 9 | def get_gl_pdf_url(): 10 | d = sc.download('https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817', silent=True) 11 | soup = BeautifulSoup(d, 'html.parser') 12 | 13 | # weekly pdf 14 | elem = soup.find(href=re.compile(r'Sentinella.*\.pdf')) 15 | if elem is None: 16 | return None 17 | return elem.get('href') 18 | -------------------------------------------------------------------------------- /scrapers/scrape_ag_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from bs4 import BeautifulSoup 4 | import re 5 | import scrape_common as sc 6 | 7 | 8 | def get_ag_xls_url(): 9 | data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp' 10 | d = sc.download(data_url, silent=True) 11 | soup = BeautifulSoup(d, 'html.parser') 12 | xls_url = soup.find('a', href=re.compile(r'\.xlsx$'))['href'] 13 | if not xls_url.startswith('http'): 14 | xls_url = f'https://www.ag.ch{xls_url}' 15 | return xls_url 16 | -------------------------------------------------------------------------------- /fallzahlen_kanton_total_csv/README.md: -------------------------------------------------------------------------------- 1 | # Warning: Do not manually update files in this directory 2 | 3 | This directory contains all cantonal files in the "old" structure (before 2020-04-09). 4 | All CSV files in this directory will be **updated automatically** every 15min based on the corresponding file in the "fallzahlen_kanton_total_csv_v2" directory. 5 | 6 | All manual changes to these files will be overwritten. 7 | 8 | # Metadata 9 | See: https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-unified-dataset. 
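For reference, the conversion that produces these files is the one in `scripts/transform_all_new2old.sh`, which pipes every file in `fallzahlen_kanton_total_csv_v2` through `scripts/new2oldcsv.py`. A minimal sketch of regenerating a single file by hand (assuming it is run from the repository root; the ZH file is only an example):

```sh
# Convert one v2 CSV back to the old column layout
# (same call as in scripts/transform_all_new2old.sh).
./scripts/new2oldcsv.py \
    fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_Kanton_ZH_total.csv \
    > fallzahlen_kanton_total_csv/COVID19_Fallzahlen_Kanton_ZH_total.csv
```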
10 | -------------------------------------------------------------------------------- /.github/workflows/rebase.yml: -------------------------------------------------------------------------------- 1 | on: 2 | issue_comment: 3 | types: [created] 4 | name: Automatic Rebase 5 | jobs: 6 | rebase: 7 | name: Rebase 8 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/rebase') 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout the latest code 12 | uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 0 15 | - name: Automatic Rebase 16 | uses: cirrus-actions/rebase@1.3.1 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /scrapers/scrape_vd_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import scrape_common as sc 6 | 7 | 8 | def get_weekly_pdf_url(): 9 | return get_all_weekly_pdf_urls()[0] 10 | 11 | 12 | def get_all_weekly_pdf_urls(): 13 | base_url = 'https://www.infosan.vd.ch' 14 | d = sc.download(base_url, silent=True) 15 | 16 | urls = re.findall(r"window.open\('(.*_epidemio\.pdf)'", d) 17 | result = [] 18 | for url in urls: 19 | if not url.startswith('http'): 20 | url = f'{base_url}/{url}' 21 | result.append(url) 22 | return result 23 | -------------------------------------------------------------------------------- /scrapers/scrape_fl_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import scrape_common as sc 4 | 5 | url = 'https://www.llv.li/files/as/grafik_covid19_tests_pro_kw.xlsx' 6 | xls = sc.xlsdownload(url, silent=True) 7 | rows = sc.parse_xls(xls, header_row=74, sheet_name='gTests_AG') 8 | year = '2020' 9 | for row in rows: 10 | if row['C'] is None: 11 | # skip the footer line 12 | continue 13 | td = sc.TestData(canton='FL', url=url) 14 | td.week = int(sc.find(r'KW (\d+)', row['C'])) 15 | if td.week == 1: 16 | year = '2021' 17 | td.year = year 18 | td.negative_tests = row['Negativ'] 19 | td.positive_tests = row['Positiv'] 20 | print(td) 21 | -------------------------------------------------------------------------------- /.github/workflows/lint_python.yml: -------------------------------------------------------------------------------- 1 | name: Tests + Linting Python 2 | on: 3 | pull_request: 4 | push: 5 | branches: [master] 6 | workflow_dispatch: ~ 7 | jobs: 8 | lint_python: 9 | runs-on: ubuntu-20.04 10 | timeout-minutes: 10 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.7 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.7 17 | - run: python -m pip install --upgrade pip 18 | - run: pip install flake8 pytest 19 | - run: pip install -r requirements.txt 20 | - run: flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics 21 | - run: PYTHONPATH=scrapers pytest 22 | -------------------------------------------------------------------------------- /scrapers/scrape_zh_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import scrape_common as sc 6 | 7 | 8 | url = 'https://raw.githubusercontent.com/openZH/covid_19/master/fallzahlen_kanton_zh/COVID19_Anteil_positiver_Test_pro_KW.csv' 9 | data = sc.download(url, silent=True) 10 | 11 | reader = csv.DictReader(StringIO(data), delimiter=',') 12 | for row in reader: 13 | td = sc.TestData(canton='ZH', url=url) 14 | td.start_date = row['Woche_von'] 15 | td.end_date = row['Woche_bis'] 16 | td.week = row['Kalenderwoche'] 17 | td.positive_tests = int(row['Anzahl_positiv']) 18 | td.negative_tests = int(row['Anzahl_negativ']) 19 | td.positivity_rate = float(row['Anteil_positiv']) 20 | print(td) 21 | -------------------------------------------------------------------------------- /scrapers/scrape_be_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import scrape_common as sc 6 | 7 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit' 8 | 9 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/vortag_tests.csv' 10 | d = sc.download(csv_url, silent=True) 11 | reader = csv.DictReader(StringIO(d), delimiter=',') 12 | for row in reader: 13 | td = sc.TestData(canton='BE', url=url) 14 | date = sc.date_from_text(row['datum']).isoformat() 15 | td.start_date = date 16 | td.end_date = date 17 | td.total_tests = row['durchgefuehrte_tests'] 18 | td.positive_tests = row['positive_tests'] 19 | td.positivity_rate = row['positivitaetsrate'] 20 | print(td) 21 | -------------------------------------------------------------------------------- /scripts/merge_canton_csvs.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'csv' 4 | 5 | # get files 6 | files = Dir["fallzahlen_kanton_total_csv_v2/*.csv"] 7 | 8 | # output array 9 | rows = [] 10 | 11 | # read headers 12 | header = CSV.read(files.first).first 13 | 14 | # read all csv files 15 | files.each do |fn| 16 | CSV.foreach(fn, headers: true) do |row| 17 | # make sure time is formatted with leading zeroes 18 | if row[1] =~ /(\d{1,2}):(\d{1,2})/ 19 | row[1] = sprintf "%02d:%02d", $1.to_i, $2.to_i 20 | end 21 | rows << row[0..14] 22 | end 23 | end 24 | 25 | # sort records by date 26 | rows.sort_by! 
{ |x| "#{x[0]}-#{x[1]}-#{x[2]}" } 27 | 28 | 29 | # output 30 | puts header.to_csv 31 | rows.each{ |row| puts row.to_csv } 32 | 33 | 34 | -------------------------------------------------------------------------------- /scrapers/scrape_fr_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | 8 | 9 | def get_fr_csv(): 10 | main_url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton' 11 | d = sc.download(main_url, silent=True) 12 | 13 | soup = BeautifulSoup(d, 'html.parser') 14 | item = soup.find('a', title=re.compile(r"Statistik .ber die Entwicklungen im Kanton.*")) 15 | csv_url = item.get('href') 16 | assert csv_url, "URL is empty" 17 | if not csv_url.startswith('http'): 18 | csv_url = f'https://www.fr.ch{csv_url}' 19 | 20 | csv = sc.download(csv_url, silent=True) 21 | csv = re.sub(r'(\d+)\'(\d+)', r'\1\2', csv) 22 | return csv_url, csv, main_url 23 | -------------------------------------------------------------------------------- /scrapers/validate_scraper_output.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to run a single scraper 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | 15 | 16 | # SCRAPER_KEY must be set 17 | if [ -z $SCRAPER_KEY ] ; then 18 | echo "SCRAPER_KEY env variable must be set"; 19 | exit 1 20 | fi 21 | 22 | area="Kanton_${SCRAPER_KEY}" 23 | if [ "$SCRAPER_KEY" = "FL" ] ; then 24 | area="${SCRAPER_KEY}" 25 | fi 26 | 27 | # 1. Validate the result 28 | node $DIR/../scripts/validate-csv.js $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv 29 | 30 | # 2. 
Check for outliers 31 | python $DIR/../scripts/check_for_outliers.py $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv 32 | -------------------------------------------------------------------------------- /fallzahlen_kanton_alter_geschlecht_csv/COVID19_Einwohner_Kanton_ZH_altersklassen_geschlecht.csv: -------------------------------------------------------------------------------- 1 | Year,Area,AgeYearCat,Gender,Inhabitants 2 | 2019,Canton_ZH,0-9,M,82878 3 | 2019,Canton_ZH,0-9,F,78735 4 | 2019,Canton_ZH,10-19,M,72994 5 | 2019,Canton_ZH,10-19,F,68488 6 | 2019,Canton_ZH,100+,M,45 7 | 2019,Canton_ZH,100+,F,200 8 | 2019,Canton_ZH,20-29,M,95172 9 | 2019,Canton_ZH,20-29,F,91194 10 | 2019,Canton_ZH,30-39,M,127998 11 | 2019,Canton_ZH,30-39,F,125184 12 | 2019,Canton_ZH,40-49,M,116400 13 | 2019,Canton_ZH,40-49,F,111604 14 | 2019,Canton_ZH,50-59,M,112667 15 | 2019,Canton_ZH,50-59,F,107919 16 | 2019,Canton_ZH,60-69,M,73383 17 | 2019,Canton_ZH,60-69,F,78006 18 | 2019,Canton_ZH,70-79,M,54372 19 | 2019,Canton_ZH,70-79,F,63877 20 | 2019,Canton_ZH,80-89,M,24989 21 | 2019,Canton_ZH,80-89,F,36988 22 | 2019,Canton_ZH,90-99,M,4020 23 | 2019,Canton_ZH,90-99,F,9293 24 | -------------------------------------------------------------------------------- /scrapers/scrape_so_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | 8 | 9 | def strip_value(value): 10 | return value.replace('\'', '') 11 | 12 | 13 | def get_latest_weekly_pdf_url(): 14 | return get_all_weekly_pdf_urls()[0] 15 | 16 | 17 | def get_all_weekly_pdf_urls(): 18 | base_url = 'https://corona.so.ch' 19 | url = f'{base_url}/bevoelkerung/daten/woechentlicher-situationsbericht/' 20 | d = sc.download(url, silent=True) 21 | soup = BeautifulSoup(d, 'html.parser') 22 | links = soup.find_all(href=re.compile(r'\.pdf$')) 23 | result = [] 24 | for link in links: 25 | file_ref = link.get('href') 26 | url = f'{base_url}{file_ref}' 27 | if url not in result: 28 | result.append(url) 29 | return result 30 | -------------------------------------------------------------------------------- /scripts/update_dates_in_readme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$(cd "$(dirname "$0")" && pwd)" 4 | 5 | today=$(date +%s) 6 | 7 | areas="FL AG AI AR BE BL BS FR GE GL GR JU LU NE NW OW SG SH SO SZ TG TI UR VD VS ZG ZH" 8 | for area in $areas 9 | do 10 | update_date_str=`grep $area $DIR/../COVID19_Fallzahlen_CH_total_v2.csv | tail -n 1 | awk -F, '{print $1}'` 11 | update_date=$(date --date="$update_date_str" +%s) 12 | diff=$(($today-$update_date)) 13 | 14 | if [ $diff -lt 84000 ]; then 15 | color='4d9221' 16 | elif [ $diff -lt 144000 ]; then 17 | color='b8e186' 18 | else 19 | color='de77ae' 20 | fi 21 | sed -i -e "/\[$area\]/s#update on [^|]*|#update on $update_date_str](https://placehold.jp/$color/000000/200x50.png?text=$update_date_str 'Last update on $update_date_str')|#" $DIR/../README.md 22 | echo "Update README for ${area} (date: ${update_date_str}, color: ${color})" 23 | done 24 | -------------------------------------------------------------------------------- /scrapers/db_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | import os 5 | 6 | 7 | def get_location(): 8 | location = os.path.realpath( 9 | 
os.path.join( 10 | os.getcwd(), 11 | os.path.dirname(__file__) 12 | ) 13 | ) 14 | return location 15 | 16 | 17 | def load_csv(filename): 18 | columns = [] 19 | with open(filename, 'r') as f: 20 | dr = csv.DictReader(f) 21 | if not columns: 22 | columns = dr.fieldnames 23 | to_db = [] 24 | for r in dr: 25 | db_row = [] 26 | for col in columns: 27 | db_row.append(r[col]) 28 | to_db.append(db_row) 29 | return columns, to_db 30 | 31 | 32 | def insert_db_query(columns): 33 | query = 'INSERT INTO data (\n' 34 | query += ",\n".join(columns) 35 | query += ') VALUES (' 36 | query += ",".join(['?'] * len(columns)) 37 | query += ');' 38 | return query 39 | -------------------------------------------------------------------------------- /scrapers/scrape_fr_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import scrape_common as sc 5 | from scrape_fr_common import get_fr_csv 6 | 7 | """ 8 | csv_url, csv_data, main_url = get_fr_csv() 9 | reader = csv.DictReader(StringIO(csv_data), delimiter=';') 10 | 11 | 12 | year = '2020' 13 | 14 | for row in rows: 15 | week = row['semaine /Woche'] 16 | if not week: 17 | continue 18 | 19 | if week == 1: 20 | year = '2021' 21 | 22 | td = sc.TestData(canton='FR', url=main_url) 23 | td.week = int(week) 24 | td.year = year 25 | td.pcr_total_tests = int(row['Tests PCR']) 26 | if row['Taux/Rate PCR']: 27 | td.pcr_positivity_rate = round(row['Taux/Rate PCR'] * 100) 28 | td.ag_total_tests = int(row['Tests AG']) 29 | if row['Taux/Rate AG']: 30 | td.ag_positivity_rate = round(row['Taux/Rate AG'] * 100) 31 | td.total_tests = td.pcr_total_tests + td.ag_total_tests 32 | print(td) 33 | """ 34 | -------------------------------------------------------------------------------- /scrapers/scrape_nw_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import scrape_common as sc 6 | import scrape_nw_common as snc 7 | 8 | url, soup = snc.get_nw_page() 9 | 10 | td = sc.TestData(canton='NW', url=url) 11 | 12 | item = soup.find(text=re.compile('Anzahl F.lle')).find_parent('p') 13 | assert item, f"Could not find title item in {url}" 14 | 15 | date = sc.find(r'Stand: (\d+\. 
.* 20\d{2})', item.text) 16 | date = sc.date_from_text(date) 17 | td.start_date = date.isoformat() 18 | td.end_date = date.isoformat() 19 | 20 | rows = item.find_next('table').findChildren('tr') 21 | for row in rows: 22 | cols = row.findChildren('td') 23 | item = cols[0].text 24 | if re.match(r'Covid-19-Tests innert 24h.*', item, re.I): 25 | res = re.match(r'(\d+)\s+(\d+\.?\d?)%', cols[1].text) 26 | if res is not None: 27 | td.total_tests = res[1] 28 | td.positivity_rate = res[2] 29 | 30 | if td: 31 | print(td) 32 | -------------------------------------------------------------------------------- /scrapers/scrape_zh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | import re 5 | from io import StringIO 6 | import scrape_common as sc 7 | 8 | url = "https://www.zh.ch/de/gesundheit/coronavirus.html" 9 | csv_url = 'https://raw.githubusercontent.com/openzh/covid_19/master/fallzahlen_kanton_zh/COVID19_Fallzahlen_Kanton_ZH_total.csv' 10 | d_csv = sc.download(csv_url, silent=True) 11 | reader = csv.DictReader(StringIO(d_csv), delimiter=',') 12 | 13 | is_first = True 14 | for row in reader: 15 | if not is_first: 16 | print('-' * 10) 17 | is_first = False 18 | 19 | dd = sc.DayData(canton='ZH', url=url) 20 | dd.datetime = f"{row['date']} {row['time']}" 21 | dd.cases = row['ncumul_conf'] 22 | dd.deaths = row['ncumul_deceased'] 23 | dd.hospitalized = row['current_hosp'] 24 | dd.vent = row['current_vent'] 25 | dd.icu = row['current_icu'] 26 | dd.isolated = row['current_isolated'] 27 | dd.quarantined = row['current_quarantined'] 28 | print(dd) 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /scrapers/scrape_tg_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import requests 6 | import scrape_common as sc 7 | 8 | # perma link to TG COVID dataset on opendata.swiss 9 | r = requests.get( 10 | 'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier', 11 | params={'identifier': 'dfs-ga-3@kanton-thurgau '} 12 | ) 13 | dataset = r.json()['result'] 14 | resource = next(r for r in dataset['resources'] if r['mimetype'] == 'text/csv') 15 | 16 | assert resource['download_url'], "Download URL not found" 17 | 18 | d_csv = sc.download(resource['download_url'], silent=True, encoding='latin1') 19 | 20 | reader = csv.DictReader(StringIO(d_csv), delimiter=';') 21 | for row in reader: 22 | dd = sc.DistrictData(canton='TG') 23 | dd.district_id = row['districtid'] 24 | dd.district = row['district'] 25 | dd.population = row['population'] 26 | dd.week = row['week'] 27 | dd.year = row['year'] 28 | dd.new_cases = row['newconfcases'] 29 | dd.url = resource['download_url'] 30 | print(dd) 31 | -------------------------------------------------------------------------------- /scrapers/scrape_ti_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import scrape_common as sc 7 | 8 | main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/' 9 | d = sc.download(main_url, silent=True) 10 | soup = BeautifulSoup(d, 'html.parser') 11 | 12 | td = sc.TestData(canton='TI', url=main_url) 13 | 14 | container = soup.find('h2', string=re.compile(r'Test PCR')).find_next('div') 15 | for item in container.find_all('div'): 16 | divs = 
item.find_all('div') 17 | if len(divs) == 3: 18 | if divs[2].string: 19 | date = sc.find(r'.*?(\d+\.\d+\.\d{2})', divs[2].string) 20 | date = sc.date_from_text(date) 21 | td.start_date = date.isoformat() 22 | td.end_date = date.isoformat() 23 | if sc.find(r'^(Totale test).*', divs[1].string): 24 | td.total_tests = divs[0].string 25 | if sc.find(r'^(% test).*', divs[1].string): 26 | td.positivity_rate = divs[0].string 27 | 28 | if td: 29 | assert td.start_date and td.end_date, 'failed to extract date' 30 | print(td) 31 | -------------------------------------------------------------------------------- /scrapers/scrape_zg_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import collections 4 | import csv 5 | import datetime 6 | from io import StringIO 7 | import scrape_common as sc 8 | 9 | 10 | csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv' 11 | d_csv = sc.download(csv_url, silent=True) 12 | """ 13 | "Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content" 14 | 2020-05-25,"männlich","151",NA,NA,NA 15 | 2020-06-01,"männlich","117",NA,NA,NA 16 | """ 17 | 18 | reader = csv.DictReader(StringIO(d_csv), delimiter=',') 19 | data = collections.defaultdict(dict) 20 | for row in reader: 21 | if row['Woche'] == 'NA': 22 | continue 23 | date = sc.date_from_text(row['Woche']) 24 | if date not in data: 25 | data[date] = 0 26 | data[date] += int(row['Anzahl Fälle']) 27 | 28 | days = list(data.keys()) 29 | for day in days: 30 | td = sc.TestData(canton='ZG', url=csv_url) 31 | td.start_date = day.isoformat() 32 | td.end_date = (day + datetime.timedelta(days=6)).isoformat() 33 | td.total_tests = data[day] 34 | print(td) 35 | -------------------------------------------------------------------------------- /scrapers/scrape_sh_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import re 6 | from bs4 import BeautifulSoup 7 | import scrape_common as sc 8 | import scrape_sh_common as shc 9 | 10 | main_url, xls = shc.get_sh_xlsx() 11 | 12 | rows = sc.parse_xls(xls, sheet_name='Datensatz_Tests', header_row=0) 13 | for row in rows: 14 | if not (row['Jahr'] or row['Kalenderwoche']): 15 | continue 16 | 17 | td = sc.TestData(canton='SH', url=main_url) 18 | td.year = row['Jahr'] 19 | td.week = row['Kalenderwoche'] 20 | 21 | td.pcr_total_tests = 0 22 | pcr_cols = ['Tests KAZ', 'Tests Apotheken', 'Tests KSSH', 'Test Praxen'] 23 | for col in pcr_cols: 24 | if sc.represents_int(row[col]): 25 | td.pcr_total_tests += row[col] 26 | 27 | td.ag_total_tests = 0 28 | ag_cols = ['Schnelltests KAZ', 'Schnelltests Apotheken', 'Schnelltests KSSH', 'Schnelltest Praxen'] 29 | for col in ag_cols: 30 | if sc.represents_int(row[col]): 31 | td.ag_total_tests += row[col] 32 | td.total_tests = td.pcr_total_tests + td.ag_total_tests 33 | print(td) 34 | -------------------------------------------------------------------------------- /scrapers/scrape_gl_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | import scrape_gl_common as sgc 8 | 9 | pdf_url = sgc.get_gl_pdf_url() 10 | if pdf_url is not None: 11 | pdf = sc.download_content(pdf_url, silent=True) 12 | content = sc.pdftotext(pdf, 
page=1, layout=True) 13 | # remove 1k separators 14 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content) 15 | 16 | year = sc.find(r'Stand: \d{2}\.\d{2}.(\d{4})', content) 17 | week = sc.find(r'KW(\d+)\.pdf', pdf_url) 18 | 19 | # Insgesamt Anzahl, 100k, 14 Tage Anzahl, 100k, 7 Tage Anzahl, 100k 20 | number_of_tests = sc.find(r'PCR-Tests/Schnelltests\sKanton Glarus\s+\d+\s+\d+\.?\d+?\s+\d+\s+\d+\.?\d+?\s+(\d+)\s+\d+', content) 21 | # Insgesamt, 14 Tage, 7 Tage 22 | positivity_rate = sc.find(r'Positivit.tsrate GL\s?\*+?\s+\d+\.\d%\s+\d+\.\d%\s+(\d+\.\d)%\s+', content) 23 | 24 | td = sc.TestData(canton='GL', url=pdf_url) 25 | td.week = week 26 | td.year = year 27 | td.total_tests = number_of_tests 28 | td.positivity_rate = positivity_rate 29 | print(td) 30 | -------------------------------------------------------------------------------- /scrapers/scrape_nw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | import scrape_nw_common as snc 8 | 9 | is_first = True 10 | xls_url = 'http://www.nw.ch/coronastatistik' 11 | xls = sc.xlsdownload(xls_url, silent=True) 12 | rows = sc.parse_xls(xls, header_row=2) 13 | for row in rows: 14 | dd = sc.DayData(canton='NW', url=xls_url) 15 | dd.datetime = row['A'].date().isoformat() 16 | dd.cases = row['Positiv getestete Personen (kumuliert)'] 17 | dd.icu = row['Davon auf der Intensivstation'] 18 | 19 | try: 20 | dd.hospitalized = row['Aktuell hospitalisierte Personen'] 21 | except KeyError: 22 | dd.hospitalized = row['Hospitalisierte Personen'] 23 | 24 | try: 25 | dd.deaths = row['Personen verstorben'] 26 | except KeyError: 27 | dd.deaths = row['Verstorbene Personen'] 28 | 29 | # skip empty rows 30 | if dd.cases is None and dd.icu is None and dd.hospitalized is None and dd.deaths is None: 31 | continue 32 | 33 | if not is_first: 34 | print('-' * 10) 35 | is_first = False 36 | print(dd) 37 | -------------------------------------------------------------------------------- /scripts/validate-schema.js: -------------------------------------------------------------------------------- 1 | const csval = require("csval"); 2 | const fs = require("fs").promises; 3 | const path = require("path"); 4 | 5 | const DIR = path.resolve(process.argv[2] || process.cwd()); 6 | 7 | const validateSequentially = async csvFiles => { 8 | const rules = await csval.readRules(path.join(DIR, "schema.json")); 9 | 10 | let failedChecks = 0; 11 | 12 | for (let csvFile of csvFiles) { 13 | const csv = await csval.readCsv(path.join(DIR, csvFile)); 14 | const parsed = await csval.parseCsv(csv); 15 | let valid = false; 16 | try { 17 | valid = await csval.validate(parsed, rules); 18 | } catch (e) { 19 | failedChecks++; 20 | console.log(`× ${csvFile} failed the following checks:${e.message}\n`); 21 | } 22 | if (valid) { 23 | console.log(`✓ ${csvFile} is valid.`); 24 | } 25 | } 26 | 27 | return failedChecks; 28 | }; 29 | 30 | const run = async () => { 31 | const csvFiles = (await fs.readdir(DIR)).filter(f => f.match(/\.csv$/)); 32 | const failedChecks = await validateSequentially(csvFiles); 33 | 34 | if (failedChecks > 0) { 35 | process.exit(1); 36 | } 37 | }; 38 | 39 | run().catch(e => console.error(e)); 40 | -------------------------------------------------------------------------------- /scrapers/run_district_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 
Script to run a single district scraper 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | 15 | 16 | # SCRAPER_KEY must be set 17 | if [ -z $SCRAPER_KEY ] ; then 18 | echo "SCRAPER_KEY env variable must be set"; 19 | exit 1 20 | fi 21 | 22 | # 1. populate the database with the current CSV 23 | echo "Populating database from CSV fallzahlen_kanton_${SCRAPER_KEY}_bezirk..." 24 | $DIR/populate_district_database.py $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv 25 | 26 | # 2. run the scraper, update the db 27 | echo "Run the district scraper..." 28 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}_districts.py" 29 | $scrape_script | $DIR/add_district_db_entry.py 30 | 31 | # 3. Export the database as csv 32 | echo "Export database to CSV..." 33 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by DistrictId, District, Canton, Date, Year, Week+0 asc;" > $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv 34 | sed -i 's/""//g' $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv 35 | -------------------------------------------------------------------------------- /scrapers/scrape_vs_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import datetime 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | import scrape_common as sc 9 | 10 | 11 | def get_vs_latest_weekly_pdf_url(): 12 | pdfs = get_vs_weekly_pdf_urls() 13 | assert pdfs, "Could not find weekly PDFs" 14 | return pdfs[0] 15 | 16 | 17 | def get_vs_weekly_pdf_urls(): 18 | base_url = 'https://www.vs.ch' 19 | url = base_url + '/de/web/coronavirus/statistiques-hebdomadaires' 20 | content = sc.download(url, silent=True) 21 | soup = BeautifulSoup(content, 'html.parser') 22 | links = soup.find_all(href=re.compile(r'Synthese.*Woche')) 23 | result = [] 24 | for link in links: 25 | url = base_url + link['href'].replace(' ', '%20') 26 | result.append(url) 27 | return result 28 | 29 | 30 | def get_vs_weekly_general_data(pdf): 31 | content = sc.pdftotext(pdf, page=1) 32 | week = int(sc.find(r'Epidemiologische Situation Woche (\d+)', content)) 33 | end_date = sc.find(r'bis\s+(\d+\.\d+\.\d{4})', content) 34 | end_date = sc.date_from_text(end_date) 35 | start_date = end_date - datetime.timedelta(days=7) 36 | year = start_date.year 37 | return week, year 38 | -------------------------------------------------------------------------------- /scrapers/test/test_district_data.py: -------------------------------------------------------------------------------- 1 | from scrapers.scrape_common import DistrictData 2 | 3 | def test_district_data(): 4 | dd = DistrictData() 5 | dd.date = '1' 6 | dd.week = 2 7 | dd.year = 3 8 | dd.canton = '4' 9 | dd.district = '5' 10 | dd.district_id = 6 11 | dd.population = 7 12 | dd.total_cases = 8 13 | dd.new_cases = 9 14 | dd.total_deceased = 10 15 | dd.new_deceased = 11 16 | dd.url = '12' 17 | 18 | string = str(dd) 19 | 20 | dd_parsed = DistrictData() 21 | assert dd_parsed.parse(string) 22 | assert dd.date == dd_parsed.date 23 | assert dd.week == dd_parsed.week 24 | assert dd.year == dd_parsed.year 25 | assert dd.canton == dd_parsed.canton 26 | assert dd.district == dd_parsed.district 27 | assert dd.district_id == dd_parsed.district_id 28 | assert dd.population == dd_parsed.population 29 | assert dd.total_cases == dd_parsed.total_cases 30 | assert dd.new_cases == 
dd_parsed.new_cases 31 | assert dd.total_deceased == dd_parsed.total_deceased 32 | assert dd.new_deceased == dd_parsed.new_deceased 33 | assert dd.url == dd_parsed.url 34 | 35 | 36 | if __name__ == "__main__": 37 | test_district_data() 38 | -------------------------------------------------------------------------------- /scrapers/run_tests_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to run a single tests scraper 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | 15 | 16 | # SCRAPER_KEY must be set 17 | if [ -z $SCRAPER_KEY ] ; then 18 | echo "SCRAPER_KEY env variable must be set"; 19 | exit 1 20 | fi 21 | 22 | area="kanton_${SCRAPER_KEY}" 23 | if [ "$SCRAPER_KEY" = "FL" ] ; then 24 | area="${SCRAPER_KEY}" 25 | fi 26 | 27 | # 1. populate the database with the current CSV 28 | echo "Populating database from CSV fallzahlen_${area}_tests..." 29 | $DIR/populate_tests_database.py $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv 30 | 31 | # 2. run the scraper, update the db 32 | echo "Run the tests scraper..." 33 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}_tests.py" 34 | $scrape_script | $DIR/add_tests_db_entry.py 35 | 36 | # 3. Export the database as csv 37 | echo "Export database to CSV..." 38 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by canton, start_date, end_date, year, week+0 asc;" > $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv 39 | sed -i 's/""//g' $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv 40 | -------------------------------------------------------------------------------- /scrapers/meta_scrape.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Scrapers are expected to output data on standard output in the following 4 | # format: 5 | # 6 | # GR 7 | # Scraped at: 2020-03-21T19:22:10+01:00 8 | # Date and time: 20.03.2020 9 | # Confirmed cases: 213 10 | # Deaths: 3 11 | # 12 | # Abbreviation of the canton first. 13 | # 14 | # Then scraped timestamp. Current time in ISO-8601 format. Implicitly in Swiss 15 | # timezone (TZ=Europe/Zurich), CET, or CEST. 16 | # 17 | # Then the date and time when the data was published / gathered. 18 | # The date and time, or just time, can be omitted if not available. 19 | # Any date / time format is ok; the more accurate, the better. It is advised to strip 20 | # the name of the weekday. Add a time parser to the parse_scrape_output.py script 21 | # if needed. 22 | # 23 | # Number of cases. 24 | # 25 | # Number of deaths can be omitted, if not available. 26 | 27 | for s in ./scrape_??.py; 28 | do 29 | L=$(./$s | ./parse_scrape_output.py) 30 | if ! echo "${L}" | egrep ' (OK|FAILED)' >/dev/null; then 31 | a=$(echo "$s" | sed -E -e 's/^.*scrape_(..)\..*$/\1/' | tr a-z A-Z) # ' # To make my editor happy. 
32 | echo "$a" - - - FAILED "$(date --iso-8601=seconds)" 33 | else 34 | echo "${L}" 35 | fi 36 | done 37 | -------------------------------------------------------------------------------- /scrapers/scrape_sg_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import scrape_common as sc 6 | 7 | 8 | url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist_729873930/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Tests_download.csv' 9 | data = sc.download(url, silent=True) 10 | 11 | # strip the "header" / description lines 12 | data = "\n".join(data.split("\n")[9:]) 13 | 14 | reader = csv.DictReader(StringIO(data), delimiter=';') 15 | for row in reader: 16 | td = sc.TestData(canton='SG', url=url) 17 | td.start_date = row['Datum'] 18 | td.end_date = row['Datum'] 19 | td.pcr_positive_tests = row['Positiv (PCR)'] 20 | td.pcr_negative_tests = row['Negativ (PCR)'] 21 | td.ag_positive_tests = row['Positiv (Schnelltest)'] 22 | td.ag_negative_tests = row['Negativ (Schnelltest)'] 23 | td.positive_tests = row['Total positive Tests'] 24 | td.negative_tests = row['Total negative Tests'] 25 | td.total_tests = row['Total Tests'] 26 | if row['Positiv in % vom Total']: 27 | td.positivity_rate = float(row['Positiv in % vom Total']) * 100 28 | td.positivity_rate = round(10 * td.positivity_rate) / 10 29 | print(td) 30 | -------------------------------------------------------------------------------- /scrapers/scrape_vs_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | 5 | import scrape_common as sc 6 | import scrape_vs_common as svc 7 | 8 | 9 | # get all PDFs 10 | for url in svc.get_vs_weekly_pdf_urls(): 11 | td = sc.TestData(canton='VS', url=url) 12 | 13 | pdf = sc.download_content(url, silent=True) 14 | td.week, td.year = svc.get_vs_weekly_general_data(pdf) 15 | 16 | for page in range(4, 6): 17 | content = sc.pdftotext(pdf, page=page, raw=True) 18 | content = re.sub(r'(\d)\‘(\d)', r'\1\2', content) 19 | content = re.sub(r'(\d)\’(\d)', r'\1\2', content) 20 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content) 21 | 22 | td.total_tests = sc.find(r'Alle\s+Arten\s+von\s+Tests\s+(\d+)', content) 23 | td.positivity_rate = sc.find(r'Alle\s+Arten\s+von\s+Tests\s+\d+\s+(\d+\.\d+)%', content) 24 | td.pcr_total_tests = sc.find(r'PCR\s+(\d+)', content) 25 | td.pcr_positivity_rate = sc.find(r'PCR\s+\d+\s+(\d+\.\d+)%', content) 26 | td.ag_total_tests = sc.find(r'Antigentests\s+(\d+)', content) 27 | td.ag_positivity_rate = sc.find(r'Antigentests\s+\d+\s+(\d+\.\d+)%', content) 28 | 29 | if not td.total_tests: 30 | continue 31 | 32 | print(td) 33 | -------------------------------------------------------------------------------- /scrapers/test_tests_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to run all tests scraper 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | NEWLINE=$'\n' 15 | 16 | echo "Run all tests scrapers..." 
17 | 18 | exit_code=0 19 | errors='' 20 | for scrape_script in $DIR/scrape_??_tests.py 21 | do 22 | if [ -f $scrape_script -a -x $scrape_script ] 23 | then 24 | name=`basename $scrape_script` 25 | canton=${name:7:2} 26 | export SCRAPER_KEY=${canton^^} 27 | echo "" 28 | echo "Running ${SCRAPER_KEY} tests scraper..." 29 | echo "==========================================" 30 | 31 | set +e 32 | $DIR/run_tests_scraper.sh 33 | ret=$? 34 | if [ $ret -ne 0 ] 35 | then 36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2 37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret" 38 | exit_code=1 39 | fi 40 | set -e 41 | 42 | echo "==========================================" 43 | echo "" 44 | fi 45 | done 46 | 47 | 48 | echo "$errors" 49 | exit $exit_code 50 | -------------------------------------------------------------------------------- /fallzahlen_kanton_zh/COVID19_VOC_Kanton_ZH.csv: -------------------------------------------------------------------------------- 1 | date,new_pcr_pos,new_voc 2 | 2021-02-10,168,35 3 | 2021-02-09,247,54 4 | 2021-02-08,134,44 5 | 2021-02-07,82,29 6 | 2021-02-06,188,62 7 | 2021-02-05,194,41 8 | 2021-02-04,209,38 9 | 2021-02-03,215,43 10 | 2021-02-02,272,67 11 | 2021-02-01,143,37 12 | 2021-01-31,65,12 13 | 2021-01-30,193,34 14 | 2021-01-29,208,32 15 | 2021-01-28,287,34 16 | 2021-01-27,273,32 17 | 2021-01-26,316,41 18 | 2021-01-25,152,25 19 | 2021-01-24,115,16 20 | 2021-01-23,245,18 21 | 2021-01-22,390,23 22 | 2021-01-21,197,17 23 | 2021-01-20,301,14 24 | 2021-01-19,336,10 25 | 2021-01-18,217,6 26 | 2021-01-17,103,5 27 | 2021-01-16,251,8 28 | 2021-01-15,277,10 29 | 2021-01-14,273,5 30 | 2021-01-13,352,4 31 | 2021-01-12,392,8 32 | 2021-01-11,291,3 33 | 2021-01-10,163,0 34 | 2021-01-09,347,0 35 | 2021-01-08,446,6 36 | 2021-01-07,449,2 37 | 2021-01-06,616,4 38 | 2021-01-05,658,6 39 | 2021-01-04,494,2 40 | 2021-01-03,280,1 41 | 2021-01-02,388,2 42 | 2021-01-01,204,0 43 | 2020-12-31,638,0 44 | 2020-12-30,595,2 45 | 2020-12-29,731,4 46 | 2020-12-28,368,1 47 | 2020-12-27,284,0 48 | 2020-12-26,429,2 49 | 2020-12-25,229,0 50 | 2020-12-24,793,0 51 | 2020-12-23,855,1 52 | 2020-12-22,736,0 53 | 2020-12-21,414,1 54 | 2020-12-20,312,0 55 | 2020-12-19,494,2 56 | 2020-12-18,723,0 57 | -------------------------------------------------------------------------------- /scrapers/scrape_tg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import requests 6 | import scrape_common as sc 7 | 8 | # perma link to TG COVID dataset on opendata.swiss 9 | r = requests.get( 10 | 'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier', 11 | params={'identifier': 'dfs-ga-1@kanton-thurgau'} 12 | ) 13 | dataset = r.json()['result'] 14 | resource = next(r for r in dataset['resources'] if r['mimetype'] == 'text/csv') 15 | 16 | assert resource['download_url'], "Download URL not found" 17 | 18 | d_csv = sc.download(resource['download_url'], silent=True) 19 | 20 | reader = csv.DictReader(StringIO(d_csv), delimiter=';') 21 | is_first = True 22 | for row in reader: 23 | if not row['date']: 24 | continue 25 | if not is_first: 26 | print('-' * 10) 27 | is_first = False 28 | dd = sc.DayData(canton='TG', url=row['source']) 29 | dd.datetime = f"{row['date']} {row['time']}" 30 | dd.cases = row['ncumul_conf'] 31 | dd.deaths = row['ncumul_deceased'] 32 | dd.hospitalized = row['current_hosp'] 33 | dd.new_hosp = row['new_hosp'] 34 | dd.recovered = 
row['ncumul_released'] 35 | dd.icu = row['current_icu'] 36 | dd.isolated = row['num_isolated'] 37 | print(dd) 38 | -------------------------------------------------------------------------------- /scrapers/run_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to run a single scraper 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | 15 | 16 | # SCRAPER_KEY must be set 17 | if [ -z $SCRAPER_KEY ] ; then 18 | echo "SCRAPER_KEY env variable must be set"; 19 | exit 1 20 | fi 21 | 22 | area="Kanton_${SCRAPER_KEY}" 23 | if [ "$SCRAPER_KEY" = "FL" ] ; then 24 | area="${SCRAPER_KEY}" 25 | fi 26 | 27 | # 1. populate the database with the current CSV 28 | echo "Populating database from CSV COVID19_Fallzahlen_${area}_total.csv..." 29 | $DIR/populate_database.py $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv 30 | 31 | # 2. run the scraper, update the db 32 | echo "Run the scraper..." 33 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}.py" 34 | $scrape_script | $DIR/parse_scrape_output.py | $DIR/add_db_entry.py 35 | 36 | # 3. Export the database as csv 37 | echo "Export database to CSV..." 38 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by date asc;" > $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv 39 | sed -i 's/""//g' $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv 40 | -------------------------------------------------------------------------------- /scrapers/test_district_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to test run all district scraper 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | NEWLINE=$'\n' 15 | 16 | echo "Run all district scrapers..." 17 | 18 | exit_code=0 19 | errors='' 20 | for scrape_script in $DIR/scrape_??_districts.py 21 | do 22 | if [ -f $scrape_script -a -x $scrape_script ] 23 | then 24 | name=`basename $scrape_script` 25 | canton=${name:7:2} 26 | export SCRAPER_KEY=${canton^^} 27 | echo "" 28 | echo "Running ${SCRAPER_KEY} district scraper..." 29 | echo "==========================================" 30 | 31 | set +e 32 | $DIR/run_district_scraper.sh 33 | ret=$? 34 | if [ $ret -ne 0 ] 35 | then 36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." 
>&2 37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret" 38 | exit_code=1 39 | fi 40 | set -e 41 | 42 | echo "==========================================" 43 | echo "" 44 | fi 45 | done 46 | 47 | echo "$errors" 48 | exit $exit_code 49 | -------------------------------------------------------------------------------- /scrapers/scrape_ne.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import datetime 6 | import scrape_common as sc 7 | 8 | xls_url = 'https://www.ne.ch/autorites/DFS/SCSP/medecin-cantonal/maladies-vaccinations/Documents/Covid-19-Statistiques/COVID19_PublicationInternet.xlsx' 9 | xls = sc.xlsdownload(xls_url, silent=True) 10 | rows = sc.parse_xls(xls) 11 | is_first = True 12 | for row in rows[:3000]: 13 | if row['A'] is None: 14 | continue 15 | if not isinstance(row['A'], datetime.datetime): 16 | print(f"WARNING: {row['A']} is not a valid date, skipping.", file=sys.stderr) 17 | continue 18 | 19 | if not is_first: 20 | print('-' * 10) 21 | is_first = False 22 | 23 | dd = sc.DayData(canton='NE', url=xls_url) 24 | dd.datetime = row['A'].date().isoformat() 25 | dd.cases = row['Cumul'] 26 | dd.hospitalized = row['Total des cas hospitalisés'] 27 | if row['Soins intensifs (intubés)'] is not None and row['Soins intensifs (non intubés)'] is not None: 28 | ICU = row['Soins intensifs (intubés)'] 29 | ICU2 = row['Soins intensifs (non intubés)'] 30 | dd.icu = int(ICU)+int(ICU2) 31 | dd.vent = row['Soins intensifs (intubés)'] 32 | dd.deaths = row['Cumul des décès'] 33 | print(dd) 34 | -------------------------------------------------------------------------------- /scrapers/scrape_ag_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import scrape_common as sc 4 | import scrape_ag_common as sac 5 | 6 | 7 | def get_value_int(value): 8 | if value is not None and value != '': 9 | return int(value) 10 | return None 11 | 12 | 13 | def get_value_float(value): 14 | if value is not None and value != '': 15 | return float(value) 16 | return None 17 | 18 | 19 | xls_url = sac.get_ag_xls_url() 20 | xls = sc.xlsdownload(xls_url, silent=True) 21 | 22 | year = '2020' 23 | rows = sc.parse_xls(xls, sheet_name='1.4 Labortests', header_row=1, enable_float=True) 24 | for row in rows: 25 | if not row['Anzahl Tests']: 26 | continue 27 | if row['Anzahl Tests'] == 'Anzahl Tests': 28 | break 29 | 30 | td = sc.TestData(canton='AG', url=xls_url) 31 | td.week = int(row['Kalenderwoche']) 32 | if td.week == 1: 33 | year = '2021' 34 | td.year = year 35 | td.positive_tests = get_value_int(row['Positive Tests']) 36 | td.negative_tests = get_value_int(row['Negative Tests']) 37 | td.total_tests = int(row['Anzahl Tests']) 38 | td.positivity_rate = get_value_float(row['Positivitätsrate']) 39 | td.pcr_positivity_rate = get_value_float(row['F']) 40 | td.ag_positivity_rate = get_value_float(row['G']) 41 | if td: 42 | print(td) 43 | -------------------------------------------------------------------------------- /scrapers/scrape_ju_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | 8 | base_url = 'https://www.jura.ch' 9 | url = 
f'{base_url}/fr/Autorites/Coronavirus/Infos-Actualite/Statistiques-COVID/Evolution-des-cas-COVID-19-dans-le-Jura.html' 10 | d = sc.download(url, silent=True) 11 | d = d.replace('&nbsp;', ' ') 12 | soup = BeautifulSoup(d, 'html.parser') 13 | 14 | pdf_url = soup.find('a', title=re.compile(r'Situation.*PDF.*')).get('href') 15 | if not pdf_url.startswith('http'): 16 | pdf_url = f'{base_url}{pdf_url}' 17 | pdf_url = pdf_url.replace('?download=1', '') 18 | 19 | pdf = sc.download_content(pdf_url, silent=True) 20 | 21 | td = sc.TestData(canton='JU', url=pdf_url) 22 | 23 | content = sc.pdftotext(pdf, page=1) 24 | td.week = sc.find(r'Situation semaine épidémiologique (\d+)', content) 25 | td.year = sc.find(r'Du \d+.* (\d{4})', content) 26 | 27 | content = sc.pdftotext(pdf, page=2) 28 | td.total_tests = sc.find(r'Nombre de tests\d?\s+(\d+)', content) 29 | res = re.match(r'.*Nombre de tests positifs .*\s+(\d+)\s+\((\d+\.?\d?)%\s?\d?\)', content, re.DOTALL | re.MULTILINE) 30 | assert res, 'failed to find number of positive tests and positivity rate' 31 | td.positive_tests = res[1] 32 | td.positivity_rate = res[2] 33 | 34 | print(td) 35 | -------------------------------------------------------------------------------- /scripts/latest_per_canton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "| Canton | Confirmed cases | Deceased | Last update |" 4 | echo "|:------:| ---------------:| --------:|:---------------------- |" 5 | # | BL | 282 | 0 | 2020-03-21 | 6 | 7 | # PER CANTON / FL 8 | 9 | # 1 2 3 4 5 6 7 8 9 10 10 | # date,time,abbreviation_canton_and_fl,ncumul_tested,ncumul_conf,ncumul_hosp,ncumul_ICU,ncumul_vent,ncumul_released,ncumul_deceased,source 11 | 12 | for f in *.csv; do 13 | # Output latest row with non-zero cumulative number of cases (and deaths). Then sort by number of cases, and print the date. 14 | awk -F , '{if ($5) { printf("| %2s | %15d | %8d | %-21s |\n", $3, $5, $10, $2 != "\"\"" ? $1 "T" $2 : $1); }}' "$f" | tail -1 15 | done | sort -r -n -k 4 16 | 17 | # TOTAL 18 | 19 | DATE=$(TZ="Europe/Zurich" date --iso-8601=minutes) 20 | 21 | for f in *.csv; do 22 | # Output last row with non-zero cumulative number of cases (and deaths) 23 | awk -F , '{if ($5) { print $1, $3, $5, $10; }}' "$f" | tail -1 24 | # Then do the sums.
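# Illustrative sketch (made-up numbers): each pass through the loop above prints one
# space-separated summary line per canton CSV, e.g.
#   2020-03-21 BL 282 0
#   2021-11-12 ZG 18345 139
# The awk program below then sums field 3 (cases) and field 4 (deaths) over all of
# those lines and prints a single markdown row such as
#   | TOTAL |           18627 |      139 | 2023-04-03T10:15+02:00 |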
25 | done | awk "BEGIN { sum_cases = 0; sum_deceased = 0; } { sum_cases += \$3; sum_deceased += \$4; } END { printf(\"| TOTAL | %15d | %8d | %-22s |\n\", sum_cases, sum_deceased, \"${DATE}\"); }" 26 | -------------------------------------------------------------------------------- /.github/workflows/test_scraper.yml: -------------------------------------------------------------------------------- 1 | name: Test run of scrapers 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'scrapers/**' 8 | - '!scrapers/*districts*' 9 | - '!scrapers/*tests*' 10 | - '.github/workflows/**' 11 | pull_request: 12 | branches: [ master ] 13 | paths: 14 | - 'scrapers/**' 15 | - '!scrapers/*districts*' 16 | - '!scrapers/*tests*' 17 | - '.github/workflows/**' 18 | workflow_dispatch: ~ 19 | 20 | jobs: 21 | test_run: 22 | runs-on: ubuntu-20.04 23 | timeout-minutes: 10 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | 28 | - name: Set up Python 3.7 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: 3.7 32 | 33 | - name: Remove broken apt repos 34 | run: | 35 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done 36 | 37 | - name: Install dependencies 38 | run: | 39 | npm ci 40 | python -m pip install --upgrade pip setuptools wheel 41 | pip install -r requirements.txt 42 | sudo apt update || true # do not fail if update does not work 43 | sudo apt-get install poppler-utils 44 | sudo apt-get install chromium-browser 45 | 46 | - name: Test run of all scrapers 47 | run: ./scrapers/test_scraper.sh 48 | 49 | -------------------------------------------------------------------------------- /scrapers/scrape_ai.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | import scrape_common as sc 5 | 6 | url = 'https://www.ai.ch/themen/gesundheit-alter-und-soziales/gesundheitsfoerderung-und-praevention/uebertragbare-krankheiten/coronavirus' 7 | d = sc.download(url, silent=True) 8 | d = re.sub(r'(\d+)\'(\d+)', r'\1\2', d) 9 | 10 | """ 11 | no separate date for hospitalizations on 2020-11-19 12 | # Hospitalisations 13 | dd_hosp = sc.DayData(canton='AI', url=url) 14 | dd_hosp.datetime = sc.find('>.*Hospitalisationen\s+\(Stand\s+(.*\d{4})\)', d) 15 | dd_hosp.hospitalized = sc.find('
<li>.*?([0-9]+)\s*Hospitalisationen.*<\/li>', d) 16 | print(dd_hosp) 17 | print('-' * 10) 18 | """ 19 | 20 | # cases 21 | dd = sc.DayData(canton='AI', url=url) 22 | dd.datetime = sc.find('>.*Stand (.+ Uhr).*', d) 23 | dd.cases = sc.find('
<li>.*?([0-9]+)\s*(infizierte Person(en)?|(labor)?bestätigte Fälle).*<\/li>', d) 24 | dd.deaths = sc.find('
<li>.*?([0-9]+)\s*Todesf.+?lle.*<\/li>', d) 25 | dd.isolated = sc.find('
<li>.*?([0-9]+)\s*Personen\s+in\s*Isolation.*<\/li>', d) 26 | dd.quarantined = sc.find('
<li>.*?([0-9]+)\+?\s*enge\s+Kontaktpersonen\s+in\s+Quarant.ne.*<\/li>', d) 27 | dd.quarantine_riskareatravel = sc.find('
<li>.*?([0-9]+)\+?\s*Personen\s+in\s*Quarant.+ne.*Einreise\s+Risikoland.*<\/li>', d) 28 | dd.hospitalized = sc.find(r'
<li>.*?([0-9]+)\s*Person\sim\sSpital.*<\/li>', d) 29 | print(dd) 30 | -------------------------------------------------------------------------------- /scrapers/scrape_bl_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | from bs4 import BeautifulSoup 6 | import re 7 | import scrape_common as sc 8 | 9 | 10 | def get_latest_bl_bulletin_url(): 11 | return get_all_bl_bulletin_urls()[0] 12 | 13 | 14 | def get_all_bl_bulletin_urls(): 15 | news_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/medienmitteilungen-1' 16 | news_content = sc.download(news_url, silent=True) 17 | soup = BeautifulSoup(news_content, 'html.parser') 18 | 19 | bulletins = soup.find_all('a', href=re.compile(r'.*/coronavirus-wochenbulletin.*')) 20 | bulletin_urls = [] 21 | for bulletin in bulletins: 22 | bulletin_urls.append(bulletin.get('href')) 23 | return bulletin_urls 24 | 25 | 26 | def strip_bl_bulletin_numbers(content): 27 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content) 28 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content) 29 | return content 30 | 31 | 32 | def parse_bl_date(s): 33 | row_date = s.replace('-', '.') 34 | row_date = row_date.replace('/', '.') 35 | parts = row_date.split('.') 36 | s_date = datetime.datetime(day=int(parts[0]), month=int(parts[1]), year=int(parts[2])) 37 | key = s_date.date().isoformat() 38 | return (key, row_date) 39 | -------------------------------------------------------------------------------- /correction_status.csv: -------------------------------------------------------------------------------- 1 | date,abbreviation_canton_and_fl,column 2 | 2020-12-25,FL,ncumul_conf 3 | 2021-06-08,ZG,ncumul_released 4 | 2021-06-22,NW,ncumul_conf 5 | 2021-06-29,BS,ncumul_conf 6 | 2021-06-27,ZG,ncumul_released 7 | 2021-06-30,NW,ncumul_conf 8 | 2021-07-02,ZG,ncumul_released 9 | 2021-07-05,ZG,ncumul_released 10 | 2021-07-05,BS,ncumul_released 11 | 2021-07-08,BS,ncumul_released 12 | 2021-07-14,BS,ncumul_released 13 | 2021-07-30,BS,ncumul_released 14 | 2021-08-19,ZG,ncumul_released 15 | 2021-08-20,ZG,ncumul_released 16 | 2021-09-03,ZG,ncumul_released 17 | 2021-10-01,ZG,ncumul_released 18 | 2021-10-04,SG,ncumul_deceased 19 | 2021-10-04,SG,ncumul_released 20 | 2021-10-22,ZG,ncumul_released 21 | 2021-10-24,ZG,ncumul_released 22 | 2021-11-05,ZG,ncumul_released 23 | 2021-11-07,ZG,ncumul_released 24 | 2021-11-12,ZG,ncumul_released 25 | 2022-02-17,UR,ncumul_deceased 26 | 2022-03-07,TI,ncumul_conf 27 | 2022-03-07,TI,ncumul_deceased 28 | 2022-04-10,FL,ncumul_released 29 | 2022-04-16,FL,ncumul_released 30 | 2022-05-30,FR,ncumul_released 31 | 2022-07-11,FR,ncumul_released 32 | 2022-08-16,NW,ncumul_conf 33 | 2022-09-05,NW,ncumul_conf 34 | 2022-11-16,NW,ncumul_conf 35 | 2023-01-25,BS,ncumul_released 36 | 2023-02-02,GE,ncumul_conf 37 | 2023-02-02,GE,ncumul_released 38 | 2023-02-02,GE,ncumul_deceased 39 | 2023-02-08,GE,ncumul_released 40 | 2023-02-08,GE,ncumul_deceased 41 | 2023-03-21,FL,ncumul_released 42 | 2023-03-29,FL,ncumul_released 43 | -------------------------------------------------------------------------------- /scrapers/scrape_fr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | import re 6 | from typing import Optional 7 | from io import 
StringIO 8 | import datetime 9 | import sys 10 | import scrape_common as sc 11 | from scrape_fr_common import get_fr_csv 12 | 13 | def trim_val(val: str) -> Optional[int]: 14 | if len(val) > 0: 15 | return int(re.sub(r'(\d+)\s+(\d+)', r'\1\2', val)) 16 | return None 17 | 18 | csv_url, csv_data, main_url = get_fr_csv() 19 | reader = csv.DictReader(StringIO(csv_data), delimiter=';') 20 | is_first = True 21 | 22 | for row in reader: 23 | if not is_first: 24 | print('-' * 10) 25 | is_first = False 26 | 27 | dd = sc.DayData(canton='FR', url=main_url) 28 | for key, val in row.items(): 29 | if sc.find(r'(Date).*', key): 30 | dd.datetime = val 31 | if sc.find(r'(Total cas av.r.s).*', key): 32 | dd.cases = trim_val(val) 33 | elif sc.find(r'(Personnes hospitalis.es).*', key): 34 | dd.hospitalized = trim_val(val) 35 | elif sc.find(r'(aux soins intensifs).*', key): 36 | dd.icu = trim_val(val) 37 | elif sc.find(r'(Total d.c.s).*', key): 38 | dd.deaths = trim_val(val) 39 | elif sc.find(r'(Total Sorties de l\'h.pital).*', key): 40 | dd.recovered = trim_val(val) 41 | 42 | assert dd 43 | assert dd.datetime 44 | print(dd) 45 | -------------------------------------------------------------------------------- /scrapers/scrape_be_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | from io import StringIO 6 | import scrape_common as sc 7 | 8 | 9 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html 10 | district_ids = { 11 | 241: 'Jura bernois', 12 | 242: 'Biel/Bienne', 13 | 243: 'Seeland', 14 | 244: 'Oberaargau', 15 | 245: 'Emmental', 16 | 246: 'Bern-Mittelland', 17 | 247: 'Thun', 18 | 248: 'Obersimmental-Saanen', 19 | 249: 'Frutigen-Niedersimmental', 20 | 250: 'Interlaken-Oberhasli', 21 | } 22 | 23 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit' 24 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/7_d_inzidenz_verwaltungskreis.csv' 25 | d = sc.download(csv_url, silent=True) 26 | reader = csv.DictReader(StringIO(d), delimiter=',') 27 | for row in reader: 28 | #dd = sc.DistrictData(district=district, canton='BE') 29 | district_id = int(row['bfs_nummer']) 30 | dd = sc.DistrictData(district=district_ids[district_id], canton='BE') 31 | dd.url = url 32 | dd.district_id = district_id 33 | dd.population = row['einwohnerzahl'] 34 | date = sc.date_from_text(row['datum']) 35 | week = date.isocalendar()[1] 36 | dd.week = week 37 | dd.year = date.year 38 | dd.new_cases = round(float(row['7_d_inzidenz']) / 100e3 * int(row['einwohnerzahl'])) 39 | print(dd) 40 | -------------------------------------------------------------------------------- /.github/workflows/activate_scraper.yml: -------------------------------------------------------------------------------- 1 | name: Activate a scraper 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | canton: 7 | description: 'Abbreviation of Canton' 8 | required: true 9 | 10 | jobs: 11 | activate_scraper: 12 | runs-on: ubuntu-20.04 13 | timeout-minutes: 10 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Activate scraper 19 | env: 20 | CANTON: ${{ github.event.inputs.canton }} 21 | run: | 22 | sed -e "/- $CANTON/I s/^#*//" -i ./.github/workflows/run_scrapers.yml 23 | 24 | - name: Commit and push to repo 25 | env: 26 | GHA_DEPLOY_KEY: ${{ secrets.GHA_DEPLOY_KEY }} 27 | CANTON: ${{ github.event.inputs.canton }} 28 | run: | 29 | if ! 
git diff --no-ext-diff --quiet --exit-code; then 30 | git add . 31 | git config --local user.email "scraper@open.zh.ch" 32 | git config --local user.name "GitHub Action Scraper" 33 | git commit -a -m "Activate $CANTON scraper" 34 | git remote set-url origin "$(git config --get remote.origin.url | sed 's#http.*com/#git@github.com:#g')" 35 | eval `ssh-agent -t 60 -s` 36 | echo "$GHA_DEPLOY_KEY" | ssh-add - 37 | mkdir -p ~/.ssh/ 38 | ssh-keyscan github.com >> ~/.ssh/known_hosts 39 | git push 40 | ssh-agent -k 41 | else 42 | echo "Nothing to commit." 43 | fi 44 | -------------------------------------------------------------------------------- /.github/workflows/deactivate_scraper.yml: -------------------------------------------------------------------------------- 1 | name: Deactivate a scraper 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | canton: 7 | description: 'Abbreviation of Canton' 8 | required: true 9 | 10 | jobs: 11 | deactivate_scraper: 12 | runs-on: ubuntu-20.04 13 | timeout-minutes: 10 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Deactivate scraper 19 | env: 20 | CANTON: ${{ github.event.inputs.canton }} 21 | run: | 22 | sed -e "/- $CANTON/I s/^#*/#/" -i ./.github/workflows/run_scrapers.yml 23 | 24 | - name: Commit and push to repo 25 | env: 26 | GHA_DEPLOY_KEY: ${{ secrets.GHA_DEPLOY_KEY }} 27 | CANTON: ${{ github.event.inputs.canton }} 28 | run: | 29 | if ! git diff --no-ext-diff --quiet --exit-code; then 30 | git add . 31 | git config --local user.email "scraper@open.zh.ch" 32 | git config --local user.name "GitHub Action Scraper" 33 | git commit -a -m "Deactivate $CANTON scraper" 34 | git remote set-url origin "$(git config --get remote.origin.url | sed 's#http.*com/#git@github.com:#g')" 35 | eval `ssh-agent -t 60 -s` 36 | echo "$GHA_DEPLOY_KEY" | ssh-add - 37 | mkdir -p ~/.ssh/ 38 | ssh-keyscan github.com >> ~/.ssh/known_hosts 39 | git push 40 | ssh-agent -k 41 | else 42 | echo "Nothing to commit." 
43 | fi 44 | -------------------------------------------------------------------------------- /scrapers/scrape_ge_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | 6 | from selenium import webdriver 7 | from selenium.webdriver.chrome.options import Options 8 | 9 | import scrape_common as sc 10 | import scrape_ge_common as sgc 11 | 12 | 13 | chrome_options = Options() 14 | chrome_options.add_argument("--headless") 15 | driver = webdriver.Chrome(options=chrome_options) 16 | driver.implicitly_wait(5) 17 | 18 | url = 'https://infocovid.smc.unige.ch/' 19 | driver.get(url) 20 | elem = driver.find_element_by_link_text('Graphiques') 21 | elem.click() 22 | elem = driver.find_element_by_partial_link_text('Tests') 23 | elem.click() 24 | xls_url = sgc.get_link_from_element(driver, 'save_plot_nombre_tests_data') 25 | assert xls_url, "Couldn't find tests XLS url" 26 | 27 | xls = sc.xlsdownload(xls_url, silent=True) 28 | rows = sc.parse_xls(xls, header_row=0, enable_float=True) 29 | for row in rows: 30 | td = sc.TestData(canton='GE', url=url) 31 | res = re.search(r'(\d{2})-(\d{2})', row['week_res']) 32 | assert res, f"failed to extract year and week from {row['week_res']}" 33 | td.week = int(res[2]) 34 | td.year = f'20{res[1]}' 35 | td.positive_tests = int(row['positifs']) 36 | td.negative_tests = int(row['négatifs']) 37 | td.total_tests = int(row['total']) 38 | # 2020-02/03 values are empty 39 | td.positivity_rate = 0 40 | if row['ratio']: 41 | td.positivity_rate = float(row['ratio']) 42 | print(td) 43 | -------------------------------------------------------------------------------- /scrapers/test_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to run all scrapers 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | function cleanup { 9 | exit $? 10 | } 11 | trap "cleanup" EXIT 12 | 13 | DIR="$(cd "$(dirname "$0")" && pwd)" 14 | NEWLINE=$'\n' 15 | 16 | echo "Run all scrapers..." 17 | 18 | exit_code=0 19 | errors='' 20 | for scrape_script in $DIR/scrape_??.py 21 | do 22 | if [ -f $scrape_script -a -x $scrape_script ] 23 | then 24 | name=`basename $scrape_script` 25 | canton=${name:7:2} 26 | export SCRAPER_KEY=${canton^^} 27 | echo "" 28 | echo "Running ${SCRAPER_KEY} scraper..." 29 | echo "==========================================" 30 | 31 | set +e 32 | $DIR/run_scraper.sh 33 | ret=$? 34 | if [ $ret -ne 0 ] 35 | then 36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2 37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret" 38 | exit_code=1 39 | fi 40 | $DIR/validate_scraper_output.sh 41 | ret=$? 42 | if [ $ret -ne 0 ] 43 | then 44 | echo "ERROR: Validation for ${SCRAPER_KEY} failed with exit code $ret. continue." 
>&2 45 | errors=$"${errors}${NEWLINE}ERROR: Validation for ${SCRAPER_KEY} failed with exit code $ret" 46 | exit_code=1 47 | fi 48 | set -e 49 | 50 | echo "==========================================" 51 | echo "" 52 | fi 53 | done 54 | 55 | echo "$errors" 56 | exit $exit_code 57 | -------------------------------------------------------------------------------- /scrapers/scrape_lu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | from bs4 import BeautifulSoup 5 | import scrape_common as sc 6 | 7 | 8 | base_url = 'https://www.lustat.ch' 9 | url = f'{base_url}/daten?id=28177' 10 | d = sc.download(url, silent=True) 11 | soup = BeautifulSoup(d, 'html.parser') 12 | 13 | xls_url = soup.find('a', href=re.compile(r'.*\.xlsx')).get('href') 14 | if not xls_url.startswith('http'): 15 | xls_url = f'{base_url}{xls_url}' 16 | xls = sc.xlsdownload(xls_url, silent=True) 17 | rows = sc.parse_xls(xls, header_row=5) 18 | total_cases = 0 19 | total_deaths = 0 20 | is_first = True 21 | for row in rows: 22 | dd = sc.DayData(canton='LU', url=xls_url) 23 | dd.datetime = row['Datum'] 24 | dd.cases = sc.int_or_word(row.search(r'Neue\s+Fälle')) 25 | if dd.cases: 26 | total_cases += dd.cases 27 | dd.cases = total_cases 28 | dd.deaths = sc.int_or_word(row['Verstorbene']) 29 | if dd.deaths: 30 | total_deaths += dd.deaths 31 | dd.deaths = total_deaths 32 | dd.hospitalized = sc.int_or_word(row['Total']) 33 | dd.vent = sc.int_or_word(row.search(r'davon\s+beatmet')) 34 | dd.isolated = sc.int_or_word(row.search(r'in\s+Isolation')) 35 | dd.quarantined = sc.int_or_word(row.search(r'in\s+Quarantäne')) 36 | dd.quarantine_riskareatravel = sc.int_or_word(row.search(r'Reiserückkehrer\s+in\s+Quarantäne')) 37 | if dd.cases is None and dd.datetime == '31.12.2022': 38 | continue 39 | if dd: 40 | if not is_first: 41 | print('-' * 10) 42 | is_first = False 43 | print(dd) 44 | -------------------------------------------------------------------------------- /scrapers/scrape_sg_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import scrape_common as sc 6 | 7 | inhabitants = { 8 | 'St.Gallen': 127198, 9 | 'Rorschach': 44110, 10 | 'Rheintal': 74580, 11 | 'Werdenberg': 40239, 12 | 'Sarganserland': 41736, 13 | 'See-Gaster': 76913, 14 | 'Toggenburg': 47272, 15 | 'Wil': 77018, 16 | } 17 | 18 | district_ids = { 19 | 'St.Gallen': 1721, 20 | 'Rorschach': 1722, 21 | 'Rheintal': 1723, 22 | 'Werdenberg': 1724, 23 | 'Sarganserland': 1725, 24 | 'See-Gaster': 1726, 25 | 'Toggenburg': 1727, 26 | 'Wil': 1728, 27 | } 28 | 29 | url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Faelle_download.csv' 30 | d = sc.download(url, silent=True) 31 | 32 | # strip the "header" / description lines 33 | d = "\n".join(d.split("\n")[5:]) 34 | 35 | reader = csv.DictReader(StringIO(d), delimiter=';') 36 | for row in reader: 37 | week = sc.find(r'W(\d+)', row['Kalenderwoche']) 38 | date = sc.date_from_text(row['Falldatum']) 39 | 40 | for key, value in inhabitants.items(): 41 | dd = sc.DistrictData(canton='SG', district=key) 42 | dd.url = url 43 | dd.week = week 44 | dd.year = date.year 45 | dd.date = date.isoformat() 46 | dd.district_id = district_ids[key] 47 | dd.new_cases = row['Wahlkreis ' + key] 48 | dd.total_cases = row['Wahlkreis ' + key + ' (kumuliert)'] 49 | 
dd.population = value 50 | print(dd) 51 | -------------------------------------------------------------------------------- /scrapers/scrape_bs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | from io import StringIO 6 | import scrape_common as sc 7 | 8 | d_csv = sc.download('https://data.bs.ch/explore/dataset/100073/download/?format=csv&timezone=Europe/Zurich&lang=en&use_labels_for_header=false&csv_separator=,', silent=True) 9 | 10 | reader = csv.DictReader(StringIO(d_csv), delimiter=',') 11 | is_first = True 12 | for row in reader: 13 | if not row['ncumul_conf']: 14 | continue 15 | if not is_first: 16 | print('-' * 10) 17 | is_first = False 18 | dd = sc.DayData(canton='BS', url=row['source']) 19 | dd.datetime = f"{row['date']} {row['time']}" 20 | dd.cases = sc.safeint(row['ncumul_conf']) 21 | dd.new_hosp = row['new_hosp'] 22 | dd.hospitalized = row['current_hosp'] 23 | dd.icu = row['current_icu'] 24 | dd.vent = row['current_vent'] 25 | dd.recovered = row['ncumul_released'] 26 | dd.deaths = row['ncumul_deceased'] 27 | dd.isolated = row['current_isolated'] 28 | dd.quarantined = row['current_quarantined'] 29 | dd.confirmed_non_resident = row['ncumul_confirmed_non_resident'] 30 | dd.hosp_non_resident = row['current_hosp_non_resident'] 31 | dd.quarantine_riskareatravel = row['current_quarantined_riskareatravel'] 32 | dd.quarantine_total = row['current_quarantined_total'] 33 | dd.hosp_resident = row['current_hosp_resident'] 34 | 35 | # TODO: remove if source is fixed 36 | # BS corrected data on 2021-03-01 without adapting their time series 37 | if row['date'] in ('2021-02-27', '2021-02-28'): 38 | dd.cases = '' 39 | dd.recovered = '' 40 | print(dd) 41 | -------------------------------------------------------------------------------- /scrapers/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # This is a simple wrapper around curl or wget, that can also be used to 4 | # save downloaded pages for archival purposes, as well for feeding fake 5 | # (test) data to the scrapers. 6 | 7 | # echo "DOWNLOADING:" "$@" >&2 8 | 9 | #WEBARCHIVE_SNAPSHOT=1 10 | 11 | if [ "x${WEBARCHIVE_SNAPSHOT}" != "x" ]; then 12 | # Note: JSON only allows strings in double quotes. 13 | ( 14 | echo "$(date --iso-8601=seconds)" "Snapshoting: $1" 15 | W=$(curl -X POST -H "Content-Type: application/json" --data-raw "{\"url\": \"$1\", \"annotation\": {\"id\": \"lst-ib\", \"message\": \"openZH covid_19 github archiving\"}}" "https://pragma.archivelab.org/" 2>&1) 16 | echo "Response:" 17 | echo "${W}" 18 | ) >> webarchiveorg.log 19 | fi 20 | 21 | if which curl >/dev/null; then 22 | # Few sites, like GL, JU, SZ don't like curl, and return 403, or block site completly per-IP. 23 | # --output -, because curl, doesn't like to pipe binary files sometimes. 24 | exec curl -k --silent --output - --user-agent "Mozilla Firefox Mozilla/5.0; openZH covid_19 at github" "$@" 25 | exit 1 26 | fi 27 | 28 | if which wget >/dev/null; then 29 | # Few sites, like GL, JU, SZ don't like curl, and return 403, or block site completly per-IP. 30 | exec wget --output-document=- --quiet --user-agent="Mozilla Firefox Mozilla/5.0; openZH covid_19 at github" "$@" 31 | exit 1 32 | fi 33 | 34 | if which GET >/dev/null; then 35 | # Few sites, like GL, JU, SZ don't like curl, and return 403, or block site completly per-IP. 
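# Illustrative usage (assumption; sketch only, the URL below is made up):
#   ./download.sh "https://www.example.ch/corona.html" > page.html
# All three branches (curl, wget, GET) write the fetched page to stdout, so callers
# can treat this wrapper the same regardless of which tool is installed.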
36 | exec GET "$@" 37 | exit 1 38 | fi 39 | 40 | echo "$0: No curl, wget or GET found. Install curl (recommended), or wget." >&2 41 | exit 2 42 | -------------------------------------------------------------------------------- /scrapers/scrape_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import scrape_common as sc 4 | import sys 5 | import re 6 | 7 | 8 | # download latest PDF 9 | pdf_url = 'https://www.bag.admin.ch/dam/bag/de/dokumente/mt/k-und-i/aktuelle-ausbrueche-pandemien/2019-nCoV/covid-19-woechentlicher-lagebericht.pdf.download.pdf/BAG_COVID-19_Woechentliche_Lage.pdf' 10 | d = sc.pdfdownload(pdf_url, raw=True, silent=True) 11 | 12 | """ 13 | Coronavirus-Krankheit-2019 (COVID-19) 14 | Eidgenssisches Departement des Innern EDI 15 | Bundesamt fr Gesundheit BAG 16 | Direktionsbereich <96>ffentliche Gesundheit 17 | Situationsbericht zur epidemiologischen Lage in der Schweiz 18 | und im Frstentum Liechtenstein - Woche 28 (06.-12.07.2020) 19 | """ 20 | 21 | datetime = sc.find(r'Liechtenstein - Woche .*(\d{2}\.\d{2}\.\d{4})\)', d) 22 | 23 | """ 24 | Canton, tests of previous-week then current-week 25 | 26 | AG 5478 3588 808 529 1.3 1.8 27 | AI 96 55 595 341 0.0 0.0 28 | AR 391 249 708 451 0.5 1.2 29 | BE 6924 4652 669 449 0.4 0.9 30 | ... 31 | """ 32 | start = d.find('Anzahl PCR-Tests in der Schweiz') 33 | if start > 0: 34 | start = d.find('\nAG ', start) 35 | else: 36 | start = 0 37 | end = d.find('Tabelle 4. Durchgeführte Tests nach Kalenderwoche', start) 38 | if start > 0 and end > start: 39 | tests_table = d[start:end] 40 | for line in tests_table.splitlines(): 41 | canton = sc.find(r'^([A-Z][A-Z]) ', line) 42 | if canton is not None: 43 | dd = sc.DayData(canton=canton, url=pdf_url) 44 | dd.datetime = datetime 45 | dd.tested = sc.find(r'^[A-Z][A-Z] \d+ (\d+)', line) 46 | print('-' * 10) 47 | print(dd) 48 | 49 | -------------------------------------------------------------------------------- /scripts/new2oldcsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script convert CSV files from the new to the old structure 4 | 5 | import csv 6 | import sys 7 | import traceback 8 | 9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 10 | 11 | try: 12 | filename = sys.argv[1] 13 | rows = [] 14 | with open(filename, 'r') as f: 15 | dr = csv.DictReader(f) 16 | for r in dr: 17 | # map old to new structure 18 | data = { 19 | 'date': r['date'], 20 | 'time': r['time'], 21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'], 22 | 'ncumul_tested': r['ncumul_tested'], 23 | 'ncumul_conf': r['ncumul_conf'], 24 | 'ncumul_hosp': r['current_hosp'], 25 | 'ncumul_ICU': r['current_icu'], 26 | 'ncumul_vent': r['current_vent'], 27 | 'ncumul_released': r['ncumul_released'], 28 | 'ncumul_deceased': r['ncumul_deceased'], 29 | 'source': r['source'], 30 | } 31 | # re-add extra columns 32 | for col in dr.fieldnames[15:]: 33 | data[col] = r[col] 34 | rows.append(data) 35 | 36 | writer = csv.DictWriter( 37 | sys.stdout, 38 | rows[0].keys(), 39 | delimiter=',', 40 | quotechar='"', 41 | lineterminator='\n', 42 | quoting=csv.QUOTE_MINIMAL 43 | ) 44 | writer.writeheader() 45 | writer.writerows(rows) 46 | except Exception as e: 47 | print("Error: %s" % e, file=sys.stderr) 48 | print(traceback.format_exc(), file=sys.stderr) 49 | sys.exit(1) 50 | finally: 51 | sys.stdout.flush() 52 | 
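# Example invocation (assumed; scripts/transform_all_new2old.sh presumably drives the
# same conversion over all canton files):
#   ./scripts/new2oldcsv.py fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_Kanton_ZH_total.csv > /tmp/ZH_old_format.csv
# The converted CSV is written to stdout; only the column mapping above changes, the
# row values themselves are passed through untouched.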
-------------------------------------------------------------------------------- /scrapers/validate_scrapers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | import subprocess 5 | import sys 6 | import os 7 | from scrape_matrix import matrix 8 | 9 | __location__ = os.path.realpath( 10 | os.path.join( 11 | os.getcwd(), 12 | os.path.dirname(__file__) 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | all_features = ['Confirmed cases', 'Deaths', 'Released', 'Hospitalized', 'ICU', 'Vent'] 19 | has_issue = False 20 | for canton, features in matrix.items(): 21 | print(canton) 22 | scraper = f'{__location__}/scrape_{canton.lower()}.py' 23 | if not os.access(scraper, os.X_OK): 24 | print(f"{scraper} is not executable; skipping") 25 | continue 26 | result = subprocess.run([scraper], stdout=subprocess.PIPE) 27 | output = re.sub('----------\n$', '', result.stdout.decode('utf-8')).split('----------\n')[-1] 28 | for feature in features: 29 | if feature == 'Released': 30 | feature = r'(:?Released|Recovered)' 31 | matches = re.search(f'{feature}: (.+)', output) 32 | if matches is None or matches[1].startswith('None'): 33 | has_issue = True 34 | print(f"missing {feature} for {canton}") 35 | for feature in all_features: 36 | if feature not in features: 37 | if feature == 'Released': 38 | feature = r'(:?Released|Recovered)' 39 | if re.search(f'{feature}:', output) is not None: 40 | has_issue = True 41 | print(f"{feature} is present for {canton} but not listed in feature matrix") 42 | 43 | if has_issue: 44 | sys.exit(1) 45 | -------------------------------------------------------------------------------- /scrapers/scrape_ge_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import time 6 | from bs4 import BeautifulSoup 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | import scrape_common as sc 11 | 12 | 13 | def get_latest_ge_weekly_pdf_url(): 14 | return get_ge_weekly_pdf_urls()[0] 15 | 16 | 17 | def get_ge_weekly_pdf_urls(): 18 | d = sc.download('https://www.ge.ch/document/covid-19-bilan-epidemiologique-hebdomadaire', silent=True) 19 | soup = BeautifulSoup(d, 'html.parser') 20 | links = soup.find_all('a', title=re.compile(r"\.pdf$")) 21 | result = [] 22 | for link in links: 23 | pdf_url = link.get('href') 24 | assert pdf_url, "pdf URL is empty" 25 | if not pdf_url.startswith('http'): 26 | pdf_url = f'https://www.ge.ch{pdf_url}' 27 | if pdf_url not in result: 28 | result.append(pdf_url) 29 | return result 30 | 31 | 32 | class element_has_link(object): 33 | def __init__(self, locator): 34 | self.locator = locator 35 | 36 | def __call__(self, driver): 37 | element = driver.find_element(*self.locator) # Finding the referenced element 38 | if element.get_attribute('href'): 39 | return element 40 | else: 41 | return False 42 | 43 | 44 | def get_link_from_element(driver, element_id): 45 | # the xls download links do not appear immediately for some reason 46 | # add some delay to get it. 
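# Note: WebDriverWait.until() repeatedly evaluates the element_has_link condition
# defined above (every 0.5 s by default) and raises
# selenium.common.exceptions.TimeoutException if the element still has no href
# attribute after the 30 second timeout.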
47 | wait = WebDriverWait(driver, 30) 48 | elem = wait.until(element_has_link((By.ID, element_id))) 49 | url = elem.get_attribute('href') 50 | 51 | return url 52 | -------------------------------------------------------------------------------- /scrapers/populate_district_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script creates a new sqlite database based on the CSV is reiceives as an argument 4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs 5 | 6 | import sqlite3 7 | import traceback 8 | import os 9 | import sys 10 | import db_common as dc 11 | 12 | 13 | __location__ = dc.get_location() 14 | 15 | try: 16 | # load the csv to sqlite db 17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 18 | columns, to_db = dc.load_csv(sys.argv[1]) 19 | 20 | # create db 21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite') 22 | conn = sqlite3.connect(DATABASE_NAME) 23 | c = conn.cursor() 24 | c.execute('DROP TABLE IF EXISTS data') 25 | c.execute( 26 | ''' 27 | CREATE TABLE IF NOT EXISTS data ( 28 | DistrictId integer NOT NULL, 29 | District text NOT NULL, 30 | Canton text NOT NULL, 31 | Date text NOT NULL, 32 | Week text NOT NULL, 33 | Year text NOT NULL, 34 | Population integer, 35 | TotalConfCases integer, 36 | NewConfCases integer, 37 | TotalDeaths integer, 38 | NewDeaths integer, 39 | SourceUrl text, 40 | UNIQUE(DistrictId, District, Canton, Date, Week, Year) 41 | ) 42 | ''' 43 | ) 44 | 45 | # add entries 46 | query = dc.insert_db_query(columns) 47 | c.executemany(query, to_db) 48 | conn.commit() 49 | except Exception as e: 50 | print("Error: %s" % e, file=sys.stderr) 51 | print(traceback.format_exc(), file=sys.stderr) 52 | sys.exit(1) 53 | finally: 54 | conn.close() 55 | -------------------------------------------------------------------------------- /scrapers/scrape_so_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import scrape_common as sc 6 | 7 | url = 'https://corona.so.ch/bevoelkerung/daten/fallzahlen-nach-gemeinden/' 8 | d = sc.download(url, silent=True) 9 | 10 | date = sc.find(r'Stand (\d+\.\d+\.20\d{2})', d) 11 | date = sc.date_from_text(date) 12 | 13 | population = { 14 | 'Solothurn': 16933, 15 | 'Bucheggberg': 7954, 16 | 'Dorneck': 20678, 17 | 'Gäu': 21605, 18 | 'Gösgen': 24536, 19 | 'Lebern': 24536, 20 | 'Olten': 55686, 21 | 'Thal': 14785, 22 | 'Thierstein': 14747, 23 | 'Wasseramt': 52134, 24 | } 25 | 26 | district_ids = { 27 | 'Solothurn': 1109, 28 | 'Bucheggberg': 1103, 29 | 'Dorneck': 1104, 30 | 'Gäu': 1101, 31 | 'Gösgen': 1105, 32 | 'Lebern': 1107, 33 | 'Olten': 1108, 34 | 'Thal': 1102, 35 | 'Thierstein': 1110, 36 | 'Wasseramt': 1106, 37 | } 38 | 39 | 40 | def strip_so_number(value): 41 | value = value.replace('\'', '') 42 | value = value.replace('^', '') 43 | return int(value) 44 | 45 | 46 | soup = BeautifulSoup(d, 'html.parser') 47 | for district, d_id in district_ids.items(): 48 | table = soup.find(text=district).find_next('table') 49 | tr = table.find('strong', text='Total').find_parent('tr') 50 | tds = tr.find_all('td') 51 | assert tds[0].text == 'Total', f'Expected "Total" row, got {tds[0].text}' 52 | dd = sc.DistrictData(canton='SO', district=district) 53 | dd.url = url 54 | dd.date = date.isoformat() 55 | dd.population = strip_so_number(tds[1].text) 56 | dd.district_id = 
d_id 57 | dd.total_cases = strip_so_number(tds[2].text) 58 | dd.new_cases = int(tds[3].text) 59 | print(dd) 60 | -------------------------------------------------------------------------------- /scrapers/scrape_sh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import re 6 | from bs4 import BeautifulSoup 7 | import scrape_common as sc 8 | import scrape_sh_common as shc 9 | 10 | main_url, xls = shc.get_sh_xlsx() 11 | 12 | rows = sc.parse_xls(xls, header_row=0) 13 | is_first = True 14 | for row in rows: 15 | if not isinstance(row['Datum'], datetime.datetime): 16 | continue 17 | if not (row['Positiv'] or row.search(r'Hospitalisation isoliert\s+bestätigt') or row.search(r'Hospitalisation\s+intensiv.*$') or row['Verstorben']): 18 | continue 19 | 20 | if not is_first: 21 | print('-' * 10) 22 | is_first = False 23 | 24 | dd = sc.DayData(canton='SH', url=main_url) 25 | dd.datetime = row['Datum'].date().isoformat() 26 | dd.cases = row['Positiv'] 27 | 28 | if sc.represents_int(row.search(r'Hospitalisation isoliert\s+bestätigt')) and sc.represents_int(row.search(r'Hospitalisation\s+intensiv.*$')): 29 | dd.hospitalized = row.search(r'Hospitalisation isoliert\s+bestätigt') + row.search(r'Hospitalisation\s+intensiv.*$') 30 | dd.icu = row.search(r'Hospitalisation\s+intensiv.*$') 31 | if row['Verstorben'] is not None: 32 | dd.deaths = row['Verstorben'] 33 | 34 | isolated = row.search(r'Anzahl Personen\s+in Isolation.*') 35 | if isolated is not None: 36 | dd.isolated = isolated 37 | quarantined = row.search(r'Anzahl Personen\s+in Quarantäne\s+.*Kontaktpersonen.*') 38 | if quarantined is not None: 39 | dd.quarantined = quarantined 40 | quarantined_risk = row.search(r'Anzahl Personen\s+in Quarantäne\s+.*Rückkehr.*Risikoländer.*') 41 | if quarantined_risk is not None: 42 | dd.quarantine_riskareatravel = quarantined_risk 43 | 44 | print(dd) 45 | -------------------------------------------------------------------------------- /scripts/old2newcsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script convert CSV files from the old to the new structure 4 | 5 | import csv 6 | import sys 7 | import traceback 8 | 9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 10 | 11 | try: 12 | filename = sys.argv[1] 13 | rows = [] 14 | with open(filename, 'r') as f: 15 | dr = csv.DictReader(f) 16 | for r in dr: 17 | # map old to new structure 18 | data = { 19 | 'date': r['date'], 20 | 'time': r['time'], 21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'], 22 | 'ncumul_tested': r['ncumul_tested'], 23 | 'ncumul_conf': r['ncumul_conf'], 24 | 'new_hosp': '', 25 | 'current_hosp': r['ncumul_hosp'], 26 | 'current_icu': r['ncumul_ICU'], 27 | 'current_vent': r['ncumul_vent'], 28 | 'ncumul_released': r['ncumul_released'], 29 | 'ncumul_deceased': r['ncumul_deceased'], 30 | 'source': r['source'], 31 | 'current_isolated': '', 32 | 'current_quarantined': '', 33 | } 34 | # re-add extra columns 35 | for col in dr.fieldnames[11:]: 36 | data[col] = r[col] 37 | rows.append(data) 38 | 39 | writer = csv.DictWriter( 40 | sys.stdout, 41 | rows[0].keys(), 42 | delimiter=',', 43 | quotechar='"', 44 | lineterminator='\n', 45 | quoting=csv.QUOTE_MINIMAL 46 | ) 47 | writer.writeheader() 48 | writer.writerows(rows) 49 | except Exception as e: 50 | print("Error: %s" % e, file=sys.stderr) 51 | print(traceback.format_exc(), 
file=sys.stderr) 52 | sys.exit(1) 53 | finally: 54 | sys.stdout.flush() 55 | -------------------------------------------------------------------------------- /scrapers/scrape_bs_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import scrape_common as sc 6 | 7 | 8 | def prettify_positivity_rate(positivity_rate): 9 | if not positivity_rate: 10 | return None 11 | return round(10 * float(positivity_rate)) / 10 12 | 13 | 14 | url = 'https://data.bs.ch/explore/dataset/100094/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B' 15 | data = sc.download(url, silent=True) 16 | 17 | reader = csv.DictReader(StringIO(data), delimiter=';') 18 | for row in reader: 19 | td = sc.TestData(canton='BS', url=url) 20 | td.start_date = row['Datum'] 21 | td.end_date = row['Datum'] 22 | td.positive_tests = row['Positive Tests'] or None 23 | td.negative_tests = row['Negative Tests'] or None 24 | td.total_tests = row['Total Tests'] or None 25 | td.positivity_rate = row['Anteil positive Tests in Prozent'] or None 26 | 27 | td.pcr_positive_tests = row['Positive PCR Tests'] or None 28 | td.pcr_negative_tests = row['Negative PCR Tests'] or None 29 | td.pcr_total_tests = row['Total PCR Tests'] or None 30 | td.pcr_positivity_rate = row['Anteil positive PCR Tests in Prozent'] or None 31 | 32 | td.ag_positive_tests = row['Positive Antigen Schnelltests'] or None 33 | td.ag_negative_tests = row['Negative Antigen Schnelltests'] or None 34 | td.ag_total_tests = row['Total Antigen Schnelltests'] or None 35 | td.ag_positivity_rate = row['Anteil positive Antigen Schnelltests in Prozent'] or None 36 | 37 | if td: 38 | td.positivity_rate = prettify_positivity_rate(td.positivity_rate) 39 | td.pcr_positivity_rate = prettify_positivity_rate(td.pcr_positivity_rate) 40 | td.ag_positivity_rate = prettify_positivity_rate(td.ag_positivity_rate) 41 | print(td) 42 | -------------------------------------------------------------------------------- /scrapers/scrape_be.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import csv 4 | from io import StringIO 5 | import re 6 | import scrape_common as sc 7 | 8 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit' 9 | 10 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/total_faelle.csv' 11 | d = sc.download(csv_url, silent=True) 12 | reader = csv.DictReader(StringIO(d), delimiter=',') 13 | is_first = True 14 | for row in reader: 15 | if not is_first: 16 | print('-' * 10) 17 | is_first = False 18 | 19 | dd = sc.DayData(canton='BE', url=url) 20 | dd.datetime = row['datum'] 21 | dd.cases = row['total_laborbestaetigte_faelle'] 22 | dd.deaths = row['total_todesfaelle'] 23 | print(dd) 24 | 25 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/spa_auslastung.csv' 26 | d = sc.download(csv_url, silent=True) 27 | reader = csv.DictReader(StringIO(d), delimiter=',') 28 | is_first = True 29 | for row in reader: 30 | if not is_first: 31 | print('-' * 10) 32 | is_first = False 33 | 34 | dd = sc.DayData(canton='BE', url=url) 35 | dd.datetime = row['datum'] 36 | dd.hospitalized = row['personen_hospitalisiert'] 37 | dd.vent = int(row['auf_intensivpflegestation_beatmet']) 38 | dd.icu = int(row['auf_intensivpflegestation_unbeatmet']) + dd.vent 39 | print(dd) 40 | 41 | csv_url = 
'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/contact_tracing.csv' 42 | d = sc.download(csv_url, silent=True) 43 | reader = csv.DictReader(StringIO(d), delimiter=',') 44 | is_first = True 45 | for row in reader: 46 | if not is_first: 47 | print('-' * 10) 48 | is_first = False 49 | 50 | dd = sc.DayData(canton='BE', url=url) 51 | dd.datetime = row['datum'] 52 | dd.quarantined = row['personen_in_quarantaene'] 53 | dd.isolated = row['personen_in_isolation'] 54 | print(dd) 55 | -------------------------------------------------------------------------------- /.github/workflows/test_tests_scraper.yml: -------------------------------------------------------------------------------- 1 | name: Test run of tests scrapers 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'scrapers/*tests*' 8 | - 'scrapers/parse_scrape_output.py' 9 | - 'scrapers/populate_tests_database.py' 10 | - 'scrapers/run_tests_scraper.sh' 11 | - 'scrapers/scrape_dates.py' 12 | - 'scrapers/scrape_matrix.py' 13 | - 'scrapers/validate_scraper*' 14 | - 'scrapers/*_common.py' 15 | - '!scrapers/*_districts.py' 16 | - '.github/workflows/**' 17 | pull_request: 18 | branches: [ master ] 19 | paths: 20 | - 'scrapers/*tests*' 21 | - 'scrapers/parse_scrape_output.py' 22 | - 'scrapers/populate_tests_database.py' 23 | - 'scrapers/run_tests_scraper.sh' 24 | - 'scrapers/scrape_dates.py' 25 | - 'scrapers/scrape_matrix.py' 26 | - 'scrapers/validate_scraper*' 27 | - 'scrapers/*_common.py' 28 | - '!scrapers/*_districts.py' 29 | - '.github/workflows/**' 30 | workflow_dispatch: ~ 31 | 32 | jobs: 33 | test_run: 34 | runs-on: ubuntu-20.04 35 | timeout-minutes: 10 36 | 37 | steps: 38 | - uses: actions/checkout@v3 39 | 40 | - name: Set up Python 3.7 41 | uses: actions/setup-python@v4 42 | with: 43 | python-version: 3.7 44 | 45 | - name: Remove broken apt repos 46 | run: | 47 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done 48 | 49 | - name: Install dependencies 50 | run: | 51 | npm ci 52 | python -m pip install --upgrade pip setuptools wheel 53 | pip install -r requirements.txt 54 | sudo apt update || true # do not fail if update does not work 55 | sudo apt-get install poppler-utils 56 | sudo apt-get install chromium-browser 57 | 58 | - name: Test run of all tests scrapers 59 | run: ./scrapers/test_tests_scraper.sh 60 | 61 | -------------------------------------------------------------------------------- /scrapers/scrape_sz_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | import scrape_common as sc 9 | 10 | url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948' 11 | content = sc.download(url, silent=True) 12 | soup = BeautifulSoup(content, 'html.parser') 13 | pdf_url = soup.find('a', text=re.compile(r'Coronafälle pro Gemeinde')).get('href') 14 | 15 | content = sc.pdfdownload(pdf_url, layout=True, silent=True) 16 | date = sc.find(r'Stand\W+(\d+\.\d+\.20\d{2})', content) 17 | date = sc.date_from_text(date).isoformat() 18 | district_data = re.findall(r'^Bezirk\W+(\w+)\s+(≤?\s?\d+)', content, re.MULTILINE) 19 | 20 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html 21 | district_ids = { 22 | 'Einsiedeln': 501, 23 | 'Gersau': 502, 24 | 'Höfe': 503, 25 | 'Küssnacht': 504, 26 | 'March': 505, 27 | 'Schwyz': 506, 28 | 
} 29 | 30 | # https://www.sz.ch/kanton/bezirke/schwyz.html/72-210-112-106 31 | population = { 32 | 'Einsiedeln': 16027, 33 | 'Gersau': 2314, 34 | 'Höfe': 29123, 35 | 'Küssnacht': 13270, 36 | 'March': 43528, 37 | 'Schwyz': 55390, 38 | } 39 | 40 | assert len(district_data) == len(district_ids), f'expected {len(district_ids)} districts available, but got {len(district_data)}: {district_data}' 41 | 42 | for district, total_cases in district_data: 43 | assert district in district_ids, f'District {district} is unknown' 44 | 45 | dd = sc.DistrictData(canton='SZ', district=district) 46 | dd.url = pdf_url 47 | dd.district_id = district_ids[district] 48 | dd.population = population[district] 49 | dd.date = date 50 | # skip total_cases for ≤ entries 51 | if not sc.find(r'(≤)', total_cases): 52 | dd.total_cases = total_cases 53 | print(dd) 54 | -------------------------------------------------------------------------------- /.github/workflows/test_district_scraper.yml: -------------------------------------------------------------------------------- 1 | name: Test run of district scrapers 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'scrapers/*_districts*' 8 | - 'scrapers/parse_scrape_output.py' 9 | - 'scrapers/populate_district_database.py' 10 | - 'scrapers/run_district_scraper.sh' 11 | - 'scrapers/scrape_dates.py' 12 | - 'scrapers/scrape_matrix.py' 13 | - 'scrapers/validate_scraper*' 14 | - 'scrapers/*_common.py' 15 | - '!scrapers/*_tests.py' 16 | - '.github/workflows/**' 17 | pull_request: 18 | branches: [ master ] 19 | paths: 20 | - 'scrapers/*_districts*' 21 | - 'scrapers/parse_scrape_output.py' 22 | - 'scrapers/populate_district_database.py' 23 | - 'scrapers/run_district_scraper.sh' 24 | - 'scrapers/scrape_dates.py' 25 | - 'scrapers/scrape_matrix.py' 26 | - 'scrapers/validate_scraper*' 27 | - 'scrapers/*_common.py' 28 | - '!scrapers/*_tests.py' 29 | - '.github/workflows/**' 30 | workflow_dispatch: ~ 31 | 32 | jobs: 33 | test_run: 34 | runs-on: ubuntu-20.04 35 | timeout-minutes: 10 36 | 37 | steps: 38 | - uses: actions/checkout@v3 39 | 40 | - name: Set up Python 3.7 41 | uses: actions/setup-python@v4 42 | with: 43 | python-version: 3.7 44 | 45 | - name: Remove broken apt repos 46 | run: | 47 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done 48 | 49 | - name: Install dependencies 50 | run: | 51 | npm ci 52 | python -m pip install --upgrade pip setuptools wheel 53 | pip install -r requirements.txt 54 | pip install -r requirements-ocr.txt 55 | sudo apt update || true # do not fail if update does not work 56 | sudo apt-get install poppler-utils 57 | 58 | - name: Test run of all district scrapers 59 | run: ./scrapers/test_district_scraper.sh 60 | 61 | -------------------------------------------------------------------------------- /scrapers/populate_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script creates a new sqlite database based on the CSV is reiceives as an argument 4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs 5 | 6 | import sqlite3 7 | import traceback 8 | import os 9 | import sys 10 | import db_common as dc 11 | 12 | 13 | __location__ = dc.get_location() 14 | 15 | try: 16 | # load the csv to sqlite db 17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 18 | columns, to_db = dc.load_csv(sys.argv[1]) 19 | 20 | # create db 21 | DATABASE_NAME = os.path.join(__location__, 
'data.sqlite') 22 | conn = sqlite3.connect(DATABASE_NAME) 23 | c = conn.cursor() 24 | c.execute('DROP TABLE IF EXISTS data') 25 | c.execute( 26 | ''' 27 | CREATE TABLE IF NOT EXISTS data ( 28 | date text, 29 | time text, 30 | abbreviation_canton_and_fl text, 31 | ncumul_tested integer, 32 | ncumul_conf integer, 33 | new_hosp integer, 34 | current_hosp integer, 35 | current_icu integer, 36 | current_vent integer, 37 | ncumul_released integer, 38 | ncumul_deceased integer, 39 | source text, 40 | current_isolated integer, 41 | current_quarantined integer, 42 | UNIQUE(date, abbreviation_canton_and_fl) 43 | ) 44 | ''' 45 | ) 46 | # check if there are extra columns 47 | for col in columns[14:]: 48 | c.execute(f'ALTER TABLE data ADD COLUMN {col} integer;') 49 | 50 | # add entries 51 | query = dc.insert_db_query(columns) 52 | c.executemany(query, to_db) 53 | conn.commit() 54 | except Exception as e: 55 | print("Error: %s" % e, file=sys.stderr) 56 | print(traceback.format_exc(), file=sys.stderr) 57 | sys.exit(1) 58 | finally: 59 | conn.close() 60 | -------------------------------------------------------------------------------- /scrapers/scrape_gr_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import datetime 4 | import requests 5 | 6 | import scrape_common as sc 7 | 8 | inhabitants = { 9 | 'Albula': 8054, 10 | 'Bernina': 4613, 11 | 'Engiadina Bassa/Val Müstair': 9197, 12 | 'Imboden': 21293, 13 | 'Landquart': 25402, 14 | 'Maloja': 18184, 15 | 'Moesa': 8671, 16 | 'Plessur': 42446, 17 | 'Prättigau/Davos': 26089, 18 | 'Surselva': 21289, 19 | 'Viamala': 13783, 20 | } 21 | 22 | district_ids = { 23 | 'Albula': 1841, 24 | 'Bernina': 1842, 25 | 'Engiadina Bassa/Val Müstair': 1843, 26 | 'Imboden': 1844, 27 | 'Landquart': 1845, 28 | 'Maloja': 1846, 29 | 'Moesa': 1847, 30 | 'Plessur': 1848, 31 | 'Prättigau/Davos': 1849, 32 | 'Surselva': 1850, 33 | 'Viamala': 1851, 34 | } 35 | 36 | 37 | limit = '100' 38 | url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Pro_Region/FeatureServer/0/query?f=json&where=Datum%3E%3Dtimestamp%20%272020-02-01%2000%3A00%3A00%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Region%20asc&resultOffset=0&resultRecordCount=10000&resultType=standard&cacheHint=true' 39 | 40 | 41 | resp = requests.get(url=url) 42 | json_data = resp.json() 43 | 44 | for attributes in json_data['features']: 45 | element = attributes['attributes'] 46 | 47 | if element['Region'] in district_ids: 48 | dd = sc.DistrictData(canton='GR', district=element['Region']) 49 | dd.url = url 50 | date = datetime.datetime.utcfromtimestamp(element['Datum'] / 1000) 51 | dd.date = date.date().isoformat() 52 | dd.total_cases = element['Faelle__kumuliert_'] 53 | dd.new_cases = element['Neue_Faelle'] 54 | dd.total_deceased = element['Verstorbene__kumuliert_'] 55 | dd.new_deceased = element['Verstorbene'] 56 | dd.population = inhabitants[dd.district] 57 | dd.district_id = district_ids[dd.district] 58 | print(dd) 59 | -------------------------------------------------------------------------------- /scrapers/populate_tests_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script creates a new sqlite database based on the CSV is reiceives as an argument 4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs 5 | 6 | import sqlite3 7 | import traceback 8 | import os 
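# (Assumption) the CSV passed on the command line uses the same column names as the
# table created below, i.e. a header along the lines of
#   canton,start_date,end_date,week,year,positive_tests,negative_tests,total_tests,positivity_rate,source,...
# db_common.load_csv() returns those column names, and insert_db_query() builds the
# INSERT statement from them.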
9 | import sys 10 | import db_common as dc 11 | 12 | 13 | __location__ = dc.get_location() 14 | 15 | try: 16 | # load the csv to sqlite db 17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 18 | columns, to_db = dc.load_csv(sys.argv[1]) 19 | 20 | # create db 21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite') 22 | conn = sqlite3.connect(DATABASE_NAME) 23 | c = conn.cursor() 24 | c.execute('DROP TABLE IF EXISTS data') 25 | c.execute( 26 | ''' 27 | CREATE TABLE IF NOT EXISTS data ( 28 | canton text NOT NULL, 29 | start_date text NOT NULL, 30 | end_date text NOT NULL, 31 | week text NOT NULL, 32 | year text NOT NULL, 33 | positive_tests integer, 34 | negative_tests integer, 35 | total_tests integer, 36 | positivity_rate float, 37 | source text, 38 | pcr_positive_tests integer, 39 | pcr_negative_tests integer, 40 | pcr_total_tests integer, 41 | pcr_positivity_rate float, 42 | ag_positive_tests integer, 43 | ag_negative_tests integer, 44 | ag_total_tests integer, 45 | ag_positivity_rate float, 46 | UNIQUE(canton, start_date, end_date, week, year) 47 | ) 48 | ''' 49 | ) 50 | 51 | # add entries 52 | query = dc.insert_db_query(columns) 53 | c.executemany(query, to_db) 54 | conn.commit() 55 | except Exception as e: 56 | print("Error: %s" % e, file=sys.stderr) 57 | print(traceback.format_exc(), file=sys.stderr) 58 | sys.exit(1) 59 | finally: 60 | conn.close() 61 | -------------------------------------------------------------------------------- /scripts/add_new_columns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script convert CSV files from the new to the old structure 4 | 5 | import csv 6 | import sys 7 | import traceback 8 | 9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 10 | 11 | try: 12 | filename = sys.argv[1] 13 | rows = [] 14 | with open(filename, 'r') as f: 15 | dr = csv.DictReader(f) 16 | for r in dr: 17 | # map old to new structure 18 | data = { 19 | 'date': r['date'], 20 | 'time': r['time'], 21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'], 22 | 'ncumul_tested': r['ncumul_tested'], 23 | 'ncumul_conf': r['ncumul_conf'], 24 | 'new_hosp': r['new_hosp'], 25 | 'current_hosp': r['current_hosp'], 26 | 'current_icu': r['current_icu'], 27 | 'current_vent': r['current_vent'], 28 | 'ncumul_released': r['ncumul_released'], 29 | 'ncumul_deceased': r['ncumul_deceased'], 30 | 'source': r['source'], 31 | 'current_isolated': r.get('current_isolated', ''), 32 | 'current_quarantined': r.get('current_quarantined', ''), 33 | 'current_quarantined_riskareatravel': r.get('current_quarantined_riskareatravel', ''), # new field 34 | } 35 | # re-add extra columns 36 | for col in dr.fieldnames[12:]: 37 | data[col] = r[col] 38 | rows.append(data) 39 | 40 | writer = csv.DictWriter( 41 | sys.stdout, 42 | rows[0].keys(), 43 | delimiter=',', 44 | quotechar='"', 45 | lineterminator='\n', 46 | quoting=csv.QUOTE_MINIMAL 47 | ) 48 | writer.writeheader() 49 | writer.writerows(rows) 50 | except Exception as e: 51 | print("Error: %s" % e, file=sys.stderr) 52 | print(traceback.format_exc(), file=sys.stderr) 53 | sys.exit(1) 54 | finally: 55 | sys.stdout.flush() 56 | -------------------------------------------------------------------------------- /.github/workflows/validate-csv.yml: -------------------------------------------------------------------------------- 1 | name: Validate CSV 2 | 3 | on: 4 | schedule: 5 | - cron: '15 */4 * * *' 6 | workflow_dispatch: ~ 7 | push: 8 | 
branches: [ master ] 9 | paths: 10 | - '**.csv' 11 | pull_request: 12 | branches: [ master ] 13 | paths: 14 | - '**.csv' 15 | 16 | jobs: 17 | validate: 18 | runs-on: ubuntu-20.04 19 | timeout-minutes: 10 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | 24 | - name: Set up Python 3.7 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: 3.7 28 | 29 | - name: Install dependencies 30 | run: | 31 | npm ci 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | 35 | - name: Validate structure and content of CSVs 36 | run: node scripts/validate-csv.js fallzahlen_kanton_total_csv_v2/*.csv 37 | 38 | - name: Check if there are empty lines 39 | run: scripts/check_for_empty_lines.sh fallzahlen_kanton_total_csv_v2/*.csv 40 | 41 | - name: Check for outliers in CSVs 42 | run: python scripts/check_for_outliers.py fallzahlen_kanton_total_csv_v2/*.csv 43 | 44 | - name: Get current unix timestamp 45 | if: always() 46 | id: date 47 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT 48 | 49 | # notify slack if a CSV validation failed 50 | - name: Notify slack failure 51 | if: ${{ failure() }} 52 | env: 53 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 54 | uses: pullreminders/slack-action@master 55 | with: 56 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Validate CSV\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: CSV validation failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}' 57 | -------------------------------------------------------------------------------- /scrapers/scrape_tg_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | import scrape_common as sc 5 | 6 | url = 'https://statistik.tg.ch/themen-und-daten/covid-19.html/10816' 7 | content = sc.download(url, silent=True) 8 | 9 | res = re.search(r".*name: '2020',\s+categories: \[\'(.*)\]\s+}", content) 10 | assert res, f'failed to extract 2020 weeks, got {res}' 11 | weeks_2020 = res[1].split(',') 12 | 13 | res = re.search(r".*name: '2021',\s+categories: \[\'(.*)\]\s+}", content) 14 | assert res, f'failed to extract 2021 weeks, got {res}' 15 | weeks_2021 = res[1].split(',') 16 | 17 | res = re.search(r".*name: '2022',\s+categories: \[\'(.*)\]\s+}", content) 18 | assert res, f'failed to extract 2022 weeks, got {res}' 19 | weeks_2022 = res[1].split(',') 20 | 21 | res = re.search(r".*name: '2023',\s+categories: \[\'(.*)\]\s+}", content) 22 | assert res, f'failed to extract 2023 weeks, got {res}' 23 | weeks_2023 = res[1].split(',') 24 | 25 | weeks = weeks_2020 + weeks_2021 + weeks_2022 + weeks_2023 26 | years = ['2020'] * len(weeks_2020) + ['2021'] * len(weeks_2021) + ['2022'] * len(weeks_2022) + ['2023'] * len(weeks_2023) 27 | 28 | res = re.search(r".*name: 'Anzahl negativer Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content) 29 | assert res, f'failed to extract negative tests, got {res}' 30 | negative_tests = res[1].split(',') 31 | 32 | res = re.search(r".*name: 'Anzahl positiver Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content) 33 | assert res, f'failed to extract positive tests, got {res}' 34 | positive_tests = res[1].split(',') 35 | 36 | res = re.search(r".*name: 'Positivitätsrate',\s+color: '.*',\s+data: \[(.*)\],", content) 37 | 
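# Aside - a minimal sketch, not part of this scraper: the negative tests, positive tests and
# positivity rate series above are each pulled with the same "name: '...', color: '.*', data: [...]"
# Highcharts pattern, so the repeated blocks could be folded into one helper. The helper name is
# illustrative only and assumes the page keeps that layout:
import re

def extract_series_data(page_source, series_name):
    # grab the data array of the named Highcharts series and split it into raw string values
    match = re.search(rf".*name: '{series_name}.?',\s+color: '.*',\s+data: \[(.*)\],", page_source)
    assert match, f'failed to extract series {series_name}'
    return match[1].split(',')

# e.g. positive_tests = extract_series_data(content, 'Anzahl positiver Tests')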
assert res, f'failed to extract positivtiy rate, got {res}' 38 | positivity_rate = res[1].split(',') 39 | 40 | assert len(weeks) == len(negative_tests) == len(positive_tests) == len(positivity_rate), f'Expected same length for weeks {len(weeks)}, neg. tests {len(negative_tests)}, pos. tests {len(positive_tests)}, pos. rate {len(positivity_rate)}' 41 | 42 | for week, year, neg, pos, rate in zip(weeks, years, negative_tests, positive_tests, positivity_rate): 43 | td = sc.TestData(canton='TG', url=url) 44 | td.week = sc.find(r'KW (\d+)', week) 45 | td.year = year 46 | td.positive_tests = int(pos) 47 | td.negative_tests = int(neg) 48 | td.positivity_rate = float(rate) 49 | print(td) 50 | -------------------------------------------------------------------------------- /scripts/remove_older_entries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script removes (=sets to empty string) older entries from a CSV 4 | # in this example remove current_hosp prior to 2020-05-19 5 | 6 | import csv 7 | import sys 8 | import traceback 9 | import datetime 10 | 11 | assert len(sys.argv) == 2, "Call script with CSV file as parameter" 12 | 13 | try: 14 | filename = sys.argv[1] 15 | rows = [] 16 | with open(filename, 'r') as f: 17 | dr = csv.DictReader(f) 18 | for r in dr: 19 | # map old to new structure 20 | data = { 21 | 'date': r['date'], 22 | 'time': r['time'], 23 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'], 24 | 'ncumul_tested': r['ncumul_tested'], 25 | 'ncumul_conf': r['ncumul_conf'], 26 | 'new_hosp': r['new_hosp'], 27 | 'current_hosp': r['current_hosp'], 28 | 'current_icu': r['current_icu'], 29 | 'current_vent': r['current_vent'], 30 | 'ncumul_released': r['ncumul_released'], 31 | 'ncumul_deceased': r['ncumul_deceased'], 32 | 'source': r['source'], 33 | 'current_isolated': r.get('current_isolated', ''), 34 | 'current_quarantined': r.get('current_quarantined', ''), 35 | 'current_quarantined_riskareatravel': r.get('current_quarantined_riskareatravel', ''), # new field 36 | 'current_quarantined_total': r.get('current_quarantined_total', ''), # new field 37 | } 38 | if datetime.datetime.strptime(data['date'], '%Y-%m-%d') < datetime.datetime(2020, 5, 19): 39 | data['current_hosp'] = '' 40 | # re-add extra columns 41 | for col in dr.fieldnames[12:]: 42 | data[col] = r[col] 43 | rows.append(data) 44 | 45 | writer = csv.DictWriter( 46 | sys.stdout, 47 | rows[0].keys(), 48 | delimiter=',', 49 | quotechar='"', 50 | lineterminator='\n', 51 | quoting=csv.QUOTE_MINIMAL 52 | ) 53 | writer.writeheader() 54 | writer.writerows(rows) 55 | except Exception as e: 56 | print("Error: %s" % e, file=sys.stderr) 57 | print(traceback.format_exc(), file=sys.stderr) 58 | sys.exit(1) 59 | finally: 60 | sys.stdout.flush() 61 | -------------------------------------------------------------------------------- /scrapers/scrape_ti.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import datetime 7 | import scrape_common as sc 8 | 9 | # get pdf and xlsx URL from covid19 page of TI 10 | main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/' 11 | d = sc.download(main_url, silent=True) 12 | soup = BeautifulSoup(d, 'html.parser') 13 | 14 | is_first = True 15 | 16 | """ 17 | container = soup.find('h2', string=re.compile(r'Isolamento e quarantena')).find_next('div') 18 | for item in 
container.find_all('div'): 19 | divs = item.find_all('div') 20 | if len(divs) == 3: 21 | dd = sc.DayData(canton='TI', url=main_url) 22 | dd.datetime = sc.find(r'.*?(\d+\.\d+\.\d{2})', divs[2].string) 23 | if sc.find(r'.*(quarantena)', divs[1].string): 24 | dd.quarantined = divs[0].string 25 | if sc.find(r'.*(isolamento)', divs[1].string): 26 | dd.isolated = divs[0].string 27 | if dd: 28 | if not is_first: 29 | print('-' * 10) 30 | is_first = False 31 | print(dd) 32 | """ 33 | 34 | xls_url = soup.find(href=re.compile("\.xlsx$")).get('href') 35 | assert xls_url, "URL is empty" 36 | 37 | if not xls_url.startswith('http'): 38 | xls_url = f'https://www4.ti.ch/{xls_url}' 39 | 40 | xls = sc.xlsdownload(xls_url, silent=True) 41 | rows = sc.parse_xls(xls, header_row=0) 42 | prev_date = None 43 | for row in rows: 44 | if row is None: 45 | continue 46 | if 'Data' not in row: 47 | continue 48 | if row['Data'] is None: 49 | continue 50 | 51 | if not is_first: 52 | print('-' * 10) 53 | is_first = False 54 | 55 | dd = sc.DayData(canton='TI', url=xls_url) 56 | dd.datetime = f"{row['Data'].date().isoformat()}" 57 | if dd.datetime == "2023-08-09" and prev_date == "2023-03-08": 58 | dd.datetime = "2023-03-09" 59 | prev_date = dd.datetime 60 | if row.get('Ora'): 61 | dd.datetime += f"T{row['Ora'].time().isoformat()}" 62 | dd.cases = row['Totale casi confermati'] 63 | dd.hospitalized = row['Totale giornaliero pazienti ricoverati'] 64 | dd.icu = row['Totale giornaliero pazienti cure intense'] 65 | dd.vent = row['Totale giornaliero pazienti ventilati'] 66 | dd.recovered = row['Totale pazienti dimessi da ospedali'] 67 | dd.deaths = row['Totale decessi'] 68 | print(dd) 69 | -------------------------------------------------------------------------------- /scrapers/scrape_sz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | import datetime 7 | from bs4 import BeautifulSoup 8 | import scrape_common as sc 9 | 10 | url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948' 11 | d = sc.download(url, silent=True) 12 | soup = BeautifulSoup(d, 'html.parser') 13 | 14 | is_first = True 15 | 16 | """ 17 | Disabled for now, the PDFs from October 2020 contained hospitalized and quarntined data 18 | 19 | pdfs = soup.find_all('a', string=re.compile(r'Medienmitteilung vom')) 20 | for pdf in pdfs: 21 | pdf_url = pdf['href'] 22 | pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True) 23 | date = sc.find(r'Stand:\s(\d+\.\s.*\s20\d{2})', pdf_content) 24 | res = re.search(r'.*\s+(?P\d+)\s+\d+\s+\d+\s+(?P\d+)\s+(?P\d+)\s+(?P\d+)\s+', pdf_content) 25 | if not date or not res: 26 | continue 27 | 28 | if not is_first: 29 | print('-' * 10) 30 | is_first = False 31 | dd = sc.DayData(canton='SZ', url=pdf_url) 32 | dd.datetime = date.replace('\n', ' ') 33 | dd.isolated = res['iso'] 34 | dd.hospitalized = res['hosp'] 35 | dd.quarantined = res['quar'] 36 | dd.quarantine_riskareatravel = res['qtravel'] 37 | print(dd) 38 | is_first = False 39 | """ 40 | 41 | try: 42 | xls_url = soup.find('a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))['href'] 43 | except TypeError: 44 | print("Unable to determine xls url", file=sys.stderr) 45 | sys.exit(1) 46 | xls = sc.xlsdownload(xls_url, silent=True) 47 | 48 | rows = sc.parse_xls(xls) 49 | for row in rows: 50 | if not isinstance(row['Datum'], datetime.datetime): 51 | continue 52 | 53 | if not is_first: 54 | 
print('-' * 10) 55 | is_first = False 56 | 57 | # TODO: remove when source is fixed 58 | # handle wrong value on 2020-03-25, see issue #631 59 | if row['Datum'].date().isoformat() == '2020-03-25': 60 | row['Bestätigte Fälle (kumuliert)'] = '' 61 | 62 | dd = sc.DayData(canton='SZ', url=url) 63 | dd.datetime = row['Datum'].date().isoformat() 64 | if row['Zeit']: 65 | dd.datetime += ' ' + row['Zeit'].time().isoformat() 66 | dd.cases = row['Bestätigte Fälle (kumuliert)'] 67 | dd.deaths = row['Todesfälle (kumuliert)'] 68 | dd.recovered = row['Genesene (kumuliert)'] 69 | print(dd) 70 | -------------------------------------------------------------------------------- /scrapers/certificate.pem: -------------------------------------------------------------------------------- 1 | # SwissSign EV Gold CA 2014 - G22 2 | -----BEGIN CERTIFICATE----- 3 | MIIGuTCCBKGgAwIBAgIQAIEIODzAB3XEDG1za+MwizANBgkqhkiG9w0BAQsFADBF 4 | MQswCQYDVQQGEwJDSDEVMBMGA1UEChMMU3dpc3NTaWduIEFHMR8wHQYDVQQDExZT 5 | d2lzc1NpZ24gR29sZCBDQSAtIEcyMB4XDTE0MDkxNTE2MTYzN1oXDTM1MDMwNDE2 6 | MTYzN1owTjELMAkGA1UEBhMCQ0gxFTATBgNVBAoTDFN3aXNzU2lnbiBBRzEoMCYG 7 | A1UEAxMfU3dpc3NTaWduIEVWIEdvbGQgQ0EgMjAxNCAtIEcyMjCCASIwDQYJKoZI 8 | hvcNAQEBBQADggEPADCCAQoCggEBAL+MVu10kh055MUIkpRaC7sfiuFQ4gAYFv4B 9 | 5LfsK6NSpTaJybYvrA/lr0JBE/xTsQl3Jrka60FgprSh9pXgE94UVoE2Qb4LiHEo 10 | AIYyBQY0aA3nL9GEkT436uXs0tV2Veg6+6CgGRzgaoQtDu3hXWV5GOyNOAtlmzR4 11 | md1JH6oFap9d3kVwJLExUI930Cwjzwt0XAcvjy8+fLheBanG5VFGnRrntRSWiRzY 12 | QIjjAkBDTi+lj552h9aKzFvFEQ5NSiBmrGVk2wIlrh+AZe8NYnXrRBzv0Z5SODD4 13 | jxyPkTAX7f9zkJ9s0yMVEmalWnfwXn4K4Rz3x7fmWeyxipUOhSkCAwEAAaOCApow 14 | ggKWMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/AgEAMB0GA1UdDgQW 15 | BBTu/UbK9ydekbxatueHzQr6VQomQjAfBgNVHSMEGDAWgBRbJXuWpGVRfrg588B4 16 | Zl7oOufw7jCB/wYDVR0fBIH3MIH0MEegRaBDhkFodHRwOi8vY3JsLnN3aXNzc2ln 17 | bi5uZXQvNUIyNTdCOTZBNDY1NTE3RUI4MzlGM0MwNzg2NjVFRTgzQUU3RjBFRTCB 18 | qKCBpaCBooaBn2xkYXA6Ly9kaXJlY3Rvcnkuc3dpc3NzaWduLm5ldC9DTj01QjI1 19 | N0I5NkE0NjU1MTdFQjgzOUYzQzA3ODY2NUVFODNBRTdGMEVFJTJDTz1Td2lzc1Np 20 | Z24lMkNDPUNIP2NlcnRpZmljYXRlUmV2b2NhdGlvbkxpc3Q/YmFzZT9vYmplY3RD 21 | bGFzcz1jUkxEaXN0cmlidXRpb25Qb2ludDBaBgNVHSAEUzBRME8GBFUdIAAwRzBF 22 | BggrBgEFBQcCARY5aHR0cDovL3JlcG9zaXRvcnkuc3dpc3NzaWduLmNvbS9Td2lz 23 | c1NpZ24tR29sZC1DUC1DUFMucGRmMIHRBggrBgEFBQcBAQSBxDCBwTBkBggrBgEF 24 | BQcwAoZYaHR0cDovL3N3aXNzc2lnbi5uZXQvY2dpLWJpbi9hdXRob3JpdHkvZG93 25 | bmxvYWQvNUIyNTdCOTZBNDY1NTE3RUI4MzlGM0MwNzg2NjVFRTgzQUU3RjBFRTBZ 26 | BggrBgEFBQcwAYZNaHR0cDovL2dvbGQtZXYtZzIub2NzcC5zd2lzc3NpZ24ubmV0 27 | LzVCMjU3Qjk2QTQ2NTUxN0VCODM5RjNDMDc4NjY1RUU4M0FFN0YwRUUwDQYJKoZI 28 | hvcNAQELBQADggIBACVxhUgwnsFZgEmC50cCMExcmvY9OQkPxcQbMMFCYvfvBFNz 29 | 65iu0MkXTo0jhaIe8wOOsv230q/zYJbTZOGbMpvUg5MRRIK9DCq3bDwAqN9bIjFw 30 | wK1bODt260m9+4gLxJJdt2MH5LAglQ2J0123+RodYxvv3b+5k6/DZ19dJUgXrjbD 31 | +0PWuO5+5DRangp3VELIRWjHAAnpmq3guORiLuVDS+PoinFp/CKEFRhgWIhp6sZd 32 | yA/9egO+ZH+U7KzLaMuYRNHfJr2UrgQUEufsOM0WUqQXS8RzO7ZGW/argfyc4NdS 33 | CivO97xZBroON0XaLOlTAAbubomhzz/K/Uv2S5T+I/AfYWCme7Vx/KyeA9if/eLA 34 | jQNn5lIb1cXhompM2M+kLAGjNhdpQvUSkjAhKOkzoeezJEN+RXU4P5tOJxw03LtJ 35 | VxmdQxQwgXOR0rBZT+9aFJSX1nIj7zWRnMwFu5w+gBaX1/5MuLP/ThJCckoVgb0o 36 | nbFLRn6siH6dNE+gZ5VgiMWeDOkwlR1UMWGMNwoKNExoTKYwKnpuMfv4q7Fx4uI9 37 | qVzGTL6yfW8+SRdxVFQa6K9hekBr2kZyAKBCqz+jpQq1EPCcvn4HiNx81Na++iqe 38 | K+d2mfZxdEuAwFoZIcyk1aTWHHT1Cqzys00wlukvSmnXUBbGU5Vpwzjlj3N4 39 | -----END CERTIFICATE----- 40 | -------------------------------------------------------------------------------- /scrapers/scrape_ag.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datetime 4 | import scrape_common as sc 5 | import scrape_ag_common as sac 6 | 7 | 8 | xls_url = sac.get_ag_xls_url() 9 | xls = sc.xlsdownload(xls_url, silent=True) 10 | is_first = True 11 | 12 | # quarantine_riskareatravel 13 | """ 14 | rows = sc.parse_xls(xls, sheet_name='5. Quarantäne nach Einreise', header_row=2) 15 | for row in rows: 16 | if not isinstance(row['A'], datetime.datetime): 17 | continue 18 | 19 | 20 | dd = sc.DayData(canton='AG', url=xls_url) 21 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}" 22 | dd.quarantine_riskareatravel = row['Gesamtzahl aktuell betreuter Personen'] 23 | if dd: 24 | if not is_first: 25 | print('-' * 10) 26 | is_first = False 27 | print(dd) 28 | """ 29 | 30 | # quarantine + isolation 31 | rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2) 32 | for row in rows: 33 | if not isinstance(row['A'], datetime.datetime): 34 | continue 35 | 36 | dd = sc.DayData(canton='AG', url=xls_url) 37 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}" 38 | isolated = row['Gesamtzahl aktuell betreuter Personen'] 39 | if sc.represents_int(isolated): 40 | dd.isolated = isolated 41 | #dd.quarantined = row['Gesamtzahl aktuell betreuter Personen5'] 42 | if dd: 43 | if not is_first: 44 | print('-' * 10) 45 | is_first = False 46 | print(dd) 47 | 48 | # cases + hospitalization 49 | rows = sc.parse_xls(xls, sheet_name='1. Covid-19-Daten', header_row=2) 50 | for row in rows: 51 | if not isinstance(row['A'], datetime.datetime): 52 | continue 53 | 54 | dd = sc.DayData(canton='AG', url=xls_url) 55 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}" 56 | if 'Gesamtzahl' in row: 57 | dd.cases = row['Gesamtzahl'] 58 | 59 | non_icu = row['Bestätigte Fälle Bettenstation (ohne IPS/IMC)'] 60 | icu = row['Bestätigte Fälle Intensivpflegestation (IPS)'] 61 | icf = row['Bestätigte Fälle Intermediate Care (IMC)'] 62 | if sc.represents_int(non_icu) and sc.represents_int(icu) and sc.represents_int(icf): 63 | dd.hospitalized = int(non_icu) + int(icu) + int(icf) 64 | dd.icu = icu 65 | dd.icf = icf 66 | if 'Gesamtzahl21' in row: 67 | dd.deaths = row['Gesamtzahl21'] 68 | if 'Gesamtzahl25' in row: 69 | dd.recovered = row['Gesamtzahl25'] 70 | 71 | if dd: 72 | if not is_first: 73 | print('-' * 10) 74 | is_first = False 75 | print(dd) 76 | -------------------------------------------------------------------------------- /scrapers/scrape_vd_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import re 6 | import scrape_common as sc 7 | import scrape_vd_common as svc 8 | 9 | 10 | pdf_urls = svc.get_all_weekly_pdf_urls() 11 | for pdf_url in pdf_urls: 12 | pdf = sc.pdfdownload(pdf_url, silent=True, page=1) 13 | pdf = re.sub(r'(\d+)\'(\d+)', r'\1\2', pdf) 14 | pdf = re.sub(r'(\d+)’(\d+)', r'\1\2', pdf) 15 | pdf = re.sub(r'(\d)er', r'\1', pdf) 16 | 17 | td = sc.TestData(canton='VD', url=pdf_url) 18 | 19 | year = sc.find(r'Situation au \d+.*(20\d{2})', pdf) 20 | date = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+20\d{2})', pdf) 21 | if date is None: 22 | date = sc.find(r'Point .pid.miologique au (\d+\.\d+\.20\d{2})', pdf) 23 | res = re.search(r'Entre\s+(?Pet\s+)?le\s+(?P\d+\s+\w+)\s+et\s+le\s+(?P\d+\s+\w+)(?P\s+\d{4})?,', pdf, flags=re.I|re.UNICODE) 24 | 
res_with_year = re.search(r'Entre\s+le\s+(?P\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P\d+\s+\w+\s+\d{4}),', pdf, flags=re.I|re.UNICODE) 25 | res_no_month = re.search(r'Entre\s+le\s+(?P\d+)\s+et\s+le\s+(?P\d+\s+\w+),', pdf, flags=re.I|re.UNICODE) 26 | res_no_month_with_year = re.search(r'Entre(?P\s+et)?\s+le\s+(?P\d+)\s+et\s+le\s+(?P\d+\s+\w+\s+\d{4}),', pdf, flags=re.I|re.UNICODE) 27 | 28 | if res: 29 | start_date = sc.date_from_text(f"{res['start']} {year}") 30 | end_date = sc.date_from_text(f"{res['end']} {year}") 31 | elif res_with_year: 32 | start_date = sc.date_from_text(res_with_year['start']) 33 | end_date = sc.date_from_text(res_with_year['end']) 34 | elif res_no_month: 35 | end_date = sc.date_from_text(f"{res_no_month['end']} {year}") 36 | start_date = sc.date_from_text(f"{res_no_month['start']}.{end_date.month}.{year}") 37 | elif res_no_month_with_year: 38 | end_date = sc.date_from_text(res_no_month_with_year['end']) 39 | start_date = sc.date_from_text(f"{res_no_month_with_year['start']}.{end_date.month}.{end_date.year}") 40 | elif date: 41 | end_date = sc.date_from_text(date) 42 | start_date = end_date - datetime.timedelta(days=6) 43 | 44 | assert start_date and end_date, f'failed to extract start and end dates from {pdf_url}' 45 | td.start_date = start_date 46 | td.end_date = end_date 47 | 48 | res = re.search(r'une\s+moyenne\s+de\s+(\d+)\s+frottis\s+SARS-CoV(-)?2', pdf) 49 | if res: 50 | days = (end_date - start_date).days 51 | td.total_tests = days * int(res[1]) 52 | 53 | res = re.search(r'dont\s+(\d+\.?\d?)\s?%\s+étaient\s+positifs', pdf) 54 | if res: 55 | td.positivity_rate = res[1] 56 | 57 | if td: 58 | print(td) 59 | -------------------------------------------------------------------------------- /scrapers/scrape_so.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | import scrape_so_common as soc 8 | 9 | 10 | base_url = 'https://corona.so.ch' 11 | pdf_url = soc.get_latest_weekly_pdf_url() 12 | content = sc.pdfdownload(pdf_url, layout=True, silent=True, page=1) 13 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content) 14 | 15 | """ 16 | Hospitalisationen im Kanton Anzahl Personen in Isolation davon Kontakte in Quarantäne Anzahl zusätzlicher Personen in Quarantäne nach Rückkehr aus Risikoland Re- Wert*** 17 | 6 (6) 120 (71) 280 (189) 388 (280) 1.46 (1.1) 18 | """ 19 | 20 | rows = [] 21 | 22 | date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content) 23 | number_of_tests = sc.find(r'Gem\s?eldete\s+Tes\s?ts\s+\(Total\)\*+?\s+(\d+)\s', content, flags=re.DOTALL) 24 | res = re.search(r'Hospitalisationen im Kanton.*\d+ \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+', content, re.DOTALL) 25 | if res is not None: 26 | data = sc.DayData(canton='SO', url=pdf_url) 27 | data.datetime = date 28 | data.tested = number_of_tests 29 | data.isolated = soc.strip_value(res[1]) 30 | data.quarantined = soc.strip_value(res[2]) 31 | data.quarantine_riskareatravel = soc.strip_value(res[3]) 32 | rows.append(data) 33 | 34 | 35 | # scrape the main page as well 36 | url = "https://corona.so.ch/bevoelkerung/daten/" 37 | d = sc.download(url, silent=True) 38 | soup = BeautifulSoup(d, 'html.parser') 39 | title = soup.find('h3', text=re.compile("Stand")) 40 | data = sc.DayData(canton='SO', url=url) 41 | data.datetime = sc.find(r'Stand\s*(\d+\.\d+\.\d{4})\s*', title.string) 42 | table = title.find_next('table') 43 | for table_row in 
table.find_all('tr'): 44 | title = table_row.find_all('th') 45 | items = table_row.find_all('td') 46 | if len(items) == 0: 47 | continue 48 | name = title[0].text 49 | value = items[0].text.replace("'", "") 50 | if sc.find(r'(Laborbestätigte Infektionen).*?:', name): 51 | data.cases = value 52 | continue 53 | if name == 'Verstorbene Personen (kumuliert seit 06.03.2020):': 54 | data.deaths = value 55 | continue 56 | if name == 'Im Kanton hospitalisierte Covid-19-positive Patientinnen und Patienten:': 57 | data.hospitalized = value 58 | continue 59 | if name.strip() == 'Davon befinden sich auf Intensivstationen:': 60 | data.icu = value 61 | continue 62 | if data: 63 | rows.append(data) 64 | 65 | 66 | is_first = True 67 | # skip first row 68 | for row in rows: 69 | if not is_first: 70 | print('-' * 10) 71 | is_first = False 72 | print(row) 73 | -------------------------------------------------------------------------------- /scrapers/scrape_fl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import scrape_common as sc 4 | import sys 5 | import re 6 | import datetime 7 | from bs4 import BeautifulSoup 8 | 9 | 10 | # get the daily bulletin 11 | base_url = 'https://www.regierung.li' 12 | d = sc.download(base_url, silent=True) 13 | soup = BeautifulSoup(d, 'html.parser') 14 | 15 | is_first = True 16 | bulletin = soup.find('h1', text=re.compile(r'COVID-19: Situationsbericht.*')) 17 | if bulletin: 18 | bulletin = bulletin.find_next('a') 19 | if bulletin: 20 | url = f"{base_url}{bulletin.get('href')}" 21 | bulletin_d = sc.download(url, silent=True) 22 | bulletin_soup = BeautifulSoup(bulletin_d, 'html.parser') 23 | 24 | dd = sc.DayData(canton='FL', url=url) 25 | 26 | title = bulletin_soup.find('h1', text=re.compile(r'.*Situationsbericht.*')) 27 | dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', title.text) 28 | 29 | content = title.find_next('div').text 30 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content) 31 | 32 | dd.cases = sc.find(r"insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle", content) 33 | dd.deaths = sc.find(r'(Damit\s+traten\s+)?(?:bisher|bislang)\s+(traten\s+)?(?P\d+)\s+(Todesfall|Todesfälle)', content, flags=re.I, group='death') 34 | 35 | if re.search(r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen', content): 36 | dd.recovered = int(dd.cases) - int(dd.deaths) 37 | 38 | m = re.search(r'(\S+)\s+Erkrankte\s+sind\s+derzeit\s+hospitalisiert', content) 39 | if m: 40 | dd.hospitalized = sc.int_or_word(m[1].lower()) 41 | 42 | m = re.search(r'Gegenwärtig\s+befinden\s+sich\s+(\w+)\s+enge\s+Kontaktpersonen\s+in\s+Quarantäne.', content) 43 | if m: 44 | dd.quarantined = sc.int_or_word(m[1]) 45 | 46 | if dd: 47 | if not is_first: 48 | print('-' * 10) 49 | print(dd) 50 | is_first = False 51 | 52 | 53 | # get the data from XLS file containing full history 54 | history_url='https://www.llv.li/files/ag/aktuelle-fallzahlen.xlsx' 55 | xls = sc.xlsdownload(history_url, silent=True) 56 | rows = sc.parse_xls(xls, header_row=3) 57 | for row in rows: 58 | dd_full_list = sc.DayData(canton='FL', url=history_url) 59 | if isinstance(row['Datenstand'], datetime.datetime): 60 | dd_full_list.datetime = row['Datenstand'] 61 | else: 62 | dd_full_list.datetime = str(row['Datenstand']).replace(':', '.') 63 | 64 | dd_full_list.cases = str(row['Anzahl pos. 
Fälle kumuliert']).replace("'","") 65 | dd_full_list.recovered = row['Genesene kumuliert'] 66 | dd_full_list.hospitalized = row['Hospitalisierte Personen*'] 67 | dd_full_list.deaths = row['Todesfälle kumuliert'] 68 | if dd_full_list: 69 | if not is_first: 70 | print('-' * 10) 71 | is_first = False 72 | print(dd_full_list) 73 | -------------------------------------------------------------------------------- /scrapers/scrape_ow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | import datetime 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | 8 | 9 | base_url = 'https://www.ow.ch' 10 | url = f'{base_url}/de/verwaltung/dienstleistungen/?dienst_id=5962' 11 | """ 12 | d = sc.download(url, silent=True, encoding='windows-1252') 13 | d = d.replace(' ', ' ') 14 | soup = BeautifulSoup(d, 'html.parser') 15 | 16 | dd = sc.DayData(canton='OW', url=url) 17 | date = sc.find(r'Stand (\d+\.\s+\w+\s+20\d{2})', d) 18 | time = sc.find(r'Stand .*,\s?([\d\.:]+).*Uhr', d) 19 | dd.datetime = f'{date}, {time} Uhr' 20 | dd.isolated = soup.find(text=re.compile(r'In Isolation \(aktuell\)')).find_next('td').string 21 | dd.quarantined = soup.find(text=re.compile(r'In Quarant.ne \(aktuell\)')).find_next('td').string 22 | dd.quarantine_riskareatravel = soup.find(text=re.compile(r'Reiser.ckkehrer in Quarant.ne')).find_next('td').string 23 | 24 | is_first = True 25 | if dd: 26 | print(dd) 27 | is_first = False 28 | """ 29 | 30 | is_first = True 31 | 32 | 33 | d = sc.download(f'{base_url}/de/kanton/publired/publikationen/?action=info&pubid=20318', 34 | encoding='windows-1252', silent=True) 35 | soup = BeautifulSoup(d, 'html.parser') 36 | xls_url = soup.find('a', string=re.compile("Download")).get('href') 37 | assert xls_url, "URL is empty" 38 | xls_url = f'{base_url}{xls_url}' 39 | 40 | for row in soup.find_all('dl'): 41 | cells = row.find_all('dd') 42 | if cells[0].string: 43 | file_date = cells[0].string 44 | 45 | xls = sc.xlsdownload(xls_url, silent=True) 46 | rows = sc.parse_xls(xls, header_row=4) 47 | for row in rows: 48 | if isinstance(row['A'], datetime.datetime): 49 | dd = sc.DayData(canton='OW', url=url) 50 | dd.datetime = row['A'] 51 | data_found = False 52 | if isinstance(row['Infizierte Personen (kumuliert)'], int) and row['Infizierte Personen (kumuliert)'] > 0: 53 | dd.cases = row['Infizierte Personen (kumuliert)'] 54 | data_found = True 55 | hosp_key = """Hospitalisierte Personen im KSOW / 56 | Eintritte Covid-Station; Alle Einwohner OW alle Spitäler CH***""" 57 | if isinstance(row[hosp_key], int): 58 | dd.hospitalized = row[hosp_key] 59 | if isinstance(row['Gestorbene Personen (kumuliert)'], int): 60 | dd.deaths = row['Gestorbene Personen (kumuliert)'] 61 | if isinstance(row['Isolation'], int): 62 | dd.isolated = row['Isolation'] 63 | if isinstance(row['Quarantäne'], int): 64 | dd.quarantined = row['Quarantäne'] 65 | if isinstance(row['Quarantäne Reiserückkehrer'], int): 66 | dd.quarantine_riskareatravel = row['Quarantäne Reiserückkehrer'] 67 | if data_found: 68 | if not is_first: 69 | print('-' * 10) 70 | else: 71 | is_first = False 72 | print(dd) 73 | -------------------------------------------------------------------------------- /scrapers/scrape_vs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import re 6 | from bs4 import BeautifulSoup 7 | import scrape_common as sc 8 | 9 | 10 | def 
strip_value(value): 11 | if value: 12 | return re.sub(r'[^0-9]', '', value) 13 | return None 14 | 15 | 16 | base_url = 'https://www.vs.ch' 17 | url = f'{base_url}/web/coronavirus/statistiques' 18 | content = sc.download(url, silent=True) 19 | soup = BeautifulSoup(content, 'html.parser') 20 | pdf_url = soup.find('a', string=re.compile(r'20\d{2}.*Sit Epid.*')).get('href') 21 | pdf_url = f'{base_url}{pdf_url}' 22 | 23 | content = sc.pdfdownload(pdf_url, silent=True, layout=True, page=1) 24 | 25 | dd = sc.DayData(canton='VS', url=pdf_url) 26 | dd.datetime = sc.find(r'(\d{2}/\d{2}/20\d{2})', content) 27 | dd.datetime = re.sub(r'/', '.', dd.datetime) 28 | dd.cases = strip_value(sc.find(r'.*Cumul cas positifs.*\s+(\d+.\d+)\s+', content)) 29 | dd.deaths = strip_value(sc.find(r'.*Cumul d.c.s.*\s+(\d+.\d+)\s+', content)) 30 | dd.hospitalized = strip_value(sc.find(r'.*Hospitalisations en cours de cas COVID-19.*\s+(\d+)\s+', content)) 31 | dd.icu = strip_value(sc.find(r'.*SI en cours.*\s+(\d+)\s+', content)) 32 | dd.vent = strip_value(sc.find(r'.*Intubation en cours.*\s+(\d+)\s+', content)) 33 | 34 | is_first = True 35 | if dd: 36 | is_first = False 37 | print(dd) 38 | 39 | 40 | xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20COVID-19%20Valais.xlsx' 41 | main_url = 'https://www.vs.ch/de/web/coronavirus' 42 | xls = sc.xlsdownload(xls_url, silent=True) 43 | rows = sc.parse_xls(xls, header_row=1) 44 | for i, row in enumerate(rows): 45 | if not isinstance(row['Date'], datetime.datetime): 46 | continue 47 | if not sc.represents_int(row['Cumul cas positifs']): 48 | continue 49 | if row['Nb nouveaux cas positifs'] is None and row["Nb nouvelles admissions à l'hôpital"] is None: 50 | continue 51 | 52 | dd = sc.DayData(canton='VS', url=main_url) 53 | dd.datetime = row['Date'].date().isoformat() 54 | dd.cases = row['Cumul cas positifs'] 55 | dd.hospitalized = row['Total hospitalisations COVID-19'] 56 | dd.new_hosp = row['Nb nouvelles admissions à l\'hôpital'] 57 | dd.icu = row['Patients COVID-19 aux SI total (y.c. 
intubés)'] 58 | dd.vent = row['Patients COVID-19 intubés'] 59 | dd.deaths = row['Cumul décès COVID-19'] 60 | # Since 2020-10-19 VS does no longer publish data about isolation/quarantined 61 | #dd.isolated = row['Nombre de cas en cours d\'isolement'] 62 | #dd.quarantined = row['Nombre de contacts en cours de quarantaine'] 63 | #dd.quarantine_riskareatravel = row['Nombre de voyageurs en cours de quarantaine'] 64 | 65 | if row['Nb de nouvelles sorties'] is not None: 66 | dd.recovered = sum(r['Nb de nouvelles sorties'] for r in rows[:i+1]) 67 | if not is_first: 68 | print('-' * 10) 69 | is_first = False 70 | print(dd) 71 | -------------------------------------------------------------------------------- /scrapers/scrape_sh_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import re 6 | from bs4 import BeautifulSoup 7 | import scrape_common as sc 8 | 9 | 10 | def get_sh_url_from_json(url): 11 | m = sc.jsondownload(url, silent=True) 12 | 13 | # 2020-04-24 14 | """ 15 | { 16 | data_filetype: "xlsx", 17 | data_shareInAreaPage: "[]", 18 | data_kachellabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx", 19 | data_areaPage_repositoryid: "3275", 20 | data_custom_author: "Gesundheitsamt Kanton Schaffhausen", 21 | data_tagarea: "[]", 22 | data_shareInDomain: "[]", 23 | data_zielgruppen: "", 24 | data_publication_date: "23.04.2020", 25 | data_idpath: "/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465", 26 | data_custom_publication_date_date: "23.04.2020", 27 | data_shareArticleProfileId: "", 28 | data_file_name: "Fallzahlen Corona Kanton Schaffhausen.xlsx", 29 | data_author: "MWETT", 30 | data_file_copyrights: "", 31 | data_custom_publication_timed: "[]", 32 | data_published: "published", 33 | data_addmodules: "", 34 | data_listlabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx", 35 | data_tags: "", 36 | data_widget_data: "[]", 37 | data_filemeta: "{"uploaded":1,"fileName":"d4ffb019-a2ef-4782-87be-0aafb4b43558","key":"TEMPUPLOADFILES","url":"/CMS/get/file/d4ffb019-a2ef-4782-87be-0aafb4b43558","originalname":"Fallzahlen Corona Kanton Schaffhausen.xlsx","fileid":"d4ffb019-a2ef-4782-87be-0aafb4b43558","category":"null","title":"null","filesize":12286}", 38 | data_shareInGlobal: "[]", 39 | data_verbande: "", 40 | data_file_description: "", 41 | data_custom_publication_date_time: "09:31", 42 | data_galleries: "[]", 43 | data_sharepaths: "", 44 | data_permalink: "/Webseite/Kanton-Schaffhausen/Beh-rde/Verwaltung/Departement-des-Innern/Gesundheitsamt-3666465-DE.html", 45 | data_schlagworte: "", 46 | data_approvedpaths: "["/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465"]", 47 | contentid: "3666465", 48 | domainid: "1753", 49 | contenttypeid: "101", 50 | transactiontime: "23.04 09:09", 51 | author: "dande", 52 | language: "DE", 53 | activated_languages: [ 54 | "DE" 55 | ], 56 | sliderimages: [ ], 57 | genericimages: { } 58 | } 59 | """ 60 | 61 | meta = json.loads(m['data_filemeta']) 62 | url = f"https://sh.ch{meta['url']}" 63 | return url 64 | 65 | def get_sh_xlsx(): 66 | main_url = 'https://coviddashboard.sh.ch/' 67 | content = sc.download(main_url, silent=True) 68 | soup = BeautifulSoup(content, 'html.parser') 69 | link = soup.find('a', href=re.compile(r'.*\.xlsx')) 70 | xls = sc.xlsdownload(link.get('href'), silent=True) 71 | return main_url, xls 72 | -------------------------------------------------------------------------------- /scrapers/scrape_fr_districts.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | from io import StringIO 6 | import re 7 | from bs4 import BeautifulSoup 8 | import scrape_common as sc 9 | from scrape_fr_common import get_fr_csv 10 | 11 | inhabitants = { 12 | 'Broye': 32894, 13 | 'Glane': 24337, 14 | 'Greyerz': 55726, 15 | 'Saane': 106136, 16 | 'See': 36800, 17 | 'Sense': 43990, 18 | 'Vivisbach': 18831, 19 | } 20 | 21 | district_ids = { 22 | 'Broye': 1001, 23 | 'Glane': 1002, 24 | 'Greyerz': 1003, 25 | 'Saane': 1004, 26 | 'See': 1005, 27 | 'Sense': 1006, 28 | 'Vivisbach': 1007, 29 | } 30 | 31 | district_xls = { 32 | 'Broye': 'Broye', 33 | 'Glane': 'Gl.ne', 34 | 'Greyerz': 'Gruy.re', 35 | 'Saane': 'Sarine', 36 | 'See': 'Lac', 37 | 'Sense': 'Singine', 38 | 'Vivisbach': 'Veveyse', 39 | } 40 | 41 | # weekly data 42 | url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton' 43 | """ 44 | d = sc.download(url, silent=True) 45 | d = d.replace(' ', ' ') 46 | 47 | soup = BeautifulSoup(d, 'html.parser') 48 | table = soup.find(string=re.compile(r'Anzahl positive F.lle nach Bezirk')).find_next('table') 49 | 50 | weeks = [] 51 | years = [] 52 | week_regex = re.compile(r'Woche \d+') 53 | trs = table.find_all('tr') 54 | for header in trs[0]: 55 | week = sc.find(r'Woche (\d+)', header.string) 56 | if week is not None: 57 | weeks.append(week) 58 | years.append('2021') 59 | 60 | for tr in trs[1:]: 61 | tds = tr.find_all('td') 62 | 63 | for i in range(len(weeks)): 64 | district = tds[0].string 65 | if district in inhabitants: 66 | dd = sc.DistrictData(canton='FR', district=district) 67 | dd.url = url 68 | dd.week = weeks[i] 69 | # TODO restore once all weeks are in 2021 70 | # dd.year = '20' + year 71 | dd.year = years[i] 72 | dd.new_cases = tds[i + 1].string 73 | dd.population = inhabitants[district] 74 | dd.district_id = district_ids[district] 75 | print(dd) 76 | """ 77 | 78 | # daily data from csv 79 | csv_url, csv_data, main_url = get_fr_csv() 80 | reader = csv.DictReader(StringIO(csv_data), delimiter=';') 81 | 82 | for row in reader: 83 | row_date = None 84 | for key, val in row.items(): 85 | if sc.find(r'(Date).*', key): 86 | row_date = val 87 | assert row_date 88 | row_date = sc.date_from_text(row_date) 89 | for district, xls_district in district_xls.items(): 90 | for key, val in row.items(): 91 | if sc.find(r'.*(' + xls_district + ').*', key): 92 | dd = sc.DistrictData(canton='FR', district=district) 93 | dd.url = url 94 | dd.date = row_date.isoformat() 95 | dd.new_cases = val 96 | dd.population = inhabitants[district] 97 | dd.district_id = district_ids[district] 98 | print(dd) 99 | -------------------------------------------------------------------------------- /scrapers/scrape_bl_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import re 6 | import scrape_common as sc 7 | import scrape_bl_common as sbc 8 | from datetime import timedelta 9 | 10 | 11 | # weekly data 12 | bulletin_urls = sbc.get_all_bl_bulletin_urls() 13 | for bulletin_url in bulletin_urls: 14 | bulletin_content = sc.download(bulletin_url, silent=True) 15 | soup = BeautifulSoup(bulletin_content, 'html.parser') 16 | content = soup.find(string=re.compile(r'Per heute .*')).string 17 | content = sbc.strip_bl_bulletin_numbers(content) 18 | 19 | date = sc.find(r'Per heute \w+, (\d+\. 
\w+ 20\d{2})', content) 20 | date = sc.date_from_text(date) 21 | # previous week 22 | date = date - timedelta(days=7) 23 | 24 | td = sc.TestData(canton='BL', url=bulletin_url) 25 | td.week = date.isocalendar()[1] 26 | td.year = date.year 27 | td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests', content) 28 | td.positivity_rate = sc.find(r'von diesen waren (\d+\.?,?\d?) Prozent positiv', content) 29 | if td.total_tests and td.positivity_rate: 30 | td.positivity_rate = td.positivity_rate.replace(',', '.') 31 | print(td) 32 | 33 | 34 | # daily data 35 | main_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft/covid-19-bl-tests' 36 | main_content = sc.download(main_url, silent=True) 37 | soup = BeautifulSoup(main_content, 'html.parser') 38 | 39 | def create_bs_test_data(date): 40 | td = sc.TestData(canton='BL', url=main_url) 41 | td.start_date = date 42 | td.end_date = date 43 | return td 44 | 45 | tests_data = {} 46 | 47 | for iframe in soup.find_all('iframe'): 48 | iframe_url = iframe['src'] 49 | d = sc.download(iframe_url, silent=True) 50 | d = d.replace('\n', ' ') 51 | 52 | # Taegliche PCR-Tests BL 53 | data = sc.find(r'
     ?Datum,"Negative Tests","Positive Tests"\s*([^<]+)
    ', d) 54 | if data: 55 | for row in data.split(" "): 56 | c = row.split(',') 57 | date = sbc.parse_bl_date(c[0])[0] 58 | if date not in tests_data: 59 | tests_data[date] = create_bs_test_data(date) 60 | tests_data[date].negative_tests = round(float(c[1])) 61 | tests_data[date].positive_tests = round(float(c[2])) 62 | continue 63 | 64 | # Taegliche Positivitaetsrate BL 65 | data = sc.find(r'
     ?Datum,"T.gliche Positivit.tsrate BL"\s*([^<]+)
    ', d) 66 | if data: 67 | for row in data.split(" "): 68 | c = row.split(',') 69 | date = sbc.parse_bl_date(c[0])[0] 70 | if date not in tests_data: 71 | tests_data[date] = create_bs_test_data(date) 72 | tests_data[date].positivity_rate = c[1] 73 | continue 74 | 75 | for date, td in tests_data.items(): 76 | print(td) 77 | -------------------------------------------------------------------------------- /scrapers/scrape_ur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | from bs4 import BeautifulSoup 5 | import scrape_common as sc 6 | 7 | url = 'https://www.ur.ch/themen/2962' 8 | d = sc.download(url, silent=True) 9 | d = d.replace(' ', ' ') 10 | d = d.replace('
    ', ' ') 11 | d = re.sub(r'(\d+)\'(\d+)', r'\1\2', d) 12 | 13 | # 2020-03-26 (and possibly earlier) from https://www.ur.ch/themen/2962 14 | # 2020-07-07 they changed the title, so we're using the table header to find the table 15 | # 2020-07-24 column "Genesen" was removed 16 | """ 17 | 18 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |

    Stand: 24.07.2020, 11.00 Uhr
    Positiv getestete Erkrankungsfälle | Hospitalisiert | Verstorben
    115 | 1 | 7
    33 | """ 34 | 35 | # 2020-08-03 new table layout with 6 columns 36 | """ 37 | 38 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 |

    Stand: 03.08.2020, 16.00 Uhr
    Aktive Fälle | Positiv getestete Erkrankungsfälle | Hospitalisiert | Quarantäne | Verstorben
    4 | 117 | 0 | 47 | 7
    59 | """ 60 | 61 | soup = BeautifulSoup(d, 'html.parser') 62 | data_table = soup.find(string=re.compile(r'Positive\s+Fälle\s+total')).find_parent('table') 63 | 64 | assert data_table, "Can't find data table" 65 | 66 | dd = sc.DayData(canton='UR', url=url) 67 | dd.datetime = sc.find(r'Stand: (.* Uhr)', d) 68 | 69 | rows = data_table.find_all('tr') 70 | assert len(rows) == 2, f"Number of rows changed, {len(rows)} != 2" 71 | 72 | headers = rows[0].find_all('td') or rows[0].find_all('th') 73 | assert len(headers) == 5, f"Number of header columns changed, {len(headers)} != 5" 74 | assert re.search(r'(aktive\s+fälle)', headers[0].text, flags=re.I) is not None 75 | assert re.search(r"(positive\s+fälle\s+total\s+seit\s+märz\s+2020)", headers[1].text, flags=re.I) is not None 76 | assert headers[2].text.lower() == "hospitalisiert" 77 | assert re.search(r"(total\s+verstorbene)", headers[3].text, flags=re.I) is not None 78 | 79 | cells = rows[1].find_all('td') 80 | assert len(cells) == 4, f"Number of columns changed, {len(cells)} != 4" 81 | 82 | ur_number_regex = r'(\d+)\s*(\(.+?\))?' 83 | dd.cases = sc.find(ur_number_regex, cells[1].text) 84 | dd.hospitalized = sc.find(ur_number_regex, cells[2].text) 85 | dd.deaths = sc.find(ur_number_regex, cells[3].text) 86 | 87 | print(dd) 88 | -------------------------------------------------------------------------------- /scrapers/scrape_gl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | from bs4 import BeautifulSoup 7 | import csv 8 | from io import StringIO 9 | import scrape_common as sc 10 | import scrape_gl_common as sgc 11 | 12 | def split_whitespace(text): 13 | if not text: 14 | return [] 15 | text = re.sub(r'\s\s+', ' ', text) 16 | return text.split(' ') 17 | 18 | is_first = True 19 | 20 | # weekly pdf 21 | pdf_url = sgc.get_gl_pdf_url() 22 | if pdf_url is not None: 23 | pdf = sc.download_content(pdf_url, silent=True) 24 | content = sc.pdftotext(pdf, page=1) 25 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content) 26 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content) 27 | 28 | pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content) 29 | pdf_date = sc.date_from_text(pdf_date) 30 | 31 | number_of_tests = sc.find(r'PCR-Tests/Schnelltests\sKanton Glarus\s(\d+)\s', content) 32 | if number_of_tests: 33 | dd = sc.DayData(canton='GL', url=pdf_url) 34 | dd.datetime = pdf_date 35 | dd.tested = number_of_tests 36 | is_first = False 37 | print(dd) 38 | 39 | 40 | content = sc.pdftotext(pdf, page=2, raw=True) 41 | dates = split_whitespace(sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nAnzahl\s+in\s+Isolation', content)) 42 | isolation = split_whitespace(sc.find(r'\nAnzahl\s+in\s+Isolation\s+(\d.*)\n', content)) 43 | quarantined = split_whitespace(sc.find(r'\nKontaktpersonen\s+in\s+Quarant.ne\s+(\d.*)\n', content)) 44 | 45 | if len(dates) == len(isolation) == len(quarantined): 46 | for date, iso, qua in zip(dates, isolation, quarantined): 47 | if sc.find(r'(\d{2}\.12)', date): 48 | year = '2020' 49 | else: 50 | year = pdf_date.year 51 | dd = sc.DayData(canton='GL', url=pdf_url) 52 | dd.datetime = f'{date}.{year}' 53 | dd.isolated = iso 54 | dd.quarantined = qua 55 | if not is_first: 56 | print('-' * 10) 57 | is_first = False 58 | print(dd) 59 | else: 60 | print('PDF data is inconsistent!', file=sys.stderr) 61 | print(f'dates: {len(dates)}, isolation: {len(isolation)}, quarantined: {len(quarantined)}', file=sys.stderr) 62 | 63 | 64 | # CSV 
from Google Spreadsheets 65 | main_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/edit#gid=0' 66 | csv_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/export?format=csv&id=1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k&gid=0' 67 | d_csv = sc.download(csv_url, silent=True) 68 | 69 | reader = csv.DictReader(StringIO(d_csv), delimiter=',') 70 | for row in reader: 71 | if row['Datum'] == '': 72 | continue 73 | if not is_first: 74 | print('-' * 10) 75 | is_first = False 76 | dd = sc.DayData(canton='GL', url=main_url) 77 | dd.datetime = row['Datum'] 78 | dd.cases = row['Fallzahlen Total'] 79 | dd.hospitalized = row['Personen in Spitalpflege'] 80 | dd.deaths = row['Todesfälle (kumuliert)'] 81 | print(dd) 82 | -------------------------------------------------------------------------------- /scrapers/scrape_so_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import re 6 | import scrape_common as sc 7 | import scrape_so_common as soc 8 | 9 | 10 | pdf_urls = soc.get_all_weekly_pdf_urls() 11 | # start with the oldest PDF to have the most recent ones last 12 | pdf_urls.reverse() 13 | for pdf_url in pdf_urls: 14 | content = sc.pdfdownload(pdf_url, layout=True, silent=True, page=1) 15 | # remove ' separator to simplify pattern matching 16 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content) 17 | 18 | date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content) 19 | date = sc.date_from_text(date) 20 | year1 = (date - datetime.timedelta(weeks=2)).year 21 | year2 = (date - datetime.timedelta(weeks=1)).year 22 | res = re.match(r'.*Woche (?P\d+)(\s+\(\d+\.\d+-\d+\.\d+\))?\s+Woche (?P\d+)\s+', content, re.DOTALL) 23 | assert res, 'Weeks could not be extracted' 24 | week1 = res['w1'] 25 | week2 = res['w2'] 26 | 27 | res = re.match(r'.*PCR-Tes\s?ts\s+(\d.*\n)?Total\s+\d+\s+\d+\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL) 28 | if not res: 29 | res = re.match(r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+\d+\.?\d?\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL) 30 | if not res: 31 | res = re.match(r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL) 32 | if res: 33 | total_tests1 = res[2] 34 | total_tests2 = res[3] 35 | 36 | if not res: 37 | res = re.match(r'.*\s+PCR\s+(\d+\s+)?(\d+)\s+(\d+)\s', content, re.DOTALL) 38 | assert res, f'PCR tests for week {week1} or {week2} could not be extracted!' 39 | if res: 40 | total_tests1 = int(res[2]) 41 | total_tests2 = int(res[3]) 42 | 43 | res = re.match(r'.*\s+Antigen-Schnelltests\s+(\d+\s+)?(\d+)\s+(\d+)', content, re.DOTALL) 44 | assert res, f'Antigen tests for week {week1} or {week2} could not be extracted!' 45 | if res: 46 | total_tests1 += int(res[2]) 47 | total_tests2 += int(res[3]) 48 | 49 | assert res, f'PCR tests for week {week1} or {week2} could not be extracted!' 
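# Aside - an illustrative sketch, not code used by this scraper: the cascade of
# "try a pattern, fall back to the next" blocks above could also be written as a small
# helper that returns the first pattern that matches (the helper name is hypothetical):
import re

def match_first(patterns, text):
    # try the DOTALL patterns in order and return the first successful match, else None
    for pattern in patterns:
        match = re.match(pattern, text, re.DOTALL)
        if match:
            return match
    return None

# e.g. res = match_first([pattern_pcr, pattern_labortests], content)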
50 | 51 | res = re.match(r'.*Positivit.tsrate\s+\*+?\s+\d+\.?\d?%?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?', content, re.DOTALL) 52 | pos_rate1 = None 53 | pos_rate2 = None 54 | if res: 55 | pos_rate1 = res[1] 56 | pos_rate2 = res[2] 57 | else: 58 | res = re.match(r'.*Anteil\s+pos\s?itiv\s?er\s+Tes\s?ts\s+\(%\)\s+(\d+\w+)?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?', content, re.DOTALL) 59 | if res: 60 | pos_rate1 = res[2] 61 | pos_rate2 = res[3] 62 | 63 | data = sc.TestData(canton='SO', url=pdf_url) 64 | data.week = week1 65 | data.year = year1 66 | data.total_tests = total_tests1 67 | data.positivity_rate = pos_rate1 68 | print(data) 69 | 70 | data = sc.TestData(canton='SO', url=pdf_url) 71 | data.week = week2 72 | data.year = year2 73 | data.total_tests = total_tests2 74 | data.positivity_rate = pos_rate2 75 | print(data) 76 | -------------------------------------------------------------------------------- /scrapers/test/test_test_data.py: -------------------------------------------------------------------------------- 1 | from scrapers.scrape_common import TestData 2 | 3 | def test_test_data(): 4 | dd = TestData() 5 | dd.start_date = '1' 6 | dd.end_date = '2' 7 | dd.week = 3 8 | dd.year = 4 9 | dd.canton = '5' 10 | dd.positive_tests = 6 11 | dd.negative_tests = 7 12 | dd.total_tests = 8 13 | dd.positivity_rate = 9 14 | dd.url = '10' 15 | 16 | string = str(dd) 17 | 18 | dd_parsed = TestData() 19 | assert dd_parsed.parse(string) 20 | assert dd.start_date == dd_parsed.start_date 21 | assert dd.end_date == dd_parsed.end_date 22 | assert dd.week == dd_parsed.week 23 | assert dd.year == dd_parsed.year 24 | assert dd.canton == dd_parsed.canton 25 | 26 | assert dd.positive_tests == dd_parsed.positive_tests 27 | assert dd.negative_tests == dd_parsed.negative_tests 28 | assert dd.positivity_rate == dd_parsed.positivity_rate 29 | 30 | assert dd.positive_tests == dd_parsed.positive_tests 31 | assert dd.negative_tests == dd_parsed.negative_tests 32 | assert dd.positivity_rate == dd_parsed.positivity_rate 33 | 34 | assert dd.pcr_positive_tests == dd_parsed.pcr_positive_tests 35 | assert dd.pcr_negative_tests == dd_parsed.pcr_negative_tests 36 | assert dd.pcr_positivity_rate == dd_parsed.pcr_positivity_rate 37 | 38 | assert dd.ag_positive_tests == dd_parsed.ag_positive_tests 39 | assert dd.ag_negative_tests == dd_parsed.ag_negative_tests 40 | assert dd.ag_positivity_rate == dd_parsed.ag_positivity_rate 41 | 42 | assert dd.url == dd_parsed.url 43 | 44 | 45 | def test_test_data_with_PCR_antigen(): 46 | dd = TestData() 47 | dd.start_date = '1' 48 | dd.end_date = '2' 49 | dd.week = 3 50 | dd.year = 4 51 | dd.canton = '5' 52 | 53 | dd.positive_tests = 6 54 | dd.negative_tests = 7 55 | dd.total_tests = 8 56 | dd.positivity_rate = 9 57 | 58 | dd.pcr_positive_tests = 10 59 | dd.pcr_negative_tests = 11 60 | dd.pcr_total_tests = 12 61 | dd.pcr_positivity_rate = 13 62 | 63 | dd.ag_positive_tests = 14 64 | dd.ag_negative_tests = 15 65 | dd.ag_total_tests = 16 66 | dd.ag_positivity_rate = 17 67 | 68 | dd.url = '18' 69 | 70 | string = str(dd) 71 | 72 | dd_parsed = TestData() 73 | assert dd_parsed.parse(string) 74 | assert dd.start_date == dd_parsed.start_date 75 | assert dd.end_date == dd_parsed.end_date 76 | assert dd.week == dd_parsed.week 77 | assert dd.year == dd_parsed.year 78 | assert dd.canton == dd_parsed.canton 79 | 80 | assert dd.positive_tests == dd_parsed.positive_tests 81 | assert dd.negative_tests == dd_parsed.negative_tests 82 | assert dd.positivity_rate == dd_parsed.positivity_rate 83 | 84 | assert 
dd.pcr_positive_tests == dd_parsed.pcr_positive_tests 85 | assert dd.pcr_negative_tests == dd_parsed.pcr_negative_tests 86 | assert dd.pcr_positivity_rate == dd_parsed.pcr_positivity_rate 87 | 88 | assert dd.ag_positive_tests == dd_parsed.ag_positive_tests 89 | assert dd.ag_negative_tests == dd_parsed.ag_negative_tests 90 | assert dd.ag_positivity_rate == dd_parsed.ag_positivity_rate 91 | 92 | assert dd.url == dd_parsed.url 93 | 94 | 95 | if __name__ == "__main__": 96 | test_test_data() 97 | -------------------------------------------------------------------------------- /scripts/check_for_outliers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import pandas as pd 7 | import math 8 | 9 | __location__ = os.path.realpath( 10 | os.path.join( 11 | os.getcwd(), 12 | os.path.dirname(__file__) 13 | ) 14 | ) 15 | 16 | # only values above this MIN_VALUE are considered outliers 17 | # this is to prevent a failing scraper run if the absolute value is not very high 18 | # this outlier detection is mostly to prevent human error (wrong data added) 19 | MIN_VALUE = 20 20 | 21 | # only check the last x days 22 | LAG_PERIODS = 10 23 | 24 | # periods considered "recent" 25 | RECENT_PERIODS = 5 26 | 27 | # IQR factor, determines how many times the IQR is the limit for an outlier 28 | FACTOR = 1.5 29 | 30 | assert len(sys.argv) >= 2, "Error: Call this script with the path(s) to CSV file(s)" 31 | 32 | fail = False 33 | 34 | args = sys.argv[1:] 35 | for csv_file in args: 36 | 37 | # load canton file from covid_19 repo 38 | df = pd.read_csv(csv_file, parse_dates=[0]) 39 | df_ignore = pd.read_csv(os.path.join(__location__, '..', 'outlier_status.csv'), parse_dates=[0]) 40 | df = pd.merge(df, df_ignore, left_on=['date', 'abbreviation_canton_and_fl'], right_on=['date', 'abbreviation_canton_and_fl'], how='left') 41 | 42 | # create new column for current cases 43 | df_conf = df[['date', 'ncumul_conf', 'ncumul_conf_outlier']].reset_index(drop=True) 44 | df_conf['current_conf'] = df['ncumul_conf'] - df['ncumul_conf'].shift(1) 45 | 46 | # only use the last 30 rows 47 | df_conf = df_conf.tail(LAG_PERIODS).reset_index(drop=True) 48 | 49 | # caculate iqr for confirmed cases 50 | q1 = df_conf['current_conf'].quantile(0.25) 51 | q3 = df_conf['current_conf'].quantile(0.75) 52 | iqr = q3 - q1 53 | 54 | if pd.isna(q1) or pd.isna(q3) or pd.isna(iqr): 55 | print(f"⚠️ {csv_file} has too many missing/NaN values (Q1: {q1}, Q3: {q3}, IQR: {iqr}) to calculate outliers, skipping.") 56 | continue 57 | 58 | lower_limit = q1 - (iqr * FACTOR) 59 | upper_limit = math.ceil(q3 + (iqr * FACTOR)) 60 | 61 | upper_limit = max(upper_limit, MIN_VALUE) 62 | lower_limit = 0 # always use 0 as lower limit 63 | df_conf['q1'] = q1 64 | df_conf['q3'] = q3 65 | df_conf['iqr'] = iqr 66 | df_conf['factor'] = FACTOR 67 | df_conf['upper_limit'] = upper_limit 68 | df_conf['lower_limit'] = lower_limit 69 | 70 | # use IQR*factor to get outliers 71 | outliers = df_conf.query('(current_conf < @lower_limit) or (current_conf > @upper_limit)') 72 | recent_outliers = df_conf.tail(RECENT_PERIODS).query("((current_conf < @lower_limit) or (current_conf > @upper_limit)) and (ncumul_conf_outlier != 'ignore')") 73 | if outliers.empty: 74 | print(f"✅ {csv_file} has no outliers.") 75 | else: 76 | if not recent_outliers.empty: 77 | fail = True 78 | print(f"❌ {csv_file} has recent outliers, please check if this is an error.") 79 | else: 80 | print(f"⚠️ 
{csv_file} has older or ignored outliers.") 81 | print(outliers[['date', 'ncumul_conf', 'current_conf', 'iqr', 'factor', 'upper_limit']]) 82 | print('') 83 | 84 | if fail: 85 | sys.exit(1) 86 | 87 | -------------------------------------------------------------------------------- /scrapers/test/test_dates.py: -------------------------------------------------------------------------------- 1 | from scrapers.scrape_dates import parse_date 2 | 3 | def test_dates(): 4 | date_tests = [ 5 | ('20. März 2020 15.00 Uhr', '2020-03-20T15:00'), 6 | ('21. März 2020, 10 Uhr', '2020-03-21T10:00'), 7 | ('21. März 2020, 11:00 Uhr', '2020-03-21T11:00'), 8 | ('21.03.2020, 15h30', '2020-03-21T15:30'), 9 | ('21. März 2020, 8.00 Uhr', '2020-03-21T08:00'), 10 | ('21. März 2020, 18.15  Uhr', '2020-03-21T18:15'), 11 | ('21. März 2020, 18.15 Uhr', '2020-03-21T18:15'), 12 | ('21. März 2020, 14.00 Uhr', '2020-03-21T14:00'), 13 | ('23. März 2020, 15 Uhr', '2020-03-23T15:00'), 14 | ('18. April 2020,16.00 Uhr', '2020-04-18T16:00'), 15 | ('21. März 2020', '2020-03-21T'), 16 | ('21.3.20', '2020-03-21T'), 17 | ('20.3.2020, 16.30', '2020-03-20T16:30'), 18 | ('21.03.2020, 15h30', '2020-03-21T15:30'), 19 | ('23.03.2020, 12:00', '2020-03-23T12:00'), 20 | ('23.03.2020 12:00', '2020-03-23T12:00'), 21 | ('08.04.2020: 09.30 Uhr', '2020-04-08T09:30'), 22 | ('07.04.2020 15.00h', '2020-04-07T15:00'), 23 | ('31.03.20, 08.00 h', '2020-03-31T08:00'), 24 | ('20.03.2020', '2020-03-20T'), 25 | ('21 mars 2020 (18h)', '2020-03-21T18:00'), 26 | ('1er avril 2020 (16h)', '2020-04-01T16:00'), 27 | ('21 mars 2020', '2020-03-21T'), 28 | ('6avril2020', '2020-04-06T'), 29 | ('20.03 à 8h00', '2020-03-20T08:00'), 30 | ('23.03 à 12h', '2020-03-23T12:00'), 31 | ('21 marzo 2020, ore 8.00', '2020-03-21T08:00'), 32 | ('27.03.2020 ore 08:00', '2020-03-27T08:00'), 33 | ('2020-03-23', '2020-03-23T'), 34 | ('24.3. / 10h', '2020-03-24T10:00'), 35 | ('2020-03-23T15:00:00', '2020-03-23T15:00'), 36 | ('2020-03-23 15:00:00', '2020-03-23T15:00'), 37 | ('2020-03-23 15:00', '2020-03-23T15:00'), 38 | ('30.04.2020,13.30 Uhr', '2020-04-30T13:30'), 39 | ('1.Mai 2020', '2020-05-01T'), 40 | ('05-05-2020 00:00', '2020-05-05T00:00'), 41 | ('07.05.2020, 00;00 Uhr', '2020-05-07T00:00'), 42 | ('17.06.2020 um 8 Uhr', '2020-06-17T08:00'), 43 | ('08.07.2020, um 8 Uhr', '2020-07-08T08:00'), 44 | ('8. Juli 2020 um 14:30 Uhr', '2020-07-08T14:30'), 45 | ('17.07.20 08:00', '2020-07-17T08:00'), 46 | ('12. 8. 
2020', '2020-08-12T'), 47 | ('1er septembre 2020', '2020-09-01T'), 48 | ] 49 | for text, date in date_tests: 50 | assert parse_date(text) == date, f"parse_date('{text}') = '{parse_date(text)}', but expected '{date}'" 51 | 52 | if __name__ == "__main__": 53 | test_dates() 54 | -------------------------------------------------------------------------------- /.github/workflows/run_district_scrapers.yml: -------------------------------------------------------------------------------- 1 | name: Run district scrapers 2 | 3 | on: 4 | schedule: 5 | - cron: '10 * * * *' # run every hour at xx:10 6 | workflow_dispatch: ~ 7 | jobs: 8 | run_scraper: 9 | runs-on: ubuntu-20.04 10 | continue-on-error: false 11 | timeout-minutes: 10 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | canton: 16 | #- AG 17 | - BE 18 | #- BL 19 | #- FR 20 | #- GR 21 | - SG 22 | #- SO 23 | #- SZ 24 | - TG 25 | #- VS 26 | 27 | steps: 28 | - uses: actions/checkout@v3 29 | 30 | - name: Set up Python 3.7 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: 3.7 34 | - run: npm ci 35 | - name: Remove broken apt repos 36 | run: | 37 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done 38 | - name: Install dependencies 39 | env: 40 | SCRAPER_KEY: ${{ matrix.canton }} 41 | run: | 42 | python -m pip install --upgrade pip setuptools wheel 43 | pip install -r requirements.txt 44 | sudo apt update || true # do not fail if update does not work 45 | sudo apt-get install sqlite3 poppler-utils 46 | if [ "$SCRAPER_KEY" = "AG" ] ; then 47 | pip install -r requirements-ocr.txt 48 | sudo apt-get install tesseract-ocr=3.04.01-4 49 | fi 50 | 51 | - name: Scrape new data 52 | env: 53 | SCRAPER_KEY: ${{ matrix.canton }} 54 | run: | 55 | ./scrapers/run_district_scraper.sh 56 | 57 | - name: Check if there are changes in the repo 58 | run: | 59 | if git diff -w --no-ext-diff --quiet 60 | then 61 | echo "changed=0" >> $GITHUB_OUTPUT 62 | else 63 | echo "changed=1" >> $GITHUB_OUTPUT 64 | fi 65 | id: changes 66 | 67 | - name: Set commit message 68 | env: 69 | SCRAPER_KEY: ${{ matrix.canton }} 70 | run: | 71 | echo "commit_msg=Update fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv from scraper" >> $GITHUB_ENV 72 | 73 | - name: Commit and push to repo 74 | if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes 75 | uses: github-actions-x/commit@v2.9 76 | with: 77 | github-token: ${{ secrets.GITHUB_TOKEN }} 78 | push-branch: master 79 | name: GitHub Action Scraper 80 | email: scraper@open.zh.ch 81 | commit-message: ${{ env.commit_msg }} 82 | rebase: 'true' 83 | 84 | - name: Get current unix timestamp 85 | if: always() 86 | id: date 87 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT 88 | 89 | - name: Notify slack failure 90 | if: ${{ failure() || cancelled() }} 91 | env: 92 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 93 | uses: pullreminders/slack-action@master 94 | with: 95 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run district scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: District scraper failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}' 96 | 97 | -------------------------------------------------------------------------------- 
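The district-scraper workflow above fans out into one job per enabled canton, runs ./scrapers/run_district_scraper.sh with SCRAPER_KEY set to the canton code, and only commits when the generated CSV actually changed. For debugging a single canton it can help to reproduce one matrix entry locally; the following is a minimal, unofficial sketch, assuming a Python 3.7 environment on a machine where sqlite3 and poppler-utils are already installed, using a canton that is enabled in the matrix (e.g. TG):

# reproduce one matrix entry of run_district_scrapers.yml locally (sketch only, not the CI entry point)
pip install -r requirements.txt
# AG additionally needs the OCR toolchain, mirroring the workflow's conditional step:
#   pip install -r requirements-ocr.txt && sudo apt-get install tesseract-ocr
SCRAPER_KEY=TG ./scrapers/run_district_scraper.sh
# the workflow only commits when this whitespace-insensitive diff check reports changes
git diff -w --no-ext-diff --quiet || echo "fallzahlen_kanton_TG_bezirk.csv changed and would be committed"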
/.github/workflows/run_tests_scraper.yml: -------------------------------------------------------------------------------- 1 | name: Run tests scrapers 2 | 3 | on: 4 | schedule: 5 | - cron: '20 * * * *' # run every hour at xx:20 6 | workflow_dispatch: ~ 7 | jobs: 8 | run_scraper: 9 | runs-on: ubuntu-20.04 10 | continue-on-error: false 11 | timeout-minutes: 10 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | canton: 16 | # - AG 17 | - BE 18 | - BL 19 | - BS 20 | - FL 21 | # - FR # no more data published anymore 22 | # - GE 23 | - GL 24 | # - JU #disable until PDF is fixed 25 | # - NW 26 | - SG 27 | - SH 28 | # - SO 29 | - TG 30 | # - TI # no more data published anymore 31 | # - VD 32 | - VS 33 | - ZG 34 | - ZH 35 | 36 | steps: 37 | - uses: actions/checkout@v3 38 | 39 | - name: Set up Python 3.7 40 | uses: actions/setup-python@v4 41 | with: 42 | python-version: 3.7 43 | - run: npm ci 44 | - name: Remove broken apt repos 45 | run: | 46 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done 47 | - name: Install dependencies 48 | env: 49 | SCRAPER_KEY: ${{ matrix.canton }} 50 | run: | 51 | python -m pip install --upgrade pip setuptools wheel 52 | pip install -r requirements.txt 53 | sudo apt update || true # do not fail if update does not work 54 | sudo apt-get install sqlite3 poppler-utils 55 | if [ "$SCRAPER_KEY" = "GE" ] ; then 56 | sudo apt-get install chromium-browser 57 | fi 58 | 59 | - name: Scrape new data 60 | env: 61 | SCRAPER_KEY: ${{ matrix.canton }} 62 | run: | 63 | ./scrapers/run_tests_scraper.sh 64 | 65 | - name: Check if there are changes in the repo 66 | run: | 67 | if git diff -w --no-ext-diff --quiet 68 | then 69 | echo "changed=0" >> $GITHUB_OUTPUT 70 | else 71 | echo "changed=1" >> $GITHUB_OUTPUT 72 | fi 73 | id: changes 74 | 75 | - name: Set commit message 76 | env: 77 | SCRAPER_KEY: ${{ matrix.canton }} 78 | run: | 79 | if [ "$SCRAPER_KEY" = "FL" ] ; then 80 | echo "commit_msg=Update fallzahlen_${SCRAPER_KEY}_tests.csv from scraper" >> $GITHUB_ENV 81 | else 82 | echo "commit_msg=Update fallzahlen_kanton_${SCRAPER_KEY}_tests.csv from scraper" >> $GITHUB_ENV 83 | fi 84 | 85 | - name: Commit and push to repo 86 | if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes 87 | uses: github-actions-x/commit@v2.9 88 | with: 89 | github-token: ${{ secrets.GITHUB_TOKEN }} 90 | push-branch: master 91 | name: GitHub Action Scraper 92 | email: scraper@open.zh.ch 93 | commit-message: ${{ env.commit_msg }} 94 | rebase: 'true' 95 | 96 | - name: Get current unix timestamp 97 | if: always() 98 | id: date 99 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT 100 | 101 | - name: Notify slack failure 102 | if: ${{ failure() || cancelled() }} 103 | env: 104 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 105 | uses: pullreminders/slack-action@master 106 | with: 107 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run tests scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: Tests scraper failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}' 108 | 109 | -------------------------------------------------------------------------------- /fallzahlen_bezirke/fallzahlen_kanton_AG_bezirk.csv: 
-------------------------------------------------------------------------------- 1 | DistrictId,District,Canton,Date,Week,Year,Population,TotalConfCases,NewConfCases,TotalDeaths,NewDeaths,SourceUrl 2 | 1901,Aarau,AG,2020-10-26,,,79702,353,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 3 | 1901,Aarau,AG,2020-11-04,,,79702,527,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 4 | 1901,Aarau,AG,2020-11-13,,,79702,527,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 5 | 1902,Baden,AG,2020-10-26,,,145696,735,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 6 | 1902,Baden,AG,2020-11-04,,,145696,1079,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 7 | 1902,Baden,AG,2020-11-13,,,145696,1079,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 8 | 1903,Bremgarten,AG,2020-10-26,,,78745,277,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 9 | 1903,Bremgarten,AG,2020-11-04,,,78745,430,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 10 | 1903,Bremgarten,AG,2020-11-13,,,78745,430,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 11 | 1904,Brugg,AG,2020-10-26,,,51814,179,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 12 | 1904,Brugg,AG,2020-11-04,,,51814,270,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 13 | 1904,Brugg,AG,2020-11-13,,,51814,270,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 14 | 1905,Kulm,AG,2020-10-26,,,42412,153,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 15 | 1905,Kulm,AG,2020-11-04,,,42412,232,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 16 | 1905,Kulm,AG,2020-11-13,,,42412,232,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 17 | 1906,Laufenburg,AG,2020-10-26,,,33035,96,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 18 | 1906,Laufenburg,AG,2020-11-04,,,33035,130,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 19 | 1906,Laufenburg,AG,2020-11-13,,,33035,130,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 20 | 1907,Lenzburg,AG,2020-10-26,,,64792,261,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 21 | 1907,Lenzburg,AG,2020-11-04,,,64792,378,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 22 | 1907,Lenzburg,AG,2020-11-13,,,64792,378,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 23 | 1908,Muri,AG,2020-10-26,,,37170,152,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 24 | 1908,Muri,AG,2020-11-04,,,37170,213,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 25 | 1908,Muri,AG,2020-11-13,,,37170,213,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 26 | 1909,Rheinfelden,AG,2020-10-26,,,47926,158,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 27 | 1909,Rheinfelden,AG,2020-11-04,,,47926,235,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 28 | 
1909,Rheinfelden,AG,2020-11-13,,,47926,235,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 29 | 1910,Zofingen,AG,2020-10-26,,,73136,271,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 30 | 1910,Zofingen,AG,2020-11-04,,,73136,408,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 31 | 1910,Zofingen,AG,2020-11-13,,,73136,408,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 32 | 1911,Zurzach,AG,2020-10-26,,,34650,127,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 33 | 1911,Zurzach,AG,2020-11-04,,,34650,206,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 34 | 1911,Zurzach,AG,2020-11-13,,,34650,206,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp 35 | -------------------------------------------------------------------------------- /scrapers/add_district_db_entry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import sqlite3 5 | import traceback 6 | import os 7 | 8 | import db_common as dc 9 | import scrape_common as sc 10 | 11 | __location__ = dc.get_location() 12 | 13 | input_failures = 0 14 | 15 | try: 16 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite') 17 | conn = sqlite3.connect(DATABASE_NAME) 18 | 19 | i = 0 20 | for line in sys.stdin: 21 | dd = sc.DistrictData() 22 | if dd.parse(line.strip()): 23 | c = conn.cursor() 24 | try: 25 | print(dd) 26 | 27 | c.execute( 28 | ''' 29 | INSERT INTO data ( 30 | DistrictId, 31 | District, 32 | Canton, 33 | Date, 34 | Week, 35 | Year, 36 | Population, 37 | TotalConfCases, 38 | NewConfCases, 39 | TotalDeaths, 40 | NewDeaths, 41 | SourceUrl 42 | ) 43 | VALUES 44 | (?,?,?,?,?,?,?,?,?,?,?,?) 45 | ; 46 | 47 | ''', 48 | [ 49 | dd.district_id, 50 | dd.district, 51 | dd.canton, 52 | dd.date or '', 53 | dd.week or '', 54 | dd.year or '', 55 | dd.population, 56 | dd.total_cases, 57 | dd.new_cases, 58 | dd.total_deceased, 59 | dd.new_deceased, 60 | dd.url, 61 | ] 62 | ) 63 | 64 | print("Successfully added new entry.") 65 | except sqlite3.IntegrityError as e: 66 | # try UPDATE if INSERT didn't work (i.e. constraint violation) 67 | try: 68 | c.execute( 69 | ''' 70 | UPDATE data SET 71 | Population = ?, 72 | TotalConfCases = ?, 73 | NewConfCases = ?, 74 | TotalDeaths = ?, 75 | NewDeaths = ?, 76 | SourceUrl = ? 77 | WHERE DistrictId = ? 78 | AND District = ? 79 | AND Canton = ? 80 | AND Date = ? 81 | AND Week = ? 82 | AND Year = ? 
83 | ; 84 | ''', 85 | [ 86 | dd.population, 87 | dd.total_cases, 88 | dd.new_cases, 89 | dd.total_deceased, 90 | dd.new_deceased, 91 | dd.url, 92 | dd.district_id, 93 | dd.district, 94 | dd.canton, 95 | dd.date or '', 96 | dd.week or '', 97 | dd.year or '', 98 | ] 99 | ) 100 | print("Successfully updated entry.") 101 | except sqlite3.Error as e: 102 | print("Error: an error occured in sqlite3: ", e.args[0], file=sys.stderr) 103 | conn.rollback() 104 | input_failures += 1 105 | finally: 106 | conn.commit() 107 | except Exception as e: 108 | print("Error: %s" % e, file=sys.stderr) 109 | print(traceback.format_exc(), file=sys.stderr) 110 | sys.exit(1) 111 | finally: 112 | conn.close() 113 | 114 | if input_failures: 115 | print(f'input_failures: {input_failures}') 116 | sys.exit(1) 117 | -------------------------------------------------------------------------------- /scrapers/convert_parsed_to_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Reads data in a format produced by ./parse_scrape_output.py 4 | # from standard input, and converts into CSV file on a standard output. 5 | # 6 | # Example usage: 7 | # ./meta_scrape.sh | ./convert_parsed_to_csv.py > latest.csv 8 | # ./scrape_vd.sh | ./parse_scrape_output.py | ./convert_parsed_to_csv.py > vd.csv 9 | # cat *0.txt | ./convert_parsed_to_csv.py > full_history.csv 10 | # 11 | # See README.md for details about columns defined in CSV format. 12 | 13 | import csv 14 | import re 15 | import sys 16 | 17 | # See README.md for more details about these fields. 18 | field_names = [ 19 | 'date', 20 | 'time', 21 | 'abbreviation_canton_and_fl', 22 | 'ncumul_tested', 23 | 'ncumul_conf', 24 | 'ncumul_hosp', # Actually not cumulative. 25 | 'ncumul_ICU', # Actually not cumulative. 26 | 'ncumul_vent', # Actually not cumulative. 
27 | 'ncumul_released', 28 | 'ncumul_deceased', 29 | 'source', 30 | ] 31 | 32 | writer = csv.DictWriter(sys.stdout, field_names, 33 | delimiter=',', 34 | quotechar='"', 35 | lineterminator='\n', 36 | quoting=csv.QUOTE_MINIMAL) 37 | 38 | writer.writeheader() 39 | 40 | input_failures = 0 41 | for line in sys.stdin: 42 | l = line.strip() 43 | 44 | # AR 2020-03-23T10:00 30 1 OK 2020-03-23T19:12:09+01:00 https://www.ai.ch/themen/gesundheit-alter-und-soziales/gesundheitsfoerderung-und-praevention/uebertragbare-krankheiten/coronavirus 45 | # GE 2020-03-27T 1924 23 OK 2020-03-28T18:57:34+01:00 # Extras: ncumul_hosp=313,ncumul_ICU=54 # URLs: https://www.ge.ch/document/point-coronavirus-maladie-covid-19/telecharger 46 | 47 | # Groups: 1 2 3 4 5 6 7 8 48 | match = re.search(r'^([A-Z][A-Z])\s+((?:\d\d\d\d-\d\d-\d\d)T(?:\d\d:\d\d)?)\s+(\d+)\s+(\d+|-)\s+OK\s+([0-9:\+\-\.T]+)(?:\s+# Extras: ([^#]+))?(?:\s+(?:(# URLs: )?(h.+)))?(?:\s+(http.+))?$', l) 49 | if not match: 50 | input_failures += 1 51 | print(f"Failed to parse line: {l}", file=sys.stderr) 52 | continue 53 | 54 | abbr = match.group(1) 55 | 56 | date_part = match.group(2).split('T', 2) 57 | 58 | data = { 59 | 'date': date_part[0], 60 | 'time': None, 61 | 'abbreviation_canton_and_fl': abbr, 62 | 'ncumul_tested': None, 63 | 'ncumul_conf': int(match.group(3)), 64 | 'ncumul_hosp': None, 65 | 'ncumul_ICU': None, 66 | 'ncumul_vent': None, 67 | 'ncumul_released': None, 68 | 'ncumul_deceased': None, 69 | 'source': '', 70 | } 71 | 72 | if len(date_part) == 2: 73 | data['time'] = date_part[1] 74 | 75 | if match.group(4) != '-': 76 | data['ncumul_deceased'] = int(match.group(4)) 77 | 78 | scrape_time = match.group(5) 79 | 80 | url_sources = match.group(7) 81 | if match.group(8): 82 | url_sources = match.group(8) 83 | if url_sources: 84 | data['source'] = f'Scraper for {abbr} at {scrape_time} using {url_sources}' 85 | else: 86 | data['source'] = f'Scraper for {abbr} at {scrape_time}' 87 | 88 | # Parse optional data. 
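# The optional "# Extras:" block (captured as group 6 above) is a comma-separated
# list of key=value pairs, e.g. "ncumul_hosp=313,ncumul_ICU=54"; only the
# whitelisted keys below are copied into the CSV row, all other keys are ignored.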
89 | extras_list = match.group(6) 90 | if extras_list: 91 | try: 92 | extras = extras_list.strip() 93 | extras = extras.split(',') 94 | extras = { kv.split('=', 2)[0]: int(kv.split('=', 2)[1]) for kv in extras } 95 | # data.update(extras) 96 | for k in ['ncumul_hosp', 'ncumul_ICU', 'ncumul_vent', 'ncumul_released', 'new_hosp', 'current_hosp']: 97 | if k in extras: 98 | data[k] = extras[k] 99 | except Exception as e: 100 | input_failures += 1 101 | print(f'Error: Parsing optional data failed, ignoring: {extras_list}', file=sys.stderr) 102 | 103 | # print(data) 104 | writer.writerow(data) 105 | 106 | sys.stdout.flush() 107 | 108 | if input_failures: 109 | sys.exit(1) 110 | -------------------------------------------------------------------------------- /scrapers/scrape_gr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datetime 4 | import re 5 | from bs4 import BeautifulSoup 6 | import scrape_common as sc 7 | 8 | 9 | is_first = True 10 | 11 | url = 'https://www.gr.ch/DE/institutionen/verwaltung/djsg/ga/coronavirus/info/Seiten/Start.aspx' 12 | data = sc.download(url, silent=True) 13 | data = re.sub(r'(\d+)'(\d+)', r'\1\2', data) 14 | soup = BeautifulSoup(data, 'html.parser') 15 | elem = soup.find('h2', text=re.compile(r'Fallzahlen\s+Kanton.*')) 16 | if elem is not None: 17 | table = elem.find_next('table') 18 | body = table.find('tbody') 19 | for row in body.find_all('tr'): 20 | tds = row.find_all('td') 21 | 22 | if not is_first: 23 | print('-' * 10) 24 | is_first = False 25 | 26 | dd = sc.DayData(canton='GR', url=url) 27 | dd.datetime = tds[0].text 28 | dd.cases = tds[1].text 29 | dd.isolated = tds[3].text 30 | dd.quarantined = tds[4].text 31 | dd.deaths = tds[6].text 32 | dd.hospitalized = tds[8].text 33 | dd.icu = tds[10].text 34 | dd.vent = tds[11].text 35 | print(dd) 36 | 37 | 38 | json_url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Total_Kanton/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=none&outFields=*&returnHiddenFields=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=Eingangs_Datum&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=standard&f=pjson' 39 | data = sc.jsondownload(json_url, silent=True) 40 | 41 | # 2020-04-02 42 | """ 43 | features: [ 44 | { 45 | attributes: { 46 | Eingangs_Datum: 1582675200000, 47 | Anzahl_Fälle_total__kumuliert_: 2, 48 | Neue_Faelle: 2, 49 | Neue_aktive_Fälle: 2, 50 | Anzahl_aktive_Fälle_total: 2, 51 | Anzahl_Personen_in_Isolation: 0, 52 | Anzahl_Personen_in_Quarantäne: 0, 53 | Verstorbene: 0, 54 | Verstorbene__kumuliert_: 0, 55 | Neue_Hospitalisierungen: 0, 56 | Hospitalisiert_Total: 0, 57 | Neu_Pflege: 0, 58 | Hospitalisiert_Pflege: 0, 59 | Neu_IPS: 0, 60 | Hospialisiert_IPS: 0, 61 | Neu_IPS_beatmet: 0, 62 | Hospitalisiert_IPS_beatmet: 0, 63 | FID: 1 64 | } 65 | }, 66 | { 67 | attributes: { 68 | Eingangs_Datum: 1582761600000, 69 | Anzahl_Fälle_total__kumuliert_: 2, 70 | Neue_Faelle: 0, 71 | Neue_aktive_Fälle: 0, 72 | Anzahl_aktive_Fälle_total: 2, 73 | Anzahl_Personen_in_Isolation: 0, 74 | Anzahl_Personen_in_Quarantäne: 0, 75 | Verstorbene: 0, 76 | Verstorbene__kumuliert_: 0, 77 | Neue_Hospitalisierungen: 0, 78 | Hospitalisiert_Total: 0, 79 | Neu_Pflege: 0, 80 | Hospitalisiert_Pflege: 0, 81 | Neu_IPS: 0, 82 | Hospialisiert_IPS: 0, 83 | Neu_IPS_beatmet: 0, 84 | Hospitalisiert_IPS_beatmet: 0, 85 | 
FID: 2 86 | } 87 | }, 88 | """ 89 | 90 | assert 'features' in data, "JSON did not contain `features` key" 91 | 92 | for feature in data['features']: 93 | row = feature['attributes'] 94 | if not is_first: 95 | print('-' * 10) 96 | is_first = False 97 | 98 | dd = sc.DayData(canton='GR', url=json_url) 99 | dd.datetime = datetime.datetime.fromtimestamp(row['Eingangs_Datum'] / 1000).date().isoformat() 100 | dd.cases = row['Anzahl_Fälle_total__kumuliert_'] 101 | dd.hospitalized = row['Hospitalisiert_Total'] 102 | dd.icu = row['Hospialisiert_IPS'] 103 | dd.vent = row['Hospitalisiert_IPS_beatmet'] 104 | # Neue_Hospotalisierungen does currently not match our definition of new_hosp 105 | # GR provides this calculated field as the difference between 106 | # hospitalized from yesterday and today 107 | #dd.new_hosp = row['Neue_Hospitalisierungen'] 108 | dd.deaths = row['Verstorbene__kumuliert_'] 109 | dd.isolated = row['Anzahl_Personen_in_Isolation'] 110 | dd.quarantined = row['Anzahl_Personen_in_Quarantäne'] 111 | print(dd) 112 | -------------------------------------------------------------------------------- /fallzahlen_tests/fallzahlen_kanton_JU_tests.csv: -------------------------------------------------------------------------------- 1 | canton,start_date,end_date,week,year,positive_tests,negative_tests,total_tests,positivity_rate,source,pcr_positive_tests,pcr_negative_tests,pcr_total_tests,pcr_positivity_rate,ag_positive_tests,ag_negative_tests,ag_total_tests,ag_positivity_rate 2 | JU,,,43,2020,179,,719,25.0,https://www.jura.ch/Htdocs/Files/v/35815.pdf,,,,,,,, 3 | JU,,,44,2020,219,,1064,23.0,https://www.jura.ch/Htdocs/Files/v/35911.pdf,,,,,,,, 4 | JU,,,45,2020,418,,1590,27.0,https://www.jura.ch/Htdocs/Files/v/35986.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem45_vf.pdf?download=1,,,,,,,, 5 | JU,,,46,2020,252,,1130,24.0,https://www.jura.ch/Htdocs/Files/v/36049.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem46_vf.pdf?download=1,,,,,,,, 6 | JU,,,47,2020,203,,853,25.0,https://www.jura.ch/Htdocs/Files/v/36126.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/rapport_hebdo_COVID_JU_sem47_vf.pdf?download=1,,,,,,,, 7 | JU,,,48,2020,158,,736,22.0,https://www.jura.ch/Htdocs/Files/v/36196.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem48.pdf,,,,,,,, 8 | JU,,,49,2020,136,,882,15.0,https://www.jura.ch/Htdocs/Files/v/36338.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem49_vf.pdf,,,,,,,, 9 | JU,,,50,2020,145,,1125,13.0,https://www.jura.ch/Htdocs/Files/v/36416.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem50.pdf,,,,,,,, 10 | JU,,,51,2020,242,,1552,16.0,https://www.jura.ch/Htdocs/Files/v/36492.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem51.pdf,,,,,,,, 11 | JU,,,52,2020,144,,1072,13.0,https://www.jura.ch/Htdocs/Files/v/36498.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem52.pdf,,,,,,,, 12 | JU,,,53,2020,244,,1235,20.0,https://www.jura.ch/Htdocs/Files/v/36536.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem53.pdf,,,,,,,, 13 | JU,,,1,2021,246,,1143,22.0,https://www.jura.ch/Htdocs/Files/v/36563.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem1_2021.pdf,,,,,,,, 14 | 
JU,,,2,2021,215,,1231,17.0,https://www.jura.ch/Htdocs/Files/v/36660.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem2_2021_corr.pdf,,,,,,,, 15 | JU,,,3,2021,179,,1117,16.0,https://www.jura.ch/Htdocs/Files/v/36720.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem3_2021_vf.pdf,,,,,,,, 16 | JU,,,4,2021,207,,1448,14.0,https://www.jura.ch/Htdocs/Files/v/36790.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem4_2021.pdf,,,,,,,, 17 | JU,,,5,2021,127,,1877,7.0,https://www.jura.ch/Htdocs/Files/v/36821.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem5_2021.pdf,,,,,,,, 18 | JU,,,6,2021,127,,1342,9.0,https://www.jura.ch/Htdocs/Files/v/36872.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem6_2021.pdf,,,,,,,, 19 | JU,,,7,2021,143,,1500,10.0,https://www.jura.ch/Htdocs/Files/v/36918.pdf/Departements/CHA/SIC/Communiques/2021/rapport_hebdo_COVID_JU_sem7_2021.pdf,,,,,,,, 20 | JU,,,8,2021,151,,969,13.0,https://www.jura.ch/Htdocs/Files/v/36986.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem8_2021.pdf,,,,,,,, 21 | JU,,,9,2021,154,,927,14.0,https://www.jura.ch/Htdocs/Files/v/37064.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem9_2021.pdf,,,,,,,, 22 | JU,,,10,2021,80,,1099,7.0,https://www.jura.ch/Htdocs/Files/v/37125.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem10_2021.pdf,,,,,,,, 23 | JU,,,11,2021,97,,1383,7.0,https://www.jura.ch/Htdocs/Files/v/37180.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem11_2021.pdf,,,,,,,, 24 | JU,,,12,2021,104,,1715,6.0,https://www.jura.ch/Htdocs/Files/v/37241.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport-hebdomadaire.pdf,,,,,,,, 25 | JU,,,13,2021,148,,2116,7.0,https://www.jura.ch/Htdocs/Files/v/37276.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/Rapport_hebdo_COVID_sem13.pdf,,,,,,,, 26 | JU,,,14,2021,110,,1205,8.0,https://www.jura.ch/Htdocs/Files/v/37332.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport-hebdo-COVID-sem14.pdf,,,,,,,, 27 | -------------------------------------------------------------------------------- /scrapers/scrape_bl_districts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import scrape_common as sc 6 | import scrape_bl_common as sbc 7 | from collections import defaultdict, OrderedDict 8 | from datetime import datetime 9 | 10 | main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft" 11 | main_site = sc.download(main_url, silent=True) 12 | 13 | # 2020-04-08, two iframes 14 | """ 15 | 16 | 17 | """ 18 | 19 | 20 | def parse_row_date(s): 21 | return sbc.parse_bl_date(s)[0] 22 | 23 | 24 | rows = defaultdict(dict) 25 | soup = BeautifulSoup(main_site, 'html.parser') 26 | for iframe in soup.find_all('iframe'): 27 | iframe_url = (iframe['src']) 28 | 29 | if iframe_url.find('/dbw/360') <= 0: 30 | continue 31 | 32 | d = sc.download(iframe_url, silent=True) 33 | 34 | # 2020-07-29 35 | """ 36 |