├── scrapers
│   ├── __init__.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── test_district_data.py
│   │   ├── test_test_data.py
│   │   └── test_dates.py
│   ├── .gitignore
│   ├── scrape_nw_common.py
│   ├── scrape_gl_common.py
│   ├── scrape_ag_common.py
│   ├── scrape_vd_common.py
│   ├── scrape_fl_tests.py
│   ├── scrape_zh_tests.py
│   ├── scrape_be_tests.py
│   ├── scrape_fr_common.py
│   ├── validate_scraper_output.sh
│   ├── scrape_so_common.py
│   ├── db_common.py
│   ├── scrape_fr_tests.py
│   ├── scrape_nw_tests.py
│   ├── scrape_zh.py
│   ├── scrape_tg_districts.py
│   ├── scrape_ti_tests.py
│   ├── scrape_zg_tests.py
│   ├── scrape_sh_tests.py
│   ├── scrape_gl_tests.py
│   ├── scrape_nw.py
│   ├── run_district_scraper.sh
│   ├── scrape_vs_common.py
│   ├── run_tests_scraper.sh
│   ├── meta_scrape.sh
│   ├── scrape_sg_tests.py
│   ├── scrape_vs_tests.py
│   ├── test_tests_scraper.sh
│   ├── scrape_tg.py
│   ├── run_scraper.sh
│   ├── test_district_scraper.sh
│   ├── scrape_ne.py
│   ├── scrape_ag_tests.py
│   ├── scrape_ju_tests.py
│   ├── scrape_ai.py
│   ├── scrape_bl_common.py
│   ├── scrape_fr.py
│   ├── scrape_be_districts.py
│   ├── scrape_ge_tests.py
│   ├── test_scraper.sh
│   ├── scrape_lu.py
│   ├── scrape_sg_districts.py
│   ├── scrape_bs.py
│   ├── download.sh
│   ├── scrape_tests.py
│   ├── validate_scrapers.py
│   ├── scrape_ge_common.py
│   ├── populate_district_database.py
│   ├── scrape_so_districts.py
│   ├── scrape_sh.py
│   ├── scrape_bs_tests.py
│   ├── scrape_be.py
│   ├── scrape_sz_districts.py
│   ├── populate_database.py
│   ├── scrape_gr_districts.py
│   ├── populate_tests_database.py
│   ├── scrape_tg_tests.py
│   ├── scrape_ti.py
│   ├── scrape_sz.py
│   ├── certificate.pem
│   ├── scrape_ag.py
│   ├── scrape_vd_tests.py
│   ├── scrape_so.py
│   ├── scrape_fl.py
│   ├── scrape_ow.py
│   ├── scrape_vs.py
│   ├── scrape_sh_common.py
│   ├── scrape_fr_districts.py
│   ├── scrape_bl_tests.py
│   ├── scrape_ur.py
│   ├── scrape_gl.py
│   ├── scrape_so_tests.py
│   ├── add_district_db_entry.py
│   ├── convert_parsed_to_csv.py
│   ├── scrape_gr.py
│   ├── scrape_bl_districts.py
│   ├── scrape_ag_districts.py
│   ├── scrape_vs_districts.py
│   └── scrape_vd.py
├── gd.png
├── logos.png
├── requirements-ocr.txt
├── dashboard
│   └── dashboard.png
├── binder
│   └── environment.yml
├── statistisches_amt_kt_zh.png
├── .gitignore
├── requirements.txt
├── setup.py
├── fallzahlen_bezirke
│   ├── Readme.md
│   └── fallzahlen_kanton_AG_bezirk.csv
├── fallzahlen_plz
│   └── Readme.md
├── fallzahlen_kanton_total_csv_v2
│   └── README.md
├── COVID19_Fallzahlen_Kanton_ZH_isolated_quarantined.csv
├── COVID19_Fallzahlen_Kanton_ZH_Beispiel_alter_geschlecht_.csv
├── fallzahlen_kanton_alter_geschlecht_csv
│   ├── COVID19_Fallzahlen_Kanton_AI_alter_geschlecht.csv
│   ├── COVID19_Fallzahlen_Kanton_alter_geschlecht_BEISPIEL.csv
│   ├── Readme.md
│   ├── COVID19_Fallzahlen_Kanton_AR_alter_geschlecht.csv
│   ├── COVID19_Fallzahlen_Kanton_AG_alter_geschlecht.csv
│   └── COVID19_Einwohner_Kanton_ZH_altersklassen_geschlecht.csv
├── fallzahlen_kanton_zh
│   ├── README.md
│   └── COVID19_VOC_Kanton_ZH.csv
├── scripts
│   ├── latest_total.sh
│   ├── transform_all_new2old.sh
│   ├── transform_all_add_columns.sh
│   ├── transform_all_old2new.sh
│   ├── check_for_empty_lines.sh
│   ├── merge_canton_csvs.rb
│   ├── update_dates_in_readme.sh
│   ├── validate-schema.js
│   ├── latest_per_canton.sh
│   ├── new2oldcsv.py
│   ├── old2newcsv.py
│   ├── add_new_columns.py
│   ├── remove_older_entries.py
│   ├── check_for_outliers.py
│   └── validate-csv.js
├── mappingCanton_BFS.csv
├── package.json
├── COVID19_Fallzahlen_Beispiel.csv
├── CONTRIBUTING.md
├── fallzahlen_kanton_total_csv
│   └── README.md
├── .github
│   └── workflows
│       ├── rebase.yml
│       ├── lint_python.yml
│       ├── test_scraper.yml
│       ├── activate_scraper.yml
│       ├── deactivate_scraper.yml
│       ├── test_tests_scraper.yml
│       ├── test_district_scraper.yml
│       ├── validate-csv.yml
│       ├── run_district_scrapers.yml
│       └── run_tests_scraper.yml
├── correction_status.csv
└── fallzahlen_tests
    └── fallzahlen_kanton_JU_tests.csv
/scrapers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapers/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapers/.gitignore:
--------------------------------------------------------------------------------
1 | webarchiveorg.log
2 | __pycache__
3 |
--------------------------------------------------------------------------------
/gd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/gd.png
--------------------------------------------------------------------------------
/logos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/logos.png
--------------------------------------------------------------------------------
/requirements-ocr.txt:
--------------------------------------------------------------------------------
1 | opencv-python==4.4.0.44
2 | numpy
3 | pytesseract
4 |
--------------------------------------------------------------------------------
/dashboard/dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/dashboard/dashboard.png
--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | dependencies:
4 | - matplotlib
5 | - pandas
6 |
--------------------------------------------------------------------------------
/statistisches_amt_kt_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openZH/covid_19/HEAD/statistisches_amt_kt_zh.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | node_modules
3 | scrapers/data.sqlite
4 | *.pyc
5 | boxplot.png
6 | geckodriver.log
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | requests
3 | dateparser
4 | xlrd==1.2.0
5 | pytest
6 | pandas
7 | selenium
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(name="scrapers", packages=find_packages())
4 |
--------------------------------------------------------------------------------
/fallzahlen_bezirke/Readme.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 |
3 | See https://github.com/openZH/covid_19/tree/master#canton-zurich-districts-bezirk.
4 |
--------------------------------------------------------------------------------
/fallzahlen_plz/Readme.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 | See: https://github.com/openZH/covid_19/tree/master#canton-zurich-postal-codes-postleitzahl.
3 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_total_csv_v2/README.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 | Siehe: https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-unified-dataset.
3 |
--------------------------------------------------------------------------------
/COVID19_Fallzahlen_Kanton_ZH_isolated_quarantined.csv:
--------------------------------------------------------------------------------
1 | date,abbreviation_canton_and_fl,current_isolated,current_quarantined
2 | 2020-05-26,ZH,14,58
3 | 2020-05-29,ZH,22,67
4 | 2020-06-02,ZH,18,47
5 |
--------------------------------------------------------------------------------
/COVID19_Fallzahlen_Kanton_ZH_Beispiel_alter_geschlecht_.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewDeaths,PreExistingCond
2 | 2020-03-01,Canton_ZH,30,F,1,0,0
3 | 2020-03-01,Canton_ZH,32,M,0,1,1
4 |
5 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AI_alter_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths
2 | 14.03.2020,Canton_AI,59,m,,1,,
3 | 14.03.2020,Canton_AI,57,f,,1,,
4 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_alter_geschlecht_BEISPIEL.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewDeaths,PreExistingCond
2 | 2020-03-01,Canton_ZH,30,F,1,0,0
3 | 2020-03-01,Canton_ZH,32,M,0,1,1
4 |
5 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_zh/README.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 | See:
3 | - https://github.com/openZH/covid_19/tree/master#canton-z%C3%BCrich-unified-dataset
4 | - https://github.com/openZH/covid_19/blob/master/README.md#canton-z%C3%BCrich-more-detailed-dataset.
5 |
--------------------------------------------------------------------------------
/scripts/latest_total.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | for f in *.csv; do
 4 |   # Output last row with non-zero cumulative number of cases
5 | awk -F , '{if ($5) { print $1, $3, $5; }}' "$f" | tail -1
6 | done | awk 'BEGIN { sum = 0; } { sum += $3; } END { print sum; }'
7 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/Readme.md:
--------------------------------------------------------------------------------
1 | # Metadata
2 |
3 | See:
4 | - https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-more-detailed-dataset
5 | - https://github.com/openZH/covid_19/tree/master#canton-zurich-more-detailed-dataset
6 |
--------------------------------------------------------------------------------
/scripts/transform_all_new2old.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | for f in $DIR/../fallzahlen_kanton_total_csv_v2/*.csv;
6 | do
7 | filename="$(basename "$f")"
8 | $DIR/new2oldcsv.py $f > $DIR/../fallzahlen_kanton_total_csv/$filename
9 | done
10 |
--------------------------------------------------------------------------------
/mappingCanton_BFS.csv:
--------------------------------------------------------------------------------
1 | abk,bfs
2 | ZH,01
3 | BE,02
4 | LU,03
5 | UR,04
6 | SZ,05
7 | OW,06
8 | NW,07
9 | GL,08
10 | ZG,09
11 | FR,10
12 | SO,11
13 | BS,12
14 | BL,13
15 | SH,14
16 | AR,15
17 | AI,16
18 | SG,17
19 | GR,18
20 | AG,19
21 | TG,20
22 | TI,21
23 | VD,22
24 | VS,23
25 | NE,24
26 | GE,25
27 | JU,26
28 | FL,99
29 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AR_alter_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths
2 | 05.03.2020,Canton_AR,50,f,,1,,
3 | 09.03.2020,Canton_AR,,f,,1,,
4 | 12.03.2020,Canton_AR,69,f,,1,,
5 | 12.03.2020,Canton_AR,38,f,,1,,
6 | 12.03.2020,Canton_AR,42,f,,1,,
7 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "covid_19",
3 | "version": "1.0.0",
4 | "repository": "git@github.com:openZH/covid_19.git",
5 | "license": "MIT",
6 | "dependencies": {
7 | "csv-validator": "0.0.3"
8 | },
9 | "scripts": {
10 | "test": "node scripts/validate-csv.js fallzahlen_kanton_total_csv_v2/*.csv"
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/scripts/transform_all_add_columns.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | for f in $DIR/../fallzahlen_kanton_total_csv_v2/*.csv;
6 | do
7 | filename="$(basename "$f")"
8 | $DIR/add_new_columns.py $f > /tmp/columnfile
9 | cat /tmp/columnfile > $DIR/../fallzahlen_kanton_total_csv_v2/$filename
10 | done
11 |
--------------------------------------------------------------------------------
/scripts/transform_all_old2new.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | mkdir -p $DIR/../fallzahlen_kanton_total_csv_v2
6 |
7 | for f in $DIR/../fallzahlen_kanton_total_csv/*.csv;
8 | do
9 | filename="$(basename "$f")"
10 | $DIR/old2newcsv.py $f > $DIR/../fallzahlen_kanton_total_csv_v2/$filename
11 | done
12 |
--------------------------------------------------------------------------------
/scripts/check_for_empty_lines.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | path="$*"
4 | output=$(grep --line-number --with-filename '^\s*$' $path)
5 | grep_exit=$?
6 |
7 | if [ $grep_exit -eq 0 ] ; then
 8 |   echo "× Found empty lines in the following files/line numbers:"
9 | echo $output
10 | exit 1
11 | else
12 | echo "✓ No empty lines found"
13 | exit 0
14 | fi
15 |
16 |
--------------------------------------------------------------------------------
/COVID19_Fallzahlen_Beispiel.csv:
--------------------------------------------------------------------------------
1 | date,time,abbreviation_canton_and_fl,ncumul_tested,ncumul_conf,new_hosp,current_hosp,current_icu,current_vent,ncumul_released,ncumul_deceased,source,current_isolated,current_quarantined,current_quarantined_riskareatravel
2 | 2020-02-27,17:40,AG,10000,1000,10,100,10,10,100,10,https://ag.ch/...,37,88,112
3 | 2020-02-28,11:00,AG,11000,1010,5,80,5,5,120,15,https://ag.ch/...,35,67,132
4 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributors to data collection & cleaning: please check https://github.com/openZH/covid_19/issues for open issues, and use this to flag any problems.
2 |
3 | The best way to get started right now is to join the discussion at https://github.com/openZH/covid_19/discussions?discussions_q=sort%3Atop
4 |
5 | Users of the data: please share links to your projects in https://github.com/openZH/covid_19#community-contributions
6 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Fallzahlen_Kanton_AG_alter_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Date,Area,AgeYear,Gender,NewConfCases,NewPosTests1,NewCured,NewDeaths,source
2 | 01.03.2020,Canton_AG,31,m,1,,,,https://www.ag.ch/de/aktuelles/medienportal/medienmitteilung/medienmitteilungen/mediendetails_138717.jsp
3 | 01.03.2020,Canton_AG,74,f,,,,1,https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/lagebulletins/200305_KFS_Coronavirus_Lagebulletin_5.pdf
4 |
--------------------------------------------------------------------------------
/scrapers/scrape_nw_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def get_nw_page():
10 | url = 'https://www.nw.ch/gesundheitsamtdienste/6044'
11 | content = sc.download(url, silent=True)
12 | content = content.replace(" ", " ")
13 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
14 | soup = BeautifulSoup(content, 'html.parser')
15 | return url, soup
16 |
--------------------------------------------------------------------------------
/scrapers/scrape_gl_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def get_gl_pdf_url():
10 | d = sc.download('https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817', silent=True)
11 | soup = BeautifulSoup(d, 'html.parser')
12 |
13 | # weekly pdf
14 | elem = soup.find(href=re.compile(r'Sentinella.*\.pdf'))
15 | if elem is None:
16 | return None
17 | return elem.get('href')
18 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from bs4 import BeautifulSoup
4 | import re
5 | import scrape_common as sc
6 |
7 |
8 | def get_ag_xls_url():
9 | data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
10 | d = sc.download(data_url, silent=True)
11 | soup = BeautifulSoup(d, 'html.parser')
12 | xls_url = soup.find('a', href=re.compile(r'\.xlsx$'))['href']
13 | if not xls_url.startswith('http'):
14 | xls_url = f'https://www.ag.ch{xls_url}'
15 | return xls_url
16 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_total_csv/README.md:
--------------------------------------------------------------------------------
1 | # Warning: Do not manually update files in this directory
2 |
3 | This directory contains all cantonal files in the "old" structure (before 2020-04-09).
4 | All CSV files in this directory will be **updated automatically** every 15min based on the corresponding file in the "fallzahlen_kanton_total_csv_v2" directory.
5 |
6 | All manual changes to these files will be overwritten.
7 |
8 | # Metadata
9 | See: https://github.com/openZH/covid_19/tree/master#swiss-cantons-and-principality-of-liechtenstein-unified-dataset.
10 |
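11 | # Regenerating locally
12 | The old-format files can be rebuilt from the v2 files with the transform script in `scripts/` (a sketch of the manual invocation; the scheduled automation may use a different entry point):
13 |
14 | ```sh
15 | # Rewrites every file in fallzahlen_kanton_total_csv/ from its counterpart
16 | # in fallzahlen_kanton_total_csv_v2/ using scripts/new2oldcsv.py.
17 | ./scripts/transform_all_new2old.sh
18 | ```
19 |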
--------------------------------------------------------------------------------
/.github/workflows/rebase.yml:
--------------------------------------------------------------------------------
1 | on:
2 | issue_comment:
3 | types: [created]
4 | name: Automatic Rebase
5 | jobs:
6 | rebase:
7 | name: Rebase
8 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/rebase')
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout the latest code
12 | uses: actions/checkout@v3
13 | with:
14 | fetch-depth: 0
15 | - name: Automatic Rebase
16 | uses: cirrus-actions/rebase@1.3.1
17 | env:
18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
19 |
--------------------------------------------------------------------------------
/scrapers/scrape_vd_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import scrape_common as sc
6 |
7 |
8 | def get_weekly_pdf_url():
9 | return get_all_weekly_pdf_urls()[0]
10 |
11 |
12 | def get_all_weekly_pdf_urls():
13 | base_url = 'https://www.infosan.vd.ch'
14 | d = sc.download(base_url, silent=True)
15 |
16 | urls = re.findall(r"window.open\('(.*_epidemio\.pdf)'", d)
17 | result = []
18 | for url in urls:
19 | if not url.startswith('http'):
20 | url = f'{base_url}/{url}'
21 | result.append(url)
22 | return result
23 |
--------------------------------------------------------------------------------
/scrapers/scrape_fl_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 |
5 | url = 'https://www.llv.li/files/as/grafik_covid19_tests_pro_kw.xlsx'
6 | xls = sc.xlsdownload(url, silent=True)
7 | rows = sc.parse_xls(xls, header_row=74, sheet_name='gTests_AG')
8 | year = '2020'
9 | for row in rows:
10 | if row['C'] is None:
11 | # skip the footer line
12 | continue
13 | td = sc.TestData(canton='FL', url=url)
14 | td.week = int(sc.find(r'KW (\d+)', row['C']))
15 | if td.week == 1:
16 | year = '2021'
17 | td.year = year
18 | td.negative_tests = row['Negativ']
19 | td.positive_tests = row['Positiv']
20 | print(td)
21 |
--------------------------------------------------------------------------------
/.github/workflows/lint_python.yml:
--------------------------------------------------------------------------------
1 | name: Tests + Linting Python
2 | on:
3 | pull_request:
4 | push:
5 | branches: [master]
6 | workflow_dispatch: ~
7 | jobs:
8 | lint_python:
9 | runs-on: ubuntu-20.04
10 | timeout-minutes: 10
11 | steps:
12 | - uses: actions/checkout@v3
13 | - name: Set up Python 3.7
14 | uses: actions/setup-python@v4
15 | with:
16 | python-version: 3.7
17 | - run: python -m pip install --upgrade pip
18 | - run: pip install flake8 pytest
19 | - run: pip install -r requirements.txt
20 | - run: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
21 | - run: PYTHONPATH=scrapers pytest
22 |
--------------------------------------------------------------------------------
/scrapers/scrape_zh_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 |
8 | url = 'https://raw.githubusercontent.com/openZH/covid_19/master/fallzahlen_kanton_zh/COVID19_Anteil_positiver_Test_pro_KW.csv'
9 | data = sc.download(url, silent=True)
10 |
11 | reader = csv.DictReader(StringIO(data), delimiter=',')
12 | for row in reader:
13 | td = sc.TestData(canton='ZH', url=url)
14 | td.start_date = row['Woche_von']
15 | td.end_date = row['Woche_bis']
16 | td.week = row['Kalenderwoche']
17 | td.positive_tests = int(row['Anzahl_positiv'])
18 | td.negative_tests = int(row['Anzahl_negativ'])
19 | td.positivity_rate = float(row['Anteil_positiv'])
20 | print(td)
21 |
--------------------------------------------------------------------------------
/scrapers/scrape_be_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
8 |
9 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/vortag_tests.csv'
10 | d = sc.download(csv_url, silent=True)
11 | reader = csv.DictReader(StringIO(d), delimiter=',')
12 | for row in reader:
13 | td = sc.TestData(canton='BE', url=url)
14 | date = sc.date_from_text(row['datum']).isoformat()
15 | td.start_date = date
16 | td.end_date = date
17 | td.total_tests = row['durchgefuehrte_tests']
18 | td.positive_tests = row['positive_tests']
19 | td.positivity_rate = row['positivitaetsrate']
20 | print(td)
21 |
--------------------------------------------------------------------------------
/scripts/merge_canton_csvs.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'csv'
4 |
5 | # get files
6 | files = Dir["fallzahlen_kanton_total_csv_v2/*.csv"]
7 |
8 | # output array
9 | rows = []
10 |
11 | # read headers
12 | header = CSV.read(files.first).first
13 |
14 | # read all csv files
15 | files.each do |fn|
16 | CSV.foreach(fn, headers: true) do |row|
17 | # make sure time is formatted with leading zeroes
18 | if row[1] =~ /(\d{1,2}):(\d{1,2})/
19 | row[1] = sprintf "%02d:%02d", $1.to_i, $2.to_i
20 | end
21 | rows << row[0..14]
22 | end
23 | end
24 |
25 | # sort records by date
26 | rows.sort_by! { |x| "#{x[0]}-#{x[1]}-#{x[2]}" }
27 |
28 |
29 | # output
30 | puts header.to_csv
31 | rows.each{ |row| puts row.to_csv }
32 |
33 |
34 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def get_fr_csv():
10 | main_url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton'
11 | d = sc.download(main_url, silent=True)
12 |
13 | soup = BeautifulSoup(d, 'html.parser')
14 | item = soup.find('a', title=re.compile(r"Statistik .ber die Entwicklungen im Kanton.*"))
15 | csv_url = item.get('href')
16 | assert csv_url, "URL is empty"
17 | if not csv_url.startswith('http'):
18 | csv_url = f'https://www.fr.ch{csv_url}'
19 |
20 | csv = sc.download(csv_url, silent=True)
21 | csv = re.sub(r'(\d+)\'(\d+)', r'\1\2', csv)
22 | return csv_url, csv, main_url
23 |
--------------------------------------------------------------------------------
/scrapers/validate_scraper_output.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # Script to validate the output of a single scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | area="Kanton_${SCRAPER_KEY}"
23 | if [ "$SCRAPER_KEY" = "FL" ] ; then
24 | area="${SCRAPER_KEY}"
25 | fi
26 |
27 | # 1. Validate the result
28 | node $DIR/../scripts/validate-csv.js $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
29 |
30 | # 2. Check for outliers
31 | python $DIR/../scripts/check_for_outliers.py $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
32 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_alter_geschlecht_csv/COVID19_Einwohner_Kanton_ZH_altersklassen_geschlecht.csv:
--------------------------------------------------------------------------------
1 | Year,Area,AgeYearCat,Gender,Inhabitants
2 | 2019,Canton_ZH,0-9,M,82878
3 | 2019,Canton_ZH,0-9,F,78735
4 | 2019,Canton_ZH,10-19,M,72994
5 | 2019,Canton_ZH,10-19,F,68488
6 | 2019,Canton_ZH,100+,M,45
7 | 2019,Canton_ZH,100+,F,200
8 | 2019,Canton_ZH,20-29,M,95172
9 | 2019,Canton_ZH,20-29,F,91194
10 | 2019,Canton_ZH,30-39,M,127998
11 | 2019,Canton_ZH,30-39,F,125184
12 | 2019,Canton_ZH,40-49,M,116400
13 | 2019,Canton_ZH,40-49,F,111604
14 | 2019,Canton_ZH,50-59,M,112667
15 | 2019,Canton_ZH,50-59,F,107919
16 | 2019,Canton_ZH,60-69,M,73383
17 | 2019,Canton_ZH,60-69,F,78006
18 | 2019,Canton_ZH,70-79,M,54372
19 | 2019,Canton_ZH,70-79,F,63877
20 | 2019,Canton_ZH,80-89,M,24989
21 | 2019,Canton_ZH,80-89,F,36988
22 | 2019,Canton_ZH,90-99,M,4020
23 | 2019,Canton_ZH,90-99,F,9293
24 |
--------------------------------------------------------------------------------
/scrapers/scrape_so_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | def strip_value(value):
10 | return value.replace('\'', '')
11 |
12 |
13 | def get_latest_weekly_pdf_url():
14 | return get_all_weekly_pdf_urls()[0]
15 |
16 |
17 | def get_all_weekly_pdf_urls():
18 | base_url = 'https://corona.so.ch'
19 | url = f'{base_url}/bevoelkerung/daten/woechentlicher-situationsbericht/'
20 | d = sc.download(url, silent=True)
21 | soup = BeautifulSoup(d, 'html.parser')
22 | links = soup.find_all(href=re.compile(r'\.pdf$'))
23 | result = []
24 | for link in links:
25 | file_ref = link.get('href')
26 | url = f'{base_url}{file_ref}'
27 | if url not in result:
28 | result.append(url)
29 | return result
30 |
--------------------------------------------------------------------------------
/scripts/update_dates_in_readme.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DIR="$(cd "$(dirname "$0")" && pwd)"
4 |
5 | today=$(date +%s)
6 |
7 | areas="FL AG AI AR BE BL BS FR GE GL GR JU LU NE NW OW SG SH SO SZ TG TI UR VD VS ZG ZH"
8 | for area in $areas
9 | do
10 | update_date_str=`grep $area $DIR/../COVID19_Fallzahlen_CH_total_v2.csv | tail -n 1 | awk -F, '{print $1}'`
11 | update_date=$(date --date="$update_date_str" +%s)
12 | diff=$(($today-$update_date))
13 |
14 | if [ $diff -lt 84000 ]; then
15 | color='4d9221'
16 | elif [ $diff -lt 144000 ]; then
17 | color='b8e186'
18 | else
19 | color='de77ae'
20 | fi
21 | sed -i -e "/\[$area\]/s#update on [^|]*|#update on $update_date_str](https://placehold.jp/$color/000000/200x50.png?text=$update_date_str 'Last update on $update_date_str')|#" $DIR/../README.md
22 | echo "Update README for ${area} (date: ${update_date_str}, color: ${color})"
23 | done
24 |
--------------------------------------------------------------------------------
/scrapers/db_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | import os
5 |
6 |
7 | def get_location():
8 | location = os.path.realpath(
9 | os.path.join(
10 | os.getcwd(),
11 | os.path.dirname(__file__)
12 | )
13 | )
14 | return location
15 |
16 |
17 | def load_csv(filename):
18 | columns = []
19 | with open(filename, 'r') as f:
20 | dr = csv.DictReader(f)
21 | if not columns:
22 | columns = dr.fieldnames
23 | to_db = []
24 | for r in dr:
25 | db_row = []
26 | for col in columns:
27 | db_row.append(r[col])
28 | to_db.append(db_row)
29 | return columns, to_db
30 |
31 |
32 | def insert_db_query(columns):
33 | query = 'INSERT INTO data (\n'
34 | query += ",\n".join(columns)
35 | query += ') VALUES ('
36 | query += ",".join(['?'] * len(columns))
37 | query += ');'
38 | return query
39 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr_tests.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 |
 4 | import csv
 5 | from io import StringIO
 6 |
 7 | import scrape_common as sc
 8 | from scrape_fr_common import get_fr_csv
 9 |
10 | """
11 | csv_url, csv_data, main_url = get_fr_csv()
12 | reader = csv.DictReader(StringIO(csv_data), delimiter=';')
13 |
14 | year = '2020'
15 |
16 | for row in reader:
17 |     week = row['semaine /Woche']
18 |     if not week:
19 |         continue
20 |
21 |     if int(week) == 1:
22 |         year = '2021'
23 |
24 |     td = sc.TestData(canton='FR', url=main_url)
25 |     td.week = int(week)
26 |     td.year = year
27 |     td.pcr_total_tests = int(row['Tests PCR'])
28 |     if row['Taux/Rate PCR']:
29 |         td.pcr_positivity_rate = round(float(row['Taux/Rate PCR']) * 100)
30 |     td.ag_total_tests = int(row['Tests AG'])
31 |     if row['Taux/Rate AG']:
32 |         td.ag_positivity_rate = round(float(row['Taux/Rate AG']) * 100)
33 |     td.total_tests = td.pcr_total_tests + td.ag_total_tests
34 |     print(td)
35 | """
36 |
--------------------------------------------------------------------------------
/scrapers/scrape_nw_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import scrape_common as sc
6 | import scrape_nw_common as snc
7 |
8 | url, soup = snc.get_nw_page()
9 |
10 | td = sc.TestData(canton='NW', url=url)
11 |
12 | item = soup.find(text=re.compile('Anzahl F.lle')).find_parent('p')
13 | assert item, f"Could not find title item in {url}"
14 |
15 | date = sc.find(r'Stand: (\d+\. .* 20\d{2})', item.text)
16 | date = sc.date_from_text(date)
17 | td.start_date = date.isoformat()
18 | td.end_date = date.isoformat()
19 |
20 | rows = item.find_next('table').findChildren('tr')
21 | for row in rows:
22 | cols = row.findChildren('td')
23 | item = cols[0].text
24 | if re.match(r'Covid-19-Tests innert 24h.*', item, re.I):
25 | res = re.match(r'(\d+)\s+(\d+\.?\d?)%', cols[1].text)
26 | if res is not None:
27 | td.total_tests = res[1]
28 | td.positivity_rate = res[2]
29 |
30 | if td:
31 | print(td)
32 |
--------------------------------------------------------------------------------
/scrapers/scrape_zh.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | import re
5 | from io import StringIO
6 | import scrape_common as sc
7 |
8 | url = "https://www.zh.ch/de/gesundheit/coronavirus.html"
9 | csv_url = 'https://raw.githubusercontent.com/openzh/covid_19/master/fallzahlen_kanton_zh/COVID19_Fallzahlen_Kanton_ZH_total.csv'
10 | d_csv = sc.download(csv_url, silent=True)
11 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
12 |
13 | is_first = True
14 | for row in reader:
15 | if not is_first:
16 | print('-' * 10)
17 | is_first = False
18 |
19 | dd = sc.DayData(canton='ZH', url=url)
20 | dd.datetime = f"{row['date']} {row['time']}"
21 | dd.cases = row['ncumul_conf']
22 | dd.deaths = row['ncumul_deceased']
23 | dd.hospitalized = row['current_hosp']
24 | dd.vent = row['current_vent']
25 | dd.icu = row['current_icu']
26 | dd.isolated = row['current_isolated']
27 | dd.quarantined = row['current_quarantined']
28 | print(dd)
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/scrapers/scrape_tg_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import requests
6 | import scrape_common as sc
7 |
8 | # perma link to TG COVID dataset on opendata.swiss
9 | r = requests.get(
10 | 'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier',
11 |     params={'identifier': 'dfs-ga-3@kanton-thurgau'}
12 | )
13 | dataset = r.json()['result']
14 | resource = next(r for r in dataset['resources'] if r['mimetype'] == 'text/csv')
15 |
16 | assert resource['download_url'], "Download URL not found"
17 |
18 | d_csv = sc.download(resource['download_url'], silent=True, encoding='latin1')
19 |
20 | reader = csv.DictReader(StringIO(d_csv), delimiter=';')
21 | for row in reader:
22 | dd = sc.DistrictData(canton='TG')
23 | dd.district_id = row['districtid']
24 | dd.district = row['district']
25 | dd.population = row['population']
26 | dd.week = row['week']
27 | dd.year = row['year']
28 | dd.new_cases = row['newconfcases']
29 | dd.url = resource['download_url']
30 | print(dd)
31 |
--------------------------------------------------------------------------------
/scrapers/scrape_ti_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import re
6 | import scrape_common as sc
7 |
8 | main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/'
9 | d = sc.download(main_url, silent=True)
10 | soup = BeautifulSoup(d, 'html.parser')
11 |
12 | td = sc.TestData(canton='TI', url=main_url)
13 |
14 | container = soup.find('h2', string=re.compile(r'Test PCR')).find_next('div')
15 | for item in container.find_all('div'):
16 | divs = item.find_all('div')
17 | if len(divs) == 3:
18 | if divs[2].string:
19 | date = sc.find(r'.*?(\d+\.\d+\.\d{2})', divs[2].string)
20 | date = sc.date_from_text(date)
21 | td.start_date = date.isoformat()
22 | td.end_date = date.isoformat()
23 | if sc.find(r'^(Totale test).*', divs[1].string):
24 | td.total_tests = divs[0].string
25 | if sc.find(r'^(% test).*', divs[1].string):
26 | td.positivity_rate = divs[0].string
27 |
28 | if td:
29 | assert td.start_date and td.end_date, 'failed to extract date'
30 | print(td)
31 |
--------------------------------------------------------------------------------
/scrapers/scrape_zg_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import collections
4 | import csv
5 | import datetime
6 | from io import StringIO
7 | import scrape_common as sc
8 |
9 |
10 | csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv'
11 | d_csv = sc.download(csv_url, silent=True)
12 | """
13 | "Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content"
14 | 2020-05-25,"männlich","151",NA,NA,NA
15 | 2020-06-01,"männlich","117",NA,NA,NA
16 | """
17 |
18 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
19 | data = collections.defaultdict(dict)
20 | for row in reader:
21 | if row['Woche'] == 'NA':
22 | continue
23 | date = sc.date_from_text(row['Woche'])
24 | if date not in data:
25 | data[date] = 0
26 | data[date] += int(row['Anzahl Fälle'])
27 |
28 | days = list(data.keys())
29 | for day in days:
30 | td = sc.TestData(canton='ZG', url=csv_url)
31 | td.start_date = day.isoformat()
32 | td.end_date = (day + datetime.timedelta(days=6)).isoformat()
33 | td.total_tests = data[day]
34 | print(td)
35 |
--------------------------------------------------------------------------------
/scrapers/scrape_sh_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 | import scrape_sh_common as shc
9 |
10 | main_url, xls = shc.get_sh_xlsx()
11 |
12 | rows = sc.parse_xls(xls, sheet_name='Datensatz_Tests', header_row=0)
13 | for row in rows:
14 | if not (row['Jahr'] or row['Kalenderwoche']):
15 | continue
16 |
17 | td = sc.TestData(canton='SH', url=main_url)
18 | td.year = row['Jahr']
19 | td.week = row['Kalenderwoche']
20 |
21 | td.pcr_total_tests = 0
22 | pcr_cols = ['Tests KAZ', 'Tests Apotheken', 'Tests KSSH', 'Test Praxen']
23 | for col in pcr_cols:
24 | if sc.represents_int(row[col]):
25 | td.pcr_total_tests += row[col]
26 |
27 | td.ag_total_tests = 0
28 | ag_cols = ['Schnelltests KAZ', 'Schnelltests Apotheken', 'Schnelltests KSSH', 'Schnelltest Praxen']
29 | for col in ag_cols:
30 | if sc.represents_int(row[col]):
31 | td.ag_total_tests += row[col]
32 | td.total_tests = td.pcr_total_tests + td.ag_total_tests
33 | print(td)
34 |
--------------------------------------------------------------------------------
/scrapers/scrape_gl_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 | import scrape_gl_common as sgc
8 |
9 | pdf_url = sgc.get_gl_pdf_url()
10 | if pdf_url is not None:
11 | pdf = sc.download_content(pdf_url, silent=True)
12 | content = sc.pdftotext(pdf, page=1, layout=True)
13 | # remove 1k separators
14 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content)
15 |
16 | year = sc.find(r'Stand: \d{2}\.\d{2}.(\d{4})', content)
17 | week = sc.find(r'KW(\d+)\.pdf', pdf_url)
18 |
19 | # Insgesamt Anzahl, 100k, 14 Tage Anzahl, 100k, 7 Tage Anzahl, 100k
20 | number_of_tests = sc.find(r'PCR-Tests/Schnelltests\sKanton Glarus\s+\d+\s+\d+\.?\d+?\s+\d+\s+\d+\.?\d+?\s+(\d+)\s+\d+', content)
21 | # Insgesamt, 14 Tage, 7 Tage
22 | positivity_rate = sc.find(r'Positivit.tsrate GL\s?\*+?\s+\d+\.\d%\s+\d+\.\d%\s+(\d+\.\d)%\s+', content)
23 |
24 | td = sc.TestData(canton='GL', url=pdf_url)
25 | td.week = week
26 | td.year = year
27 | td.total_tests = number_of_tests
28 | td.positivity_rate = positivity_rate
29 | print(td)
30 |
--------------------------------------------------------------------------------
/scrapers/scrape_nw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 | import scrape_nw_common as snc
8 |
9 | is_first = True
10 | xls_url = 'http://www.nw.ch/coronastatistik'
11 | xls = sc.xlsdownload(xls_url, silent=True)
12 | rows = sc.parse_xls(xls, header_row=2)
13 | for row in rows:
14 | dd = sc.DayData(canton='NW', url=xls_url)
15 | dd.datetime = row['A'].date().isoformat()
16 | dd.cases = row['Positiv getestete Personen (kumuliert)']
17 | dd.icu = row['Davon auf der Intensivstation']
18 |
19 | try:
20 | dd.hospitalized = row['Aktuell hospitalisierte Personen']
21 | except KeyError:
22 | dd.hospitalized = row['Hospitalisierte Personen']
23 |
24 | try:
25 | dd.deaths = row['Personen verstorben']
26 | except KeyError:
27 | dd.deaths = row['Verstorbene Personen']
28 |
29 | # skip empty rows
30 | if dd.cases is None and dd.icu is None and dd.hospitalized is None and dd.deaths is None:
31 | continue
32 |
33 | if not is_first:
34 | print('-' * 10)
35 | is_first = False
36 | print(dd)
37 |
--------------------------------------------------------------------------------
/scripts/validate-schema.js:
--------------------------------------------------------------------------------
1 | const csval = require("csval");
2 | const fs = require("fs").promises;
3 | const path = require("path");
4 |
5 | const DIR = path.resolve(process.argv[2] || process.cwd());
6 |
7 | const validateSequentially = async csvFiles => {
8 | const rules = await csval.readRules(path.join(DIR, "schema.json"));
9 |
10 | let failedChecks = 0;
11 |
12 | for (let csvFile of csvFiles) {
13 | const csv = await csval.readCsv(path.join(DIR, csvFile));
14 | const parsed = await csval.parseCsv(csv);
15 | let valid = false;
16 | try {
17 | valid = await csval.validate(parsed, rules);
18 | } catch (e) {
19 | failedChecks++;
20 | console.log(`× ${csvFile} failed the following checks:${e.message}\n`);
21 | }
22 | if (valid) {
23 | console.log(`✓ ${csvFile} is valid.`);
24 | }
25 | }
26 |
27 | return failedChecks;
28 | };
29 |
30 | const run = async () => {
31 | const csvFiles = (await fs.readdir(DIR)).filter(f => f.match(/\.csv$/));
32 | const failedChecks = await validateSequentially(csvFiles);
33 |
34 | if (failedChecks > 0) {
35 | process.exit(1);
36 | }
37 | };
38 |
39 | run().catch(e => console.error(e));
40 |
--------------------------------------------------------------------------------
/scrapers/run_district_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run a single district scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | # 1. populate the database with the current CSV
23 | echo "Populating database from CSV fallzahlen_kanton_${SCRAPER_KEY}_bezirk..."
24 | $DIR/populate_district_database.py $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv
25 |
26 | # 2. run the scraper, update the db
27 | echo "Run the district scraper..."
28 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}_districts.py"
29 | $scrape_script | $DIR/add_district_db_entry.py
30 |
31 | # 3. Export the database as csv
32 | echo "Export database to CSV..."
33 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by DistrictId, District, Canton, Date, Year, Week+0 asc;" > $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv
34 | sed -i 's/""//g' $DIR/../fallzahlen_bezirke/fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv
35 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import datetime
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | import scrape_common as sc
9 |
10 |
11 | def get_vs_latest_weekly_pdf_url():
12 | pdfs = get_vs_weekly_pdf_urls()
13 | assert pdfs, "Could not find weekly PDFs"
14 | return pdfs[0]
15 |
16 |
17 | def get_vs_weekly_pdf_urls():
18 | base_url = 'https://www.vs.ch'
19 | url = base_url + '/de/web/coronavirus/statistiques-hebdomadaires'
20 | content = sc.download(url, silent=True)
21 | soup = BeautifulSoup(content, 'html.parser')
22 | links = soup.find_all(href=re.compile(r'Synthese.*Woche'))
23 | result = []
24 | for link in links:
25 | url = base_url + link['href'].replace(' ', '%20')
26 | result.append(url)
27 | return result
28 |
29 |
30 | def get_vs_weekly_general_data(pdf):
31 | content = sc.pdftotext(pdf, page=1)
32 | week = int(sc.find(r'Epidemiologische Situation Woche (\d+)', content))
33 | end_date = sc.find(r'bis\s+(\d+\.\d+\.\d{4})', content)
34 | end_date = sc.date_from_text(end_date)
35 | start_date = end_date - datetime.timedelta(days=7)
36 | year = start_date.year
37 | return week, year
38 |
--------------------------------------------------------------------------------
/scrapers/test/test_district_data.py:
--------------------------------------------------------------------------------
1 | from scrapers.scrape_common import DistrictData
2 |
3 | def test_district_data():
4 | dd = DistrictData()
5 | dd.date = '1'
6 | dd.week = 2
7 | dd.year = 3
8 | dd.canton = '4'
9 | dd.district = '5'
10 | dd.district_id = 6
11 | dd.population = 7
12 | dd.total_cases = 8
13 | dd.new_cases = 9
14 | dd.total_deceased = 10
15 | dd.new_deceased = 11
16 | dd.url = '12'
17 |
18 | string = str(dd)
19 |
20 | dd_parsed = DistrictData()
21 | assert dd_parsed.parse(string)
22 | assert dd.date == dd_parsed.date
23 | assert dd.week == dd_parsed.week
24 | assert dd.year == dd_parsed.year
25 | assert dd.canton == dd_parsed.canton
26 | assert dd.district == dd_parsed.district
27 | assert dd.district_id == dd_parsed.district_id
28 | assert dd.population == dd_parsed.population
29 | assert dd.total_cases == dd_parsed.total_cases
30 | assert dd.new_cases == dd_parsed.new_cases
31 | assert dd.total_deceased == dd_parsed.total_deceased
32 | assert dd.new_deceased == dd_parsed.new_deceased
33 | assert dd.url == dd_parsed.url
34 |
35 |
36 | if __name__ == "__main__":
37 | test_district_data()
38 |
--------------------------------------------------------------------------------
/scrapers/run_tests_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run a single tests scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | area="kanton_${SCRAPER_KEY}"
23 | if [ "$SCRAPER_KEY" = "FL" ] ; then
24 | area="${SCRAPER_KEY}"
25 | fi
26 |
27 | # 1. populate the database with the current CSV
28 | echo "Populating database from CSV fallzahlen_${area}_tests..."
29 | $DIR/populate_tests_database.py $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv
30 |
31 | # 2. run the scraper, update the db
32 | echo "Run the tests scraper..."
33 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}_tests.py"
34 | $scrape_script | $DIR/add_tests_db_entry.py
35 |
36 | # 3. Export the database as csv
37 | echo "Export database to CSV..."
38 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by canton, start_date, end_date, year, week+0 asc;" > $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv
39 | sed -i 's/""//g' $DIR/../fallzahlen_tests/fallzahlen_${area}_tests.csv
40 |
--------------------------------------------------------------------------------
/scrapers/meta_scrape.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Scrapers are expected to output data on standard output in the following
4 | # format:
5 | #
6 | # GR
7 | # Scraped at: 2020-03-21T19:22:10+01:00
8 | # Date and time: 20.03.2020
9 | # Confirmed cases: 213
10 | # Deaths: 3
11 | #
12 | # Abbreviation of the canton first.
13 | #
14 | # Then scraped timestamp. Current time in ISO-8601 format. Implicitly in Swiss
15 | # timezone (TZ=Europe/Zurich), CET, or CEST.
16 | #
17 | # The information about when the data was published / gathered.
18 | # The date and time, or just time, can be omitted if not available.
19 | # Any date / time format is ok; the more accurate, the better. It is advised to strip
20 | # the name of the weekday. Add time parser to the parse_scrape_output.py script
21 | # if needed.
22 | #
23 | # Number of cases.
24 | #
25 | # Number of deaths can be omitted, if not available.
26 |
27 | for s in ./scrape_??.py;
28 | do
29 | L=$(./$s | ./parse_scrape_output.py)
30 | if ! echo "${L}" | egrep ' (OK|FAILED)' >/dev/null; then
31 | a=$(echo "$s" | sed -E -e 's/^.*scrape_(..)\..*$/\1/' | tr a-z A-Z) # ' # To make my editor happy.
32 | echo "$a" - - - FAILED "$(date --iso-8601=seconds)"
33 | else
34 | echo "${L}"
35 | fi
36 | done
37 |
--------------------------------------------------------------------------------
/scrapers/scrape_sg_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 |
8 | url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist_729873930/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Tests_download.csv'
9 | data = sc.download(url, silent=True)
10 |
11 | # strip the "header" / description lines
12 | data = "\n".join(data.split("\n")[9:])
13 |
14 | reader = csv.DictReader(StringIO(data), delimiter=';')
15 | for row in reader:
16 | td = sc.TestData(canton='SG', url=url)
17 | td.start_date = row['Datum']
18 | td.end_date = row['Datum']
19 | td.pcr_positive_tests = row['Positiv (PCR)']
20 | td.pcr_negative_tests = row['Negativ (PCR)']
21 | td.ag_positive_tests = row['Positiv (Schnelltest)']
22 | td.ag_negative_tests = row['Negativ (Schnelltest)']
23 | td.positive_tests = row['Total positive Tests']
24 | td.negative_tests = row['Total negative Tests']
25 | td.total_tests = row['Total Tests']
26 | if row['Positiv in % vom Total']:
27 | td.positivity_rate = float(row['Positiv in % vom Total']) * 100
28 | td.positivity_rate = round(10 * td.positivity_rate) / 10
29 | print(td)
30 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 |
5 | import scrape_common as sc
6 | import scrape_vs_common as svc
7 |
8 |
9 | # get all PDFs
10 | for url in svc.get_vs_weekly_pdf_urls():
11 | td = sc.TestData(canton='VS', url=url)
12 |
13 | pdf = sc.download_content(url, silent=True)
14 | td.week, td.year = svc.get_vs_weekly_general_data(pdf)
15 |
16 | for page in range(4, 6):
17 | content = sc.pdftotext(pdf, page=page, raw=True)
18 | content = re.sub(r'(\d)\‘(\d)', r'\1\2', content)
19 | content = re.sub(r'(\d)\’(\d)', r'\1\2', content)
20 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content)
21 |
22 | td.total_tests = sc.find(r'Alle\s+Arten\s+von\s+Tests\s+(\d+)', content)
23 | td.positivity_rate = sc.find(r'Alle\s+Arten\s+von\s+Tests\s+\d+\s+(\d+\.\d+)%', content)
24 | td.pcr_total_tests = sc.find(r'PCR\s+(\d+)', content)
25 | td.pcr_positivity_rate = sc.find(r'PCR\s+\d+\s+(\d+\.\d+)%', content)
26 | td.ag_total_tests = sc.find(r'Antigentests\s+(\d+)', content)
27 | td.ag_positivity_rate = sc.find(r'Antigentests\s+\d+\s+(\d+\.\d+)%', content)
28 |
29 | if not td.total_tests:
30 | continue
31 |
32 | print(td)
33 |
--------------------------------------------------------------------------------
/scrapers/test_tests_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # Script to run all tests scrapers
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 | NEWLINE=$'\n'
15 |
16 | echo "Run all tests scrapers..."
17 |
18 | exit_code=0
19 | errors=''
20 | for scrape_script in $DIR/scrape_??_tests.py
21 | do
22 | if [ -f $scrape_script -a -x $scrape_script ]
23 | then
24 | name=`basename $scrape_script`
25 | canton=${name:7:2}
26 | export SCRAPER_KEY=${canton^^}
27 | echo ""
28 | echo "Running ${SCRAPER_KEY} tests scraper..."
29 | echo "=========================================="
30 |
31 | set +e
32 | $DIR/run_tests_scraper.sh
33 | ret=$?
34 | if [ $ret -ne 0 ]
35 | then
36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2
37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret"
38 | exit_code=1
39 | fi
40 | set -e
41 |
42 | echo "=========================================="
43 | echo ""
44 | fi
45 | done
46 |
47 |
48 | echo "$errors"
49 | exit $exit_code
50 |
--------------------------------------------------------------------------------
/fallzahlen_kanton_zh/COVID19_VOC_Kanton_ZH.csv:
--------------------------------------------------------------------------------
1 | date,new_pcr_pos,new_voc
2 | 2021-02-10,168,35
3 | 2021-02-09,247,54
4 | 2021-02-08,134,44
5 | 2021-02-07,82,29
6 | 2021-02-06,188,62
7 | 2021-02-05,194,41
8 | 2021-02-04,209,38
9 | 2021-02-03,215,43
10 | 2021-02-02,272,67
11 | 2021-02-01,143,37
12 | 2021-01-31,65,12
13 | 2021-01-30,193,34
14 | 2021-01-29,208,32
15 | 2021-01-28,287,34
16 | 2021-01-27,273,32
17 | 2021-01-26,316,41
18 | 2021-01-25,152,25
19 | 2021-01-24,115,16
20 | 2021-01-23,245,18
21 | 2021-01-22,390,23
22 | 2021-01-21,197,17
23 | 2021-01-20,301,14
24 | 2021-01-19,336,10
25 | 2021-01-18,217,6
26 | 2021-01-17,103,5
27 | 2021-01-16,251,8
28 | 2021-01-15,277,10
29 | 2021-01-14,273,5
30 | 2021-01-13,352,4
31 | 2021-01-12,392,8
32 | 2021-01-11,291,3
33 | 2021-01-10,163,0
34 | 2021-01-09,347,0
35 | 2021-01-08,446,6
36 | 2021-01-07,449,2
37 | 2021-01-06,616,4
38 | 2021-01-05,658,6
39 | 2021-01-04,494,2
40 | 2021-01-03,280,1
41 | 2021-01-02,388,2
42 | 2021-01-01,204,0
43 | 2020-12-31,638,0
44 | 2020-12-30,595,2
45 | 2020-12-29,731,4
46 | 2020-12-28,368,1
47 | 2020-12-27,284,0
48 | 2020-12-26,429,2
49 | 2020-12-25,229,0
50 | 2020-12-24,793,0
51 | 2020-12-23,855,1
52 | 2020-12-22,736,0
53 | 2020-12-21,414,1
54 | 2020-12-20,312,0
55 | 2020-12-19,494,2
56 | 2020-12-18,723,0
57 |
--------------------------------------------------------------------------------
/scrapers/scrape_tg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import requests
6 | import scrape_common as sc
7 |
8 | # perma link to TG COVID dataset on opendata.swiss
9 | r = requests.get(
10 | 'https://opendata.swiss/api/3/action/ogdch_dataset_by_identifier',
11 | params={'identifier': 'dfs-ga-1@kanton-thurgau'}
12 | )
13 | dataset = r.json()['result']
14 | resource = next(r for r in dataset['resources'] if r['mimetype'] == 'text/csv')
15 |
16 | assert resource['download_url'], "Download URL not found"
17 |
18 | d_csv = sc.download(resource['download_url'], silent=True)
19 |
20 | reader = csv.DictReader(StringIO(d_csv), delimiter=';')
21 | is_first = True
22 | for row in reader:
23 | if not row['date']:
24 | continue
25 | if not is_first:
26 | print('-' * 10)
27 | is_first = False
28 | dd = sc.DayData(canton='TG', url=row['source'])
29 | dd.datetime = f"{row['date']} {row['time']}"
30 | dd.cases = row['ncumul_conf']
31 | dd.deaths = row['ncumul_deceased']
32 | dd.hospitalized = row['current_hosp']
33 | dd.new_hosp = row['new_hosp']
34 | dd.recovered = row['ncumul_released']
35 | dd.icu = row['current_icu']
36 | dd.isolated = row['num_isolated']
37 | print(dd)
38 |
--------------------------------------------------------------------------------
/scrapers/run_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run a single scraper
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 |
15 |
16 | # SCRAPER_KEY must be set
17 | if [ -z $SCRAPER_KEY ] ; then
18 | echo "SCRAPER_KEY env variable must be set";
19 | exit 1
20 | fi
21 |
22 | area="Kanton_${SCRAPER_KEY}"
23 | if [ "$SCRAPER_KEY" = "FL" ] ; then
24 | area="${SCRAPER_KEY}"
25 | fi
26 |
27 | # 1. populate the database with the current CSV
28 | echo "Populating database from CSV COVID19_Fallzahlen_${area}_total.csv..."
29 | $DIR/populate_database.py $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
30 |
31 | # 2. run the scraper, update the db
32 | echo "Run the scraper..."
33 | scrape_script="${DIR}/scrape_${SCRAPER_KEY,,}.py"
34 | $scrape_script | $DIR/parse_scrape_output.py | $DIR/add_db_entry.py
35 |
36 | # 3. Export the database as csv
37 | echo "Export database to CSV..."
38 | sqlite3 -header -csv $DIR/data.sqlite "select * from data order by date asc;" > $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
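   | # sqlite3's CSV export quotes empty values as ""; strip those so empty fields stay empty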
39 | sed -i 's/""//g' $DIR/../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_${area}_total.csv
40 |
--------------------------------------------------------------------------------
/scrapers/test_district_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to test-run all district scrapers
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 | NEWLINE=$'\n'
15 |
16 | echo "Run all district scrapers..."
17 |
18 | exit_code=0
19 | errors=''
20 | for scrape_script in $DIR/scrape_??_districts.py
21 | do
22 | if [ -f $scrape_script -a -x $scrape_script ]
23 | then
24 | name=`basename $scrape_script`
25 | canton=${name:7:2}
26 | export SCRAPER_KEY=${canton^^}
27 | echo ""
28 | echo "Running ${SCRAPER_KEY} district scraper..."
29 | echo "=========================================="
30 |
31 | set +e
32 | $DIR/run_district_scraper.sh
33 | ret=$?
34 | if [ $ret -ne 0 ]
35 | then
36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2
37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret"
38 | exit_code=1
39 | fi
40 | set -e
41 |
42 | echo "=========================================="
43 | echo ""
44 | fi
45 | done
46 |
47 | echo "$errors"
48 | exit $exit_code
49 |
--------------------------------------------------------------------------------
/scrapers/scrape_ne.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | import datetime
6 | import scrape_common as sc
7 |
8 | xls_url = 'https://www.ne.ch/autorites/DFS/SCSP/medecin-cantonal/maladies-vaccinations/Documents/Covid-19-Statistiques/COVID19_PublicationInternet.xlsx'
9 | xls = sc.xlsdownload(xls_url, silent=True)
10 | rows = sc.parse_xls(xls)
11 | is_first = True
12 | for row in rows[:3000]:
13 | if row['A'] is None:
14 | continue
15 | if not isinstance(row['A'], datetime.datetime):
16 | print(f"WARNING: {row['A']} is not a valid date, skipping.", file=sys.stderr)
17 | continue
18 |
19 | if not is_first:
20 | print('-' * 10)
21 | is_first = False
22 |
23 | dd = sc.DayData(canton='NE', url=xls_url)
24 | dd.datetime = row['A'].date().isoformat()
25 | dd.cases = row['Cumul']
26 | dd.hospitalized = row['Total des cas hospitalisés']
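   |     # ICU total = intubated + non-intubated intensive-care patients; only the intubated count as ventilated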
27 | if row['Soins intensifs (intubés)'] is not None and row['Soins intensifs (non intubés)'] is not None:
28 | ICU = row['Soins intensifs (intubés)']
29 | ICU2 = row['Soins intensifs (non intubés)']
30 | dd.icu = int(ICU)+int(ICU2)
31 | dd.vent = row['Soins intensifs (intubés)']
32 | dd.deaths = row['Cumul des décès']
33 | print(dd)
34 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 | import scrape_ag_common as sac
5 |
6 |
7 | def get_value_int(value):
8 | if value is not None and value != '':
9 | return int(value)
10 | return None
11 |
12 |
13 | def get_value_float(value):
14 | if value is not None and value != '':
15 | return float(value)
16 | return None
17 |
18 |
19 | xls_url = sac.get_ag_xls_url()
20 | xls = sc.xlsdownload(xls_url, silent=True)
21 |
22 | year = '2020'
23 | rows = sc.parse_xls(xls, sheet_name='1.4 Labortests', header_row=1, enable_float=True)
24 | for row in rows:
25 | if not row['Anzahl Tests']:
26 | continue
27 | if row['Anzahl Tests'] == 'Anzahl Tests':
28 | break
29 |
30 | td = sc.TestData(canton='AG', url=xls_url)
31 | td.week = int(row['Kalenderwoche'])
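   |     # the year is tracked manually: it starts at 2020 and switches to 2021 once calendar week 1 appears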
32 | if td.week == 1:
33 | year = '2021'
34 | td.year = year
35 | td.positive_tests = get_value_int(row['Positive Tests'])
36 | td.negative_tests = get_value_int(row['Negative Tests'])
37 | td.total_tests = int(row['Anzahl Tests'])
38 | td.positivity_rate = get_value_float(row['Positivitätsrate'])
39 | td.pcr_positivity_rate = get_value_float(row['F'])
40 | td.ag_positivity_rate = get_value_float(row['G'])
41 | if td:
42 | print(td)
43 |
--------------------------------------------------------------------------------
/scrapers/scrape_ju_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 | base_url = 'https://www.jura.ch'
9 | url = f'{base_url}/fr/Autorites/Coronavirus/Infos-Actualite/Statistiques-COVID/Evolution-des-cas-COVID-19-dans-le-Jura.html'
10 | d = sc.download(url, silent=True)
11 | d = d.replace(' ', ' ')
12 | soup = BeautifulSoup(d, 'html.parser')
13 |
14 | pdf_url = soup.find('a', title=re.compile(r'Situation.*PDF.*')).get('href')
15 | if not pdf_url.startswith('http'):
16 | pdf_url = f'{base_url}{pdf_url}'
17 | pdf_url = pdf_url.replace('?download=1', '')
18 |
19 | pdf = sc.download_content(pdf_url, silent=True)
20 |
21 | td = sc.TestData(canton='JU', url=pdf_url)
22 |
23 | content = sc.pdftotext(pdf, page=1)
24 | td.week = sc.find(r'Situation semaine épidémiologique (\d+)', content)
25 | td.year = sc.find(r'Du \d+.* (\d{4})', content)
26 |
27 | content = sc.pdftotext(pdf, page=2)
28 | td.total_tests = sc.find(r'Nombre de tests\d?\s+(\d+)', content)
29 | res = re.match(r'.*Nombre de tests positifs .*\s+(\d+)\s+\((\d+\.?\d?)%\s?\d?\)', content, re.DOTALL | re.MULTILINE)
30 | assert res, 'failed to find number of positive tests and positivity rate'
31 | td.positive_tests = res[1]
32 | td.positivity_rate = res[2]
33 |
34 | print(td)
35 |
--------------------------------------------------------------------------------
/scripts/latest_per_canton.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | echo "| Canton | Confirmed cases | Deceased | Last update |"
4 | echo "|:------:| ---------------:| --------:|:---------------------- |"
5 | # | BL | 282 | 0 | 2020-03-21 |
6 |
7 | # PER CANTON / FL
8 |
9 | # 1 2 3 4 5 6 7 8 9 10
10 | # date,time,abbreviation_canton_and_fl,ncumul_tested,ncumul_conf,ncumul_hosp,ncumul_ICU,ncumul_vent,ncumul_released,ncumul_deceased,source
11 |
12 | for f in *.csv; do
13 |   # Output the latest row with a non-zero cumulative number of cases (and deaths). Then sort by number of cases, and print the date.
14 | awk -F , '{if ($5) { printf("| %2s | %15d | %8d | %-21s |\n", $3, $5, $10, $2 != "\"\"" ? $1 "T" $2 : $1); }}' "$f" | tail -1
15 | done | sort -r -n -k 4
16 |
17 | # TOTAL
18 |
19 | DATE=$(TZ="Europe/Zurich" date --iso-8601=minutes)
20 |
21 | for f in *.csv; do
22 |   # Output the last row with a non-zero cumulative number of cases (and deaths)
23 | awk -F , '{if ($5) { print $1, $3, $5, $10; }}' "$f" | tail -1
24 |   # Then do the sums.
25 | done | awk "BEGIN { sum_cases = 0; sum_deceased = 0; } { sum_cases += \$3; sum_deceased += \$4; } END { printf(\"| TOTAL | %15d | %8d | %-22s |\n\", sum_cases, sum_deceased, \"${DATE}\"); }"
26 |
--------------------------------------------------------------------------------
/.github/workflows/test_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Test run of scrapers
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'scrapers/**'
8 | - '!scrapers/*districts*'
9 | - '!scrapers/*tests*'
10 | - '.github/workflows/**'
11 | pull_request:
12 | branches: [ master ]
13 | paths:
14 | - 'scrapers/**'
15 | - '!scrapers/*districts*'
16 | - '!scrapers/*tests*'
17 | - '.github/workflows/**'
18 | workflow_dispatch: ~
19 |
20 | jobs:
21 | test_run:
22 | runs-on: ubuntu-20.04
23 | timeout-minutes: 10
24 |
25 | steps:
26 | - uses: actions/checkout@v3
27 |
28 | - name: Set up Python 3.7
29 | uses: actions/setup-python@v4
30 | with:
31 | python-version: 3.7
32 |
33 | - name: Remove broken apt repos
34 | run: |
35 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
36 |
37 | - name: Install dependencies
38 | run: |
39 | npm ci
40 | python -m pip install --upgrade pip setuptools wheel
41 | pip install -r requirements.txt
42 | sudo apt update || true # do not fail if update does not work
43 | sudo apt-get install poppler-utils
44 | sudo apt-get install chromium-browser
45 |
46 | - name: Test run of all scrapers
47 | run: ./scrapers/test_scraper.sh
48 |
49 |
--------------------------------------------------------------------------------
/scrapers/scrape_ai.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import scrape_common as sc
5 |
6 | url = 'https://www.ai.ch/themen/gesundheit-alter-und-soziales/gesundheitsfoerderung-und-praevention/uebertragbare-krankheiten/coronavirus'
7 | d = sc.download(url, silent=True)
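   | # drop apostrophe-style thousands separators (e.g. 1'234 -> 1234) before matching numbers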
8 | d = re.sub(r'(\d+)\'(\d+)', r'\1\2', d)
9 |
10 | """
11 | no separate date for hospitalizations on 2020-11-19
12 | # Hospitalisations
13 | dd_hosp = sc.DayData(canton='AI', url=url)
14 | dd_hosp.datetime = sc.find('>.*Hospitalisationen\s+\(Stand\s+(.*\d{4})\)', d)
15 | dd_hosp.hospitalized = sc.find('.*?([0-9]+)\s*Hospitalisationen.*<\/li>', d)
16 | print(dd_hosp)
17 | print('-' * 10)
18 | """
19 |
20 | # cases
21 | dd = sc.DayData(canton='AI', url=url)
22 | dd.datetime = sc.find('>.*Stand (.+ Uhr).*', d)
23 | dd.cases = sc.find('.*?([0-9]+)\s*(infizierte Person(en)?|(labor)?bestätigte Fälle).*<\/li>', d)
24 | dd.deaths = sc.find('.*?([0-9]+)\s*Todesf.+?lle.*<\/li>', d)
25 | dd.isolated = sc.find('.*?([0-9]+)\s*Personen\s+in\s*Isolation.*<\/li>', d)
26 | dd.quarantined = sc.find('.*?([0-9]+)\+?\s*enge\s+Kontaktpersonen\s+in\s+Quarant.ne.*<\/li>', d)
27 | dd.quarantine_riskareatravel = sc.find('.*?([0-9]+)\+?\s*Personen\s+in\s*Quarant.+ne.*Einreise\s+Risikoland.*<\/li>', d)
28 | dd.hospitalized = sc.find(r'.*?([0-9]+)\s*Person\sim\sSpital.*<\/li>', d)
29 | print(dd)
30 |
--------------------------------------------------------------------------------
/scrapers/scrape_bl_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | from bs4 import BeautifulSoup
6 | import re
7 | import scrape_common as sc
8 |
9 |
10 | def get_latest_bl_bulletin_url():
11 | return get_all_bl_bulletin_urls()[0]
12 |
13 |
14 | def get_all_bl_bulletin_urls():
15 | news_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/medienmitteilungen-1'
16 | news_content = sc.download(news_url, silent=True)
17 | soup = BeautifulSoup(news_content, 'html.parser')
18 |
19 | bulletins = soup.find_all('a', href=re.compile(r'.*/coronavirus-wochenbulletin.*'))
20 | bulletin_urls = []
21 | for bulletin in bulletins:
22 | bulletin_urls.append(bulletin.get('href'))
23 | return bulletin_urls
24 |
25 |
26 | def strip_bl_bulletin_numbers(content):
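   |     # drop thousands separators written as ’ or ' (e.g. 12’345 -> 12345)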
27 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
28 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
29 | return content
30 |
31 |
32 | def parse_bl_date(s):
33 | row_date = s.replace('-', '.')
34 |     row_date = row_date.replace('/', '.')
35 | parts = row_date.split('.')
36 | s_date = datetime.datetime(day=int(parts[0]), month=int(parts[1]), year=int(parts[2]))
37 | key = s_date.date().isoformat()
38 | return (key, row_date)
39 |
--------------------------------------------------------------------------------
/correction_status.csv:
--------------------------------------------------------------------------------
1 | date,abbreviation_canton_and_fl,column
2 | 2020-12-25,FL,ncumul_conf
3 | 2021-06-08,ZG,ncumul_released
4 | 2021-06-22,NW,ncumul_conf
5 | 2021-06-29,BS,ncumul_conf
6 | 2021-06-27,ZG,ncumul_released
7 | 2021-06-30,NW,ncumul_conf
8 | 2021-07-02,ZG,ncumul_released
9 | 2021-07-05,ZG,ncumul_released
10 | 2021-07-05,BS,ncumul_released
11 | 2021-07-08,BS,ncumul_released
12 | 2021-07-14,BS,ncumul_released
13 | 2021-07-30,BS,ncumul_released
14 | 2021-08-19,ZG,ncumul_released
15 | 2021-08-20,ZG,ncumul_released
16 | 2021-09-03,ZG,ncumul_released
17 | 2021-10-01,ZG,ncumul_released
18 | 2021-10-04,SG,ncumul_deceased
19 | 2021-10-04,SG,ncumul_released
20 | 2021-10-22,ZG,ncumul_released
21 | 2021-10-24,ZG,ncumul_released
22 | 2021-11-05,ZG,ncumul_released
23 | 2021-11-07,ZG,ncumul_released
24 | 2021-11-12,ZG,ncumul_released
25 | 2022-02-17,UR,ncumul_deceased
26 | 2022-03-07,TI,ncumul_conf
27 | 2022-03-07,TI,ncumul_deceased
28 | 2022-04-10,FL,ncumul_released
29 | 2022-04-16,FL,ncumul_released
30 | 2022-05-30,FR,ncumul_released
31 | 2022-07-11,FR,ncumul_released
32 | 2022-08-16,NW,ncumul_conf
33 | 2022-09-05,NW,ncumul_conf
34 | 2022-11-16,NW,ncumul_conf
35 | 2023-01-25,BS,ncumul_released
36 | 2023-02-02,GE,ncumul_conf
37 | 2023-02-02,GE,ncumul_released
38 | 2023-02-02,GE,ncumul_deceased
39 | 2023-02-08,GE,ncumul_released
40 | 2023-02-08,GE,ncumul_deceased
41 | 2023-03-21,FL,ncumul_released
42 | 2023-03-29,FL,ncumul_released
43 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | import re
6 | from typing import Optional
7 | from io import StringIO
8 | import datetime
9 | import sys
10 | import scrape_common as sc
11 | from scrape_fr_common import get_fr_csv
12 |
13 | def trim_val(val: str) -> Optional[int]:
14 | if len(val) > 0:
15 | return int(re.sub(r'(\d+)\s+(\d+)', r'\1\2', val))
16 | return None
17 |
18 | csv_url, csv_data, main_url = get_fr_csv()
19 | reader = csv.DictReader(StringIO(csv_data), delimiter=';')
20 | is_first = True
21 |
22 | for row in reader:
23 | if not is_first:
24 | print('-' * 10)
25 | is_first = False
26 |
27 | dd = sc.DayData(canton='FR', url=main_url)
28 | for key, val in row.items():
29 | if sc.find(r'(Date).*', key):
30 | dd.datetime = val
31 | if sc.find(r'(Total cas av.r.s).*', key):
32 | dd.cases = trim_val(val)
33 | elif sc.find(r'(Personnes hospitalis.es).*', key):
34 | dd.hospitalized = trim_val(val)
35 | elif sc.find(r'(aux soins intensifs).*', key):
36 | dd.icu = trim_val(val)
37 | elif sc.find(r'(Total d.c.s).*', key):
38 | dd.deaths = trim_val(val)
39 | elif sc.find(r'(Total Sorties de l\'h.pital).*', key):
40 | dd.recovered = trim_val(val)
41 |
42 | assert dd
43 | assert dd.datetime
44 | print(dd)
45 |
--------------------------------------------------------------------------------
/scrapers/scrape_be_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | from io import StringIO
6 | import scrape_common as sc
7 |
8 |
9 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html
10 | district_ids = {
11 | 241: 'Jura bernois',
12 | 242: 'Biel/Bienne',
13 | 243: 'Seeland',
14 | 244: 'Oberaargau',
15 | 245: 'Emmental',
16 | 246: 'Bern-Mittelland',
17 | 247: 'Thun',
18 | 248: 'Obersimmental-Saanen',
19 | 249: 'Frutigen-Niedersimmental',
20 | 250: 'Interlaken-Oberhasli',
21 | }
22 |
23 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
24 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/7_d_inzidenz_verwaltungskreis.csv'
25 | d = sc.download(csv_url, silent=True)
26 | reader = csv.DictReader(StringIO(d), delimiter=',')
27 | for row in reader:
28 | #dd = sc.DistrictData(district=district, canton='BE')
29 | district_id = int(row['bfs_nummer'])
30 | dd = sc.DistrictData(district=district_ids[district_id], canton='BE')
31 | dd.url = url
32 | dd.district_id = district_id
33 | dd.population = row['einwohnerzahl']
34 | date = sc.date_from_text(row['datum'])
35 | week = date.isocalendar()[1]
36 | dd.week = week
37 | dd.year = date.year
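   |     # 7_d_inzidenz is cases per 100,000 inhabitants over 7 days; convert it back to an absolute case count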
38 | dd.new_cases = round(float(row['7_d_inzidenz']) / 100e3 * int(row['einwohnerzahl']))
39 | print(dd)
40 |
--------------------------------------------------------------------------------
/.github/workflows/activate_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Activate a scraper
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | canton:
7 | description: 'Abbreviation of Canton'
8 | required: true
9 |
10 | jobs:
11 | activate_scraper:
12 | runs-on: ubuntu-20.04
13 | timeout-minutes: 10
14 |
15 | steps:
16 | - uses: actions/checkout@v3
17 |
18 | - name: Activate scraper
19 | env:
20 | CANTON: ${{ github.event.inputs.canton }}
21 | run: |
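   |           # strip the leading '#' from the matching "- <CANTON>" line in run_scrapers.yml to re-enable that scraper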
22 | sed -e "/- $CANTON/I s/^#*//" -i ./.github/workflows/run_scrapers.yml
23 |
24 | - name: Commit and push to repo
25 | env:
26 | GHA_DEPLOY_KEY: ${{ secrets.GHA_DEPLOY_KEY }}
27 | CANTON: ${{ github.event.inputs.canton }}
28 | run: |
29 | if ! git diff --no-ext-diff --quiet --exit-code; then
30 | git add .
31 | git config --local user.email "scraper@open.zh.ch"
32 | git config --local user.name "GitHub Action Scraper"
33 | git commit -a -m "Activate $CANTON scraper"
34 | git remote set-url origin "$(git config --get remote.origin.url | sed 's#http.*com/#git@github.com:#g')"
35 | eval `ssh-agent -t 60 -s`
36 | echo "$GHA_DEPLOY_KEY" | ssh-add -
37 | mkdir -p ~/.ssh/
38 | ssh-keyscan github.com >> ~/.ssh/known_hosts
39 | git push
40 | ssh-agent -k
41 | else
42 | echo "Nothing to commit."
43 | fi
44 |
--------------------------------------------------------------------------------
/.github/workflows/deactivate_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Deactivate a scraper
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | canton:
7 | description: 'Abbreviation of Canton'
8 | required: true
9 |
10 | jobs:
11 | deactivate_scraper:
12 | runs-on: ubuntu-20.04
13 | timeout-minutes: 10
14 |
15 | steps:
16 | - uses: actions/checkout@v3
17 |
18 | - name: Deactivate scraper
19 | env:
20 | CANTON: ${{ github.event.inputs.canton }}
21 | run: |
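   |           # prefix the matching "- <CANTON>" line in run_scrapers.yml with '#' to disable that scraper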
22 | sed -e "/- $CANTON/I s/^#*/#/" -i ./.github/workflows/run_scrapers.yml
23 |
24 | - name: Commit and push to repo
25 | env:
26 | GHA_DEPLOY_KEY: ${{ secrets.GHA_DEPLOY_KEY }}
27 | CANTON: ${{ github.event.inputs.canton }}
28 | run: |
29 | if ! git diff --no-ext-diff --quiet --exit-code; then
30 | git add .
31 | git config --local user.email "scraper@open.zh.ch"
32 | git config --local user.name "GitHub Action Scraper"
33 | git commit -a -m "Deactivate $CANTON scraper"
34 | git remote set-url origin "$(git config --get remote.origin.url | sed 's#http.*com/#git@github.com:#g')"
35 | eval `ssh-agent -t 60 -s`
36 | echo "$GHA_DEPLOY_KEY" | ssh-add -
37 | mkdir -p ~/.ssh/
38 | ssh-keyscan github.com >> ~/.ssh/known_hosts
39 | git push
40 | ssh-agent -k
41 | else
42 | echo "Nothing to commit."
43 | fi
44 |
--------------------------------------------------------------------------------
/scrapers/scrape_ge_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 |
6 | from selenium import webdriver
7 | from selenium.webdriver.chrome.options import Options
8 |
9 | import scrape_common as sc
10 | import scrape_ge_common as sgc
11 |
12 |
13 | chrome_options = Options()
14 | chrome_options.add_argument("--headless")
15 | driver = webdriver.Chrome(options=chrome_options)
16 | driver.implicitly_wait(5)
17 |
18 | url = 'https://infocovid.smc.unige.ch/'
19 | driver.get(url)
20 | elem = driver.find_element_by_link_text('Graphiques')
21 | elem.click()
22 | elem = driver.find_element_by_partial_link_text('Tests')
23 | elem.click()
24 | xls_url = sgc.get_link_from_element(driver, 'save_plot_nombre_tests_data')
25 | assert xls_url, "Couldn't find tests XLS url"
26 |
27 | xls = sc.xlsdownload(xls_url, silent=True)
28 | rows = sc.parse_xls(xls, header_row=0, enable_float=True)
29 | for row in rows:
30 | td = sc.TestData(canton='GE', url=url)
31 | res = re.search(r'(\d{2})-(\d{2})', row['week_res'])
32 | assert res, f"failed to extract year and week from {row['week_res']}"
33 | td.week = int(res[2])
34 | td.year = f'20{res[1]}'
35 | td.positive_tests = int(row['positifs'])
36 | td.negative_tests = int(row['négatifs'])
37 | td.total_tests = int(row['total'])
38 | # 2020-02/03 values are empty
39 | td.positivity_rate = 0
40 | if row['ratio']:
41 | td.positivity_rate = float(row['ratio'])
42 | print(td)
43 |
--------------------------------------------------------------------------------
/scrapers/test_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Script to run all scrapers
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | function cleanup {
9 | exit $?
10 | }
11 | trap "cleanup" EXIT
12 |
13 | DIR="$(cd "$(dirname "$0")" && pwd)"
14 | NEWLINE=$'\n'
15 |
16 | echo "Run all scrapers..."
17 |
18 | exit_code=0
19 | errors=''
20 | for scrape_script in $DIR/scrape_??.py
21 | do
22 | if [ -f $scrape_script -a -x $scrape_script ]
23 | then
24 | name=`basename $scrape_script`
25 | canton=${name:7:2}
26 | export SCRAPER_KEY=${canton^^}
27 | echo ""
28 | echo "Running ${SCRAPER_KEY} scraper..."
29 | echo "=========================================="
30 |
31 | set +e
32 | $DIR/run_scraper.sh
33 | ret=$?
34 | if [ $ret -ne 0 ]
35 | then
36 | echo "ERROR: ${scrape_script} failed with exit code $ret. continue." >&2
37 | errors=$"${errors}${NEWLINE}ERROR: ${scrape_script} failed with exit code $ret"
38 | exit_code=1
39 | fi
40 | $DIR/validate_scraper_output.sh
41 | ret=$?
42 | if [ $ret -ne 0 ]
43 | then
44 | echo "ERROR: Validation for ${SCRAPER_KEY} failed with exit code $ret. continue." >&2
45 | errors=$"${errors}${NEWLINE}ERROR: Validation for ${SCRAPER_KEY} failed with exit code $ret"
46 | exit_code=1
47 | fi
48 | set -e
49 |
50 | echo "=========================================="
51 | echo ""
52 | fi
53 | done
54 |
55 | echo "$errors"
56 | exit $exit_code
57 |
--------------------------------------------------------------------------------
/scrapers/scrape_lu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 |
7 |
8 | base_url = 'https://www.lustat.ch'
9 | url = f'{base_url}/daten?id=28177'
10 | d = sc.download(url, silent=True)
11 | soup = BeautifulSoup(d, 'html.parser')
12 |
13 | xls_url = soup.find('a', href=re.compile(r'.*\.xlsx')).get('href')
14 | if not xls_url.startswith('http'):
15 | xls_url = f'{base_url}{xls_url}'
16 | xls = sc.xlsdownload(xls_url, silent=True)
17 | rows = sc.parse_xls(xls, header_row=5)
18 | total_cases = 0
19 | total_deaths = 0
20 | is_first = True
21 | for row in rows:
22 | dd = sc.DayData(canton='LU', url=xls_url)
23 | dd.datetime = row['Datum']
24 | dd.cases = sc.int_or_word(row.search(r'Neue\s+Fälle'))
25 | if dd.cases:
26 | total_cases += dd.cases
27 | dd.cases = total_cases
28 | dd.deaths = sc.int_or_word(row['Verstorbene'])
29 | if dd.deaths:
30 | total_deaths += dd.deaths
31 | dd.deaths = total_deaths
32 | dd.hospitalized = sc.int_or_word(row['Total'])
33 | dd.vent = sc.int_or_word(row.search(r'davon\s+beatmet'))
34 | dd.isolated = sc.int_or_word(row.search(r'in\s+Isolation'))
35 | dd.quarantined = sc.int_or_word(row.search(r'in\s+Quarantäne'))
36 | dd.quarantine_riskareatravel = sc.int_or_word(row.search(r'Reiserückkehrer\s+in\s+Quarantäne'))
37 | if dd.cases is None and dd.datetime == '31.12.2022':
38 | continue
39 | if dd:
40 | if not is_first:
41 | print('-' * 10)
42 | is_first = False
43 | print(dd)
44 |
--------------------------------------------------------------------------------
/scrapers/scrape_sg_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 | inhabitants = {
8 | 'St.Gallen': 127198,
9 | 'Rorschach': 44110,
10 | 'Rheintal': 74580,
11 | 'Werdenberg': 40239,
12 | 'Sarganserland': 41736,
13 | 'See-Gaster': 76913,
14 | 'Toggenburg': 47272,
15 | 'Wil': 77018,
16 | }
17 |
18 | district_ids = {
19 | 'St.Gallen': 1721,
20 | 'Rorschach': 1722,
21 | 'Rheintal': 1723,
22 | 'Werdenberg': 1724,
23 | 'Sarganserland': 1725,
24 | 'See-Gaster': 1726,
25 | 'Toggenburg': 1727,
26 | 'Wil': 1728,
27 | }
28 |
29 | url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Faelle_download.csv'
30 | d = sc.download(url, silent=True)
31 |
32 | # strip the "header" / description lines
33 | d = "\n".join(d.split("\n")[5:])
34 |
35 | reader = csv.DictReader(StringIO(d), delimiter=';')
36 | for row in reader:
37 | week = sc.find(r'W(\d+)', row['Kalenderwoche'])
38 | date = sc.date_from_text(row['Falldatum'])
39 |
40 | for key, value in inhabitants.items():
41 | dd = sc.DistrictData(canton='SG', district=key)
42 | dd.url = url
43 | dd.week = week
44 | dd.year = date.year
45 | dd.date = date.isoformat()
46 | dd.district_id = district_ids[key]
47 | dd.new_cases = row['Wahlkreis ' + key]
48 | dd.total_cases = row['Wahlkreis ' + key + ' (kumuliert)']
49 | dd.population = value
50 | print(dd)
51 |
--------------------------------------------------------------------------------
/scrapers/scrape_bs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | from io import StringIO
6 | import scrape_common as sc
7 |
8 | d_csv = sc.download('https://data.bs.ch/explore/dataset/100073/download/?format=csv&timezone=Europe/Zurich&lang=en&use_labels_for_header=false&csv_separator=,', silent=True)
9 |
10 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
11 | is_first = True
12 | for row in reader:
13 | if not row['ncumul_conf']:
14 | continue
15 | if not is_first:
16 | print('-' * 10)
17 | is_first = False
18 | dd = sc.DayData(canton='BS', url=row['source'])
19 | dd.datetime = f"{row['date']} {row['time']}"
20 | dd.cases = sc.safeint(row['ncumul_conf'])
21 | dd.new_hosp = row['new_hosp']
22 | dd.hospitalized = row['current_hosp']
23 | dd.icu = row['current_icu']
24 | dd.vent = row['current_vent']
25 | dd.recovered = row['ncumul_released']
26 | dd.deaths = row['ncumul_deceased']
27 | dd.isolated = row['current_isolated']
28 | dd.quarantined = row['current_quarantined']
29 | dd.confirmed_non_resident = row['ncumul_confirmed_non_resident']
30 | dd.hosp_non_resident = row['current_hosp_non_resident']
31 | dd.quarantine_riskareatravel = row['current_quarantined_riskareatravel']
32 | dd.quarantine_total = row['current_quarantined_total']
33 | dd.hosp_resident = row['current_hosp_resident']
34 |
35 | # TODO: remove if source is fixed
36 | # BS corrected data on 2021-03-01 without adapting their time series
37 | if row['date'] in ('2021-02-27', '2021-02-28'):
38 | dd.cases = ''
39 | dd.recovered = ''
40 | print(dd)
41 |
--------------------------------------------------------------------------------
/scrapers/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # This is a simple wrapper around curl or wget that can also be used to
4 | # save downloaded pages for archival purposes, as well as for feeding fake
5 | # (test) data to the scrapers.
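   | # Usage: ./download.sh <URL> [extra curl/wget options]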
6 |
7 | # echo "DOWNLOADING:" "$@" >&2
8 |
9 | #WEBARCHIVE_SNAPSHOT=1
10 |
11 | if [ "x${WEBARCHIVE_SNAPSHOT}" != "x" ]; then
12 | # Note: JSON only allows strings in double quotes.
13 | (
14 |     echo "$(date --iso-8601=seconds)" "Snapshotting: $1"
15 | W=$(curl -X POST -H "Content-Type: application/json" --data-raw "{\"url\": \"$1\", \"annotation\": {\"id\": \"lst-ib\", \"message\": \"openZH covid_19 github archiving\"}}" "https://pragma.archivelab.org/" 2>&1)
16 | echo "Response:"
17 | echo "${W}"
18 | ) >> webarchiveorg.log
19 | fi
20 |
21 | if which curl >/dev/null; then
22 |     # A few sites, like GL, JU, SZ, don't like curl and return 403, or block the site completely per IP.
23 |     # --output -, because curl sometimes doesn't like to pipe binary files.
24 | exec curl -k --silent --output - --user-agent "Mozilla Firefox Mozilla/5.0; openZH covid_19 at github" "$@"
25 | exit 1
26 | fi
27 |
28 | if which wget >/dev/null; then
29 |     # A few sites, like GL, JU, SZ, don't like curl and return 403, or block the site completely per IP.
30 | exec wget --output-document=- --quiet --user-agent="Mozilla Firefox Mozilla/5.0; openZH covid_19 at github" "$@"
31 | exit 1
32 | fi
33 |
34 | if which GET >/dev/null; then
35 |     # A few sites, like GL, JU, SZ, don't like curl and return 403, or block the site completely per IP.
36 | exec GET "$@"
37 | exit 1
38 | fi
39 |
40 | echo "$0: No curl, wget or GET found. Install curl (recommended), or wget." >&2
41 | exit 2
42 |
--------------------------------------------------------------------------------
/scrapers/scrape_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 | import sys
5 | import re
6 |
7 |
8 | # download latest PDF
9 | pdf_url = 'https://www.bag.admin.ch/dam/bag/de/dokumente/mt/k-und-i/aktuelle-ausbrueche-pandemien/2019-nCoV/covid-19-woechentlicher-lagebericht.pdf.download.pdf/BAG_COVID-19_Woechentliche_Lage.pdf'
10 | d = sc.pdfdownload(pdf_url, raw=True, silent=True)
11 |
12 | """
13 | Coronavirus-Krankheit-2019 (COVID-19)
14 | Eidgenössisches Departement des Innern EDI
15 | Bundesamt für Gesundheit BAG
16 | Direktionsbereich Öffentliche Gesundheit
17 | Situationsbericht zur epidemiologischen Lage in der Schweiz
18 | und im Fürstentum Liechtenstein - Woche 28 (06.-12.07.2020)
19 | """
20 |
21 | datetime = sc.find(r'Liechtenstein - Woche .*(\d{2}\.\d{2}\.\d{4})\)', d)
22 |
23 | """
24 | Canton, tests of previous-week then current-week
25 |
26 | AG 5478 3588 808 529 1.3 1.8
27 | AI 96 55 595 341 0.0 0.0
28 | AR 391 249 708 451 0.5 1.2
29 | BE 6924 4652 669 449 0.4 0.9
30 | ...
31 | """
32 | start = d.find('Anzahl PCR-Tests in der Schweiz')
33 | if start > 0:
34 | start = d.find('\nAG ', start)
35 | else:
36 | start = 0
37 | end = d.find('Tabelle 4. Durchgeführte Tests nach Kalenderwoche', start)
38 | if start > 0 and end > start:
39 | tests_table = d[start:end]
40 | for line in tests_table.splitlines():
41 | canton = sc.find(r'^([A-Z][A-Z]) ', line)
42 | if canton is not None:
43 | dd = sc.DayData(canton=canton, url=pdf_url)
44 | dd.datetime = datetime
45 | dd.tested = sc.find(r'^[A-Z][A-Z] \d+ (\d+)', line)
46 | print('-' * 10)
47 | print(dd)
48 |
49 |
--------------------------------------------------------------------------------
/scripts/new2oldcsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script converts CSV files from the new structure to the old structure
4 |
5 | import csv
6 | import sys
7 | import traceback
8 |
9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
10 |
11 | try:
12 | filename = sys.argv[1]
13 | rows = []
14 | with open(filename, 'r') as f:
15 | dr = csv.DictReader(f)
16 | for r in dr:
17 |             # map new-structure columns to the old structure
18 | data = {
19 | 'date': r['date'],
20 | 'time': r['time'],
21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
22 | 'ncumul_tested': r['ncumul_tested'],
23 | 'ncumul_conf': r['ncumul_conf'],
24 | 'ncumul_hosp': r['current_hosp'],
25 | 'ncumul_ICU': r['current_icu'],
26 | 'ncumul_vent': r['current_vent'],
27 | 'ncumul_released': r['ncumul_released'],
28 | 'ncumul_deceased': r['ncumul_deceased'],
29 | 'source': r['source'],
30 | }
31 | # re-add extra columns
32 | for col in dr.fieldnames[15:]:
33 | data[col] = r[col]
34 | rows.append(data)
35 |
36 | writer = csv.DictWriter(
37 | sys.stdout,
38 | rows[0].keys(),
39 | delimiter=',',
40 | quotechar='"',
41 | lineterminator='\n',
42 | quoting=csv.QUOTE_MINIMAL
43 | )
44 | writer.writeheader()
45 | writer.writerows(rows)
46 | except Exception as e:
47 | print("Error: %s" % e, file=sys.stderr)
48 | print(traceback.format_exc(), file=sys.stderr)
49 | sys.exit(1)
50 | finally:
51 | sys.stdout.flush()
52 |
--------------------------------------------------------------------------------
/scrapers/validate_scrapers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import subprocess
5 | import sys
6 | import os
7 | from scrape_matrix import matrix
8 |
9 | __location__ = os.path.realpath(
10 | os.path.join(
11 | os.getcwd(),
12 | os.path.dirname(__file__)
13 | )
14 | )
15 |
16 |
17 | if __name__ == '__main__':
18 | all_features = ['Confirmed cases', 'Deaths', 'Released', 'Hospitalized', 'ICU', 'Vent']
19 | has_issue = False
20 | for canton, features in matrix.items():
21 | print(canton)
22 | scraper = f'{__location__}/scrape_{canton.lower()}.py'
23 | if not os.access(scraper, os.X_OK):
24 | print(f"{scraper} is not executable; skipping")
25 | continue
26 | result = subprocess.run([scraper], stdout=subprocess.PIPE)
27 | output = re.sub('----------\n$', '', result.stdout.decode('utf-8')).split('----------\n')[-1]
28 | for feature in features:
29 | if feature == 'Released':
30 |                 feature = r'(?:Released|Recovered)'
31 | matches = re.search(f'{feature}: (.+)', output)
32 | if matches is None or matches[1].startswith('None'):
33 | has_issue = True
34 | print(f"missing {feature} for {canton}")
35 | for feature in all_features:
36 | if feature not in features:
37 | if feature == 'Released':
38 |                     feature = r'(?:Released|Recovered)'
39 | if re.search(f'{feature}:', output) is not None:
40 | has_issue = True
41 | print(f"{feature} is present for {canton} but not listed in feature matrix")
42 |
43 | if has_issue:
44 | sys.exit(1)
45 |
--------------------------------------------------------------------------------
/scrapers/scrape_ge_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import time
6 | from bs4 import BeautifulSoup
7 | from selenium.webdriver.common.by import By
8 | from selenium.webdriver.support.ui import WebDriverWait
9 | from selenium.webdriver.support import expected_conditions as EC
10 | import scrape_common as sc
11 |
12 |
13 | def get_latest_ge_weekly_pdf_url():
14 | return get_ge_weekly_pdf_urls()[0]
15 |
16 |
17 | def get_ge_weekly_pdf_urls():
18 | d = sc.download('https://www.ge.ch/document/covid-19-bilan-epidemiologique-hebdomadaire', silent=True)
19 | soup = BeautifulSoup(d, 'html.parser')
20 | links = soup.find_all('a', title=re.compile(r"\.pdf$"))
21 | result = []
22 | for link in links:
23 | pdf_url = link.get('href')
24 | assert pdf_url, "pdf URL is empty"
25 | if not pdf_url.startswith('http'):
26 | pdf_url = f'https://www.ge.ch{pdf_url}'
27 | if pdf_url not in result:
28 | result.append(pdf_url)
29 | return result
30 |
31 |
32 | class element_has_link(object):
33 | def __init__(self, locator):
34 | self.locator = locator
35 |
36 | def __call__(self, driver):
37 | element = driver.find_element(*self.locator) # Finding the referenced element
38 | if element.get_attribute('href'):
39 | return element
40 | else:
41 | return False
42 |
43 |
44 | def get_link_from_element(driver, element_id):
45 | # the xls download links do not appear immediately for some reason
46 | # add some delay to get it.
47 | wait = WebDriverWait(driver, 30)
48 | elem = wait.until(element_has_link((By.ID, element_id)))
49 | url = elem.get_attribute('href')
50 |
51 | return url
52 |
--------------------------------------------------------------------------------
/scrapers/populate_district_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script creates a new sqlite database based on the CSV it receives as an argument
4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs
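   | # Usage: ./populate_district_database.py <existing district CSV>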
5 |
6 | import sqlite3
7 | import traceback
8 | import os
9 | import sys
10 | import db_common as dc
11 |
12 |
13 | __location__ = dc.get_location()
14 |
15 | try:
16 | # load the csv to sqlite db
17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
18 | columns, to_db = dc.load_csv(sys.argv[1])
19 |
20 | # create db
21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
22 | conn = sqlite3.connect(DATABASE_NAME)
23 | c = conn.cursor()
24 | c.execute('DROP TABLE IF EXISTS data')
25 | c.execute(
26 | '''
27 | CREATE TABLE IF NOT EXISTS data (
28 | DistrictId integer NOT NULL,
29 | District text NOT NULL,
30 | Canton text NOT NULL,
31 | Date text NOT NULL,
32 | Week text NOT NULL,
33 | Year text NOT NULL,
34 | Population integer,
35 | TotalConfCases integer,
36 | NewConfCases integer,
37 | TotalDeaths integer,
38 | NewDeaths integer,
39 | SourceUrl text,
40 | UNIQUE(DistrictId, District, Canton, Date, Week, Year)
41 | )
42 | '''
43 | )
44 |
45 | # add entries
46 | query = dc.insert_db_query(columns)
47 | c.executemany(query, to_db)
48 | conn.commit()
49 | except Exception as e:
50 | print("Error: %s" % e, file=sys.stderr)
51 | print(traceback.format_exc(), file=sys.stderr)
52 | sys.exit(1)
53 | finally:
54 | conn.close()
55 |
--------------------------------------------------------------------------------
/scrapers/scrape_so_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 |
7 | url = 'https://corona.so.ch/bevoelkerung/daten/fallzahlen-nach-gemeinden/'
8 | d = sc.download(url, silent=True)
9 |
10 | date = sc.find(r'Stand (\d+\.\d+\.20\d{2})', d)
11 | date = sc.date_from_text(date)
12 |
13 | population = {
14 | 'Solothurn': 16933,
15 | 'Bucheggberg': 7954,
16 | 'Dorneck': 20678,
17 | 'Gäu': 21605,
18 | 'Gösgen': 24536,
19 | 'Lebern': 24536,
20 | 'Olten': 55686,
21 | 'Thal': 14785,
22 | 'Thierstein': 14747,
23 | 'Wasseramt': 52134,
24 | }
25 |
26 | district_ids = {
27 | 'Solothurn': 1109,
28 | 'Bucheggberg': 1103,
29 | 'Dorneck': 1104,
30 | 'Gäu': 1101,
31 | 'Gösgen': 1105,
32 | 'Lebern': 1107,
33 | 'Olten': 1108,
34 | 'Thal': 1102,
35 | 'Thierstein': 1110,
36 | 'Wasseramt': 1106,
37 | }
38 |
39 |
40 | def strip_so_number(value):
41 | value = value.replace('\'', '')
42 | value = value.replace('^', '')
43 | return int(value)
44 |
45 |
46 | soup = BeautifulSoup(d, 'html.parser')
47 | for district, d_id in district_ids.items():
48 | table = soup.find(text=district).find_next('table')
49 | tr = table.find('strong', text='Total').find_parent('tr')
50 | tds = tr.find_all('td')
51 | assert tds[0].text == 'Total', f'Expected "Total" row, got {tds[0].text}'
52 | dd = sc.DistrictData(canton='SO', district=district)
53 | dd.url = url
54 | dd.date = date.isoformat()
55 | dd.population = strip_so_number(tds[1].text)
56 | dd.district_id = d_id
57 | dd.total_cases = strip_so_number(tds[2].text)
58 | dd.new_cases = int(tds[3].text)
59 | print(dd)
60 |
--------------------------------------------------------------------------------
/scrapers/scrape_sh.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 | import scrape_sh_common as shc
9 |
10 | main_url, xls = shc.get_sh_xlsx()
11 |
12 | rows = sc.parse_xls(xls, header_row=0)
13 | is_first = True
14 | for row in rows:
15 | if not isinstance(row['Datum'], datetime.datetime):
16 | continue
17 | if not (row['Positiv'] or row.search(r'Hospitalisation isoliert\s+bestätigt') or row.search(r'Hospitalisation\s+intensiv.*$') or row['Verstorben']):
18 | continue
19 |
20 | if not is_first:
21 | print('-' * 10)
22 | is_first = False
23 |
24 | dd = sc.DayData(canton='SH', url=main_url)
25 | dd.datetime = row['Datum'].date().isoformat()
26 | dd.cases = row['Positiv']
27 |
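   |     # hospitalised total = isolated (non-ICU) hospitalisations + intensive-care hospitalisations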
28 | if sc.represents_int(row.search(r'Hospitalisation isoliert\s+bestätigt')) and sc.represents_int(row.search(r'Hospitalisation\s+intensiv.*$')):
29 | dd.hospitalized = row.search(r'Hospitalisation isoliert\s+bestätigt') + row.search(r'Hospitalisation\s+intensiv.*$')
30 | dd.icu = row.search(r'Hospitalisation\s+intensiv.*$')
31 | if row['Verstorben'] is not None:
32 | dd.deaths = row['Verstorben']
33 |
34 | isolated = row.search(r'Anzahl Personen\s+in Isolation.*')
35 | if isolated is not None:
36 | dd.isolated = isolated
37 | quarantined = row.search(r'Anzahl Personen\s+in Quarantäne\s+.*Kontaktpersonen.*')
38 | if quarantined is not None:
39 | dd.quarantined = quarantined
40 | quarantined_risk = row.search(r'Anzahl Personen\s+in Quarantäne\s+.*Rückkehr.*Risikoländer.*')
41 | if quarantined_risk is not None:
42 | dd.quarantine_riskareatravel = quarantined_risk
43 |
44 | print(dd)
45 |
--------------------------------------------------------------------------------
/scripts/old2newcsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script converts CSV files from the old structure to the new structure
4 |
5 | import csv
6 | import sys
7 | import traceback
8 |
9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
10 |
11 | try:
12 | filename = sys.argv[1]
13 | rows = []
14 | with open(filename, 'r') as f:
15 | dr = csv.DictReader(f)
16 | for r in dr:
17 | # map old to new structure
18 | data = {
19 | 'date': r['date'],
20 | 'time': r['time'],
21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
22 | 'ncumul_tested': r['ncumul_tested'],
23 | 'ncumul_conf': r['ncumul_conf'],
24 | 'new_hosp': '',
25 | 'current_hosp': r['ncumul_hosp'],
26 | 'current_icu': r['ncumul_ICU'],
27 | 'current_vent': r['ncumul_vent'],
28 | 'ncumul_released': r['ncumul_released'],
29 | 'ncumul_deceased': r['ncumul_deceased'],
30 | 'source': r['source'],
31 | 'current_isolated': '',
32 | 'current_quarantined': '',
33 | }
34 | # re-add extra columns
35 | for col in dr.fieldnames[11:]:
36 | data[col] = r[col]
37 | rows.append(data)
38 |
39 | writer = csv.DictWriter(
40 | sys.stdout,
41 | rows[0].keys(),
42 | delimiter=',',
43 | quotechar='"',
44 | lineterminator='\n',
45 | quoting=csv.QUOTE_MINIMAL
46 | )
47 | writer.writeheader()
48 | writer.writerows(rows)
49 | except Exception as e:
50 | print("Error: %s" % e, file=sys.stderr)
51 | print(traceback.format_exc(), file=sys.stderr)
52 | sys.exit(1)
53 | finally:
54 | sys.stdout.flush()
55 |
--------------------------------------------------------------------------------
/scrapers/scrape_bs_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import scrape_common as sc
6 |
7 |
8 | def prettify_positivity_rate(positivity_rate):
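   |     # round the rate to one decimal place; empty values become None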
9 | if not positivity_rate:
10 | return None
11 | return round(10 * float(positivity_rate)) / 10
12 |
13 |
14 | url = 'https://data.bs.ch/explore/dataset/100094/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B'
15 | data = sc.download(url, silent=True)
16 |
17 | reader = csv.DictReader(StringIO(data), delimiter=';')
18 | for row in reader:
19 | td = sc.TestData(canton='BS', url=url)
20 | td.start_date = row['Datum']
21 | td.end_date = row['Datum']
22 | td.positive_tests = row['Positive Tests'] or None
23 | td.negative_tests = row['Negative Tests'] or None
24 | td.total_tests = row['Total Tests'] or None
25 | td.positivity_rate = row['Anteil positive Tests in Prozent'] or None
26 |
27 | td.pcr_positive_tests = row['Positive PCR Tests'] or None
28 | td.pcr_negative_tests = row['Negative PCR Tests'] or None
29 | td.pcr_total_tests = row['Total PCR Tests'] or None
30 | td.pcr_positivity_rate = row['Anteil positive PCR Tests in Prozent'] or None
31 |
32 | td.ag_positive_tests = row['Positive Antigen Schnelltests'] or None
33 | td.ag_negative_tests = row['Negative Antigen Schnelltests'] or None
34 | td.ag_total_tests = row['Total Antigen Schnelltests'] or None
35 | td.ag_positivity_rate = row['Anteil positive Antigen Schnelltests in Prozent'] or None
36 |
37 | if td:
38 | td.positivity_rate = prettify_positivity_rate(td.positivity_rate)
39 | td.pcr_positivity_rate = prettify_positivity_rate(td.pcr_positivity_rate)
40 | td.ag_positivity_rate = prettify_positivity_rate(td.ag_positivity_rate)
41 | print(td)
42 |
--------------------------------------------------------------------------------
/scrapers/scrape_be.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | from io import StringIO
5 | import re
6 | import scrape_common as sc
7 |
8 | url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
9 |
10 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/total_faelle.csv'
11 | d = sc.download(csv_url, silent=True)
12 | reader = csv.DictReader(StringIO(d), delimiter=',')
13 | is_first = True
14 | for row in reader:
15 | if not is_first:
16 | print('-' * 10)
17 | is_first = False
18 |
19 | dd = sc.DayData(canton='BE', url=url)
20 | dd.datetime = row['datum']
21 | dd.cases = row['total_laborbestaetigte_faelle']
22 | dd.deaths = row['total_todesfaelle']
23 | print(dd)
24 |
25 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/spa_auslastung.csv'
26 | d = sc.download(csv_url, silent=True)
27 | reader = csv.DictReader(StringIO(d), delimiter=',')
28 | is_first = True
29 | for row in reader:
30 | if not is_first:
31 | print('-' * 10)
32 | is_first = False
33 |
34 | dd = sc.DayData(canton='BE', url=url)
35 | dd.datetime = row['datum']
36 | dd.hospitalized = row['personen_hospitalisiert']
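   |     # ICU count = ventilated + non-ventilated intensive-care patients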
37 | dd.vent = int(row['auf_intensivpflegestation_beatmet'])
38 | dd.icu = int(row['auf_intensivpflegestation_unbeatmet']) + dd.vent
39 | print(dd)
40 |
41 | csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/contact_tracing.csv'
42 | d = sc.download(csv_url, silent=True)
43 | reader = csv.DictReader(StringIO(d), delimiter=',')
44 | is_first = True
45 | for row in reader:
46 | if not is_first:
47 | print('-' * 10)
48 | is_first = False
49 |
50 | dd = sc.DayData(canton='BE', url=url)
51 | dd.datetime = row['datum']
52 | dd.quarantined = row['personen_in_quarantaene']
53 | dd.isolated = row['personen_in_isolation']
54 | print(dd)
55 |
--------------------------------------------------------------------------------
/.github/workflows/test_tests_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Test run of tests scrapers
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'scrapers/*tests*'
8 | - 'scrapers/parse_scrape_output.py'
9 | - 'scrapers/populate_tests_database.py'
10 | - 'scrapers/run_tests_scraper.sh'
11 | - 'scrapers/scrape_dates.py'
12 | - 'scrapers/scrape_matrix.py'
13 | - 'scrapers/validate_scraper*'
14 | - 'scrapers/*_common.py'
15 | - '!scrapers/*_districts.py'
16 | - '.github/workflows/**'
17 | pull_request:
18 | branches: [ master ]
19 | paths:
20 | - 'scrapers/*tests*'
21 | - 'scrapers/parse_scrape_output.py'
22 | - 'scrapers/populate_tests_database.py'
23 | - 'scrapers/run_tests_scraper.sh'
24 | - 'scrapers/scrape_dates.py'
25 | - 'scrapers/scrape_matrix.py'
26 | - 'scrapers/validate_scraper*'
27 | - 'scrapers/*_common.py'
28 | - '!scrapers/*_districts.py'
29 | - '.github/workflows/**'
30 | workflow_dispatch: ~
31 |
32 | jobs:
33 | test_run:
34 | runs-on: ubuntu-20.04
35 | timeout-minutes: 10
36 |
37 | steps:
38 | - uses: actions/checkout@v3
39 |
40 | - name: Set up Python 3.7
41 | uses: actions/setup-python@v4
42 | with:
43 | python-version: 3.7
44 |
45 | - name: Remove broken apt repos
46 | run: |
47 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
48 |
49 | - name: Install dependencies
50 | run: |
51 | npm ci
52 | python -m pip install --upgrade pip setuptools wheel
53 | pip install -r requirements.txt
54 | sudo apt update || true # do not fail if update does not work
55 | sudo apt-get install poppler-utils
56 | sudo apt-get install chromium-browser
57 |
58 | - name: Test run of all tests scrapers
59 | run: ./scrapers/test_tests_scraper.sh
60 |
61 |
--------------------------------------------------------------------------------
/scrapers/scrape_sz_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | import scrape_common as sc
9 |
10 | url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948'
11 | content = sc.download(url, silent=True)
12 | soup = BeautifulSoup(content, 'html.parser')
13 | pdf_url = soup.find('a', text=re.compile(r'Coronafälle pro Gemeinde')).get('href')
14 |
15 | content = sc.pdfdownload(pdf_url, layout=True, silent=True)
16 | date = sc.find(r'Stand\W+(\d+\.\d+\.20\d{2})', content)
17 | date = sc.date_from_text(date).isoformat()
18 | district_data = re.findall(r'^Bezirk\W+(\w+)\s+(≤?\s?\d+)', content, re.MULTILINE)
19 |
20 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html
21 | district_ids = {
22 | 'Einsiedeln': 501,
23 | 'Gersau': 502,
24 | 'Höfe': 503,
25 | 'Küssnacht': 504,
26 | 'March': 505,
27 | 'Schwyz': 506,
28 | }
29 |
30 | # https://www.sz.ch/kanton/bezirke/schwyz.html/72-210-112-106
31 | population = {
32 | 'Einsiedeln': 16027,
33 | 'Gersau': 2314,
34 | 'Höfe': 29123,
35 | 'Küssnacht': 13270,
36 | 'March': 43528,
37 | 'Schwyz': 55390,
38 | }
39 |
40 | assert len(district_data) == len(district_ids), f'expected {len(district_ids)} districts available, but got {len(district_data)}: {district_data}'
41 |
42 | for district, total_cases in district_data:
43 | assert district in district_ids, f'District {district} is unknown'
44 |
45 | dd = sc.DistrictData(canton='SZ', district=district)
46 | dd.url = pdf_url
47 | dd.district_id = district_ids[district]
48 | dd.population = population[district]
49 | dd.date = date
50 | # skip total_cases for ≤ entries
51 | if not sc.find(r'(≤)', total_cases):
52 | dd.total_cases = total_cases
53 | print(dd)
54 |
--------------------------------------------------------------------------------
/.github/workflows/test_district_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Test run of district scrapers
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'scrapers/*_districts*'
8 | - 'scrapers/parse_scrape_output.py'
9 | - 'scrapers/populate_district_database.py'
10 | - 'scrapers/run_district_scraper.sh'
11 | - 'scrapers/scrape_dates.py'
12 | - 'scrapers/scrape_matrix.py'
13 | - 'scrapers/validate_scraper*'
14 | - 'scrapers/*_common.py'
15 | - '!scrapers/*_tests.py'
16 | - '.github/workflows/**'
17 | pull_request:
18 | branches: [ master ]
19 | paths:
20 | - 'scrapers/*_districts*'
21 | - 'scrapers/parse_scrape_output.py'
22 | - 'scrapers/populate_district_database.py'
23 | - 'scrapers/run_district_scraper.sh'
24 | - 'scrapers/scrape_dates.py'
25 | - 'scrapers/scrape_matrix.py'
26 | - 'scrapers/validate_scraper*'
27 | - 'scrapers/*_common.py'
28 | - '!scrapers/*_tests.py'
29 | - '.github/workflows/**'
30 | workflow_dispatch: ~
31 |
32 | jobs:
33 | test_run:
34 | runs-on: ubuntu-20.04
35 | timeout-minutes: 10
36 |
37 | steps:
38 | - uses: actions/checkout@v3
39 |
40 | - name: Set up Python 3.7
41 | uses: actions/setup-python@v4
42 | with:
43 | python-version: 3.7
44 |
45 | - name: Remove broken apt repos
46 | run: |
47 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
48 |
49 | - name: Install dependencies
50 | run: |
51 | npm ci
52 | python -m pip install --upgrade pip setuptools wheel
53 | pip install -r requirements.txt
54 | pip install -r requirements-ocr.txt
55 | sudo apt update || true # do not fail if update does not work
56 | sudo apt-get install poppler-utils
57 |
58 | - name: Test run of all district scrapers
59 | run: ./scrapers/test_district_scraper.sh
60 |
61 |
--------------------------------------------------------------------------------
/scrapers/populate_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script creates a new sqlite database based on the CSV it receives as an argument
4 | # The sqlite database is used as an intermediate step to merge new data in existing CSVs
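   | # Usage: ./populate_database.py ../fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_Kanton_<XX>_total.csv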
5 |
6 | import sqlite3
7 | import traceback
8 | import os
9 | import sys
10 | import db_common as dc
11 |
12 |
13 | __location__ = dc.get_location()
14 |
15 | try:
16 | # load the csv to sqlite db
17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
18 | columns, to_db = dc.load_csv(sys.argv[1])
19 |
20 | # create db
21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
22 | conn = sqlite3.connect(DATABASE_NAME)
23 | c = conn.cursor()
24 | c.execute('DROP TABLE IF EXISTS data')
25 | c.execute(
26 | '''
27 | CREATE TABLE IF NOT EXISTS data (
28 | date text,
29 | time text,
30 | abbreviation_canton_and_fl text,
31 | ncumul_tested integer,
32 | ncumul_conf integer,
33 | new_hosp integer,
34 | current_hosp integer,
35 | current_icu integer,
36 | current_vent integer,
37 | ncumul_released integer,
38 | ncumul_deceased integer,
39 | source text,
40 | current_isolated integer,
41 | current_quarantined integer,
42 | UNIQUE(date, abbreviation_canton_and_fl)
43 | )
44 | '''
45 | )
46 | # check if there are extra columns
47 | for col in columns[14:]:
48 | c.execute(f'ALTER TABLE data ADD COLUMN {col} integer;')
49 |
50 | # add entries
51 | query = dc.insert_db_query(columns)
52 | c.executemany(query, to_db)
53 | conn.commit()
54 | except Exception as e:
55 | print("Error: %s" % e, file=sys.stderr)
56 | print(traceback.format_exc(), file=sys.stderr)
57 | sys.exit(1)
58 | finally:
59 | conn.close()
60 |
--------------------------------------------------------------------------------
/scrapers/scrape_gr_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import datetime
4 | import requests
5 |
6 | import scrape_common as sc
7 |
8 | inhabitants = {
9 | 'Albula': 8054,
10 | 'Bernina': 4613,
11 | 'Engiadina Bassa/Val Müstair': 9197,
12 | 'Imboden': 21293,
13 | 'Landquart': 25402,
14 | 'Maloja': 18184,
15 | 'Moesa': 8671,
16 | 'Plessur': 42446,
17 | 'Prättigau/Davos': 26089,
18 | 'Surselva': 21289,
19 | 'Viamala': 13783,
20 | }
21 |
22 | district_ids = {
23 | 'Albula': 1841,
24 | 'Bernina': 1842,
25 | 'Engiadina Bassa/Val Müstair': 1843,
26 | 'Imboden': 1844,
27 | 'Landquart': 1845,
28 | 'Maloja': 1846,
29 | 'Moesa': 1847,
30 | 'Plessur': 1848,
31 | 'Prättigau/Davos': 1849,
32 | 'Surselva': 1850,
33 | 'Viamala': 1851,
34 | }
35 |
36 |
37 | limit = '100'
38 | url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Pro_Region/FeatureServer/0/query?f=json&where=Datum%3E%3Dtimestamp%20%272020-02-01%2000%3A00%3A00%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Region%20asc&resultOffset=0&resultRecordCount=10000&resultType=standard&cacheHint=true'
39 |
40 |
41 | resp = requests.get(url=url)
42 | json_data = resp.json()
43 |
44 | for attributes in json_data['features']:
45 | element = attributes['attributes']
46 |
47 | if element['Region'] in district_ids:
48 | dd = sc.DistrictData(canton='GR', district=element['Region'])
49 | dd.url = url
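   |         # 'Datum' is a millisecond epoch timestamp, as returned by the ArcGIS feature service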
50 | date = datetime.datetime.utcfromtimestamp(element['Datum'] / 1000)
51 | dd.date = date.date().isoformat()
52 | dd.total_cases = element['Faelle__kumuliert_']
53 | dd.new_cases = element['Neue_Faelle']
54 | dd.total_deceased = element['Verstorbene__kumuliert_']
55 | dd.new_deceased = element['Verstorbene']
56 | dd.population = inhabitants[dd.district]
57 | dd.district_id = district_ids[dd.district]
58 | print(dd)
59 |
--------------------------------------------------------------------------------
/scrapers/populate_tests_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script creates a new sqlite database based on the CSV it receives as an argument
4 | # The sqlite database is used as an intermediate step to merge new data into existing CSVs
5 |
6 | import sqlite3
7 | import traceback
8 | import os
9 | import sys
10 | import db_common as dc
11 |
12 |
13 | __location__ = dc.get_location()
14 |
15 | try:
16 | # load the csv to sqlite db
17 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
18 | columns, to_db = dc.load_csv(sys.argv[1])
19 |
20 | # create db
21 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
22 | conn = sqlite3.connect(DATABASE_NAME)
23 | c = conn.cursor()
24 | c.execute('DROP TABLE IF EXISTS data')
25 | c.execute(
26 | '''
27 | CREATE TABLE IF NOT EXISTS data (
28 | canton text NOT NULL,
29 | start_date text NOT NULL,
30 | end_date text NOT NULL,
31 | week text NOT NULL,
32 | year text NOT NULL,
33 | positive_tests integer,
34 | negative_tests integer,
35 | total_tests integer,
36 | positivity_rate float,
37 | source text,
38 | pcr_positive_tests integer,
39 | pcr_negative_tests integer,
40 | pcr_total_tests integer,
41 | pcr_positivity_rate float,
42 | ag_positive_tests integer,
43 | ag_negative_tests integer,
44 | ag_total_tests integer,
45 | ag_positivity_rate float,
46 | UNIQUE(canton, start_date, end_date, week, year)
47 | )
48 | '''
49 | )
50 |
51 | # add entries
52 | query = dc.insert_db_query(columns)
53 | c.executemany(query, to_db)
54 | conn.commit()
55 | except Exception as e:
56 | print("Error: %s" % e, file=sys.stderr)
57 | print(traceback.format_exc(), file=sys.stderr)
58 | sys.exit(1)
59 | finally:
60 | conn.close()
61 |
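Both populate scripts rely on db_common.py, which is not reproduced in this listing. The sketch below only illustrates what its three helpers would have to do for the call sites above to work; it is inferred from usage and is not the repository's actual implementation:

import csv
import os
import sys


def get_location():
    # directory containing the running script (assumed behaviour)
    return os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(sys.argv[0])))


def load_csv(filename):
    # return the header row plus all data rows, matching the
    # `columns, to_db = dc.load_csv(...)` unpacking above
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        columns = next(reader)
        to_db = [tuple(row) for row in reader]
    return columns, to_db


def insert_db_query(columns):
    # build a parameterized INSERT suitable for cursor.executemany()
    placeholders = ', '.join('?' for _ in columns)
    return f"INSERT INTO data ({', '.join(columns)}) VALUES ({placeholders})"
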
--------------------------------------------------------------------------------
/scripts/add_new_columns.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script adds the newly introduced columns to a CSV file and prints the result to stdout
4 |
5 | import csv
6 | import sys
7 | import traceback
8 |
9 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
10 |
11 | try:
12 | filename = sys.argv[1]
13 | rows = []
14 | with open(filename, 'r') as f:
15 | dr = csv.DictReader(f)
16 | for r in dr:
17 | # copy the existing fields and add the new column
18 | data = {
19 | 'date': r['date'],
20 | 'time': r['time'],
21 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
22 | 'ncumul_tested': r['ncumul_tested'],
23 | 'ncumul_conf': r['ncumul_conf'],
24 | 'new_hosp': r['new_hosp'],
25 | 'current_hosp': r['current_hosp'],
26 | 'current_icu': r['current_icu'],
27 | 'current_vent': r['current_vent'],
28 | 'ncumul_released': r['ncumul_released'],
29 | 'ncumul_deceased': r['ncumul_deceased'],
30 | 'source': r['source'],
31 | 'current_isolated': r.get('current_isolated', ''),
32 | 'current_quarantined': r.get('current_quarantined', ''),
33 | 'current_quarantined_riskareatravel': r.get('current_quarantined_riskareatravel', ''), # new field
34 | }
35 | # re-add extra columns
36 | for col in dr.fieldnames[12:]:
37 | data[col] = r[col]
38 | rows.append(data)
39 |
40 | writer = csv.DictWriter(
41 | sys.stdout,
42 | rows[0].keys(),
43 | delimiter=',',
44 | quotechar='"',
45 | lineterminator='\n',
46 | quoting=csv.QUOTE_MINIMAL
47 | )
48 | writer.writeheader()
49 | writer.writerows(rows)
50 | except Exception as e:
51 | print("Error: %s" % e, file=sys.stderr)
52 | print(traceback.format_exc(), file=sys.stderr)
53 | sys.exit(1)
54 | finally:
55 | sys.stdout.flush()
56 |
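To make the DictReader/DictWriter round trip above concrete, the same add-a-column step on a tiny in-memory CSV (toy data only):

import csv
import io

src = io.StringIO(
    "date,abbreviation_canton_and_fl,ncumul_conf\n"
    "2020-11-01,ZH,1000\n"
    "2020-11-02,ZH,1050\n"
)

rows = []
for r in csv.DictReader(src):
    data = dict(r)
    # the new column is appended with an empty value when the input lacks it
    data['current_quarantined_riskareatravel'] = r.get('current_quarantined_riskareatravel', '')
    rows.append(data)

out = io.StringIO()
writer = csv.DictWriter(out, rows[0].keys(), lineterminator='\n')
writer.writeheader()
writer.writerows(rows)
print(out.getvalue())
# date,abbreviation_canton_and_fl,ncumul_conf,current_quarantined_riskareatravel
# 2020-11-01,ZH,1000,
# 2020-11-02,ZH,1050,
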
--------------------------------------------------------------------------------
/.github/workflows/validate-csv.yml:
--------------------------------------------------------------------------------
1 | name: Validate CSV
2 |
3 | on:
4 | schedule:
5 | - cron: '15 */4 * * *'
6 | workflow_dispatch: ~
7 | push:
8 | branches: [ master ]
9 | paths:
10 | - '**.csv'
11 | pull_request:
12 | branches: [ master ]
13 | paths:
14 | - '**.csv'
15 |
16 | jobs:
17 | validate:
18 | runs-on: ubuntu-20.04
19 | timeout-minutes: 10
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 |
24 | - name: Set up Python 3.7
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: 3.7
28 |
29 | - name: Install dependencies
30 | run: |
31 | npm ci
32 | python -m pip install --upgrade pip
33 | pip install -r requirements.txt
34 |
35 | - name: Validate structure and content of CSVs
36 | run: node scripts/validate-csv.js fallzahlen_kanton_total_csv_v2/*.csv
37 |
38 | - name: Check if there are empty lines
39 | run: scripts/check_for_empty_lines.sh fallzahlen_kanton_total_csv_v2/*.csv
40 |
41 | - name: Check for outliers in CSVs
42 | run: python scripts/check_for_outliers.py fallzahlen_kanton_total_csv_v2/*.csv
43 |
44 | - name: Get current unix timestamp
45 | if: always()
46 | id: date
47 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT
48 |
49 | # notify slack if a CSV validation failed
50 | - name: Notify slack failure
51 | if: ${{ failure() }}
52 | env:
53 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
54 | uses: pullreminders/slack-action@master
55 | with:
56 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Validate CSV\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: CSV validation failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
57 |
--------------------------------------------------------------------------------
/scrapers/scrape_tg_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import scrape_common as sc
5 |
6 | url = 'https://statistik.tg.ch/themen-und-daten/covid-19.html/10816'
7 | content = sc.download(url, silent=True)
8 |
9 | res = re.search(r".*name: '2020',\s+categories: \[\'(.*)\]\s+}", content)
10 | assert res, f'failed to extract 2020 weeks, got {res}'
11 | weeks_2020 = res[1].split(',')
12 |
13 | res = re.search(r".*name: '2021',\s+categories: \[\'(.*)\]\s+}", content)
14 | assert res, f'failed to extract 2021 weeks, got {res}'
15 | weeks_2021 = res[1].split(',')
16 |
17 | res = re.search(r".*name: '2022',\s+categories: \[\'(.*)\]\s+}", content)
18 | assert res, f'failed to extract 2022 weeks, got {res}'
19 | weeks_2022 = res[1].split(',')
20 |
21 | res = re.search(r".*name: '2023',\s+categories: \[\'(.*)\]\s+}", content)
22 | assert res, f'failed to extract 2023 weeks, got {res}'
23 | weeks_2023 = res[1].split(',')
24 |
25 | weeks = weeks_2020 + weeks_2021 + weeks_2022 + weeks_2023
26 | years = ['2020'] * len(weeks_2020) + ['2021'] * len(weeks_2021) + ['2022'] * len(weeks_2022) + ['2023'] * len(weeks_2023)
27 |
28 | res = re.search(r".*name: 'Anzahl negativer Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content)
29 | assert res, f'failed to extract negative tests, got {res}'
30 | negative_tests = res[1].split(',')
31 |
32 | res = re.search(r".*name: 'Anzahl positiver Tests.?',\s+color: '.*',\s+data: \[(.*)\],", content)
33 | assert res, f'failed to extract positive tests, got {res}'
34 | positive_tests = res[1].split(',')
35 |
36 | res = re.search(r".*name: 'Positivitätsrate',\s+color: '.*',\s+data: \[(.*)\],", content)
37 | assert res, f'failed to extract positivity rate, got {res}'
38 | positivity_rate = res[1].split(',')
39 |
40 | assert len(weeks) == len(negative_tests) == len(positive_tests) == len(positivity_rate), f'Expected same length for weeks {len(weeks)}, neg. tests {len(negative_tests)}, pos. tests {len(positive_tests)}, pos. rate {len(positivity_rate)}'
41 |
42 | for week, year, neg, pos, rate in zip(weeks, years, negative_tests, positive_tests, positivity_rate):
43 | td = sc.TestData(canton='TG', url=url)
44 | td.week = sc.find(r'KW (\d+)', week)
45 | td.year = year
46 | td.positive_tests = int(pos)
47 | td.negative_tests = int(neg)
48 | td.positivity_rate = float(rate)
49 | print(td)
50 |
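The patterns above pull week labels and data series out of a Highcharts configuration embedded in the page. A small self-contained illustration of the snippet shape they expect, and of why the loop later re-extracts the week number with r'KW (\d+)'; the snippet is synthetic, not actual content from statistik.tg.ch:

import re

snippet = """
        name: '2020',
        categories: ['KW 10','KW 11','KW 12']
      }
"""

res = re.search(r".*name: '2020',\s+categories: \[\'(.*)\]\s+}", snippet)
assert res
weeks_2020 = res[1].split(',')
print(weeks_2020)   # ["KW 10'", "'KW 11'", "'KW 12'"] - the quotes survive the split
print([re.search(r'KW (\d+)', w)[1] for w in weeks_2020])   # ['10', '11', '12']
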
--------------------------------------------------------------------------------
/scripts/remove_older_entries.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script removes (i.e. sets to an empty string) older entries from a CSV;
4 | # in this example it clears current_hosp for dates prior to 2020-05-19
5 |
6 | import csv
7 | import sys
8 | import traceback
9 | import datetime
10 |
11 | assert len(sys.argv) == 2, "Call script with CSV file as parameter"
12 |
13 | try:
14 | filename = sys.argv[1]
15 | rows = []
16 | with open(filename, 'r') as f:
17 | dr = csv.DictReader(f)
18 | for r in dr:
19 | # copy the existing fields and add the new columns
20 | data = {
21 | 'date': r['date'],
22 | 'time': r['time'],
23 | 'abbreviation_canton_and_fl': r['abbreviation_canton_and_fl'],
24 | 'ncumul_tested': r['ncumul_tested'],
25 | 'ncumul_conf': r['ncumul_conf'],
26 | 'new_hosp': r['new_hosp'],
27 | 'current_hosp': r['current_hosp'],
28 | 'current_icu': r['current_icu'],
29 | 'current_vent': r['current_vent'],
30 | 'ncumul_released': r['ncumul_released'],
31 | 'ncumul_deceased': r['ncumul_deceased'],
32 | 'source': r['source'],
33 | 'current_isolated': r.get('current_isolated', ''),
34 | 'current_quarantined': r.get('current_quarantined', ''),
35 | 'current_quarantined_riskareatravel': r.get('current_quarantined_riskareatravel', ''), # new field
36 | 'current_quarantined_total': r.get('current_quarantined_total', ''), # new field
37 | }
38 | if datetime.datetime.strptime(data['date'], '%Y-%m-%d') < datetime.datetime(2020, 5, 19):
39 | data['current_hosp'] = ''
40 | # re-add extra columns
41 | for col in dr.fieldnames[12:]:
42 | data[col] = r[col]
43 | rows.append(data)
44 |
45 | writer = csv.DictWriter(
46 | sys.stdout,
47 | rows[0].keys(),
48 | delimiter=',',
49 | quotechar='"',
50 | lineterminator='\n',
51 | quoting=csv.QUOTE_MINIMAL
52 | )
53 | writer.writeheader()
54 | writer.writerows(rows)
55 | except Exception as e:
56 | print("Error: %s" % e, file=sys.stderr)
57 | print(traceback.format_exc(), file=sys.stderr)
58 | sys.exit(1)
59 | finally:
60 | sys.stdout.flush()
61 |
--------------------------------------------------------------------------------
/scrapers/scrape_ti.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import re
6 | import datetime
7 | import scrape_common as sc
8 |
9 | # get pdf and xlsx URL from covid19 page of TI
10 | main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/'
11 | d = sc.download(main_url, silent=True)
12 | soup = BeautifulSoup(d, 'html.parser')
13 |
14 | is_first = True
15 |
16 | """
17 | container = soup.find('h2', string=re.compile(r'Isolamento e quarantena')).find_next('div')
18 | for item in container.find_all('div'):
19 | divs = item.find_all('div')
20 | if len(divs) == 3:
21 | dd = sc.DayData(canton='TI', url=main_url)
22 | dd.datetime = sc.find(r'.*?(\d+\.\d+\.\d{2})', divs[2].string)
23 | if sc.find(r'.*(quarantena)', divs[1].string):
24 | dd.quarantined = divs[0].string
25 | if sc.find(r'.*(isolamento)', divs[1].string):
26 | dd.isolated = divs[0].string
27 | if dd:
28 | if not is_first:
29 | print('-' * 10)
30 | is_first = False
31 | print(dd)
32 | """
33 |
34 | xls_url = soup.find(href=re.compile("\.xlsx$")).get('href')
35 | assert xls_url, "URL is empty"
36 |
37 | if not xls_url.startswith('http'):
38 | xls_url = f'https://www4.ti.ch/{xls_url}'
39 |
40 | xls = sc.xlsdownload(xls_url, silent=True)
41 | rows = sc.parse_xls(xls, header_row=0)
42 | prev_date = None
43 | for row in rows:
44 | if row is None:
45 | continue
46 | if 'Data' not in row:
47 | continue
48 | if row['Data'] is None:
49 | continue
50 |
51 | if not is_first:
52 | print('-' * 10)
53 | is_first = False
54 |
55 | dd = sc.DayData(canton='TI', url=xls_url)
56 | dd.datetime = f"{row['Data'].date().isoformat()}"
57 | if dd.datetime == "2023-08-09" and prev_date == "2023-03-08":
58 | dd.datetime = "2023-03-09"
59 | prev_date = dd.datetime
60 | if row.get('Ora'):
61 | dd.datetime += f"T{row['Ora'].time().isoformat()}"
62 | dd.cases = row['Totale casi confermati']
63 | dd.hospitalized = row['Totale giornaliero pazienti ricoverati']
64 | dd.icu = row['Totale giornaliero pazienti cure intense']
65 | dd.vent = row['Totale giornaliero pazienti ventilati']
66 | dd.recovered = row['Totale pazienti dimessi da ospedali']
67 | dd.deaths = row['Totale decessi']
68 | print(dd)
69 |
--------------------------------------------------------------------------------
/scrapers/scrape_sz.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import sys
6 | import datetime
7 | from bs4 import BeautifulSoup
8 | import scrape_common as sc
9 |
10 | url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948'
11 | d = sc.download(url, silent=True)
12 | soup = BeautifulSoup(d, 'html.parser')
13 |
14 | is_first = True
15 |
16 | """
17 | Disabled for now; the PDFs from October 2020 contained hospitalized and quarantined data
18 |
19 | pdfs = soup.find_all('a', string=re.compile(r'Medienmitteilung vom'))
20 | for pdf in pdfs:
21 | pdf_url = pdf['href']
22 | pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True)
23 | date = sc.find(r'Stand:\s(\d+\.\s.*\s20\d{2})', pdf_content)
24 | res = re.search(r'.*\s+(?P<hosp>\d+)\s+\d+\s+\d+\s+(?P<iso>\d+)\s+(?P<quar>\d+)\s+(?P<qtravel>\d+)\s+', pdf_content)
25 | if not date or not res:
26 | continue
27 |
28 | if not is_first:
29 | print('-' * 10)
30 | is_first = False
31 | dd = sc.DayData(canton='SZ', url=pdf_url)
32 | dd.datetime = date.replace('\n', ' ')
33 | dd.isolated = res['iso']
34 | dd.hospitalized = res['hosp']
35 | dd.quarantined = res['quar']
36 | dd.quarantine_riskareatravel = res['qtravel']
37 | print(dd)
38 | is_first = False
39 | """
40 |
41 | try:
42 | xls_url = soup.find('a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))['href']
43 | except TypeError:
44 | print("Unable to determine xls url", file=sys.stderr)
45 | sys.exit(1)
46 | xls = sc.xlsdownload(xls_url, silent=True)
47 |
48 | rows = sc.parse_xls(xls)
49 | for row in rows:
50 | if not isinstance(row['Datum'], datetime.datetime):
51 | continue
52 |
53 | if not is_first:
54 | print('-' * 10)
55 | is_first = False
56 |
57 | # TODO: remove when source is fixed
58 | # handle wrong value on 2020-03-25, see issue #631
59 | if row['Datum'].date().isoformat() == '2020-03-25':
60 | row['Bestätigte Fälle (kumuliert)'] = ''
61 |
62 | dd = sc.DayData(canton='SZ', url=url)
63 | dd.datetime = row['Datum'].date().isoformat()
64 | if row['Zeit']:
65 | dd.datetime += ' ' + row['Zeit'].time().isoformat()
66 | dd.cases = row['Bestätigte Fälle (kumuliert)']
67 | dd.deaths = row['Todesfälle (kumuliert)']
68 | dd.recovered = row['Genesene (kumuliert)']
69 | print(dd)
70 |
--------------------------------------------------------------------------------
/scrapers/certificate.pem:
--------------------------------------------------------------------------------
1 | # SwissSign EV Gold CA 2014 - G22
2 | -----BEGIN CERTIFICATE-----
3 | MIIGuTCCBKGgAwIBAgIQAIEIODzAB3XEDG1za+MwizANBgkqhkiG9w0BAQsFADBF
4 | MQswCQYDVQQGEwJDSDEVMBMGA1UEChMMU3dpc3NTaWduIEFHMR8wHQYDVQQDExZT
5 | d2lzc1NpZ24gR29sZCBDQSAtIEcyMB4XDTE0MDkxNTE2MTYzN1oXDTM1MDMwNDE2
6 | MTYzN1owTjELMAkGA1UEBhMCQ0gxFTATBgNVBAoTDFN3aXNzU2lnbiBBRzEoMCYG
7 | A1UEAxMfU3dpc3NTaWduIEVWIEdvbGQgQ0EgMjAxNCAtIEcyMjCCASIwDQYJKoZI
8 | hvcNAQEBBQADggEPADCCAQoCggEBAL+MVu10kh055MUIkpRaC7sfiuFQ4gAYFv4B
9 | 5LfsK6NSpTaJybYvrA/lr0JBE/xTsQl3Jrka60FgprSh9pXgE94UVoE2Qb4LiHEo
10 | AIYyBQY0aA3nL9GEkT436uXs0tV2Veg6+6CgGRzgaoQtDu3hXWV5GOyNOAtlmzR4
11 | md1JH6oFap9d3kVwJLExUI930Cwjzwt0XAcvjy8+fLheBanG5VFGnRrntRSWiRzY
12 | QIjjAkBDTi+lj552h9aKzFvFEQ5NSiBmrGVk2wIlrh+AZe8NYnXrRBzv0Z5SODD4
13 | jxyPkTAX7f9zkJ9s0yMVEmalWnfwXn4K4Rz3x7fmWeyxipUOhSkCAwEAAaOCApow
14 | ggKWMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/AgEAMB0GA1UdDgQW
15 | BBTu/UbK9ydekbxatueHzQr6VQomQjAfBgNVHSMEGDAWgBRbJXuWpGVRfrg588B4
16 | Zl7oOufw7jCB/wYDVR0fBIH3MIH0MEegRaBDhkFodHRwOi8vY3JsLnN3aXNzc2ln
17 | bi5uZXQvNUIyNTdCOTZBNDY1NTE3RUI4MzlGM0MwNzg2NjVFRTgzQUU3RjBFRTCB
18 | qKCBpaCBooaBn2xkYXA6Ly9kaXJlY3Rvcnkuc3dpc3NzaWduLm5ldC9DTj01QjI1
19 | N0I5NkE0NjU1MTdFQjgzOUYzQzA3ODY2NUVFODNBRTdGMEVFJTJDTz1Td2lzc1Np
20 | Z24lMkNDPUNIP2NlcnRpZmljYXRlUmV2b2NhdGlvbkxpc3Q/YmFzZT9vYmplY3RD
21 | bGFzcz1jUkxEaXN0cmlidXRpb25Qb2ludDBaBgNVHSAEUzBRME8GBFUdIAAwRzBF
22 | BggrBgEFBQcCARY5aHR0cDovL3JlcG9zaXRvcnkuc3dpc3NzaWduLmNvbS9Td2lz
23 | c1NpZ24tR29sZC1DUC1DUFMucGRmMIHRBggrBgEFBQcBAQSBxDCBwTBkBggrBgEF
24 | BQcwAoZYaHR0cDovL3N3aXNzc2lnbi5uZXQvY2dpLWJpbi9hdXRob3JpdHkvZG93
25 | bmxvYWQvNUIyNTdCOTZBNDY1NTE3RUI4MzlGM0MwNzg2NjVFRTgzQUU3RjBFRTBZ
26 | BggrBgEFBQcwAYZNaHR0cDovL2dvbGQtZXYtZzIub2NzcC5zd2lzc3NpZ24ubmV0
27 | LzVCMjU3Qjk2QTQ2NTUxN0VCODM5RjNDMDc4NjY1RUU4M0FFN0YwRUUwDQYJKoZI
28 | hvcNAQELBQADggIBACVxhUgwnsFZgEmC50cCMExcmvY9OQkPxcQbMMFCYvfvBFNz
29 | 65iu0MkXTo0jhaIe8wOOsv230q/zYJbTZOGbMpvUg5MRRIK9DCq3bDwAqN9bIjFw
30 | wK1bODt260m9+4gLxJJdt2MH5LAglQ2J0123+RodYxvv3b+5k6/DZ19dJUgXrjbD
31 | +0PWuO5+5DRangp3VELIRWjHAAnpmq3guORiLuVDS+PoinFp/CKEFRhgWIhp6sZd
32 | yA/9egO+ZH+U7KzLaMuYRNHfJr2UrgQUEufsOM0WUqQXS8RzO7ZGW/argfyc4NdS
33 | CivO97xZBroON0XaLOlTAAbubomhzz/K/Uv2S5T+I/AfYWCme7Vx/KyeA9if/eLA
34 | jQNn5lIb1cXhompM2M+kLAGjNhdpQvUSkjAhKOkzoeezJEN+RXU4P5tOJxw03LtJ
35 | VxmdQxQwgXOR0rBZT+9aFJSX1nIj7zWRnMwFu5w+gBaX1/5MuLP/ThJCckoVgb0o
36 | nbFLRn6siH6dNE+gZ5VgiMWeDOkwlR1UMWGMNwoKNExoTKYwKnpuMfv4q7Fx4uI9
37 | qVzGTL6yfW8+SRdxVFQa6K9hekBr2kZyAKBCqz+jpQq1EPCcvn4HiNx81Na++iqe
38 | K+d2mfZxdEuAwFoZIcyk1aTWHHT1Cqzys00wlukvSmnXUBbGU5Vpwzjlj3N4
39 | -----END CERTIFICATE-----
40 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import datetime
4 | import scrape_common as sc
5 | import scrape_ag_common as sac
6 |
7 |
8 | xls_url = sac.get_ag_xls_url()
9 | xls = sc.xlsdownload(xls_url, silent=True)
10 | is_first = True
11 |
12 | # quarantine_riskareatravel
13 | """
14 | rows = sc.parse_xls(xls, sheet_name='5. Quarantäne nach Einreise', header_row=2)
15 | for row in rows:
16 | if not isinstance(row['A'], datetime.datetime):
17 | continue
18 |
19 |
20 | dd = sc.DayData(canton='AG', url=xls_url)
21 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
22 | dd.quarantine_riskareatravel = row['Gesamtzahl aktuell betreuter Personen']
23 | if dd:
24 | if not is_first:
25 | print('-' * 10)
26 | is_first = False
27 | print(dd)
28 | """
29 |
30 | # quarantine + isolation
31 | rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2)
32 | for row in rows:
33 | if not isinstance(row['A'], datetime.datetime):
34 | continue
35 |
36 | dd = sc.DayData(canton='AG', url=xls_url)
37 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
38 | isolated = row['Gesamtzahl aktuell betreuter Personen']
39 | if sc.represents_int(isolated):
40 | dd.isolated = isolated
41 | #dd.quarantined = row['Gesamtzahl aktuell betreuter Personen5']
42 | if dd:
43 | if not is_first:
44 | print('-' * 10)
45 | is_first = False
46 | print(dd)
47 |
48 | # cases + hospitalization
49 | rows = sc.parse_xls(xls, sheet_name='1. Covid-19-Daten', header_row=2)
50 | for row in rows:
51 | if not isinstance(row['A'], datetime.datetime):
52 | continue
53 |
54 | dd = sc.DayData(canton='AG', url=xls_url)
55 | dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
56 | if 'Gesamtzahl' in row:
57 | dd.cases = row['Gesamtzahl']
58 |
59 | non_icu = row['Bestätigte Fälle Bettenstation (ohne IPS/IMC)']
60 | icu = row['Bestätigte Fälle Intensivpflegestation (IPS)']
61 | icf = row['Bestätigte Fälle Intermediate Care (IMC)']
62 | if sc.represents_int(non_icu) and sc.represents_int(icu) and sc.represents_int(icf):
63 | dd.hospitalized = int(non_icu) + int(icu) + int(icf)
64 | dd.icu = icu
65 | dd.icf = icf
66 | if 'Gesamtzahl21' in row:
67 | dd.deaths = row['Gesamtzahl21']
68 | if 'Gesamtzahl25' in row:
69 | dd.recovered = row['Gesamtzahl25']
70 |
71 | if dd:
72 | if not is_first:
73 | print('-' * 10)
74 | is_first = False
75 | print(dd)
76 |
--------------------------------------------------------------------------------
/scrapers/scrape_vd_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | import scrape_common as sc
7 | import scrape_vd_common as svc
8 |
9 |
10 | pdf_urls = svc.get_all_weekly_pdf_urls()
11 | for pdf_url in pdf_urls:
12 | pdf = sc.pdfdownload(pdf_url, silent=True, page=1)
13 | pdf = re.sub(r'(\d+)\'(\d+)', r'\1\2', pdf)
14 | pdf = re.sub(r'(\d+)’(\d+)', r'\1\2', pdf)
15 | pdf = re.sub(r'(\d)er', r'\1', pdf)
16 |
17 | td = sc.TestData(canton='VD', url=pdf_url)
18 |
19 | year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
20 | date = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+20\d{2})', pdf)
21 | if date is None:
22 | date = sc.find(r'Point .pid.miologique au (\d+\.\d+\.20\d{2})', pdf)
23 | res = re.search(r'Entre\s+(?:et\s+)?le\s+(?P<start>\d+\s+\w+)\s+et\s+le\s+(?P<end>\d+\s+\w+)(?:\s+\d{4})?,', pdf, flags=re.I|re.UNICODE)
24 | res_with_year = re.search(r'Entre\s+le\s+(?P<start>\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),', pdf, flags=re.I|re.UNICODE)
25 | res_no_month = re.search(r'Entre\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+),', pdf, flags=re.I|re.UNICODE)
26 | res_no_month_with_year = re.search(r'Entre(?:\s+et)?\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),', pdf, flags=re.I|re.UNICODE)
27 |
28 | if res:
29 | start_date = sc.date_from_text(f"{res['start']} {year}")
30 | end_date = sc.date_from_text(f"{res['end']} {year}")
31 | elif res_with_year:
32 | start_date = sc.date_from_text(res_with_year['start'])
33 | end_date = sc.date_from_text(res_with_year['end'])
34 | elif res_no_month:
35 | end_date = sc.date_from_text(f"{res_no_month['end']} {year}")
36 | start_date = sc.date_from_text(f"{res_no_month['start']}.{end_date.month}.{year}")
37 | elif res_no_month_with_year:
38 | end_date = sc.date_from_text(res_no_month_with_year['end'])
39 | start_date = sc.date_from_text(f"{res_no_month_with_year['start']}.{end_date.month}.{end_date.year}")
40 | elif date:
41 | end_date = sc.date_from_text(date)
42 | start_date = end_date - datetime.timedelta(days=6)
43 |
44 | assert start_date and end_date, f'failed to extract start and end dates from {pdf_url}'
45 | td.start_date = start_date
46 | td.end_date = end_date
47 |
48 | res = re.search(r'une\s+moyenne\s+de\s+(\d+)\s+frottis\s+SARS-CoV(-)?2', pdf)
49 | if res:
50 | days = (end_date - start_date).days
51 | td.total_tests = days * int(res[1])
52 |
53 | res = re.search(r'dont\s+(\d+\.?\d?)\s?%\s+étaient\s+positifs', pdf)
54 | if res:
55 | td.positivity_rate = res[1]
56 |
57 | if td:
58 | print(td)
59 |
--------------------------------------------------------------------------------
/scrapers/scrape_so.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 | import scrape_so_common as soc
8 |
9 |
10 | base_url = 'https://corona.so.ch'
11 | pdf_url = soc.get_latest_weekly_pdf_url()
12 | content = sc.pdfdownload(pdf_url, layout=True, silent=True, page=1)
13 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
14 |
15 | """
16 | Hospitalisationen im Kanton Anzahl Personen in Isolation davon Kontakte in Quarantäne Anzahl zusätzlicher Personen in Quarantäne nach Rückkehr aus Risikoland Re- Wert***
17 | 6 (6) 120 (71) 280 (189) 388 (280) 1.46 (1.1)
18 | """
19 |
20 | rows = []
21 |
22 | date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content)
23 | number_of_tests = sc.find(r'Gem\s?eldete\s+Tes\s?ts\s+\(Total\)\*+?\s+(\d+)\s', content, flags=re.DOTALL)
24 | res = re.search(r'Hospitalisationen im Kanton.*\d+ \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+', content, re.DOTALL)
25 | if res is not None:
26 | data = sc.DayData(canton='SO', url=pdf_url)
27 | data.datetime = date
28 | data.tested = number_of_tests
29 | data.isolated = soc.strip_value(res[1])
30 | data.quarantined = soc.strip_value(res[2])
31 | data.quarantine_riskareatravel = soc.strip_value(res[3])
32 | rows.append(data)
33 |
34 |
35 | # scrape the main page as well
36 | url = "https://corona.so.ch/bevoelkerung/daten/"
37 | d = sc.download(url, silent=True)
38 | soup = BeautifulSoup(d, 'html.parser')
39 | title = soup.find('h3', text=re.compile("Stand"))
40 | data = sc.DayData(canton='SO', url=url)
41 | data.datetime = sc.find(r'Stand\s*(\d+\.\d+\.\d{4})\s*', title.string)
42 | table = title.find_next('table')
43 | for table_row in table.find_all('tr'):
44 | title = table_row.find_all('th')
45 | items = table_row.find_all('td')
46 | if len(items) == 0:
47 | continue
48 | name = title[0].text
49 | value = items[0].text.replace("'", "")
50 | if sc.find(r'(Laborbestätigte Infektionen).*?:', name):
51 | data.cases = value
52 | continue
53 | if name == 'Verstorbene Personen (kumuliert seit 06.03.2020):':
54 | data.deaths = value
55 | continue
56 | if name == 'Im Kanton hospitalisierte Covid-19-positive Patientinnen und Patienten:':
57 | data.hospitalized = value
58 | continue
59 | if name.strip() == 'Davon befinden sich auf Intensivstationen:':
60 | data.icu = value
61 | continue
62 | if data:
63 | rows.append(data)
64 |
65 |
66 | is_first = True
67 | # print a separator before every row except the first
68 | for row in rows:
69 | if not is_first:
70 | print('-' * 10)
71 | is_first = False
72 | print(row)
73 |
--------------------------------------------------------------------------------
/scrapers/scrape_fl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scrape_common as sc
4 | import sys
5 | import re
6 | import datetime
7 | from bs4 import BeautifulSoup
8 |
9 |
10 | # get the daily bulletin
11 | base_url = 'https://www.regierung.li'
12 | d = sc.download(base_url, silent=True)
13 | soup = BeautifulSoup(d, 'html.parser')
14 |
15 | is_first = True
16 | bulletin = soup.find('h1', text=re.compile(r'COVID-19: Situationsbericht.*'))
17 | if bulletin:
18 | bulletin = bulletin.find_next('a')
19 | if bulletin:
20 | url = f"{base_url}{bulletin.get('href')}"
21 | bulletin_d = sc.download(url, silent=True)
22 | bulletin_soup = BeautifulSoup(bulletin_d, 'html.parser')
23 |
24 | dd = sc.DayData(canton='FL', url=url)
25 |
26 | title = bulletin_soup.find('h1', text=re.compile(r'.*Situationsbericht.*'))
27 | dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', title.text)
28 |
29 | content = title.find_next('div').text
30 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
31 |
32 | dd.cases = sc.find(r"insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle", content)
33 | dd.deaths = sc.find(r'(Damit\s+traten\s+)?(?:bisher|bislang)\s+(traten\s+)?(?P\d+)\s+(Todesfall|Todesfälle)', content, flags=re.I, group='death')
34 |
35 | if re.search(r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen', content):
36 | dd.recovered = int(dd.cases) - int(dd.deaths)
37 |
38 | m = re.search(r'(\S+)\s+Erkrankte\s+sind\s+derzeit\s+hospitalisiert', content)
39 | if m:
40 | dd.hospitalized = sc.int_or_word(m[1].lower())
41 |
42 | m = re.search(r'Gegenwärtig\s+befinden\s+sich\s+(\w+)\s+enge\s+Kontaktpersonen\s+in\s+Quarantäne.', content)
43 | if m:
44 | dd.quarantined = sc.int_or_word(m[1])
45 |
46 | if dd:
47 | if not is_first:
48 | print('-' * 10)
49 | print(dd)
50 | is_first = False
51 |
52 |
53 | # get the data from XLS file containing full history
54 | history_url='https://www.llv.li/files/ag/aktuelle-fallzahlen.xlsx'
55 | xls = sc.xlsdownload(history_url, silent=True)
56 | rows = sc.parse_xls(xls, header_row=3)
57 | for row in rows:
58 | dd_full_list = sc.DayData(canton='FL', url=history_url)
59 | if isinstance(row['Datenstand'], datetime.datetime):
60 | dd_full_list.datetime = row['Datenstand']
61 | else:
62 | dd_full_list.datetime = str(row['Datenstand']).replace(':', '.')
63 |
64 | dd_full_list.cases = str(row['Anzahl pos. Fälle kumuliert']).replace("'","")
65 | dd_full_list.recovered = row['Genesene kumuliert']
66 | dd_full_list.hospitalized = row['Hospitalisierte Personen*']
67 | dd_full_list.deaths = row['Todesfälle kumuliert']
68 | if dd_full_list:
69 | if not is_first:
70 | print('-' * 10)
71 | is_first = False
72 | print(dd_full_list)
73 |
--------------------------------------------------------------------------------
/scrapers/scrape_ow.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import datetime
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | base_url = 'https://www.ow.ch'
10 | url = f'{base_url}/de/verwaltung/dienstleistungen/?dienst_id=5962'
11 | """
12 | d = sc.download(url, silent=True, encoding='windows-1252')
13 | d = d.replace(' ', ' ')
14 | soup = BeautifulSoup(d, 'html.parser')
15 |
16 | dd = sc.DayData(canton='OW', url=url)
17 | date = sc.find(r'Stand (\d+\.\s+\w+\s+20\d{2})', d)
18 | time = sc.find(r'Stand .*,\s?([\d\.:]+).*Uhr', d)
19 | dd.datetime = f'{date}, {time} Uhr'
20 | dd.isolated = soup.find(text=re.compile(r'In Isolation \(aktuell\)')).find_next('td').string
21 | dd.quarantined = soup.find(text=re.compile(r'In Quarant.ne \(aktuell\)')).find_next('td').string
22 | dd.quarantine_riskareatravel = soup.find(text=re.compile(r'Reiser.ckkehrer in Quarant.ne')).find_next('td').string
23 |
24 | is_first = True
25 | if dd:
26 | print(dd)
27 | is_first = False
28 | """
29 |
30 | is_first = True
31 |
32 |
33 | d = sc.download(f'{base_url}/de/kanton/publired/publikationen/?action=info&pubid=20318',
34 | encoding='windows-1252', silent=True)
35 | soup = BeautifulSoup(d, 'html.parser')
36 | xls_url = soup.find('a', string=re.compile("Download")).get('href')
37 | assert xls_url, "URL is empty"
38 | xls_url = f'{base_url}{xls_url}'
39 |
40 | for row in soup.find_all('dl'):
41 | cells = row.find_all('dd')
42 | if cells[0].string:
43 | file_date = cells[0].string
44 |
45 | xls = sc.xlsdownload(xls_url, silent=True)
46 | rows = sc.parse_xls(xls, header_row=4)
47 | for row in rows:
48 | if isinstance(row['A'], datetime.datetime):
49 | dd = sc.DayData(canton='OW', url=url)
50 | dd.datetime = row['A']
51 | data_found = False
52 | if isinstance(row['Infizierte Personen (kumuliert)'], int) and row['Infizierte Personen (kumuliert)'] > 0:
53 | dd.cases = row['Infizierte Personen (kumuliert)']
54 | data_found = True
55 | hosp_key = """Hospitalisierte Personen im KSOW /
56 | Eintritte Covid-Station; Alle Einwohner OW alle Spitäler CH***"""
57 | if isinstance(row[hosp_key], int):
58 | dd.hospitalized = row[hosp_key]
59 | if isinstance(row['Gestorbene Personen (kumuliert)'], int):
60 | dd.deaths = row['Gestorbene Personen (kumuliert)']
61 | if isinstance(row['Isolation'], int):
62 | dd.isolated = row['Isolation']
63 | if isinstance(row['Quarantäne'], int):
64 | dd.quarantined = row['Quarantäne']
65 | if isinstance(row['Quarantäne Reiserückkehrer'], int):
66 | dd.quarantine_riskareatravel = row['Quarantäne Reiserückkehrer']
67 | if data_found:
68 | if not is_first:
69 | print('-' * 10)
70 | else:
71 | is_first = False
72 | print(dd)
73 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 |
9 |
10 | def strip_value(value):
11 | if value:
12 | return re.sub(r'[^0-9]', '', value)
13 | return None
14 |
15 |
16 | base_url = 'https://www.vs.ch'
17 | url = f'{base_url}/web/coronavirus/statistiques'
18 | content = sc.download(url, silent=True)
19 | soup = BeautifulSoup(content, 'html.parser')
20 | pdf_url = soup.find('a', string=re.compile(r'20\d{2}.*Sit Epid.*')).get('href')
21 | pdf_url = f'{base_url}{pdf_url}'
22 |
23 | content = sc.pdfdownload(pdf_url, silent=True, layout=True, page=1)
24 |
25 | dd = sc.DayData(canton='VS', url=pdf_url)
26 | dd.datetime = sc.find(r'(\d{2}/\d{2}/20\d{2})', content)
27 | dd.datetime = re.sub(r'/', '.', dd.datetime)
28 | dd.cases = strip_value(sc.find(r'.*Cumul cas positifs.*\s+(\d+.\d+)\s+', content))
29 | dd.deaths = strip_value(sc.find(r'.*Cumul d.c.s.*\s+(\d+.\d+)\s+', content))
30 | dd.hospitalized = strip_value(sc.find(r'.*Hospitalisations en cours de cas COVID-19.*\s+(\d+)\s+', content))
31 | dd.icu = strip_value(sc.find(r'.*SI en cours.*\s+(\d+)\s+', content))
32 | dd.vent = strip_value(sc.find(r'.*Intubation en cours.*\s+(\d+)\s+', content))
33 |
34 | is_first = True
35 | if dd:
36 | is_first = False
37 | print(dd)
38 |
39 |
40 | xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20COVID-19%20Valais.xlsx'
41 | main_url = 'https://www.vs.ch/de/web/coronavirus'
42 | xls = sc.xlsdownload(xls_url, silent=True)
43 | rows = sc.parse_xls(xls, header_row=1)
44 | for i, row in enumerate(rows):
45 | if not isinstance(row['Date'], datetime.datetime):
46 | continue
47 | if not sc.represents_int(row['Cumul cas positifs']):
48 | continue
49 | if row['Nb nouveaux cas positifs'] is None and row["Nb nouvelles admissions à l'hôpital"] is None:
50 | continue
51 |
52 | dd = sc.DayData(canton='VS', url=main_url)
53 | dd.datetime = row['Date'].date().isoformat()
54 | dd.cases = row['Cumul cas positifs']
55 | dd.hospitalized = row['Total hospitalisations COVID-19']
56 | dd.new_hosp = row['Nb nouvelles admissions à l\'hôpital']
57 | dd.icu = row['Patients COVID-19 aux SI total (y.c. intubés)']
58 | dd.vent = row['Patients COVID-19 intubés']
59 | dd.deaths = row['Cumul décès COVID-19']
60 | # Since 2020-10-19 VS no longer publishes data on isolation/quarantine
61 | #dd.isolated = row['Nombre de cas en cours d\'isolement']
62 | #dd.quarantined = row['Nombre de contacts en cours de quarantaine']
63 | #dd.quarantine_riskareatravel = row['Nombre de voyageurs en cours de quarantaine']
64 |
65 | if row['Nb de nouvelles sorties'] is not None:
66 | dd.recovered = sum(r['Nb de nouvelles sorties'] for r in rows[:i+1])
67 | if not is_first:
68 | print('-' * 10)
69 | is_first = False
70 | print(dd)
71 |
--------------------------------------------------------------------------------
/scrapers/scrape_sh_common.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import json
5 | import re
6 | from bs4 import BeautifulSoup
7 | import scrape_common as sc
8 |
9 |
10 | def get_sh_url_from_json(url):
11 | m = sc.jsondownload(url, silent=True)
12 |
13 | # 2020-04-24
14 | """
15 | {
16 | data_filetype: "xlsx",
17 | data_shareInAreaPage: "[]",
18 | data_kachellabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
19 | data_areaPage_repositoryid: "3275",
20 | data_custom_author: "Gesundheitsamt Kanton Schaffhausen",
21 | data_tagarea: "[]",
22 | data_shareInDomain: "[]",
23 | data_zielgruppen: "",
24 | data_publication_date: "23.04.2020",
25 | data_idpath: "/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465",
26 | data_custom_publication_date_date: "23.04.2020",
27 | data_shareArticleProfileId: "",
28 | data_file_name: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
29 | data_author: "MWETT",
30 | data_file_copyrights: "",
31 | data_custom_publication_timed: "[]",
32 | data_published: "published",
33 | data_addmodules: "",
34 | data_listlabel: "Fallzahlen Corona Kanton Schaffhausen.xlsx",
35 | data_tags: "",
36 | data_widget_data: "[]",
37 | data_filemeta: "{"uploaded":1,"fileName":"d4ffb019-a2ef-4782-87be-0aafb4b43558","key":"TEMPUPLOADFILES","url":"/CMS/get/file/d4ffb019-a2ef-4782-87be-0aafb4b43558","originalname":"Fallzahlen Corona Kanton Schaffhausen.xlsx","fileid":"d4ffb019-a2ef-4782-87be-0aafb4b43558","category":"null","title":"null","filesize":12286}",
38 | data_shareInGlobal: "[]",
39 | data_verbande: "",
40 | data_file_description: "",
41 | data_custom_publication_date_time: "09:31",
42 | data_galleries: "[]",
43 | data_sharepaths: "",
44 | data_permalink: "/Webseite/Kanton-Schaffhausen/Beh-rde/Verwaltung/Departement-des-Innern/Gesundheitsamt-3666465-DE.html",
45 | data_schlagworte: "",
46 | data_approvedpaths: "["/1752/8540/1753/1765/1755/1763/2733/2747/3275/3666465"]",
47 | contentid: "3666465",
48 | domainid: "1753",
49 | contenttypeid: "101",
50 | transactiontime: "23.04 09:09",
51 | author: "dande",
52 | language: "DE",
53 | activated_languages: [
54 | "DE"
55 | ],
56 | sliderimages: [ ],
57 | genericimages: { }
58 | }
59 | """
60 |
61 | meta = json.loads(m['data_filemeta'])
62 | url = f"https://sh.ch{meta['url']}"
63 | return url
64 |
65 | def get_sh_xlsx():
66 | main_url = 'https://coviddashboard.sh.ch/'
67 | content = sc.download(main_url, silent=True)
68 | soup = BeautifulSoup(content, 'html.parser')
69 | link = soup.find('a', href=re.compile(r'.*\.xlsx'))
70 | xls = sc.xlsdownload(link.get('href'), silent=True)
71 | return main_url, xls
72 |
--------------------------------------------------------------------------------
/scrapers/scrape_fr_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import csv
5 | from io import StringIO
6 | import re
7 | from bs4 import BeautifulSoup
8 | import scrape_common as sc
9 | from scrape_fr_common import get_fr_csv
10 |
11 | inhabitants = {
12 | 'Broye': 32894,
13 | 'Glane': 24337,
14 | 'Greyerz': 55726,
15 | 'Saane': 106136,
16 | 'See': 36800,
17 | 'Sense': 43990,
18 | 'Vivisbach': 18831,
19 | }
20 |
21 | district_ids = {
22 | 'Broye': 1001,
23 | 'Glane': 1002,
24 | 'Greyerz': 1003,
25 | 'Saane': 1004,
26 | 'See': 1005,
27 | 'Sense': 1006,
28 | 'Vivisbach': 1007,
29 | }
30 |
31 | district_xls = {
32 | 'Broye': 'Broye',
33 | 'Glane': 'Gl.ne',
34 | 'Greyerz': 'Gruy.re',
35 | 'Saane': 'Sarine',
36 | 'See': 'Lac',
37 | 'Sense': 'Singine',
38 | 'Vivisbach': 'Veveyse',
39 | }
40 |
41 | # weekly data
42 | url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton'
43 | """
44 | d = sc.download(url, silent=True)
45 | d = d.replace(' ', ' ')
46 |
47 | soup = BeautifulSoup(d, 'html.parser')
48 | table = soup.find(string=re.compile(r'Anzahl positive F.lle nach Bezirk')).find_next('table')
49 |
50 | weeks = []
51 | years = []
52 | week_regex = re.compile(r'Woche \d+')
53 | trs = table.find_all('tr')
54 | for header in trs[0]:
55 | week = sc.find(r'Woche (\d+)', header.string)
56 | if week is not None:
57 | weeks.append(week)
58 | years.append('2021')
59 |
60 | for tr in trs[1:]:
61 | tds = tr.find_all('td')
62 |
63 | for i in range(len(weeks)):
64 | district = tds[0].string
65 | if district in inhabitants:
66 | dd = sc.DistrictData(canton='FR', district=district)
67 | dd.url = url
68 | dd.week = weeks[i]
69 | # TODO restore once all weeks are in 2021
70 | # dd.year = '20' + year
71 | dd.year = years[i]
72 | dd.new_cases = tds[i + 1].string
73 | dd.population = inhabitants[district]
74 | dd.district_id = district_ids[district]
75 | print(dd)
76 | """
77 |
78 | # daily data from csv
79 | csv_url, csv_data, main_url = get_fr_csv()
80 | reader = csv.DictReader(StringIO(csv_data), delimiter=';')
81 |
82 | for row in reader:
83 | row_date = None
84 | for key, val in row.items():
85 | if sc.find(r'(Date).*', key):
86 | row_date = val
87 | assert row_date
88 | row_date = sc.date_from_text(row_date)
89 | for district, xls_district in district_xls.items():
90 | for key, val in row.items():
91 | if sc.find(r'.*(' + xls_district + ').*', key):
92 | dd = sc.DistrictData(canton='FR', district=district)
93 | dd.url = url
94 | dd.date = row_date.isoformat()
95 | dd.new_cases = val
96 | dd.population = inhabitants[district]
97 | dd.district_id = district_ids[district]
98 | print(dd)
99 |
--------------------------------------------------------------------------------
/scrapers/scrape_bl_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import re
6 | import scrape_common as sc
7 | import scrape_bl_common as sbc
8 | from datetime import timedelta
9 |
10 |
11 | # weekly data
12 | bulletin_urls = sbc.get_all_bl_bulletin_urls()
13 | for bulletin_url in bulletin_urls:
14 | bulletin_content = sc.download(bulletin_url, silent=True)
15 | soup = BeautifulSoup(bulletin_content, 'html.parser')
16 | content = soup.find(string=re.compile(r'Per heute .*')).string
17 | content = sbc.strip_bl_bulletin_numbers(content)
18 |
19 | date = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
20 | date = sc.date_from_text(date)
21 | # previous week
22 | date = date - timedelta(days=7)
23 |
24 | td = sc.TestData(canton='BL', url=bulletin_url)
25 | td.week = date.isocalendar()[1]
26 | td.year = date.year
27 | td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests', content)
28 | td.positivity_rate = sc.find(r'von diesen waren (\d+\.?,?\d?) Prozent positiv', content)
29 | if td.total_tests and td.positivity_rate:
30 | td.positivity_rate = td.positivity_rate.replace(',', '.')
31 | print(td)
32 |
33 |
34 | # daily data
35 | main_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft/covid-19-bl-tests'
36 | main_content = sc.download(main_url, silent=True)
37 | soup = BeautifulSoup(main_content, 'html.parser')
38 |
39 | def create_bs_test_data(date):
40 | td = sc.TestData(canton='BL', url=main_url)
41 | td.start_date = date
42 | td.end_date = date
43 | return td
44 |
45 | tests_data = {}
46 |
47 | for iframe in soup.find_all('iframe'):
48 | iframe_url = iframe['src']
49 | d = sc.download(iframe_url, silent=True)
50 | d = d.replace('\n', ' ')
51 |
52 | # Taegliche PCR-Tests BL
53 | data = sc.find(r' ?Datum,"Negative Tests","Positive Tests"\s*([^<]+)', d)
54 | if data:
55 | for row in data.split(" "):
56 | c = row.split(',')
57 | date = sbc.parse_bl_date(c[0])[0]
58 | if date not in tests_data:
59 | tests_data[date] = create_bs_test_data(date)
60 | tests_data[date].negative_tests = round(float(c[1]))
61 | tests_data[date].positive_tests = round(float(c[2]))
62 | continue
63 |
64 | # Taegliche Positivitaetsrate BL
65 | data = sc.find(r' ?Datum,"T.gliche Positivit.tsrate BL"\s*([^<]+)', d)
66 | if data:
67 | for row in data.split(" "):
68 | c = row.split(',')
69 | date = sbc.parse_bl_date(c[0])[0]
70 | if date not in tests_data:
71 | tests_data[date] = create_bs_test_data(date)
72 | tests_data[date].positivity_rate = c[1]
73 | continue
74 |
75 | for date, td in tests_data.items():
76 | print(td)
77 |
--------------------------------------------------------------------------------
/scrapers/scrape_ur.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 |
7 | url = 'https://www.ur.ch/themen/2962'
8 | d = sc.download(url, silent=True)
9 | d = d.replace(' ', ' ')
10 | d = d.replace('\n', ' ')
11 | d = re.sub(r'(\d+)\'(\d+)', r'\1\2', d)
12 |
13 | # 2020-03-26 (and possibly earlier) from https://www.ur.ch/themen/2962
14 | # 2020-07-07 they changed the title, so we're using the table header to find the table
15 | # 2020-07-24 column "Genesen" was removed
16 | """
17 | Stand: 24.07.2020, 11.00 Uhr
18 |
19 | | Positiv getestete Erkrankungsfälle | Hospitalisiert | Verstorben |
20 | | 115                                | 1              | 7          |
33 | """
34 |
35 | # 2020-08-03 new table layout with 6 columns
36 | """
37 | Stand: 03.08.2020, 16.00 Uhr
38 |
39 | | Aktive Fälle | Positiv getestete Erkrankungsfälle | Hospitalisiert | Quarantäne | Verstorben |   |
40 | | 4            | 117                                | 0              | 47         | 7          |   |
59 | """
60 |
61 | soup = BeautifulSoup(d, 'html.parser')
62 | data_table = soup.find(string=re.compile(r'Positive\s+Fälle\s+total')).find_parent('table')
63 |
64 | assert data_table, "Can't find data table"
65 |
66 | dd = sc.DayData(canton='UR', url=url)
67 | dd.datetime = sc.find(r'Stand: (.* Uhr)', d)
68 |
69 | rows = data_table.find_all('tr')
70 | assert len(rows) == 2, f"Number of rows changed, {len(rows)} != 2"
71 |
72 | headers = rows[0].find_all('td') or rows[0].find_all('th')
73 | assert len(headers) == 5, f"Number of header columns changed, {len(headers)} != 5"
74 | assert re.search(r'(aktive\s+fälle)', headers[0].text, flags=re.I) is not None
75 | assert re.search(r"(positive\s+fälle\s+total\s+seit\s+märz\s+2020)", headers[1].text, flags=re.I) is not None
76 | assert headers[2].text.lower() == "hospitalisiert"
77 | assert re.search(r"(total\s+verstorbene)", headers[3].text, flags=re.I) is not None
78 |
79 | cells = rows[1].find_all('td')
80 | assert len(cells) == 4, f"Number of columns changed, {len(cells)} != 4"
81 |
82 | ur_number_regex = r'(\d+)\s*(\(.+?\))?'
83 | dd.cases = sc.find(ur_number_regex, cells[1].text)
84 | dd.hospitalized = sc.find(ur_number_regex, cells[2].text)
85 | dd.deaths = sc.find(ur_number_regex, cells[3].text)
86 |
87 | print(dd)
88 |
--------------------------------------------------------------------------------
/scrapers/scrape_gl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import sys
6 | from bs4 import BeautifulSoup
7 | import csv
8 | from io import StringIO
9 | import scrape_common as sc
10 | import scrape_gl_common as sgc
11 |
12 | def split_whitespace(text):
13 | if not text:
14 | return []
15 | text = re.sub(r'\s\s+', ' ', text)
16 | return text.split(' ')
17 |
18 | is_first = True
19 |
20 | # weekly pdf
21 | pdf_url = sgc.get_gl_pdf_url()
22 | if pdf_url is not None:
23 | pdf = sc.download_content(pdf_url, silent=True)
24 | content = sc.pdftotext(pdf, page=1)
25 | content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)
26 | content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
27 |
28 | pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
29 | pdf_date = sc.date_from_text(pdf_date)
30 |
31 | number_of_tests = sc.find(r'PCR-Tests/Schnelltests\sKanton Glarus\s(\d+)\s', content)
32 | if number_of_tests:
33 | dd = sc.DayData(canton='GL', url=pdf_url)
34 | dd.datetime = pdf_date
35 | dd.tested = number_of_tests
36 | is_first = False
37 | print(dd)
38 |
39 |
40 | content = sc.pdftotext(pdf, page=2, raw=True)
41 | dates = split_whitespace(sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nAnzahl\s+in\s+Isolation', content))
42 | isolation = split_whitespace(sc.find(r'\nAnzahl\s+in\s+Isolation\s+(\d.*)\n', content))
43 | quarantined = split_whitespace(sc.find(r'\nKontaktpersonen\s+in\s+Quarant.ne\s+(\d.*)\n', content))
44 |
45 | if len(dates) == len(isolation) == len(quarantined):
46 | for date, iso, qua in zip(dates, isolation, quarantined):
47 | if sc.find(r'(\d{2}\.12)', date):
48 | year = '2020'
49 | else:
50 | year = pdf_date.year
51 | dd = sc.DayData(canton='GL', url=pdf_url)
52 | dd.datetime = f'{date}.{year}'
53 | dd.isolated = iso
54 | dd.quarantined = qua
55 | if not is_first:
56 | print('-' * 10)
57 | is_first = False
58 | print(dd)
59 | else:
60 | print('PDF data is inconsistent!', file=sys.stderr)
61 | print(f'dates: {len(dates)}, isolation: {len(isolation)}, quarantined: {len(quarantined)}', file=sys.stderr)
62 |
63 |
64 | # CSV from Google Spreadsheets
65 | main_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/edit#gid=0'
66 | csv_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/export?format=csv&id=1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k&gid=0'
67 | d_csv = sc.download(csv_url, silent=True)
68 |
69 | reader = csv.DictReader(StringIO(d_csv), delimiter=',')
70 | for row in reader:
71 | if row['Datum'] == '':
72 | continue
73 | if not is_first:
74 | print('-' * 10)
75 | is_first = False
76 | dd = sc.DayData(canton='GL', url=main_url)
77 | dd.datetime = row['Datum']
78 | dd.cases = row['Fallzahlen Total']
79 | dd.hospitalized = row['Personen in Spitalpflege']
80 | dd.deaths = row['Todesfälle (kumuliert)']
81 | print(dd)
82 |
--------------------------------------------------------------------------------
/scrapers/scrape_so_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | import scrape_common as sc
7 | import scrape_so_common as soc
8 |
9 |
10 | pdf_urls = soc.get_all_weekly_pdf_urls()
11 | # start with the oldest PDF to have the most recent ones last
12 | pdf_urls.reverse()
13 | for pdf_url in pdf_urls:
14 | content = sc.pdfdownload(pdf_url, layout=True, silent=True, page=1)
16 | # remove the ' thousands separator to simplify pattern matching
16 | content = re.sub(r'(\d)\'(\d)', r'\1\2', content)
17 |
18 | date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content)
19 | date = sc.date_from_text(date)
20 | year1 = (date - datetime.timedelta(weeks=2)).year
21 | year2 = (date - datetime.timedelta(weeks=1)).year
22 | res = re.match(r'.*Woche (?P\d+)(\s+\(\d+\.\d+-\d+\.\d+\))?\s+Woche (?P\d+)\s+', content, re.DOTALL)
23 | assert res, 'Weeks could not be extracted'
24 | week1 = res['w1']
25 | week2 = res['w2']
26 |
27 | res = re.match(r'.*PCR-Tes\s?ts\s+(\d.*\n)?Total\s+\d+\s+\d+\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL)
28 | if not res:
29 | res = re.match(r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+\d+\.?\d?\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL)
30 | if not res:
31 | res = re.match(r'.*Labortes\s?ts\s\(PCR\s-\sund\sS\s?chnelltes\s?ts\s?\)\s+(\d.*\n)?Total\s+\d+\s+(\d+)\s+\d+\.?\d?\s+(\d+)\s', content, re.DOTALL)
32 | if res:
33 | total_tests1 = res[2]
34 | total_tests2 = res[3]
35 |
36 | if not res:
37 | res = re.match(r'.*\s+PCR\s+(\d+\s+)?(\d+)\s+(\d+)\s', content, re.DOTALL)
38 | assert res, f'PCR tests for week {week1} or {week2} could not be extracted!'
39 | if res:
40 | total_tests1 = int(res[2])
41 | total_tests2 = int(res[3])
42 |
43 | res = re.match(r'.*\s+Antigen-Schnelltests\s+(\d+\s+)?(\d+)\s+(\d+)', content, re.DOTALL)
44 | assert res, f'Antigen tests for week {week1} or {week2} could not be extracted!'
45 | if res:
46 | total_tests1 += int(res[2])
47 | total_tests2 += int(res[3])
48 |
49 | assert res, f'PCR tests for week {week1} or {week2} could not be extracted!'
50 |
51 | res = re.match(r'.*Positivit.tsrate\s+\*+?\s+\d+\.?\d?%?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?', content, re.DOTALL)
52 | pos_rate1 = None
53 | pos_rate2 = None
54 | if res:
55 | pos_rate1 = res[1]
56 | pos_rate2 = res[2]
57 | else:
58 | res = re.match(r'.*Anteil\s+pos\s?itiv\s?er\s+Tes\s?ts\s+\(%\)\s+(\d+\w+)?\s+(\d+\.?\d?)%?\s+(\d+\.?\d?)%?', content, re.DOTALL)
59 | if res:
60 | pos_rate1 = res[2]
61 | pos_rate2 = res[3]
62 |
63 | data = sc.TestData(canton='SO', url=pdf_url)
64 | data.week = week1
65 | data.year = year1
66 | data.total_tests = total_tests1
67 | data.positivity_rate = pos_rate1
68 | print(data)
69 |
70 | data = sc.TestData(canton='SO', url=pdf_url)
71 | data.week = week2
72 | data.year = year2
73 | data.total_tests = total_tests2
74 | data.positivity_rate = pos_rate2
75 | print(data)
76 |
--------------------------------------------------------------------------------
/scrapers/test/test_test_data.py:
--------------------------------------------------------------------------------
1 | from scrapers.scrape_common import TestData
2 |
3 | def test_test_data():
4 | dd = TestData()
5 | dd.start_date = '1'
6 | dd.end_date = '2'
7 | dd.week = 3
8 | dd.year = 4
9 | dd.canton = '5'
10 | dd.positive_tests = 6
11 | dd.negative_tests = 7
12 | dd.total_tests = 8
13 | dd.positivity_rate = 9
14 | dd.url = '10'
15 |
16 | string = str(dd)
17 |
18 | dd_parsed = TestData()
19 | assert dd_parsed.parse(string)
20 | assert dd.start_date == dd_parsed.start_date
21 | assert dd.end_date == dd_parsed.end_date
22 | assert dd.week == dd_parsed.week
23 | assert dd.year == dd_parsed.year
24 | assert dd.canton == dd_parsed.canton
25 |
26 | assert dd.positive_tests == dd_parsed.positive_tests
27 | assert dd.negative_tests == dd_parsed.negative_tests
28 | assert dd.positivity_rate == dd_parsed.positivity_rate
29 |
30 | assert dd.positive_tests == dd_parsed.positive_tests
31 | assert dd.negative_tests == dd_parsed.negative_tests
32 | assert dd.positivity_rate == dd_parsed.positivity_rate
33 |
34 | assert dd.pcr_positive_tests == dd_parsed.pcr_positive_tests
35 | assert dd.pcr_negative_tests == dd_parsed.pcr_negative_tests
36 | assert dd.pcr_positivity_rate == dd_parsed.pcr_positivity_rate
37 |
38 | assert dd.ag_positive_tests == dd_parsed.ag_positive_tests
39 | assert dd.ag_negative_tests == dd_parsed.ag_negative_tests
40 | assert dd.ag_positivity_rate == dd_parsed.ag_positivity_rate
41 |
42 | assert dd.url == dd_parsed.url
43 |
44 |
45 | def test_test_data_with_PCR_antigen():
46 | dd = TestData()
47 | dd.start_date = '1'
48 | dd.end_date = '2'
49 | dd.week = 3
50 | dd.year = 4
51 | dd.canton = '5'
52 |
53 | dd.positive_tests = 6
54 | dd.negative_tests = 7
55 | dd.total_tests = 8
56 | dd.positivity_rate = 9
57 |
58 | dd.pcr_positive_tests = 10
59 | dd.pcr_negative_tests = 11
60 | dd.pcr_total_tests = 12
61 | dd.pcr_positivity_rate = 13
62 |
63 | dd.ag_positive_tests = 14
64 | dd.ag_negative_tests = 15
65 | dd.ag_total_tests = 16
66 | dd.ag_positivity_rate = 17
67 |
68 | dd.url = '18'
69 |
70 | string = str(dd)
71 |
72 | dd_parsed = TestData()
73 | assert dd_parsed.parse(string)
74 | assert dd.start_date == dd_parsed.start_date
75 | assert dd.end_date == dd_parsed.end_date
76 | assert dd.week == dd_parsed.week
77 | assert dd.year == dd_parsed.year
78 | assert dd.canton == dd_parsed.canton
79 |
80 | assert dd.positive_tests == dd_parsed.positive_tests
81 | assert dd.negative_tests == dd_parsed.negative_tests
82 | assert dd.positivity_rate == dd_parsed.positivity_rate
83 |
84 | assert dd.pcr_positive_tests == dd_parsed.pcr_positive_tests
85 | assert dd.pcr_negative_tests == dd_parsed.pcr_negative_tests
86 | assert dd.pcr_positivity_rate == dd_parsed.pcr_positivity_rate
87 |
88 | assert dd.ag_positive_tests == dd_parsed.ag_positive_tests
89 | assert dd.ag_negative_tests == dd_parsed.ag_negative_tests
90 | assert dd.ag_positivity_rate == dd_parsed.ag_positivity_rate
91 |
92 | assert dd.url == dd_parsed.url
93 |
94 |
95 | if __name__ == "__main__":
96 | test_test_data()
97 |
--------------------------------------------------------------------------------
/scripts/check_for_outliers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | import os
6 | import pandas as pd
7 | import math
8 |
9 | __location__ = os.path.realpath(
10 | os.path.join(
11 | os.getcwd(),
12 | os.path.dirname(__file__)
13 | )
14 | )
15 |
16 | # only values above this MIN_VALUE are considered outliers
17 | # this prevents the check from failing while the absolute values are still small
18 | # the outlier detection mainly guards against human error (wrong data being added)
19 | MIN_VALUE = 20
20 |
21 | # only check the last x days
22 | LAG_PERIODS = 10
23 |
24 | # periods considered "recent"
25 | RECENT_PERIODS = 5
26 |
27 | # IQR factor, determines how many times the IQR is the limit for an outlier
28 | FACTOR = 1.5
29 |
30 | assert len(sys.argv) >= 2, "Error: Call this script with the path(s) to CSV file(s)"
31 |
32 | fail = False
33 |
34 | args = sys.argv[1:]
35 | for csv_file in args:
36 |
37 | # load canton file from covid_19 repo
38 | df = pd.read_csv(csv_file, parse_dates=[0])
39 | df_ignore = pd.read_csv(os.path.join(__location__, '..', 'outlier_status.csv'), parse_dates=[0])
40 | df = pd.merge(df, df_ignore, left_on=['date', 'abbreviation_canton_and_fl'], right_on=['date', 'abbreviation_canton_and_fl'], how='left')
41 |
42 | # create new column for current cases
43 | df_conf = df[['date', 'ncumul_conf', 'ncumul_conf_outlier']].reset_index(drop=True)
44 | df_conf['current_conf'] = df['ncumul_conf'] - df['ncumul_conf'].shift(1)
45 |
46 | # only use the last LAG_PERIODS rows
47 | df_conf = df_conf.tail(LAG_PERIODS).reset_index(drop=True)
48 |
49 | # calculate the IQR for confirmed cases
50 | q1 = df_conf['current_conf'].quantile(0.25)
51 | q3 = df_conf['current_conf'].quantile(0.75)
52 | iqr = q3 - q1
53 |
54 | if pd.isna(q1) or pd.isna(q3) or pd.isna(iqr):
55 | print(f"⚠️ {csv_file} has too many missing/NaN values (Q1: {q1}, Q3: {q3}, IQR: {iqr}) to calculate outliers, skipping.")
56 | continue
57 |
58 | lower_limit = q1 - (iqr * FACTOR)
59 | upper_limit = math.ceil(q3 + (iqr * FACTOR))
60 |
61 | upper_limit = max(upper_limit, MIN_VALUE)
62 | lower_limit = 0 # always use 0 as lower limit
63 | df_conf['q1'] = q1
64 | df_conf['q3'] = q3
65 | df_conf['iqr'] = iqr
66 | df_conf['factor'] = FACTOR
67 | df_conf['upper_limit'] = upper_limit
68 | df_conf['lower_limit'] = lower_limit
69 |
70 | # use IQR*factor to get outliers
71 | outliers = df_conf.query('(current_conf < @lower_limit) or (current_conf > @upper_limit)')
72 | recent_outliers = df_conf.tail(RECENT_PERIODS).query("((current_conf < @lower_limit) or (current_conf > @upper_limit)) and (ncumul_conf_outlier != 'ignore')")
73 | if outliers.empty:
74 | print(f"✅ {csv_file} has no outliers.")
75 | else:
76 | if not recent_outliers.empty:
77 | fail = True
78 | print(f"❌ {csv_file} has recent outliers, please check if this is an error.")
79 | else:
80 | print(f"⚠️ {csv_file} has older or ignored outliers.")
81 | print(outliers[['date', 'ncumul_conf', 'current_conf', 'iqr', 'factor', 'upper_limit']])
82 | print('')
83 |
84 | if fail:
85 | sys.exit(1)
86 |
87 |
--------------------------------------------------------------------------------
/scrapers/test/test_dates.py:
--------------------------------------------------------------------------------
1 | from scrapers.scrape_dates import parse_date
2 |
3 | def test_dates():
4 | date_tests = [
5 | ('20. März 2020 15.00 Uhr', '2020-03-20T15:00'),
6 | ('21. März 2020, 10 Uhr', '2020-03-21T10:00'),
7 | ('21. März 2020, 11:00 Uhr', '2020-03-21T11:00'),
8 | ('21.03.2020, 15h30', '2020-03-21T15:30'),
9 | ('21. März 2020, 8.00 Uhr', '2020-03-21T08:00'),
10 | ('21. März 2020, 18.15 Uhr', '2020-03-21T18:15'),
11 | ('21. März 2020, 18.15 Uhr', '2020-03-21T18:15'),
12 | ('21. März 2020, 14.00 Uhr', '2020-03-21T14:00'),
13 | ('23. März 2020, 15 Uhr', '2020-03-23T15:00'),
14 | ('18. April 2020,16.00 Uhr', '2020-04-18T16:00'),
15 | ('21. März 2020', '2020-03-21T'),
16 | ('21.3.20', '2020-03-21T'),
17 | ('20.3.2020, 16.30', '2020-03-20T16:30'),
18 | ('21.03.2020, 15h30', '2020-03-21T15:30'),
19 | ('23.03.2020, 12:00', '2020-03-23T12:00'),
20 | ('23.03.2020 12:00', '2020-03-23T12:00'),
21 | ('08.04.2020: 09.30 Uhr', '2020-04-08T09:30'),
22 | ('07.04.2020 15.00h', '2020-04-07T15:00'),
23 | ('31.03.20, 08.00 h', '2020-03-31T08:00'),
24 | ('20.03.2020', '2020-03-20T'),
25 | ('21 mars 2020 (18h)', '2020-03-21T18:00'),
26 | ('1er avril 2020 (16h)', '2020-04-01T16:00'),
27 | ('21 mars 2020', '2020-03-21T'),
28 | ('6avril2020', '2020-04-06T'),
29 | ('20.03 à 8h00', '2020-03-20T08:00'),
30 | ('23.03 à 12h', '2020-03-23T12:00'),
31 | ('21 marzo 2020, ore 8.00', '2020-03-21T08:00'),
32 | ('27.03.2020 ore 08:00', '2020-03-27T08:00'),
33 | ('2020-03-23', '2020-03-23T'),
34 | ('24.3. / 10h', '2020-03-24T10:00'),
35 | ('2020-03-23T15:00:00', '2020-03-23T15:00'),
36 | ('2020-03-23 15:00:00', '2020-03-23T15:00'),
37 | ('2020-03-23 15:00', '2020-03-23T15:00'),
38 | ('30.04.2020,13.30 Uhr', '2020-04-30T13:30'),
39 | ('1.Mai 2020', '2020-05-01T'),
40 | ('05-05-2020 00:00', '2020-05-05T00:00'),
41 | ('07.05.2020, 00;00 Uhr', '2020-05-07T00:00'),
42 | ('17.06.2020 um 8 Uhr', '2020-06-17T08:00'),
43 | ('08.07.2020, um 8 Uhr', '2020-07-08T08:00'),
44 | ('8. Juli 2020 um 14:30 Uhr', '2020-07-08T14:30'),
45 | ('17.07.20 08:00', '2020-07-17T08:00'),
46 | ('12. 8. 2020', '2020-08-12T'),
47 | ('1er septembre 2020', '2020-09-01T'),
48 | ]
49 | for text, date in date_tests:
50 | assert parse_date(text) == date, f"parse_date('{text}') = '{parse_date(text)}', but expected '{date}'"
51 |
52 | if __name__ == "__main__":
53 | test_dates()
54 |
--------------------------------------------------------------------------------
/.github/workflows/run_district_scrapers.yml:
--------------------------------------------------------------------------------
1 | name: Run district scrapers
2 |
3 | on:
4 | schedule:
5 | - cron: '10 * * * *' # run every hour at xx:10
6 | workflow_dispatch: ~
7 | jobs:
8 | run_scraper:
9 | runs-on: ubuntu-20.04
10 | continue-on-error: false
11 | timeout-minutes: 10
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | canton:
16 | #- AG
17 | - BE
18 | #- BL
19 | #- FR
20 | #- GR
21 | - SG
22 | #- SO
23 | #- SZ
24 | - TG
25 | #- VS
26 |
27 | steps:
28 | - uses: actions/checkout@v3
29 |
30 | - name: Set up Python 3.7
31 | uses: actions/setup-python@v4
32 | with:
33 | python-version: 3.7
34 | - run: npm ci
35 | - name: Remove broken apt repos
36 | run: |
37 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
38 | - name: Install dependencies
39 | env:
40 | SCRAPER_KEY: ${{ matrix.canton }}
41 | run: |
42 | python -m pip install --upgrade pip setuptools wheel
43 | pip install -r requirements.txt
44 | sudo apt update || true # do not fail if update does not work
45 | sudo apt-get install sqlite3 poppler-utils
46 | if [ "$SCRAPER_KEY" = "AG" ] ; then
47 | pip install -r requirements-ocr.txt
48 | sudo apt-get install tesseract-ocr=3.04.01-4
49 | fi
50 |
51 | - name: Scrape new data
52 | env:
53 | SCRAPER_KEY: ${{ matrix.canton }}
54 | run: |
55 | ./scrapers/run_district_scraper.sh
56 |
57 | - name: Check if there are changes in the repo
58 | run: |
59 | if git diff -w --no-ext-diff --quiet
60 | then
61 | echo "changed=0" >> $GITHUB_OUTPUT
62 | else
63 | echo "changed=1" >> $GITHUB_OUTPUT
64 | fi
65 | id: changes
66 |
67 | - name: Set commit message
68 | env:
69 | SCRAPER_KEY: ${{ matrix.canton }}
70 | run: |
71 | echo "commit_msg=Update fallzahlen_kanton_${SCRAPER_KEY}_bezirk.csv from scraper" >> $GITHUB_ENV
72 |
73 | - name: Commit and push to repo
74 | if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
75 | uses: github-actions-x/commit@v2.9
76 | with:
77 | github-token: ${{ secrets.GITHUB_TOKEN }}
78 | push-branch: master
79 | name: GitHub Action Scraper
80 | email: scraper@open.zh.ch
81 | commit-message: ${{ env.commit_msg }}
82 | rebase: 'true'
83 |
84 | - name: Get current unix timestamp
85 | if: always()
86 | id: date
87 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT
88 |
89 | - name: Notify slack failure
90 | if: ${{ failure() || cancelled() }}
91 | env:
92 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
93 | uses: pullreminders/slack-action@master
94 | with:
95 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run district scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: District scraper failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
96 |
97 |
--------------------------------------------------------------------------------
/.github/workflows/run_tests_scraper.yml:
--------------------------------------------------------------------------------
1 | name: Run tests scrapers
2 |
3 | on:
4 | schedule:
5 | - cron: '20 * * * *' # run every hour at xx:20
6 | workflow_dispatch: ~
7 | jobs:
8 | run_scraper:
9 | runs-on: ubuntu-20.04
10 | continue-on-error: false
11 | timeout-minutes: 10
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | canton:
16 | # - AG
17 | - BE
18 | - BL
19 | - BS
20 | - FL
21 | # - FR # data no longer published
22 | # - GE
23 | - GL
24 | # - JU # disabled until the PDF is fixed
25 | # - NW
26 | - SG
27 | - SH
28 | # - SO
29 | - TG
30 | # - TI # data no longer published
31 | # - VD
32 | - VS
33 | - ZG
34 | - ZH
35 |
36 | steps:
37 | - uses: actions/checkout@v3
38 |
39 | - name: Set up Python 3.7
40 | uses: actions/setup-python@v4
41 | with:
42 | python-version: 3.7
43 | - run: npm ci
44 | - name: Remove broken apt repos
45 | run: |
46 | for apt_file in `grep -lr microsoft /etc/apt/sources.list.d/`; do sudo rm $apt_file; done
47 | - name: Install dependencies
48 | env:
49 | SCRAPER_KEY: ${{ matrix.canton }}
50 | run: |
51 | python -m pip install --upgrade pip setuptools wheel
52 | pip install -r requirements.txt
53 | sudo apt update || true # do not fail if update does not work
54 | sudo apt-get install sqlite3 poppler-utils
55 | if [ "$SCRAPER_KEY" = "GE" ] ; then
56 | sudo apt-get install chromium-browser
57 | fi
58 |
59 | - name: Scrape new data
60 | env:
61 | SCRAPER_KEY: ${{ matrix.canton }}
62 | run: |
63 | ./scrapers/run_tests_scraper.sh
64 |
65 | - name: Check if there are changes in the repo
66 | run: |
67 | if git diff -w --no-ext-diff --quiet
68 | then
69 | echo "changed=0" >> $GITHUB_OUTPUT
70 | else
71 | echo "changed=1" >> $GITHUB_OUTPUT
72 | fi
73 | id: changes
74 |
75 | - name: Set commit message
76 | env:
77 | SCRAPER_KEY: ${{ matrix.canton }}
78 | run: |
79 | if [ "$SCRAPER_KEY" = "FL" ] ; then
80 | echo "commit_msg=Update fallzahlen_${SCRAPER_KEY}_tests.csv from scraper" >> $GITHUB_ENV
81 | else
82 | echo "commit_msg=Update fallzahlen_kanton_${SCRAPER_KEY}_tests.csv from scraper" >> $GITHUB_ENV
83 | fi
84 |
85 | - name: Commit and push to repo
86 | if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
87 | uses: github-actions-x/commit@v2.9
88 | with:
89 | github-token: ${{ secrets.GITHUB_TOKEN }}
90 | push-branch: master
91 | name: GitHub Action Scraper
92 | email: scraper@open.zh.ch
93 | commit-message: ${{ env.commit_msg }}
94 | rebase: 'true'
95 |
96 | - name: Get current unix timestamp
97 | if: always()
98 | id: date
99 | run: echo "ts=$(date +'%s')" >> $GITHUB_OUTPUT
100 |
101 | - name: Notify slack failure
102 | if: ${{ failure() || cancelled() }}
103 | env:
104 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
105 | uses: pullreminders/slack-action@master
106 | with:
107 | args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run tests scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: Tests scraper failed\", \"footer\": \"\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
108 |
109 |
--------------------------------------------------------------------------------
/fallzahlen_bezirke/fallzahlen_kanton_AG_bezirk.csv:
--------------------------------------------------------------------------------
1 | DistrictId,District,Canton,Date,Week,Year,Population,TotalConfCases,NewConfCases,TotalDeaths,NewDeaths,SourceUrl
2 | 1901,Aarau,AG,2020-10-26,,,79702,353,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
3 | 1901,Aarau,AG,2020-11-04,,,79702,527,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
4 | 1901,Aarau,AG,2020-11-13,,,79702,527,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
5 | 1902,Baden,AG,2020-10-26,,,145696,735,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
6 | 1902,Baden,AG,2020-11-04,,,145696,1079,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
7 | 1902,Baden,AG,2020-11-13,,,145696,1079,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
8 | 1903,Bremgarten,AG,2020-10-26,,,78745,277,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
9 | 1903,Bremgarten,AG,2020-11-04,,,78745,430,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
10 | 1903,Bremgarten,AG,2020-11-13,,,78745,430,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
11 | 1904,Brugg,AG,2020-10-26,,,51814,179,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
12 | 1904,Brugg,AG,2020-11-04,,,51814,270,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
13 | 1904,Brugg,AG,2020-11-13,,,51814,270,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
14 | 1905,Kulm,AG,2020-10-26,,,42412,153,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
15 | 1905,Kulm,AG,2020-11-04,,,42412,232,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
16 | 1905,Kulm,AG,2020-11-13,,,42412,232,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
17 | 1906,Laufenburg,AG,2020-10-26,,,33035,96,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
18 | 1906,Laufenburg,AG,2020-11-04,,,33035,130,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
19 | 1906,Laufenburg,AG,2020-11-13,,,33035,130,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
20 | 1907,Lenzburg,AG,2020-10-26,,,64792,261,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
21 | 1907,Lenzburg,AG,2020-11-04,,,64792,378,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
22 | 1907,Lenzburg,AG,2020-11-13,,,64792,378,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
23 | 1908,Muri,AG,2020-10-26,,,37170,152,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
24 | 1908,Muri,AG,2020-11-04,,,37170,213,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
25 | 1908,Muri,AG,2020-11-13,,,37170,213,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
26 | 1909,Rheinfelden,AG,2020-10-26,,,47926,158,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
27 | 1909,Rheinfelden,AG,2020-11-04,,,47926,235,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
28 | 1909,Rheinfelden,AG,2020-11-13,,,47926,235,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
29 | 1910,Zofingen,AG,2020-10-26,,,73136,271,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
30 | 1910,Zofingen,AG,2020-11-04,,,73136,408,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
31 | 1910,Zofingen,AG,2020-11-13,,,73136,408,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
32 | 1911,Zurzach,AG,2020-10-26,,,34650,127,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
33 | 1911,Zurzach,AG,2020-11-04,,,34650,206,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
34 | 1911,Zurzach,AG,2020-11-13,,,34650,206,,,,https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp
35 |
--------------------------------------------------------------------------------
/scrapers/add_district_db_entry.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
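# Reads DistrictData lines (as printed by the scrape_*_districts.py scrapers) from standard
# input and inserts or updates them in the local data.sqlite database, e.g. (illustrative):
#   python scrape_tg_districts.py | python add_district_db_entry.py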
3 | import sys
4 | import sqlite3
5 | import traceback
6 | import os
7 |
8 | import db_common as dc
9 | import scrape_common as sc
10 |
11 | __location__ = dc.get_location()
12 |
13 | input_failures = 0
14 |
15 | try:
16 | DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
17 | conn = sqlite3.connect(DATABASE_NAME)
18 |
19 | i = 0
20 | for line in sys.stdin:
21 | dd = sc.DistrictData()
22 | if dd.parse(line.strip()):
23 | c = conn.cursor()
24 | try:
25 | print(dd)
26 |
27 | c.execute(
28 | '''
29 | INSERT INTO data (
30 | DistrictId,
31 | District,
32 | Canton,
33 | Date,
34 | Week,
35 | Year,
36 | Population,
37 | TotalConfCases,
38 | NewConfCases,
39 | TotalDeaths,
40 | NewDeaths,
41 | SourceUrl
42 | )
43 | VALUES
44 | (?,?,?,?,?,?,?,?,?,?,?,?)
45 | ;
46 |
47 | ''',
48 | [
49 | dd.district_id,
50 | dd.district,
51 | dd.canton,
52 | dd.date or '',
53 | dd.week or '',
54 | dd.year or '',
55 | dd.population,
56 | dd.total_cases,
57 | dd.new_cases,
58 | dd.total_deceased,
59 | dd.new_deceased,
60 | dd.url,
61 | ]
62 | )
63 |
64 | print("Successfully added new entry.")
65 | except sqlite3.IntegrityError as e:
66 | # try UPDATE if INSERT didn't work (i.e. constraint violation)
67 | try:
68 | c.execute(
69 | '''
70 | UPDATE data SET
71 | Population = ?,
72 | TotalConfCases = ?,
73 | NewConfCases = ?,
74 | TotalDeaths = ?,
75 | NewDeaths = ?,
76 | SourceUrl = ?
77 | WHERE DistrictId = ?
78 | AND District = ?
79 | AND Canton = ?
80 | AND Date = ?
81 | AND Week = ?
82 | AND Year = ?
83 | ;
84 | ''',
85 | [
86 | dd.population,
87 | dd.total_cases,
88 | dd.new_cases,
89 | dd.total_deceased,
90 | dd.new_deceased,
91 | dd.url,
92 | dd.district_id,
93 | dd.district,
94 | dd.canton,
95 | dd.date or '',
96 | dd.week or '',
97 | dd.year or '',
98 | ]
99 | )
100 | print("Successfully updated entry.")
101 | except sqlite3.Error as e:
102 | print("Error: an error occured in sqlite3: ", e.args[0], file=sys.stderr)
103 | conn.rollback()
104 | input_failures += 1
105 | finally:
106 | conn.commit()
107 | except Exception as e:
108 | print("Error: %s" % e, file=sys.stderr)
109 | print(traceback.format_exc(), file=sys.stderr)
110 | sys.exit(1)
111 | finally:
112 | conn.close()
113 |
114 | if input_failures:
115 | print(f'input_failures: {input_failures}')
116 | sys.exit(1)
117 |
--------------------------------------------------------------------------------
/scrapers/convert_parsed_to_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Reads data in the format produced by ./parse_scrape_output.py
4 | # from standard input and converts it into a CSV file on standard output.
5 | #
6 | # Example usage:
7 | # ./meta_scrape.sh | ./convert_parsed_to_csv.py > latest.csv
8 | # ./scrape_vd.sh | ./parse_scrape_output.py | ./convert_parsed_to_csv.py > vd.csv
9 | # cat *0.txt | ./convert_parsed_to_csv.py > full_history.csv
10 | #
11 | # See README.md for details about columns defined in CSV format.
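# Illustrative example: the AR input line shown further below would roughly become the row
#   2020-03-23,10:00,AR,,30,,,,,1,Scraper for AR at 2020-03-23T19:12:09+01:00 using https://www.ai.ch/...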
12 |
13 | import csv
14 | import re
15 | import sys
16 |
17 | # See README.md for more details about these fields.
18 | field_names = [
19 | 'date',
20 | 'time',
21 | 'abbreviation_canton_and_fl',
22 | 'ncumul_tested',
23 | 'ncumul_conf',
24 | 'ncumul_hosp', # Actually not cumulative.
25 | 'ncumul_ICU', # Actually not cumulative.
26 | 'ncumul_vent', # Actually not cumulative.
27 | 'ncumul_released',
28 | 'ncumul_deceased',
29 | 'source',
30 | ]
31 |
32 | writer = csv.DictWriter(sys.stdout, field_names,
33 | delimiter=',',
34 | quotechar='"',
35 | lineterminator='\n',
36 | quoting=csv.QUOTE_MINIMAL)
37 |
38 | writer.writeheader()
39 |
40 | input_failures = 0
41 | for line in sys.stdin:
42 | l = line.strip()
43 |
44 | # AR 2020-03-23T10:00 30 1 OK 2020-03-23T19:12:09+01:00 https://www.ai.ch/themen/gesundheit-alter-und-soziales/gesundheitsfoerderung-und-praevention/uebertragbare-krankheiten/coronavirus
45 | # GE 2020-03-27T 1924 23 OK 2020-03-28T18:57:34+01:00 # Extras: ncumul_hosp=313,ncumul_ICU=54 # URLs: https://www.ge.ch/document/point-coronavirus-maladie-covid-19/telecharger
46 |
47 | # Groups: 1 2 3 4 5 6 7 8
48 | match = re.search(r'^([A-Z][A-Z])\s+((?:\d\d\d\d-\d\d-\d\d)T(?:\d\d:\d\d)?)\s+(\d+)\s+(\d+|-)\s+OK\s+([0-9:\+\-\.T]+)(?:\s+# Extras: ([^#]+))?(?:\s+(?:(# URLs: )?(h.+)))?(?:\s+(http.+))?$', l)
49 | if not match:
50 | input_failures += 1
51 | print(f"Failed to parse line: {l}", file=sys.stderr)
52 | continue
53 |
54 | abbr = match.group(1)
55 |
56 | date_part = match.group(2).split('T', 2)
57 |
58 | data = {
59 | 'date': date_part[0],
60 | 'time': None,
61 | 'abbreviation_canton_and_fl': abbr,
62 | 'ncumul_tested': None,
63 | 'ncumul_conf': int(match.group(3)),
64 | 'ncumul_hosp': None,
65 | 'ncumul_ICU': None,
66 | 'ncumul_vent': None,
67 | 'ncumul_released': None,
68 | 'ncumul_deceased': None,
69 | 'source': '',
70 | }
71 |
72 | if len(date_part) == 2:
73 | data['time'] = date_part[1]
74 |
75 | if match.group(4) != '-':
76 | data['ncumul_deceased'] = int(match.group(4))
77 |
78 | scrape_time = match.group(5)
79 |
80 | url_sources = match.group(7)
81 | if match.group(8):
82 | url_sources = match.group(8)
83 | if url_sources:
84 | data['source'] = f'Scraper for {abbr} at {scrape_time} using {url_sources}'
85 | else:
86 | data['source'] = f'Scraper for {abbr} at {scrape_time}'
87 |
88 | # Parse optional data.
89 | extras_list = match.group(6)
90 | if extras_list:
91 | try:
92 | extras = extras_list.strip()
93 | extras = extras.split(',')
94 | extras = { kv.split('=', 2)[0]: int(kv.split('=', 2)[1]) for kv in extras }
95 | # data.update(extras)
96 | for k in ['ncumul_hosp', 'ncumul_ICU', 'ncumul_vent', 'ncumul_released', 'new_hosp', 'current_hosp']:
97 | if k in extras:
98 | data[k] = extras[k]
99 | except Exception as e:
100 | input_failures += 1
101 | print(f'Error: Parsing optional data failed, ignoring: {extras_list}', file=sys.stderr)
102 |
103 | # print(data)
104 | writer.writerow(data)
105 |
106 | sys.stdout.flush()
107 |
108 | if input_failures:
109 | sys.exit(1)
110 |
--------------------------------------------------------------------------------
/scrapers/scrape_gr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import datetime
4 | import re
5 | from bs4 import BeautifulSoup
6 | import scrape_common as sc
7 |
8 |
9 | is_first = True
10 |
11 | url = 'https://www.gr.ch/DE/institutionen/verwaltung/djsg/ga/coronavirus/info/Seiten/Start.aspx'
12 | data = sc.download(url, silent=True)
13 | data = re.sub(r"(\d+)'(\d+)", r'\1\2', data)  # strip thousands separators between digits (e.g. 5'000 -> 5000)
14 | soup = BeautifulSoup(data, 'html.parser')
15 | elem = soup.find('h2', text=re.compile(r'Fallzahlen\s+Kanton.*'))
16 | if elem is not None:
17 | table = elem.find_next('table')
18 | body = table.find('tbody')
19 | for row in body.find_all('tr'):
20 | tds = row.find_all('td')
21 |
22 | if not is_first:
23 | print('-' * 10)
24 | is_first = False
25 |
26 | dd = sc.DayData(canton='GR', url=url)
27 | dd.datetime = tds[0].text
28 | dd.cases = tds[1].text
29 | dd.isolated = tds[3].text
30 | dd.quarantined = tds[4].text
31 | dd.deaths = tds[6].text
32 | dd.hospitalized = tds[8].text
33 | dd.icu = tds[10].text
34 | dd.vent = tds[11].text
35 | print(dd)
36 |
37 |
38 | json_url = 'https://services1.arcgis.com/YAuo6vcW85VPu7OE/arcgis/rest/services/Fallzahlen_Total_Kanton/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=none&outFields=*&returnHiddenFields=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=Eingangs_Datum&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=standard&f=pjson'
39 | data = sc.jsondownload(json_url, silent=True)
40 |
41 | # 2020-04-02
42 | """
43 | features: [
44 | {
45 | attributes: {
46 | Eingangs_Datum: 1582675200000,
47 | Anzahl_Fälle_total__kumuliert_: 2,
48 | Neue_Faelle: 2,
49 | Neue_aktive_Fälle: 2,
50 | Anzahl_aktive_Fälle_total: 2,
51 | Anzahl_Personen_in_Isolation: 0,
52 | Anzahl_Personen_in_Quarantäne: 0,
53 | Verstorbene: 0,
54 | Verstorbene__kumuliert_: 0,
55 | Neue_Hospitalisierungen: 0,
56 | Hospitalisiert_Total: 0,
57 | Neu_Pflege: 0,
58 | Hospitalisiert_Pflege: 0,
59 | Neu_IPS: 0,
60 | Hospialisiert_IPS: 0,
61 | Neu_IPS_beatmet: 0,
62 | Hospitalisiert_IPS_beatmet: 0,
63 | FID: 1
64 | }
65 | },
66 | {
67 | attributes: {
68 | Eingangs_Datum: 1582761600000,
69 | Anzahl_Fälle_total__kumuliert_: 2,
70 | Neue_Faelle: 0,
71 | Neue_aktive_Fälle: 0,
72 | Anzahl_aktive_Fälle_total: 2,
73 | Anzahl_Personen_in_Isolation: 0,
74 | Anzahl_Personen_in_Quarantäne: 0,
75 | Verstorbene: 0,
76 | Verstorbene__kumuliert_: 0,
77 | Neue_Hospitalisierungen: 0,
78 | Hospitalisiert_Total: 0,
79 | Neu_Pflege: 0,
80 | Hospitalisiert_Pflege: 0,
81 | Neu_IPS: 0,
82 | Hospialisiert_IPS: 0,
83 | Neu_IPS_beatmet: 0,
84 | Hospitalisiert_IPS_beatmet: 0,
85 | FID: 2
86 | }
87 | },
88 | """
89 |
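# Eingangs_Datum is an epoch timestamp in milliseconds; for example,
# 1582675200000 / 1000 corresponds to 2020-02-26, the date the conversion below yields.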
90 | assert 'features' in data, "JSON did not contain `features` key"
91 |
92 | for feature in data['features']:
93 | row = feature['attributes']
94 | if not is_first:
95 | print('-' * 10)
96 | is_first = False
97 |
98 | dd = sc.DayData(canton='GR', url=json_url)
99 | dd.datetime = datetime.datetime.fromtimestamp(row['Eingangs_Datum'] / 1000).date().isoformat()
100 | dd.cases = row['Anzahl_Fälle_total__kumuliert_']
101 | dd.hospitalized = row['Hospitalisiert_Total']
102 | dd.icu = row['Hospialisiert_IPS']
103 | dd.vent = row['Hospitalisiert_IPS_beatmet']
104 | # Neue_Hospitalisierungen currently does not match our definition of new_hosp:
105 | # GR provides it as a calculated field, the difference between
106 | # yesterday's and today's hospitalized counts
107 | #dd.new_hosp = row['Neue_Hospitalisierungen']
108 | dd.deaths = row['Verstorbene__kumuliert_']
109 | dd.isolated = row['Anzahl_Personen_in_Isolation']
110 | dd.quarantined = row['Anzahl_Personen_in_Quarantäne']
111 | print(dd)
112 |
--------------------------------------------------------------------------------
/fallzahlen_tests/fallzahlen_kanton_JU_tests.csv:
--------------------------------------------------------------------------------
1 | canton,start_date,end_date,week,year,positive_tests,negative_tests,total_tests,positivity_rate,source,pcr_positive_tests,pcr_negative_tests,pcr_total_tests,pcr_positivity_rate,ag_positive_tests,ag_negative_tests,ag_total_tests,ag_positivity_rate
2 | JU,,,43,2020,179,,719,25.0,https://www.jura.ch/Htdocs/Files/v/35815.pdf,,,,,,,,
3 | JU,,,44,2020,219,,1064,23.0,https://www.jura.ch/Htdocs/Files/v/35911.pdf,,,,,,,,
4 | JU,,,45,2020,418,,1590,27.0,https://www.jura.ch/Htdocs/Files/v/35986.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem45_vf.pdf?download=1,,,,,,,,
5 | JU,,,46,2020,252,,1130,24.0,https://www.jura.ch/Htdocs/Files/v/36049.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem46_vf.pdf?download=1,,,,,,,,
6 | JU,,,47,2020,203,,853,25.0,https://www.jura.ch/Htdocs/Files/v/36126.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/rapport_hebdo_COVID_JU_sem47_vf.pdf?download=1,,,,,,,,
7 | JU,,,48,2020,158,,736,22.0,https://www.jura.ch/Htdocs/Files/v/36196.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem48.pdf,,,,,,,,
8 | JU,,,49,2020,136,,882,15.0,https://www.jura.ch/Htdocs/Files/v/36338.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem49_vf.pdf,,,,,,,,
9 | JU,,,50,2020,145,,1125,13.0,https://www.jura.ch/Htdocs/Files/v/36416.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem50.pdf,,,,,,,,
10 | JU,,,51,2020,242,,1552,16.0,https://www.jura.ch/Htdocs/Files/v/36492.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem51.pdf,,,,,,,,
11 | JU,,,52,2020,144,,1072,13.0,https://www.jura.ch/Htdocs/Files/v/36498.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem52.pdf,,,,,,,,
12 | JU,,,53,2020,244,,1235,20.0,https://www.jura.ch/Htdocs/Files/v/36536.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem53.pdf,,,,,,,,
13 | JU,,,1,2021,246,,1143,22.0,https://www.jura.ch/Htdocs/Files/v/36563.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem1_2021.pdf,,,,,,,,
14 | JU,,,2,2021,215,,1231,17.0,https://www.jura.ch/Htdocs/Files/v/36660.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem2_2021_corr.pdf,,,,,,,,
15 | JU,,,3,2021,179,,1117,16.0,https://www.jura.ch/Htdocs/Files/v/36720.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem3_2021_vf.pdf,,,,,,,,
16 | JU,,,4,2021,207,,1448,14.0,https://www.jura.ch/Htdocs/Files/v/36790.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem4_2021.pdf,,,,,,,,
17 | JU,,,5,2021,127,,1877,7.0,https://www.jura.ch/Htdocs/Files/v/36821.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem5_2021.pdf,,,,,,,,
18 | JU,,,6,2021,127,,1342,9.0,https://www.jura.ch/Htdocs/Files/v/36872.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Cornavirus/Chiffres/rapport_hebdo_COVID_JU_sem6_2021.pdf,,,,,,,,
19 | JU,,,7,2021,143,,1500,10.0,https://www.jura.ch/Htdocs/Files/v/36918.pdf/Departements/CHA/SIC/Communiques/2021/rapport_hebdo_COVID_JU_sem7_2021.pdf,,,,,,,,
20 | JU,,,8,2021,151,,969,13.0,https://www.jura.ch/Htdocs/Files/v/36986.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem8_2021.pdf,,,,,,,,
21 | JU,,,9,2021,154,,927,14.0,https://www.jura.ch/Htdocs/Files/v/37064.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem9_2021.pdf,,,,,,,,
22 | JU,,,10,2021,80,,1099,7.0,https://www.jura.ch/Htdocs/Files/v/37125.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem10_2021.pdf,,,,,,,,
23 | JU,,,11,2021,97,,1383,7.0,https://www.jura.ch/Htdocs/Files/v/37180.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport_hebdo_COVID_JU_sem11_2021.pdf,,,,,,,,
24 | JU,,,12,2021,104,,1715,6.0,https://www.jura.ch/Htdocs/Files/v/37241.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport-hebdomadaire.pdf,,,,,,,,
25 | JU,,,13,2021,148,,2116,7.0,https://www.jura.ch/Htdocs/Files/v/37276.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/Rapport_hebdo_COVID_sem13.pdf,,,,,,,,
26 | JU,,,14,2021,110,,1205,8.0,https://www.jura.ch/Htdocs/Files/v/37332.pdf/Departements/CHA/SIC/Carrousel/Coronavirus/Chiffres/rapport-hebdo-COVID-sem14.pdf,,,,,,,,
27 |
--------------------------------------------------------------------------------
/scrapers/scrape_bl_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import scrape_common as sc
6 | import scrape_bl_common as sbc
7 | from collections import defaultdict, OrderedDict
8 | from datetime import datetime
9 |
10 | main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft"
11 | main_site = sc.download(main_url, silent=True)
12 |
13 | # 2020-04-08, two iframes
14 | """
15 |
16 |
17 | """
18 |
19 |
20 | def parse_row_date(s):
21 | return sbc.parse_bl_date(s)[0]
22 |
23 |
24 | rows = defaultdict(dict)
25 | soup = BeautifulSoup(main_site, 'html.parser')
26 | for iframe in soup.find_all('iframe'):
27 | iframe_url = (iframe['src'])
28 |
29 | if iframe_url.find('/dbw/360') <= 0:
30 | continue
31 |
32 | d = sc.download(iframe_url, silent=True)
33 |
34 | # 2020-07-29
35 | """
36 |
37 | Datum,"Personen in Isolation","Personen in Quarantäne (Tracing)","Personen in Quarantäne (Rückreise Risikoländer)"
38 | 11-05-2020,0.0,0.0,
39 | """
40 |
41 | d = d.replace('\n', ' ')
42 |
43 | # district data!
44 | data = sc.find(r' ?Datum,"Bezirk Arlesheim","Bezirk Laufen","Bezirk Liestal","Bezirk Sissach","Bezirk Waldenburg"\s*([^<]+)', d)
45 | if data:
46 | # take "Fallzahlen Bezirke BL ab Juni 2020", but not the 14d averaged one
47 | for row in data.split(" "):
48 | c = row.split(',')
49 | assert len(c) == 6, f"Number of fields changed, {len(c)} != 6"
50 | row_date = parse_row_date(c[0])
51 | rows[row_date]['date'] = row_date
52 | rows[row_date]['Arlesheim'] = sc.safeint(c[1])
53 | rows[row_date]['Laufen'] = sc.safeint(c[2])
54 | rows[row_date]['Liestal'] = sc.safeint(c[3])
55 | rows[row_date]['Sissach'] = sc.safeint(c[4])
56 | rows[row_date]['Waldenburg'] = sc.safeint(c[5])
57 | break
58 |
59 | assert rows, "Couldn't find district data in iframes"
60 |
61 | # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html
62 | district_ids = {
63 | 'Arlesheim': 1301,
64 | 'Laufen': 1302,
65 | 'Liestal': 1303,
66 | 'Sissach': 1304,
67 | 'Waldenburg': 1305,
68 | }
69 |
70 | # https://www.statistik.bl.ch/web_portal/1
71 | population = {
72 | 'Arlesheim': 157253,
73 | 'Laufen': 20141,
74 | 'Liestal': 61201,
75 | 'Sissach': 36051,
76 | 'Waldenburg': 16119,
77 | }
78 |
79 | # based on https://github.com/openZH/covid_19/issues/1185#issuecomment-709952315
80 | initial_cases = {
81 | 'Arlesheim': 0,
82 | 'Laufen': 0,
83 | 'Liestal': 0,
84 | 'Sissach': 0,
85 | 'Waldenburg': 0,
86 | }
87 |
88 | # order dict by key to ensure the most recent entry is last
89 | ordered_rows = OrderedDict(sorted(rows.items()))
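# new_cases is derived below as the difference between consecutive cumulative totals per district,
# e.g. (hypothetical values) a total of 10 on one day and 14 on the next yields new_cases = 4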
90 |
91 | #for row_date, row in ordered_rows.items():
92 | # for district, district_id in district_ids.items():
93 |
94 | for district, district_id in district_ids.items():
95 | last_total_cases_val = initial_cases[district]
96 | if district == 'Arlesheim':
97 | # 2020-05-31 is 527
98 | last_total_cases_val = 0
99 |
100 | for row_date, row in ordered_rows.items():
101 | dd = sc.DistrictData(canton='BL', district=district)
102 | dd.district_id = district_id
103 | dd.population = population[district]
104 | dd.url = main_url
105 | dd.date = row['date']
106 | dd.total_cases = row[district] + initial_cases[district]
107 | dd.new_cases = dd.total_cases - last_total_cases_val
108 | last_total_cases_val = dd.total_cases
109 | print(dd)
110 |
--------------------------------------------------------------------------------
/scrapers/scrape_ag_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from bs4 import BeautifulSoup
4 | import re
5 | import datetime
6 | import scrape_common as sc
7 | from scrape_dates import parse_date
8 | import cv2
9 | import pytesseract
10 | import numpy as np
11 | import tempfile
12 | import os
13 |
14 |
15 | districts = {
16 | 'Baden': {
17 | 'pattern': r'^Baden.*',
18 | 'district_id': '1902',
19 | 'population': 145696,
20 | },
21 | 'Muri': {
22 | 'pattern': r'^Muri.*',
23 | 'district_id': '1908',
24 | 'population': 37170,
25 | },
26 | 'Lenzburg': {
27 | 'pattern': r'^Lenzburg.*',
28 | 'district_id': '1907',
29 | 'population': 64792,
30 | },
31 | 'Zofingen': {
32 | 'pattern': r'^Zo.+ngen.*',
33 | 'district_id': '1910',
34 | 'population': 73136,
35 | },
36 | 'Aarau': {
37 | 'pattern': r'^Aarau.*',
38 | 'district_id': '1901',
39 | 'population': 79702,
40 | },
41 | 'Bremgarten': {
42 | 'pattern': r'^Bremga.+en.*',
43 | 'district_id': '1903',
44 | 'population': 78745,
45 | },
46 | 'Brugg': {
47 | 'pattern': r'^Brugg.*',
48 | 'district_id': '1904',
49 | 'population': 51814,
50 | },
51 | 'Kulm': {
52 | 'pattern': r'^Kulm.*',
53 | 'district_id': '1905',
54 | 'population': 42412,
55 | },
56 | 'Laufenburg': {
57 | 'pattern': r'^Laufen.*burg.*',
58 | 'district_id': '1906',
59 | 'population': 33035,
60 | },
61 | 'Rheinfelden': {
62 | 'pattern': r'^Rheinfelden.*',
63 | 'district_id': '1909',
64 | 'population': 47926,
65 | },
66 | 'Zurzach': {
67 | 'pattern': r'^Z.+zach.*',
68 | 'district_id': '1911',
69 | 'population': 34650,
70 | },
71 | }
72 |
73 | data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
74 | d = sc.download(data_url, silent=True)
75 | soup = BeautifulSoup(d, 'html.parser')
76 | img_caption = soup.find(string=re.compile(r".*Inzidenz pro 100'000 Einwohner nach Bezirke.*"))
77 | img_date = sc.find(r'\(Stand:?\s+(.*\d{4})', img_caption.string)
78 | img_date = datetime.datetime.fromisoformat(parse_date(img_date).split('T', 1)[0])
79 | img_url = img_caption.find_previous('img')['src']
80 | img_url = 'https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/bilder_11/daten/Inzidenz_pro_100K_Einwohner_content_large.jpg'
81 | if not img_url.startswith('http'):
82 | img_url = f'https://www.ag.ch{img_url}'
83 |
84 | # download the image to a temporary file
85 | _, path = tempfile.mkstemp(suffix='.jpg')
86 | sc.download_file(img_url, path)
87 |
88 | # convert to binary image
89 | img = cv2.imread(path)
90 | gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
91 | gray, img_bin = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
92 | gray = cv2.bitwise_not(img_bin)
93 |
94 | # improve image and extract text
95 | kernel = np.ones((2, 1), np.uint8)
96 | img = cv2.erode(gray, kernel, iterations=1)
97 | img = cv2.dilate(img, kernel, iterations=1)
98 | #cv2.imshow('img', img)
99 | #cv2.waitKey(0)
100 | custom_config = '--oem 3 --psm 6'
101 | text_in_img = pytesseract.image_to_string(img, config=custom_config)
102 |
103 | # delete the temp img file
104 | os.remove(path)
105 |
106 | def parse_line(line):
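# translate characters that the OCR step presumably misreads in the number columns
# (O -> 0, B -> 8, F -> 7, T -> 7) before converting them to integers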
107 | in_str = "OBFT"
108 | out_str = "0877"
109 | tab = str.maketrans(in_str, out_str)
110 | match = re.match(r'^(.*)\s+(?:[_-]\s+)?(\S+)\s+(\S+)\s+(\S+)$', line)
111 | if match:
112 | return (int(match[3].replace("'", "").translate(tab)), int(match[4].replace("'", "").translate(tab)))
113 | return (None, None)
114 |
115 | for name, config in districts.items():
116 | for line in text_in_img.split('\n'):
117 | dd = sc.DistrictData(canton='AG', district=name)
118 | dd.district_id = config['district_id']
119 | dd.url = data_url
120 | if re.search(config['pattern'], line, flags=re.I):
121 | population, total_cases = parse_line(line)
122 | assert population == config['population'], f"Population number for {name} does not match, {population} != {config['population']}"
123 | dd.date = img_date.date().isoformat()
124 | dd.population = population
125 | dd.total_cases = total_cases
126 | break
127 | assert dd, f"No data found for district {name}, Text: {text_in_img}"
128 | print(dd)
129 |
--------------------------------------------------------------------------------
/scrapers/scrape_vs_districts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 |
5 | import scrape_common as sc
6 | import scrape_vs_common as svc
7 |
8 | # get the latest weekly PDF
9 | url = svc.get_vs_latest_weekly_pdf_url()
10 |
11 | # fetch the PDF
12 | pdf = sc.download_content(url, silent=True)
13 | week, year = svc.get_vs_weekly_general_data(pdf)
14 |
15 | # second last page contains the district data
16 | pages = int(sc.pdfinfo(pdf))
17 | page = None
18 | for p in range(1, pages):
19 | content = sc.pdftotext(pdf, page=p, layout=True)
20 | if sc.find(r'(Geografische)\s+.*', content):
21 | page = p
22 | break
23 |
24 | assert page is not None, "no page with district data found"
25 | content = sc.pdftotext(pdf, page=page, layout=True, rect=[0, 443, 420, 50], fixed=2)
26 |
27 | # strip everything including the "Anzahl Faelle" column + values
28 | def strip_left_number(content):
29 | lines = content.split('\n')
30 | pos = None
31 | for line in lines:
32 | res = re.search(r'\s+(\d+) ', line)
33 | if res is not None:
34 | if pos is None:
35 | pos = res.end()
36 | else:
37 | pos = min(pos, res.end())
38 | new_content = []
39 | for line in lines:
40 | new_content.append(line[pos:])
41 | return '\n'.join(new_content)
42 |
43 |
44 | # strip from the right the "Inzidenz pro 100k Einwohner" column / description
45 | def strip_right_items(content):
46 | lines = content.split('\n')
47 | pos = None
48 | for line in lines:
49 | res = re.search(r'(\d+|\d+\.\d+)\s?$', line)
50 | if res is not None:
51 | if pos is None:
52 | pos = res.start()
53 | else:
54 | pos = max(pos, res.start())
55 | new_content = []
56 | for line in lines:
57 | new_content.append(line[:pos])
58 | return '\n'.join(new_content)
59 |
60 | # kill the left and right axis
61 | content = strip_left_number(content)
62 | # content = strip_right_items(content)
63 |
64 | # remove strange characters at the end of the string
65 | #content = content.rstrip()
66 |
67 | """
68 | this results in something like this (13 columns expected for the districts)
69 |
70 | 6.6
71 |
72 | 9 6 7 2 5 8 15 1 6 16
73 | """
74 |
75 | # approximate the width of each "column" in the table
76 | # take the maximum line length and divide it to approximate the width of the 13 district columns
77 | length = None
78 | for line in content.split('\n'):
79 | line_length = len(line)
80 | if length is None:
81 | length = line_length
82 | else:
83 | length = max(line_length, length)
84 | length = round(length / 14.5)
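# e.g. a longest line of 145 characters gives a column width of round(145 / 14.5) = 10,
# so district i is then read from the character slice [i * 10, (i + 1) * 10)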
85 |
86 | # split up all lines by the length and use the "lowest line" value
87 | district_values = []
88 | for i in range(0, 13):
89 | value = ''
90 | for line in content.split('\n'):
91 | val = line[i * length:(i + 1) * length].strip()
92 | if val != '':
93 | value = val
94 | if value == '':
95 | value = 0
96 | district_values.append(int(value))
97 |
98 |
99 | # this is the order in the PDF
100 | districts = [
101 | 'Goms',
102 | 'Raron',
103 | 'Brig',
104 | 'Visp',
105 | 'Leuk',
106 | 'Sierre',
107 | 'Herens',
108 | 'Sion',
109 | 'Conthey',
110 | 'Martigny',
111 | 'Entremont',
112 | 'St-Maurice',
113 | 'Monthey',
114 | ]
115 |
116 | district_ids = [
117 | 2304,
118 | 2309,
119 | 2301,
120 | 2313,
121 | 2306,
122 | 2311,
123 | 2305,
124 | 2312,
125 | 2302,
126 | 2307,
127 | 2303,
128 | 2310,
129 | 2308,
130 | ]
131 |
132 | population = [
133 | 4440,
134 | 10930,
135 | 26910,
136 | 28650,
137 | 12360,
138 | 49230,
139 | 10860,
140 | 47750,
141 | 28910,
142 | 47980,
143 | 15260,
144 | 13830,
145 | 46840,
146 | ]
147 |
148 |
149 | assert len(district_values) == 13, f'expected 13 district values, but got {len(district_values)} for {url}'
150 | i = 0
151 | for value in district_values:
152 | dd = sc.DistrictData(canton='VS', district=districts[i])
153 | dd.url = url
154 | dd.district_id = district_ids[i]
155 | dd.population = population[i]
156 | dd.week = week
157 | dd.year = year
158 | dd.new_cases = value
159 | print(dd)
160 | i += 1
161 |
--------------------------------------------------------------------------------
/scrapers/scrape_vd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | import re
6 | import sys
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import scrape_common as sc
10 | import scrape_vd_common as svc
11 |
12 |
13 | def parse_html():
14 | # https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/
15 | # includes content from datawrapper ( https://datawrapper.dwcdn.net/tr5bJ/14/ ),
16 | # which provides the actual data and table rendering.
17 | # Here we instead use the datawrapper API directly to fetch the data.
18 | main_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
19 | url = 'https://api.datawrapper.de/v3/charts/tr5bJ/data'
20 | print('Downloading:', main_url)
21 | # The bearer authentication token was provided by Alex Robert ( https://github.com/AlexBobAlex )
22 | data = requests.get(url,
23 | headers={'accept': 'text/csv',
24 | 'Authorization': 'Bearer 6868e7b3be4d7a69eff00b1a434ea37af3dac1e76f32d9087fc544dbb3f4e229'})
25 | d = data.text
26 |
27 | # Date Hospitalisations en cours Dont soins intensifs Sortis de l'hôpital Décès Total cas confirmés
28 | # 10.03.2020 36 8 5 1 130
29 | # 11.03.2020 38 7 5 3 200
30 |
31 | rows = d.split('\n')
32 |
33 | # Remove empty rows
34 | rows = [row for row in rows if len(row.strip())]
35 |
36 | headers = rows[0].split('\t')
37 | assert headers[0:6] == ["Date", "Hospitalisations en cours", "Dont soins intensifs", "Sortis de l'hôpital", "Décès", "Total cas confirmés"], f"Table header mismatch: Got: {headers}"
38 |
39 | is_first = True
40 | for row in rows:
41 | if not is_first:
42 | print('-' * 10)
43 | is_first = False
44 |
45 | cells = row.split('\t')
46 | print('VD')
47 | sc.timestamp()
48 | print('Downloading:', main_url)
49 | print('Date and time:', cells[0])
50 | print('Confirmed cases:', cells[5])
51 | print('Deaths:', cells[4])
52 | print('Hospitalized:', cells[1])
53 | print('ICU:', cells[2])
54 | if cells[3].isnumeric():
55 | print('Recovered:', cells[3])
56 |
57 |
58 | def parse_xlsx():
59 | html_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
60 | d = sc.download(html_url, silent=True)
61 | soup = BeautifulSoup(d, 'html.parser')
62 | xls_url = soup.find('a', string=re.compile("les donn.*es", flags=re.I)).get('href')
63 | assert xls_url, "URL is empty"
64 | xls = sc.xlsdownload(xls_url, silent=True)
65 | rows = [row for row in sc.parse_xls(xls, header_row=2) if isinstance(row['Date'], datetime.datetime)]
66 | is_first = True
67 | for row in sorted(rows, key=lambda row: row['Date'].date().isoformat()):
68 | if not is_first:
69 | print('-' * 10)
70 | is_first = False
71 |
72 | print('VD')
73 | sc.timestamp()
74 | print('Downloading:', html_url)
75 | print('Date and time:', row['Date'].date().isoformat())
76 | print('Confirmed cases:', row['Nombre total de cas confirmés positifs'])
77 | print('Hospitalized:', row['Hospitalisation en cours'])
78 | print('ICU:', row['Dont soins intensifs'])
79 | print('Deaths:', row['Décès parmi cas confirmés'])
80 |
81 |
82 | def text_to_int(text):
83 | return int(re.sub('[^0-9]', '', text))
84 |
85 |
86 | def parse_weekly_pdf():
87 | pdf_url = svc.get_weekly_pdf_url()
88 | pdf = sc.pdfdownload(pdf_url, silent=True)
89 |
90 | dd = sc.DayData(canton='VD', url=pdf_url)
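# the weekly bulletins typically contain a date line like "Situation au 12 avril 2021" (illustrative)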
91 | res = re.findall(r'Situation\s+au\s+(\d+\s+\w+\s+\d{4})', pdf, re.MULTILINE | re.DOTALL)
92 | if len(res) == 1:
93 | dd.datetime = res[0]
94 | dd.datetime = dd.datetime.replace('\n', ' ')
95 | if dd.datetime is None:
96 | dd.datetime = sc.find(r'Point .pid.miologique au (\d+\.\d+\.\d{4})', pdf)
97 | #dd.cases = text_to_int(sc.find(r'\s(\d+.\d+)\s+personnes ont .t. d.clar.es positives au SARS-CoV-2.', pdf))
98 | dd.hospitalized = sc.find(r'(\d+)\s+patients\s+(COVID-19\s+)?sont\s+(actuellement\s+)?hospitalis.s', pdf)
99 | dd.icu = sc.find(r'dont\s+(\d+)\s+en\s+soins\s+intensifs', pdf)
100 | assert dd
101 | print(dd)
102 | print('-' * 10)
103 |
104 |
105 | if __name__ == '__main__':
106 | parse_weekly_pdf()
107 | # parse_xlsx()
108 |
--------------------------------------------------------------------------------
/scripts/validate-csv.js:
--------------------------------------------------------------------------------
1 | const csv = require('csv-validator');
2 | const fs = require("fs").promises;
3 | const path = require("path");
4 |
5 | const csvFiles = process.argv.slice(2);
6 |
7 | const validateSequentially = async csvFiles => {
8 | //field names starting with `_` are optional
9 | const headers = {
10 | date: /^\d{4}-\d{2}-\d{2}$/,
11 | _time: /^(([0-1]?[0-9]|2[0-3]):[0-5][0-9])?$/,
12 | abbreviation_canton_and_fl: /^[A-Z]{2}$/,
13 | _ncumul_tested: /^(\d+)?$/,
14 | _ncumul_conf: /^(\d+)?$/,
15 | _new_hosp: /^(\d+)?$/,
16 | _current_hosp: /^(\d+)?$/,
17 | _current_icu: /^(\d+)?$/,
18 | _current_vent: /^(\d+)?$/,
19 | _ncumul_released: /^(\d+)?$/,
20 | _ncumul_deceased: /^(\d+)?$/,
21 | _source: '',
22 | _current_isolated: /^(\d+)?$/,
23 | _current_quarantined: /^(\d+)?$/
24 | };
25 | const requiredKeys = [
26 | "date",
27 | "time",
28 | "abbreviation_canton_and_fl",
29 | "ncumul_tested",
30 | "ncumul_conf",
31 | "new_hosp",
32 | "current_hosp",
33 | "current_icu",
34 | "current_vent",
35 | "ncumul_released",
36 | "ncumul_deceased",
37 | "source",
38 | "current_isolated",
39 | "current_quarantined"
40 | ];
41 |
42 | const cumulativeFields = [
43 | "ncumul_tested",
44 | "ncumul_conf",
45 | "ncumul_released",
46 | "ncumul_deceased"
47 | ];
48 |
49 |
50 | const csvCorrectionFilePath = path.resolve('correction_status.csv');
51 | const parsedCorrection = await csv(csvCorrectionFilePath, headers);
52 | let correction = {};
53 | parsedCorrection.forEach(function (item, index) {
54 | if (correction[item['date']] === undefined) {
55 | correction[item['date']] = {};
56 | }
57 | if (correction[item['date']][item['abbreviation_canton_and_fl']] === undefined) {
58 | correction[item['date']][item['abbreviation_canton_and_fl']] = {};
59 | }
60 | correction[item['date']][item['abbreviation_canton_and_fl']][item['column']] = 1;
61 | });
62 |
63 | let failedChecks = 0;
64 |
65 | for (let csvFile of csvFiles) {
66 | const csvFilePath = path.resolve(csvFile);
67 |
68 | try {
69 | // check if file can be parsed
70 | const parsed = await csv(csvFilePath, headers);
71 |
72 | //make sure all keys are present
73 | const hasAllKeys = requiredKeys.every(key => parsed[0].hasOwnProperty(key));
74 | if (!hasAllKeys) {
75 | throw new Error(`Required field missing`);
76 | }
77 |
78 | var last = {};
79 | var errors = [];
80 | var unique = {};
81 | var today = new Date();
82 | parsed.forEach(function (item, index) {
83 | // check if date is in the future
84 | var abbr = item['abbreviation_canton_and_fl'];
85 | var date = item['date'];
86 | var dateObj = new Date(date);
87 | if (dateObj.getTime() > today.getTime()) {
88 | errors.push(`Row ${index+2}: date ${date} is in the future.`);
89 | }
90 |
91 | // check if cumulative field only increase
92 | cumulativeFields.forEach(function(col, col_idx) {
93 | const skip = correction[date] !== undefined && correction[date][abbr] !== undefined && correction[date][abbr][col] !== undefined;
94 | if (col in last && last[col] && item[col] && parseInt(item[col]) < parseInt(last[col]) && !skip) {
95 | errors.push(`Row ${index+2}: cumulative field ${col}: ${item[col]} < ${last[col]}`);
96 | }
97 | if (item[col]) {
98 | last[col] = item[col];
99 | }
100 | });
101 |
102 |
103 | // check if there is only one entry per area and date
104 | if (!(date in unique)) {
105 | unique[date] = {};
106 | }
107 | if (abbr in unique[date]) {
108 | unique[date][abbr] += 1;
109 | errors.push(`Row ${index+2}: duplicate entry for date ${date}`);
110 | } else {
111 | unique[date][abbr] = 1;
112 | }
113 | });
114 | if (errors.length > 0) {
115 | throw new Error(errors.join("\n"));
116 | }
117 | } catch (e) {
118 | failedChecks++;
119 | console.log(`× ${csvFile} failed the following checks:\n${e}`);
120 | continue;
121 | }
122 | console.log(`✓ ${csvFile} is valid.`);
123 | }
124 |
125 | return failedChecks;
126 | };
127 |
128 | const run = async () => {
129 | const failedChecks = await validateSequentially(csvFiles);
130 |
131 | if (failedChecks > 0) {
132 | process.exit(1);
133 | }
134 | };
135 |
136 | run().catch(e => console.error(e));
137 |
--------------------------------------------------------------------------------