├── .env.example ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── agent ├── Dockerfile ├── config.toml ├── docker-compose.yml └── requirements.txt ├── crawl.docker-compose.yml ├── create-certs.yml ├── docker-compose.yml ├── flow ├── Dockerfile ├── config.toml ├── requirements.txt └── scripts │ ├── UID_ISO_FIPS_LookUp_Table.csv │ ├── crawl_google_news.py │ ├── crawl_mapping.py │ ├── crawl_tweets.py │ ├── insert_france.py │ ├── insert_france_virtests.py │ ├── insert_owid.py │ ├── mapping.py │ └── parse_insert.py ├── illustrations ├── france_live_status.png ├── latest_news.png ├── live_dashboard.png ├── news_web_app.png └── vaccination_map.png ├── insert.docker-compose.yml ├── instances ├── pem.yml └── pkcs_12.yml ├── kibana.yml ├── news_app ├── app │ ├── .gitignore │ ├── Dockerfile │ ├── app │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── public │ │ │ ├── css │ │ │ │ └── bootstrap.css │ │ │ ├── index.html │ │ │ ├── logo.png │ │ │ ├── manifest.json │ │ │ └── robots.txt │ │ └── src │ │ │ ├── About.js │ │ │ ├── App.js │ │ │ ├── App.test.js │ │ │ ├── Home.js │ │ │ ├── Layout.js │ │ │ ├── NavigationBar.js │ │ │ ├── SearchUI.js │ │ │ ├── index.css │ │ │ ├── index.js │ │ │ └── serviceWorker.js │ ├── entrypoint.sh │ └── package-lock.json └── docker-compose.yml ├── pandemic_knowledge.png └── prefect ├── Dockerfile └── prefect.config /.env.example: -------------------------------------------------------------------------------- 1 | PREFECT_UI_TAG="latest" 2 | PREFECT_SERVER_TAG="latest" 3 | 4 | POSTGRES_USER="prefect_user" 5 | # PLEASE CHANGE ! 6 | POSTGRES_PASSWORD="prefect_password" 7 | POSTGRES_DB="prefect_db" 8 | # PLEASE CHANGE according to POSTGRES_PASSWORD ! 9 | DB_CONNECTION_URL="postgresql://prefect_user:prefect_password@prefect_postgres:5432/prefect_db" 10 | 11 | PREFECT_SERVER_DB_CMD="prefect-server database upgrade -y" 12 | # PLEASE CHANGE ! 
13 | PREFECT_SERVER__HASURA__ADMIN_SECRET="hasura-secret-admin-secret" 14 | PREFECT_SERVER__TELEMETRY__ENABLED="false" 15 | PREFECT_SERVER__APOLLO_URL="http://localhost:4200/graphql" 16 | 17 | MINIO_SCHEME=http 18 | MINIO_ENDPOINT=172.17.0.1:9000 19 | MINIO_ACCESS_KEY=minio 20 | MINIO_SECRET_KEY=minio123 21 | 22 | ELASTIC_SCHEME=https 23 | ELASTIC_PORT=9200 24 | ELASTIC_ENDPOINT=172.17.0.1 25 | ELASTIC_USER=elastic 26 | ELASTICSEARCH_PWD=elastic 27 | MAX_ES_ROW_INJECT=1000 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .env 3 | certs/ 4 | node_modules/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | services: 2 | - docker 3 | env: 4 | - DOCKER_COMPOSE_VERSION=1.23.2 5 | 6 | before_install: 7 | - sudo rm /usr/local/bin/docker-compose 8 | - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose 9 | - chmod +x docker-compose 10 | - sudo mv docker-compose /usr/local/bin 11 | 12 | script: 13 | - cp .env.example .env 14 | - docker-compose build 15 | - docker-compose -f crawl.docker-compose.yml build 16 | - docker-compose -f insert.docker-compose.yml build 17 | - docker-compose -f agent/docker-compose.yml build 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Flavien Berwick 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pandemic-Knowledge 2 | 3 | ![Pandemic Knowledge logo](./pandemic_knowledge.png) 4 | 5 |

6 | 7 | 8 | 9 | Code style: black 10 | 11 |

12 | 13 | A fully-featured multi-source data pipeline for continuously extracting knowledge from COVID-19 data. 14 | 15 | - Contamination figures 16 | - Vaccination figures 17 | - Death figures 18 | - COVID-19-related news (Google News, Twitter) 19 | 20 | ## What you can achieve 21 | 22 | | Live contaminations map + Latest news | Last 7 days news | 23 | | :---------------------------------------------------------------------------------: | :---------------------------------------------------: | 24 | | ![Live contamination and vaccination world map](./illustrations/live_dashboard.png) | ![Last news, live !](./illustrations/latest_news.png) | 25 | 26 | | France 3-weeks live map (Kibana Canvas) | Live vaccinations map | 27 | | :-----------------------------------------------------------: | :-----------------------------------------------------------: | 28 | | ![France Live Status](./illustrations/france_live_status.png) | ![World vaccination map](./illustrations/vaccination_map.png) | 29 | 30 | ## Context 31 | 32 | This project was realized over 4 days as part of a MSc hackathon from [ETNA](https://etna.io), a french computer science school. 33 | 34 | The incentives were both to experiment/prototype a big data pipeline and contribute to an open source project. 35 | 36 | ## Install 37 | 38 | Below, you'll find the procedure to process COVID-related file and news into the Pandemic Knowledge database (elasticsearch). 39 | 40 | The process is **scheduled** to run every 24 hours so you can update the files and obtain the latest news 41 | 42 | - [Pandemic-Knowledge](#pandemic-knowledge) 43 | - [What you can achieve](#what-you-can-achieve) 44 | - [Context](#context) 45 | - [Install](#install) 46 | - [Env file](#env-file) 47 | - [Initialize elasticsearch](#initialize-elasticsearch) 48 | - [Initialize Prefect](#initialize-prefect) 49 | - [Run Prefect workers](#run-prefect-workers) 50 | - [COVID-19 data](#covid-19-data) 51 | - [News data](#news-data) 52 | - [News web app](#news-web-app) 53 | 54 | ### Env file 55 | 56 | Running this project on your local computer ? Just copy the `.env.example` file : 57 | 58 | ```bash 59 | cp .env.example .env 60 | ``` 61 | 62 | Open this `.env` file and edit password-related variables. 63 | 64 | ### Initialize elasticsearch 65 | 66 | Raise your host's ulimits for ElasticSearch to handle high I/O : 67 | 68 | ```bash 69 | sudo sysctl -w vm.max_map_count=500000 70 | ``` 71 | 72 | Then : 73 | 74 | ```bash 75 | docker-compose -f create-certs.yml run --rm create_certs 76 | docker-compose up -d es01 es02 es03 kibana 77 | ``` 78 | 79 | ### Initialize Prefect 80 | 81 | Create a `~/.prefect/config.toml` file with the following content : 82 | 83 | ```bash 84 | # debug mode 85 | debug = true 86 | 87 | # base configuration directory (typically you won't change this!) 88 | home_dir = "~/.prefect" 89 | 90 | backend = "server" 91 | 92 | [server] 93 | host = "http://172.17.0.1" 94 | port = "4200" 95 | host_port = "4200" 96 | endpoint = "${server.host}:${server.port}" 97 | ``` 98 | 99 | Run Prefect : 100 | 101 | ```bash 102 | docker-compose up -d prefect_postgres prefect_hasura prefect_graphql prefect_towel prefect_apollo prefect_ui 103 | ``` 104 | 105 | We need to create a _tenant_. 
Execute on your host : 106 | 107 | ```bash 108 | pip3 install prefect 109 | prefect backend server 110 | prefect server create-tenant --name default --slug default 111 | ``` 112 | 113 | Access the web UI at [localhost:8081](http://localhost:8081) 114 | 115 | ### Run Prefect workers 116 | 117 | Agents are services that run your scheduled flows. 118 | 119 | 1. Open and optionally edit the [`agent/config.toml`](./agent/config.toml) file. 120 | 121 | 2. Let's instantiate 3 workers : 122 | 123 | ```bash 124 | docker-compose -f agent/docker-compose.yml up -d --build --scale agent=3 agent 125 | ``` 126 | 127 | > :information_source: You can run the agent on a different machine from the one hosting the Prefect server. Edit the [`agent/config.toml`](./agent/config.toml) file for that. 128 | 129 | ### COVID-19 data 130 | 131 | Injection scripts are scheduled in Prefect so they automatically re-inject the latest data (delete + inject). 132 | 133 | There are several data sources supported by Pandemic Knowledge : 134 | 135 | - [Our World In Data](https://ourworldindata.org/coronavirus-data); used by Google 136 | - docker-compose slug : `insert_owid` 137 | - MinIO bucket : `contamination-owid` 138 | - Format : CSV 139 | - [OpenCovid19-Fr](https://github.com/opencovid19-fr/data) 140 | - docker-compose slug : `insert_france` 141 | - Format : CSV (downloaded from the Internet) 142 | - [Public Health France - Virological test results](https://www.data.gouv.fr/en/datasets/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/) (official source) 143 | - docker-compose slug : `insert_france_virtests` 144 | - Format : CSV (downloaded from the Internet) 145 | 146 | 1. Start MinIO and import your files into the buckets mentioned above. 147 | 148 | For _Our World In Data_, create the `contamination-owid` bucket and import the CSV file inside. 149 | 150 | ```bash 151 | docker-compose up -d minio 152 | ``` 153 | 154 | > MinIO is available at `localhost:9000` 155 | 156 | 2. Download dependencies and start the injection service of your choice. For instance : 157 | 158 | ```bash 159 | pip3 install -r ./flow/requirements.txt 160 | docker-compose -f insert.docker-compose.yml up --build insert_owid 161 | ``` 162 | 163 | 3. In [Kibana](https://localhost:5601), create an index pattern `contamination_owid_*` 164 | 165 | 4. Once injected, we recommend adjusting the number of replicas [in the DevTool](https://localhost:5601/app/dev_tools#/console) : 166 | 167 | ```json 168 | PUT /contamination_owid_*/_settings 169 | { 170 | "index" : { 171 | "number_of_replicas" : "2" 172 | } 173 | } 174 | ``` 175 | 176 | 5. Start making your dashboards in [Kibana](https://localhost:5601) ! 177 | 178 | ### News data 179 | 180 | There are two sources for news : 181 | 182 | - Google News (elasticsearch index: `news_googlenews`) 183 | - Twitter (elasticsearch index: `news_tweets`) 184 | 185 | 1. Run the Google News crawler : 186 | 187 | ```bash 188 | docker-compose -f crawl.docker-compose.yml up --build crawl_google_news # and/or crawl_tweets 189 | ``` 190 | 191 | 2. In Kibana, create a `news_*` index pattern 192 | 193 | 3. **Edit** the index pattern fields : 194 | 195 | | Name | Type | Format | 196 | | ---- | ----------------------------------------------------- | ------- | 197 | | img | string | **Url** | 198 | | link | string **with Type: Image** with empty _URL template_ | **Url** | 199 | 200 | 4. Create your visualisations 201 | 202 | ### News web app 203 | 204 | Browse through the news with our web application.
205 | 206 | ![News web app](./illustrations/news_web_app.png) 207 | 208 | 1. Make sure you've accepted the self-signed certificate of Elasticsearch at [`https://localhost:9200`](https://localhost:9200) 209 | 210 | 2. Start up the app 211 | 212 | ```bash 213 | docker-compose -f news_app/docker-compose.yml up --build -d 214 | ``` 215 | 216 | 3. Discover the app at [`localhost:8080`](http://localhost:8080) 217 | 218 | --- 219 | 220 |
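If you want to query the crawled news outside of Kibana or the web app, here is a minimal sketch using the `elasticsearch` client already pinned in `flow/requirements.txt`. It assumes the default endpoint and credentials from `.env.example` and the `news_*` indices created by the crawlers; adapt them to your setup.

```python
from elasticsearch import Elasticsearch

# Default credentials from .env.example; change them in production.
es = Elasticsearch(
    ["https://localhost:9200"],
    http_auth=("elastic", "elastic"),
    verify_certs=False,  # self-signed certificates generated by create-certs.yml
)

# Five most recent news items crawled over the last 7 days, newest first.
response = es.search(
    index="news_*",
    body={
        "size": 5,
        "sort": [{"date": {"order": "desc"}}],
        "query": {"range": {"date": {"gte": "now-7d/d"}}},
    },
)

for hit in response["hits"]["hits"]:
    print(hit["_source"]["date"], hit["_source"]["title"])
```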
221 | TODOs 222 | 223 | Possible improvements : 224 | 225 | - [ ] [Using Dask for parallelizing](https://docs.prefect.io/core/idioms/parallel.html) the processing of CSV lines in batches of 1000 (see the sketch below) 226 | - [ ] Removing indices only when the source injection process is successful (add the new index first, then remove the old one) 227 | - [ ] Removing indices only when crawling is successful (add the new index first, then remove the old one) 228 | 229 |
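The first item above could look roughly like the sketch below. This is not the project's current implementation: it assumes Prefect 0.14 with Dask (`distributed`) installed, and the `load_rows` / `inject_batch` bodies are stand-ins for the existing `parse_file()` and `inject_rows_to_es()` helpers from `flow/scripts/`.

```python
from prefect import Flow, task
from prefect.executors import DaskExecutor

BATCH_SIZE = 1000

@task
def load_rows():
    # Stand-in for parse_file() in flow/scripts/insert_owid.py.
    return [{"location_name": "France", "confirmed": 1}] * 2500

@task
def make_batches(rows):
    # Split the parsed rows into fixed-size batches so each batch becomes one bulk request.
    return [rows[i : i + BATCH_SIZE] for i in range(0, len(rows), BATCH_SIZE)]

@task
def inject_batch(batch):
    # Stand-in for inject_rows_to_es(batch, index_name).
    print(f"bulk-indexing {len(batch)} rows")

with Flow("insert-owid-dask") as flow:
    batches = make_batches(load_rows())
    inject_batch.map(batches)  # one mapped task per batch, executed in parallel

if __name__ == "__main__":
    flow.run(executor=DaskExecutor())  # spawns a temporary local Dask cluster
```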
230 | 231 |
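The last two TODO items above describe the same pattern: write into a fresh index and only drop the old one once the new data is fully injected. Below is a hedged sketch of how that swap could look with the `elasticsearch` client; the `swap_index` helper and the `contamination_owid` alias name are illustrative, not part of the current code base.

```python
def swap_index(es, alias, new_index):
    """Atomically point `alias` at `new_index` and drop whatever it pointed to before."""
    old_indices = []
    if es.indices.exists_alias(name=alias):
        old_indices = list(es.indices.get_alias(name=alias).keys())
    actions = [{"add": {"index": new_index, "alias": alias}}]
    actions += [{"remove_index": {"index": old}} for old in old_indices]
    es.indices.update_aliases(body={"actions": actions})

# Intended usage inside a flow (es comes from the existing get_es_instance() helper):
#   new_index = "contamination_owid_20210501"   # e.g. a timestamped index name
#   es.indices.create(index=new_index, body=mapping)
#   ...bulk-inject into new_index, and only if that succeeds:
#   swap_index(es, "contamination_owid", new_index)
```

Dashboards would then read from the stable alias instead of the concrete index name, so a failed injection never leaves them empty.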
232 | Useful commands 233 | 234 | To stop everything : 235 | 236 | ```bash 237 | docker-compose down 238 | docker-compose -f agent/docker-compose.yml down 239 | docker-compose -f insert.docker-compose.yml down 240 | docker-compose -f crawl.docker-compose.yml down 241 | ``` 242 | 243 | To start each service, step by step : 244 | 245 | ```bash 246 | docker-compose up -d es01 es02 es03 kibana 247 | docker-compose up -d minio 248 | docker-compose up -d prefect_postgres prefect_hasura prefect_graphql prefect_towel prefect_apollo prefect_ui 249 | docker-compose -f agent/docker-compose.yml up -d --build --scale agent=3 agent 250 | ``` 251 | 252 |
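Before launching the flows, you may also want to check that the secured Elasticsearch cluster answers. A quick sanity check, assuming the default credentials from `.env.example` (`-k` skips verification of the self-signed certificate):

```bash
curl -k -u elastic:elastic https://localhost:9200/_cluster/health?pretty
curl -k -u elastic:elastic "https://localhost:9200/_cat/indices/news_*,contamination_*?v"
```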
253 | -------------------------------------------------------------------------------- /agent/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | RUN apt update && apt install uuid -y 4 | RUN pip install --upgrade pip 5 | 6 | COPY ./requirements.txt /requirements.txt 7 | 8 | RUN pip install -r /requirements.txt -------------------------------------------------------------------------------- /agent/config.toml: -------------------------------------------------------------------------------- 1 | # debug mode 2 | debug = true 3 | 4 | # base configuration directory (typically you won't change this!) 5 | home_dir = "~/.prefect" 6 | 7 | backend = "server" 8 | 9 | [server] 10 | host = "http://172.17.0.1" 11 | port = "4200" 12 | host_port = "4200" 13 | endpoint = "${server.host}:${server.port}" -------------------------------------------------------------------------------- /agent/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | agent: 6 | restart: always 7 | build: . 8 | dns: 8.8.8.8 9 | command: bash -c "prefect agent local start --name $$(uuid) --no-hostname-label --label development" 10 | volumes: 11 | - /srv/docker/prefect/flows:/root/.prefect/flows 12 | - type: bind 13 | source: ./config.toml 14 | target: /root/.prefect/config.toml 15 | read_only: true 16 | -------------------------------------------------------------------------------- /agent/requirements.txt: -------------------------------------------------------------------------------- 1 | prefect==0.14.16 2 | minio==7.0.3 3 | clevercsv==0.6.7 4 | tqdm==4.60.0 5 | elasticsearch==7.12.0 6 | geopy==2.1.0 7 | iso3166==1.0.1 8 | dateparser==1.0.0 9 | GoogleNews==1.5.7 10 | snscrape==0.3.4 11 | pandas==1.2.4 12 | -------------------------------------------------------------------------------- /crawl.docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | crawl_google_news: 6 | build: ./flow 7 | command: python3 /usr/app/crawl_google_news.py 8 | volumes: 9 | - /srv/docker/prefect/flows:/root/.prefect/flows 10 | - "./flow/scripts:/usr/app:ro" 11 | - type: bind 12 | source: ./flow/config.toml 13 | target: /root/.prefect/config.toml 14 | read_only: true 15 | env_file: 16 | - .env 17 | environment: 18 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 19 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 20 | ELASTIC_PORT: ${ELASTIC_PORT} 21 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 22 | ELASTIC_USER: ${ELASTIC_USER} 23 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 24 | 25 | crawl_tweets: 26 | build: ./flow 27 | command: python3 /usr/app/crawl_tweets.py 28 | volumes: 29 | - /srv/docker/prefect/flows:/root/.prefect/flows 30 | - "./flow/scripts:/usr/app:ro" 31 | - type: bind 32 | source: ./flow/config.toml 33 | target: /root/.prefect/config.toml 34 | read_only: true 35 | env_file: 36 | - .env 37 | environment: 38 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 39 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 40 | ELASTIC_PORT: ${ELASTIC_PORT} 41 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 42 | ELASTIC_USER: ${ELASTIC_USER} 43 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 44 | -------------------------------------------------------------------------------- /create-certs.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | 3 | services: 4 | 5 | create_certs: 6 | container_name: create_certs 7 | image: 
docker.elastic.co/elasticsearch/elasticsearch:7.10.0 8 | command: > 9 | bash -c ' 10 | if [[ ! -f /certs/ca.zip ]]; then 11 | # Generating CA certificate 12 | bin/elasticsearch-certutil ca --silent --pem -out /certs/ca.zip; 13 | unzip /certs/ca.zip -d /certs; 14 | fi; 15 | if [[ ! -f /certs/pem.zip ]]; then 16 | # Generating PEM certificates (ElasticSearch nodes and Kibana) 17 | bin/elasticsearch-certutil cert --silent --pem --ca-cert "/certs/ca/ca.crt" --ca-key "/certs/ca/ca.key" --in config/certificates/pem.yml -out /certs/pem.zip; 18 | unzip /certs/pem.zip -d /certs; 19 | fi; 20 | if [[ ! -f /certs/pkcs_12.zip ]]; then 21 | # Generating PKCS#12 certificates (Enterprise Search) 22 | bin/elasticsearch-certutil cert --silent --pass "changeme" --ca-cert "/certs/ca/ca.crt" --ca-key "/certs/ca/ca.key" --in config/certificates/pkcs_12.yml -out /certs/pkcs_12.zip; 23 | unzip /certs/pkcs_12.zip -d /certs; 24 | fi; 25 | ' 26 | working_dir: /usr/share/elasticsearch 27 | volumes: ['./certs:/certs', './instances:/usr/share/elasticsearch/config/certificates'] 28 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | prefect_setup: 5 | build: ./prefect 6 | networks: 7 | - prefect-server 8 | 9 | prefect_postgres: 10 | restart: "always" 11 | image: "postgres:11" 12 | environment: 13 | POSTGRES_USER: ${POSTGRES_USER} 14 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 15 | POSTGRES_DB: ${POSTGRES_DB} 16 | volumes: 17 | - prefect_postgres:/var/lib/postgresql/data 18 | networks: 19 | - prefect-server 20 | healthcheck: 21 | test: pg_isready -q -d $${POSTGRES_DB} -U $${POSTGRES_USER} || exit 1 22 | interval: 10s 23 | timeout: 2s 24 | retries: 60 25 | start_period: 2s 26 | command: 27 | - "postgres" 28 | # explicitly set max connections 29 | - "-c" 30 | - "max_connections=150" 31 | 32 | prefect_hasura: 33 | restart: "always" 34 | image: "hasura/graphql-engine:v1.3.3" 35 | ports: 36 | - "3000:3000" 37 | command: "graphql-engine serve" 38 | environment: 39 | HASURA_GRAPHQL_DATABASE_URL: ${DB_CONNECTION_URL} 40 | HASURA_GRAPHQL_ENABLE_CONSOLE: "true" 41 | HASURA_GRAPHQL_SERVER_PORT: "3000" 42 | HASURA_GRAPHQL_QUERY_PLAN_CACHE_SIZE: 100 43 | HASURA_GRAPHQL_LOG_LEVEL: "warn" 44 | networks: 45 | - prefect-server 46 | healthcheck: 47 | test: wget -O - http://hasura:3000/healthz &>/dev/null || exit 1 48 | interval: 10s 49 | timeout: 2s 50 | retries: 60 51 | start_period: 1s 52 | depends_on: 53 | - prefect_postgres 54 | 55 | prefect_graphql: 56 | restart: "always" 57 | image: "prefecthq/server:latest" 58 | ports: 59 | - "4201:4201" 60 | command: bash -c "${PREFECT_SERVER_DB_CMD} && python src/prefect_server/services/graphql/server.py" 61 | environment: 62 | PREFECT_SERVER_DB_CMD: ${PREFECT_SERVER_DB_CMD:-"echo 'DATABASE MIGRATIONS SKIPPED'"} 63 | PREFECT_SERVER__DATABASE__CONNECTION_URL: ${DB_CONNECTION_URL} 64 | PREFECT_SERVER__HASURA__ADMIN_SECRET: ${PREFECT_SERVER__HASURA__ADMIN_SECRET:-hasura-secret-admin-secret} 65 | PREFECT_SERVER__HASURA__HOST: prefect_hasura 66 | networks: 67 | - prefect-server 68 | healthcheck: 69 | test: curl --fail --silent "http://prefect_graphql:4201/health" &> /dev/null || exit 1 70 | interval: 20s 71 | timeout: 2s 72 | retries: 60 73 | start_period: 1s 74 | depends_on: 75 | - prefect_hasura 76 | 77 | prefect_towel: 78 | restart: "always" 79 | image: "prefecthq/server:latest" 80 | command: "python 
src/prefect_server/services/towel/__main__.py" 81 | environment: 82 | PREFECT_SERVER__HASURA__ADMIN_SECRET: ${PREFECT_SERVER__HASURA__ADMIN_SECRET:-hasura-secret-admin-secret} 83 | PREFECT_SERVER__HASURA__HOST: prefect_hasura 84 | networks: 85 | - prefect-server 86 | depends_on: 87 | - prefect_graphql 88 | 89 | prefect_apollo: 90 | restart: "always" 91 | image: "prefecthq/apollo:latest" 92 | command: bash -c "./post-start.sh && npm run serve" 93 | ports: 94 | - 4200:4200 95 | environment: 96 | HASURA_API_URL: http://prefect_hasura:3000/v1alpha1/graphql 97 | PREFECT_API_URL: http://prefect_graphql:4201/graphql/ 98 | PREFECT_API_HEALTH_URL: http://prefect_graphql:4201/health 99 | PREFECT_SERVER__TELEMETRY__ENABLED: "false" 100 | GRAPHQL_SERVICE_HOST: http://prefect_graphql 101 | GRAPHQL_SERVICE_PORT: 4201 102 | networks: 103 | - prefect-server 104 | healthcheck: 105 | test: curl --fail --silent "http://prefect_apollo:4200/.well-known/apollo/server-health" &> /dev/null || exit 1 106 | interval: 10s 107 | timeout: 2s 108 | retries: 60 109 | start_period: 1s 110 | depends_on: 111 | - prefect_graphql 112 | 113 | prefect_ui: 114 | restart: "always" 115 | image: "prefecthq/ui:2021-02-23" 116 | ports: 117 | - "8081:8080" 118 | command: "/intercept.sh" 119 | environment: 120 | PREFECT_SERVER__APOLLO_URL: http://localhost:4200/graphql 121 | PREFECT_BACKEND: server 122 | networks: 123 | - prefect-server 124 | healthcheck: 125 | test: curl --fail --silent --head "http://prefect_ui:8080/" &> /dev/null || exit 1 126 | interval: 30s 127 | timeout: 5s 128 | retries: 3 129 | depends_on: 130 | - prefect_apollo 131 | 132 | es01: 133 | restart: always 134 | image: docker.elastic.co/elasticsearch/elasticsearch:7.12.0 135 | volumes: 136 | - "es01:/usr/share/elasticsearch/data" 137 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 138 | ports: 139 | - "9200:9200" 140 | environment: 141 | ES_JAVA_OPTS: "-Xmx512m -Xms512m" 142 | ELASTIC_PASSWORD: ${ELASTICSEARCH_PWD} 143 | node.name: es01 144 | cluster.name: es-docker-cluster 145 | discovery.seed_hosts: es02,es03 146 | cluster.initial_master_nodes: es01,es02,es03 147 | network.host: 0.0.0.0 148 | xpack.license.self_generated.type: basic 149 | xpack.monitoring.collection.enabled: "true" 150 | xpack.security.enabled: "true" 151 | xpack.security.http.ssl.enabled: "true" 152 | xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certificates/es01/es01.key 153 | xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 154 | xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certificates/es01/es01.crt 155 | xpack.security.transport.ssl.enabled: "true" 156 | xpack.security.transport.ssl.verification_mode: certificate 157 | xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 158 | xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certificates/es01/es01.crt 159 | xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certificates/es01/es01.key 160 | cluster.routing.allocation.disk.threshold_enabled: "true" 161 | cluster.routing.allocation.disk.watermark.low: 93% 162 | cluster.routing.allocation.disk.watermark.high: 95% 163 | http.cors.enabled : "true" 164 | http.cors.allow-origin : "*" 165 | http.cors.allow-methods : OPTIONS, HEAD, GET, POST, PUT, DELETE 166 | http.cors.allow-headers : Authorization,X-Requested-With,X-Auth-Token,Content-Type, Content-Length 167 | depends_on: 168 | - es02 169 | - es03 
170 | ulimits: 171 | memlock: 172 | soft: 262144 173 | hard: 500000 174 | 175 | es02: 176 | restart: always 177 | image: docker.elastic.co/elasticsearch/elasticsearch:7.12.0 178 | volumes: 179 | - "es02:/usr/share/elasticsearch/data" 180 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 181 | environment: 182 | ES_JAVA_OPTS: "-Xmx512m -Xms512m" 183 | ELASTIC_PASSWORD: ${ELASTICSEARCH_PWD} 184 | node.name: es02 185 | cluster.name: es-docker-cluster 186 | discovery.seed_hosts: es01,es03 187 | cluster.initial_master_nodes: es01,es02,es03 188 | xpack.license.self_generated.type: basic 189 | xpack.monitoring.collection.enabled: "true" 190 | xpack.security.enabled: "true" 191 | xpack.security.http.ssl.enabled: "true" 192 | xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certificates/es02/es02.key 193 | xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 194 | xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certificates/es02/es02.crt 195 | xpack.security.transport.ssl.enabled: "true" 196 | xpack.security.transport.ssl.verification_mode: certificate 197 | xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 198 | xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certificates/es02/es02.crt 199 | xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certificates/es02/es02.key 200 | cluster.routing.allocation.disk.threshold_enabled: "true" 201 | cluster.routing.allocation.disk.watermark.low: 93% 202 | cluster.routing.allocation.disk.watermark.high: 95% 203 | ulimits: 204 | memlock: 205 | soft: 262144 206 | hard: 500000 207 | 208 | es03: 209 | restart: always 210 | image: docker.elastic.co/elasticsearch/elasticsearch:7.12.0 211 | volumes: 212 | - "es03:/usr/share/elasticsearch/data" 213 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 214 | environment: 215 | ES_JAVA_OPTS: "-Xmx512m -Xms512m" 216 | ELASTIC_PASSWORD: ${ELASTICSEARCH_PWD} 217 | node.name: es03 218 | cluster.name: es-docker-cluster 219 | discovery.seed_hosts: es01,es02 220 | cluster.initial_master_nodes: es01,es02,es03 221 | xpack.license.self_generated.type: basic 222 | xpack.monitoring.collection.enabled: "true" 223 | xpack.security.enabled: "true" 224 | xpack.security.http.ssl.enabled: "true" 225 | xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certificates/es03/es03.key 226 | xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 227 | xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certificates/es03/es03.crt 228 | xpack.security.transport.ssl.enabled: "true" 229 | xpack.security.transport.ssl.verification_mode: certificate 230 | xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 231 | xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certificates/es03/es03.crt 232 | xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certificates/es03/es03.key 233 | cluster.routing.allocation.disk.threshold_enabled: "true" 234 | cluster.routing.allocation.disk.watermark.low: 93% 235 | cluster.routing.allocation.disk.watermark.high: 95% 236 | ulimits: 237 | memlock: 238 | soft: 262144 239 | hard: 500000 240 | 241 | kibana: 242 | image: docker.elastic.co/kibana/kibana:7.12.0 243 | restart: always 244 | volumes: 245 | - type: bind 246 | source: ./kibana.yml 247 | target: 
/usr/share/kibana/config/kibana.yml 248 | read_only: true 249 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 250 | ports: 251 | - "5601:5601" 252 | depends_on: 253 | - es01 254 | 255 | # source : https://docs.min.io/docs/deploy-minio-on-docker-compose.html 256 | minio: 257 | restart: always 258 | image: minio/minio:RELEASE.2021-04-06T23-11-00Z-24-g409125240 259 | command: server /data 260 | ports: 261 | - 9000:9000 262 | volumes: 263 | - minio:/data 264 | environment: 265 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 266 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 267 | 268 | 269 | volumes: 270 | es01: 271 | es02: 272 | es03: 273 | minio: 274 | prefect_postgres: 275 | 276 | networks: 277 | prefect-server: 278 | name: prefect-server 279 | -------------------------------------------------------------------------------- /flow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | RUN apt update 4 | RUN pip install --upgrade pip 5 | 6 | COPY ./requirements.txt /requirements.txt 7 | RUN pip install -r /requirements.txt -------------------------------------------------------------------------------- /flow/config.toml: -------------------------------------------------------------------------------- 1 | # debug mode 2 | debug = true 3 | 4 | # base configuration directory (typically you won't change this!) 5 | home_dir = "~/.prefect" 6 | 7 | backend = "server" 8 | 9 | [server] 10 | host = "http://172.17.0.1" 11 | port = "4200" 12 | host_port = "4200" 13 | endpoint = "${server.host}:${server.port}" 14 | 15 | [s3] 16 | endpoint = "172.17.0.1:9000" 17 | key = "minio" 18 | secret = "minio123" -------------------------------------------------------------------------------- /flow/requirements.txt: -------------------------------------------------------------------------------- 1 | prefect==0.14.16 2 | minio==7.0.3 3 | clevercsv==0.6.7 4 | tqdm==4.60.0 5 | elasticsearch==7.12.0 6 | geopy==2.1.0 7 | iso3166==1.0.1 8 | dateparser==1.0.0 9 | GoogleNews==1.5.7 10 | snscrape==0.3.4 11 | pandas==1.2.4 12 | -------------------------------------------------------------------------------- /flow/scripts/crawl_google_news.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | from typing import Iterable 4 | import uuid 5 | import prefect 6 | from elasticsearch import Elasticsearch, helpers 7 | from prefect import Flow, Task, Client 8 | from datetime import timedelta, datetime 9 | 10 | from prefect.schedules import IntervalSchedule 11 | from GoogleNews import GoogleNews 12 | 13 | from crawl_mapping import mapping 14 | 15 | 16 | project_name = "pandemic-knowledge-crawl-googlenews" 17 | index_name = "news_googlenews" 18 | 19 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 20 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 21 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 22 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 23 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 24 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 25 | 26 | logger = prefect.context.get("logger") 27 | 28 | schedule = IntervalSchedule( 29 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 30 | ) 31 | 32 | 33 | def get_es_instance(): 34 | es_inst = Elasticsearch( 35 | [ELASTIC_ENDPOINT], 36 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 37 | scheme=ELASTIC_SCHEME, 38 | port=ELASTIC_PORT, 39 | verify_certs=False, 40 | ) 41 | return es_inst 42 | 43 | 44 | def 
inject_rows_to_es(rows, index_name): 45 | es_inst = get_es_instance() 46 | 47 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 48 | 49 | actions = [ 50 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 51 | ] 52 | 53 | helpers.bulk(es_inst, actions) 54 | 55 | 56 | def format_new(new: dict, lang: str) -> dict: 57 | """Formatting a single Google News new for elasticsearch injection""" 58 | if len(new): 59 | return { 60 | "title": str(new["title"]), 61 | "desc": str(new["desc"]), 62 | "img": str(new["img"]), 63 | "link": "https://" + str(new["link"]), 64 | "source.crawler": "Google News", 65 | "source.website": str(new["site"]), 66 | "source.url": str(new["link"]), 67 | "date": new["datetime"], 68 | "lang": lang, 69 | } 70 | return None 71 | 72 | 73 | def get_news(googlenews: GoogleNews, lang: str, search_tag: str) -> Iterable: 74 | googlenews.get_news(search_tag) 75 | news = googlenews.results(sort=True) 76 | if news: 77 | for new in news: 78 | fmt_new = format_new(new, lang) 79 | if fmt_new: 80 | yield fmt_new 81 | return [] 82 | 83 | 84 | class GetNews(Task): 85 | def run(self, index_name): 86 | googlenews = GoogleNews( 87 | period="24h", # TODO(): Improve using googlenews.set_time_range('02/01/2020','02/28/2020') 88 | encode="utf-8", 89 | ) 90 | news_to_inject = [] 91 | langs = ["fr", "en"] 92 | search_tags = ["COVID", "CORONA"] 93 | for lang in langs: 94 | for search_tag in search_tags: 95 | logger.info( 96 | f"Crawling GoogleNews for '{lang}' lang and {search_tag} search tag..." 97 | ) 98 | googlenews.set_lang(lang) 99 | try: 100 | news = list(get_news(googlenews, lang, search_tag)) 101 | news_to_inject += news if len(news) else [] 102 | logger.info(f"Found {len(news)} news.") 103 | except Exception as e: 104 | logger.error(e) 105 | googlenews.clear() 106 | if len(news_to_inject) > 0: 107 | inject_rows_to_es(news_to_inject, index_name) 108 | news_to_inject = [] 109 | 110 | 111 | class GenerateEsMapping(Task): 112 | def __init__(self, index_name, **kwargs): 113 | self.index_name = index_name 114 | super().__init__(**kwargs) 115 | 116 | def run(self): 117 | index_name = self.index_name 118 | es_inst = get_es_instance() 119 | 120 | logger.info("Generating mapping for index {}".format(index_name)) 121 | 122 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 123 | 124 | response = es_inst.indices.create(index=index_name, body=mapping, ignore=400) 125 | 126 | if "acknowledged" in response: 127 | if response["acknowledged"] == True: 128 | logger.info( 129 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 130 | ) 131 | elif "error" in response: 132 | logger.error(response["error"]["root_cause"]) 133 | logger.error("Error type: {}".format(response["error"]["type"])) 134 | raise Exception("Unable to create index mapping") 135 | 136 | 137 | with Flow("Crawl news and insert", schedule=schedule) as flow: 138 | flow.set_dependencies( 139 | upstream_tasks=[GenerateEsMapping(index_name)], 140 | task=GetNews(), 141 | keyword_tasks=dict(index_name=index_name), 142 | ) 143 | 144 | if __name__ == "__main__": 145 | try: 146 | client = Client() 147 | client.create_project(project_name=project_name) 148 | except prefect.utilities.exceptions.ClientError as e: 149 | logger.info("Project already exists") 150 | 151 | flow.register( 152 | project_name=project_name, 153 | labels=["development"], 154 | add_default_labels=False, 155 | ) 156 | -------------------------------------------------------------------------------- 
/flow/scripts/crawl_mapping.py: -------------------------------------------------------------------------------- 1 | mapping = { 2 | "mappings": { 3 | "properties": { 4 | "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 5 | "desc": {"type": "text"}, 6 | "date": { 7 | "type": "date", 8 | "format": "strict_date_optional_time||epoch_millis", 9 | }, 10 | "link": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 11 | "img": {"type": "text"}, 12 | "source": { 13 | "properties": { 14 | "crawler": {"type": "text"}, 15 | "website": {"type": "text"}, 16 | "author": {"type": "text"}, 17 | "url": {"type": "text"}, 18 | "tweet": {"properties": {"id": {"type": "text"}}}, 19 | } 20 | }, 21 | "lang": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /flow/scripts/crawl_tweets.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | import uuid 4 | import prefect 5 | from elasticsearch import Elasticsearch, helpers 6 | from prefect import Flow, Task, Client 7 | from datetime import datetime 8 | from datetime import timedelta 9 | 10 | from prefect.schedules import IntervalSchedule 11 | import snscrape.modules.twitter as sntwitter 12 | 13 | from crawl_mapping import mapping 14 | 15 | project_name = "pandemic-knowledge-crawl-tweets" 16 | index_name = "news_tweets" 17 | 18 | lang = "en" 19 | tweet_limit = 1000 20 | 21 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 22 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 23 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 24 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 25 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 26 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 27 | 28 | logger = prefect.context.get("logger") 29 | 30 | schedule = IntervalSchedule( 31 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 32 | ) 33 | 34 | 35 | def get_es_instance(): 36 | es_inst = Elasticsearch( 37 | [ELASTIC_ENDPOINT], 38 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 39 | scheme=ELASTIC_SCHEME, 40 | port=ELASTIC_PORT, 41 | verify_certs=False, 42 | ) 43 | return es_inst 44 | 45 | 46 | def inject_rows_to_es(rows, index_name): 47 | es_inst = get_es_instance() 48 | 49 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 50 | 51 | actions = [ 52 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 53 | ] 54 | 55 | helpers.bulk(es_inst, actions) 56 | 57 | 58 | class GetTweets(Task): 59 | def run(self, index_name): 60 | tweets_from = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") 61 | to_inject = [] 62 | tweets = sntwitter.TwitterSearchScraper( 63 | f"covid since:{tweets_from} lang:{lang}" 64 | ).get_items() 65 | for i, tweet in enumerate(tweets): 66 | if i > tweet_limit: 67 | break 68 | if i % 100 == 0: 69 | inject_rows_to_es(to_inject, index_name) 70 | to_inject = [] 71 | to_inject.append( 72 | { 73 | "title": f"Tweet from {tweet.username} the {tweet.date}", 74 | "desc": tweet.content, 75 | "date": tweet.date, 76 | "link": tweet.url, 77 | "source.crawler": "twitter", 78 | "source.website": "https://twitter.com", 79 | "source.author": tweet.username, 80 | "source.url": tweet.url, 81 | "source.tweet.id": tweet.id, 82 | "lang": lang 83 | } 84 | ) 85 | if len(to_inject): 86 | inject_rows_to_es(to_inject, index_name) 87 | 88 | 89 | class GenerateEsMapping(Task): 90 | def 
__init__(self, index_name, **kwargs): 91 | self.index_name = index_name 92 | super().__init__(**kwargs) 93 | 94 | def run(self): 95 | index_name = self.index_name 96 | es_inst = get_es_instance() 97 | 98 | logger.info("Generating mapping for index {}".format(index_name)) 99 | 100 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 101 | 102 | response = es_inst.indices.create(index=index_name, body=mapping, ignore=400) 103 | 104 | if "acknowledged" in response: 105 | if response["acknowledged"] == True: 106 | logger.info( 107 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 108 | ) 109 | elif "error" in response: 110 | logger.error(response["error"]["root_cause"]) 111 | logger.error("Error type: {}".format(response["error"]["type"])) 112 | raise Exception("Unable to create index mapping") 113 | 114 | 115 | with Flow("Crawl tweets and insert", schedule=schedule) as flow: 116 | flow.set_dependencies( 117 | upstream_tasks=[GenerateEsMapping(index_name)], 118 | task=GetTweets(), 119 | keyword_tasks=dict(index_name=index_name), 120 | ) 121 | 122 | if __name__ == "__main__": 123 | try: 124 | client = Client() 125 | client.create_project(project_name=project_name) 126 | except prefect.utilities.exceptions.ClientError as e: 127 | logger.info("Project already exists") 128 | 129 | flow.register( 130 | project_name=project_name, 131 | labels=["development"], 132 | add_default_labels=False, 133 | ) 134 | -------------------------------------------------------------------------------- /flow/scripts/insert_france.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dateparser 3 | import uuid 4 | import requests 5 | import prefect 6 | import clevercsv 7 | import traceback 8 | from tqdm import tqdm 9 | from prefect import Flow, Task, Client, task 10 | from datetime import timedelta, datetime 11 | from prefect.schedules import IntervalSchedule 12 | from elasticsearch import Elasticsearch, helpers 13 | from geopy.geocoders import Nominatim 14 | from requests.adapters import HTTPAdapter 15 | from requests.packages.urllib3.util.retry import Retry 16 | 17 | from mapping import mapping 18 | 19 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 20 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 21 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 22 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 23 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 24 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 25 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 26 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 27 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 28 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 29 | 30 | csv_endpoint = "https://raw.githubusercontent.com/opencovid19-fr/data/master/dist/chiffres-cles.csv" 31 | index_name = "contamination_opencovid19_fr" 32 | project_name = f"pandemic-knowledge-opencovid19-fr" 33 | flow_name = project_name 34 | 35 | logger = prefect.context.get("logger") 36 | 37 | columns_allowed = { 38 | "date": ["date"], 39 | "location": ["maille_nom"], 40 | "location_name": ["maille_nom"], 41 | "confirmed": ["cas_confirmes"], 42 | "deaths": ["deces"], 43 | "recovered": ["gueris"], 44 | "vaccinated": [], 45 | "tested": ["depistes"], 46 | } 47 | 48 | extra_locations = {"EL": "GR"} 49 | 50 | locations_cache = {"World": None} 51 | 52 | 53 | def get_es_instance(): 54 | es_inst = Elasticsearch( 55 | [ELASTIC_ENDPOINT], 56 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 57 | 
scheme=ELASTIC_SCHEME, 58 | port=ELASTIC_PORT, 59 | verify_certs=False, 60 | ) 61 | return es_inst 62 | 63 | 64 | def format_date(date): 65 | if not date: 66 | return None 67 | try: 68 | return dateparser.parse(date) 69 | except Exception as e: 70 | logger.error(e) 71 | return None 72 | 73 | 74 | def format_location(lookup_table, location_name): 75 | if not location_name: 76 | return None 77 | if location_name in locations_cache: 78 | return locations_cache[location_name] 79 | if location_name in lookup_table: 80 | return lookup_table[location_name] 81 | return None 82 | 83 | 84 | def pick_one_of_elements(haystack: list, needles: list): 85 | for needle in needles: 86 | if needle in haystack: 87 | return needle 88 | return None 89 | 90 | 91 | def pick_nonempty_cell(row, headers, potential_keys): 92 | for potential_key in potential_keys: 93 | if potential_key in headers and row[headers[potential_key]]: 94 | return row[headers[potential_key]] 95 | return None 96 | 97 | 98 | def format_row(lookup_table, row, headers, filename): 99 | date_start = date_end = format_date( 100 | pick_nonempty_cell(row, headers, columns_allowed["date"]) 101 | ) 102 | location = format_location( 103 | lookup_table, pick_nonempty_cell(row, headers, columns_allowed["location"]) 104 | ) 105 | location_name = pick_nonempty_cell(row, headers, columns_allowed["location_name"]) 106 | nb_confirmed = pick_nonempty_cell(row, headers, columns_allowed["confirmed"]) 107 | nb_deaths = pick_nonempty_cell(row, headers, columns_allowed["deaths"]) 108 | nb_recovered = pick_nonempty_cell(row, headers, columns_allowed["recovered"]) 109 | nb_vaccinated = pick_nonempty_cell(row, headers, columns_allowed["vaccinated"]) 110 | nb_tested = pick_nonempty_cell(row, headers, columns_allowed["tested"]) 111 | if date_start != None: 112 | return { 113 | "date_start": date_start, 114 | "date_end": date_end, 115 | "location": location[0] if location else None, 116 | "location_name": location_name, 117 | "confirmed": int(float(nb_confirmed)) if nb_confirmed else 0, 118 | "deaths": int(float(nb_deaths)) if nb_deaths else 0, 119 | "recovered": int(float(nb_recovered)) if nb_recovered else 0, 120 | "vaccinated": int(float(nb_vaccinated)) if nb_vaccinated else 0, 121 | "tested": int(float(nb_tested)) if nb_tested else 0, 122 | "filename": filename, 123 | "iso_code2": location[1] if location else None, 124 | "iso_region2": str(row[2]).replace("DEP", "FR"), 125 | } 126 | logger.warning(f"format_row(): Invalid row : {row}") 127 | return None 128 | 129 | 130 | def inject_rows_to_es(rows, index_name): 131 | es_inst = get_es_instance() 132 | 133 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 134 | 135 | actions = [ 136 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 137 | ] 138 | helpers.bulk(es_inst, actions) 139 | 140 | 141 | def parse_file(lookup_table, file_path): 142 | with open(file_path, "r", newline="") as fp: 143 | char_read = 10000 if os.path.getsize(file_path) > 10000 else None 144 | 145 | try: 146 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 147 | except Exception as e: 148 | logger.error(e) 149 | return [] 150 | 151 | fp.seek(0) 152 | reader = clevercsv.reader(fp, dialect) 153 | headers_list = next(reader) 154 | headers = {} 155 | for i, header in enumerate(headers_list): 156 | headers[header] = i 157 | for row in tqdm(reader, unit="entry"): 158 | if row[1] != "departement": # multiple granularities 159 | continue 160 | yield format_row(lookup_table, row, headers, 
file_path) 161 | return [] 162 | 163 | 164 | def process_file(lookup_table, index_name, file_path): 165 | to_inject = [] 166 | logger.info(f"process_file(): Processing {file_path}...") 167 | for row in parse_file(lookup_table, file_path): 168 | if row is not None: 169 | to_inject.append(row) 170 | if len(to_inject) >= MAX_ES_ROW_INJECT: 171 | inject_rows_to_es(to_inject, index_name) 172 | to_inject = [] 173 | else: 174 | logger.warning("process_file(): Invalid row") 175 | if len(to_inject) > 0: 176 | inject_rows_to_es(to_inject, index_name) 177 | 178 | 179 | class ParseFiles(Task): 180 | def run(self, lookup_table, index_name, http_csv_uris: list): 181 | for file_uri in tqdm(http_csv_uris): 182 | logger.info(f"Processing file {file_uri}...") 183 | file_path = f"/tmp/{uuid.uuid4()}" 184 | session = requests.Session() 185 | retry = Retry(connect=3, backoff_factor=0.5) 186 | adapter = HTTPAdapter(max_retries=retry) 187 | session.mount("http://", adapter) 188 | session.mount("https://", adapter) 189 | r = session.get(file_uri, allow_redirects=True) 190 | with open(file_path, "wb") as f: 191 | f.write(r.content) 192 | process_file(lookup_table, index_name, file_path) 193 | 194 | 195 | class GenerateEsMapping(Task): 196 | def run(self, index_name) -> str: 197 | """ 198 | Returns: 199 | str: index_name 200 | """ 201 | es_inst = get_es_instance() 202 | logger.info("Generating mapping for index {}".format(index_name)) 203 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 204 | response = es_inst.indices.create( 205 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 206 | ) 207 | if "acknowledged" in response: 208 | if response["acknowledged"] == True: 209 | logger.info( 210 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 211 | ) 212 | elif "error" in response: 213 | logger.error(response["error"]["root_cause"]) 214 | logger.error("Error type: {}".format(response["error"]["type"])) 215 | raise Exception("Unable to create index mapping") 216 | return index_name 217 | 218 | 219 | def read_lookup_table(lookup_file_path: str): 220 | logger.info("Loading lookup table...") 221 | lookup = {} 222 | with open(lookup_file_path, "r", newline="") as fp: 223 | char_read = 10000 if os.path.getsize(lookup_file_path) > 10000 else None 224 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 225 | fp.seek(0) 226 | reader = clevercsv.reader(fp, dialect) 227 | next(reader) 228 | for row in tqdm(reader, unit="entry"): 229 | for location in [ 230 | row[6], # Province_State 231 | row[7], # Country_Region 232 | row[10], # Combined_Key 233 | ]: 234 | if location and location not in lookup: 235 | if row[8] and row[9]: # Lat, Long 236 | lookup[location] = ( 237 | {"lat": float(row[8]), "lon": float(row[9])}, 238 | row[1], 239 | ) 240 | logger.info(f"Found {len(lookup)} locations.") 241 | return lookup 242 | 243 | 244 | lookup_table = read_lookup_table("/usr/app/UID_ISO_FIPS_LookUp_Table.csv") 245 | 246 | schedule = IntervalSchedule( 247 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 248 | ) 249 | with Flow(flow_name, schedule=schedule) as flow: 250 | es_mapping_task = GenerateEsMapping() 251 | index_name = es_mapping_task(index_name) 252 | 253 | parse_files_task = ParseFiles() 254 | parse_files_task( 255 | lookup_table=lookup_table, 256 | index_name=index_name, 257 | http_csv_uris=[csv_endpoint], 258 | ) 259 | 260 | if __name__ == "__main__": 261 | 262 | try: 263 | client = Client() 264 | 
client.create_project(project_name=project_name) 265 | except prefect.utilities.exceptions.ClientError as e: 266 | logger.info("Project already exists") 267 | 268 | flow.register( 269 | project_name=project_name, labels=["development"], add_default_labels=False 270 | ) 271 | -------------------------------------------------------------------------------- /flow/scripts/insert_france_virtests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dateparser 3 | import uuid 4 | import requests 5 | import prefect 6 | import clevercsv 7 | from tqdm import tqdm 8 | from prefect import Flow, Task, Client, task 9 | from datetime import timedelta, datetime 10 | from prefect.schedules import IntervalSchedule 11 | from elasticsearch import Elasticsearch, helpers 12 | from requests.adapters import HTTPAdapter 13 | from requests.packages.urllib3.util.retry import Retry 14 | 15 | from mapping import mapping 16 | 17 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 18 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 19 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 20 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 21 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 22 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 23 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 24 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 25 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 26 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 27 | 28 | csv_endpoint = "https://www.data.gouv.fr/en/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675" 29 | project_name = f"pandemic-knowledge-santepublic-tests" 30 | index_name = "contamination_santepublique_vir_tests_fr" 31 | flow_name = project_name 32 | 33 | logger = prefect.context.get("logger") 34 | 35 | columns_allowed = { 36 | "date": ["jour"], 37 | "location": ["dep"], 38 | "location_name": ["dep"], 39 | "confirmed": ["P"], 40 | "deaths": [], 41 | "recovered": [], 42 | "vaccinated": [], 43 | "tested": ["T"], 44 | } 45 | 46 | extra_locations = {"EL": "GR"} 47 | 48 | locations_cache = {"World": None} 49 | 50 | 51 | def get_es_instance(): 52 | es_inst = Elasticsearch( 53 | [ELASTIC_ENDPOINT], 54 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 55 | scheme=ELASTIC_SCHEME, 56 | port=ELASTIC_PORT, 57 | verify_certs=False, 58 | ) 59 | return es_inst 60 | 61 | 62 | def format_date(date): 63 | if not date: 64 | return None 65 | try: 66 | return dateparser.parse(date) 67 | except Exception as e: 68 | logger.error(e) 69 | return None 70 | 71 | 72 | def format_location(lookup_table, location_name): 73 | if not location_name: 74 | return None 75 | if location_name in locations_cache: 76 | return locations_cache[location_name] 77 | if location_name in lookup_table: 78 | return lookup_table[location_name] 79 | return None 80 | 81 | 82 | def pick_one_of_elements(haystack: list, needles: list): 83 | for needle in needles: 84 | if needle in haystack: 85 | return needle 86 | return None 87 | 88 | 89 | def pick_nonempty_cell(row, headers, potential_keys): 90 | for potential_key in potential_keys: 91 | if potential_key in headers and row[headers[potential_key]]: 92 | return row[headers[potential_key]] 93 | return None 94 | 95 | 96 | def format_row(lookup_table, row, headers, filename): 97 | date_start = date_end = format_date( 98 | pick_nonempty_cell(row, headers, columns_allowed["date"]) 99 | ) 100 | location = format_location( 101 | lookup_table, pick_nonempty_cell(row, headers, columns_allowed["location"]) 
102 | ) 103 | location_name = pick_nonempty_cell(row, headers, columns_allowed["location_name"]) 104 | nb_confirmed = pick_nonempty_cell(row, headers, columns_allowed["confirmed"]) 105 | nb_deaths = pick_nonempty_cell(row, headers, columns_allowed["deaths"]) 106 | nb_recovered = pick_nonempty_cell(row, headers, columns_allowed["recovered"]) 107 | nb_vaccinated = pick_nonempty_cell(row, headers, columns_allowed["vaccinated"]) 108 | nb_tested = pick_nonempty_cell(row, headers, columns_allowed["tested"]) 109 | if date_start != None: 110 | return { 111 | "date_start": date_start, 112 | "date_end": date_end, 113 | "location": location[0] if location else None, 114 | "location_name": location_name, 115 | "confirmed": int(float(nb_confirmed)) if nb_confirmed else 0, 116 | "deaths": int(float(nb_deaths)) if nb_deaths else 0, 117 | "recovered": int(float(nb_recovered)) if nb_recovered else 0, 118 | "vaccinated": int(float(nb_vaccinated)) if nb_vaccinated else 0, 119 | "tested": int(float(nb_tested)) if nb_tested else 0, 120 | "filename": filename, 121 | "iso_code2": location[1] if location else None, 122 | "iso_region2": f"FR-{location_name}", 123 | } 124 | logger.warning(f"format_row(): Invalid row : {row}") 125 | return None 126 | 127 | 128 | def inject_rows_to_es(rows, index_name): 129 | es_inst = get_es_instance() 130 | 131 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 132 | 133 | actions = [ 134 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 135 | ] 136 | helpers.bulk(es_inst, actions) 137 | 138 | 139 | def parse_file(lookup_table, file_path): 140 | with open(file_path, "r", newline="") as fp: 141 | char_read = 10000 if os.path.getsize(file_path) > 10000 else None 142 | 143 | try: 144 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 145 | except Exception as e: 146 | logger.error(e) 147 | return [] 148 | 149 | fp.seek(0) 150 | reader = clevercsv.reader(fp, dialect) 151 | headers_list = next(reader) 152 | headers = {} 153 | for i, header in enumerate(headers_list): 154 | headers[header] = i 155 | for row in tqdm(reader, unit="entry"): 156 | yield format_row(lookup_table, row, headers, file_path) 157 | return [] 158 | 159 | 160 | def process_file(lookup_table, index_name, file_path): 161 | to_inject = [] 162 | logger.info(f"process_file(): Processing {file_path}...") 163 | for row in parse_file(lookup_table, file_path): 164 | if row is not None: 165 | to_inject.append(row) 166 | if len(to_inject) >= MAX_ES_ROW_INJECT: 167 | inject_rows_to_es(to_inject, index_name) 168 | to_inject = [] 169 | else: 170 | logger.warning("process_file(): Invalid row") 171 | if len(to_inject) > 0: 172 | inject_rows_to_es(to_inject, index_name) 173 | 174 | 175 | class ParseFiles(Task): 176 | def run(self, lookup_table, index_name, http_csv_uris: list): 177 | for file_uri in tqdm(http_csv_uris): 178 | logger.info(f"Processing file {file_uri}...") 179 | file_path = f"/tmp/{uuid.uuid4()}" 180 | session = requests.Session() 181 | retry = Retry(connect=3, backoff_factor=0.5) 182 | adapter = HTTPAdapter(max_retries=retry) 183 | session.mount("http://", adapter) 184 | session.mount("https://", adapter) 185 | r = session.get(file_uri, allow_redirects=True) 186 | with open(file_path, "wb") as f: 187 | f.write(r.content) 188 | process_file(lookup_table, index_name, file_path) 189 | 190 | 191 | class GenerateEsMapping(Task): 192 | def run(self, index_name) -> str: 193 | """ 194 | Returns: 195 | str: index_name 196 | """ 197 | es_inst = 
get_es_instance() 198 | logger.info("Generating mapping for index {}".format(index_name)) 199 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 200 | response = es_inst.indices.create( 201 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 202 | ) 203 | if "acknowledged" in response: 204 | if response["acknowledged"] == True: 205 | logger.info( 206 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 207 | ) 208 | elif "error" in response: 209 | logger.error(response["error"]["root_cause"]) 210 | logger.error("Error type: {}".format(response["error"]["type"])) 211 | raise Exception("Unable to create index mapping") 212 | return index_name 213 | 214 | 215 | def read_lookup_table(lookup_file_path: str): 216 | logger.info("Loading lookup table...") 217 | lookup = {} 218 | with open(lookup_file_path, "r", newline="") as fp: 219 | char_read = 10000 if os.path.getsize(lookup_file_path) > 10000 else None 220 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 221 | fp.seek(0) 222 | reader = clevercsv.reader(fp, dialect) 223 | next(reader) 224 | for row in tqdm(reader, unit="entry"): 225 | for location in [ 226 | row[6], # Province_State 227 | row[7], # Country_Region 228 | row[10], # Combined_Key 229 | ]: 230 | if location and location not in lookup: 231 | if row[8] and row[9]: # Lat, Long 232 | lookup[location] = ( 233 | {"lat": float(row[8]), "lon": float(row[9])}, 234 | row[1], 235 | ) 236 | logger.info(f"Found {len(lookup)} locations.") 237 | return lookup 238 | 239 | 240 | lookup_table = read_lookup_table("/usr/app/UID_ISO_FIPS_LookUp_Table.csv") 241 | 242 | schedule = IntervalSchedule( 243 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 244 | ) 245 | with Flow(flow_name, schedule=schedule) as flow: 246 | es_mapping_task = GenerateEsMapping() 247 | index_name = es_mapping_task(index_name) 248 | 249 | parse_files_task = ParseFiles() 250 | parse_files_task( 251 | lookup_table=lookup_table, 252 | index_name=index_name, 253 | http_csv_uris=[csv_endpoint], 254 | ) 255 | 256 | if __name__ == "__main__": 257 | 258 | try: 259 | client = Client() 260 | client.create_project(project_name=project_name) 261 | except prefect.utilities.exceptions.ClientError as e: 262 | logger.info("Project already exists") 263 | 264 | flow.register( 265 | project_name=project_name, labels=["development"], add_default_labels=False 266 | ) 267 | -------------------------------------------------------------------------------- /flow/scripts/insert_owid.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dateparser 3 | import uuid 4 | import prefect 5 | import clevercsv 6 | import traceback 7 | from tqdm import tqdm 8 | from prefect import Flow, Task, Client, task 9 | from datetime import timedelta, datetime 10 | from prefect.schedules import IntervalSchedule 11 | from minio import Minio 12 | from elasticsearch import Elasticsearch, helpers 13 | from geopy.geocoders import Nominatim 14 | 15 | from mapping import mapping 16 | 17 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 18 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 19 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 20 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 21 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 22 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 23 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 24 | ELASTIC_USER = 
os.environ.get("ELASTIC_USER") 25 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 26 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 27 | 28 | bucket_name = "contamination-owid" 29 | project_name = f"pandemic-knowledge-{bucket_name}" 30 | index_name = f"{bucket_name.replace('-', '_')}" 31 | flow_name = project_name 32 | 33 | logger = prefect.context.get("logger") 34 | 35 | columns_allowed = { 36 | "date": ["date"], 37 | "location": ["location"], 38 | "location_name": ["location"], 39 | "confirmed": ["new_cases"], 40 | "deaths": ["new_deaths"], 41 | "recovered": [], 42 | "vaccinated": ["new_vaccinations"], 43 | "tested": ["new_tests"], 44 | } 45 | 46 | extra_locations = {"EL": "GR"} 47 | 48 | locations_cache = {"World": None} 49 | 50 | 51 | def get_es_instance(): 52 | es_inst = Elasticsearch( 53 | [ELASTIC_ENDPOINT], 54 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 55 | scheme=ELASTIC_SCHEME, 56 | port=ELASTIC_PORT, 57 | verify_certs=False, 58 | ) 59 | return es_inst 60 | 61 | 62 | def format_date(date): 63 | if not date: 64 | return None 65 | try: 66 | return dateparser.parse(date) 67 | except Exception as e: 68 | logger.error(e) 69 | return None 70 | 71 | 72 | def format_location(lookup_table, location_name): 73 | if not location_name: 74 | return None 75 | if location_name in locations_cache: 76 | return locations_cache[location_name] 77 | if location_name in lookup_table: 78 | return lookup_table[location_name] 79 | 80 | logger.info(f"Guessing geolocation for {location_name}") 81 | geolocator = Nominatim(user_agent="pandemic-knowledge") 82 | location = geolocator.geocode( 83 | extra_locations[location_name] 84 | if location_name in extra_locations 85 | else location_name, 86 | addressdetails=True, 87 | ) 88 | 89 | if location and location.raw: 90 | logger.info(f"Found {location.latitude}, {location.longitude}") 91 | if "address" in location.raw and "country_code" in location.raw["address"]: 92 | locations_cache[location_name] = ( 93 | {"lat": location.latitude, "lon": location.longitude}, 94 | location.raw["address"]["country_code"].upper(), 95 | ) 96 | return locations_cache[location_name] 97 | locations_cache[location_name] = None 98 | logger.error( 99 | f"Failed to locate (no country code and/or coordinates) for {location}" 100 | ) 101 | return None 102 | 103 | 104 | def pick_one_of_elements(haystack: list, needles: list): 105 | for needle in needles: 106 | if needle in haystack: 107 | return needle 108 | return None 109 | 110 | 111 | def pick_nonempty_cell(row, headers, potential_keys): 112 | for potential_key in potential_keys: 113 | if potential_key in headers and row[headers[potential_key]]: 114 | return row[headers[potential_key]] 115 | return None 116 | 117 | 118 | def format_row(lookup_table, row, headers, filename): 119 | date_start = date_end = format_date( 120 | pick_nonempty_cell(row, headers, columns_allowed["date"]) 121 | ) 122 | location = format_location( 123 | lookup_table, pick_nonempty_cell(row, headers, columns_allowed["location"]) 124 | ) 125 | location_name = pick_nonempty_cell(row, headers, columns_allowed["location_name"]) 126 | nb_confirmed = pick_nonempty_cell(row, headers, columns_allowed["confirmed"]) 127 | nb_deaths = pick_nonempty_cell(row, headers, columns_allowed["deaths"]) 128 | nb_recovered = pick_nonempty_cell(row, headers, columns_allowed["recovered"]) 129 | nb_vaccinated = pick_nonempty_cell(row, headers, columns_allowed["vaccinated"]) 130 | nb_tested = pick_nonempty_cell(row, headers, columns_allowed["tested"]) 131 | if location != None and 
date_start != None and nb_confirmed != None: 132 | return { 133 | "date_start": date_start, 134 | "date_end": date_end, 135 | "location": location[0], 136 | "location_name": location_name, 137 | "confirmed": int(float(nb_confirmed)) if nb_confirmed else 0, 138 | "deaths": int(float(nb_deaths)) if nb_deaths else 0, 139 | "recovered": int(float(nb_recovered)) if nb_recovered else 0, 140 | "vaccinated": int(float(nb_vaccinated)) if nb_vaccinated else 0, 141 | "tested": int(float(nb_tested)) if nb_tested else 0, 142 | "filename": filename, 143 | "iso_code2": location[1] if len(location) else None, 144 | } 145 | return None 146 | 147 | 148 | def inject_rows_to_es(rows, index_name): 149 | es_inst = get_es_instance() 150 | 151 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 152 | 153 | actions = [ 154 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 155 | ] 156 | helpers.bulk(es_inst, actions) 157 | 158 | 159 | def parse_file(lookup_table, minio_client, bucket_name, object_name): 160 | csv_file_path = "/tmp/" + str(uuid.uuid4()) 161 | minio_client.fget_object(bucket_name, object_name, csv_file_path) 162 | with open(csv_file_path, "r", newline="") as fp: 163 | char_read = 10000 if os.path.getsize(csv_file_path) > 10000 else None 164 | 165 | try: 166 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 167 | except Exception as e: 168 | logger.error(e) 169 | return [] 170 | 171 | fp.seek(0) 172 | reader = clevercsv.reader(fp, dialect) 173 | headers_list = next(reader) 174 | headers = {} 175 | for i, header in enumerate(headers_list): 176 | headers[header] = i 177 | for row in tqdm(reader, unit="entry"): 178 | yield format_row(lookup_table, row, headers, object_name) 179 | return [] 180 | 181 | 182 | def process_file(lookup_table, index_name, bucket_name, object_name): 183 | minio_client = Minio( 184 | MINIO_ENDPOINT, 185 | access_key=MINIO_ACCESS_KEY, 186 | secret_key=MINIO_SECRET_KEY, 187 | secure=MINIO_SCHEME == "https", 188 | ) 189 | to_inject = [] 190 | logger.info(f"Processing {object_name}...") 191 | for row in parse_file(lookup_table, minio_client, bucket_name, object_name): 192 | if row is not None: 193 | to_inject.append(row) 194 | if len(to_inject) >= MAX_ES_ROW_INJECT: 195 | inject_rows_to_es(to_inject, index_name) 196 | to_inject = [] 197 | else: 198 | logger.info("Invalid row") 199 | if len(to_inject) > 0: 200 | inject_rows_to_es(to_inject, index_name) 201 | 202 | 203 | def get_files(bucket_name): 204 | minio_client = Minio( 205 | MINIO_ENDPOINT, 206 | access_key=MINIO_ACCESS_KEY, 207 | secret_key=MINIO_SECRET_KEY, 208 | secure=MINIO_SCHEME == "https", 209 | ) 210 | logger.info("Parse file for bucket {}".format(bucket_name)) 211 | if not minio_client.bucket_exists(bucket_name): 212 | logger.error("Bucket {} does not exists".format(bucket_name)) 213 | return 214 | return list(minio_client.list_objects(bucket_name)) 215 | 216 | 217 | class ParseFiles(Task): 218 | def run(self, lookup_table, index_name): 219 | logger.info(lookup_table) 220 | for file in tqdm(get_files(bucket_name=bucket_name)): 221 | object_name = file.object_name 222 | try: 223 | logger.info(f"Processing file {object_name}...") 224 | process_file(lookup_table, index_name, bucket_name, object_name) 225 | except Exception as e: 226 | logger.error(traceback.format_exc()) 227 | logger.error(e) 228 | logger.error(f"Can't process file {object_name}") 229 | 230 | 231 | class GenerateEsMapping(Task): 232 | def run(self, index_name) -> str: 233 | """ 234 | 
Returns: 235 | str: index_name 236 | """ 237 | es_inst = get_es_instance() 238 | logger.info("Generating mapping for index {}".format(index_name)) 239 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 240 | response = es_inst.indices.create( 241 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 242 | ) 243 | if "acknowledged" in response: 244 | if response["acknowledged"] == True: 245 | logger.info( 246 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 247 | ) 248 | elif "error" in response: 249 | logger.error(response["error"]["root_cause"]) 250 | logger.error("Error type: {}".format(response["error"]["type"])) 251 | raise Exception("Unable to create index mapping") 252 | return index_name 253 | 254 | 255 | def read_lookup_table(lookup_file_path: str): 256 | logger.info("Loading lookup table...") 257 | lookup = {} 258 | with open(lookup_file_path, "r", newline="") as fp: 259 | char_read = 10000 if os.path.getsize(lookup_file_path) > 10000 else None 260 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 261 | fp.seek(0) 262 | reader = clevercsv.reader(fp, dialect) 263 | next(reader) 264 | for row in tqdm(reader, unit="entry"): 265 | for location in [ 266 | row[6], # Province_State 267 | row[7], # Country_Region 268 | row[10], # Combined_Key 269 | ]: 270 | if location and location not in lookup: 271 | if row[8] and row[9]: # Lat, Long 272 | lookup[location] = ( 273 | {"lat": float(row[8]), "lon": float(row[9])}, 274 | row[1], 275 | ) 276 | logger.info(f"Found {len(lookup)} locations.") 277 | return lookup 278 | 279 | 280 | lookup_table = read_lookup_table("/usr/app/UID_ISO_FIPS_LookUp_Table.csv") 281 | 282 | schedule = IntervalSchedule( 283 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 284 | ) 285 | with Flow(flow_name, schedule=schedule) as flow: 286 | es_mapping_task = GenerateEsMapping() 287 | index_name = es_mapping_task(index_name) 288 | 289 | parse_files_task = ParseFiles() 290 | parse_files_task(lookup_table=lookup_table, index_name=index_name) 291 | 292 | if __name__ == "__main__": 293 | 294 | try: 295 | client = Client() 296 | client.create_project(project_name=project_name) 297 | except prefect.utilities.exceptions.ClientError as e: 298 | logger.info("Project already exists") 299 | 300 | flow.register( 301 | project_name=project_name, labels=["development"], add_default_labels=False 302 | ) 303 | -------------------------------------------------------------------------------- /flow/scripts/mapping.py: -------------------------------------------------------------------------------- 1 | mapping = { 2 | "mappings": { 3 | "properties": { 4 | "date_start": { 5 | "type": "date", 6 | "format": "strict_date_optional_time||epoch_millis", 7 | }, 8 | "date_end": { 9 | "type": "date", 10 | "format": "strict_date_optional_time||epoch_millis", 11 | }, 12 | "location": {"type": "geo_point"}, 13 | "location_name": { 14 | "type": "text", 15 | "fields": {"keyword": {"type": "keyword"}}, 16 | }, 17 | "confirmed": {"type": "long"}, 18 | "deaths": {"type": "long"}, 19 | "vaccinated": {"type": "long"}, 20 | "tested": {"type": "long"}, 21 | "recovered": {"type": "long"}, 22 | "filename": {"type": "text"}, 23 | "iso_code2": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 24 | "iso_region2": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 25 | "max_population": {"type": "long"}, 26 | "percentage": {"type": "float"}, 27 | } 28 | } 29 | } 
-------------------------------------------------------------------------------- /flow/scripts/parse_insert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import uuid 4 | import prefect 5 | import clevercsv 6 | from tqdm import tqdm 7 | from datetime import datetime, timedelta 8 | from prefect import Flow, Task, Client 9 | from minio import Minio 10 | from elasticsearch import Elasticsearch, helpers 11 | from ssl import create_default_context 12 | from geopy.geocoders import Nominatim 13 | from iso3166 import countries 14 | from prefect.schedules import IntervalSchedule 15 | 16 | from mapping import mapping 17 | 18 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 19 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 20 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 21 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 22 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 23 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 24 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 25 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 26 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 27 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 28 | 29 | columns_allowed = { 30 | "date": ["YearWeekISO", "dateRep", "date"], 31 | "location": ["ReportingCountry", "location", "countriesAndTerritories"], 32 | "cases": ["NumberDosesReceived", "new_vaccinations", "cases", "new_cases"], 33 | "population": ["population"], 34 | } 35 | 36 | logger = prefect.context.get("logger") 37 | 38 | extra_locations = {"EL": "GR"} 39 | 40 | locations_cache = {"World": None} 41 | 42 | 43 | def get_es_instance(): 44 | es_inst = Elasticsearch( 45 | [ELASTIC_ENDPOINT], 46 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 47 | scheme=ELASTIC_SCHEME, 48 | port=ELASTIC_PORT, 49 | verify_certs=False, 50 | ) 51 | return es_inst 52 | 53 | 54 | def format_date(date): 55 | date = date.replace("/", "-") 56 | p = re.compile("(\\d{4})-W(\\d{2})") 57 | weekMatches = p.match(date) 58 | if weekMatches is not None: 59 | groups = weekMatches.groups() 60 | date_start = datetime.strptime( 61 | f"{groups[0]}-W{int(groups[1]) - 1}-1", "%Y-W%W-%w" 62 | ).date() 63 | date_end = date_start + timedelta(days=6.9) 64 | return date_start.strftime("%Y-%m-%d"), date_end.strftime("%Y-%m-%d") 65 | p = re.compile("(\\d{2})-(\\d{2})-(\\d{4})") 66 | frDateMatches = p.match(date) 67 | if frDateMatches is not None: 68 | groups = frDateMatches.groups() 69 | date = f"{groups[2]}-{groups[1]}-{groups[0]}" 70 | return date, date 71 | p = re.compile("(\\d{4})-(\\d{2})-(\\d{2})") 72 | dateMatches = p.match(date) 73 | if dateMatches is not None: 74 | return date, date 75 | return None, None 76 | 77 | 78 | def format_location(location_name): 79 | if location_name in locations_cache: 80 | return locations_cache[location_name] 81 | geolocator = Nominatim(user_agent="pandemic-knowledge") 82 | location = geolocator.geocode( 83 | extra_locations[location_name] 84 | if location_name in extra_locations 85 | else location_name, 86 | addressdetails=True, 87 | ) 88 | 89 | if location is None or "country_code" not in location.raw["address"]: 90 | logger.info(location_name) 91 | locations_cache[location_name] = None 92 | return None 93 | 94 | iso2 = location.raw["address"]["country_code"].upper() 95 | 96 | iso3 = countries.get(iso2).alpha3 97 | 98 | locations_cache[location_name] = ( 99 | {"lat": location.latitude, "lon": location.longitude}, 100 | iso2, 101 | ) 102 | 103 | return 
locations_cache[location_name] 104 | 105 | 106 | def format_row(row, columns_indexes, filename, bucket_name): 107 | date_start, date_end = format_date(row[columns_indexes["date"]]) 108 | location = format_location(row[columns_indexes["location"]]) 109 | if location is None: 110 | return None 111 | max_population = ( 112 | int(float(row[columns_indexes["population"]])) 113 | if row[columns_indexes["population"]] != "" 114 | else 0 115 | ) 116 | cases = ( 117 | int(float(row[columns_indexes["cases"]])) 118 | if row[columns_indexes["cases"]] != "" 119 | else 0 120 | ) 121 | percentage = ( 122 | float(cases) / float(max_population) * 100 if max_population != 0 else None 123 | ) 124 | 125 | formatted = { 126 | "date_start": date_start, 127 | "date_end": date_end, 128 | "location": location[0], 129 | "filename": filename, 130 | "iso_code2": location[1], 131 | "max_population": max_population, 132 | "percentage": percentage, 133 | } 134 | 135 | formatted["vaccinated" if bucket_name == "vaccination" else "confirmed"] = cases 136 | 137 | return formatted 138 | 139 | 140 | def inject_rows_to_es(rows, bucket_name): 141 | es_inst = get_es_instance() 142 | 143 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 144 | 145 | actions = [ 146 | {"_index": bucket_name, "_id": uuid.uuid4(), "_source": row} for row in rows 147 | ] 148 | 149 | helpers.bulk(es_inst, actions) 150 | 151 | 152 | def parse_file(minio_client, obj): 153 | csv_file_path = "/tmp/" + str(uuid.uuid4()) 154 | minio_client.fget_object(obj.bucket_name, obj.object_name, csv_file_path) 155 | with open(csv_file_path, "r", newline="") as fp: 156 | char_read = 100000 if os.path.getsize(csv_file_path) > 100000 else None 157 | 158 | try: 159 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 160 | except Exception as e: 161 | logger.error(e) 162 | return [] 163 | 164 | fp.seek(0) 165 | reader = clevercsv.reader(fp, dialect) 166 | headers = next(reader) 167 | columns_indexes = {} 168 | malformed_csv = False 169 | for name in columns_allowed: 170 | for header in headers: 171 | index = ( 172 | headers.index(header) if header in columns_allowed[name] else None 173 | ) 174 | if index is None: 175 | continue 176 | columns_indexes[name] = index 177 | break 178 | if name not in columns_indexes: 179 | logger.error( 180 | "Header {} cannot be found in csv {}".format(name, obj.object_name) 181 | ) 182 | malformed_csv = True 183 | continue 184 | if malformed_csv is True: 185 | return [] 186 | for row in tqdm(reader, unit="entry"): 187 | row = format_row(row, columns_indexes, obj.object_name, obj.bucket_name) 188 | if row is not None: 189 | yield row 190 | return [] 191 | 192 | 193 | class ParseFiles(Task): 194 | def run(self, bucket_name): 195 | minio_client = Minio( 196 | MINIO_ENDPOINT, 197 | access_key=MINIO_ACCESS_KEY, 198 | secret_key=MINIO_SECRET_KEY, 199 | secure=MINIO_SCHEME == "https", 200 | ) 201 | logger.info("Parse file for bucket {}".format(bucket_name)) 202 | if not minio_client.bucket_exists(bucket_name): 203 | logger.error("Bucket {} does not exists".format(bucket_name)) 204 | return 205 | objects = minio_client.list_objects(bucket_name) 206 | for obj in objects: 207 | to_inject = [] 208 | for row in parse_file(minio_client, obj): 209 | to_inject.append(row) 210 | if len(to_inject) >= MAX_ES_ROW_INJECT: 211 | inject_rows_to_es(to_inject, bucket_name) 212 | to_inject = [] 213 | if len(to_inject) > 0: 214 | inject_rows_to_es(to_inject, bucket_name) 215 | 216 | 217 | class GenerateEsMapping(Task): 218 | def 
__init__(self, index_name, **kwargs): 219 | self.index_name = index_name 220 | super().__init__(**kwargs) 221 | 222 | def run(self): 223 | index_name = self.index_name 224 | es_inst = get_es_instance() 225 | 226 | logger.info("Generating mapping for index {}".format(index_name)) 227 | 228 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 229 | 230 | response = es_inst.indices.create( 231 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 232 | ) 233 | 234 | if "acknowledged" in response: 235 | if response["acknowledged"] == True: 236 | logger.info( 237 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 238 | ) 239 | elif "error" in response: 240 | logger.error(response["error"]["root_cause"]) 241 | logger.error("Error type: {}".format(response["error"]["type"])) 242 | raise Exception("Unable to create index mapping") 243 | 244 | 245 | schedule = IntervalSchedule( 246 | interval=timedelta(hours=24), start_date=datetime.utcnow() + timedelta(seconds=1) 247 | ) 248 | 249 | with Flow("Parse and insert csv files", schedule) as flow: 250 | for bucket in ["vaccination", "contamination"]: 251 | flow.set_dependencies( 252 | task=ParseFiles(), 253 | upstream_tasks=[GenerateEsMapping(bucket)], 254 | keyword_tasks=dict(bucket_name=bucket), 255 | ) 256 | 257 | try: 258 | client = Client() 259 | client.create_project(project_name="pandemic-knowledge") 260 | except prefect.utilities.exceptions.ClientError as e: 261 | logger.info("Project already exists") 262 | 263 | flow.register(project_name="pandemic-knowledge", labels=["development"]) -------------------------------------------------------------------------------- /illustrations/france_live_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/france_live_status.png -------------------------------------------------------------------------------- /illustrations/latest_news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/latest_news.png -------------------------------------------------------------------------------- /illustrations/live_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/live_dashboard.png -------------------------------------------------------------------------------- /illustrations/news_web_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/news_web_app.png -------------------------------------------------------------------------------- /illustrations/vaccination_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/vaccination_map.png -------------------------------------------------------------------------------- /insert.docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | insert_france_virtests: 6 | build: ./flow 
7 | command: python3 /usr/app/insert_france_virtests.py 8 | volumes: 9 | - /srv/docker/prefect/flows:/root/.prefect/flows 10 | - "./flow/scripts:/usr/app:ro" 11 | - type: bind 12 | source: ./flow/config.toml 13 | target: /root/.prefect/config.toml 14 | read_only: true 15 | env_file: 16 | - .env 17 | environment: 18 | MINIO_SCHEME: ${MINIO_SCHEME} 19 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 20 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 21 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 22 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 23 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 24 | ELASTIC_PORT: ${ELASTIC_PORT} 25 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 26 | ELASTIC_USER: ${ELASTIC_USER} 27 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 28 | 29 | insert_france: 30 | build: ./flow 31 | command: python3 /usr/app/insert_france.py 32 | volumes: 33 | - /srv/docker/prefect/flows:/root/.prefect/flows 34 | - "./flow/scripts:/usr/app:ro" 35 | - type: bind 36 | source: ./flow/config.toml 37 | target: /root/.prefect/config.toml 38 | read_only: true 39 | env_file: 40 | - .env 41 | environment: 42 | MINIO_SCHEME: ${MINIO_SCHEME} 43 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 44 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 45 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 46 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 47 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 48 | ELASTIC_PORT: ${ELASTIC_PORT} 49 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 50 | ELASTIC_USER: ${ELASTIC_USER} 51 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 52 | 53 | insert_owid: 54 | build: ./flow 55 | command: python3 /usr/app/insert_owid.py 56 | volumes: 57 | - /srv/docker/prefect/flows:/root/.prefect/flows 58 | - "./flow/scripts:/usr/app:ro" 59 | - type: bind 60 | source: ./flow/config.toml 61 | target: /root/.prefect/config.toml 62 | read_only: true 63 | env_file: 64 | - .env 65 | environment: 66 | MINIO_SCHEME: ${MINIO_SCHEME} 67 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 68 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 69 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 70 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 71 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 72 | ELASTIC_PORT: ${ELASTIC_PORT} 73 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 74 | ELASTIC_USER: ${ELASTIC_USER} 75 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 76 | 77 | parse_insert: 78 | build: ./flow 79 | command: python3 /usr/app/parse_insert.py 80 | volumes: 81 | - /srv/docker/prefect/flows:/root/.prefect/flows 82 | - "./flow/scripts:/usr/app:ro" 83 | - type: bind 84 | source: ./flow/config.toml 85 | target: /root/.prefect/config.toml 86 | read_only: true 87 | env_file: 88 | - .env 89 | environment: 90 | MINIO_SCHEME: ${MINIO_SCHEME} 91 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 92 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 93 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 94 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 95 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 96 | ELASTIC_PORT: ${ELASTIC_PORT} 97 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 98 | ELASTIC_USER: ${ELASTIC_USER} 99 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 100 | -------------------------------------------------------------------------------- /instances/pem.yml: -------------------------------------------------------------------------------- 1 | instances: 2 | - name: es01 3 | dns: 4 | - es01 5 | - localhost 6 | ip: 7 | - 127.0.0.1 8 | 9 | - name: es02 10 | dns: 11 | - es02 12 | - localhost 13 | ip: 14 | - 127.0.0.1 15 | 16 | - name: es03 17 | dns: 18 | - es03 19 | - localhost 20 | ip: 21 | - 127.0.0.1 22 | 23 | - name: kibana 24 | dns: 25 | - kibana 26 | - localhost 27 | ip: 28 | - 127.0.0.1 29 | 
-------------------------------------------------------------------------------- /instances/pkcs_12.yml: -------------------------------------------------------------------------------- 1 | # For the moment, Enterprise Search only accepts PKCS#12 keystore 2 | # so we are forced to specifically create a certutil file for it. 3 | 4 | instances: 5 | 6 | - name: enterprise_search 7 | dns: 8 | - enterprise_search 9 | - localhost 10 | ip: 11 | - 127.0.0.1 12 | -------------------------------------------------------------------------------- /kibana.yml: -------------------------------------------------------------------------------- 1 | server.name: kibana 2 | server.host: "0.0.0.0" 3 | elasticsearch.hosts: [ "https://es01:9200" ] 4 | telemetry.enabled: true 5 | 6 | xpack.monitoring.ui.container.elasticsearch.enabled: "true" 7 | elasticsearch.username: elastic 8 | elasticsearch.password: elastic 9 | 10 | # Encrypt traffic between the browser and Kibana 11 | server.ssl.enabled: "true" 12 | server.ssl.certificate: "/usr/share/elasticsearch/config/certificates/kibana/kibana.crt" 13 | server.ssl.key: "/usr/share/elasticsearch/config/certificates/kibana/kibana.key" 14 | 15 | # Encrypt traffic between Kibana and Elasticsearch 16 | elasticsearch.ssl.certificateAuthorities: ["/usr/share/elasticsearch/config/certificates/ca/ca.crt"] 17 | 18 | # Enterprise Search 19 | enterpriseSearch.host: 'http://enterprise_search:3002' 20 | -------------------------------------------------------------------------------- /news_app/app/.gitignore: -------------------------------------------------------------------------------- 1 | app/node_modules/ -------------------------------------------------------------------------------- /news_app/app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:12-alpine 2 | 3 | WORKDIR '/app' 4 | 5 | COPY entrypoint.sh /entrypoint.sh 6 | ENTRYPOINT [ "/entrypoint.sh" ] -------------------------------------------------------------------------------- /news_app/app/app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pandemic-knowledge", 3 | "version": "1.0.0", 4 | "homepage": ".", 5 | "dependencies": { 6 | "@trendmicro/react-sidenav": "0.5.0", 7 | "bootstrap": "^4.3.1", 8 | "json-loader": "^0.5.7", 9 | "react": "^16.11.0", 10 | "react-bootstrap": "^1.0.0-beta.14", 11 | "react-bootstrap-icons": "1.0.1-alpha3", 12 | "react-click-outsider": "^1.1.1", 13 | "react-cookie": "4.0.3", 14 | "react-dom": "^16.11.0", 15 | "react-highlight-words": "^0.17.0", 16 | "react-loader-spinner": "3.1.14", 17 | "react-notifications": "1.6.0", 18 | "react-router-dom": "^5.1.2", 19 | "react-scripts": "3.2.0", 20 | "searchkit": "^2.4.4", 21 | "styled-components": "^4.4.0" 22 | }, 23 | "scripts": { 24 | "start": "/app/node_modules/react-scripts/bin/react-scripts.js start", 25 | "build": "/app/node_modules/react-scripts/bin/react-scripts.js build", 26 | "test": "/app/node_modules/react-scripts/bin/react-scripts.js test", 27 | "eject": "/app/node_modules/react-scripts/bin/react-scripts.js eject" 28 | }, 29 | "eslintConfig": { 30 | "extends": "react-app" 31 | }, 32 | "browserslist": { 33 | "production": [ 34 | ">0.2%", 35 | "not dead", 36 | "not op_mini all" 37 | ], 38 | "development": [ 39 | "last 1 chrome version", 40 | "last 1 firefox version", 41 | "last 1 safari version" 42 | ] 43 | } 44 | } 45 | -------------------------------------------------------------------------------- 
/news_app/app/app/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Pandemic Knowledge 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 | -------------------------------------------------------------------------------- /news_app/app/app/public/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/news_app/app/app/public/logo.png -------------------------------------------------------------------------------- /news_app/app/app/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "PK", 3 | "name": "Pandemic Knowledge", 4 | "icons": [ 5 | { 6 | "src": "logo.png", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | "start_url": ".", 12 | "display": "standalone", 13 | "theme_color": "#000000", 14 | "background_color": "#ffffff" 15 | } 16 | -------------------------------------------------------------------------------- /news_app/app/app/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | -------------------------------------------------------------------------------- /news_app/app/app/src/About.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react' 2 | import packageJson from '../package.json'; 3 | 4 | export class About extends Component { 5 | 6 | render() { 7 | return ( 8 |
<div>
 9 | <h1>About</h1>
 10 | <p>A fully-featured multi-source data pipeline for continuously extracting knowledge from COVID-19 data.</p>
 11 | <p>If you find an issue or have a suggestion, please open an issue on Github.</p>
 12 | <hr />
 13 | <p>Version {packageJson["version"]}</p>
 14 | </div>
15 | ) 16 | } 17 | 18 | componentDidMount() { 19 | document.title = "About - Pandemic Knowledge"; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /news_app/app/app/src/App.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react' 2 | import { HashRouter as Router, Route, Switch } from 'react-router-dom' 3 | import { NotificationContainer } from 'react-notifications' 4 | import { NavigationBar } from './NavigationBar' 5 | import { Layout } from './Layout' 6 | import Home from './Home' 7 | import { About } from './About' 8 | import packageJson from '../package.json' 9 | 10 | export class App extends Component { 11 | 12 | /** 13 | * Child components may trigger this parent event to 14 | * inform other routes ( for example), 15 | * that authentication information have been updated. 16 | * 17 | * This allows to show the "Login" or "Logout" button 18 | * depending on user's authentication status. 19 | */ 20 | onAuthUpdate = () => {} 21 | 22 | render() { 23 | return ( 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | ) 37 | } 38 | } 39 | 40 | export default App 41 | -------------------------------------------------------------------------------- /news_app/app/app/src/App.test.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | 5 | it('renders without crashing', () => { 6 | const div = document.createElement('div'); 7 | ReactDOM.render(, div); 8 | ReactDOM.unmountComponentAtNode(div); 9 | }); 10 | 11 | -------------------------------------------------------------------------------- /news_app/app/app/src/Home.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Container, Row, Col } from 'react-bootstrap'; 3 | import styled from 'styled-components'; 4 | import SearchUI from './SearchUI'; 5 | 6 | const Styles = styled.div` 7 | .paddind-bottom { 8 | padding-bottom: 16px; 9 | } 10 | `; 11 | 12 | class Home extends Component { 13 | render() { 14 | return ( 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | ); 25 | } 26 | 27 | componentDidMount() { 28 | document.title = 'Search - Pandemic Knowledge'; 29 | } 30 | } 31 | 32 | export default Home; 33 | -------------------------------------------------------------------------------- /news_app/app/app/src/Layout.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Container from 'react-bootstrap/Container'; 3 | 4 | export const Layout = (props) => ( 5 | 6 | {props.children} 7 | 8 | ) -------------------------------------------------------------------------------- /news_app/app/app/src/NavigationBar.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react' 2 | import { Nav, Navbar } from 'react-bootstrap' 3 | import styled from 'styled-components' 4 | import { Link } from 'react-router-dom' 5 | import packageJson from '../package.json' 6 | 7 | const Styles = styled.div` 8 | .navbar { 9 | background-color: #222; 10 | } 11 | 12 | .navbar-brand, .navbar-nav .nav-link { 13 | color: #bbb; 14 | 15 | &:hover { 16 | color: white; 17 | } 18 | } 19 | 20 | .brand-image { 21 | max-width: 64px; 22 | height: 30px; 23 | padding-right: 16px; 24 | } 25 | `; 26 | 27 | 
export class NavigationBar extends Component { 28 | 29 | render() { 30 | return ( 31 | 32 | 33 | 34 | Logo Pandemic Knowledge 39 | {'Pandemic Knowledge'} 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | ) 50 | } 51 | 52 | } 53 | 54 | export default NavigationBar; -------------------------------------------------------------------------------- /news_app/app/app/src/SearchUI.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Row, Col, Card } from 'react-bootstrap'; 3 | import { SearchkitManager, SearchkitProvider, SearchBox, Hits } from 'searchkit'; 4 | import Highlighter from 'react-highlight-words'; 5 | 6 | const search_kit = new SearchkitManager('https://172.17.0.1:9200/news_*/', { 7 | basicAuth: 'elastic:elastic' 8 | }); 9 | 10 | export class SearchUI extends Component { 11 | state = { 12 | queryValue: '' 13 | }; 14 | 15 | queryBuilder = (queryString) => { 16 | this.setState({ queryValue: queryString }); 17 | return { 18 | bool: { 19 | must: [], 20 | filter: [ 21 | { 22 | multi_match: { 23 | type: 'best_fields', 24 | query: queryString, 25 | lenient: true 26 | } 27 | } 28 | ], 29 | should: [], 30 | must_not: [] 31 | } 32 | }; 33 | }; 34 | 35 | render() { 36 | return ( 37 | 38 |
39 | 40 | 41 | 48 | 49 | 50 | 51 | } /> 52 | 53 |
54 |
55 | ); 56 | } 57 | } 58 | 59 | class News extends Component { 60 | render() { 61 | return ( 62 | { 64 | window.open(this.props.result._source.link) 65 | }} 66 | style={{ 67 | cursor: "pointer" 68 | }} 69 | title={this.props.result._source.link} 70 | > 71 | {} 72 | 73 | 74 | 75 | 76 | 81 |
82 | { 83 | (this.props.result._source.date) 84 | ? 85 | 86 | {new Date(this.props.result._source.date).toLocaleDateString('fr-FR')} 87 | 88 | : <> 89 | } 90 |
91 | 92 | 97 | 98 |
99 |
100 | 101 |
102 | ); 103 | } 104 | } 105 | 106 | export default SearchUI; 107 | -------------------------------------------------------------------------------- /news_app/app/app/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen", "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue", sans-serif; 4 | -webkit-font-smoothing: antialiased; 5 | -moz-osx-font-smoothing: grayscale; 6 | } 7 | 8 | code { 9 | font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New", monospace; 10 | } -------------------------------------------------------------------------------- /news_app/app/app/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './index.css'; 4 | import App from './App'; 5 | import * as serviceWorker from './serviceWorker'; 6 | 7 | ReactDOM.render(, document.getElementById('root')); 8 | 9 | // If you want your app to work offline and load faster, you can change 10 | // unregister() to register() below. Note this comes with some pitfalls. 11 | // Learn more about service workers: https://bit.ly/CRA-PWA 12 | serviceWorker.unregister(); 13 | -------------------------------------------------------------------------------- /news_app/app/app/src/serviceWorker.js: -------------------------------------------------------------------------------- 1 | // This optional code is used to register a service worker. 2 | // register() is not called by default. 3 | 4 | // This lets the app load faster on subsequent visits in production, and gives 5 | // it offline capabilities. However, it also means that developers (and users) 6 | // will only see deployed updates on subsequent visits to a page, after all the 7 | // existing tabs open on the page have been closed, since previously cached 8 | // resources are updated in the background. 9 | 10 | // To learn more about the benefits of this model and instructions on how to 11 | // opt-in, read https://bit.ly/CRA-PWA 12 | 13 | const isLocalhost = Boolean( 14 | window.location.hostname === 'localhost' || 15 | // [::1] is the IPv6 localhost address. 16 | window.location.hostname === '[::1]' || 17 | // 127.0.0.1/8 is considered localhost for IPv4. 18 | window.location.hostname.match( 19 | /^127(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}$/ 20 | ) 21 | ); 22 | 23 | export function register(config) { 24 | if (process.env.NODE_ENV === 'production' && 'serviceWorker' in navigator) { 25 | // The URL constructor is available in all browsers that support SW. 26 | const publicUrl = new URL(process.env.PUBLIC_URL, window.location.href); 27 | if (publicUrl.origin !== window.location.origin) { 28 | // Our service worker won't work if PUBLIC_URL is on a different origin 29 | // from what our page is served on. This might happen if a CDN is used to 30 | // serve assets; see https://github.com/facebook/create-react-app/issues/2374 31 | return; 32 | } 33 | 34 | window.addEventListener('load', () => { 35 | const swUrl = `${process.env.PUBLIC_URL}/service-worker.js`; 36 | 37 | if (isLocalhost) { 38 | // This is running on localhost. Let's check if a service worker still exists or not. 39 | checkValidServiceWorker(swUrl, config); 40 | 41 | // Add some additional logging to localhost, pointing developers to the 42 | // service worker/PWA documentation. 
43 | navigator.serviceWorker.ready.then(() => { 44 | console.log( 45 | 'This web app is being served cache-first by a service ' + 46 | 'worker. To learn more, visit https://bit.ly/CRA-PWA' 47 | ); 48 | }); 49 | } else { 50 | // Is not localhost. Just register service worker 51 | registerValidSW(swUrl, config); 52 | } 53 | }); 54 | } 55 | } 56 | 57 | function registerValidSW(swUrl, config) { 58 | navigator.serviceWorker 59 | .register(swUrl) 60 | .then(registration => { 61 | registration.onupdatefound = () => { 62 | const installingWorker = registration.installing; 63 | if (installingWorker == null) { 64 | return; 65 | } 66 | installingWorker.onstatechange = () => { 67 | if (installingWorker.state === 'installed') { 68 | if (navigator.serviceWorker.controller) { 69 | // At this point, the updated precached content has been fetched, 70 | // but the previous service worker will still serve the older 71 | // content until all client tabs are closed. 72 | console.log( 73 | 'New content is available and will be used when all ' + 74 | 'tabs for this page are closed. See https://bit.ly/CRA-PWA.' 75 | ); 76 | 77 | // Execute callback 78 | if (config && config.onUpdate) { 79 | config.onUpdate(registration); 80 | } 81 | } else { 82 | // At this point, everything has been precached. 83 | // It's the perfect time to display a 84 | // "Content is cached for offline use." message. 85 | console.log('Content is cached for offline use.'); 86 | 87 | // Execute callback 88 | if (config && config.onSuccess) { 89 | config.onSuccess(registration); 90 | } 91 | } 92 | } 93 | }; 94 | }; 95 | }) 96 | .catch(error => { 97 | console.error('Error during service worker registration:', error); 98 | }); 99 | } 100 | 101 | function checkValidServiceWorker(swUrl, config) { 102 | // Check if the service worker can be found. If it can't reload the page. 103 | fetch(swUrl) 104 | .then(response => { 105 | // Ensure service worker exists, and that we really are getting a JS file. 106 | const contentType = response.headers.get('content-type'); 107 | if ( 108 | response.status === 404 || 109 | (contentType != null && contentType.indexOf('javascript') === -1) 110 | ) { 111 | // No service worker found. Probably a different app. Reload the page. 112 | navigator.serviceWorker.ready.then(registration => { 113 | registration.unregister().then(() => { 114 | window.location.reload(); 115 | }); 116 | }); 117 | } else { 118 | // Service worker found. Proceed as normal. 119 | registerValidSW(swUrl, config); 120 | } 121 | }) 122 | .catch(() => { 123 | console.log( 124 | 'No internet connection found. App is running in offline mode.' 
125 | ); 126 | }); 127 | } 128 | 129 | export function unregister() { 130 | if ('serviceWorker' in navigator) { 131 | navigator.serviceWorker.ready.then(registration => { 132 | registration.unregister(); 133 | }); 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /news_app/app/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | npm install 4 | npm run start 5 | -------------------------------------------------------------------------------- /news_app/app/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "requires": true, 3 | "lockfileVersion": 1, 4 | "dependencies": { 5 | "@babel/runtime": { 6 | "version": "7.13.17", 7 | "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.13.17.tgz", 8 | "integrity": "sha512-NCdgJEelPTSh+FEFylhnP1ylq848l1z9t9N0j1Lfbcw0+KXGjsTvUmkxy+voLLXB5SOKMbLLx4jxYliGrYQseA==", 9 | "requires": { 10 | "regenerator-runtime": "^0.13.4" 11 | } 12 | }, 13 | "@elastic/search-ui": { 14 | "version": "1.5.1", 15 | "resolved": "https://registry.npmjs.org/@elastic/search-ui/-/search-ui-1.5.1.tgz", 16 | "integrity": "sha512-ssfvX1q76X1UwqYASWtBni4PZ+3SYk1PvHmOjpVf9BYai1OqZLGVaj8Sw+cE1ia56zl5In7viCfciC+CP31ovA==", 17 | "requires": { 18 | "date-fns": "^1.30.1", 19 | "deep-equal": "^1.0.1", 20 | "history": "^4.9.0", 21 | "qs": "^6.7.0" 22 | } 23 | }, 24 | "call-bind": { 25 | "version": "1.0.2", 26 | "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.2.tgz", 27 | "integrity": "sha512-7O+FbCihrB5WGbFYesctwmTKae6rOiIzmz1icreWJ+0aA7LJfuqhEso2T9ncpcFtzMQtzXf2QGGueWJGTYsqrA==", 28 | "requires": { 29 | "function-bind": "^1.1.1", 30 | "get-intrinsic": "^1.0.2" 31 | } 32 | }, 33 | "date-fns": { 34 | "version": "1.30.1", 35 | "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-1.30.1.tgz", 36 | "integrity": "sha512-hBSVCvSmWC+QypYObzwGOd9wqdDpOt+0wl0KbU+R+uuZBS1jN8VsD1ss3irQDknRj5NvxiTF6oj/nDRnN/UQNw==" 37 | }, 38 | "deep-equal": { 39 | "version": "1.1.1", 40 | "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.1.1.tgz", 41 | "integrity": "sha512-yd9c5AdiqVcR+JjcwUQb9DkhJc8ngNr0MahEBGvDiJw8puWab2yZlh+nkasOnZP+EGTAP6rRp2JzJhJZzvNF8g==", 42 | "requires": { 43 | "is-arguments": "^1.0.4", 44 | "is-date-object": "^1.0.1", 45 | "is-regex": "^1.0.4", 46 | "object-is": "^1.0.1", 47 | "object-keys": "^1.1.1", 48 | "regexp.prototype.flags": "^1.2.0" 49 | } 50 | }, 51 | "define-properties": { 52 | "version": "1.1.3", 53 | "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.1.3.tgz", 54 | "integrity": "sha512-3MqfYKj2lLzdMSf8ZIZE/V+Zuy+BgD6f164e8K2w7dgnpKArBDerGYpM46IYYcjnkdPNMjPk9A6VFB8+3SKlXQ==", 55 | "requires": { 56 | "object-keys": "^1.0.12" 57 | } 58 | }, 59 | "function-bind": { 60 | "version": "1.1.1", 61 | "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", 62 | "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==" 63 | }, 64 | "get-intrinsic": { 65 | "version": "1.1.1", 66 | "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.1.1.tgz", 67 | "integrity": "sha512-kWZrnVM42QCiEA2Ig1bG8zjoIMOgxWwYCEeNdwY6Tv/cOSeGpcoX4pXHfKUxNKVoArnrEr2e9srnAxxGIraS9Q==", 68 | "requires": { 69 | "function-bind": "^1.1.1", 70 | "has": "^1.0.3", 71 | "has-symbols": "^1.0.1" 72 | } 73 | }, 74 | "has": { 75 | "version": "1.0.3", 76 | "resolved": 
"https://registry.npmjs.org/has/-/has-1.0.3.tgz", 77 | "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", 78 | "requires": { 79 | "function-bind": "^1.1.1" 80 | } 81 | }, 82 | "has-symbols": { 83 | "version": "1.0.2", 84 | "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.2.tgz", 85 | "integrity": "sha512-chXa79rL/UC2KlX17jo3vRGz0azaWEx5tGqZg5pO3NUyEJVB17dMruQlzCCOfUvElghKcm5194+BCRvi2Rv/Gw==" 86 | }, 87 | "history": { 88 | "version": "4.10.1", 89 | "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", 90 | "integrity": "sha512-36nwAD620w12kuzPAsyINPWJqlNbij+hpK1k9XRloDtym8mxzGYl2c17LnV6IAGB2Dmg4tEa7G7DlawS0+qjew==", 91 | "requires": { 92 | "@babel/runtime": "^7.1.2", 93 | "loose-envify": "^1.2.0", 94 | "resolve-pathname": "^3.0.0", 95 | "tiny-invariant": "^1.0.2", 96 | "tiny-warning": "^1.0.0", 97 | "value-equal": "^1.0.1" 98 | } 99 | }, 100 | "is-arguments": { 101 | "version": "1.1.0", 102 | "resolved": "https://registry.npmjs.org/is-arguments/-/is-arguments-1.1.0.tgz", 103 | "integrity": "sha512-1Ij4lOMPl/xB5kBDn7I+b2ttPMKa8szhEIrXDuXQD/oe3HJLTLhqhgGspwgyGd6MOywBUqVvYicF72lkgDnIHg==", 104 | "requires": { 105 | "call-bind": "^1.0.0" 106 | } 107 | }, 108 | "is-date-object": { 109 | "version": "1.0.2", 110 | "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.2.tgz", 111 | "integrity": "sha512-USlDT524woQ08aoZFzh3/Z6ch9Y/EWXEHQ/AaRN0SkKq4t2Jw2R2339tSXmwuVoY7LLlBCbOIlx2myP/L5zk0g==" 112 | }, 113 | "is-regex": { 114 | "version": "1.1.2", 115 | "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.2.tgz", 116 | "integrity": "sha512-axvdhb5pdhEVThqJzYXwMlVuZwC+FF2DpcOhTS+y/8jVq4trxyPgfcwIxIKiyeuLlSQYKkmUaPQJ8ZE4yNKXDg==", 117 | "requires": { 118 | "call-bind": "^1.0.2", 119 | "has-symbols": "^1.0.1" 120 | } 121 | }, 122 | "js-tokens": { 123 | "version": "4.0.0", 124 | "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", 125 | "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==" 126 | }, 127 | "loose-envify": { 128 | "version": "1.4.0", 129 | "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", 130 | "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", 131 | "requires": { 132 | "js-tokens": "^3.0.0 || ^4.0.0" 133 | } 134 | }, 135 | "object-inspect": { 136 | "version": "1.10.2", 137 | "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.10.2.tgz", 138 | "integrity": "sha512-gz58rdPpadwztRrPjZE9DZLOABUpTGdcANUgOwBFO1C+HZZhePoP83M65WGDmbpwFYJSWqavbl4SgDn4k8RYTA==" 139 | }, 140 | "object-is": { 141 | "version": "1.1.5", 142 | "resolved": "https://registry.npmjs.org/object-is/-/object-is-1.1.5.tgz", 143 | "integrity": "sha512-3cyDsyHgtmi7I7DfSSI2LDp6SK2lwvtbg0p0R1e0RvTqF5ceGx+K2dfSjm1bKDMVCFEDAQvy+o8c6a7VujOddw==", 144 | "requires": { 145 | "call-bind": "^1.0.2", 146 | "define-properties": "^1.1.3" 147 | } 148 | }, 149 | "object-keys": { 150 | "version": "1.1.1", 151 | "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", 152 | "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==" 153 | }, 154 | "qs": { 155 | "version": "6.10.1", 156 | "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.1.tgz", 157 | "integrity": 
"sha512-M528Hph6wsSVOBiYUnGf+K/7w0hNshs/duGsNXPUCLH5XAqjEtiPGwNONLV0tBH8NoGb0mvD5JubnUTrujKDTg==", 158 | "requires": { 159 | "side-channel": "^1.0.4" 160 | } 161 | }, 162 | "regenerator-runtime": { 163 | "version": "0.13.7", 164 | "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.7.tgz", 165 | "integrity": "sha512-a54FxoJDIr27pgf7IgeQGxmqUNYrcV338lf/6gH456HZ/PhX+5BcwHXG9ajESmwe6WRO0tAzRUrRmNONWgkrew==" 166 | }, 167 | "regexp.prototype.flags": { 168 | "version": "1.3.1", 169 | "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.3.1.tgz", 170 | "integrity": "sha512-JiBdRBq91WlY7uRJ0ds7R+dU02i6LKi8r3BuQhNXn+kmeLN+EfHhfjqMRis1zJxnlu88hq/4dx0P2OP3APRTOA==", 171 | "requires": { 172 | "call-bind": "^1.0.2", 173 | "define-properties": "^1.1.3" 174 | } 175 | }, 176 | "resolve-pathname": { 177 | "version": "3.0.0", 178 | "resolved": "https://registry.npmjs.org/resolve-pathname/-/resolve-pathname-3.0.0.tgz", 179 | "integrity": "sha512-C7rARubxI8bXFNB/hqcp/4iUeIXJhJZvFPFPiSPRnhU5UPxzMFIl+2E6yY6c4k9giDJAhtV+enfA+G89N6Csng==" 180 | }, 181 | "side-channel": { 182 | "version": "1.0.4", 183 | "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", 184 | "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", 185 | "requires": { 186 | "call-bind": "^1.0.0", 187 | "get-intrinsic": "^1.0.2", 188 | "object-inspect": "^1.9.0" 189 | } 190 | }, 191 | "tiny-invariant": { 192 | "version": "1.1.0", 193 | "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.1.0.tgz", 194 | "integrity": "sha512-ytxQvrb1cPc9WBEI/HSeYYoGD0kWnGEOR8RY6KomWLBVhqz0RgTwVO9dLrGz7dC+nN9llyI7OKAgRq8Vq4ZBSw==" 195 | }, 196 | "tiny-warning": { 197 | "version": "1.0.3", 198 | "resolved": "https://registry.npmjs.org/tiny-warning/-/tiny-warning-1.0.3.tgz", 199 | "integrity": "sha512-lBN9zLN/oAf68o3zNXYrdCt1kP8WsiGW8Oo2ka41b2IM5JL/S1CTyX1rW0mb/zSuJun0ZUrDxx4sqvYS2FWzPA==" 200 | }, 201 | "value-equal": { 202 | "version": "1.0.1", 203 | "resolved": "https://registry.npmjs.org/value-equal/-/value-equal-1.0.1.tgz", 204 | "integrity": "sha512-NOJ6JZCAWr0zlxZt+xqCHNTEKOsrks2HQd4MqhP1qy4z1SkbEP467eNx6TgDKXMvUOb+OENfJCZwM+16n7fRfw==" 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /news_app/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | 5 | app: 6 | build: ./app 7 | restart: always 8 | ports: 9 | - "8080:3000" 10 | volumes: 11 | - ./app/app:/app 12 | environment: 13 | NODE_ENV: "development" 14 | CHOKIDAR_USEPOLLING: "true" 15 | -------------------------------------------------------------------------------- /pandemic_knowledge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/pandemic_knowledge.png -------------------------------------------------------------------------------- /prefect/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | RUN apt-get install gcc 4 | 5 | RUN python3 -m pip install prefect 6 | COPY prefect.config /root/.prefect/config.toml 7 | 8 | ENTRYPOINT \ 9 | prefect backend server && \ 10 | prefect server create-tenant --name default --slug default 11 | 
-------------------------------------------------------------------------------- /prefect/prefect.config: -------------------------------------------------------------------------------- 1 | # debug mode 2 | debug = true 3 | 4 | # base configuration directory (typically you won't change this!) 5 | home_dir = "~/.prefect" 6 | 7 | backend = "server" 8 | 9 | [server] 10 | host = "http://prefect_apollo" 11 | port = "4200" 12 | host_port = "4200" 13 | endpoint = "${server.host}:${server.port}" 14 | --------------------------------------------------------------------------------