├── .env.example ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── agent ├── Dockerfile ├── config.toml ├── docker-compose.yml └── requirements.txt ├── crawl.docker-compose.yml ├── create-certs.yml ├── docker-compose.yml ├── flow ├── Dockerfile ├── config.toml ├── requirements.txt └── scripts │ ├── UID_ISO_FIPS_LookUp_Table.csv │ ├── crawl_google_news.py │ ├── crawl_mapping.py │ ├── crawl_tweets.py │ ├── insert_france.py │ ├── insert_france_virtests.py │ ├── insert_owid.py │ ├── mapping.py │ └── parse_insert.py ├── illustrations ├── france_live_status.png ├── latest_news.png ├── live_dashboard.png ├── news_web_app.png └── vaccination_map.png ├── insert.docker-compose.yml ├── instances ├── pem.yml └── pkcs_12.yml ├── kibana.yml ├── news_app ├── app │ ├── .gitignore │ ├── Dockerfile │ ├── app │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── public │ │ │ ├── css │ │ │ │ └── bootstrap.css │ │ │ ├── index.html │ │ │ ├── logo.png │ │ │ ├── manifest.json │ │ │ └── robots.txt │ │ └── src │ │ │ ├── About.js │ │ │ ├── App.js │ │ │ ├── App.test.js │ │ │ ├── Home.js │ │ │ ├── Layout.js │ │ │ ├── NavigationBar.js │ │ │ ├── SearchUI.js │ │ │ ├── index.css │ │ │ ├── index.js │ │ │ └── serviceWorker.js │ ├── entrypoint.sh │ └── package-lock.json └── docker-compose.yml ├── pandemic_knowledge.png └── prefect ├── Dockerfile └── prefect.config /.env.example: -------------------------------------------------------------------------------- 1 | PREFECT_UI_TAG="latest" 2 | PREFECT_SERVER_TAG="latest" 3 | 4 | POSTGRES_USER="prefect_user" 5 | # PLEASE CHANGE ! 6 | POSTGRES_PASSWORD="prefect_password" 7 | POSTGRES_DB="prefect_db" 8 | # PLEASE CHANGE according to POSTGRES_PASSWORD ! 9 | DB_CONNECTION_URL="postgresql://prefect_user:prefect_password@prefect_postgres:5432/prefect_db" 10 | 11 | PREFECT_SERVER_DB_CMD="prefect-server database upgrade -y" 12 | # PLEASE CHANGE ! 
13 | PREFECT_SERVER__HASURA__ADMIN_SECRET="hasura-secret-admin-secret" 14 | PREFECT_SERVER__TELEMETRY__ENABLED="false" 15 | PREFECT_SERVER__APOLLO_URL="http://localhost:4200/graphql" 16 | 17 | MINIO_SCHEME=http 18 | MINIO_ENDPOINT=172.17.0.1:9000 19 | MINIO_ACCESS_KEY=minio 20 | MINIO_SECRET_KEY=minio123 21 | 22 | ELASTIC_SCHEME=https 23 | ELASTIC_PORT=9200 24 | ELASTIC_ENDPOINT=172.17.0.1 25 | ELASTIC_USER=elastic 26 | ELASTICSEARCH_PWD=elastic 27 | MAX_ES_ROW_INJECT=1000 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .env 3 | certs/ 4 | node_modules/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | services: 2 | - docker 3 | env: 4 | - DOCKER_COMPOSE_VERSION=1.23.2 5 | 6 | before_install: 7 | - sudo rm /usr/local/bin/docker-compose 8 | - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose 9 | - chmod +x docker-compose 10 | - sudo mv docker-compose /usr/local/bin 11 | 12 | script: 13 | - cp .env.example .env 14 | - docker-compose build 15 | - docker-compose -f crawl.docker-compose.yml build 16 | - docker-compose -f insert.docker-compose.yml build 17 | - docker-compose -f agent/docker-compose.yml build 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Flavien Berwick 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pandemic-Knowledge 2 | 3 | ![Pandemic Knowledge logo](./pandemic_knowledge.png) 4 | 5 |

6 | 7 | 8 | 9 | Code style: black 10 | 11 |

12 | 13 | A fully-featured multi-source data pipeline for continuously extracting knowledge from COVID-19 data. 14 | 15 | - Contamination figures 16 | - Vaccination figures 17 | - Death figures 18 | - COVID-19-related news (Google News, Twitter) 19 | 20 | ## What you can achieve 21 | 22 | | Live contaminations map + Latest news | Last 7 days news | 23 | | :---------------------------------------------------------------------------------: | :---------------------------------------------------: | 24 | | ![Live contamination and vaccination world map](./illustrations/live_dashboard.png) | ![Last news, live !](./illustrations/latest_news.png) | 25 | 26 | | France 3-weeks live map (Kibana Canvas) | Live vaccinations map | 27 | | :-----------------------------------------------------------: | :-----------------------------------------------------------: | 28 | | ![France Live Status](./illustrations/france_live_status.png) | ![World vaccination map](./illustrations/vaccination_map.png) | 29 | 30 | ## Context 31 | 32 | This project was realized over 4 days as part of a MSc hackathon from [ETNA](https://etna.io), a french computer science school. 33 | 34 | The incentives were both to experiment/prototype a big data pipeline and contribute to an open source project. 35 | 36 | ## Install 37 | 38 | Below, you'll find the procedure to process COVID-related file and news into the Pandemic Knowledge database (elasticsearch). 39 | 40 | The process is **scheduled** to run every 24 hours so you can update the files and obtain the latest news 41 | 42 | - [Pandemic-Knowledge](#pandemic-knowledge) 43 | - [What you can achieve](#what-you-can-achieve) 44 | - [Context](#context) 45 | - [Install](#install) 46 | - [Env file](#env-file) 47 | - [Initialize elasticsearch](#initialize-elasticsearch) 48 | - [Initialize Prefect](#initialize-prefect) 49 | - [Run Prefect workers](#run-prefect-workers) 50 | - [COVID-19 data](#covid-19-data) 51 | - [News data](#news-data) 52 | - [News web app](#news-web-app) 53 | 54 | ### Env file 55 | 56 | Running this project on your local computer ? Just copy the `.env.example` file : 57 | 58 | ```bash 59 | cp .env.example .env 60 | ``` 61 | 62 | Open this `.env` file and edit password-related variables. 63 | 64 | ### Initialize elasticsearch 65 | 66 | Raise your host's ulimits for ElasticSearch to handle high I/O : 67 | 68 | ```bash 69 | sudo sysctl -w vm.max_map_count=500000 70 | ``` 71 | 72 | Then : 73 | 74 | ```bash 75 | docker-compose -f create-certs.yml run --rm create_certs 76 | docker-compose up -d es01 es02 es03 kibana 77 | ``` 78 | 79 | ### Initialize Prefect 80 | 81 | Create a `~/.prefect/config.toml` file with the following content : 82 | 83 | ```bash 84 | # debug mode 85 | debug = true 86 | 87 | # base configuration directory (typically you won't change this!) 88 | home_dir = "~/.prefect" 89 | 90 | backend = "server" 91 | 92 | [server] 93 | host = "http://172.17.0.1" 94 | port = "4200" 95 | host_port = "4200" 96 | endpoint = "${server.host}:${server.port}" 97 | ``` 98 | 99 | Run Prefect : 100 | 101 | ```bash 102 | docker-compose up -d prefect_postgres prefect_hasura prefect_graphql prefect_towel prefect_apollo prefect_ui 103 | ``` 104 | 105 | We need to create a _tenant_. 
Execute on your host : 106 | 107 | ```bash 108 | pip3 install prefect 109 | prefect backend server 110 | prefect server create-tenant --name default --slug default 111 | ``` 112 | 113 | Access the web UI at [localhost:8081](http://localhost:8081) 114 | 115 | ### Run Prefect workers 116 | 117 | Agents are services that run your scheduled flows. 118 | 119 | 1. Open and optionally edit the [`agent/config.toml`](./agent/config.toml) file. 120 | 121 | 2. Let's instantiate 3 workers : 122 | 123 | ```bash 124 | docker-compose -f agent/docker-compose.yml up -d --build --scale agent=3 agent 125 | ``` 126 | 127 | > :information_source: You can run the agent on a different machine from the one hosting the Prefect server. Edit the [`agent/config.toml`](./agent/config.toml) file for that. 128 | 129 | ### COVID-19 data 130 | 131 | Injection scripts are scheduled in Prefect so they automatically re-inject the latest data (delete + inject). 132 | 133 | There are several data sources supported by Pandemic Knowledge : 134 | 135 | - [Our World In Data](https://ourworldindata.org/coronavirus-data); used by Google 136 | - docker-compose slug : `insert_owid` 137 | - MinIO bucket : `contamination-owid` 138 | - Format : CSV 139 | - [OpenCovid19-Fr](https://github.com/opencovid19-fr/data) 140 | - docker-compose slug : `insert_france` 141 | - Format : CSV (downloaded from the Internet) 142 | - [Public Health France - Virological test results](https://www.data.gouv.fr/en/datasets/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/) (official source) 143 | - docker-compose slug : `insert_france_virtests` 144 | - Format : CSV (downloaded from the Internet) 145 | 146 | 1. Start MinIO and import your files into the buckets mentioned above. 147 | 148 | For _Our World In Data_, create the `contamination-owid` bucket and import the CSV file inside. 149 | 150 | ```bash 151 | docker-compose up -d minio 152 | ``` 153 | 154 | > MinIO is available at `localhost:9000` 155 | 156 | 2. Download dependencies and start the injection service of your choice. For instance : 157 | 158 | ```bash 159 | pip3 install -r ./flow/requirements.txt 160 | docker-compose -f insert.docker-compose.yml up --build insert_owid 161 | ``` 162 | 163 | 3. In [Kibana](https://localhost:5601), create an index pattern `contamination_owid_*` 164 | 165 | 4. Once injected, we recommend adjusting the number of replicas [in the DevTool](https://localhost:5601/app/dev_tools#/console) : 166 | 167 | ```json 168 | PUT /contamination_owid_*/_settings 169 | { 170 | "index" : { 171 | "number_of_replicas" : "2" 172 | } 173 | } 174 | ``` 175 | 176 | 5. Start making your dashboards in [Kibana](https://localhost:5601) ! 177 | 178 | ### News data 179 | 180 | There are two sources for news : 181 | 182 | - Google News (elasticsearch index: `news_googlenews`) 183 | - Twitter (elasticsearch index: `news_tweets`) 184 | 185 | 1. Run the Google News crawler : 186 | 187 | ```bash 188 | docker-compose -f crawl.docker-compose.yml up --build crawl_google_news # and/or crawl_tweets 189 | ``` 190 | 191 | 2. In Kibana, create a `news_*` index pattern 192 | 193 | 3. **Edit** the index pattern fields : 194 | 195 | | Name | Type | Format | 196 | | ---- | ----------------------------------------------------- | ------- | 197 | | img | string | **Url** | 198 | | link | string **with Type: Image** with empty _URL template_ | **Url** | 199 | 200 | 4. Create your visualisations 201 | 202 | ### News web app 203 | 204 | Browse through the news with our web application.
205 | 206 | ![News web app](./illustrations/news_web_app.png) 207 | 208 | 1. Make sure you've accepted the self-signed certificate of Elasticsearch at [`https://localhost:9200`](https://localhost:9200) 209 | 210 | 2. Start up the app 211 | 212 | ```bash 213 | docker-compose -f news_app/docker-compose.yml up --build -d 214 | ``` 215 | 216 | 3. Discover the app at [`localhost:8080`](http://localhost:8080) 217 | 218 | --- 219 | 220 |
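If you want to query the crawled news outside of Kibana or the web app, here is a minimal sketch using the `elasticsearch` client already pinned in `flow/requirements.txt`. It assumes the default endpoint and credentials from `.env.example` and the `news_*` indices created by the crawlers; adapt them to your setup.

```python
from elasticsearch import Elasticsearch

# Default credentials from .env.example; change them in production.
es = Elasticsearch(
    ["https://localhost:9200"],
    http_auth=("elastic", "elastic"),
    verify_certs=False,  # self-signed certificates generated by create-certs.yml
)

# Five most recent news items crawled over the last 7 days, newest first.
response = es.search(
    index="news_*",
    body={
        "size": 5,
        "sort": [{"date": {"order": "desc"}}],
        "query": {"range": {"date": {"gte": "now-7d/d"}}},
    },
)

for hit in response["hits"]["hits"]:
    print(hit["_source"]["date"], hit["_source"]["title"])
```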
221 | TODOs 222 | 223 | Possible improvements : 224 | 225 | - [ ] [Using Dask for parallelizing](https://docs.prefect.io/core/idioms/parallel.html) the processing of CSV lines in batches of 1000 (see the sketch below) 226 | - [ ] Removing indices only when the source injection process is successful (add the new index first, then remove the old one) 227 | - [ ] Removing indices only when crawling is successful (add the new index first, then remove the old one) 228 | 229 |
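The first item above could look roughly like the sketch below. This is not the project's current implementation: it assumes Prefect 0.14 with Dask (`distributed`) installed, and the `load_rows` / `inject_batch` bodies are stand-ins for the existing `parse_file()` and `inject_rows_to_es()` helpers from `flow/scripts/`.

```python
from prefect import Flow, task
from prefect.executors import DaskExecutor

BATCH_SIZE = 1000

@task
def load_rows():
    # Stand-in for parse_file() in flow/scripts/insert_owid.py.
    return [{"location_name": "France", "confirmed": 1}] * 2500

@task
def make_batches(rows):
    # Split the parsed rows into fixed-size batches so each batch becomes one bulk request.
    return [rows[i : i + BATCH_SIZE] for i in range(0, len(rows), BATCH_SIZE)]

@task
def inject_batch(batch):
    # Stand-in for inject_rows_to_es(batch, index_name).
    print(f"bulk-indexing {len(batch)} rows")

with Flow("insert-owid-dask") as flow:
    batches = make_batches(load_rows())
    inject_batch.map(batches)  # one mapped task per batch, executed in parallel

if __name__ == "__main__":
    flow.run(executor=DaskExecutor())  # spawns a temporary local Dask cluster
```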
230 | 231 |
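The last two TODO items above describe the same pattern: write into a fresh index and only drop the old one once the new data is fully injected. Below is a hedged sketch of how that swap could look with the `elasticsearch` client; the `swap_index` helper and the `contamination_owid` alias name are illustrative, not part of the current code base.

```python
def swap_index(es, alias, new_index):
    """Atomically point `alias` at `new_index` and drop whatever it pointed to before."""
    old_indices = []
    if es.indices.exists_alias(name=alias):
        old_indices = list(es.indices.get_alias(name=alias).keys())
    actions = [{"add": {"index": new_index, "alias": alias}}]
    actions += [{"remove_index": {"index": old}} for old in old_indices]
    es.indices.update_aliases(body={"actions": actions})

# Intended usage inside a flow (es comes from the existing get_es_instance() helper):
#   new_index = "contamination_owid_20210501"   # e.g. a timestamped index name
#   es.indices.create(index=new_index, body=mapping)
#   ...bulk-inject into new_index, and only if that succeeds:
#   swap_index(es, "contamination_owid", new_index)
```

Dashboards would then read from the stable alias instead of the concrete index name, so a failed injection never leaves them empty.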
232 | Useful commands 233 | 234 | To stop everything : 235 | 236 | ```bash 237 | docker-compose down 238 | docker-compose -f agent/docker-compose.yml down 239 | docker-compose -f insert.docker-compose.yml down 240 | docker-compose -f crawl.docker-compose.yml down 241 | ``` 242 | 243 | To start each service, step by step : 244 | 245 | ```bash 246 | docker-compose up -d es01 es02 es03 kibana 247 | docker-compose up -d minio 248 | docker-compose up -d prefect_postgres prefect_hasura prefect_graphql prefect_towel prefect_apollo prefect_ui 249 | docker-compose -f agent/docker-compose.yml up -d --build --scale agent=3 agent 250 | ``` 251 | 252 |
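Before launching the flows, you may also want to check that the secured Elasticsearch cluster answers. A quick sanity check, assuming the default credentials from `.env.example` (`-k` skips verification of the self-signed certificate):

```bash
curl -k -u elastic:elastic https://localhost:9200/_cluster/health?pretty
curl -k -u elastic:elastic "https://localhost:9200/_cat/indices/news_*,contamination_*?v"
```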
253 | -------------------------------------------------------------------------------- /agent/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | RUN apt update && apt install uuid -y 4 | RUN pip install --upgrade pip 5 | 6 | COPY ./requirements.txt /requirements.txt 7 | 8 | RUN pip install -r /requirements.txt -------------------------------------------------------------------------------- /agent/config.toml: -------------------------------------------------------------------------------- 1 | # debug mode 2 | debug = true 3 | 4 | # base configuration directory (typically you won't change this!) 5 | home_dir = "~/.prefect" 6 | 7 | backend = "server" 8 | 9 | [server] 10 | host = "http://172.17.0.1" 11 | port = "4200" 12 | host_port = "4200" 13 | endpoint = "${server.host}:${server.port}" -------------------------------------------------------------------------------- /agent/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | agent: 6 | restart: always 7 | build: . 8 | dns: 8.8.8.8 9 | command: bash -c "prefect agent local start --name $$(uuid) --no-hostname-label --label development" 10 | volumes: 11 | - /srv/docker/prefect/flows:/root/.prefect/flows 12 | - type: bind 13 | source: ./config.toml 14 | target: /root/.prefect/config.toml 15 | read_only: true 16 | -------------------------------------------------------------------------------- /agent/requirements.txt: -------------------------------------------------------------------------------- 1 | prefect==0.14.16 2 | minio==7.0.3 3 | clevercsv==0.6.7 4 | tqdm==4.60.0 5 | elasticsearch==7.12.0 6 | geopy==2.1.0 7 | iso3166==1.0.1 8 | dateparser==1.0.0 9 | GoogleNews==1.5.7 10 | snscrape==0.3.4 11 | pandas==1.2.4 12 | -------------------------------------------------------------------------------- /crawl.docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | crawl_google_news: 6 | build: ./flow 7 | command: python3 /usr/app/crawl_google_news.py 8 | volumes: 9 | - /srv/docker/prefect/flows:/root/.prefect/flows 10 | - "./flow/scripts:/usr/app:ro" 11 | - type: bind 12 | source: ./flow/config.toml 13 | target: /root/.prefect/config.toml 14 | read_only: true 15 | env_file: 16 | - .env 17 | environment: 18 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 19 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 20 | ELASTIC_PORT: ${ELASTIC_PORT} 21 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 22 | ELASTIC_USER: ${ELASTIC_USER} 23 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 24 | 25 | crawl_tweets: 26 | build: ./flow 27 | command: python3 /usr/app/crawl_tweets.py 28 | volumes: 29 | - /srv/docker/prefect/flows:/root/.prefect/flows 30 | - "./flow/scripts:/usr/app:ro" 31 | - type: bind 32 | source: ./flow/config.toml 33 | target: /root/.prefect/config.toml 34 | read_only: true 35 | env_file: 36 | - .env 37 | environment: 38 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 39 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 40 | ELASTIC_PORT: ${ELASTIC_PORT} 41 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 42 | ELASTIC_USER: ${ELASTIC_USER} 43 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 44 | -------------------------------------------------------------------------------- /create-certs.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | 3 | services: 4 | 5 | create_certs: 6 | container_name: create_certs 7 | image: 
docker.elastic.co/elasticsearch/elasticsearch:7.10.0 8 | command: > 9 | bash -c ' 10 | if [[ ! -f /certs/ca.zip ]]; then 11 | # Generating CA certificate 12 | bin/elasticsearch-certutil ca --silent --pem -out /certs/ca.zip; 13 | unzip /certs/ca.zip -d /certs; 14 | fi; 15 | if [[ ! -f /certs/pem.zip ]]; then 16 | # Generating PEM certificates (ElasticSearch nodes and Kibana) 17 | bin/elasticsearch-certutil cert --silent --pem --ca-cert "/certs/ca/ca.crt" --ca-key "/certs/ca/ca.key" --in config/certificates/pem.yml -out /certs/pem.zip; 18 | unzip /certs/pem.zip -d /certs; 19 | fi; 20 | if [[ ! -f /certs/pkcs_12.zip ]]; then 21 | # Generating PKCS#12 certificates (Enterprise Search) 22 | bin/elasticsearch-certutil cert --silent --pass "changeme" --ca-cert "/certs/ca/ca.crt" --ca-key "/certs/ca/ca.key" --in config/certificates/pkcs_12.yml -out /certs/pkcs_12.zip; 23 | unzip /certs/pkcs_12.zip -d /certs; 24 | fi; 25 | ' 26 | working_dir: /usr/share/elasticsearch 27 | volumes: ['./certs:/certs', './instances:/usr/share/elasticsearch/config/certificates'] 28 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | prefect_setup: 5 | build: ./prefect 6 | networks: 7 | - prefect-server 8 | 9 | prefect_postgres: 10 | restart: "always" 11 | image: "postgres:11" 12 | environment: 13 | POSTGRES_USER: ${POSTGRES_USER} 14 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 15 | POSTGRES_DB: ${POSTGRES_DB} 16 | volumes: 17 | - prefect_postgres:/var/lib/postgresql/data 18 | networks: 19 | - prefect-server 20 | healthcheck: 21 | test: pg_isready -q -d $${POSTGRES_DB} -U $${POSTGRES_USER} || exit 1 22 | interval: 10s 23 | timeout: 2s 24 | retries: 60 25 | start_period: 2s 26 | command: 27 | - "postgres" 28 | # explicitly set max connections 29 | - "-c" 30 | - "max_connections=150" 31 | 32 | prefect_hasura: 33 | restart: "always" 34 | image: "hasura/graphql-engine:v1.3.3" 35 | ports: 36 | - "3000:3000" 37 | command: "graphql-engine serve" 38 | environment: 39 | HASURA_GRAPHQL_DATABASE_URL: ${DB_CONNECTION_URL} 40 | HASURA_GRAPHQL_ENABLE_CONSOLE: "true" 41 | HASURA_GRAPHQL_SERVER_PORT: "3000" 42 | HASURA_GRAPHQL_QUERY_PLAN_CACHE_SIZE: 100 43 | HASURA_GRAPHQL_LOG_LEVEL: "warn" 44 | networks: 45 | - prefect-server 46 | healthcheck: 47 | test: wget -O - http://hasura:3000/healthz &>/dev/null || exit 1 48 | interval: 10s 49 | timeout: 2s 50 | retries: 60 51 | start_period: 1s 52 | depends_on: 53 | - prefect_postgres 54 | 55 | prefect_graphql: 56 | restart: "always" 57 | image: "prefecthq/server:latest" 58 | ports: 59 | - "4201:4201" 60 | command: bash -c "${PREFECT_SERVER_DB_CMD} && python src/prefect_server/services/graphql/server.py" 61 | environment: 62 | PREFECT_SERVER_DB_CMD: ${PREFECT_SERVER_DB_CMD:-"echo 'DATABASE MIGRATIONS SKIPPED'"} 63 | PREFECT_SERVER__DATABASE__CONNECTION_URL: ${DB_CONNECTION_URL} 64 | PREFECT_SERVER__HASURA__ADMIN_SECRET: ${PREFECT_SERVER__HASURA__ADMIN_SECRET:-hasura-secret-admin-secret} 65 | PREFECT_SERVER__HASURA__HOST: prefect_hasura 66 | networks: 67 | - prefect-server 68 | healthcheck: 69 | test: curl --fail --silent "http://prefect_graphql:4201/health" &> /dev/null || exit 1 70 | interval: 20s 71 | timeout: 2s 72 | retries: 60 73 | start_period: 1s 74 | depends_on: 75 | - prefect_hasura 76 | 77 | prefect_towel: 78 | restart: "always" 79 | image: "prefecthq/server:latest" 80 | command: "python 
src/prefect_server/services/towel/__main__.py" 81 | environment: 82 | PREFECT_SERVER__HASURA__ADMIN_SECRET: ${PREFECT_SERVER__HASURA__ADMIN_SECRET:-hasura-secret-admin-secret} 83 | PREFECT_SERVER__HASURA__HOST: prefect_hasura 84 | networks: 85 | - prefect-server 86 | depends_on: 87 | - prefect_graphql 88 | 89 | prefect_apollo: 90 | restart: "always" 91 | image: "prefecthq/apollo:latest" 92 | command: bash -c "./post-start.sh && npm run serve" 93 | ports: 94 | - 4200:4200 95 | environment: 96 | HASURA_API_URL: http://prefect_hasura:3000/v1alpha1/graphql 97 | PREFECT_API_URL: http://prefect_graphql:4201/graphql/ 98 | PREFECT_API_HEALTH_URL: http://prefect_graphql:4201/health 99 | PREFECT_SERVER__TELEMETRY__ENABLED: "false" 100 | GRAPHQL_SERVICE_HOST: http://prefect_graphql 101 | GRAPHQL_SERVICE_PORT: 4201 102 | networks: 103 | - prefect-server 104 | healthcheck: 105 | test: curl --fail --silent "http://prefect_apollo:4200/.well-known/apollo/server-health" &> /dev/null || exit 1 106 | interval: 10s 107 | timeout: 2s 108 | retries: 60 109 | start_period: 1s 110 | depends_on: 111 | - prefect_graphql 112 | 113 | prefect_ui: 114 | restart: "always" 115 | image: "prefecthq/ui:2021-02-23" 116 | ports: 117 | - "8081:8080" 118 | command: "/intercept.sh" 119 | environment: 120 | PREFECT_SERVER__APOLLO_URL: http://localhost:4200/graphql 121 | PREFECT_BACKEND: server 122 | networks: 123 | - prefect-server 124 | healthcheck: 125 | test: curl --fail --silent --head "http://prefect_ui:8080/" &> /dev/null || exit 1 126 | interval: 30s 127 | timeout: 5s 128 | retries: 3 129 | depends_on: 130 | - prefect_apollo 131 | 132 | es01: 133 | restart: always 134 | image: docker.elastic.co/elasticsearch/elasticsearch:7.12.0 135 | volumes: 136 | - "es01:/usr/share/elasticsearch/data" 137 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 138 | ports: 139 | - "9200:9200" 140 | environment: 141 | ES_JAVA_OPTS: "-Xmx512m -Xms512m" 142 | ELASTIC_PASSWORD: ${ELASTICSEARCH_PWD} 143 | node.name: es01 144 | cluster.name: es-docker-cluster 145 | discovery.seed_hosts: es02,es03 146 | cluster.initial_master_nodes: es01,es02,es03 147 | network.host: 0.0.0.0 148 | xpack.license.self_generated.type: basic 149 | xpack.monitoring.collection.enabled: "true" 150 | xpack.security.enabled: "true" 151 | xpack.security.http.ssl.enabled: "true" 152 | xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certificates/es01/es01.key 153 | xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 154 | xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certificates/es01/es01.crt 155 | xpack.security.transport.ssl.enabled: "true" 156 | xpack.security.transport.ssl.verification_mode: certificate 157 | xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 158 | xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certificates/es01/es01.crt 159 | xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certificates/es01/es01.key 160 | cluster.routing.allocation.disk.threshold_enabled: "true" 161 | cluster.routing.allocation.disk.watermark.low: 93% 162 | cluster.routing.allocation.disk.watermark.high: 95% 163 | http.cors.enabled : "true" 164 | http.cors.allow-origin : "*" 165 | http.cors.allow-methods : OPTIONS, HEAD, GET, POST, PUT, DELETE 166 | http.cors.allow-headers : Authorization,X-Requested-With,X-Auth-Token,Content-Type, Content-Length 167 | depends_on: 168 | - es02 169 | - es03 
170 | ulimits: 171 | memlock: 172 | soft: 262144 173 | hard: 500000 174 | 175 | es02: 176 | restart: always 177 | image: docker.elastic.co/elasticsearch/elasticsearch:7.12.0 178 | volumes: 179 | - "es02:/usr/share/elasticsearch/data" 180 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 181 | environment: 182 | ES_JAVA_OPTS: "-Xmx512m -Xms512m" 183 | ELASTIC_PASSWORD: ${ELASTICSEARCH_PWD} 184 | node.name: es02 185 | cluster.name: es-docker-cluster 186 | discovery.seed_hosts: es01,es03 187 | cluster.initial_master_nodes: es01,es02,es03 188 | xpack.license.self_generated.type: basic 189 | xpack.monitoring.collection.enabled: "true" 190 | xpack.security.enabled: "true" 191 | xpack.security.http.ssl.enabled: "true" 192 | xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certificates/es02/es02.key 193 | xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 194 | xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certificates/es02/es02.crt 195 | xpack.security.transport.ssl.enabled: "true" 196 | xpack.security.transport.ssl.verification_mode: certificate 197 | xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 198 | xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certificates/es02/es02.crt 199 | xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certificates/es02/es02.key 200 | cluster.routing.allocation.disk.threshold_enabled: "true" 201 | cluster.routing.allocation.disk.watermark.low: 93% 202 | cluster.routing.allocation.disk.watermark.high: 95% 203 | ulimits: 204 | memlock: 205 | soft: 262144 206 | hard: 500000 207 | 208 | es03: 209 | restart: always 210 | image: docker.elastic.co/elasticsearch/elasticsearch:7.12.0 211 | volumes: 212 | - "es03:/usr/share/elasticsearch/data" 213 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 214 | environment: 215 | ES_JAVA_OPTS: "-Xmx512m -Xms512m" 216 | ELASTIC_PASSWORD: ${ELASTICSEARCH_PWD} 217 | node.name: es03 218 | cluster.name: es-docker-cluster 219 | discovery.seed_hosts: es01,es02 220 | cluster.initial_master_nodes: es01,es02,es03 221 | xpack.license.self_generated.type: basic 222 | xpack.monitoring.collection.enabled: "true" 223 | xpack.security.enabled: "true" 224 | xpack.security.http.ssl.enabled: "true" 225 | xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certificates/es03/es03.key 226 | xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 227 | xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certificates/es03/es03.crt 228 | xpack.security.transport.ssl.enabled: "true" 229 | xpack.security.transport.ssl.verification_mode: certificate 230 | xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certificates/ca/ca.crt 231 | xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certificates/es03/es03.crt 232 | xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certificates/es03/es03.key 233 | cluster.routing.allocation.disk.threshold_enabled: "true" 234 | cluster.routing.allocation.disk.watermark.low: 93% 235 | cluster.routing.allocation.disk.watermark.high: 95% 236 | ulimits: 237 | memlock: 238 | soft: 262144 239 | hard: 500000 240 | 241 | kibana: 242 | image: docker.elastic.co/kibana/kibana:7.12.0 243 | restart: always 244 | volumes: 245 | - type: bind 246 | source: ./kibana.yml 247 | target: 
/usr/share/kibana/config/kibana.yml 248 | read_only: true 249 | - "./certs:/usr/share/elasticsearch/config/certificates:ro" 250 | ports: 251 | - "5601:5601" 252 | depends_on: 253 | - es01 254 | 255 | # source : https://docs.min.io/docs/deploy-minio-on-docker-compose.html 256 | minio: 257 | restart: always 258 | image: minio/minio:RELEASE.2021-04-06T23-11-00Z-24-g409125240 259 | command: server /data 260 | ports: 261 | - 9000:9000 262 | volumes: 263 | - minio:/data 264 | environment: 265 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 266 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 267 | 268 | 269 | volumes: 270 | es01: 271 | es02: 272 | es03: 273 | minio: 274 | prefect_postgres: 275 | 276 | networks: 277 | prefect-server: 278 | name: prefect-server 279 | -------------------------------------------------------------------------------- /flow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | RUN apt update 4 | RUN pip install --upgrade pip 5 | 6 | COPY ./requirements.txt /requirements.txt 7 | RUN pip install -r /requirements.txt -------------------------------------------------------------------------------- /flow/config.toml: -------------------------------------------------------------------------------- 1 | # debug mode 2 | debug = true 3 | 4 | # base configuration directory (typically you won't change this!) 5 | home_dir = "~/.prefect" 6 | 7 | backend = "server" 8 | 9 | [server] 10 | host = "http://172.17.0.1" 11 | port = "4200" 12 | host_port = "4200" 13 | endpoint = "${server.host}:${server.port}" 14 | 15 | [s3] 16 | endpoint = "172.17.0.1:9000" 17 | key = "minio" 18 | secret = "minio123" -------------------------------------------------------------------------------- /flow/requirements.txt: -------------------------------------------------------------------------------- 1 | prefect==0.14.16 2 | minio==7.0.3 3 | clevercsv==0.6.7 4 | tqdm==4.60.0 5 | elasticsearch==7.12.0 6 | geopy==2.1.0 7 | iso3166==1.0.1 8 | dateparser==1.0.0 9 | GoogleNews==1.5.7 10 | snscrape==0.3.4 11 | pandas==1.2.4 12 | -------------------------------------------------------------------------------- /flow/scripts/crawl_google_news.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | from typing import Iterable 4 | import uuid 5 | import prefect 6 | from elasticsearch import Elasticsearch, helpers 7 | from prefect import Flow, Task, Client 8 | from datetime import timedelta, datetime 9 | 10 | from prefect.schedules import IntervalSchedule 11 | from GoogleNews import GoogleNews 12 | 13 | from crawl_mapping import mapping 14 | 15 | 16 | project_name = "pandemic-knowledge-crawl-googlenews" 17 | index_name = "news_googlenews" 18 | 19 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 20 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 21 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 22 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 23 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 24 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 25 | 26 | logger = prefect.context.get("logger") 27 | 28 | schedule = IntervalSchedule( 29 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 30 | ) 31 | 32 | 33 | def get_es_instance(): 34 | es_inst = Elasticsearch( 35 | [ELASTIC_ENDPOINT], 36 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 37 | scheme=ELASTIC_SCHEME, 38 | port=ELASTIC_PORT, 39 | verify_certs=False, 40 | ) 41 | return es_inst 42 | 43 | 44 | def 
inject_rows_to_es(rows, index_name): 45 | es_inst = get_es_instance() 46 | 47 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 48 | 49 | actions = [ 50 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 51 | ] 52 | 53 | helpers.bulk(es_inst, actions) 54 | 55 | 56 | def format_new(new: dict, lang: str) -> dict: 57 | """Formatting a single Google News new for elasticsearch injection""" 58 | if len(new): 59 | return { 60 | "title": str(new["title"]), 61 | "desc": str(new["desc"]), 62 | "img": str(new["img"]), 63 | "link": "https://" + str(new["link"]), 64 | "source.crawler": "Google News", 65 | "source.website": str(new["site"]), 66 | "source.url": str(new["link"]), 67 | "date": new["datetime"], 68 | "lang": lang, 69 | } 70 | return None 71 | 72 | 73 | def get_news(googlenews: GoogleNews, lang: str, search_tag: str) -> Iterable: 74 | googlenews.get_news(search_tag) 75 | news = googlenews.results(sort=True) 76 | if news: 77 | for new in news: 78 | fmt_new = format_new(new, lang) 79 | if fmt_new: 80 | yield fmt_new 81 | return [] 82 | 83 | 84 | class GetNews(Task): 85 | def run(self, index_name): 86 | googlenews = GoogleNews( 87 | period="24h", # TODO(): Improve using googlenews.set_time_range('02/01/2020','02/28/2020') 88 | encode="utf-8", 89 | ) 90 | news_to_inject = [] 91 | langs = ["fr", "en"] 92 | search_tags = ["COVID", "CORONA"] 93 | for lang in langs: 94 | for search_tag in search_tags: 95 | logger.info( 96 | f"Crawling GoogleNews for '{lang}' lang and {search_tag} search tag..." 97 | ) 98 | googlenews.set_lang(lang) 99 | try: 100 | news = list(get_news(googlenews, lang, search_tag)) 101 | news_to_inject += news if len(news) else [] 102 | logger.info(f"Found {len(news)} news.") 103 | except Exception as e: 104 | logger.error(e) 105 | googlenews.clear() 106 | if len(news_to_inject) > 0: 107 | inject_rows_to_es(news_to_inject, index_name) 108 | news_to_inject = [] 109 | 110 | 111 | class GenerateEsMapping(Task): 112 | def __init__(self, index_name, **kwargs): 113 | self.index_name = index_name 114 | super().__init__(**kwargs) 115 | 116 | def run(self): 117 | index_name = self.index_name 118 | es_inst = get_es_instance() 119 | 120 | logger.info("Generating mapping for index {}".format(index_name)) 121 | 122 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 123 | 124 | response = es_inst.indices.create(index=index_name, body=mapping, ignore=400) 125 | 126 | if "acknowledged" in response: 127 | if response["acknowledged"] == True: 128 | logger.info( 129 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 130 | ) 131 | elif "error" in response: 132 | logger.error(response["error"]["root_cause"]) 133 | logger.error("Error type: {}".format(response["error"]["type"])) 134 | raise Exception("Unable to create index mapping") 135 | 136 | 137 | with Flow("Crawl news and insert", schedule=schedule) as flow: 138 | flow.set_dependencies( 139 | upstream_tasks=[GenerateEsMapping(index_name)], 140 | task=GetNews(), 141 | keyword_tasks=dict(index_name=index_name), 142 | ) 143 | 144 | if __name__ == "__main__": 145 | try: 146 | client = Client() 147 | client.create_project(project_name=project_name) 148 | except prefect.utilities.exceptions.ClientError as e: 149 | logger.info("Project already exists") 150 | 151 | flow.register( 152 | project_name=project_name, 153 | labels=["development"], 154 | add_default_labels=False, 155 | ) 156 | -------------------------------------------------------------------------------- 
/flow/scripts/crawl_mapping.py: -------------------------------------------------------------------------------- 1 | mapping = { 2 | "mappings": { 3 | "properties": { 4 | "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 5 | "desc": {"type": "text"}, 6 | "date": { 7 | "type": "date", 8 | "format": "strict_date_optional_time||epoch_millis", 9 | }, 10 | "link": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 11 | "img": {"type": "text"}, 12 | "source": { 13 | "properties": { 14 | "crawler": {"type": "text"}, 15 | "website": {"type": "text"}, 16 | "author": {"type": "text"}, 17 | "url": {"type": "text"}, 18 | "tweet": {"properties": {"id": {"type": "text"}}}, 19 | } 20 | }, 21 | "lang": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /flow/scripts/crawl_tweets.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | import uuid 4 | import prefect 5 | from elasticsearch import Elasticsearch, helpers 6 | from prefect import Flow, Task, Client 7 | from datetime import datetime 8 | from datetime import timedelta 9 | 10 | from prefect.schedules import IntervalSchedule 11 | import snscrape.modules.twitter as sntwitter 12 | 13 | from crawl_mapping import mapping 14 | 15 | project_name = "pandemic-knowledge-crawl-tweets" 16 | index_name = "news_tweets" 17 | 18 | lang = "en" 19 | tweet_limit = 1000 20 | 21 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 22 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 23 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 24 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 25 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 26 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 27 | 28 | logger = prefect.context.get("logger") 29 | 30 | schedule = IntervalSchedule( 31 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 32 | ) 33 | 34 | 35 | def get_es_instance(): 36 | es_inst = Elasticsearch( 37 | [ELASTIC_ENDPOINT], 38 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 39 | scheme=ELASTIC_SCHEME, 40 | port=ELASTIC_PORT, 41 | verify_certs=False, 42 | ) 43 | return es_inst 44 | 45 | 46 | def inject_rows_to_es(rows, index_name): 47 | es_inst = get_es_instance() 48 | 49 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 50 | 51 | actions = [ 52 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 53 | ] 54 | 55 | helpers.bulk(es_inst, actions) 56 | 57 | 58 | class GetTweets(Task): 59 | def run(self, index_name): 60 | tweets_from = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") 61 | to_inject = [] 62 | tweets = sntwitter.TwitterSearchScraper( 63 | f"covid since:{tweets_from} lang:{lang}" 64 | ).get_items() 65 | for i, tweet in enumerate(tweets): 66 | if i > tweet_limit: 67 | break 68 | if i % 100 == 0: 69 | inject_rows_to_es(to_inject, index_name) 70 | to_inject = [] 71 | to_inject.append( 72 | { 73 | "title": f"Tweet from {tweet.username} the {tweet.date}", 74 | "desc": tweet.content, 75 | "date": tweet.date, 76 | "link": tweet.url, 77 | "source.crawler": "twitter", 78 | "source.website": "https://twitter.com", 79 | "source.author": tweet.username, 80 | "source.url": tweet.url, 81 | "source.tweet.id": tweet.id, 82 | "lang": lang 83 | } 84 | ) 85 | if len(to_inject): 86 | inject_rows_to_es(to_inject, index_name) 87 | 88 | 89 | class GenerateEsMapping(Task): 90 | def 
__init__(self, index_name, **kwargs): 91 | self.index_name = index_name 92 | super().__init__(**kwargs) 93 | 94 | def run(self): 95 | index_name = self.index_name 96 | es_inst = get_es_instance() 97 | 98 | logger.info("Generating mapping for index {}".format(index_name)) 99 | 100 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 101 | 102 | response = es_inst.indices.create(index=index_name, body=mapping, ignore=400) 103 | 104 | if "acknowledged" in response: 105 | if response["acknowledged"] == True: 106 | logger.info( 107 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 108 | ) 109 | elif "error" in response: 110 | logger.error(response["error"]["root_cause"]) 111 | logger.error("Error type: {}".format(response["error"]["type"])) 112 | raise Exception("Unable to create index mapping") 113 | 114 | 115 | with Flow("Crawl tweets and insert", schedule=schedule) as flow: 116 | flow.set_dependencies( 117 | upstream_tasks=[GenerateEsMapping(index_name)], 118 | task=GetTweets(), 119 | keyword_tasks=dict(index_name=index_name), 120 | ) 121 | 122 | if __name__ == "__main__": 123 | try: 124 | client = Client() 125 | client.create_project(project_name=project_name) 126 | except prefect.utilities.exceptions.ClientError as e: 127 | logger.info("Project already exists") 128 | 129 | flow.register( 130 | project_name=project_name, 131 | labels=["development"], 132 | add_default_labels=False, 133 | ) 134 | -------------------------------------------------------------------------------- /flow/scripts/insert_france.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dateparser 3 | import uuid 4 | import requests 5 | import prefect 6 | import clevercsv 7 | import traceback 8 | from tqdm import tqdm 9 | from prefect import Flow, Task, Client, task 10 | from datetime import timedelta, datetime 11 | from prefect.schedules import IntervalSchedule 12 | from elasticsearch import Elasticsearch, helpers 13 | from geopy.geocoders import Nominatim 14 | from requests.adapters import HTTPAdapter 15 | from requests.packages.urllib3.util.retry import Retry 16 | 17 | from mapping import mapping 18 | 19 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 20 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 21 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 22 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 23 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 24 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 25 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 26 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 27 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 28 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 29 | 30 | csv_endpoint = "https://raw.githubusercontent.com/opencovid19-fr/data/master/dist/chiffres-cles.csv" 31 | index_name = "contamination_opencovid19_fr" 32 | project_name = f"pandemic-knowledge-opencovid19-fr" 33 | flow_name = project_name 34 | 35 | logger = prefect.context.get("logger") 36 | 37 | columns_allowed = { 38 | "date": ["date"], 39 | "location": ["maille_nom"], 40 | "location_name": ["maille_nom"], 41 | "confirmed": ["cas_confirmes"], 42 | "deaths": ["deces"], 43 | "recovered": ["gueris"], 44 | "vaccinated": [], 45 | "tested": ["depistes"], 46 | } 47 | 48 | extra_locations = {"EL": "GR"} 49 | 50 | locations_cache = {"World": None} 51 | 52 | 53 | def get_es_instance(): 54 | es_inst = Elasticsearch( 55 | [ELASTIC_ENDPOINT], 56 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 57 | 
scheme=ELASTIC_SCHEME, 58 | port=ELASTIC_PORT, 59 | verify_certs=False, 60 | ) 61 | return es_inst 62 | 63 | 64 | def format_date(date): 65 | if not date: 66 | return None 67 | try: 68 | return dateparser.parse(date) 69 | except Exception as e: 70 | logger.error(e) 71 | return None 72 | 73 | 74 | def format_location(lookup_table, location_name): 75 | if not location_name: 76 | return None 77 | if location_name in locations_cache: 78 | return locations_cache[location_name] 79 | if location_name in lookup_table: 80 | return lookup_table[location_name] 81 | return None 82 | 83 | 84 | def pick_one_of_elements(haystack: list, needles: list): 85 | for needle in needles: 86 | if needle in haystack: 87 | return needle 88 | return None 89 | 90 | 91 | def pick_nonempty_cell(row, headers, potential_keys): 92 | for potential_key in potential_keys: 93 | if potential_key in headers and row[headers[potential_key]]: 94 | return row[headers[potential_key]] 95 | return None 96 | 97 | 98 | def format_row(lookup_table, row, headers, filename): 99 | date_start = date_end = format_date( 100 | pick_nonempty_cell(row, headers, columns_allowed["date"]) 101 | ) 102 | location = format_location( 103 | lookup_table, pick_nonempty_cell(row, headers, columns_allowed["location"]) 104 | ) 105 | location_name = pick_nonempty_cell(row, headers, columns_allowed["location_name"]) 106 | nb_confirmed = pick_nonempty_cell(row, headers, columns_allowed["confirmed"]) 107 | nb_deaths = pick_nonempty_cell(row, headers, columns_allowed["deaths"]) 108 | nb_recovered = pick_nonempty_cell(row, headers, columns_allowed["recovered"]) 109 | nb_vaccinated = pick_nonempty_cell(row, headers, columns_allowed["vaccinated"]) 110 | nb_tested = pick_nonempty_cell(row, headers, columns_allowed["tested"]) 111 | if date_start != None: 112 | return { 113 | "date_start": date_start, 114 | "date_end": date_end, 115 | "location": location[0] if location else None, 116 | "location_name": location_name, 117 | "confirmed": int(float(nb_confirmed)) if nb_confirmed else 0, 118 | "deaths": int(float(nb_deaths)) if nb_deaths else 0, 119 | "recovered": int(float(nb_recovered)) if nb_recovered else 0, 120 | "vaccinated": int(float(nb_vaccinated)) if nb_vaccinated else 0, 121 | "tested": int(float(nb_tested)) if nb_tested else 0, 122 | "filename": filename, 123 | "iso_code2": location[1] if location else None, 124 | "iso_region2": str(row[2]).replace("DEP", "FR"), 125 | } 126 | logger.warning(f"format_row(): Invalid row : {row}") 127 | return None 128 | 129 | 130 | def inject_rows_to_es(rows, index_name): 131 | es_inst = get_es_instance() 132 | 133 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 134 | 135 | actions = [ 136 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 137 | ] 138 | helpers.bulk(es_inst, actions) 139 | 140 | 141 | def parse_file(lookup_table, file_path): 142 | with open(file_path, "r", newline="") as fp: 143 | char_read = 10000 if os.path.getsize(file_path) > 10000 else None 144 | 145 | try: 146 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 147 | except Exception as e: 148 | logger.error(e) 149 | return [] 150 | 151 | fp.seek(0) 152 | reader = clevercsv.reader(fp, dialect) 153 | headers_list = next(reader) 154 | headers = {} 155 | for i, header in enumerate(headers_list): 156 | headers[header] = i 157 | for row in tqdm(reader, unit="entry"): 158 | if row[1] != "departement": # multiple granularities 159 | continue 160 | yield format_row(lookup_table, row, headers, 
file_path) 161 | return [] 162 | 163 | 164 | def process_file(lookup_table, index_name, file_path): 165 | to_inject = [] 166 | logger.info(f"process_file(): Processing {file_path}...") 167 | for row in parse_file(lookup_table, file_path): 168 | if row is not None: 169 | to_inject.append(row) 170 | if len(to_inject) >= MAX_ES_ROW_INJECT: 171 | inject_rows_to_es(to_inject, index_name) 172 | to_inject = [] 173 | else: 174 | logger.warning("process_file(): Invalid row") 175 | if len(to_inject) > 0: 176 | inject_rows_to_es(to_inject, index_name) 177 | 178 | 179 | class ParseFiles(Task): 180 | def run(self, lookup_table, index_name, http_csv_uris: list): 181 | for file_uri in tqdm(http_csv_uris): 182 | logger.info(f"Processing file {file_uri}...") 183 | file_path = f"/tmp/{uuid.uuid4()}" 184 | session = requests.Session() 185 | retry = Retry(connect=3, backoff_factor=0.5) 186 | adapter = HTTPAdapter(max_retries=retry) 187 | session.mount("http://", adapter) 188 | session.mount("https://", adapter) 189 | r = session.get(file_uri, allow_redirects=True) 190 | with open(file_path, "wb") as f: 191 | f.write(r.content) 192 | process_file(lookup_table, index_name, file_path) 193 | 194 | 195 | class GenerateEsMapping(Task): 196 | def run(self, index_name) -> str: 197 | """ 198 | Returns: 199 | str: index_name 200 | """ 201 | es_inst = get_es_instance() 202 | logger.info("Generating mapping for index {}".format(index_name)) 203 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 204 | response = es_inst.indices.create( 205 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 206 | ) 207 | if "acknowledged" in response: 208 | if response["acknowledged"] == True: 209 | logger.info( 210 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 211 | ) 212 | elif "error" in response: 213 | logger.error(response["error"]["root_cause"]) 214 | logger.error("Error type: {}".format(response["error"]["type"])) 215 | raise Exception("Unable to create index mapping") 216 | return index_name 217 | 218 | 219 | def read_lookup_table(lookup_file_path: str): 220 | logger.info("Loading lookup table...") 221 | lookup = {} 222 | with open(lookup_file_path, "r", newline="") as fp: 223 | char_read = 10000 if os.path.getsize(lookup_file_path) > 10000 else None 224 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 225 | fp.seek(0) 226 | reader = clevercsv.reader(fp, dialect) 227 | next(reader) 228 | for row in tqdm(reader, unit="entry"): 229 | for location in [ 230 | row[6], # Province_State 231 | row[7], # Country_Region 232 | row[10], # Combined_Key 233 | ]: 234 | if location and location not in lookup: 235 | if row[8] and row[9]: # Lat, Long 236 | lookup[location] = ( 237 | {"lat": float(row[8]), "lon": float(row[9])}, 238 | row[1], 239 | ) 240 | logger.info(f"Found {len(lookup)} locations.") 241 | return lookup 242 | 243 | 244 | lookup_table = read_lookup_table("/usr/app/UID_ISO_FIPS_LookUp_Table.csv") 245 | 246 | schedule = IntervalSchedule( 247 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 248 | ) 249 | with Flow(flow_name, schedule=schedule) as flow: 250 | es_mapping_task = GenerateEsMapping() 251 | index_name = es_mapping_task(index_name) 252 | 253 | parse_files_task = ParseFiles() 254 | parse_files_task( 255 | lookup_table=lookup_table, 256 | index_name=index_name, 257 | http_csv_uris=[csv_endpoint], 258 | ) 259 | 260 | if __name__ == "__main__": 261 | 262 | try: 263 | client = Client() 264 | 
client.create_project(project_name=project_name) 265 | except prefect.utilities.exceptions.ClientError as e: 266 | logger.info("Project already exists") 267 | 268 | flow.register( 269 | project_name=project_name, labels=["development"], add_default_labels=False 270 | ) 271 | -------------------------------------------------------------------------------- /flow/scripts/insert_france_virtests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dateparser 3 | import uuid 4 | import requests 5 | import prefect 6 | import clevercsv 7 | from tqdm import tqdm 8 | from prefect import Flow, Task, Client, task 9 | from datetime import timedelta, datetime 10 | from prefect.schedules import IntervalSchedule 11 | from elasticsearch import Elasticsearch, helpers 12 | from requests.adapters import HTTPAdapter 13 | from requests.packages.urllib3.util.retry import Retry 14 | 15 | from mapping import mapping 16 | 17 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 18 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 19 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 20 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 21 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 22 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 23 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 24 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 25 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 26 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 27 | 28 | csv_endpoint = "https://www.data.gouv.fr/en/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675" 29 | project_name = f"pandemic-knowledge-santepublic-tests" 30 | index_name = "contamination_santepublique_vir_tests_fr" 31 | flow_name = project_name 32 | 33 | logger = prefect.context.get("logger") 34 | 35 | columns_allowed = { 36 | "date": ["jour"], 37 | "location": ["dep"], 38 | "location_name": ["dep"], 39 | "confirmed": ["P"], 40 | "deaths": [], 41 | "recovered": [], 42 | "vaccinated": [], 43 | "tested": ["T"], 44 | } 45 | 46 | extra_locations = {"EL": "GR"} 47 | 48 | locations_cache = {"World": None} 49 | 50 | 51 | def get_es_instance(): 52 | es_inst = Elasticsearch( 53 | [ELASTIC_ENDPOINT], 54 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 55 | scheme=ELASTIC_SCHEME, 56 | port=ELASTIC_PORT, 57 | verify_certs=False, 58 | ) 59 | return es_inst 60 | 61 | 62 | def format_date(date): 63 | if not date: 64 | return None 65 | try: 66 | return dateparser.parse(date) 67 | except Exception as e: 68 | logger.error(e) 69 | return None 70 | 71 | 72 | def format_location(lookup_table, location_name): 73 | if not location_name: 74 | return None 75 | if location_name in locations_cache: 76 | return locations_cache[location_name] 77 | if location_name in lookup_table: 78 | return lookup_table[location_name] 79 | return None 80 | 81 | 82 | def pick_one_of_elements(haystack: list, needles: list): 83 | for needle in needles: 84 | if needle in haystack: 85 | return needle 86 | return None 87 | 88 | 89 | def pick_nonempty_cell(row, headers, potential_keys): 90 | for potential_key in potential_keys: 91 | if potential_key in headers and row[headers[potential_key]]: 92 | return row[headers[potential_key]] 93 | return None 94 | 95 | 96 | def format_row(lookup_table, row, headers, filename): 97 | date_start = date_end = format_date( 98 | pick_nonempty_cell(row, headers, columns_allowed["date"]) 99 | ) 100 | location = format_location( 101 | lookup_table, pick_nonempty_cell(row, headers, columns_allowed["location"]) 
102 | ) 103 | location_name = pick_nonempty_cell(row, headers, columns_allowed["location_name"]) 104 | nb_confirmed = pick_nonempty_cell(row, headers, columns_allowed["confirmed"]) 105 | nb_deaths = pick_nonempty_cell(row, headers, columns_allowed["deaths"]) 106 | nb_recovered = pick_nonempty_cell(row, headers, columns_allowed["recovered"]) 107 | nb_vaccinated = pick_nonempty_cell(row, headers, columns_allowed["vaccinated"]) 108 | nb_tested = pick_nonempty_cell(row, headers, columns_allowed["tested"]) 109 | if date_start != None: 110 | return { 111 | "date_start": date_start, 112 | "date_end": date_end, 113 | "location": location[0] if location else None, 114 | "location_name": location_name, 115 | "confirmed": int(float(nb_confirmed)) if nb_confirmed else 0, 116 | "deaths": int(float(nb_deaths)) if nb_deaths else 0, 117 | "recovered": int(float(nb_recovered)) if nb_recovered else 0, 118 | "vaccinated": int(float(nb_vaccinated)) if nb_vaccinated else 0, 119 | "tested": int(float(nb_tested)) if nb_tested else 0, 120 | "filename": filename, 121 | "iso_code2": location[1] if location else None, 122 | "iso_region2": f"FR-{location_name}", 123 | } 124 | logger.warning(f"format_row(): Invalid row : {row}") 125 | return None 126 | 127 | 128 | def inject_rows_to_es(rows, index_name): 129 | es_inst = get_es_instance() 130 | 131 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 132 | 133 | actions = [ 134 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 135 | ] 136 | helpers.bulk(es_inst, actions) 137 | 138 | 139 | def parse_file(lookup_table, file_path): 140 | with open(file_path, "r", newline="") as fp: 141 | char_read = 10000 if os.path.getsize(file_path) > 10000 else None 142 | 143 | try: 144 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 145 | except Exception as e: 146 | logger.error(e) 147 | return [] 148 | 149 | fp.seek(0) 150 | reader = clevercsv.reader(fp, dialect) 151 | headers_list = next(reader) 152 | headers = {} 153 | for i, header in enumerate(headers_list): 154 | headers[header] = i 155 | for row in tqdm(reader, unit="entry"): 156 | yield format_row(lookup_table, row, headers, file_path) 157 | return [] 158 | 159 | 160 | def process_file(lookup_table, index_name, file_path): 161 | to_inject = [] 162 | logger.info(f"process_file(): Processing {file_path}...") 163 | for row in parse_file(lookup_table, file_path): 164 | if row is not None: 165 | to_inject.append(row) 166 | if len(to_inject) >= MAX_ES_ROW_INJECT: 167 | inject_rows_to_es(to_inject, index_name) 168 | to_inject = [] 169 | else: 170 | logger.warning("process_file(): Invalid row") 171 | if len(to_inject) > 0: 172 | inject_rows_to_es(to_inject, index_name) 173 | 174 | 175 | class ParseFiles(Task): 176 | def run(self, lookup_table, index_name, http_csv_uris: list): 177 | for file_uri in tqdm(http_csv_uris): 178 | logger.info(f"Processing file {file_uri}...") 179 | file_path = f"/tmp/{uuid.uuid4()}" 180 | session = requests.Session() 181 | retry = Retry(connect=3, backoff_factor=0.5) 182 | adapter = HTTPAdapter(max_retries=retry) 183 | session.mount("http://", adapter) 184 | session.mount("https://", adapter) 185 | r = session.get(file_uri, allow_redirects=True) 186 | with open(file_path, "wb") as f: 187 | f.write(r.content) 188 | process_file(lookup_table, index_name, file_path) 189 | 190 | 191 | class GenerateEsMapping(Task): 192 | def run(self, index_name) -> str: 193 | """ 194 | Returns: 195 | str: index_name 196 | """ 197 | es_inst = 
get_es_instance() 198 | logger.info("Generating mapping for index {}".format(index_name)) 199 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 200 | response = es_inst.indices.create( 201 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 202 | ) 203 | if "acknowledged" in response: 204 | if response["acknowledged"] == True: 205 | logger.info( 206 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 207 | ) 208 | elif "error" in response: 209 | logger.error(response["error"]["root_cause"]) 210 | logger.error("Error type: {}".format(response["error"]["type"])) 211 | raise Exception("Unable to create index mapping") 212 | return index_name 213 | 214 | 215 | def read_lookup_table(lookup_file_path: str): 216 | logger.info("Loading lookup table...") 217 | lookup = {} 218 | with open(lookup_file_path, "r", newline="") as fp: 219 | char_read = 10000 if os.path.getsize(lookup_file_path) > 10000 else None 220 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 221 | fp.seek(0) 222 | reader = clevercsv.reader(fp, dialect) 223 | next(reader) 224 | for row in tqdm(reader, unit="entry"): 225 | for location in [ 226 | row[6], # Province_State 227 | row[7], # Country_Region 228 | row[10], # Combined_Key 229 | ]: 230 | if location and location not in lookup: 231 | if row[8] and row[9]: # Lat, Long 232 | lookup[location] = ( 233 | {"lat": float(row[8]), "lon": float(row[9])}, 234 | row[1], 235 | ) 236 | logger.info(f"Found {len(lookup)} locations.") 237 | return lookup 238 | 239 | 240 | lookup_table = read_lookup_table("/usr/app/UID_ISO_FIPS_LookUp_Table.csv") 241 | 242 | schedule = IntervalSchedule( 243 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 244 | ) 245 | with Flow(flow_name, schedule=schedule) as flow: 246 | es_mapping_task = GenerateEsMapping() 247 | index_name = es_mapping_task(index_name) 248 | 249 | parse_files_task = ParseFiles() 250 | parse_files_task( 251 | lookup_table=lookup_table, 252 | index_name=index_name, 253 | http_csv_uris=[csv_endpoint], 254 | ) 255 | 256 | if __name__ == "__main__": 257 | 258 | try: 259 | client = Client() 260 | client.create_project(project_name=project_name) 261 | except prefect.utilities.exceptions.ClientError as e: 262 | logger.info("Project already exists") 263 | 264 | flow.register( 265 | project_name=project_name, labels=["development"], add_default_labels=False 266 | ) 267 | -------------------------------------------------------------------------------- /flow/scripts/insert_owid.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dateparser 3 | import uuid 4 | import prefect 5 | import clevercsv 6 | import traceback 7 | from tqdm import tqdm 8 | from prefect import Flow, Task, Client, task 9 | from datetime import timedelta, datetime 10 | from prefect.schedules import IntervalSchedule 11 | from minio import Minio 12 | from elasticsearch import Elasticsearch, helpers 13 | from geopy.geocoders import Nominatim 14 | 15 | from mapping import mapping 16 | 17 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 18 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 19 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 20 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 21 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 22 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 23 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 24 | ELASTIC_USER = 
os.environ.get("ELASTIC_USER") 25 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 26 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 27 | 28 | bucket_name = "contamination-owid" 29 | project_name = f"pandemic-knowledge-{bucket_name}" 30 | index_name = f"{bucket_name.replace('-', '_')}" 31 | flow_name = project_name 32 | 33 | logger = prefect.context.get("logger") 34 | 35 | columns_allowed = { 36 | "date": ["date"], 37 | "location": ["location"], 38 | "location_name": ["location"], 39 | "confirmed": ["new_cases"], 40 | "deaths": ["new_deaths"], 41 | "recovered": [], 42 | "vaccinated": ["new_vaccinations"], 43 | "tested": ["new_tests"], 44 | } 45 | 46 | extra_locations = {"EL": "GR"} 47 | 48 | locations_cache = {"World": None} 49 | 50 | 51 | def get_es_instance(): 52 | es_inst = Elasticsearch( 53 | [ELASTIC_ENDPOINT], 54 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 55 | scheme=ELASTIC_SCHEME, 56 | port=ELASTIC_PORT, 57 | verify_certs=False, 58 | ) 59 | return es_inst 60 | 61 | 62 | def format_date(date): 63 | if not date: 64 | return None 65 | try: 66 | return dateparser.parse(date) 67 | except Exception as e: 68 | logger.error(e) 69 | return None 70 | 71 | 72 | def format_location(lookup_table, location_name): 73 | if not location_name: 74 | return None 75 | if location_name in locations_cache: 76 | return locations_cache[location_name] 77 | if location_name in lookup_table: 78 | return lookup_table[location_name] 79 | 80 | logger.info(f"Guessing geolocation for {location_name}") 81 | geolocator = Nominatim(user_agent="pandemic-knowledge") 82 | location = geolocator.geocode( 83 | extra_locations[location_name] 84 | if location_name in extra_locations 85 | else location_name, 86 | addressdetails=True, 87 | ) 88 | 89 | if location and location.raw: 90 | logger.info(f"Found {location.latitude}, {location.longitude}") 91 | if "address" in location.raw and "country_code" in location.raw["address"]: 92 | locations_cache[location_name] = ( 93 | {"lat": location.latitude, "lon": location.longitude}, 94 | location.raw["address"]["country_code"].upper(), 95 | ) 96 | return locations_cache[location_name] 97 | locations_cache[location_name] = None 98 | logger.error( 99 | f"Failed to locate (no country code and/or coordinates) for {location}" 100 | ) 101 | return None 102 | 103 | 104 | def pick_one_of_elements(haystack: list, needles: list): 105 | for needle in needles: 106 | if needle in haystack: 107 | return needle 108 | return None 109 | 110 | 111 | def pick_nonempty_cell(row, headers, potential_keys): 112 | for potential_key in potential_keys: 113 | if potential_key in headers and row[headers[potential_key]]: 114 | return row[headers[potential_key]] 115 | return None 116 | 117 | 118 | def format_row(lookup_table, row, headers, filename): 119 | date_start = date_end = format_date( 120 | pick_nonempty_cell(row, headers, columns_allowed["date"]) 121 | ) 122 | location = format_location( 123 | lookup_table, pick_nonempty_cell(row, headers, columns_allowed["location"]) 124 | ) 125 | location_name = pick_nonempty_cell(row, headers, columns_allowed["location_name"]) 126 | nb_confirmed = pick_nonempty_cell(row, headers, columns_allowed["confirmed"]) 127 | nb_deaths = pick_nonempty_cell(row, headers, columns_allowed["deaths"]) 128 | nb_recovered = pick_nonempty_cell(row, headers, columns_allowed["recovered"]) 129 | nb_vaccinated = pick_nonempty_cell(row, headers, columns_allowed["vaccinated"]) 130 | nb_tested = pick_nonempty_cell(row, headers, columns_allowed["tested"]) 131 | if location != None and 
date_start != None and nb_confirmed != None: 132 | return { 133 | "date_start": date_start, 134 | "date_end": date_end, 135 | "location": location[0], 136 | "location_name": location_name, 137 | "confirmed": int(float(nb_confirmed)) if nb_confirmed else 0, 138 | "deaths": int(float(nb_deaths)) if nb_deaths else 0, 139 | "recovered": int(float(nb_recovered)) if nb_recovered else 0, 140 | "vaccinated": int(float(nb_vaccinated)) if nb_vaccinated else 0, 141 | "tested": int(float(nb_tested)) if nb_tested else 0, 142 | "filename": filename, 143 | "iso_code2": location[1] if len(location) else None, 144 | } 145 | return None 146 | 147 | 148 | def inject_rows_to_es(rows, index_name): 149 | es_inst = get_es_instance() 150 | 151 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 152 | 153 | actions = [ 154 | {"_index": index_name, "_id": uuid.uuid4(), "_source": row} for row in rows 155 | ] 156 | helpers.bulk(es_inst, actions) 157 | 158 | 159 | def parse_file(lookup_table, minio_client, bucket_name, object_name): 160 | csv_file_path = "/tmp/" + str(uuid.uuid4()) 161 | minio_client.fget_object(bucket_name, object_name, csv_file_path) 162 | with open(csv_file_path, "r", newline="") as fp: 163 | char_read = 10000 if os.path.getsize(csv_file_path) > 10000 else None 164 | 165 | try: 166 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 167 | except Exception as e: 168 | logger.error(e) 169 | return [] 170 | 171 | fp.seek(0) 172 | reader = clevercsv.reader(fp, dialect) 173 | headers_list = next(reader) 174 | headers = {} 175 | for i, header in enumerate(headers_list): 176 | headers[header] = i 177 | for row in tqdm(reader, unit="entry"): 178 | yield format_row(lookup_table, row, headers, object_name) 179 | return [] 180 | 181 | 182 | def process_file(lookup_table, index_name, bucket_name, object_name): 183 | minio_client = Minio( 184 | MINIO_ENDPOINT, 185 | access_key=MINIO_ACCESS_KEY, 186 | secret_key=MINIO_SECRET_KEY, 187 | secure=MINIO_SCHEME == "https", 188 | ) 189 | to_inject = [] 190 | logger.info(f"Processing {object_name}...") 191 | for row in parse_file(lookup_table, minio_client, bucket_name, object_name): 192 | if row is not None: 193 | to_inject.append(row) 194 | if len(to_inject) >= MAX_ES_ROW_INJECT: 195 | inject_rows_to_es(to_inject, index_name) 196 | to_inject = [] 197 | else: 198 | logger.info("Invalid row") 199 | if len(to_inject) > 0: 200 | inject_rows_to_es(to_inject, index_name) 201 | 202 | 203 | def get_files(bucket_name): 204 | minio_client = Minio( 205 | MINIO_ENDPOINT, 206 | access_key=MINIO_ACCESS_KEY, 207 | secret_key=MINIO_SECRET_KEY, 208 | secure=MINIO_SCHEME == "https", 209 | ) 210 | logger.info("Parse file for bucket {}".format(bucket_name)) 211 | if not minio_client.bucket_exists(bucket_name): 212 | logger.error("Bucket {} does not exists".format(bucket_name)) 213 | return 214 | return list(minio_client.list_objects(bucket_name)) 215 | 216 | 217 | class ParseFiles(Task): 218 | def run(self, lookup_table, index_name): 219 | logger.info(lookup_table) 220 | for file in tqdm(get_files(bucket_name=bucket_name)): 221 | object_name = file.object_name 222 | try: 223 | logger.info(f"Processing file {object_name}...") 224 | process_file(lookup_table, index_name, bucket_name, object_name) 225 | except Exception as e: 226 | logger.error(traceback.format_exc()) 227 | logger.error(e) 228 | logger.error(f"Can't process file {object_name}") 229 | 230 | 231 | class GenerateEsMapping(Task): 232 | def run(self, index_name) -> str: 233 | """ 234 | 
Returns: 235 | str: index_name 236 | """ 237 | es_inst = get_es_instance() 238 | logger.info("Generating mapping for index {}".format(index_name)) 239 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 240 | response = es_inst.indices.create( 241 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 242 | ) 243 | if "acknowledged" in response: 244 | if response["acknowledged"] == True: 245 | logger.info( 246 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 247 | ) 248 | elif "error" in response: 249 | logger.error(response["error"]["root_cause"]) 250 | logger.error("Error type: {}".format(response["error"]["type"])) 251 | raise Exception("Unable to create index mapping") 252 | return index_name 253 | 254 | 255 | def read_lookup_table(lookup_file_path: str): 256 | logger.info("Loading lookup table...") 257 | lookup = {} 258 | with open(lookup_file_path, "r", newline="") as fp: 259 | char_read = 10000 if os.path.getsize(lookup_file_path) > 10000 else None 260 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 261 | fp.seek(0) 262 | reader = clevercsv.reader(fp, dialect) 263 | next(reader) 264 | for row in tqdm(reader, unit="entry"): 265 | for location in [ 266 | row[6], # Province_State 267 | row[7], # Country_Region 268 | row[10], # Combined_Key 269 | ]: 270 | if location and location not in lookup: 271 | if row[8] and row[9]: # Lat, Long 272 | lookup[location] = ( 273 | {"lat": float(row[8]), "lon": float(row[9])}, 274 | row[1], 275 | ) 276 | logger.info(f"Found {len(lookup)} locations.") 277 | return lookup 278 | 279 | 280 | lookup_table = read_lookup_table("/usr/app/UID_ISO_FIPS_LookUp_Table.csv") 281 | 282 | schedule = IntervalSchedule( 283 | start_date=datetime.utcnow() + timedelta(seconds=1), interval=timedelta(hours=24) 284 | ) 285 | with Flow(flow_name, schedule=schedule) as flow: 286 | es_mapping_task = GenerateEsMapping() 287 | index_name = es_mapping_task(index_name) 288 | 289 | parse_files_task = ParseFiles() 290 | parse_files_task(lookup_table=lookup_table, index_name=index_name) 291 | 292 | if __name__ == "__main__": 293 | 294 | try: 295 | client = Client() 296 | client.create_project(project_name=project_name) 297 | except prefect.utilities.exceptions.ClientError as e: 298 | logger.info("Project already exists") 299 | 300 | flow.register( 301 | project_name=project_name, labels=["development"], add_default_labels=False 302 | ) 303 | -------------------------------------------------------------------------------- /flow/scripts/mapping.py: -------------------------------------------------------------------------------- 1 | mapping = { 2 | "mappings": { 3 | "properties": { 4 | "date_start": { 5 | "type": "date", 6 | "format": "strict_date_optional_time||epoch_millis", 7 | }, 8 | "date_end": { 9 | "type": "date", 10 | "format": "strict_date_optional_time||epoch_millis", 11 | }, 12 | "location": {"type": "geo_point"}, 13 | "location_name": { 14 | "type": "text", 15 | "fields": {"keyword": {"type": "keyword"}}, 16 | }, 17 | "confirmed": {"type": "long"}, 18 | "deaths": {"type": "long"}, 19 | "vaccinated": {"type": "long"}, 20 | "tested": {"type": "long"}, 21 | "recovered": {"type": "long"}, 22 | "filename": {"type": "text"}, 23 | "iso_code2": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 24 | "iso_region2": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, 25 | "max_population": {"type": "long"}, 26 | "percentage": {"type": "float"}, 27 | } 28 | } 29 | } 
-------------------------------------------------------------------------------- /flow/scripts/parse_insert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import uuid 4 | import prefect 5 | import clevercsv 6 | from tqdm import tqdm 7 | from datetime import datetime, timedelta 8 | from prefect import Flow, Task, Client 9 | from minio import Minio 10 | from elasticsearch import Elasticsearch, helpers 11 | from ssl import create_default_context 12 | from geopy.geocoders import Nominatim 13 | from iso3166 import countries 14 | from prefect.schedules import IntervalSchedule 15 | 16 | from mapping import mapping 17 | 18 | MINIO_SCHEME = os.environ.get("MINIO_SCHEME") 19 | MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT") 20 | MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY") 21 | MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY") 22 | MAX_ES_ROW_INJECT = int(os.environ.get("MAX_ES_ROW_INJECT", 1000)) 23 | ELASTIC_SCHEME = os.environ.get("ELASTIC_SCHEME") 24 | ELASTIC_PORT = os.environ.get("ELASTIC_PORT") 25 | ELASTIC_USER = os.environ.get("ELASTIC_USER") 26 | ELASTIC_PWD = os.environ.get("ELASTIC_PWD") 27 | ELASTIC_ENDPOINT = os.environ.get("ELASTIC_ENDPOINT") 28 | 29 | columns_allowed = { 30 | "date": ["YearWeekISO", "dateRep", "date"], 31 | "location": ["ReportingCountry", "location", "countriesAndTerritories"], 32 | "cases": ["NumberDosesReceived", "new_vaccinations", "cases", "new_cases"], 33 | "population": ["population"], 34 | } 35 | 36 | logger = prefect.context.get("logger") 37 | 38 | extra_locations = {"EL": "GR"} 39 | 40 | locations_cache = {"World": None} 41 | 42 | 43 | def get_es_instance(): 44 | es_inst = Elasticsearch( 45 | [ELASTIC_ENDPOINT], 46 | http_auth=(ELASTIC_USER, ELASTIC_PWD), 47 | scheme=ELASTIC_SCHEME, 48 | port=ELASTIC_PORT, 49 | verify_certs=False, 50 | ) 51 | return es_inst 52 | 53 | 54 | def format_date(date): 55 | date = date.replace("/", "-") 56 | p = re.compile("(\\d{4})-W(\\d{2})") 57 | weekMatches = p.match(date) 58 | if weekMatches is not None: 59 | groups = weekMatches.groups() 60 | date_start = datetime.strptime( 61 | f"{groups[0]}-W{int(groups[1]) - 1}-1", "%Y-W%W-%w" 62 | ).date() 63 | date_end = date_start + timedelta(days=6.9) 64 | return date_start.strftime("%Y-%m-%d"), date_end.strftime("%Y-%m-%d") 65 | p = re.compile("(\\d{2})-(\\d{2})-(\\d{4})") 66 | frDateMatches = p.match(date) 67 | if frDateMatches is not None: 68 | groups = frDateMatches.groups() 69 | date = f"{groups[2]}-{groups[1]}-{groups[0]}" 70 | return date, date 71 | p = re.compile("(\\d{4})-(\\d{2})-(\\d{2})") 72 | dateMatches = p.match(date) 73 | if dateMatches is not None: 74 | return date, date 75 | return None, None 76 | 77 | 78 | def format_location(location_name): 79 | if location_name in locations_cache: 80 | return locations_cache[location_name] 81 | geolocator = Nominatim(user_agent="pandemic-knowledge") 82 | location = geolocator.geocode( 83 | extra_locations[location_name] 84 | if location_name in extra_locations 85 | else location_name, 86 | addressdetails=True, 87 | ) 88 | 89 | if location is None or "country_code" not in location.raw["address"]: 90 | logger.info(location_name) 91 | locations_cache[location_name] = None 92 | return None 93 | 94 | iso2 = location.raw["address"]["country_code"].upper() 95 | 96 | iso3 = countries.get(iso2).alpha3 97 | 98 | locations_cache[location_name] = ( 99 | {"lat": location.latitude, "lon": location.longitude}, 100 | iso2, 101 | ) 102 | 103 | return 
locations_cache[location_name] 104 | 105 | 106 | def format_row(row, columns_indexes, filename, bucket_name): 107 | date_start, date_end = format_date(row[columns_indexes["date"]]) 108 | location = format_location(row[columns_indexes["location"]]) 109 | if location is None: 110 | return None 111 | max_population = ( 112 | int(float(row[columns_indexes["population"]])) 113 | if row[columns_indexes["population"]] != "" 114 | else 0 115 | ) 116 | cases = ( 117 | int(float(row[columns_indexes["cases"]])) 118 | if row[columns_indexes["cases"]] != "" 119 | else 0 120 | ) 121 | percentage = ( 122 | float(cases) / float(max_population) * 100 if max_population != 0 else None 123 | ) 124 | 125 | formatted = { 126 | "date_start": date_start, 127 | "date_end": date_end, 128 | "location": location[0], 129 | "filename": filename, 130 | "iso_code2": location[1], 131 | "max_population": max_population, 132 | "percentage": percentage, 133 | } 134 | 135 | formatted["vaccinated" if bucket_name == "vaccination" else "confirmed"] = cases 136 | 137 | return formatted 138 | 139 | 140 | def inject_rows_to_es(rows, bucket_name): 141 | es_inst = get_es_instance() 142 | 143 | logger.info("Injecting {} rows in Elasticsearch".format(len(rows))) 144 | 145 | actions = [ 146 | {"_index": bucket_name, "_id": uuid.uuid4(), "_source": row} for row in rows 147 | ] 148 | 149 | helpers.bulk(es_inst, actions) 150 | 151 | 152 | def parse_file(minio_client, obj): 153 | csv_file_path = "/tmp/" + str(uuid.uuid4()) 154 | minio_client.fget_object(obj.bucket_name, obj.object_name, csv_file_path) 155 | with open(csv_file_path, "r", newline="") as fp: 156 | char_read = 100000 if os.path.getsize(csv_file_path) > 100000 else None 157 | 158 | try: 159 | dialect = clevercsv.Sniffer().sniff(fp.read(char_read), verbose=True) 160 | except Exception as e: 161 | logger.error(e) 162 | return [] 163 | 164 | fp.seek(0) 165 | reader = clevercsv.reader(fp, dialect) 166 | headers = next(reader) 167 | columns_indexes = {} 168 | malformed_csv = False 169 | for name in columns_allowed: 170 | for header in headers: 171 | index = ( 172 | headers.index(header) if header in columns_allowed[name] else None 173 | ) 174 | if index is None: 175 | continue 176 | columns_indexes[name] = index 177 | break 178 | if name not in columns_indexes: 179 | logger.error( 180 | "Header {} cannot be found in csv {}".format(name, obj.object_name) 181 | ) 182 | malformed_csv = True 183 | continue 184 | if malformed_csv is True: 185 | return [] 186 | for row in tqdm(reader, unit="entry"): 187 | row = format_row(row, columns_indexes, obj.object_name, obj.bucket_name) 188 | if row is not None: 189 | yield row 190 | return [] 191 | 192 | 193 | class ParseFiles(Task): 194 | def run(self, bucket_name): 195 | minio_client = Minio( 196 | MINIO_ENDPOINT, 197 | access_key=MINIO_ACCESS_KEY, 198 | secret_key=MINIO_SECRET_KEY, 199 | secure=MINIO_SCHEME == "https", 200 | ) 201 | logger.info("Parse file for bucket {}".format(bucket_name)) 202 | if not minio_client.bucket_exists(bucket_name): 203 | logger.error("Bucket {} does not exists".format(bucket_name)) 204 | return 205 | objects = minio_client.list_objects(bucket_name) 206 | for obj in objects: 207 | to_inject = [] 208 | for row in parse_file(minio_client, obj): 209 | to_inject.append(row) 210 | if len(to_inject) >= MAX_ES_ROW_INJECT: 211 | inject_rows_to_es(to_inject, bucket_name) 212 | to_inject = [] 213 | if len(to_inject) > 0: 214 | inject_rows_to_es(to_inject, bucket_name) 215 | 216 | 217 | class GenerateEsMapping(Task): 218 | def 
__init__(self, index_name, **kwargs): 219 | self.index_name = index_name 220 | super().__init__(**kwargs) 221 | 222 | def run(self): 223 | index_name = self.index_name 224 | es_inst = get_es_instance() 225 | 226 | logger.info("Generating mapping for index {}".format(index_name)) 227 | 228 | es_inst.indices.delete(index=index_name, ignore=[400, 404]) 229 | 230 | response = es_inst.indices.create( 231 | index=index_name, body=mapping, ignore=400 # ignore 400 already exists code 232 | ) 233 | 234 | if "acknowledged" in response: 235 | if response["acknowledged"] == True: 236 | logger.info( 237 | "INDEX MAPPING SUCCESS FOR INDEX: {}".format(response["index"]) 238 | ) 239 | elif "error" in response: 240 | logger.error(response["error"]["root_cause"]) 241 | logger.error("Error type: {}".format(response["error"]["type"])) 242 | raise Exception("Unable to create index mapping") 243 | 244 | 245 | schedule = IntervalSchedule( 246 | interval=timedelta(hours=24), start_date=datetime.utcnow() + timedelta(seconds=1) 247 | ) 248 | 249 | with Flow("Parse and insert csv files", schedule) as flow: 250 | for bucket in ["vaccination", "contamination"]: 251 | flow.set_dependencies( 252 | task=ParseFiles(), 253 | upstream_tasks=[GenerateEsMapping(bucket)], 254 | keyword_tasks=dict(bucket_name=bucket), 255 | ) 256 | 257 | try: 258 | client = Client() 259 | client.create_project(project_name="pandemic-knowledge") 260 | except prefect.utilities.exceptions.ClientError as e: 261 | logger.info("Project already exists") 262 | 263 | flow.register(project_name="pandemic-knowledge", labels=["development"]) -------------------------------------------------------------------------------- /illustrations/france_live_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/france_live_status.png -------------------------------------------------------------------------------- /illustrations/latest_news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/latest_news.png -------------------------------------------------------------------------------- /illustrations/live_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/live_dashboard.png -------------------------------------------------------------------------------- /illustrations/news_web_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/news_web_app.png -------------------------------------------------------------------------------- /illustrations/vaccination_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/illustrations/vaccination_map.png -------------------------------------------------------------------------------- /insert.docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | 5 | insert_france_virtests: 6 | build: ./flow 
7 | command: python3 /usr/app/insert_france_virtests.py 8 | volumes: 9 | - /srv/docker/prefect/flows:/root/.prefect/flows 10 | - "./flow/scripts:/usr/app:ro" 11 | - type: bind 12 | source: ./flow/config.toml 13 | target: /root/.prefect/config.toml 14 | read_only: true 15 | env_file: 16 | - .env 17 | environment: 18 | MINIO_SCHEME: ${MINIO_SCHEME} 19 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 20 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 21 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 22 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 23 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 24 | ELASTIC_PORT: ${ELASTIC_PORT} 25 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 26 | ELASTIC_USER: ${ELASTIC_USER} 27 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 28 | 29 | insert_france: 30 | build: ./flow 31 | command: python3 /usr/app/insert_france.py 32 | volumes: 33 | - /srv/docker/prefect/flows:/root/.prefect/flows 34 | - "./flow/scripts:/usr/app:ro" 35 | - type: bind 36 | source: ./flow/config.toml 37 | target: /root/.prefect/config.toml 38 | read_only: true 39 | env_file: 40 | - .env 41 | environment: 42 | MINIO_SCHEME: ${MINIO_SCHEME} 43 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 44 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 45 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 46 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 47 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 48 | ELASTIC_PORT: ${ELASTIC_PORT} 49 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 50 | ELASTIC_USER: ${ELASTIC_USER} 51 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 52 | 53 | insert_owid: 54 | build: ./flow 55 | command: python3 /usr/app/insert_owid.py 56 | volumes: 57 | - /srv/docker/prefect/flows:/root/.prefect/flows 58 | - "./flow/scripts:/usr/app:ro" 59 | - type: bind 60 | source: ./flow/config.toml 61 | target: /root/.prefect/config.toml 62 | read_only: true 63 | env_file: 64 | - .env 65 | environment: 66 | MINIO_SCHEME: ${MINIO_SCHEME} 67 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 68 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 69 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 70 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 71 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 72 | ELASTIC_PORT: ${ELASTIC_PORT} 73 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 74 | ELASTIC_USER: ${ELASTIC_USER} 75 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 76 | 77 | parse_insert: 78 | build: ./flow 79 | command: python3 /usr/app/parse_insert.py 80 | volumes: 81 | - /srv/docker/prefect/flows:/root/.prefect/flows 82 | - "./flow/scripts:/usr/app:ro" 83 | - type: bind 84 | source: ./flow/config.toml 85 | target: /root/.prefect/config.toml 86 | read_only: true 87 | env_file: 88 | - .env 89 | environment: 90 | MINIO_SCHEME: ${MINIO_SCHEME} 91 | MINIO_ENDPOINT: ${MINIO_ENDPOINT} 92 | MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY} 93 | MINIO_SECRET_KEY: ${MINIO_SECRET_KEY} 94 | MAX_ES_ROW_INJECT: ${MAX_ES_ROW_INJECT} 95 | ELASTIC_SCHEME: ${ELASTIC_SCHEME} 96 | ELASTIC_PORT: ${ELASTIC_PORT} 97 | ELASTIC_ENDPOINT: ${ELASTIC_ENDPOINT} 98 | ELASTIC_USER: ${ELASTIC_USER} 99 | ELASTIC_PWD: ${ELASTICSEARCH_PWD} 100 | -------------------------------------------------------------------------------- /instances/pem.yml: -------------------------------------------------------------------------------- 1 | instances: 2 | - name: es01 3 | dns: 4 | - es01 5 | - localhost 6 | ip: 7 | - 127.0.0.1 8 | 9 | - name: es02 10 | dns: 11 | - es02 12 | - localhost 13 | ip: 14 | - 127.0.0.1 15 | 16 | - name: es03 17 | dns: 18 | - es03 19 | - localhost 20 | ip: 21 | - 127.0.0.1 22 | 23 | - name: kibana 24 | dns: 25 | - kibana 26 | - localhost 27 | ip: 28 | - 127.0.0.1 29 | 
-------------------------------------------------------------------------------- /instances/pkcs_12.yml: -------------------------------------------------------------------------------- 1 | # For the moment, Enterprise Search only accepts PKCS#12 keystore 2 | # so we are forced to specifically create a certutil file for it. 3 | 4 | instances: 5 | 6 | - name: enterprise_search 7 | dns: 8 | - enterprise_search 9 | - localhost 10 | ip: 11 | - 127.0.0.1 12 | -------------------------------------------------------------------------------- /kibana.yml: -------------------------------------------------------------------------------- 1 | server.name: kibana 2 | server.host: "0.0.0.0" 3 | elasticsearch.hosts: [ "https://es01:9200" ] 4 | telemetry.enabled: true 5 | 6 | xpack.monitoring.ui.container.elasticsearch.enabled: "true" 7 | elasticsearch.username: elastic 8 | elasticsearch.password: elastic 9 | 10 | # Encrypt traffic between the browser and Kibana 11 | server.ssl.enabled: "true" 12 | server.ssl.certificate: "/usr/share/elasticsearch/config/certificates/kibana/kibana.crt" 13 | server.ssl.key: "/usr/share/elasticsearch/config/certificates/kibana/kibana.key" 14 | 15 | # Encrypt traffic between Kibana and Elasticsearch 16 | elasticsearch.ssl.certificateAuthorities: ["/usr/share/elasticsearch/config/certificates/ca/ca.crt"] 17 | 18 | # Enterprise Search 19 | enterpriseSearch.host: 'http://enterprise_search:3002' 20 | -------------------------------------------------------------------------------- /news_app/app/.gitignore: -------------------------------------------------------------------------------- 1 | app/node_modules/ -------------------------------------------------------------------------------- /news_app/app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:12-alpine 2 | 3 | WORKDIR '/app' 4 | 5 | COPY entrypoint.sh /entrypoint.sh 6 | ENTRYPOINT [ "/entrypoint.sh" ] -------------------------------------------------------------------------------- /news_app/app/app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pandemic-knowledge", 3 | "version": "1.0.0", 4 | "homepage": ".", 5 | "dependencies": { 6 | "@trendmicro/react-sidenav": "0.5.0", 7 | "bootstrap": "^4.3.1", 8 | "json-loader": "^0.5.7", 9 | "react": "^16.11.0", 10 | "react-bootstrap": "^1.0.0-beta.14", 11 | "react-bootstrap-icons": "1.0.1-alpha3", 12 | "react-click-outsider": "^1.1.1", 13 | "react-cookie": "4.0.3", 14 | "react-dom": "^16.11.0", 15 | "react-highlight-words": "^0.17.0", 16 | "react-loader-spinner": "3.1.14", 17 | "react-notifications": "1.6.0", 18 | "react-router-dom": "^5.1.2", 19 | "react-scripts": "3.2.0", 20 | "searchkit": "^2.4.4", 21 | "styled-components": "^4.4.0" 22 | }, 23 | "scripts": { 24 | "start": "/app/node_modules/react-scripts/bin/react-scripts.js start", 25 | "build": "/app/node_modules/react-scripts/bin/react-scripts.js build", 26 | "test": "/app/node_modules/react-scripts/bin/react-scripts.js test", 27 | "eject": "/app/node_modules/react-scripts/bin/react-scripts.js eject" 28 | }, 29 | "eslintConfig": { 30 | "extends": "react-app" 31 | }, 32 | "browserslist": { 33 | "production": [ 34 | ">0.2%", 35 | "not dead", 36 | "not op_mini all" 37 | ], 38 | "development": [ 39 | "last 1 chrome version", 40 | "last 1 firefox version", 41 | "last 1 safari version" 42 | ] 43 | } 44 | } 45 | -------------------------------------------------------------------------------- 
/news_app/app/app/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Pandemic Knowledge 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 | -------------------------------------------------------------------------------- /news_app/app/app/public/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/news_app/app/app/public/logo.png -------------------------------------------------------------------------------- /news_app/app/app/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "PK", 3 | "name": "Pandemic Knowledge", 4 | "icons": [ 5 | { 6 | "src": "logo.png", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | "start_url": ".", 12 | "display": "standalone", 13 | "theme_color": "#000000", 14 | "background_color": "#ffffff" 15 | } 16 | -------------------------------------------------------------------------------- /news_app/app/app/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | -------------------------------------------------------------------------------- /news_app/app/app/src/About.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react' 2 | import packageJson from '../package.json'; 3 | 4 | export class About extends Component { 5 | 6 | render() { 7 | return ( 8 |
<div>
 9 | <h1>About</h1>
 10 | <p>A fully-featured multi-source data pipeline for continuously extracting knowledge from COVID-19 data.</p>
 11 | <p>If you find an issue or have a suggestion, please open an issue on Github.</p>
 12 | <hr />
 13 | <p>Version {packageJson["version"]}</p>
 14 | </div>
15 | ) 16 | } 17 | 18 | componentDidMount() { 19 | document.title = "About - Pandemic Knowledge"; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /news_app/app/app/src/App.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react' 2 | import { HashRouter as Router, Route, Switch } from 'react-router-dom' 3 | import { NotificationContainer } from 'react-notifications' 4 | import { NavigationBar } from './NavigationBar' 5 | import { Layout } from './Layout' 6 | import Home from './Home' 7 | import { About } from './About' 8 | import packageJson from '../package.json' 9 | 10 | export class App extends Component { 11 | 12 | /** 13 | * Child components may trigger this parent event to 14 | * inform other routes ( for example), 15 | * that authentication information have been updated. 16 | * 17 | * This allows to show the "Login" or "Logout" button 18 | * depending on user's authentication status. 19 | */ 20 | onAuthUpdate = () => {} 21 | 22 | render() { 23 | return ( 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | ) 37 | } 38 | } 39 | 40 | export default App 41 | -------------------------------------------------------------------------------- /news_app/app/app/src/App.test.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | 5 | it('renders without crashing', () => { 6 | const div = document.createElement('div'); 7 | ReactDOM.render(, div); 8 | ReactDOM.unmountComponentAtNode(div); 9 | }); 10 | 11 | -------------------------------------------------------------------------------- /news_app/app/app/src/Home.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Container, Row, Col } from 'react-bootstrap'; 3 | import styled from 'styled-components'; 4 | import SearchUI from './SearchUI'; 5 | 6 | const Styles = styled.div` 7 | .paddind-bottom { 8 | padding-bottom: 16px; 9 | } 10 | `; 11 | 12 | class Home extends Component { 13 | render() { 14 | return ( 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | ); 25 | } 26 | 27 | componentDidMount() { 28 | document.title = 'Search - Pandemic Knowledge'; 29 | } 30 | } 31 | 32 | export default Home; 33 | -------------------------------------------------------------------------------- /news_app/app/app/src/Layout.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Container from 'react-bootstrap/Container'; 3 | 4 | export const Layout = (props) => ( 5 | 6 | {props.children} 7 | 8 | ) -------------------------------------------------------------------------------- /news_app/app/app/src/NavigationBar.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react' 2 | import { Nav, Navbar } from 'react-bootstrap' 3 | import styled from 'styled-components' 4 | import { Link } from 'react-router-dom' 5 | import packageJson from '../package.json' 6 | 7 | const Styles = styled.div` 8 | .navbar { 9 | background-color: #222; 10 | } 11 | 12 | .navbar-brand, .navbar-nav .nav-link { 13 | color: #bbb; 14 | 15 | &:hover { 16 | color: white; 17 | } 18 | } 19 | 20 | .brand-image { 21 | max-width: 64px; 22 | height: 30px; 23 | padding-right: 16px; 24 | } 25 | `; 26 | 27 | 
export class NavigationBar extends Component { 28 | 29 | render() { 30 | return ( 31 | 32 | 33 | 34 | Logo Pandemic Knowledge 39 | {'Pandemic Knowledge'} 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | ) 50 | } 51 | 52 | } 53 | 54 | export default NavigationBar; -------------------------------------------------------------------------------- /news_app/app/app/src/SearchUI.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Row, Col, Card } from 'react-bootstrap'; 3 | import { SearchkitManager, SearchkitProvider, SearchBox, Hits } from 'searchkit'; 4 | import Highlighter from 'react-highlight-words'; 5 | 6 | const search_kit = new SearchkitManager('https://172.17.0.1:9200/news_*/', { 7 | basicAuth: 'elastic:elastic' 8 | }); 9 | 10 | export class SearchUI extends Component { 11 | state = { 12 | queryValue: '' 13 | }; 14 | 15 | queryBuilder = (queryString) => { 16 | this.setState({ queryValue: queryString }); 17 | return { 18 | bool: { 19 | must: [], 20 | filter: [ 21 | { 22 | multi_match: { 23 | type: 'best_fields', 24 | query: queryString, 25 | lenient: true 26 | } 27 | } 28 | ], 29 | should: [], 30 | must_not: [] 31 | } 32 | }; 33 | }; 34 | 35 | render() { 36 | return ( 37 | 38 |
39 | 40 | 41 | 48 | 49 | 50 | 51 | } /> 52 | 53 |
54 |
55 | ); 56 | } 57 | } 58 | 59 | class News extends Component { 60 | render() { 61 | return ( 62 | { 64 | window.open(this.props.result._source.link) 65 | }} 66 | style={{ 67 | cursor: "pointer" 68 | }} 69 | title={this.props.result._source.link} 70 | > 71 | {} 72 | 73 | 74 | 75 | 76 | 81 |
82 | { 83 | (this.props.result._source.date) 84 | ? 85 | 86 | {new Date(this.props.result._source.date).toLocaleDateString('fr-FR')} 87 | 88 | : <> 89 | } 90 |
91 | 92 | 97 | 98 |
99 |
100 | 101 |
102 | ); 103 | } 104 | } 105 | 106 | export default SearchUI; 107 | -------------------------------------------------------------------------------- /news_app/app/app/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen", "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue", sans-serif; 4 | -webkit-font-smoothing: antialiased; 5 | -moz-osx-font-smoothing: grayscale; 6 | } 7 | 8 | code { 9 | font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New", monospace; 10 | } -------------------------------------------------------------------------------- /news_app/app/app/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './index.css'; 4 | import App from './App'; 5 | import * as serviceWorker from './serviceWorker'; 6 | 7 | ReactDOM.render(, document.getElementById('root')); 8 | 9 | // If you want your app to work offline and load faster, you can change 10 | // unregister() to register() below. Note this comes with some pitfalls. 11 | // Learn more about service workers: https://bit.ly/CRA-PWA 12 | serviceWorker.unregister(); 13 | -------------------------------------------------------------------------------- /news_app/app/app/src/serviceWorker.js: -------------------------------------------------------------------------------- 1 | // This optional code is used to register a service worker. 2 | // register() is not called by default. 3 | 4 | // This lets the app load faster on subsequent visits in production, and gives 5 | // it offline capabilities. However, it also means that developers (and users) 6 | // will only see deployed updates on subsequent visits to a page, after all the 7 | // existing tabs open on the page have been closed, since previously cached 8 | // resources are updated in the background. 9 | 10 | // To learn more about the benefits of this model and instructions on how to 11 | // opt-in, read https://bit.ly/CRA-PWA 12 | 13 | const isLocalhost = Boolean( 14 | window.location.hostname === 'localhost' || 15 | // [::1] is the IPv6 localhost address. 16 | window.location.hostname === '[::1]' || 17 | // 127.0.0.1/8 is considered localhost for IPv4. 18 | window.location.hostname.match( 19 | /^127(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}$/ 20 | ) 21 | ); 22 | 23 | export function register(config) { 24 | if (process.env.NODE_ENV === 'production' && 'serviceWorker' in navigator) { 25 | // The URL constructor is available in all browsers that support SW. 26 | const publicUrl = new URL(process.env.PUBLIC_URL, window.location.href); 27 | if (publicUrl.origin !== window.location.origin) { 28 | // Our service worker won't work if PUBLIC_URL is on a different origin 29 | // from what our page is served on. This might happen if a CDN is used to 30 | // serve assets; see https://github.com/facebook/create-react-app/issues/2374 31 | return; 32 | } 33 | 34 | window.addEventListener('load', () => { 35 | const swUrl = `${process.env.PUBLIC_URL}/service-worker.js`; 36 | 37 | if (isLocalhost) { 38 | // This is running on localhost. Let's check if a service worker still exists or not. 39 | checkValidServiceWorker(swUrl, config); 40 | 41 | // Add some additional logging to localhost, pointing developers to the 42 | // service worker/PWA documentation. 
43 | navigator.serviceWorker.ready.then(() => { 44 | console.log( 45 | 'This web app is being served cache-first by a service ' + 46 | 'worker. To learn more, visit https://bit.ly/CRA-PWA' 47 | ); 48 | }); 49 | } else { 50 | // Is not localhost. Just register service worker 51 | registerValidSW(swUrl, config); 52 | } 53 | }); 54 | } 55 | } 56 | 57 | function registerValidSW(swUrl, config) { 58 | navigator.serviceWorker 59 | .register(swUrl) 60 | .then(registration => { 61 | registration.onupdatefound = () => { 62 | const installingWorker = registration.installing; 63 | if (installingWorker == null) { 64 | return; 65 | } 66 | installingWorker.onstatechange = () => { 67 | if (installingWorker.state === 'installed') { 68 | if (navigator.serviceWorker.controller) { 69 | // At this point, the updated precached content has been fetched, 70 | // but the previous service worker will still serve the older 71 | // content until all client tabs are closed. 72 | console.log( 73 | 'New content is available and will be used when all ' + 74 | 'tabs for this page are closed. See https://bit.ly/CRA-PWA.' 75 | ); 76 | 77 | // Execute callback 78 | if (config && config.onUpdate) { 79 | config.onUpdate(registration); 80 | } 81 | } else { 82 | // At this point, everything has been precached. 83 | // It's the perfect time to display a 84 | // "Content is cached for offline use." message. 85 | console.log('Content is cached for offline use.'); 86 | 87 | // Execute callback 88 | if (config && config.onSuccess) { 89 | config.onSuccess(registration); 90 | } 91 | } 92 | } 93 | }; 94 | }; 95 | }) 96 | .catch(error => { 97 | console.error('Error during service worker registration:', error); 98 | }); 99 | } 100 | 101 | function checkValidServiceWorker(swUrl, config) { 102 | // Check if the service worker can be found. If it can't reload the page. 103 | fetch(swUrl) 104 | .then(response => { 105 | // Ensure service worker exists, and that we really are getting a JS file. 106 | const contentType = response.headers.get('content-type'); 107 | if ( 108 | response.status === 404 || 109 | (contentType != null && contentType.indexOf('javascript') === -1) 110 | ) { 111 | // No service worker found. Probably a different app. Reload the page. 112 | navigator.serviceWorker.ready.then(registration => { 113 | registration.unregister().then(() => { 114 | window.location.reload(); 115 | }); 116 | }); 117 | } else { 118 | // Service worker found. Proceed as normal. 119 | registerValidSW(swUrl, config); 120 | } 121 | }) 122 | .catch(() => { 123 | console.log( 124 | 'No internet connection found. App is running in offline mode.' 
125 | ); 126 | }); 127 | } 128 | 129 | export function unregister() { 130 | if ('serviceWorker' in navigator) { 131 | navigator.serviceWorker.ready.then(registration => { 132 | registration.unregister(); 133 | }); 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /news_app/app/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | npm install 4 | npm run start 5 | -------------------------------------------------------------------------------- /news_app/app/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "requires": true, 3 | "lockfileVersion": 1, 4 | "dependencies": { 5 | "@babel/runtime": { 6 | "version": "7.13.17", 7 | "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.13.17.tgz", 8 | "integrity": "sha512-NCdgJEelPTSh+FEFylhnP1ylq848l1z9t9N0j1Lfbcw0+KXGjsTvUmkxy+voLLXB5SOKMbLLx4jxYliGrYQseA==", 9 | "requires": { 10 | "regenerator-runtime": "^0.13.4" 11 | } 12 | }, 13 | "@elastic/search-ui": { 14 | "version": "1.5.1", 15 | "resolved": "https://registry.npmjs.org/@elastic/search-ui/-/search-ui-1.5.1.tgz", 16 | "integrity": "sha512-ssfvX1q76X1UwqYASWtBni4PZ+3SYk1PvHmOjpVf9BYai1OqZLGVaj8Sw+cE1ia56zl5In7viCfciC+CP31ovA==", 17 | "requires": { 18 | "date-fns": "^1.30.1", 19 | "deep-equal": "^1.0.1", 20 | "history": "^4.9.0", 21 | "qs": "^6.7.0" 22 | } 23 | }, 24 | "call-bind": { 25 | "version": "1.0.2", 26 | "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.2.tgz", 27 | "integrity": "sha512-7O+FbCihrB5WGbFYesctwmTKae6rOiIzmz1icreWJ+0aA7LJfuqhEso2T9ncpcFtzMQtzXf2QGGueWJGTYsqrA==", 28 | "requires": { 29 | "function-bind": "^1.1.1", 30 | "get-intrinsic": "^1.0.2" 31 | } 32 | }, 33 | "date-fns": { 34 | "version": "1.30.1", 35 | "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-1.30.1.tgz", 36 | "integrity": "sha512-hBSVCvSmWC+QypYObzwGOd9wqdDpOt+0wl0KbU+R+uuZBS1jN8VsD1ss3irQDknRj5NvxiTF6oj/nDRnN/UQNw==" 37 | }, 38 | "deep-equal": { 39 | "version": "1.1.1", 40 | "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.1.1.tgz", 41 | "integrity": "sha512-yd9c5AdiqVcR+JjcwUQb9DkhJc8ngNr0MahEBGvDiJw8puWab2yZlh+nkasOnZP+EGTAP6rRp2JzJhJZzvNF8g==", 42 | "requires": { 43 | "is-arguments": "^1.0.4", 44 | "is-date-object": "^1.0.1", 45 | "is-regex": "^1.0.4", 46 | "object-is": "^1.0.1", 47 | "object-keys": "^1.1.1", 48 | "regexp.prototype.flags": "^1.2.0" 49 | } 50 | }, 51 | "define-properties": { 52 | "version": "1.1.3", 53 | "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.1.3.tgz", 54 | "integrity": "sha512-3MqfYKj2lLzdMSf8ZIZE/V+Zuy+BgD6f164e8K2w7dgnpKArBDerGYpM46IYYcjnkdPNMjPk9A6VFB8+3SKlXQ==", 55 | "requires": { 56 | "object-keys": "^1.0.12" 57 | } 58 | }, 59 | "function-bind": { 60 | "version": "1.1.1", 61 | "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", 62 | "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==" 63 | }, 64 | "get-intrinsic": { 65 | "version": "1.1.1", 66 | "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.1.1.tgz", 67 | "integrity": "sha512-kWZrnVM42QCiEA2Ig1bG8zjoIMOgxWwYCEeNdwY6Tv/cOSeGpcoX4pXHfKUxNKVoArnrEr2e9srnAxxGIraS9Q==", 68 | "requires": { 69 | "function-bind": "^1.1.1", 70 | "has": "^1.0.3", 71 | "has-symbols": "^1.0.1" 72 | } 73 | }, 74 | "has": { 75 | "version": "1.0.3", 76 | "resolved": 
"https://registry.npmjs.org/has/-/has-1.0.3.tgz", 77 | "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", 78 | "requires": { 79 | "function-bind": "^1.1.1" 80 | } 81 | }, 82 | "has-symbols": { 83 | "version": "1.0.2", 84 | "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.2.tgz", 85 | "integrity": "sha512-chXa79rL/UC2KlX17jo3vRGz0azaWEx5tGqZg5pO3NUyEJVB17dMruQlzCCOfUvElghKcm5194+BCRvi2Rv/Gw==" 86 | }, 87 | "history": { 88 | "version": "4.10.1", 89 | "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", 90 | "integrity": "sha512-36nwAD620w12kuzPAsyINPWJqlNbij+hpK1k9XRloDtym8mxzGYl2c17LnV6IAGB2Dmg4tEa7G7DlawS0+qjew==", 91 | "requires": { 92 | "@babel/runtime": "^7.1.2", 93 | "loose-envify": "^1.2.0", 94 | "resolve-pathname": "^3.0.0", 95 | "tiny-invariant": "^1.0.2", 96 | "tiny-warning": "^1.0.0", 97 | "value-equal": "^1.0.1" 98 | } 99 | }, 100 | "is-arguments": { 101 | "version": "1.1.0", 102 | "resolved": "https://registry.npmjs.org/is-arguments/-/is-arguments-1.1.0.tgz", 103 | "integrity": "sha512-1Ij4lOMPl/xB5kBDn7I+b2ttPMKa8szhEIrXDuXQD/oe3HJLTLhqhgGspwgyGd6MOywBUqVvYicF72lkgDnIHg==", 104 | "requires": { 105 | "call-bind": "^1.0.0" 106 | } 107 | }, 108 | "is-date-object": { 109 | "version": "1.0.2", 110 | "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.2.tgz", 111 | "integrity": "sha512-USlDT524woQ08aoZFzh3/Z6ch9Y/EWXEHQ/AaRN0SkKq4t2Jw2R2339tSXmwuVoY7LLlBCbOIlx2myP/L5zk0g==" 112 | }, 113 | "is-regex": { 114 | "version": "1.1.2", 115 | "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.2.tgz", 116 | "integrity": "sha512-axvdhb5pdhEVThqJzYXwMlVuZwC+FF2DpcOhTS+y/8jVq4trxyPgfcwIxIKiyeuLlSQYKkmUaPQJ8ZE4yNKXDg==", 117 | "requires": { 118 | "call-bind": "^1.0.2", 119 | "has-symbols": "^1.0.1" 120 | } 121 | }, 122 | "js-tokens": { 123 | "version": "4.0.0", 124 | "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", 125 | "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==" 126 | }, 127 | "loose-envify": { 128 | "version": "1.4.0", 129 | "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", 130 | "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", 131 | "requires": { 132 | "js-tokens": "^3.0.0 || ^4.0.0" 133 | } 134 | }, 135 | "object-inspect": { 136 | "version": "1.10.2", 137 | "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.10.2.tgz", 138 | "integrity": "sha512-gz58rdPpadwztRrPjZE9DZLOABUpTGdcANUgOwBFO1C+HZZhePoP83M65WGDmbpwFYJSWqavbl4SgDn4k8RYTA==" 139 | }, 140 | "object-is": { 141 | "version": "1.1.5", 142 | "resolved": "https://registry.npmjs.org/object-is/-/object-is-1.1.5.tgz", 143 | "integrity": "sha512-3cyDsyHgtmi7I7DfSSI2LDp6SK2lwvtbg0p0R1e0RvTqF5ceGx+K2dfSjm1bKDMVCFEDAQvy+o8c6a7VujOddw==", 144 | "requires": { 145 | "call-bind": "^1.0.2", 146 | "define-properties": "^1.1.3" 147 | } 148 | }, 149 | "object-keys": { 150 | "version": "1.1.1", 151 | "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", 152 | "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==" 153 | }, 154 | "qs": { 155 | "version": "6.10.1", 156 | "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.1.tgz", 157 | "integrity": 
"sha512-M528Hph6wsSVOBiYUnGf+K/7w0hNshs/duGsNXPUCLH5XAqjEtiPGwNONLV0tBH8NoGb0mvD5JubnUTrujKDTg==", 158 | "requires": { 159 | "side-channel": "^1.0.4" 160 | } 161 | }, 162 | "regenerator-runtime": { 163 | "version": "0.13.7", 164 | "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.7.tgz", 165 | "integrity": "sha512-a54FxoJDIr27pgf7IgeQGxmqUNYrcV338lf/6gH456HZ/PhX+5BcwHXG9ajESmwe6WRO0tAzRUrRmNONWgkrew==" 166 | }, 167 | "regexp.prototype.flags": { 168 | "version": "1.3.1", 169 | "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.3.1.tgz", 170 | "integrity": "sha512-JiBdRBq91WlY7uRJ0ds7R+dU02i6LKi8r3BuQhNXn+kmeLN+EfHhfjqMRis1zJxnlu88hq/4dx0P2OP3APRTOA==", 171 | "requires": { 172 | "call-bind": "^1.0.2", 173 | "define-properties": "^1.1.3" 174 | } 175 | }, 176 | "resolve-pathname": { 177 | "version": "3.0.0", 178 | "resolved": "https://registry.npmjs.org/resolve-pathname/-/resolve-pathname-3.0.0.tgz", 179 | "integrity": "sha512-C7rARubxI8bXFNB/hqcp/4iUeIXJhJZvFPFPiSPRnhU5UPxzMFIl+2E6yY6c4k9giDJAhtV+enfA+G89N6Csng==" 180 | }, 181 | "side-channel": { 182 | "version": "1.0.4", 183 | "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", 184 | "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", 185 | "requires": { 186 | "call-bind": "^1.0.0", 187 | "get-intrinsic": "^1.0.2", 188 | "object-inspect": "^1.9.0" 189 | } 190 | }, 191 | "tiny-invariant": { 192 | "version": "1.1.0", 193 | "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.1.0.tgz", 194 | "integrity": "sha512-ytxQvrb1cPc9WBEI/HSeYYoGD0kWnGEOR8RY6KomWLBVhqz0RgTwVO9dLrGz7dC+nN9llyI7OKAgRq8Vq4ZBSw==" 195 | }, 196 | "tiny-warning": { 197 | "version": "1.0.3", 198 | "resolved": "https://registry.npmjs.org/tiny-warning/-/tiny-warning-1.0.3.tgz", 199 | "integrity": "sha512-lBN9zLN/oAf68o3zNXYrdCt1kP8WsiGW8Oo2ka41b2IM5JL/S1CTyX1rW0mb/zSuJun0ZUrDxx4sqvYS2FWzPA==" 200 | }, 201 | "value-equal": { 202 | "version": "1.0.1", 203 | "resolved": "https://registry.npmjs.org/value-equal/-/value-equal-1.0.1.tgz", 204 | "integrity": "sha512-NOJ6JZCAWr0zlxZt+xqCHNTEKOsrks2HQd4MqhP1qy4z1SkbEP467eNx6TgDKXMvUOb+OENfJCZwM+16n7fRfw==" 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /news_app/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | 5 | app: 6 | build: ./app 7 | restart: always 8 | ports: 9 | - "8080:3000" 10 | volumes: 11 | - ./app/app:/app 12 | environment: 13 | NODE_ENV: "development" 14 | CHOKIDAR_USEPOLLING: "true" 15 | -------------------------------------------------------------------------------- /pandemic_knowledge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flavienbwk/Pandemic-Knowledge/e9082be16d743aac49c514034626721a608ede08/pandemic_knowledge.png -------------------------------------------------------------------------------- /prefect/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | RUN apt-get install gcc 4 | 5 | RUN python3 -m pip install prefect 6 | COPY prefect.config /root/.prefect/config.toml 7 | 8 | ENTRYPOINT \ 9 | prefect backend server && \ 10 | prefect server create-tenant --name default --slug default 11 | 
-------------------------------------------------------------------------------- /prefect/prefect.config: -------------------------------------------------------------------------------- 1 | # debug mode 2 | debug = true 3 | 4 | # base configuration directory (typically you won't change this!) 5 | home_dir = "~/.prefect" 6 | 7 | backend = "server" 8 | 9 | [server] 10 | host = "http://prefect_apollo" 11 | port = "4200" 12 | host_port = "4200" 13 | endpoint = "${server.host}:${server.port}" 14 | --------------------------------------------------------------------------------