├── backend ├── __init__.py ├── models │ └── __init__.py ├── .python-version ├── controller │ ├── __init__.py │ ├── errors │ │ └── http_error.py │ ├── router.py │ ├── feedback.py │ └── autocomplete.py ├── readme.md ├── config.py ├── api.py ├── data_ingestion.py └── .gitignore ├── covid_nlp ├── __init__.py ├── language │ ├── requirements.txt │ ├── ms_translate.py │ └── detect_language.py ├── modeling │ ├── tfidf │ │ ├── requirements.txt │ │ ├── README.md │ │ ├── tfidf_train.py │ │ ├── tfidf_client.py │ │ └── preprocess.py │ └── transformer │ │ ├── eval_pretrained_haystack.py │ │ └── train_quora_dedup_bert.py └── eval.py ├── datasources ├── __init__.py ├── automatic │ ├── DATASOURCE_INSTRUCTIONS.md │ ├── scraper.py │ └── testing_WHO_scraper.py ├── scrapers │ ├── RKI_scraper.py │ ├── GOV_pl_scraper.py │ ├── BMAS_scraper.py │ ├── CDC_Water_scraper.py │ ├── Bundesregierung_scraper.py │ ├── UNICEF_scraper.py │ ├── BAUA_scraper.py │ ├── CDC_Travel_scraper.py │ ├── Salute_IT_scraper.py │ ├── BMWI_scraper.py │ ├── BVF_scraper.py │ ├── Arbeitsagentur_scraper.py │ ├── CDC_General_scraper.py │ ├── IHK_scraper.py │ ├── ECDC_scraper.py │ ├── FHM_EN_scraper.py │ ├── WHO_scraper.py │ ├── BMG_scraper.py │ ├── FHM_SV_scraper.py │ └── BerlinerSenat_scraper.py ├── scrapers_unused │ └── ZEIT_scraper.py └── scrapers_outdated │ ├── BZgA_scraper.py │ ├── CDC_Individuals_scraper.py │ └── CDC_Children_scraper.py ├── covid-frontend ├── src │ ├── components │ │ ├── NotFound │ │ │ ├── styles.module.scss │ │ │ └── index.js │ │ ├── themes │ │ │ ├── common │ │ │ │ ├── index.js │ │ │ │ └── Header │ │ │ │ │ ├── index.js │ │ │ │ │ └── styles.module.scss │ │ │ ├── index.js │ │ │ └── MainTemplate │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ ├── common │ │ │ ├── index.js │ │ │ ├── Loader │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ │ ├── Tag │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ │ └── InputContainer │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ ├── App.js │ │ ├── Provider.js │ │ ├── UserFeedback │ │ │ ├── styles.module.scss │ │ │ └── index.js │ │ ├── Home │ │ │ ├── index.js │ │ │ └── styles.module.scss │ │ └── Answers │ │ │ └── styles.module.scss │ ├── core │ │ ├── constants │ │ │ └── env.js │ │ ├── utils │ │ │ └── string.js │ │ └── api │ │ │ └── index.js │ ├── assets │ │ ├── images │ │ │ ├── logo.png │ │ │ ├── logo.psd │ │ │ ├── pwc.png │ │ │ ├── powedby.png │ │ │ ├── powedby.psd │ │ │ └── deepset_logo_small.png │ │ ├── fonts │ │ │ ├── gothic-a1-v8-latin-500.eot │ │ │ ├── gothic-a1-v8-latin-500.ttf │ │ │ ├── gothic-a1-v8-latin-500.woff │ │ │ ├── gothic-a1-v8-latin-600.eot │ │ │ ├── gothic-a1-v8-latin-600.ttf │ │ │ ├── gothic-a1-v8-latin-600.woff │ │ │ ├── gothic-a1-v8-latin-700.eot │ │ │ ├── gothic-a1-v8-latin-700.ttf │ │ │ ├── gothic-a1-v8-latin-700.woff │ │ │ ├── gothic-a1-v8-latin-500.woff2 │ │ │ ├── gothic-a1-v8-latin-600.woff2 │ │ │ ├── gothic-a1-v8-latin-700.woff2 │ │ │ ├── gothic-a1-v8-latin-regular.eot │ │ │ ├── gothic-a1-v8-latin-regular.ttf │ │ │ ├── gothic-a1-v8-latin-regular.woff │ │ │ └── gothic-a1-v8-latin-regular.woff2 │ │ ├── styles │ │ │ ├── _mixins.scss │ │ │ ├── wirvsvirus │ │ │ │ └── wirvsvirus-theme.css │ │ │ ├── antd │ │ │ │ └── antd-theme.less │ │ │ ├── _reset.css │ │ │ └── _variables.scss │ │ └── icons │ │ │ └── brain.svg │ ├── routes │ │ ├── links.js │ │ └── index.js │ ├── history.js │ ├── store │ │ ├── reducers │ │ │ ├── index.js │ │ │ ├── activeAnswers.js │ │ │ └── globalSearch.js │ │ ├── sagas │ │ │ ├── index.js │ │ │ ├── api │ │ │ │ └── index.js │ 
│ │ ├── globalSearch.js │ │ │ └── activeAnswers.js │ │ ├── types │ │ │ ├── globalSearch.js │ │ │ └── activeAnswers.js │ │ └── actions │ │ │ ├── globalSearch.js │ │ │ └── activeAnswers.js │ ├── i18n.js │ └── index.js ├── public │ ├── robots.txt │ ├── favicon.ico │ ├── manifest.json │ ├── index.html │ └── locales │ │ ├── en │ │ └── translation.json │ │ └── de │ │ └── translation.json ├── jsconfig.json ├── .env.production ├── .env.staging ├── Dockerfile ├── README.md ├── .eslintrc ├── .gitignore ├── nginx.conf ├── package.json └── .gitlab-ci.yml ├── telegram-bot ├── gradle.properties ├── settings.gradle ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── src │ └── main │ │ ├── kotlin │ │ └── com │ │ │ └── theapache64 │ │ │ └── cs │ │ │ ├── utils │ │ │ ├── GsonUtil.kt │ │ │ ├── FeedbackParser.kt │ │ │ ├── TelegramAPI.kt │ │ │ └── RestClient.kt │ │ │ ├── models │ │ │ ├── Feedback.kt │ │ │ └── rest │ │ │ │ ├── telegram │ │ │ │ ├── AnswerCallbackRequest.kt │ │ │ │ ├── SendChatActionRequest.kt │ │ │ │ ├── SendMessageResponse.kt │ │ │ │ ├── SendMessageRequest.kt │ │ │ │ ├── TelegramUpdate.kt │ │ │ │ └── TelegramCallbackQuery.kt │ │ │ │ ├── CoronaQuestion.kt │ │ │ │ └── AddFeedbackRequest.kt │ │ │ ├── servlets │ │ │ └── TestServlet.kt │ │ │ └── core │ │ │ └── Scholar.kt │ │ └── webapp │ │ └── index.jsp ├── build.gradle ├── gradlew.bat └── .gitignore ├── .gitignore ├── docs └── img │ ├── covid-bert.png │ └── example-data-format.png ├── data ├── question-answering │ └── Handbook - Labelling Tool.pdf └── faqs │ └── deduplicate_with_sentenceBert.py ├── requirements.txt ├── Dockerfile ├── docker-compose.yml └── README.md /backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /covid_nlp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/.python-version: -------------------------------------------------------------------------------- 1 | covid 2 | -------------------------------------------------------------------------------- /backend/controller/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /covid-frontend/src/components/NotFound/styles.module.scss: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /telegram-bot/gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official -------------------------------------------------------------------------------- /covid_nlp/language/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | pycld2 3 | pycld3 4 | -------------------------------------------------------------------------------- 
/telegram-bot/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'corona-scholar' 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | .vscode/settings.json 4 | data 5 | __pycache__ -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece 2 | sklearn 3 | nltk 4 | -------------------------------------------------------------------------------- /covid-frontend/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | -------------------------------------------------------------------------------- /covid-frontend/src/core/constants/env.js: -------------------------------------------------------------------------------- 1 | export const baseUrl = process.env.REACT_APP_API; 2 | -------------------------------------------------------------------------------- /covid-frontend/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": "src" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/common/index.js: -------------------------------------------------------------------------------- 1 | export { default as Header } from './Header'; 2 | -------------------------------------------------------------------------------- /docs/img/covid-bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/docs/img/covid-bert.png -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/index.js: -------------------------------------------------------------------------------- 1 | export { default as MainTemplate } from './MainTemplate'; 2 | -------------------------------------------------------------------------------- /covid-frontend/.env.production: -------------------------------------------------------------------------------- 1 | REACT_APP_ENV=production 2 | REACT_APP_API=https://covid-backend.deepset.ai 3 | -------------------------------------------------------------------------------- /covid-frontend/.env.staging: -------------------------------------------------------------------------------- 1 | 2 | REACT_APP_ENV=staging 3 | REACT_APP_API=https://covid-backend.deepset.ai 4 | -------------------------------------------------------------------------------- /covid-frontend/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/public/favicon.ico -------------------------------------------------------------------------------- /docs/img/example-data-format.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/docs/img/example-data-format.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/logo.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/logo.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/logo.psd -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/pwc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/pwc.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/powedby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/powedby.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/powedby.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/powedby.psd -------------------------------------------------------------------------------- /covid-frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:stable 2 | COPY nginx.conf /etc/nginx/conf.d/default.conf 3 | COPY build/ /usr/share/nginx/html/ 4 | EXPOSE 80 -------------------------------------------------------------------------------- /covid-frontend/src/routes/links.js: -------------------------------------------------------------------------------- 1 | export default { 2 | home: '/home', 3 | answers: '/answers', 4 | // questions: '/questions', 5 | }; 6 | -------------------------------------------------------------------------------- /telegram-bot/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/telegram-bot/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /covid-frontend/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | React Js frontend for the covid tool 4 | 5 | npm i 6 | cp .env.staging .env 7 | npm start 8 | 9 | -------------------------------------------------------------------------------- /data/question-answering/Handbook - Labelling Tool.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/data/question-answering/Handbook - Labelling Tool.pdf -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/deepset_logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/deepset_logo_small.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.eot: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff -------------------------------------------------------------------------------- /covid-frontend/.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "react-app", 3 | "rules": { 4 | "semi": [2, "always"], 5 | "indent": ["error", 2, { "SwitchCase": 1 }] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff2 -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff2 -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff2 -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff2 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | #crawler 2 | Scrapy==2.0.1 3 | # haystack 4 | -e git://github.com/deepset-ai/haystack.git@master#egg=farm-haystack 5 | langid===1.1.6 6 | elastic-apm 7 | pycld2 8 | -------------------------------------------------------------------------------- /covid-frontend/src/history.js: -------------------------------------------------------------------------------- 1 | import { createBrowserHistory } from 'history'; 2 | 3 | // configure, create, and export the project's history instance 4 | export default createBrowserHistory(); 5 | -------------------------------------------------------------------------------- /covid-frontend/src/components/common/index.js: -------------------------------------------------------------------------------- 1 | export { default as InputContainer } from './InputContainer'; 2 | export { default as Tag } from './Tag'; 3 | export { default as Loader } from './Loader'; 4 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/GsonUtil.kt: -------------------------------------------------------------------------------- 1 | package 
com.theapache64.cs.utils 2 | 3 | import com.google.gson.GsonBuilder 4 | 5 | object GsonUtil { 6 | val gson = GsonBuilder().create() 7 | } -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/Feedback.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models 2 | 3 | data class Feedback( 4 | val modelId:Int, 5 | val feedback: String, 6 | val question: String, 7 | val documentId: Long 8 | ) -------------------------------------------------------------------------------- /covid-frontend/src/store/reducers/index.js: -------------------------------------------------------------------------------- 1 | import { combineReducers } from 'redux'; 2 | import globalSearch from './globalSearch'; 3 | import activeAnswers from './activeAnswers'; 4 | 5 | export default combineReducers({ 6 | globalSearch, 7 | activeAnswers, 8 | }); 9 | -------------------------------------------------------------------------------- /covid-frontend/src/core/utils/string.js: -------------------------------------------------------------------------------- 1 | 2 | export const prefix = (value) => (string) => `${value}${string}`; 3 | 4 | // result example - 12345.67 --> "12,345.67" 5 | export const formatNumber = (value = 0, precision = 2) => 6 | value.toFixed(precision).replace(/\d(?=(\d{3})+\.)/g, '$&,'); 7 | -------------------------------------------------------------------------------- /telegram-bot/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Tue Mar 24 01:07:13 IST 2020 2 | distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-all.zip 3 | distributionBase=GRADLE_USER_HOME 4 | distributionPath=wrapper/dists 5 | zipStorePath=wrapper/dists 6 | zipStoreBase=GRADLE_USER_HOME 7 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/AnswerCallbackRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class AnswerCallbackRequest( 7 | @SerializedName("callback_query_id") 8 | val callbackQueryId: String // 123 9 | ) -------------------------------------------------------------------------------- /backend/controller/errors/http_error.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | from starlette.requests import Request 3 | from starlette.responses import JSONResponse 4 | 5 | 6 | async def http_error_handler(_: Request, exc: HTTPException) -> JSONResponse: 7 | return JSONResponse({"errors": [exc.detail]}, status_code=exc.status_code) 8 | -------------------------------------------------------------------------------- /backend/controller/router.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from backend.controller import autocomplete, model, feedback 4 | 5 | router = APIRouter() 6 | router.include_router(autocomplete.router, tags=["autocomplete"]) 7 | router.include_router(model.router, tags=["model"]) 8 | router.include_router(feedback.router, tags=["feedback"]) 9 | -------------------------------------------------------------------------------- 
/covid-frontend/src/store/sagas/index.js: -------------------------------------------------------------------------------- 1 | import { all, fork } from 'redux-saga/effects'; 2 | import activeAnswersSaga from './activeAnswers'; 3 | import globalSearchSaga from './globalSearch'; 4 | 5 | export default function* rootSaga() { 6 | yield all([ 7 | fork(activeAnswersSaga), 8 | fork(globalSearchSaga), 9 | ]); 10 | } 11 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/CoronaQuestion.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | class CoronaQuestion( 6 | @SerializedName("questions") 7 | val questions: Array, // How does corona spread? 8 | @SerializedName("top_k_retriever") 9 | val resultCount: Int = 1 10 | ) -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/MainTemplate/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | 3 | .wrapper { 4 | background-color: $light-grey; 5 | height: 100%; 6 | min-height: 100vh; 7 | position: relative; 8 | width: 100%; 9 | } 10 | 11 | .content { 12 | margin: 0 auto; 13 | max-width: 1000px; 14 | padding: 16px 0 32px; 15 | width: 100%; 16 | } 17 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/SendChatActionRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class SendChatActionRequest( 7 | @SerializedName("action") 8 | val action: String, // String 9 | @SerializedName("chat_id") 10 | val chatId: String // String 11 | ) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.4-stretch 2 | 3 | WORKDIR /home/user 4 | 5 | # install haystack and dependencies 6 | COPY requirements.txt /home/user/ 7 | RUN pip install -r requirements.txt 8 | 9 | COPY backend /home/user/backend 10 | COPY covid_nlp /home/user/covid_nlp 11 | 12 | EXPOSE 8000 13 | 14 | # cmd for running the API 15 | CMD ["uvicorn", "backend.api:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /telegram-bot/src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | <%-- 2 | Created by IntelliJ IDEA. 3 | User: theapache64 4 | Date: 24/03/20 5 | Time: 1:06 AM 6 | To change this template use File | Settings | File Templates. 
7 | --%> 8 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 9 | <html> 10 | <head> 11 | <title>$Title$</title> 12 | </head> 13 | <body> 14 | $END$ 15 | </body> 16 | </html> 17 |
-------------------------------------------------------------------------------- /covid-frontend/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "Corona scholar", 3 | "name": "Corona scholar - scientific corona knowledge", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | "start_url": ".", 12 | "display": "standalone", 13 | "theme_color": "#000000", 14 | "background_color": "#ffffff" 15 | } 16 |
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/AddFeedbackRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class AddFeedbackRequest( 7 | @SerializedName("feedback") 8 | val feedback: String, // relevant 9 | @SerializedName("question") 10 | val question: String, 11 | @SerializedName("document_id") 12 | val documentId: Long 13 | )
-------------------------------------------------------------------------------- /covid-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | .env 12 | 13 | # production 14 | /build 15 | 16 | # misc 17 | .DS_Store 18 | .env.local 19 | .env.development.local 20 | .env.test.local 21 | .env.production.local 22 | 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | .idea 27 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/App.js: -------------------------------------------------------------------------------- 1 | import React, { Component, Fragment } from 'react'; 2 | import { ConfigProvider } from 'antd'; 3 | import deDE from 'antd/es/locale/de_DE'; 4 | 5 | class App extends Component { 6 | 7 | render () { 8 | return ( 9 | <ConfigProvider locale={deDE}> 10 | <Fragment> 11 | { this.props.children } 12 | </Fragment> 13 | </ConfigProvider> 14 | ); 15 | } 16 | } 17 | 18 | export default App; 19 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/common/Loader/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | 3 | .selfContained { 4 | align-items: center; 5 | height: 140px; 6 | display: flex; 7 | justify-content: center; 8 | width: 100%; 9 | } 10 | .fullSized { 11 | align-items: center; 12 | display: flex; 13 | justify-content: center; 14 | height: 100vh; 15 | position: fixed; 16 | top: 0; 17 | left: 0; 18 | width: 100vw; 19 | z-index: 99; 20 | } 21 |
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/servlets/TestServlet.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.servlets 2 | 3 | import javax.servlet.annotation.WebServlet 4 | import javax.servlet.http.HttpServlet 5 | import javax.servlet.http.HttpServletRequest 6 | import javax.servlet.http.HttpServletResponse 7 | 8 |
@WebServlet(urlPatterns = ["/test"]) 9 | class TestServlet : HttpServlet() { 10 | 11 | override fun doGet(req: HttpServletRequest?, resp: HttpServletResponse?) { 12 | resp!!.writer.write("This is sample resp") 13 | } 14 | }
-------------------------------------------------------------------------------- /datasources/automatic/DATASOURCE_INSTRUCTIONS.md: -------------------------------------------------------------------------------- 1 | # Datasources 2 | 3 | ## How to add sources 4 | 1. Please add official datasources of any language to the sources.csv file 5 | 2. The CSV is sorted by link value; please insert additions in the right place (to avoid duplicates) 6 | 3. Please fill out as many columns as you can. It might make sense to have the scraper fill in some values individually, 7 | e.g. when the FAQ site contains text in several languages (use the automatic language detection 8 | from covid_nlp/language/detect_language.py) or when each FAQ entry has a dedicated category.
-------------------------------------------------------------------------------- /covid-frontend/src/components/themes/MainTemplate/index.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Header } from '../common'; 3 | import styles from './styles.module.scss'; 4 | 5 | class MainTemplate extends Component { 6 | 7 | static propTypes = {} 8 | 9 | render() { 10 | return ( 11 | <div className={styles.wrapper}>
12 | <Header /> 13 | 14 | <div className={styles.content}> 15 | { this.props.children } 16 | </div> 17 | 18 | </div>
19 | 20 | ); 21 | } 22 | } 23 | 24 | export default MainTemplate; 25 |
-------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/README.md: -------------------------------------------------------------------------------- 1 | ## Train and Evaluate TF-IDF Model 2 | 3 | ### 1. Train sentencepiece model 4 | 5 | Preprocessing takes at most one argument (= the sentencepiece vocab size), which defaults to 24000 if not set. 6 | 7 | `cat my_large_text | python3 ./preprocess.py 16000` 8 | 9 | ### 2. Train TF-IDF Vectors 10 | 11 | TF-IDF vectors are trained on unigrams and bigrams, with otherwise default settings. 12 | 13 | `cat my_questions | python3 ./tfidf_train.py` 14 | 15 | ### 3. Score and submit 16 | 17 | Each pair in the eval set is scored with cosine similarity, and the results are then posted to MLflow. 18 | 19 | `python3 ./tfidf_client.py` 20 |
-------------------------------------------------------------------------------- /covid-frontend/nginx.conf: -------------------------------------------------------------------------------- 1 | # Expires map 2 | map $sent_http_content_type $expires { 3 | default off; 4 | text/html epoch; 5 | text/css 1y; 6 | application/javascript 1y; 7 | } 8 | 9 | server { 10 | listen 80; 11 | server_name localhost; 12 | 13 | expires $expires; 14 | 15 | location / { 16 | root /usr/share/nginx/html; 17 | index index.html index.htm; 18 | try_files $uri $uri/ /index.html; 19 | } 20 | 21 | error_page 500 502 503 504 /50x.html; 22 | location = /50x.html { 23 | root /usr/share/nginx/html; 24 | } 25 | }
-------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/_mixins.scss: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | @mixin text($size: regular) { 4 | $text-size: map-get($text, $size); 5 | 6 | font-size: map-get($text-size, font-size); 7 | line-height: map-get($text-size, line-height); 8 | } 9 | 10 | @mixin border($color: $border-grey, $radius: $base-radius) { 11 | border: 1px solid $color; 12 | border-radius: $radius; 13 | } 14 | 15 | @mixin outline($offset: 0, $width: 2px) { 16 | outline-color: #80c0d8; 17 | outline-offset: $offset; 18 | outline-width: $width; 19 | } 20 | 21 | @mixin text-overflow() { 22 | overflow: hidden; 23 | text-overflow: ellipsis; 24 | white-space: nowrap; 25 | } 26 |
-------------------------------------------------------------------------------- /covid-frontend/src/store/types/globalSearch.js: -------------------------------------------------------------------------------- 1 | import { prefix } from 'core/utils/string'; 2 | 3 | const searchPrefix = prefix('globalSearch/'); 4 | 5 | export const SET_SELECTED_VALUE = searchPrefix('SET_SELECTED_VALUE'); 6 | 7 | export const UPDATE_SEARCH_VALUE = searchPrefix('UPDATE_SEARCH_VALUE'); 8 | export const UPDATE_SEARCH_FILTERS = searchPrefix('UPDATE_SEARCH_FILTERS'); 9 | export const UPDATE_LAST_SEARCH_VALUE = searchPrefix('UPDATE_LAST_SEARCH_VALUE'); 10 | export const UPDATE_SEARCH_OPTIONS = searchPrefix('UPDATE_SEARCH_OPTIONS'); 11 | 12 | export const SET_LOADING_STATUS = searchPrefix('SET_LOADING_STATUS'); 13 | 14 | export const RESET = searchPrefix('RESET'); 15 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/common/Tag/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | @import
'../../../assets/styles/_mixins'; 3 | 4 | .tag { 5 | display: inline-block; 6 | flex-grow: 0; 7 | padding: 8px 12px; 8 | text-align: center; 9 | white-space: nowrap; 10 | 11 | // &.red { 12 | // background-color: $error-light2; 13 | // color: $error; 14 | // @include border($error-light, 4px); 15 | // } 16 | // &.orange { 17 | // background-color: $warning-light2; 18 | // color: $warning; 19 | // @include border($warning-light, 4px); 20 | // } 21 | // &.green { 22 | // background-color: $success-light2; 23 | // color: $success; 24 | // @include border($success-light, 4px); 25 | // } 26 | } 27 | -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/common/Header/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import { Link } from 'react-router-dom'; 3 | import links from 'routes/links'; 4 | import logo from 'assets/images/logo.png'; 5 | import styles from './styles.module.scss'; 6 | 7 | class Header extends PureComponent { 8 | 9 | render() { 10 | 11 | return ( 12 |
13 |
14 | 15 |
16 | corona-scholar logo 17 |
18 | 19 |
20 |
21 | ); 22 | } 23 | } 24 | 25 | 26 | export default Header; 27 | -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/common/Header/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../../assets/styles/_variables'; 2 | @import '../../../../assets/styles/_mixins'; 3 | 4 | .wrapper { 5 | position: relative; 6 | width: 100%; 7 | z-index: 10; 8 | } 9 | 10 | .header { 11 | display: flex; 12 | justify-content: center; 13 | max-width: 1280px; 14 | margin-left: auto; 15 | margin-right: auto; 16 | padding: 28px 30px; 17 | width: 100%; 18 | } 19 | 20 | 21 | .homeLink { 22 | display: inline-block; 23 | vertical-align: top; 24 | white-space: nowrap; 25 | } 26 | 27 | 28 | .logo { 29 | display: inline-block; 30 | position: relative; 31 | width: 136px; 32 | 33 | img { 34 | display: block; 35 | height: 100%; 36 | object-fit: contain; 37 | width: 100%; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /covid-frontend/src/i18n.js: -------------------------------------------------------------------------------- 1 | import i18n from 'i18next'; 2 | import Backend from 'i18next-xhr-backend'; 3 | import LanguageDetector from 'i18next-browser-languagedetector'; 4 | import { initReactI18next } from 'react-i18next'; 5 | 6 | const fallbackLng = ['en']; 7 | const availableLanguages = ['de', 'en']; 8 | 9 | i18n 10 | .use(Backend) // load translation using xhr -> see /public/locales. We will add locales in the next step 11 | .use(LanguageDetector) // detect user language 12 | .use(initReactI18next) // pass the i18n instance to react-i18next. 13 | .init({ 14 | fallbackLng, // if user computer language is not on the list of available languages, than we will be using the fallback language specified earlier 15 | debug: true, 16 | whitelist: availableLanguages, 17 | 18 | interpolation: { 19 | escapeValue: false 20 | }, 21 | }); 22 | 23 | export default i18n; 24 | -------------------------------------------------------------------------------- /covid-frontend/src/store/actions/globalSearch.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/globalSearch'; 2 | 3 | export const setSelectedValue = (payload) => ({ 4 | type: types.SET_SELECTED_VALUE, 5 | payload 6 | }); 7 | 8 | export const updateSearchValue = (payload) => ({ 9 | type: types.UPDATE_SEARCH_VALUE, 10 | payload 11 | }); 12 | 13 | export const updateLastSearchValue = (payload) => ({ 14 | type: types.UPDATE_LAST_SEARCH_VALUE, 15 | payload 16 | }); 17 | 18 | export const updateSearchOptions = (payload) => ({ 19 | type: types.UPDATE_SEARCH_OPTIONS, 20 | payload 21 | }); 22 | export const updateSearchFilters = (payload) => ({ 23 | type: types.UPDATE_SEARCH_FILTERS, 24 | payload 25 | }); 26 | 27 | export const setLoadingStatus = (status) => ({ 28 | type: types.SET_LOADING_STATUS, 29 | status 30 | }); 31 | 32 | export const reset = () => ({ 33 | type: types.RESET, 34 | }); 35 | -------------------------------------------------------------------------------- /covid-frontend/src/store/types/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import { prefix } from 'core/utils/string'; 2 | 3 | const activeAnswersPrefix = prefix('activeAnswers/'); 4 | 5 | export const GET = activeAnswersPrefix('GET'); 6 | export const SET = activeAnswersPrefix('SET'); 7 | 8 | export const 
SET_LOADING_STATUS = activeAnswersPrefix('SET_LOADING_STATUS'); 9 | export const SHOW_USER_FEEDBACK_PANEL = activeAnswersPrefix('SHOW_USER_FEEDBACK_PANEL'); 10 | export const HIDE_USER_FEEDBACK_PANEL = activeAnswersPrefix('HIDE_USER_FEEDBACK_PANEL'); 11 | 12 | export const RESET = activeAnswersPrefix('RESET'); 13 | 14 | export const MARK_AS_CORRECT_ANSWER = activeAnswersPrefix('MARK_AS_CORRECT_ANSWER'); 15 | export const MARK_AS_WRONG_ANSWER = activeAnswersPrefix('MARK_AS_WRONG_ANSWER'); 16 | export const MARK_AS_FEEDBACK_GIVEN = activeAnswersPrefix('MARK_AS_FEEDBACK_GIVEN'); 17 | export const CLEAR_FEEDBACK_GIVEN = activeAnswersPrefix('CLEAR_FEEDBACK_GIVEN'); 18 | -------------------------------------------------------------------------------- /covid-frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Corona Scholar – Covid-19 Frage-und-Antwort Chat Bot basierend auf wissenschaftlicher Faktensammlung und Künstlicher Intelligenz 12 | Corona Scholar – Covid-19 FAQ chat bot based on scientific knowledge and AI 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /covid-frontend/src/components/NotFound/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { Result, Button } from 'antd'; 5 | import links from 'routes/links'; 6 | import { withTranslation } from 'react-i18next'; 7 | 8 | class NotFound extends PureComponent { 9 | 10 | static propTypes = { 11 | history: PropTypes.object 12 | } 13 | 14 | handleBackHome = () => { 15 | this.props.history.push(links.home); 16 | } 17 | 18 | render() { 19 | const { t } = this.props; 20 | 21 | return ( 22 | 28 | {t('404.button-text')} 29 | 30 | } 31 | /> 32 | ); 33 | } 34 | } 35 | 36 | export default connect()(withTranslation()(NotFound)); 37 | -------------------------------------------------------------------------------- /covid-frontend/src/components/common/Tag/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import cn from 'classnames'; 4 | import styles from './styles.module.scss'; 5 | 6 | class Tag extends PureComponent { 7 | static propTypes = { 8 | text: PropTypes.string, 9 | theme: PropTypes.oneOf(['red', 'green', 'orange']), 10 | className: PropTypes.string, 11 | } 12 | 13 | static defaultProps = { 14 | text: '', 15 | theme: 'green', 16 | } 17 | 18 | // Themes signify level of confidence in answer 19 | static themes = { 20 | RED: 'red', 21 | GREEN: 'green', 22 | ORANGE: 'orange' 23 | } 24 | 25 | render() { 26 | const { text, theme, className } = this.props; 27 | return ( 28 |
<div 29 | className={cn( 30 | styles.tag, styles[theme], className 31 | )} 32 | > 33 | {text} 34 | </div>
35 | ); 36 | } 37 | } 38 | 39 | export default Tag; 40 |
-------------------------------------------------------------------------------- /covid-frontend/src/assets/icons/brain.svg: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/Provider.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent, Fragment } from 'react'; 2 | // import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | // import { bindActionCreators } from 'redux'; 5 | // import { Loader } from 'components/common/presentational'; 6 | 7 | class Provider extends PureComponent { 8 | 9 | static propTypes = { 10 | // companies: PropTypes.object, 11 | // userActions: PropTypes.object, 12 | // companiesActions: PropTypes.object 13 | } 14 | 15 | render () { 16 | 17 | // next will be removed later 18 | // if (!companies.isReady) { 19 | // return ; 20 | // } 21 | 22 | return ( 23 | <Fragment> 24 | { this.props.children } 25 | </Fragment> 26 | 27 | ); 28 | } 29 | } 30 | 31 | export default connect( 32 | // state => ({ 33 | // companies: state.companies 34 | // }), 35 | // dispatch => ({ 36 | // companiesActions: bindActionCreators(companiesActions, dispatch), 37 | // }) 38 | )(Provider); 39 |
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/FeedbackParser.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.utils 2 | 3 | import com.theapache64.cs.models.Feedback 4 | 5 | object FeedbackParser { 6 | private val feedbackRegEx = "(?<feedback>\\w)(?<modelId>\\d+)d(?<documentId>\\d+)(?<question>.+)".toRegex() 7 | fun parse(data: String): Feedback { 8 | val match = feedbackRegEx.find(data) 9 | val groups = match!!.groups 10 | return Feedback( 11 | groups["modelId"]!!.value.toInt(), 12 | getFeedbackString(groups["feedback"]!!.value[0]), 13 | groups["question"]!!.value, 14 | groups["documentId"]!!.value.toLong() 15 | ) 16 | } 17 | 18 | private fun getFeedbackString(feedback: Char): String { 19 | return when (feedback) { 20 | 'r' -> "relevant" 21 | 'f' -> "fake" 22 | 'o' -> "outdated" 23 | 'i' -> "irrelevant" 24 | else -> throw IllegalArgumentException("Undefined feedback char `$feedback`") 25 | } 26 | } 27 | }
-------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/wirvsvirus/wirvsvirus-theme.css: -------------------------------------------------------------------------------- 1 | /* Custom CSS styles for Hackathon WirVsVirus */ 2 | /* TODO: Evaluate removal of ant-design. At least get rid of those arbitrary CSS scoping hashes from class names :( */ 3 | 4 | .ant-row .ant-col span { 5 | color: #707070; 6 | } 7 | 8 | .ant-col-19 { 9 | /* TODO: Can layout be better controlled through ant-design col/row components?
*/ 10 | width: 100%; 11 | } 12 | 13 | .all-answers-wrapper .ant-col.ant-col-19 { 14 | padding: 2rem; 15 | } 16 | 17 | .top-answer-wrapper { 18 | background-color: #59A4B7; 19 | color: white; 20 | padding: 0 20px; 21 | margin: 0 0px !important; 22 | padding-bottom:30px; 23 | } 24 | 25 | .headline-faq-match { 26 | font-size: 1.25rem; 27 | } 28 | 29 | .answer-text { 30 | line-height: 1.5rem; 31 | max-width:800px; 32 | } 33 | 34 | .answer-meta-info.top-answer { 35 | margin-top: 0; 36 | } 37 | 38 | 39 | .other-answer-row .headline-faq-match-confidence { 40 | color: black; 41 | } 42 | 43 | .result-confidence-box { 44 | font-style: italic; 45 | } 46 | -------------------------------------------------------------------------------- /telegram-bot/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id 'org.jetbrains.kotlin.jvm' version '1.3.70' 4 | id 'war' 5 | } 6 | 7 | group 'com.theapache64' 8 | version 'v1.1.0-alpha02' 9 | 10 | sourceCompatibility = 1.8 11 | 12 | repositories { 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8" 18 | 19 | // https://mvnrepository.com/artifact/com.google.code.gson/gson 20 | implementation group: 'com.google.code.gson', name: 'gson', version: '2.8.6' 21 | 22 | // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp 23 | implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.4.1' 24 | 25 | // https://mvnrepository.com/artifact/javax.servlet/javax.servlet-api 26 | providedCompile group: 'javax.servlet', name: 'javax.servlet-api', version: '4.0.1' 27 | 28 | 29 | testImplementation group: 'junit', name: 'junit', version: '4.12' 30 | } 31 | 32 | compileKotlin { 33 | kotlinOptions.jvmTarget = "1.8" 34 | } 35 | compileTestKotlin { 36 | kotlinOptions.jvmTarget = "1.8" 37 | } -------------------------------------------------------------------------------- /covid-frontend/src/components/common/InputContainer/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | @import '../../../assets/styles/_mixins'; 3 | 4 | .container { 5 | display: inline-block; 6 | position: relative; 7 | 8 | & > label { 9 | color: $primary-grey; 10 | display: block; 11 | @include text(tiny); 12 | margin: 11px 0; 13 | letter-spacing: $secondary-spacing; 14 | text-transform: uppercase; 15 | } 16 | 17 | .error { 18 | @include text(small); 19 | color: $error; 20 | display: block; 21 | margin-top: 6px; 22 | } 23 | 24 | .info { 25 | @include text(small); 26 | color: $primary-grey; 27 | display: block; 28 | margin-top: 6px; 29 | } 30 | 31 | &.fluid { 32 | display: block; 33 | width: 100%; 34 | } 35 | 36 | &.withError { 37 | textarea, 38 | input { 39 | @include border($error); 40 | } 41 | :global(.ant-input-number), 42 | :global(.ant-select-selection) { 43 | @include border($error); 44 | input { 45 | border: none; 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/SendMessageResponse.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class SendMessageResponse( 7 | @SerializedName("ok") 8 | val ok: Boolean, // true 9 | @SerializedName("result") 10 | val result: 
Result 11 | ) { 12 | data class Result( 13 | @SerializedName("chat") 14 | val chat: Chat, 15 | @SerializedName("date") 16 | val date: Long, // 1584216383 17 | @SerializedName("message_id") 18 | val messageId: Long, // 146 19 | @SerializedName("text") 20 | val text: String // This is some text 21 | ) { 22 | data class Chat( 23 | @SerializedName("id") 24 | val id: Long, // -1001423106120 25 | @SerializedName("title") 26 | val title: String, // Movie Monk 27 | @SerializedName("type") 28 | val type: String, // channel 29 | @SerializedName("username") 30 | val username: String // movie_m0nk 31 | ) 32 | } 33 | } -------------------------------------------------------------------------------- /backend/controller/feedback.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, status 2 | from fastapi.responses import JSONResponse 3 | from pydantic import BaseModel 4 | from typing import Optional 5 | 6 | from backend import api 7 | from backend.config import DB_INDEX_FEEDBACK 8 | 9 | router = APIRouter() 10 | 11 | 12 | class Feedback(BaseModel): 13 | # Note: the question here is the user's question (=query) and not the matched one from our FAQs (=response) 14 | question: str 15 | answer: Optional[str] 16 | feedback: str 17 | document_id: int 18 | 19 | 20 | @router.post("/models/{model_id}/feedback") 21 | def feedback(model_id: int, request: Feedback): 22 | feedback_payload = request.__dict__ 23 | if feedback_payload["feedback"] not in ("relevant", "fake", "outdated", "irrelevant"): 24 | return JSONResponse( 25 | status_code=status.HTTP_400_BAD_REQUEST, 26 | content="Invalid 'feedback'. It must be one of relevant, fake, outdated or irrelevant", 27 | ) 28 | feedback_payload["model_id"] = model_id 29 | api.elasticsearch_client.index(index=DB_INDEX_FEEDBACK, body=feedback_payload) 30 | -------------------------------------------------------------------------------- /covid-frontend/src/store/actions/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/activeAnswers'; 2 | 3 | export const get = () => ({ 4 | type: types.GET 5 | }); 6 | 7 | export const set = (payload) => ({ 8 | type: types.SET, 9 | payload 10 | }); 11 | 12 | export const setLoadingStatus = (status) => ({ 13 | type: types.SET_LOADING_STATUS, 14 | status 15 | }); 16 | 17 | export const showUserFeedbackPanel = (payload) => ({ 18 | type: types.SHOW_USER_FEEDBACK_PANEL, 19 | payload 20 | }); 21 | 22 | export const hideUserFeedbackPanel = () => ({ 23 | type: types.HIDE_USER_FEEDBACK_PANEL 24 | }); 25 | 26 | export const reset = () => ({ 27 | type: types.RESET, 28 | }); 29 | 30 | export const markAsCorrectAnswer = (payload) => ({ 31 | type: types.MARK_AS_CORRECT_ANSWER, 32 | payload 33 | }); 34 | 35 | export const markAsWrongAnswer = (payload) => ({ 36 | type: types.MARK_AS_WRONG_ANSWER, 37 | payload 38 | }); 39 | 40 | export const markAsFeedbackGiven = (payload) => ({ 41 | type: types.MARK_AS_FEEDBACK_GIVEN, 42 | payload 43 | }); 44 | 45 | export const clearFeedbackGiven = () => ({ 46 | type: types.CLEAR_FEEDBACK_GIVEN 47 | }); -------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/antd/antd-theme.less: -------------------------------------------------------------------------------- 1 | @import "../../../../node_modules/antd/dist/antd.less"; 2 | 3 | // see list of variables in 4 | // 
https://github.com/ant-design/ant-design/blob/master/components/style/themes/default.less 5 | 6 | @primary-color: #70b2fc; 7 | 8 | // Base Scaffolding Variables 9 | @border-radius-base: 6px; 10 | @font-family: 'Gothic A1', sans-serif; 11 | @text-color: #333638; 12 | 13 | @btn-height-lg: 44px; 14 | @input-height-lg: 44px; 15 | @font-size-lg: 14px; 16 | 17 | // Border color 18 | @border-color-base: #d3dae0; 19 | 20 | // table 21 | @table-header-bg: transparent; 22 | @table-header-color: #a3a9ad; 23 | @table-row-hover-bg: #f9fafc; 24 | @table-padding-vertical: 14px; 25 | @table-padding-horizontal: 24px; 26 | 27 | 28 | 29 | // table customization 30 | .ant-table-thead > tr > th { 31 | font-size: 12px; 32 | line-height: 14px; 33 | padding: 11px 24px; 34 | } 35 | 36 | .ant-table-tbody > tr > td { 37 | border: none; 38 | line-height: 18px; 39 | height: 48px; // works as min-height 40 | vertical-align: middle; 41 | } 42 | 43 | .ant-table-placeholder { 44 | border-bottom-color: transparent; 45 | } 46 | -------------------------------------------------------------------------------- /covid-frontend/src/store/sagas/api/index.js: -------------------------------------------------------------------------------- 1 | import api from 'core/api'; 2 | 3 | export function * get (path, query = null) { 4 | return yield apiCall(path, 'GET', query); 5 | } 6 | 7 | export function * post (path, query = null, body = null) { 8 | return yield apiCall(path, 'POST', body ? query : null, body || query); 9 | } 10 | 11 | export function * put (url, query = null, body = null) { 12 | return yield apiCall(url, 'PUT', body ? query : null, body || query); 13 | } 14 | 15 | export function * patch (url, query = null, body = null) { 16 | return yield apiCall(url, 'PATCH', body ? query : null, body || query); 17 | } 18 | 19 | export function * del (url, query = null) { 20 | return yield apiCall(url, 'DELETE', query); 21 | } 22 | 23 | 24 | function * apiCall (path, method, query, body) { 25 | const apiInstance = api(); 26 | // const { token } = yield select(state => state.auth); 27 | 28 | // if (token) { 29 | // apiInstance.setAuthorization(`Bearer ${token}`); 30 | // } 31 | 32 | let result; 33 | try { 34 | result = yield apiInstance.call(path, method, query, body); 35 | } catch (error) { 36 | throw error; 37 | } 38 | 39 | return result; 40 | } 41 | -------------------------------------------------------------------------------- /covid-frontend/src/index.js: -------------------------------------------------------------------------------- 1 | import React, { Suspense } from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import { Provider } from 'react-redux'; 4 | import { Router } from 'react-router-dom'; 5 | import { composeWithDevTools } from 'redux-devtools-extension'; 6 | import createSagaMiddleware from 'redux-saga'; 7 | import { createStore, applyMiddleware } from 'redux'; 8 | 9 | 10 | import history from './history'; 11 | import Routes from './routes'; 12 | import rootReducer from './store/reducers'; 13 | import rootSaga from './store/sagas'; 14 | 15 | import './i18n'; 16 | import './assets/styles/global.scss'; 17 | 18 | 19 | console.log('%c env: ', 'color: #bada55', process.env.REACT_APP_ENV); 20 | 21 | 22 | const sagaMiddleware = createSagaMiddleware(); 23 | 24 | const store = createStore( 25 | rootReducer, 26 | composeWithDevTools( 27 | applyMiddleware(sagaMiddleware) 28 | ) 29 | ); 30 | 31 | sagaMiddleware.run(rootSaga); 32 | 33 | ReactDOM.render( 34 | 35 | 36 | }> 37 | 38 | 39 | 40 | , 41 | 
document.getElementById('root') 42 | ); 43 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | model-api: 4 | image: "deepset/covid-qa-haystack" 5 | ports: 6 | - "80:80" 7 | environment: 8 | # see backend/config.py for additional variables to configure 9 | - TEXT_FIELD_NAME=answer 10 | - SEARCH_FIELD_NAME=question 11 | - EXCLUDE_META_DATA_FIELDS=["question_emb"] 12 | # optional: use an embedding model instead of plain elasticsearch query 13 | # - EMBEDDING_FIELD_NAME=question_emb 14 | # - EMBEDDING_DIM=768 15 | # - EMBEDDING_MODEL_PATH=deepset/sentence-bert # MUST be the same as used for indexing the FAQs 16 | - USE_GPU=False 17 | command: /bin/sh -c "cd /home/user && sleep 20 && uvicorn backend.api:app --host 0.0.0.0 --port 80 --limit-concurrency 10 --workers 1" 18 | network_mode: host 19 | elastic: 20 | # use plain elasticsearch image if you want to ingest fresh data (via backend/data_ingestion.py) 21 | # use covid-qa-elastic only for dev to have some (old) docs preindexed. 22 | # image: "elasticsearch:7.5.1" 23 | image: "deepset/covid-qa-elastic" 24 | ports: 25 | - "9200:9200" 26 | - "9300:9300" 27 | environment: 28 | - discovery.type=single-node -------------------------------------------------------------------------------- /covid-frontend/src/components/UserFeedback/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../assets/styles/_variables'; 2 | @import '../../assets/styles/_mixins'; 3 | 4 | .wrapper { 5 | position: fixed; 6 | display: flex; 7 | justify-content: center; 8 | align-items: center; 9 | 10 | width: 100vw; 11 | height: 100vh; 12 | left: 0; 13 | top: 0; 14 | background-color: rgba(0, 0, 0, 0.7); 15 | z-index: 10; 16 | backdrop-filter: blur(2px); 17 | 18 | & > div { 19 | background-color: $white; 20 | @include border(); 21 | box-shadow: $base-shadow; 22 | padding: 24px 24px 24px 24px; 23 | 24 | position: absolute; 25 | width: auto; 26 | height: auto; 27 | max-height: 60vh; 28 | z-index: 2; 29 | } 30 | 31 | h2 { 32 | margin-bottom: 1em; 33 | } 34 | 35 | button { 36 | display: block; 37 | white-space: nowrap; 38 | margin: 0.5em 0; 39 | cursor: pointer; 40 | background: $warning-light2; 41 | border-radius: 12px; 42 | margin-left: 12px; 43 | outline: 1px; 44 | 45 | &:last-child { 46 | background: $accent-light; 47 | } 48 | 49 | & > i { 50 | margin-top: 3px; 51 | margin-right: 5px; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /backend/readme.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | We run two services in the backend: elasticsearch + the model API. 3 | The model API is configured via environment variables that can be passed into the docker container or set in backend/config.py 4 | 5 | # Run elasticsearch 6 | a) Fresh elasticsearch index: 7 | 8 | docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.5.1 9 | Then ingest data via `data_ingestion.py` 10 | 11 | b) Dev: 12 | 13 | docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" deepset/covid-qa-elastic 14 | 15 | This image has already some docs indexed, so you can skip `data_ingestion.py` 16 | 17 | 18 | 19 | # Run model API 20 | docker image build -t deepset/covid-qa-haystack . 
21 | docker run --net=host -e TEXT_FIELD_NAME=answer -e SEARCH_FIELD_NAME=question -e EXCLUDE_META_DATA_FIELDS='["question_emb"]' deepset/covid-qa-haystack:latest 22 | 23 | or without docker: 24 | 25 | pip install -r requirements.txt 26 | uvicorn backend.api:app 27 | 28 | # Alternative: Run both via docker-compose 29 | docker-compose up 30 | Edit `docker-compose.yml` if you want to configure the elasticsearch host, models, etc. 31 | 32 | -------------------------------------------------------------------------------- /covid-frontend/src/routes/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import { Route, Switch, Redirect, withRouter } from 'react-router-dom'; 3 | import App from 'components/App'; 4 | import Provider from 'components/Provider'; 5 | import links from 'routes/links'; 6 | import { MainTemplate } from 'components/themes'; 7 | import Home from 'components/Home'; 8 | import Answers from 'components/Answers'; 9 | import NotFound from 'components/NotFound'; 10 | 11 | class Routes extends PureComponent { 12 | render () { 13 | return ( 14 | {/* markup stripped in extraction: a Switch of Routes for Home, Answers and NotFound (plus a Redirect and one commented-out Route), wrapped in App, Provider and MainTemplate */} 37 | ); 38 | } 39 | } 40 | 41 | export default withRouter(Routes); 42 | -------------------------------------------------------------------------------- /covid-frontend/public/locales/en/translation.json: -------------------------------------------------------------------------------- 1 | { 2 | "inputs": { 3 | "question": { 4 | "label": "Your Question", 5 | "placeholder": "Ask a question about COVID-19 (corona virus)" 6 | } 7 | }, 8 | "loader": { 9 | "heading": "The BERT is working", 10 | "text": "Please Wait – Bitte warten..." 11 | }, 12 | "answer": { 13 | "meta": { 14 | "datelabel": "From", 15 | "source": "Source" 16 | }, 17 | "feedback": { 18 | "header": "Feedback" 19 | }, 20 | "no-answer": "Found no answers", 21 | "other-answers": "Other answers", 22 | "tags": { 23 | "probability": "Relevance" 24 | } 25 | }, 26 | "feedback": { 27 | "title": "Thank you for giving us feedback.", 28 | "text": "What was wrong with the answer?", 29 | "fake": "The stated facts were inaccurate or wrong.", 30 | "outdated": "The information was outdated.", 31 | "irrelevant": "The answer had nothing to do with my question.", 32 | "nothing": "Nothing."
33 | }, 34 | "languages": { 35 | "de": "German", 36 | "en": "English" 37 | }, 38 | "404": { 39 | "subtitle": "Sorry, the page you visited does not exist.", 40 | "button-text": "Back to Home" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/_reset.css: -------------------------------------------------------------------------------- 1 | /* http://meyerweb.com/eric/tools/css/reset/ 2 | v2.0 | 20110126 3 | License: none (public domain) 4 | */ 5 | 6 | html, body, div, span, applet, object, iframe, 7 | h1, h2, h3, h4, h5, h6, p, blockquote, pre, 8 | a, abbr, acronym, address, big, cite, code, 9 | del, dfn, em, img, ins, kbd, q, s, samp, 10 | small, strike, strong, sub, sup, tt, var, 11 | b, u, i, center, 12 | dl, dt, dd, ol, ul, li, 13 | fieldset, form, label, legend, 14 | table, caption, tbody, tfoot, thead, tr, th, td, 15 | article, aside, canvas, details, embed, 16 | figure, figcaption, footer, header, hgroup, 17 | menu, nav, output, ruby, section, summary, 18 | time, mark, audio, video { 19 | margin: 0; 20 | padding: 0; 21 | border: 0; 22 | font-size: 100%; 23 | font: inherit; 24 | vertical-align: baseline; 25 | } 26 | /* HTML5 display-role reset for older browsers */ 27 | article, aside, details, figcaption, figure, 28 | footer, header, hgroup, menu, nav, section { 29 | display: block; 30 | } 31 | body { 32 | line-height: 1; 33 | } 34 | ol, ul { 35 | list-style: none; 36 | } 37 | blockquote, q { 38 | quotes: none; 39 | } 40 | blockquote:before, blockquote:after, 41 | q:before, q:after { 42 | content: ''; 43 | content: none; 44 | } 45 | table { 46 | border-collapse: collapse; 47 | border-spacing: 0; 48 | } 49 | -------------------------------------------------------------------------------- /covid-frontend/public/locales/de/translation.json: -------------------------------------------------------------------------------- 1 | { 2 | "inputs": { 3 | "question": { 4 | "label": "Ihre Frage", 5 | "placeholder": "Stellen Sie eine Frage zu COVID-19 (Corona-Virus)" 6 | } 7 | }, 8 | "loader": { 9 | "heading": "The BERT is working", 10 | "text": "Please Wait – Bitte warten..." 11 | }, 12 | "answer": { 13 | "meta": { 14 | "datelabel": "Stand", 15 | "source": "Quelle" 16 | }, 17 | "feedback": { 18 | "header": "Feedback" 19 | }, 20 | "no-answer": "Keine Antworten gefunden", 21 | "other-answers": "Weitere Antworten", 22 | "tags": { 23 | "probability": "Relevanz" 24 | } 25 | }, 26 | "feedback": { 27 | "title": "Danke für Ihr Feedback!", 28 | "text": "Was war falsch mit der Antwort?", 29 | "fake": "Die Antwort war falsch oder ungenau.", 30 | "outdated": "Die Informationen waren veraltet.", 31 | "irrelevant": "Die Antwort hatte nichts mit meiner Frage zu tun.", 32 | "nothing": "Nichts." 
33 | }, 34 | "languages": { 35 | "de": "Deutsch", 36 | "en": "Englisch" 37 | }, 38 | "404": { 39 | "subtitle": "Tut uns leid, diese Seite gibt es nicht.", 40 | "button-text": "Zurück" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/core/Scholar.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.core 2 | 3 | import com.theapache64.cs.models.Feedback 4 | import com.theapache64.cs.models.rest.AddFeedbackRequest 5 | import com.theapache64.cs.models.rest.CoronaAnswer 6 | import com.theapache64.cs.models.rest.CoronaQuestion 7 | import com.theapache64.cs.utils.GsonUtil 8 | import com.theapache64.cs.utils.RestClient 9 | 10 | object Scholar { 11 | 12 | private const val BASE_URL = "https://covid-backend.deepset.ai" 13 | 14 | fun getAnswer(question: String): CoronaAnswer? { 15 | val jsonString = RestClient.post( 16 | "$BASE_URL/question/ask", 17 | null, 18 | CoronaQuestion( 19 | arrayOf(question) 20 | ) 21 | ).body!!.string() 22 | 23 | println(jsonString) 24 | 25 | return GsonUtil.gson.fromJson(jsonString, CoronaAnswer::class.java) 26 | } 27 | 28 | fun addFeedback(feedback: Feedback) { 29 | val jsonString = RestClient.post( 30 | "$BASE_URL/models/${feedback.modelId}/feedback", 31 | null, 32 | AddFeedbackRequest( 33 | feedback.feedback, 34 | feedback.question, 35 | feedback.documentId 36 | ) 37 | ).body!!.string() 38 | println("Feedback response : $jsonString") 39 | } 40 | 41 | 42 | } -------------------------------------------------------------------------------- /covid-frontend/src/components/common/InputContainer/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import cn from 'classnames'; 4 | import styles from './styles.module.scss'; 5 | 6 | class InputContainer extends PureComponent { 7 | static propTypes = { 8 | label: PropTypes.string, 9 | error: PropTypes.oneOfType([ 10 | PropTypes.bool, 11 | PropTypes.string 12 | ]), 13 | info: PropTypes.oneOfType([ 14 | PropTypes.bool, 15 | PropTypes.string 16 | ]), 17 | fluid: PropTypes.bool, 18 | className: PropTypes.string, 19 | } 20 | 21 | static defaultProps = { 22 | label: '', 23 | info: '', 24 | error: '', 25 | fluid: false, 26 | 27 | className: '' 28 | } 29 | 30 | render() { 31 | const { label, info, error, fluid, className, children } = this.props; 32 | 33 | const classes = cn( 34 | styles.container, 35 | { [styles.fluid]: fluid }, 36 | { [styles.withError]: error }, 37 | className 38 | ); 39 | 40 | return ( 41 |
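{/* markup stripped in extraction: a wrapping <div className={classes}> around the label, children, error and info elements below */}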
42 | { label && } 43 | { children } 44 | { (error && (typeof error === 'string')) && {error} } 45 | { info && {info} } 46 |
47 | ); 48 | } 49 | } 50 | 51 | export default InputContainer; 52 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/SendMessageRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class SendMessageRequest( 7 | @SerializedName("chat_id") 8 | val chatId: String, // to 9 | @SerializedName("text") 10 | val text: String, // This is some message 11 | @SerializedName("disable_web_page_preview") 12 | val isDisableWebPagePreview: Boolean?, 13 | @SerializedName("parse_mode") 14 | val parseMode: String?, 15 | @SerializedName("reply_to_message_id") 16 | val replyMsgId: Long?, 17 | @SerializedName("reply_markup") 18 | val replyMarkup: ReplyMarkup? 19 | ) { 20 | data class ReplyMarkup( 21 | @SerializedName("inline_keyboard") 22 | val inlineKeyboard: List> 23 | ) 24 | 25 | data class InlineButton( 26 | @SerializedName("text") 27 | val text: String, // ✅ Relevant 28 | @SerializedName("callback_data") 29 | val callbackData: String // r123 30 | ) { 31 | class ByteOverflowException(message: String?) : Throwable(message) 32 | 33 | init { 34 | val byteSize = callbackData.toByteArray().size 35 | if (byteSize > 64) { 36 | throw ByteOverflowException( 37 | "Callback data exceeded" 38 | ) 39 | } 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /data/faqs/deduplicate_with_sentenceBert.py: -------------------------------------------------------------------------------- 1 | from haystack.retriever.elasticsearch import ElasticsearchRetriever 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | 6 | # loading questions and calculating similarities based of sentence bert embeddings 7 | df = pd.read_csv("200416_englishFAQ.csv",sep=",") 8 | if df.columns[0] != "question": 9 | df = df.iloc[:,1:] 10 | 11 | #df = pd.concat((df.loc[df.name == "CDC General FAQ"],df.loc[df.name != "CDC General FAQ"]),ignore_index=True) 12 | df = df.loc[df.name == "CDC General FAQ"] 13 | df = df.loc[df.category != "School Dismissals and Children"] 14 | 15 | df.reset_index(inplace=True,drop=True) 16 | 17 | 18 | questions = [{"text": v} for v in df.question.values] 19 | retriever = ElasticsearchRetriever(document_store=None, embedding_model="deepset/sentence_bert", gpu=False) 20 | res1 = retriever.embedding_model.extract_vectors( 21 | dicts=questions, 22 | extraction_strategy="reduce_mean", 23 | extraction_layer=-1) 24 | res1 = np.array([i["vec"] for i in res1]) 25 | sims = cosine_similarity(res1,res1) 26 | 27 | threshold = 0.85 28 | indices = [0] 29 | for i in range(1,len(questions)): 30 | if (sims[:i,i] < threshold).all(): 31 | indices.append(i) 32 | else: 33 | print(df.question[i]) 34 | idxs = np.nonzero(sims[:i,i] > threshold)[0] 35 | print(df.iloc[idxs,1]) 36 | print("newexample \n") 37 | 38 | 39 | newdf = df.iloc[indices,:] 40 | print(newdf.shape) 41 | print(df.shape) 42 | newdf.to_csv("200416_CDCGen_dedup.csv",index=True,sep=",") 43 | 44 | 45 | -------------------------------------------------------------------------------- /covid-frontend/src/components/common/Loader/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { Spin, Icon 
} from 'antd'; 4 | import cn from 'classnames'; 5 | 6 | import styles from './styles.module.scss'; 7 | 8 | class Loader extends PureComponent { 9 | static propTypes = { 10 | size: PropTypes.number, 11 | loading: PropTypes.bool, // works only when loader has children 12 | selfContained: PropTypes.bool, // works only when loader has no children 13 | fullSized: PropTypes.bool, // works only when loader has no children 14 | className: PropTypes.string, // works only when loader has no children 15 | } 16 | 17 | static defaultProps = { 18 | size: 24, 19 | selfContained: true, 20 | fullSized: false, 21 | className: '', 22 | loading: false 23 | } 24 | 25 | render() { 26 | const { children, loading, size, selfContained, fullSized, className } = this.props; 27 | 28 | const classNames = cn( 29 | { [styles.selfContained]: selfContained, 30 | [styles.fullSized]: fullSized }, 31 | className // passed as its own argument: inside the object literal, the shorthand would toggle the literal class "className" instead of appending the prop value 32 | ); 33 | 34 | if (children) { 35 | return ( 36 | {/* <Spin> markup stripped in extraction; its indicator is a loading icon sized via `size` */} 38 | spinning={loading} 39 | > 40 | { children } 41 | 42 | ); 43 | } 44 | 45 | return ( 46 | {/* markup stripped in extraction: the standalone spinner, a <div className={classNames}> around a <Spin /> */} 47 | 48 |
49 | ); 50 | } 51 | } 52 | 53 | export default Loader; 54 | -------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/_variables.scss: -------------------------------------------------------------------------------- 1 | // colors 2 | $primary: #081741; 3 | $dark: #333638; 4 | 5 | $contrast: #005da9; 6 | 7 | $accent: #70b2fc; 8 | $accent-light: #e7f2ff; 9 | $accent-light2: #d2ebff; 10 | $accent-light3: #b2d8f8; 11 | 12 | $success: #009c10; 13 | $success-light: #99d7a0; 14 | $success-light2: rgba(0, 156, 16, 0.1); 15 | 16 | $warning: #ed9700; 17 | $warning-light: #ffd387; 18 | $warning-light2: #fff4e0; 19 | 20 | $strong-emotion: #be0000; 21 | $error: #be0000; 22 | $error-light: #ff9c9c; 23 | $error-light2: #ffdede; 24 | 25 | 26 | $primary-grey: #a3a9ad; 27 | $secondary-grey: #878b90; 28 | $border-grey: #d3dae0; 29 | $light-border-grey: #e4e9ed; 30 | $light-grey: #f7f8fa; 31 | 32 | $white: #ffffff; 33 | 34 | // shadows 35 | $base-shadow: 0 1px 3px 0 rgba(113, 118, 122, 0.35); 36 | $reversed-shadow: 0 -3px 5px 0 rgba(0, 0, 0, 0.1); 37 | 38 | // letter-spacing 39 | $base-spacing: 0.65px; 40 | $secondary-spacing: 1px; 41 | 42 | // radiuses 43 | $base-radius: 6px; 44 | 45 | // transitions 46 | $base-transition-speed: .3s; 47 | $fast-transition-speed: .15s; 48 | 49 | // text sizes 50 | $text: ( 51 | tiny: ( 52 | font-size: 11px, 53 | line-height: 1 54 | ), 55 | small: ( 56 | font-size: 12px, 57 | line-height: 1 58 | ), 59 | regular: ( 60 | font-size: 14px, 61 | line-height: 1 62 | ), 63 | semiBig: ( 64 | font-size: 16px, 65 | line-height: 1 66 | ), 67 | big: ( 68 | font-size: 20px, 69 | line-height: 1 70 | ), 71 | huge: ( 72 | font-size: 24px, 73 | line-height: 1 74 | ), 75 | jumbo: ( 76 | font-size: 32px, 77 | line-height: 1 78 | ) 79 | ); 80 | -------------------------------------------------------------------------------- /covid-frontend/src/store/reducers/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/activeAnswers'; 2 | 3 | const initialState = { 4 | entries: [], 5 | 6 | isLoading: false, 7 | 8 | // the state of the answer-popup 9 | userFeedbackPopup: { 10 | visible: false, 11 | answerDocumentId: null 12 | }, 13 | 14 | // a list of answers which the user has already given his feedback to 15 | feedbackGiven: {} 16 | }; 17 | 18 | export default (state = initialState, action) => { 19 | switch (action.type) { 20 | case types.SET: 21 | return { 22 | ...state, 23 | entries: action.payload 24 | }; 25 | 26 | case types.SET_LOADING_STATUS: 27 | return { 28 | ...state, 29 | isLoading: action.status 30 | }; 31 | 32 | case types.SHOW_USER_FEEDBACK_PANEL: 33 | return { 34 | ...state, 35 | userFeedbackPopup: { ...state.userFeedbackPopup, visible: true, answerDocumentId: action.payload } 36 | }; 37 | 38 | case types.HIDE_USER_FEEDBACK_PANEL: 39 | return { 40 | ...state, 41 | userFeedbackPopup: { ...initialState.userFeedbackPopup } 42 | }; 43 | 44 | case types.MARK_AS_FEEDBACK_GIVEN: 45 | return { 46 | ...state, 47 | feedbackGiven: { ...state.feedbackGiven, ...action.payload } 48 | }; 49 | 50 | case types.CLEAR_FEEDBACK_GIVEN: 51 | return { 52 | ...state, 53 | feedbackGiven: { ...initialState.feedbackGiven } 54 | }; 55 | 56 | case types.RESET: 57 | return { 58 | ...initialState 59 | }; 60 | 61 | default: 62 | return state; 63 | }; 64 | }; 65 | -------------------------------------------------------------------------------- 
/telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/TelegramUpdate.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class TelegramUpdate( 7 | @SerializedName("message") 8 | val message: Message, 9 | @SerializedName("update_id") 10 | val updateId: Int // 102073005 11 | ) { 12 | data class Message( 13 | @SerializedName("chat") 14 | val chat: Chat, 15 | @SerializedName("date") 16 | val date: Int, // 1584880886 17 | @SerializedName("from") 18 | val from: From, 19 | @SerializedName("message_id") 20 | val messageId: Long, // 8 21 | @SerializedName("text") 22 | val text: String // Dbrhrfjggkjgj nfgntnt t 23 | ) { 24 | data class Chat( 25 | @SerializedName("first_name") 26 | val firstName: String, // theapache64 27 | @SerializedName("id") 28 | val id: Int, // 240810054 29 | @SerializedName("type") 30 | val type: String, // private 31 | @SerializedName("username") 32 | val username: String // theapache64 33 | ) 34 | 35 | data class From( 36 | @SerializedName("first_name") 37 | val firstName: String, // theapache64 38 | @SerializedName("id") 39 | val id: Int, // 240810054 40 | @SerializedName("is_bot") 41 | val isBot: Boolean, // false 42 | @SerializedName("language_code") 43 | val languageCode: String, // en 44 | @SerializedName("username") 45 | val username: String // theapache64 46 | ) 47 | } 48 | } -------------------------------------------------------------------------------- /covid-frontend/src/store/reducers/globalSearch.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/globalSearch'; 2 | 3 | const initialState = { 4 | selectedValue: '', 5 | search: { 6 | currentString: '', 7 | lastString: '', 8 | options: [], 9 | }, 10 | isLoading: false 11 | }; 12 | 13 | export default (state = initialState, action) => { 14 | switch (action.type) { 15 | case types.SET_SELECTED_VALUE: 16 | return { 17 | ...state, 18 | selectedValue: action.payload, 19 | search: { 20 | ...state.search, 21 | currentString: action.payload 22 | } 23 | }; 24 | case types.UPDATE_SEARCH_VALUE: 25 | return { 26 | ...state, 27 | search: { 28 | ...state.search, 29 | currentString: action.payload 30 | } 31 | }; 32 | case types.UPDATE_LAST_SEARCH_VALUE: 33 | return { 34 | ...state, 35 | search: { 36 | ...state.search, 37 | lastString: action.payload 38 | } 39 | }; 40 | case types.UPDATE_SEARCH_FILTERS: 41 | return { 42 | ...state, 43 | search: { 44 | ...state.search, 45 | filters: action.payload 46 | } 47 | }; 48 | case types.UPDATE_SEARCH_OPTIONS: 49 | return { 50 | ...state, 51 | search: { 52 | ...state.search, 53 | options: action.payload 54 | } 55 | }; 56 | case types.SET_LOADING_STATUS: 57 | return { 58 | ...state, 59 | isLoading: action.status 60 | }; 61 | case types.RESET: 62 | return { 63 | ...initialState 64 | }; 65 | default: 66 | return state; 67 | }; 68 | }; 69 | -------------------------------------------------------------------------------- /covid_nlp/language/ms_translate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, requests, uuid, json 3 | import sys 4 | 5 | import pandas as pd 6 | 7 | 8 | class MSTranslator(): 9 | def __init__(self, key = None, endpoint = None, lang = None): 10 | if key: 11 | self.azure_key = key 12 | else: 13 | self.azure_key = os.environ['AZURE_TRANSLATE_KEY'] 14 | 
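# note: unlike `key`, `endpoint` has no environment-variable fallback; passing
# endpoint=None would silently build an invalid "None/translate?..." URL below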
self.azure_endpoint = endpoint 15 | self.lang = lang 16 | self.url = f"{self.azure_endpoint}/translate?api-version=3.0&to={self.lang}" 17 | self.headers = { 18 | 'Ocp-Apim-Subscription-Key': self.azure_key, 19 | 'Content-type': 'application/json', 20 | 'X-ClientTraceId': str(uuid.uuid4()) 21 | } 22 | 23 | def translate(self, text): 24 | body = [{'text': text.strip()}] 25 | request = requests.post(self.url, headers = self.headers, json = body) 26 | response = request.json() 27 | trans_text = "" 28 | if len(response) > 0: 29 | trans_text = response[0]['translations'][0]['text'] 30 | return trans_text 31 | 32 | 33 | def main(): 34 | lang = "ar" 35 | azure_endpoint = "https://api.cognitive.microsofttranslator.com/" 36 | ms_translator = MSTranslator(endpoint = azure_endpoint, lang = lang) 37 | 38 | faq_file = "../../data/faqs/faq_covidbert.csv" 39 | df = pd.read_csv(faq_file) 40 | df[f'question_{lang}'] = df.apply(lambda x: ms_translator.translate(x.question), axis=1) 41 | df[f'answer_{lang}'] = df.apply(lambda x: ms_translator.translate(x.answer), axis=1) 42 | 43 | faq_filename = os.path.basename(faq_file) 44 | df.to_csv(f"MT_{lang}_{faq_filename}") 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /covid-frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "irda", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@ant-design/icons": "latest", 7 | "antd": "^3.26.13", 8 | "chart.js": "^2.9.3", 9 | "classnames": "^2.2.6", 10 | "env-cmd": "^9.0.3", 11 | "history": "^4.9.0", 12 | "i18next": "^19.3.3", 13 | "i18next-browser-languagedetector": "^4.0.2", 14 | "i18next-xhr-backend": "^3.2.2", 15 | "less": "^2.7.2", 16 | "less-plugin-clean-css": "^1.5.1", 17 | "node-sass": "^4.13.1", 18 | "prop-types": "^15.7.2", 19 | "react": "^16.13.1", 20 | "react-chartjs-2": "^2.9.0", 21 | "react-custom-scrollbars": "^4.2.1", 22 | "react-dom": "^16.13.1", 23 | "react-i18next": "^11.3.4", 24 | "react-redux": "^6.0.0", 25 | "react-router": "^4.3.1", 26 | "react-router-dom": "^5.0.0", 27 | "react-scripts": "^3.4.1", 28 | "redux": "^4.0.5", 29 | "redux-devtools-extension": "^2.13.8", 30 | "redux-saga": "^1.1.3", 31 | "uuid": "^3.4.0" 32 | }, 33 | "scripts": { 34 | "start": "react-scripts start", 35 | "build": "react-scripts build", 36 | "build:staging": "env-cmd -f .env.staging npm run build", 37 | "build:production": "env-cmd -f .env.production npm run build", 38 | "test": "react-scripts test", 39 | "eject": "react-scripts eject", 40 | "antd-theme": "lessc --clean-css src/assets/styles/antd/antd-theme.less src/assets/styles/antd/antd.min.css" 41 | }, 42 | "eslintConfig": { 43 | "extends": "react-app" 44 | }, 45 | "browserslist": { 46 | "production": [ 47 | ">0.2%", 48 | "not dead", 49 | "not op_mini all" 50 | ], 51 | "development": [ 52 | "last 1 chrome version", 53 | "last 1 firefox version", 54 | "last 1 safari version" 55 | ] 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /backend/controller/autocomplete.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from fastapi import APIRouter 4 | from pydantic import BaseModel 5 | 6 | import langid 7 | langid.set_languages(['de', 'en']) # ISO 639-1 codes 8 | 9 | # 10 | # not a good idea to work with global variables like this. 
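# (`api.elasticsearch_client` is module-level state resolved only at request time,
# which is what keeps this import from failing; a cleaner, hypothetical alternative
# would inject the client per request, e.g. `es = Depends(lambda: api.elasticsearch_client)`)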
11 | # 12 | from backend import api 13 | 14 | DB_INDEX_AUTOCOMPLETE = "autocomplete" 15 | 16 | router = APIRouter() 17 | 18 | 19 | class Request(BaseModel): 20 | search: str 21 | 22 | 23 | def addQuestionToAutocomplete(question: str): 24 | # todo: if it already exists; we need to increment count; 25 | body = { 26 | 'phrase': question, 27 | 'count' : 1 28 | } 29 | res = api.elasticsearch_client.index(index=DB_INDEX_AUTOCOMPLETE,body=body) 30 | 31 | 32 | 33 | 34 | @router.get("/query/autocomplete") 35 | def ask(search: str): 36 | interim = api.elasticsearch_client.search(index=DB_INDEX_AUTOCOMPLETE, body= 37 | { 38 | '_source':['phrase'], 39 | 'query':{ 40 | "bool": { 41 | "must": [{ 42 | "match": { 43 | "phrase": search 44 | } 45 | }, 46 | { 47 | "exists": { 48 | "field": "count" 49 | } 50 | }] 51 | } 52 | }, 53 | 'size': 10, 54 | 'sort' :[ 55 | {'count' : {'order' : 'desc' }} 56 | ] 57 | }) 58 | 59 | resultCount = len(interim['hits']['hits']) 60 | result = [] 61 | for i in range(resultCount): 62 | result.append(interim['hits']['hits'][i]['_source']['phrase']) 63 | 64 | 65 | lang, score = langid.classify(search) 66 | 67 | return { 68 | "results":result, 69 | "language": lang 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /covid-frontend/src/store/sagas/globalSearch.js: -------------------------------------------------------------------------------- 1 | import { all, put, select, takeLatest, delay } from 'redux-saga/effects'; 2 | import { message } from 'antd'; 3 | import * as api from 'store/sagas/api'; 4 | import * as types from 'store/types/globalSearch'; 5 | import * as actions from 'store/actions/globalSearch'; 6 | 7 | 8 | export function* getOptions(value) { 9 | const { currentString, lastString } = yield select(state => state.globalSearch.search); 10 | 11 | // return and reset fields if string is empty 12 | if (!currentString.length) { 13 | yield put(actions.updateSearchOptions([])); 14 | yield put(actions.updateSearchFilters({})); 15 | yield put(actions.updateLastSearchValue('')); 16 | 17 | return; 18 | } 19 | 20 | // return if options for the string already exist 21 | if (currentString.length && lastString.startsWith(currentString) && currentString.length <= lastString.length) { 22 | return; 23 | } 24 | 25 | yield put(actions.setLoadingStatus(true)); 26 | 27 | try { 28 | yield put(actions.updateLastSearchValue(value)); 29 | yield delay(400); 30 | const data = yield api.get(`/query/autocomplete`, { search: currentString }); 31 | let i = 0; 32 | 33 | // filter duplicates 34 | let results = data.results; 35 | results = results.filter((v,i) => results.indexOf(v) === i) 36 | 37 | const searchResults = results.map(question =>{ 38 | return {question, id: i++ }; 39 | }); 40 | 41 | yield put(actions.updateSearchOptions(searchResults)); 42 | yield put(actions.updateSearchFilters({language:data.language})); 43 | 44 | } catch (error) { 45 | message.error(error.message); 46 | } 47 | yield put(actions.setLoadingStatus(false)); 48 | } 49 | 50 | export default function* () { 51 | yield all([ 52 | takeLatest(types.UPDATE_SEARCH_VALUE, ({ payload }) => getOptions(payload)), 53 | ]); 54 | } 55 | -------------------------------------------------------------------------------- /covid_nlp/eval.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.metrics import roc_auc_score, f1_score 5 | from farm.utils import MLFlowLogger 6 | 7 | 8 | def 
eval_question_similarity(y_true, y_pred, lang, model_name, params, user=None, log_to_mlflow=True, run_name="default"): 9 | # basic metrics 10 | mean_diff = np.mean(np.abs(y_true - y_pred)) 11 | roc_auc = roc_auc_score(y_true, y_pred) 12 | f1 = f1_score(y_true, y_pred.round(0)) 13 | metrics = {"roc_auc": roc_auc, "mean_abs_diff": mean_diff, "f1_score": f1} 14 | print(metrics) 15 | 16 | # log experiment results to MLFlow (visit https://public-mlflow.deepset.ai/) 17 | if log_to_mlflow: 18 | params["lang"] = lang 19 | params["model_name"] = model_name 20 | if user: 21 | params["user"] = user 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="COVID-question-sim", run_name=run_name) 25 | ml_logger.log_params(params) 26 | ml_logger.log_metrics(metrics, step=0) 27 | 28 | 29 | if __name__ == "__main__": 30 | # config 31 | eval_file = "../data/eval_question_similarity_en.csv" 32 | lang = "en" 33 | model_name = "naive_baseline" 34 | experiment_name = "naive_baseline_1" 35 | log_to_mlflow = True 36 | params = {"some_model_param": 0} 37 | 38 | # load eval data 39 | df = pd.read_csv(eval_file) 40 | 41 | # predict similarity of samples (e.g. via embeddings + cosine similarity) 42 | # here: dummy preds for naive baseline 43 | y_true = df["similar"].values 44 | y_pred = [0.5] * len(y_true) 45 | 46 | # eval & track results 47 | eval_question_similarity(y_true=y_true, y_pred=y_pred, lang=lang, model_name=model_name, 48 | params=params, user="malte", log_to_mlflow=log_to_mlflow, run_name=experiment_name) 49 | 50 | 51 | -------------------------------------------------------------------------------- /datasources/automatic/scraper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from scrapy.crawler import CrawlerProcess 4 | from datasources.automatic.testing_WHO_scraper import CovidScraper 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def scrape(url): 10 | # try to extract question and answer for each url 11 | questions, answers = "q","a" # do scraping here 12 | return questions, answers 13 | 14 | 15 | ########## TESTING CODE 16 | RESULTS = [] 17 | class Pipeline(object): 18 | def process_item(self, item, spider): 19 | df = pd.DataFrame.from_dict(item) 20 | RESULTS.append(df) 21 | 22 | def get_test_data(): 23 | # Code for getting the test set of questions and answers 24 | process = CrawlerProcess({ 25 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)', 26 | 'ITEM_PIPELINES': {'__main__.Pipeline': 1} 27 | }) 28 | process.crawl( 29 | CovidScraper) # uses the WHO manual scraper with version fixed through waybackmachine (see import above) 30 | process.start() 31 | dataframe = pd.concat(RESULTS) 32 | questions_truth = dataframe.question 33 | answers_truth = dataframe.answer 34 | return questions_truth,answers_truth 35 | ######### END TESTING CODE 36 | 37 | if __name__ == "__main__": 38 | logging.disable(logging.WARNING) 39 | questions_truth, answers_truth = get_test_data() 40 | print(questions_truth) 41 | 42 | # for the intelligent scraper, a fixed version of WHO website is used so results coming back from get_test_data can be fixed 43 | #urls = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 44 | urls = ["https://web.archive.org/web/20200331131108/https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 45 | questions_auto, answers_auto = scrape(urls) 46 | 47 | # check weather questions_truth is similar to 
questions_auto, 48 | # and answers_truth similar to answers_auto 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /covid-frontend/src/components/Home/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { bindActionCreators } from 'redux'; 5 | import { Row, Col } from 'antd'; 6 | import links from 'routes/links'; 7 | import * as actions from 'store/actions/globalSearch'; 8 | import { WrappedSearchForm as SearchForm } from './SearchForm'; 9 | import logo from 'assets/images/logo.png'; 10 | import styles from './styles.module.scss'; 11 | 12 | class Home extends PureComponent { 13 | 14 | static propTypes = { 15 | history: PropTypes.object, 16 | globalSearch: PropTypes.object, 17 | actions: PropTypes.object 18 | } 19 | 20 | handleSubmit = (value) => { 21 | this.props.actions.setSelectedValue(value); 22 | this.props.history.push(links.answers); 23 | } 24 | 25 | render() { 26 | const { currentString, options, filters } = this.props.globalSearch.search; 27 | return ( 28 |
29 | {/* markup stripped in extraction: the Home page layout -- an antd Row/Col grid with the logo image and the SearchForm wired to currentString, options, filters and handleSubmit */} 49 |
50 | ); 51 | } 52 | } 53 | 54 | export default connect( 55 | state => ({ 56 | globalSearch: state.globalSearch 57 | }), 58 | dispatch => ({ 59 | actions: bindActionCreators(actions, dispatch) 60 | }) 61 | )(Home); 62 | -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/tfidf_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import re 4 | import pickle 5 | import os 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 10 | from preprocess import Preprocessor 11 | 12 | class TfidfTrainer(): 13 | 14 | def __init__(self, instream = None): 15 | self.preprocessor = Preprocessor(instream = instream) 16 | self.feature_vectors = None 17 | self.vectorizer = None 18 | 19 | def preprocess_corpus(self, corpus = None): 20 | if corpus: 21 | pcorpus = self.preprocessor.preprocess_sp(corpus) 22 | else: 23 | pcorpus = self.preprocessor.sentencepiece_apply(self.preprocessor.corpus) 24 | return pcorpus 25 | 26 | def train_model(self, corpus): 27 | # creating vocabulary using uni-gram and bi-gram 28 | self.vectorizer = TfidfVectorizer(min_df=2, max_df=.95, ngram_range=(1, 2)) 29 | self.vectorizer.fit(corpus) # fit the vectorizer with the list of texts 30 | self.feature_vectors = self.vectorizer.transform(corpus) # list of tfidf vectors 31 | 32 | def save_model(self, prefix = "./tfidf_"): 33 | with open(f"{prefix}feature_vectors.pkl", 'wb') as outfile: 34 | pickle.dump(self.feature_vectors, outfile) 35 | 36 | with open(f"{prefix}vectorizer.pkl", 'wb') as outfile: 37 | pickle.dump(self.vectorizer, outfile) 38 | 39 | def load_model(self, prefix = "./tfidf_"): 40 | with open(f"{prefix}feature_vectors.pkl", 'rb') as infile: 41 | self.feature_vectors = pickle.load(infile) 42 | 43 | with open(f"{prefix}vectorizer.pkl", 'rb') as infile: 44 | self.vectorizer = pickle.load(infile) 45 | 46 | 47 | def main(): 48 | trainer = TfidfTrainer() 49 | corpus = trainer.preprocess_corpus() 50 | trainer.train_model(corpus) 51 | trainer.save_model() 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /backend/config.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | 4 | # Resources / Computation 5 | USE_GPU = os.getenv("USE_GPU", "True").lower() == "true" 6 | MAX_PROCESSES = int(os.getenv("MAX_PROCESSES", 4)) 7 | BATCHSIZE = int(os.getenv("BATCHSIZE", 50)) 8 | 9 | # Monitoring 10 | APM_SERVER = "http://localhost:8200" 11 | 12 | # Reader 13 | READER_MODEL_PATH = os.getenv("READER_MODEL_PATH", None) 14 | CONTEXT_WINDOW_SIZE = int(os.getenv("CONTEXT_WINDOW_SIZE", 500)) 15 | DEFAULT_TOP_K_READER = int(os.getenv("DEFAULT_TOP_K_READER", 5)) 16 | TOP_K_PER_CANDIDATE = int(os.getenv("TOP_K_PER_CANDIDATE", 3)) 17 | NO_ANS_BOOST = int(os.getenv("NO_ANS_BOOST", -10)) 18 | DOC_STRIDE = int(os.getenv("DOC_STRIDE", 128)) 19 | MAX_SEQ_LEN = int(os.getenv("MAX_SEQ_LEN", 256)) 20 | 21 | # Retriever 22 | DEFAULT_TOP_K_RETRIEVER = int(os.getenv("DEFAULT_TOP_K_RETRIEVER", 10)) 23 | EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "deepset/sentence_bert") 24 | EMBEDDING_POOLING_STRATEGY = os.getenv("EMBEDDING_POOLING_STRATEGY", "reduce_mean") 25 | EMBEDDING_EXTRACTION_LAYER = int(os.getenv("EMBEDDING_EXTRACTION_LAYER", -2)) 26 | 27 | # Database access 28 | DB_HOST = 
os.getenv("DB_HOST", "localhost") 29 | DB_USER = os.getenv("DB_USER", "") 30 | DB_PW = os.getenv("DB_PW", "") 31 | DB_INDEX = os.getenv("DB_INDEX", "document") 32 | DB_INDEX_FEEDBACK = os.getenv("DB_INDEX_FEEDBACK", "feedback") 33 | ES_CONN_SCHEME = os.getenv("ES_CONN_SCHEME", "http") 34 | TEXT_FIELD_NAME = os.getenv("TEXT_FIELD_NAME", "answer") 35 | SEARCH_FIELD_NAME = os.getenv("SEARCH_FIELD_NAME", "question") 36 | EMBEDDING_FIELD_NAME = os.getenv("EMBEDDING_FIELD_NAME", "question_emb") 37 | EMBEDDING_DIM = os.getenv("EMBEDDING_DIM", None) 38 | 39 | EXCLUDE_META_DATA_FIELDS = os.getenv("EXCLUDE_META_DATA_FIELDS", "['question_emb']") 40 | if EXCLUDE_META_DATA_FIELDS: 41 | EXCLUDE_META_DATA_FIELDS = ast.literal_eval(EXCLUDE_META_DATA_FIELDS) 42 | 43 | # SIL language detection API 44 | SIL_API_KEY=os.getenv("SIL_API_KEY", "") 45 | SIL_API_SECRET=os.getenv("SIL_API_SECRET", "") 46 | SIL_API_URL=os.getenv("SIL_API_URL", "https://langdetect.apis.sil.org/langdetect") 47 | -------------------------------------------------------------------------------- /covid-frontend/src/components/Home/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../assets/styles/_variables'; 2 | @import '../../assets/styles/_mixins'; 3 | 4 | .content { 5 | align-content: center; 6 | flex-direction: column; 7 | display: flex; 8 | justify-content: center; 9 | padding: 0 0 120px; 10 | min-height: 100vh; 11 | } 12 | 13 | .form { 14 | :global(.ant-form-explain) { 15 | margin-top: 0; 16 | } 17 | 18 | :global(.ant-btn-lg) { 19 | padding: 10px 15px; 20 | } 21 | } 22 | 23 | .logoWrapper { 24 | align-items: center; 25 | justify-content: space-around; 26 | display: flex; 27 | margin: 0 16px 80px; 28 | } 29 | 30 | .logo { 31 | position: relative; 32 | width: 250px; 33 | 34 | img { 35 | display: block; 36 | height: 100%; 37 | object-fit: contain; 38 | width: 100%; 39 | } 40 | } 41 | 42 | .poweredBy { 43 | text-align: right; 44 | color: $secondary-grey; 45 | span { 46 | color:#d80808 !important; 47 | } 48 | a { 49 | text-decoration: none; 50 | 51 | &:hover { 52 | color: $strong-emotion; 53 | text-decoration: underline; 54 | } 55 | } 56 | img { 57 | width: 300px; 58 | } 59 | p { 60 | text-align: right; 61 | } 62 | div { 63 | text-align: center; 64 | margin-top: 2em; 65 | img { 66 | height: 70px; 67 | } 68 | } 69 | } 70 | 71 | 72 | .autocomplete { 73 | width: 100%; 74 | 75 | :global(.ant-select-selection) { 76 | background-color: $accent-light; 77 | border-color: $accent-light; 78 | 79 | &::-webkit-input-placeholder { /* Edge */ 80 | color: $accent; 81 | } 82 | 83 | &:-ms-input-placeholder { /* Internet Explorer 10-11 */ 84 | color: $accent; 85 | } 86 | 87 | &::placeholder { 88 | color: $accent; 89 | } 90 | } 91 | } 92 | 93 | .detectedLanguage { 94 | text-align: right; 95 | color: $primary-grey; 96 | } 97 | .projectLogo { 98 | padding-top:50px; 99 | width:100%; 100 | a,img { 101 | width:180px; 102 | margin:0 auto; 103 | display: block; 104 | } 105 | 106 | } 107 | -------------------------------------------------------------------------------- /backend/api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import uvicorn 4 | from elasticapm.contrib.starlette import make_apm_client, ElasticAPM 5 | from elasticsearch import Elasticsearch 6 | from fastapi import FastAPI, HTTPException 7 | from starlette.middleware.cors import CORSMiddleware 8 | 9 | from backend.config import DB_HOST, DB_USER, DB_PW, APM_SERVER 10 | 
from backend.controller.errors.http_error import http_error_handler 11 | from backend.controller.router import router as api_router 12 | # from backend.events.fastapi import create_start_app_handler, create_stop_app_handler 13 | 14 | logging.basicConfig(format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p") 15 | logger = logging.getLogger(__name__) 16 | logging.getLogger("elasticsearch").setLevel(logging.WARNING) 17 | 18 | elasticsearch_client = Elasticsearch( 19 | hosts=[{"host": DB_HOST}], http_auth=(DB_USER, DB_PW), scheme="http", ca_certs=False, verify_certs=False 20 | ) 21 | 22 | 23 | def get_application() -> FastAPI: 24 | application = FastAPI(title="Haystack API", debug=True, version="0.1") 25 | 26 | application.add_middleware( 27 | CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], 28 | ) 29 | apm_config = {"SERVICE_NAME": "covid-backend", "SERVER_URL": APM_SERVER, "CAPTURE_BODY": "all"} 30 | elasticapm = make_apm_client(apm_config) 31 | application.add_middleware(ElasticAPM, client=elasticapm) 32 | 33 | application.add_exception_handler(HTTPException, http_error_handler) 34 | # application.add_event_handler("startup", create_start_app_handler(application)) 35 | # application.add_event_handler("shutdown", create_stop_app_handler(application)) 36 | 37 | application.include_router(api_router) 38 | 39 | return application 40 | 41 | 42 | app = get_application() 43 | 44 | logger.info("Open http://127.0.0.1:8000/docs to see Swagger API Documentation.") 45 | logger.info( 46 | """ 47 | Or just try it out directly: curl --request POST --url 'http://127.0.0.1:8000/models/1/faq-qa' --data '{"questions": ["What are symptoms?"]}' 48 | """ 49 | ) 50 | 51 | if __name__ == "__main__": 52 | uvicorn.run(app, host="0.0.0.0", port=8000) 53 | -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/tfidf_client.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import pickle 4 | import os 5 | import json 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 10 | import pandas as pd 11 | 12 | from preprocess import Preprocessor 13 | from tfidf_train import TfidfTrainer 14 | 15 | sys.path.insert(0, "./../../") 16 | from eval import eval_question_similarity 17 | 18 | 19 | class TfidfEvaluator(): 20 | def __init__(self): 21 | self.model = TfidfTrainer(instream = "dummy") 22 | self.model.load_model() 23 | 24 | def process_string(self, mystring): 25 | corpus = self.model.preprocess_corpus([mystring]) 26 | corpus_vectors = self.model.vectorizer.transform([corpus[0]]) 27 | return corpus_vectors 28 | 29 | def find_best_matches(self, cos_list, top_n = 10): 30 | cos_list_enumerated = [ (i, cos_sim) for i, cos_sim in enumerate(cos_list) ] 31 | cos_list_enumerated.sort(key=lambda x:x[1], reverse=True) 32 | return cos_list_enumerated[:top_n] 33 | 34 | def score_string_pair(self, string1, string2): 35 | vec1 = self.process_string(string1) 36 | vec2 = self.process_string(string2) 37 | cos_sim = cosine_similarity(vec1, vec2) 38 | return cos_sim[0][0] 39 | 40 | def main(): 41 | evaluator = TfidfEvaluator() 42 | 43 | eval_file = "../../../data/eval_question_similarity_en.csv" 44 | df = pd.read_csv(eval_file) 45 | # predict similarity of samples (e.g. 
via embeddings + cosine similarity) 46 | df['pred'] = df.apply(lambda x: evaluator.score_string_pair(x.question_1, x.question_2), axis=1) 47 | y_true = df["similar"].values 48 | y_pred = df["pred"].values 49 | 50 | model_name = "tfidf_baseline" 51 | exp_name = "tfidf_cos_sim_2" 52 | params = {"sp_voc": 16000, "max_ngram": 2, "remove_stopwords": 1, 53 | "data_train": "eval, scraped", "data_sp": "eval, scraped, CORD-19.200k"} 54 | eval_question_similarity(y_true=y_true, y_pred=y_pred, lang="en", model_name=model_name, 55 | params=params, user="carmen", log_to_mlflow=True, run_name=exp_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/TelegramAPI.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.utils 2 | 3 | import com.theapache64.cs.models.rest.telegram.AnswerCallbackRequest 4 | import com.theapache64.cs.models.rest.telegram.SendChatActionRequest 5 | import com.theapache64.cs.models.rest.telegram.SendMessageRequest 6 | import com.theapache64.cs.models.rest.telegram.SendMessageResponse 7 | import java.io.IOException 8 | 9 | object TelegramAPI { 10 | 11 | private const val BASE_URL = "https://api.telegram.org" 12 | 13 | 14 | /** 15 | * To send a text with Markdown 16 | */ 17 | @Throws(IOException::class) 18 | fun sendHtmlMessage( 19 | from: String, 20 | to: String, 21 | message: String, 22 | replyMsgId: Long?, 23 | replayMarkup: SendMessageRequest.ReplyMarkup? 24 | ): SendMessageResponse { 25 | 26 | val url = "$BASE_URL/bot$from/sendMessage" 27 | 28 | val response = RestClient.post( 29 | url, 30 | null, 31 | SendMessageRequest( 32 | to, 33 | message, 34 | true, 35 | "HTML", 36 | replyMsgId, 37 | replayMarkup 38 | ) 39 | ) 40 | 41 | val respJsonString = response.body!!.string() 42 | if (response.code != 200) { 43 | throw IOException("Failed to send message '$message' -> $respJsonString") 44 | } 45 | return GsonUtil.gson.fromJson(respJsonString, SendMessageResponse::class.java) 46 | } 47 | 48 | fun answerCallbackQuery( 49 | from: String, 50 | id: String 51 | ) { 52 | val url = "$BASE_URL/bot$from/answerCallbackQuery" 53 | val resp = RestClient.post( 54 | url, 55 | null, 56 | AnswerCallbackRequest(id) 57 | ).body!!.string() 58 | 59 | } 60 | 61 | fun sendChatAction( 62 | from: String, 63 | chatId: String, 64 | action: String 65 | ) { 66 | val url = "$BASE_URL/bot$from/sendChatAction" 67 | val resp = RestClient.post( 68 | url, 69 | null, 70 | SendChatActionRequest( 71 | action, 72 | chatId 73 | ) 74 | ).body!!.string() 75 | 76 | } 77 | } -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/RestClient.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.utils 2 | 3 | import okhttp3.OkHttpClient 4 | import okhttp3.Request 5 | import okhttp3.RequestBody.Companion.toRequestBody 6 | import okhttp3.Response 7 | import java.util.concurrent.TimeUnit 8 | 9 | object RestClient { 10 | 11 | fun get(url: String, headers: Map? 
= null): Response { 12 | return call("GET", url, headers, null) 13 | } 14 | 15 | private fun getNewOkHttpClient(): OkHttpClient { 16 | return OkHttpClient.Builder() 17 | .connectTimeout(30, TimeUnit.SECONDS) 18 | .readTimeout(30, TimeUnit.SECONDS) 19 | .writeTimeout(30, TimeUnit.SECONDS) 20 | .followRedirects(true) 21 | .followSslRedirects(true) 22 | .build() 23 | } 24 | 25 | private fun call(method: String, url: String, headers: Map?, body: Any?): Response { 26 | 27 | 28 | val request = Request.Builder() 29 | .url(url) 30 | 31 | if (body != null) { 32 | val json = GsonUtil.gson.toJson(body) 33 | 34 | println("$method : $url -> $json") 35 | 36 | request.addHeader("Content-Type", "application/json") 37 | request.method(method, json.toRequestBody()) 38 | } else { 39 | request.method(method, null) 40 | } 41 | 42 | if (headers != null) { 43 | for (header in headers) { 44 | request.addHeader(header.key, header.value) 45 | } 46 | } 47 | 48 | request.addHeader( 49 | "User-Agent", 50 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" 51 | ) 52 | 53 | return getNewOkHttpClient().newCall(request.build()).execute() 54 | } 55 | 56 | fun post(url: String, headers: Map?, body: Any): Response { 57 | return call( 58 | "POST", 59 | url, 60 | headers, 61 | body 62 | ) 63 | } 64 | 65 | fun put( 66 | url: String, 67 | headers: Map, 68 | body: Any 69 | ): Response { 70 | return call( 71 | "PUT", 72 | url, 73 | headers, 74 | body 75 | ) 76 | } 77 | } -------------------------------------------------------------------------------- /datasources/scrapers/RKI_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider RKI_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'rki_spyder' 11 | start_urls = ['https://www.rki.de/SharedDocs/FAQ/NCOV2019/FAQ_Liste.html'] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | for x in response.xpath('//div[@class="alt-accordion-box-box"]/@id').extract(): 30 | question_text = response.xpath(str('//*[@id="' + x + '"]/h2/text()')).extract()[0] 31 | answer_text = " ".join(response.xpath(str('//*[@id="' + x + '"]/div/p')).xpath('string()').extract()) 32 | answer_html = " ".join(response.xpath(str('//*[@id="' + x + '"]/div/p')).extract()) 33 | 34 | columns['question'].append(question_text) 35 | columns['answer'].append(answer_text) 36 | columns['answer_html'].append(answer_html) 37 | 38 | today = date.today() 39 | 40 | columns["link"] = ["https://www.rki.de/SharedDocs/FAQ/NCOV2019/FAQ_Liste.html"] * len(columns["question"]) 41 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 42 | columns["source"] = ["Robert Koch Institute (RKI)"] * len(columns["question"]) 43 | columns["category"] = [""] * len(columns["question"]) 44 | columns["country"] = ["DE"] * len(columns["question"]) 45 | columns["region"] = [""] * len(columns["question"]) 46 | columns["city"] = [""] * len(columns["question"]) 47 | columns["lang"] = ["de"] * len(columns["question"]) 48 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 49 | 50 | return columns 
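    # parse returns one dict of parallel lists (one entry per scraped FAQ question),
    # which scrapy accepts as a single item; a hypothetical way to consume it:
    #   pd.DataFrame(columns).to_csv("rki_faq.csv", index=False)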
51 | 52 | 53 | if __name__ == "__main__": 54 | process = CrawlerProcess({ 55 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 56 | }) 57 | process.crawl(CovidScraper) 58 | process.start() 59 | -------------------------------------------------------------------------------- /datasources/scrapers/GOV_pl_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider GOV_pl_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'polish_GOV_spyder' 11 | start_urls = ['https://www.gov.pl/web/koronawirus/pytania-i-odpowiedzi'] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | for x in range(0, len(response.xpath('//summary/text()').extract())): 30 | question_text = response.xpath('//summary/text()').extract()[x] 31 | answer_text = "".join(response.xpath( 32 | '//summary[text()="' + question_text + '"]/following-sibling::node()/descendant-or-self::text()').extract()) 33 | answer_html = "".join( 34 | response.xpath('//summary[text()="' + question_text + '"]/following-sibling::node()').extract()) 35 | 36 | columns['question'].append(question_text) 37 | columns['answer'].append(answer_text) 38 | columns['answer_html'].append(answer_html) 39 | 40 | today = date.today() 41 | 42 | columns["link"] = ["https://www.gov.pl/web/koronawirus/pytania-i-odpowiedzi"] * len(columns["question"]) 43 | columns["name"] = ["Pytania i odpowiedzi (COVID-19)"] * len(columns["question"]) 44 | columns["source"] = ["GOV Polska"] * len(columns["question"]) 45 | columns["category"] = [""] * len(columns["question"]) 46 | columns["country"] = ["PL"] * len(columns["question"]) 47 | columns["region"] = [""] * len(columns["question"]) 48 | columns["city"] = [""] * len(columns["question"]) 49 | columns["lang"] = ["pl"] * len(columns["question"]) 50 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 51 | 52 | return columns 53 | 54 | 55 | if __name__ == "__main__": 56 | process = CrawlerProcess({ 57 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 58 | }) 59 | process.crawl(CovidScraper) 60 | process.start() 61 | -------------------------------------------------------------------------------- /covid-frontend/src/core/api/index.js: -------------------------------------------------------------------------------- 1 | import { baseUrl } from 'core/constants/env'; 2 | 3 | class Api { 4 | token = null; 5 | headers = { 6 | 'Content-Type': 'application/json' 7 | } 8 | 9 | setAuthorization (token) { 10 | this.token = token; 11 | 12 | return this; 13 | } 14 | 15 | setHeaders (headers) { 16 | Object.getOwnPropertyNames(headers).forEach(key => { 17 | if (headers[key]) { 18 | this.headers[key] = headers[key]; 19 | } else { 20 | delete this.headers[key]; 21 | } 22 | }); 23 | 24 | return this; 25 | } 26 | 27 | get (url, query = null) { 28 | return this.call(url, 'GET', query); 29 | } 30 | 31 | post (url, query = null, body = null) { 32 | return this.call(url, 'POST', body ? query : null, body || query); 33 | } 34 | 35 | put (url, query = null, body = null) { 36 | return this.call(url, 'PUT', body ? 
query : null, body || query); 37 | } 38 | 39 | del (url, query = null) { 40 | return this.call(url, 'DELETE', query); 41 | } 42 | 43 | call (url, method, query = null, body = null) { 44 | const queryString = 45 | Object.keys(query || {}) 46 | .map(key => { 47 | let value = query[key]; 48 | 49 | if (typeof value === 'object' && value !== null) { 50 | value = JSON.stringify(value); 51 | } 52 | 53 | return `${key}=${encodeURIComponent(value)}`; 54 | }) 55 | .join('&'); 56 | 57 | let options = { 58 | method, 59 | headers: { 60 | ...this.headers, 61 | // 'Authorization': this.token 62 | } 63 | }; 64 | 65 | if (body) { 66 | options.body = body.constructor.name !== 'FormData' 67 | ? JSON.stringify(body) 68 | : body; 69 | } 70 | 71 | const urlString = `${baseUrl}${url}${queryString ? `?${queryString}` : ''}`; 72 | 73 | return fetch(urlString, options).then(response => { 74 | this.response = response; 75 | 76 | if (response.status >= 200 && response.status < 300) { 77 | return response.json(); 78 | } 79 | 80 | return response.json() 81 | .catch(() => { 82 | // if couldn't parse json 83 | throw new Error(`${response.status} - ${response.statusText}`); 84 | }) 85 | // if got a valid json response with error 86 | .then(error => { 87 | throw error; 88 | }); 89 | }); 90 | } 91 | } 92 | 93 | export default () => new Api(); 94 | -------------------------------------------------------------------------------- /covid_nlp/language/detect_language.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pycld2 as cld2 4 | import hmac 5 | from hashlib import sha1 6 | from time import time 7 | import requests 8 | 9 | class LanguageDetector(): 10 | def __init__(self, model = 'sil'): 11 | self.model = model 12 | 13 | def detect_lang_cld2(self, text): 14 | pred = cld2.detect(text)[2][0] 15 | return pred[1], float(pred[2]) 16 | 17 | def detect_lang_cld3(self, text): 18 | import cld3 # requires protobuf 19 | pred = cld3.get_language(text) 20 | return pred.language, 100*pred.probability 21 | 22 | def detect_lang_sil(self, text): 23 | algorithm = 'HMAC+SHA1' 24 | curr_time = str(int(time())) 25 | concat = curr_time+os.environ.get('SIL_API_KEY') 26 | concatB = (concat).encode('utf-8') 27 | secretB = os.environ.get('SIL_API_SECRET').encode('utf-8') 28 | h1 = hmac.new(secretB, concatB, sha1) 29 | api_sig = h1.hexdigest() 30 | params = {'api_key': os.environ.get('SIL_API_KEY'), 'api_sig': api_sig} 31 | headers = {'Content-Type': 'application/json'} 32 | r = requests.post(os.environ.get('SIL_API_URL'), json=[{"text": text}], 33 | headers=headers, params=params) 34 | return r.json()[0]['language'], 100*r.json()[0]['probability'] 35 | 36 | def detect_lang(self, text): 37 | if self.model == 'cld2': 38 | return self.detect_lang_cld2(text) 39 | if self.model == 'cld3': 40 | return self.detect_lang_cld3(text) 41 | if self.model == 'sil': 42 | return self.detect_lang_sil(text) 43 | 44 | def detect_freq_lang(self, text, n = 3): 45 | import cld3 # requires protobuf 46 | pred = cld3.get_frequent_languages(text, num_langs = n) 47 | pred_list = [ (p.language, 100*p.probability) for p in pred ] 48 | return pred_list 49 | 50 | 51 | def main(): 52 | my_text = "Was ist das Coronavirus?" 
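    # each detect_lang() variant returns a (language_code, confidence_in_percent)
    # tuple, so the German sample above should come back as roughly ('de', 99.0)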
53 | 54 | ld3 = LanguageDetector(model = 'cld3') 55 | ld3_result = ld3.detect_lang(my_text) 56 | print(f"cld3: {ld3_result}") 57 | ld3_top_results = ld3.detect_freq_lang(my_text, 4) 58 | print(f"cld3-freq: {ld3_top_results}") 59 | 60 | ld2 = LanguageDetector(model = 'cld2') 61 | ld2_result = ld2.detect_lang(my_text) 62 | print(f"cld2: {ld2_result}") 63 | 64 | ldsil = LanguageDetector(model = 'sil') 65 | ldsil_result = ldsil.detect_lang(my_text) 66 | print(f"sil: {ldsil_result}") 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /telegram-bot/gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS="-Xmx64m" 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /covid-frontend/src/components/UserFeedback/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { bindActionCreators } from 'redux'; 5 | import { Icon } from 'antd'; 6 | import * as answersActions from 'store/actions/activeAnswers'; 7 | import styles from './styles.module.scss'; 8 | import { withTranslation } from 'react-i18next'; 9 | 10 | class UserFeedback extends PureComponent { 11 | 12 | static propTypes = { 13 | globalSearch: PropTypes.object, 14 | answersActions: PropTypes.object 15 | } 16 | 17 | closeHandler = () => { 18 | this.props.answersActions.hideUserFeedbackPanel(); 19 | } 20 | 21 | onFeedbackNegative = (feedback, event) => { 22 | event.preventDefault() 23 | 24 | this.props.answersActions.markAsWrongAnswer({ 25 | question: this.props.globalSearch, 26 | answerDocumentId: this.props.answers.userFeedbackPopup && this.props.answers.userFeedbackPopup.answerDocumentId, 27 | feedback 28 | }); 29 | 30 | this.props.answersActions.hideUserFeedbackPanel(); 31 | 32 | return false; 33 | } 34 | 35 | 36 | render() { 37 | const { t } = this.props; 38 | 39 | return ( 40 |
<div className={styles.wrapper}> {/* the JSX tags of this render block (lines 40-60) were stripped during extraction; reconstructed here, element and class names are guesses */}
41 | <Icon type="close" onClick={this.closeHandler} />
42 | <div className={styles.title}>{t('feedback.title')}</div>
43 | <div className={styles.text}>{t('feedback.text')}</div>
44 | {/* the feedback option elements originally on lines 44-59 are not recoverable; each one invoked this.onFeedbackNegative with its feedback label, e.g. <a href="#" onClick={(event) => this.onFeedbackNegative('outdated', event)}>{t('feedback.outdated')}</a> */}
60 | </div>
61 | ); 62 | } 63 | } 64 | 65 | export default connect( 66 | state => ({ 67 | globalSearch: state.globalSearch, 68 | answers: state.activeAnswers 69 | }), 70 | dispatch => ({ 71 | answersActions: bindActionCreators(answersActions, dispatch) 72 | }) 73 | )(withTranslation()(UserFeedback)); 74 | -------------------------------------------------------------------------------- /covid-frontend/src/components/Answers/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import "../../assets/styles/_variables"; 2 | @import "../../assets/styles/_mixins"; 3 | 4 | .wrapper { 5 | background-color: $white; 6 | @include border(); 7 | box-shadow: $base-shadow; 8 | min-height: 680px; 9 | padding: 24px 24px 60px 24px; 10 | 11 | .autocomplete { 12 | width: 100%; 13 | 14 | :global(.ant-select-selection) { 15 | background-color: $accent-light; 16 | border-color: $accent-light; 17 | 18 | &::-webkit-input-placeholder { 19 | /* Edge */ 20 | color: $accent; 21 | } 22 | 23 | &:-ms-input-placeholder { 24 | /* Internet Explorer 10-11 */ 25 | color: $accent; 26 | } 27 | 28 | &::placeholder { 29 | color: $accent; 30 | } 31 | } 32 | } 33 | } 34 | 35 | .titleRow { 36 | margin-bottom: 48px; 37 | } 38 | 39 | .loaderContainer { 40 | padding: 60px 0; 41 | text-align: center; 42 | 43 | h2 { 44 | font-weight: 600; 45 | margin: 16px 0; 46 | } 47 | 48 | div { 49 | color: $primary-grey; 50 | font-weight: 600; 51 | @include text(semiBig); 52 | } 53 | } 54 | 55 | .topAnswerTitle { 56 | text-transform: uppercase; 57 | text-align: center; 58 | color: white; 59 | margin-bottom: 1em;; 60 | } 61 | 62 | .otherAnswersTitle { 63 | color: $primary-grey; 64 | margin: 56px 0 32px; 65 | text-transform: uppercase; 66 | letter-spacing: $secondary-spacing; 67 | @include text(tiny); 68 | } 69 | 70 | .answerTitle { 71 | font-weight: 700; 72 | margin-bottom: 16px; 73 | } 74 | 75 | .answerText { 76 | 77 | p { 78 | line-height: 1.3; 79 | max-width:800px; 80 | } 81 | 82 | span { 83 | color: $success; 84 | } 85 | } 86 | 87 | .answerMeta { 88 | padding:20px; 89 | float:right; 90 | background:#f8f8f8; 91 | margin: 8px 0; 92 | 93 | & > div { 94 | margin: 2px 0; 95 | } 96 | 97 | span { 98 | color: $primary-grey; 99 | display: inline-block; 100 | width: 80px; 101 | } 102 | } 103 | 104 | .answerDocLink { 105 | border: solid 1px; 106 | border-radius: 50%; 107 | display: inline-block; 108 | height: 22px; 109 | margin-left: 8px; 110 | text-align: center; 111 | vertical-align: middle; 112 | width: 22px; 113 | & > i { 114 | vertical-align: middle; 115 | } 116 | } 117 | 118 | .answerDocLinkPositive { 119 | @extend .answerDocLink; 120 | 121 | color: $success; 122 | } 123 | 124 | .answerDocLinkNegative { 125 | @extend .answerDocLink; 126 | 127 | color: $strong-emotion; 128 | } 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cover-photo 2 | 3 | This open source project serves two purposes. 4 | 1. Collection and evaluation of a Question Answering dataset to improve existing QA/search methods - **COVID-QA** 5 | 2. 
Question matching capabilities: Provide trustworthy answers to questions about COVID-19 via NLP - **outdated** 6 | 7 | # COVID-QA 8 | - Link to [COVID-QA Dataset](https://github.com/deepset-ai/COVID-QA/tree/master/data/question-answering/COVID-QA.json) 9 | - Accompanying paper on [OpenReview](https://openreview.net/forum?id=JENSKEEzsoU) 10 | - Annotation guidelines as [pdf](https://drive.google.com/file/d/1Wv3OIC0Z7ibHIzOm9Xw_r0gjTFmpl-33/view?usp=sharing) or [videos](https://www.youtube.com/playlist?list=PL0pJupneBHx4rkCtNmaXUs1q7SV7EjLED) 11 | - [deepset/roberta-base-squad2-covid](https://huggingface.co/deepset/roberta-base-squad2-covid), a QA model trained on COVID-QA 12 | 13 | **Update 14th April, 2020:** We are open sourcing the first batch of 14 | [SQuAD-style question answering annotations](https://github.com/deepset-ai/COVID-QA/tree/master/data/question-answering). 15 | Thanks to [Tony Reina](https://www.linkedin.com/in/skysurgery/) for managing the process and the 16 | many professional annotators who spent valuable time looking through Covid-related research papers. 17 | 18 | 19 | # FAQ matching 20 | **Update 17th June, 2020**: As the pandemic is thankfully slowing down and other information sources have caught up, we decided to take our hosted API and UI offline. We will keep the repository here as an inspiration for other projects and to share the COVID-QA dataset. 21 | 22 | ### :zap: Problem 23 | - People have many questions about COVID-19 24 | - Answers are scattered across different websites 25 | - Finding the right answers takes a lot of time 26 | - The trustworthiness of answers is hard to judge 27 | - Many answers become outdated quickly 28 | 29 | ### :bulb: Idea 30 | - Aggregate FAQs and texts from trustworthy data sources (WHO, CDC ...) 31 | - Provide a UI where people can ask questions 32 | - Use NLP to match incoming user questions with meaningful answers 33 | - Users can provide feedback about answers to improve the NLP model and flag outdated or wrong answers 34 | - Display the most common queries without good answers to guide data collection and model improvements 35 | 36 | ### :gear: Tech 37 | - Scrapers to collect data 38 | - Elasticsearch to store texts, FAQs, embeddings 39 | - NLP models implemented via [Haystack](https://github.com/deepset-ai/haystack/) to find answers by a) detecting similar questions in FAQs and b) detecting answers in free texts (extractive QA) 40 | - React Frontend 41 | 42 | -------------------------------------------------------------------------------- /datasources/scrapers/BMAS_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | from scrapy.crawler import CrawlerProcess 5 | 6 | 7 | class CovidScraper(scrapy.Spider): 8 | name = "BMAS_scraper" 9 | start_urls = ["https://www.bmas.de/DE/Presse/Meldungen/2020/corona-virus-arbeitsrechtliche-auswirkungen.html"] 10 | 11 | def parse(self, response): 12 | columns = { 13 | "question": [], 14 | "answer": [], 15 | "answer_html": [], 16 | "link": [], 17 | "name": [], 18 | "source": [], 19 | "category": [], 20 | "country": [], 21 | "region": [], 22 | "city": [], 23 | "lang": [], 24 | "last_update": [], 25 | } 26 | 27 | QUESTION_ANSWER_SELECTOR = ".panel" 28 | QUESTION_SELECTOR = ".collapsed ::text" 29 | ANSWER_SELECTOR = "./div[@id[starts-with(., 'collapse')]]" 30 | 31 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 32 | for question_answer in questions_answers: 33 | question = 
question_answer.css(QUESTION_SELECTOR).getall() 34 | question = " ".join(question).strip().replace("\xad", "") 35 | answer = question_answer.xpath(ANSWER_SELECTOR).css(" ::text").getall() 36 | answer = " ".join(answer).strip().replace("\xad", "") 37 | answer_html = question_answer.xpath(ANSWER_SELECTOR).get() 38 | 39 | # add question-answer pair to data dictionary 40 | columns["question"].append(question) 41 | columns["answer"].append(answer) 42 | columns["answer_html"].append(answer_html) 43 | 44 | today = date.today() 45 | 46 | columns["link"] = ["https://www.bmas.de/DE/Presse/Meldungen/2020/corona-virus-arbeitsrechtliche-auswirkungen.html"] * len(columns["question"]) 47 | columns["name"] = ["Arbeits- und arbeitsschutzrechtliche Fragen zum Coronavirus (SARS-CoV-2)"] * len(columns["question"]) 48 | columns["source"] = ["Bundesministerium für Arbeit und Soziales (BMAS)"] * len(columns["question"]) 49 | columns["category"] = [""] * len(columns["question"]) 50 | columns["country"] = ["DE"] * len(columns["question"]) 51 | columns["region"] = [""] * len(columns["question"]) 52 | columns["city"] = [""] * len(columns["question"]) 53 | columns["lang"] = ["de"] * len(columns["question"]) 54 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 55 | 56 | return columns 57 | 58 | 59 | if __name__ == "__main__": 60 | process = CrawlerProcess({ 61 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 62 | }) 63 | 64 | process.crawl(CovidScraper) 65 | process.start() 66 | -------------------------------------------------------------------------------- /datasources/automatic/testing_WHO_scraper.py: -------------------------------------------------------------------------------- 1 | ######### this scraper is exactly like scrapers/WHO_scraper.py, 2 | # but it uses a URL from the Wayback Machine, so the site doesn't change over time and crawling won't fail. 3 | # This is only for testing purposes. 4 | 5 | from datetime import date 6 | 7 | import scrapy 8 | 9 | 10 | class CovidScraper(scrapy.Spider): 11 | name = "WHO_scraper" 12 | #start_urls = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 13 | start_urls = ["https://web.archive.org/web/20200331131108/https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 14 | 15 | def parse(self, response): 16 | columns = { 17 | "question": [], 18 | "answer": [], 19 | "answer_html": [], 20 | "link": [], 21 | "name": [], 22 | "source": [], 23 | "category": [], 24 | "country": [], 25 | "region": [], 26 | "city": [], 27 | "lang": [], 28 | "last_update": [], 29 | } 30 | 31 | QUESTION_ANSWER_SELECTOR = ".sf-accordion__panel" 32 | QUESTION_SELECTOR = ".sf-accordion__link::text" 33 | ANSWER_SELECTOR = ".sf-accordion__content ::text" 34 | ANSWER_HTML_SELECTOR = ".sf-accordion__content" 35 | 36 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 37 | for question_answer in questions_answers: 38 | question = question_answer.css(QUESTION_SELECTOR).getall() 39 | question = " ".join(question).strip() 40 | answer = question_answer.css(ANSWER_SELECTOR).getall() 41 | answer = " ".join(answer).strip() 42 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 43 | answer_html = " ".join(answer_html).strip() 44 | 45 | # add question-answer pair to data dictionary 46 | columns["question"].append(question) 47 | columns["answer"].append(answer) 48 | columns["answer_html"].append(answer_html) 49 | 50 | today = date.today() 51 | 52 | columns["link"] = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] * 
len(columns["question"]) 53 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 54 | columns["source"] = ["World Health Organization (WHO)"] * len(columns["question"]) 55 | columns["category"] = [""] * len(columns["question"]) 56 | columns["country"] = [""] * len(columns["question"]) 57 | columns["region"] = [""] * len(columns["question"]) 58 | columns["city"] = [""] * len(columns["question"]) 59 | columns["lang"] = ["en"] * len(columns["question"]) 60 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 61 | 62 | return columns 63 | -------------------------------------------------------------------------------- /datasources/scrapers_unused/ZEIT_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider ZEIT_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | import scrapy 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = "ZEIT_faq_scraper" 8 | start_urls = ["https://www.zeit.de/wissen/gesundheit/2020-02/coronavirus-sars-cov-2-risiko-symptome-schutz-rechte-faq"] 9 | 10 | custom_settings = { 11 | 'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 12 | } 13 | 14 | def parse(self, response): 15 | columns = { 16 | "question" : [], 17 | "answer" : [], 18 | "answer_html" : [], 19 | "link" : [], 20 | "name" : [], 21 | "source" : [], 22 | "category" : [], 23 | "country" : [], 24 | "region" : [], 25 | "city" : [], 26 | "lang" : [], 27 | "last_update" : [], 28 | } 29 | 30 | QUESTION_ANSWER_SELECTOR = "div.article-page div[itemscope]:not(div[itemprop='acceptedAnswer'])" 31 | QUESTION_SELECTOR = ".article__subheading::text" 32 | ANSWER_SELECTOR = "p.paragraph.article__item ::text" 33 | ANSWER_HTML_SELECTOR = "p.paragraph.article__item" 34 | QUESTION_LINK_ID_SELECTOR = ".article__subheading" 35 | 36 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 37 | for question_answer in questions_answers: 38 | question = question_answer.css(QUESTION_SELECTOR).getall() 39 | question = " ".join(question).strip() 40 | answer = question_answer.css(ANSWER_SELECTOR).getall() 41 | answer = " ".join(answer).replace('\n', '').replace('\xa0', '').strip() 42 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 43 | answer_html = " ".join(answer_html).strip() 44 | link_id = question_answer.css(QUESTION_LINK_ID_SELECTOR)[0].root.attrib['id'] 45 | 46 | # add question-answer pair to data dictionary 47 | columns["question"].append(question) 48 | columns["answer"].append(answer) 49 | columns["answer_html"].append(answer_html) 50 | columns["link"].append("https://www.zeit.de/wissen/gesundheit/2020-02/coronavirus-sars-cov-2-risiko-symptome-schutz-rechte-faq#" + link_id) 51 | 52 | today = date.today() 53 | 54 | columns["name"] = ["Coronavirus Sars-CoV-2: Die wichtigsten Antworten zum Corona-Ausbruch"] * len(columns["question"]) 55 | columns["source"] = ["ZEIT ONLINE GmbH"] * len(columns["question"]) 56 | columns["category"] = [""] * len(columns["question"]) 57 | columns["country"] = [""] * len(columns["question"]) 58 | columns["region"] = [""] * len(columns["question"]) 59 | columns["city"] = [""] * len(columns["question"]) 60 | columns["lang"] = ["de"] * len(columns["question"]) 61 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 62 | 63 | return columns 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/preprocess.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import os 4 | 5 | import nltk 6 | from nltk import word_tokenize 7 | from nltk.corpus import stopwords 8 | nltk.download('stopwords') 9 | nltk.download('punkt') 10 | import string 11 | 12 | import sentencepiece as spm 13 | 14 | class Preprocessor(): 15 | 16 | def __init__(self, language = 'english', instream = None): 17 | self.language = language 18 | if instream: 19 | self.corpus_orig = self.read_string(instream) 20 | else: 21 | self.corpus_orig = self.read_articles(sys.stdin) 22 | self.corpus = self.preprocess(self.corpus_orig) 23 | 24 | def preprocess(self, corpus_list): 25 | preproc_corpus_list = [] 26 | question_words = set(['how', 'what', 'which', 'when', 'where', 'who', 'why']) 27 | stopset = stopwords.words(self.language) + list(string.punctuation) 28 | stopset = list(set(stopset) - question_words) 29 | for corpus in corpus_list: 30 | corpus = corpus.lower() 31 | corpus = " ".join([ i for i in word_tokenize(corpus) if i not in stopset ]) 32 | preproc_corpus_list.append(corpus) 33 | return preproc_corpus_list 34 | 35 | def preprocess_sp(self, corpus_list): 36 | return self.sentencepiece_apply(self.preprocess(corpus_list)) 37 | 38 | def sentencepiece_train(self, corpus_list, vocab_size = 24000): 39 | fp_out = open("./sp_corpus.txt", 'w') 40 | for corpus in corpus_list: 41 | print(corpus, file=fp_out) 42 | fp_out.close() 43 | spm.SentencePieceTrainer.Train(f"--input=sp_corpus.txt --model_prefix=sp_model --vocab_size={vocab_size} --max_sentence_length=1000 --character_coverage=1.0 --num_threads=4 --hard_vocab_limit=false") 44 | return None 45 | 46 | def sentencepiece_apply(self, corpus_list): 47 | sent_corpus_list = [] 48 | sp = spm.SentencePieceProcessor() 49 | sp.Load("./sp_model.model") 50 | for corpus in corpus_list: 51 | sent_corpus_list.append(" ".join(sp.EncodeAsPieces(corpus))) 52 | return sent_corpus_list 53 | 54 | def read_articles(self, fp): 55 | articles = [] 56 | for line in fp: 57 | if line.strip() != "": 58 | articles.append(line.strip()) 59 | return articles 60 | 61 | def read_string(self, mystring): 62 | articles = [mystring] 63 | return articles 64 | 65 | 66 | def main(): 67 | vocab_size = 24000 68 | if len(sys.argv) > 1: 69 | vocab_size = sys.argv[1] 70 | print("Create Preprocessor") 71 | preprocessor = Preprocessor(language = 'english') 72 | print("Train spm") 73 | preprocessor.sentencepiece_train(preprocessor.corpus, vocab_size = vocab_size) 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /datasources/scrapers/CDC_Water_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Water_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "CDC_Travel_Scraper" 11 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/php/water.html"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | found_question = False 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # in question 34 | if 
node.attrib.get("role") == "heading": 35 | found_question = True 36 | current_question = node.css("::text").get() 37 | continue 38 | 39 | # in answer 40 | if found_question and (node.attrib.get("class") == "collapse "): 41 | current_answer = node.css(" ::text").getall() 42 | current_answer = " ".join(current_answer).strip() 43 | current_answer_html = node.getall() 44 | current_answer_html = " ".join(current_answer_html).strip() 45 | 46 | columns["question"].append(current_question) 47 | columns["answer"].append(current_answer) 48 | columns["answer_html"].append(current_answer_html) 49 | else: 50 | found_question = False 51 | 52 | today = date.today() 53 | 54 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/php/water.html"] * len(columns["question"]) 55 | columns["name"] = ["Water Transmission and COVID-19"] * len(columns["question"]) 56 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 57 | columns["category"] = [""] * len(columns["question"]) 58 | columns["country"] = ["USA"] * len(columns["question"]) 59 | columns["region"] = [""] * len(columns["question"]) 60 | columns["city"] = [""] * len(columns["question"]) 61 | columns["lang"] = ["en"] * len(columns["question"]) 62 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 63 | 64 | return columns 65 | 66 | if __name__ == "__main__": 67 | process = CrawlerProcess({ 68 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 69 | }) 70 | 71 | process.crawl(CovidScraper) 72 | process.start() 73 | -------------------------------------------------------------------------------- /backend/data_ingestion.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from haystack import Finder 3 | from haystack.database.elasticsearch import ElasticsearchDocumentStore 4 | from haystack.retriever.elasticsearch import ElasticsearchRetriever 5 | 6 | def index_new_docs(document_store, retriever): 7 | # Get dataframe with questions, answers and some metadata 8 | df = pd.read_csv("data/faqs/faq_covidbert.csv") 9 | df.fillna(value="", inplace=True) 10 | 11 | # Index to ES 12 | if document_store.get_document_count() == 0: 13 | docs_to_index = [] 14 | for idx, row in df.iterrows(): 15 | d = row.to_dict() 16 | d = {k: v.strip() for k, v in d.items()} 17 | d["document_id"] = idx 18 | # add embedding 19 | question_embedding = retriever.create_embedding(row["question"]) 20 | d["question_emb"] = question_embedding 21 | docs_to_index.append(d) 22 | print(idx) 23 | document_store.write_documents(docs_to_index) 24 | 25 | 26 | def update_embeddings(document_store, retriever): 27 | #TODO move this upstream into haystack 28 | body = { 29 | "size": 10000, 30 | "query": { 31 | "match_all": {} 32 | }, 33 | "_source": {"includes":["question"]} 34 | 35 | } 36 | results = document_store.client.search(index=document_store.index, body=body, )["hits"]["hits"] 37 | # update embedding field 38 | for r in results: 39 | question_embedding = retriever.create_embedding(r["_source"]["question"]) 40 | 41 | body = { 42 | "doc" : { 43 | "question_emb": question_embedding 44 | } 45 | } 46 | document_store.client.update(index=document_store.index, id=r["_id"], body=body) 47 | 48 | 49 | if __name__=="__main__": 50 | 51 | document_store = ElasticsearchDocumentStore( 52 | host="localhost", 53 | username="", 54 | password="", 55 | index="document", 56 | text_field="answer", 57 | embedding_field="question_emb", 58 | embedding_dim=768, 
59 | excluded_meta_data=["question_emb"], 60 | ) 61 | 62 | MODEL = "deepset/sentence_bert" 63 | GPU = False 64 | 65 | retriever = ElasticsearchRetriever(document_store=document_store, embedding_model=MODEL, gpu=GPU, 66 | emb_extraction_layer=-2, pooling_strategy="reduce_mean") 67 | 68 | # index new docs 69 | index_new_docs(document_store, retriever) 70 | 71 | # or just update embeddings 72 | # update_embeddings(document_store, retriever) 73 | 74 | # test with a query 75 | finder = Finder(reader=None, retriever=retriever) 76 | prediction = finder.get_answers_via_similar_questions(question="How high is mortality?", top_k_retriever=10) 77 | for p in prediction["answers"]: 78 | print(p["question"]) 79 | -------------------------------------------------------------------------------- /datasources/scrapers/Bundesregierung_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = "Bundesregierung_scraper" 8 | start_urls = ["https://www.bundesregierung.de/breg-de/themen/coronavirus/ausbreitung-coronavirus-1716188"] 9 | 10 | def parse(self, response): 11 | columns = { 12 | "question": [], 13 | "answer": [], 14 | "answer_html": [], 15 | "link": [], 16 | "name": [], 17 | "source": [], 18 | "category": [], 19 | "country": [], 20 | "region": [], 21 | "city": [], 22 | "lang": [], 23 | "last_update": [], 24 | } 25 | 26 | QUESTION_ELEMENT_SELECTOR = "h2.mt-3" 27 | QUESTION_SELECTOR = "::text" 28 | 29 | questions = response.css(QUESTION_ELEMENT_SELECTOR) 30 | for question_elm in questions: 31 | question = question_elm.css(QUESTION_SELECTOR).getall() 32 | question = " ".join(question).strip() 33 | 34 | # all paragraphs till the next question header are considert to be the answer 35 | following_siblings = question_elm.xpath('following-sibling::*') 36 | answer = [] 37 | answer_html = [] 38 | for elm in following_siblings: 39 | if elm.root.tag == 'p' and 'navToTop' not in elm.root.classes: 40 | answer += elm.css("::text").getall() 41 | answer_html += [elm.get()] 42 | else: 43 | break 44 | answer = "".join(answer).replace('\n', '').strip() 45 | answer_html = " ".join(answer_html).strip() 46 | 47 | # add question-answer pair to data dictionary 48 | columns["question"].append(question) 49 | columns["answer"].append(answer) 50 | columns["answer_html"].append(answer_html) 51 | 52 | today = date.today() 53 | 54 | columns["link"] = [ 55 | "https://www.bundesregierung.de/breg-de/themen/coronavirus/ausbreitung-coronavirus-1716188"] * len( 56 | columns["question"]) 57 | columns["name"] = ["Wichtige Fragen und Antworten zum Coronavirus"] * len(columns["question"]) 58 | columns["source"] = ["Presse- und Informationsamt der Bundesregierung"] * len(columns["question"]) 59 | columns["category"] = [""] * len(columns["question"]) 60 | columns["country"] = ["DE"] * len(columns["question"]) 61 | columns["region"] = [""] * len(columns["question"]) 62 | columns["city"] = [""] * len(columns["question"]) 63 | columns["lang"] = ["de"] * len(columns["question"]) 64 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 65 | 66 | return columns 67 | -------------------------------------------------------------------------------- /datasources/scrapers/UNICEF_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = 
"UNICEF_scraper" 8 | start_urls = ["https://www.unicef.org/stories/novel-coronavirus-outbreak-what-parents-should-know"] 9 | 10 | def parse(self, response): 11 | columns = { 12 | "question": [], 13 | "answer": [], 14 | "answer_html": [], 15 | "link": [], 16 | "name": [], 17 | "source": [], 18 | "category": [], 19 | "country": [], 20 | "region": [], 21 | "city": [], 22 | "lang": [], 23 | "last_update": [], 24 | } 25 | 26 | QUESTION_ANSWER_SELECTOR = ".field .field--name-field-component-text-content" 27 | QUESTION_SELECTOR = "h4::text" 28 | ANSWER_SELECTOR = "p:not(p:contains('< Back')) ::text" 29 | ANSWER_HTML_SELECTOR = "p:not(p:contains('< Back'))" 30 | 31 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 32 | for question_answer in questions_answers: 33 | question = question_answer.css(QUESTION_SELECTOR).getall() 34 | question = " ".join(question).strip() 35 | answer = question_answer.css(ANSWER_SELECTOR).getall() 36 | answer = " ".join(answer).strip() 37 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 38 | answer_html = " ".join(answer_html).strip() 39 | 40 | # if no question, answer belongs to last question. ("How can I avoid the risk of infection?") 41 | if (question == ''): 42 | columns["answer"][-1] += ' ' + answer 43 | columns["answer_html"][-1] += ' ' + answer_html 44 | continue 45 | 46 | # add question-answer pair to data dictionary 47 | columns["question"].append(question) 48 | columns["answer"].append(answer) 49 | columns["answer_html"].append(answer_html) 50 | 51 | today = date.today() 52 | 53 | columns["link"] = ["https://www.unicef.org/stories/novel-coronavirus-outbreak-what-parents-should-know"] * len( 54 | columns["question"]) 55 | columns["name"] = ["Coronavirus disease (COVID-19): What parents should know"] * len(columns["question"]) 56 | columns["source"] = ["UNICEF"] * len(columns["question"]) 57 | columns["category"] = [""] * len(columns["question"]) 58 | columns["country"] = [""] * len(columns["question"]) 59 | columns["region"] = [""] * len(columns["question"]) 60 | columns["city"] = [""] * len(columns["question"]) 61 | columns["lang"] = ["en"] * len(columns["question"]) 62 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 63 | 64 | return columns 65 | -------------------------------------------------------------------------------- /datasources/scrapers/BAUA_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | from scrapy.crawler import CrawlerProcess 5 | 6 | 7 | class CovidScraper(scrapy.Spider): 8 | name = "BAUA_scraper" 9 | start_urls = ["https://www.baua.de/DE/Themen/Arbeitsgestaltung-im-Betrieb/Biostoffe/FAQ/FAQ_node.html"] 10 | 11 | def parse(self, response): 12 | columns = { 13 | "question": [], 14 | "answer": [], 15 | "answer_html": [], 16 | "link": [], 17 | "name": [], 18 | "source": [], 19 | "category": [], 20 | "country": [], 21 | "region": [], 22 | "city": [], 23 | "lang": [], 24 | "last_update": [], 25 | } 26 | QUESTIONS_SELECTOR = "//div[@class='tabs-container']/h2[@class='heading']" 27 | QUESTION_SELECTOR = " ::text" 28 | ANSWERS_SELECTOR = "//div[@class='tabs-container']/div" 29 | ANSWER_SELECTOR = "*::text" 30 | ANSWER_HTML_SELECTOR = "*" 31 | 32 | for q in response.xpath(QUESTIONS_SELECTOR): 33 | question = q.css(QUESTION_SELECTOR).getall() 34 | question = " ".join(question).strip() 35 | 36 | columns["question"].append(question) 37 | 38 | for a in response.xpath(ANSWERS_SELECTOR): 39 | answer 
= a.css(ANSWER_SELECTOR).getall() 40 | answer = " ".join(answer).strip() 41 | answer_html = a.css(ANSWER_HTML_SELECTOR).getall() 42 | answer_html = " ".join(answer_html).strip() 43 | 44 | columns["answer"].append(answer) 45 | columns["answer_html"].append(answer_html) 46 | 47 | today = date.today() 48 | 49 | columns["link"] = [ 50 | "https://www.baua.de/DE/Themen/Arbeitsgestaltung-im-Betrieb/Biostoffe/FAQ/FAQ_node.html"] * len( 51 | columns["question"]) 52 | columns["name"] = ["Antworten auf häufig gestellte Fragen zu beruflichen Tätigkeiten mit SARS-CoV-2"] * len( 53 | columns["question"]) 54 | columns["source"] = ["Bundesanstalt für Arbeitsschutz und Arbeitsmedizin (BAuA)"] * len(columns["question"]) 55 | columns["category"] = [""] * len(columns["question"]) 56 | columns["country"] = ["DE"] * len(columns["question"]) 57 | columns["region"] = [""] * len(columns["question"]) 58 | columns["city"] = [""] * len(columns["question"]) 59 | columns["lang"] = ["de"] * len(columns["question"]) 60 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 61 | 62 | return columns 63 | 64 | 65 | if __name__ == "__main__": 66 | process = CrawlerProcess({ 67 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 68 | }) 69 | 70 | process.crawl(CovidScraper) 71 | process.start() 72 | -------------------------------------------------------------------------------- /datasources/scrapers_outdated/BZgA_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider WHO_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "BZgA_scraper" 11 | start_urls = ["https://www.infektionsschutz.de/coronavirus/faqs-coronaviruscovid-19.html"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | QUESTION_ANSWER_SELECTOR = ".c-accordion__item" 30 | QUESTION_SELECTOR = ".c-accordion__button::text" 31 | ANSWER_SELECTOR = ".c-accordion__section ::text" 32 | ANSWER_HTML_SELECTOR = ".c-text" 33 | 34 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 35 | for question_answer in questions_answers: 36 | question = question_answer.css(QUESTION_SELECTOR).getall() 37 | question = " ".join(question).strip() 38 | answer = question_answer.css(ANSWER_SELECTOR).getall() 39 | answer = "".join(answer).strip() 40 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 41 | answer_html = " ".join(answer_html).strip() 42 | 43 | # add question-answer pair to data dictionary 44 | columns["question"].append(question) 45 | columns["answer"].append(answer) 46 | columns["answer_html"].append(answer_html) 47 | 48 | today = date.today() 49 | 50 | columns["link"] = ["https://www.infektionsschutz.de/coronavirus/faqs-coronaviruscovid-19.html"] * len( 51 | columns["question"]) 52 | columns["name"] = ["FAQs Coronavirus/Covid-19"] * len(columns["question"]) 53 | columns["source"] = ["Bundeszentrale für gesundheitliche Aufklärung (BZgA)"] * len(columns["question"]) 54 | columns["category"] = [""] * len(columns["question"]) 55 | columns["country"] = ["DE"] * len(columns["question"]) 56 | columns["region"] = [""] * len(columns["question"]) 57 | columns["city"] = 
[""] * len(columns["question"]) 58 | columns["lang"] = ["de"] * len(columns["question"]) 59 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 60 | 61 | return columns 62 | 63 | 64 | if __name__ == "__main__": 65 | process = CrawlerProcess({ 66 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 67 | }) 68 | 69 | process.crawl(CovidScraper) 70 | process.start() 71 | -------------------------------------------------------------------------------- /backend/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # static files generated from Django application using `collectstatic` 142 | media 143 | static 144 | -------------------------------------------------------------------------------- /datasources/scrapers/CDC_Travel_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Travel_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "CDC_Travel_Scraper" 10 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/travelers/faqs.html"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | current_category = "" 29 | 30 | all_nodes = response.xpath("//*") 31 | for node in all_nodes: 32 | # in category 33 | if node.attrib.get("class") == "onThisPageAnchor": 34 | current_category = node.attrib.get("title") 35 | 36 | # in category 37 | if current_category: 38 | # in question 39 | if node.attrib.get("role") == "heading": 40 | current_question = node.css("::text").get() 41 | 42 | # in answer 43 | if node.attrib.get("class") == "card-body": 44 | current_answer = node.css(" ::text").getall() 45 | current_answer = " ".join(current_answer).strip() 46 | current_answer_html = node.getall() 47 | current_answer_html = " ".join(current_answer_html).strip() 48 | 49 | # add question-answer-pair to data dictionary 50 | columns["question"].append(current_question) 51 | columns["answer"].append(current_answer) 52 | columns["answer_html"].append(current_answer_html) 53 | columns["category"].append(current_category) 54 | 55 | # end of FAQ 56 | if node.attrib.get("class") == "text-right mb-2": 57 | current_category = "" 58 | 59 | today = date.today() 60 | 61 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/travelers/faqs.html"] * len(columns["question"]) 62 | columns["name"] = ["Travel: Frequently Asked Questions and Answers"] * len(columns["question"]) 63 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 64 | columns["country"] = ["USA"] * len(columns["question"]) 65 | columns["region"] = [""] * len(columns["question"]) 66 | columns["city"] = [""] * len(columns["question"]) 67 | columns["lang"] = ["en"] * len(columns["question"]) 68 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 69 | 70 | return columns 71 | -------------------------------------------------------------------------------- /datasources/scrapers/Salute_IT_scraper.py: -------------------------------------------------------------------------------- 1 
| # run 'scrapy runspider Salute_IT_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "Salute_IT_Scraper" 11 | start_urls = ["https://www.salute.gov.it/portale/nuovocoronavirus/dettaglioFaqNuovoCoronavirus.jsp?id=228"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | # extract topics 30 | for x in response.xpath('//dl'): 31 | # question is in second strong object in dt 32 | question_list = [q.strip() for q in x.xpath('./dt/strong[2]/text()').extract()] 33 | # answer is in dd 34 | answer_html_list = [] 35 | answer_list = [] 36 | for a in x.xpath('./dd')[:-1]: 37 | answer_html_list.append(' '.join([h.strip() for h in a.xpath('./descendant-or-self::*').extract()])) 38 | answer_list.append(' '.join([t.strip() for t in a.xpath('./descendant-or-self::*/text()').extract()])) 39 | if len(question_list) == len(answer_list): 40 | for question_text, answer_text, answer_html in zip(question_list, answer_list, answer_html_list): 41 | columns["question"].append(question_text) 42 | columns["answer"].append(answer_text) 43 | columns["answer_html"].append(answer_html) 44 | 45 | today = date.today() 46 | 47 | columns["link"] = [ 48 | "https://www.salute.gov.it/portale/nuovocoronavirus/dettaglioFaqNuovoCoronavirus.jsp?id=228"] * len( 49 | columns["question"]) 50 | columns["name"] = ["FAQ - Covid-19, domande e risposte"] * len(columns["question"]) 51 | columns["source"] = ["Ministero della Salute, IT"] * len(columns["question"]) 52 | columns["category"] = [""] * len(columns["question"]) 53 | columns["country"] = ["IT"] * len(columns["question"]) 54 | columns["region"] = [""] * len(columns["question"]) 55 | columns["city"] = [""] * len(columns["question"]) 56 | columns["lang"] = ["it"] * len(columns["question"]) 57 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 58 | 59 | return columns 60 | 61 | 62 | if __name__ == "__main__": 63 | process = CrawlerProcess({ 64 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 65 | }) 66 | 67 | process.crawl(CovidScraper) 68 | process.start() 69 | -------------------------------------------------------------------------------- /covid-frontend/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: node:lts 2 | services: 3 | - docker:18.09-dind 4 | 5 | cache: 6 | key: ${CI_COMMIT_REF_SLUG} 7 | paths: 8 | - node_modules/ 9 | 10 | stages: 11 | - build 12 | - nginx 13 | - staging 14 | - production 15 | 16 | variables: 17 | DOCKER_HOST: tcp://localhost:2375 18 | 19 | build_app:staging: 20 | stage: build 21 | script: 22 | - npm install 23 | - npm run build:staging 24 | artifacts: 25 | paths: 26 | - build/ 27 | expire_in: 1 day 28 | except: 29 | - master 30 | 31 | build_app:production: 32 | stage: build 33 | script: 34 | - npm install 35 | - npm run build:production 36 | artifacts: 37 | paths: 38 | - build/ 39 | expire_in: 1 day 40 | only: 41 | - master 42 | 43 | build_nginx:staging: 44 | stage: nginx 45 | image: docker:18.09-dind 46 | cache: {} 47 | script: 48 | - docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN $CI_REGISTRY 49 | - docker build --pull -t $CI_REGISTRY_IMAGE:$CI_BUILD_REF . 
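# the image is tagged with the commit ref, so the staging/production
# deploy jobs below can patch the Deployment to this exact build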
50 | - docker push $CI_REGISTRY_IMAGE:$CI_BUILD_REF 51 | dependencies: 52 | - build_app:staging 53 | only: 54 | - develop 55 | 56 | build_nginx:production: 57 | stage: nginx 58 | image: docker:18.09-dind 59 | cache: {} 60 | script: 61 | - docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN $CI_REGISTRY 62 | - docker build --pull -t $CI_REGISTRY_IMAGE:$CI_BUILD_REF . 63 | - docker push $CI_REGISTRY_IMAGE:$CI_BUILD_REF 64 | dependencies: 65 | - build_app:production 66 | only: 67 | - master 68 | 69 | .kubectl: &kubectl 70 | - echo "$CERTIFICATE_AUTHORITY_DATA" > ca-auth.pem 71 | - kubectl config set-cluster deepannotate-eks --server="$SERVER" 72 | - kubectl config set-cluster deepannotate-eks --embed-certs --certificate-authority=ca-auth.pem 73 | - kubectl config set-credentials gitlab --token="$USER_TOKEN" 74 | - kubectl config set-context default --cluster=deepannotate-eks --user=gitlab 75 | - kubectl config use-context default 76 | 77 | .staging_deploy: &staging_deploy 78 | image: dtzar/helm-kubectl 79 | cache: {} 80 | stage: staging 81 | before_script: *kubectl 82 | script: 83 | - kubectl patch deployment covid-frontend -n covid-staging -p '{"spec":{"template":{"spec":{"containers":[{"name":"'"covid-frontend"'","image":"'"$CI_REGISTRY_IMAGE:$CI_BUILD_REF"'"}]}}}}' 84 | 85 | deploy to staging: 86 | <<: *staging_deploy 87 | only: 88 | - develop 89 | 90 | .production_deploy: &production_deploy 91 | image: dtzar/helm-kubectl 92 | cache: {} 93 | stage: production 94 | before_script: *kubectl 95 | script: 96 | - kubectl patch deployment covid-frontend -n covid-production -p '{"spec":{"template":{"spec":{"containers":[{"name":"'"covid-frontend"'","image":"'"$CI_REGISTRY_IMAGE:$CI_BUILD_REF"'"}]}}}}' 97 | 98 | deploy to production: 99 | <<: *production_deploy 100 | only: 101 | - master 102 | -------------------------------------------------------------------------------- /datasources/scrapers/BMWI_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider BMWI_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | import re 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'bmwi_spyder' 11 | start_urls = ['https://www.bmwi.de/Redaktion/DE/FAQ/Coronavirus/faq-coronavirus.html'] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | categoryName = "" 30 | question = "" 31 | for elementPath in response.xpath('//div[@class="content"]/div/child::node()'): 32 | tagName = elementPath.xpath('name()').get() 33 | if tagName == 'h2': 34 | categoryName = ' '.join(elementPath.xpath('.//text()').getall()).strip() 35 | if len(categoryName) == 0: 36 | continue 37 | if tagName == 'div': 38 | question = ' '.join(elementPath.xpath('.//h2//text()').getall()).strip() 39 | response = '' 40 | responsePath = elementPath.xpath('.//div[@class="accordion-body collapse"]//div[@class="rich-text"]') 41 | for path in responsePath.xpath('.//p|.//ul/li'): 42 | response += '\n\n' + ' '.join(path.xpath('.//text()').getall()) 43 | response = re.sub('\(Stand[^)]*\)', '', response).strip() 44 | columns['category'].append(categoryName) 45 | columns['question'].append(question) 46 | columns['answer'].append(response) 47 | 
columns['answer_html'].append(responsePath.get()) 48 | 49 | today = date.today() 50 | 51 | columns["link"] = ["https://www.bmwi.de/Redaktion/DE/FAQ/Coronavirus/faq-coronavirus.html"] * len(columns["question"]) 52 | columns["name"] = ["Coronavirus: Antworten auf häufig gestellte Fragen"] * len(columns["question"]) 53 | columns["source"] = ["Bundesministerium für Wirtschaft und Energie"] * len(columns["question"]) 54 | columns["country"] = ["DE"] * len(columns["question"]) 55 | columns["region"] = [""] * len(columns["question"]) 56 | columns["city"] = [""] * len(columns["question"]) 57 | columns["lang"] = ["de"] * len(columns["question"]) 58 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 59 | 60 | return columns 61 | 62 | 63 | if __name__ == "__main__": 64 | process = CrawlerProcess({ 65 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 66 | }) 67 | process.crawl(CovidScraper) 68 | process.start() 69 | -------------------------------------------------------------------------------- /datasources/scrapers_outdated/CDC_Individuals_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Individuals_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "CDC_Individuals_Scraper" 10 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/faq.html"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | current_category = "" 29 | 30 | all_nodes = response.xpath("//*") 31 | for node in all_nodes: 32 | # in category 33 | if node.attrib.get("class") == "card-header h4 bg-amber-t": 34 | current_category = node.css("::text").get() 35 | continue 36 | 37 | # in category 38 | if current_category: 39 | # in question 40 | if node.attrib.get("role") == "heading": 41 | current_question = node.css("::text").get() 42 | 43 | # in answer 44 | if node.attrib.get("class") == "card-body bg-gray-l3": 45 | current_answer = node.css(" ::text").getall() 46 | current_answer = " ".join(current_answer).strip() 47 | current_answer_html = node.getall() 48 | current_answer_html = " ".join(current_answer_html).strip() 49 | 50 | # add question-answer-pair to data dictionary 51 | columns["question"].append(current_question) 52 | columns["answer"].append(current_answer) 53 | columns["answer_html"].append(current_answer_html) 54 | columns["category"].append(current_category) 55 | 56 | # end of category 57 | if node.attrib.get("class") == "row": 58 | current_category = "" 59 | 60 | today = date.today() 61 | 62 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/faq.html"] * len(columns["question"]) 63 | columns["name"] = ["FAQs for Individuals and Families"] * len(columns["question"]) 64 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 65 | columns["country"] = ["USA"] * len(columns["question"]) 66 | columns["region"] = [""] * len(columns["question"]) 67 | columns["city"] = [""] * len(columns["question"]) 68 | columns["lang"] = ["en"] * len(columns["question"]) 69 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 70 | 71 | return columns 72 | 
-------------------------------------------------------------------------------- /covid_nlp/modeling/transformer/eval_pretrained_haystack.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.metrics import roc_auc_score 5 | from farm.utils import MLFlowLogger 6 | from haystack.retriever.elasticsearch import ElasticsearchRetriever 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | from covid_nlp.eval import eval_question_similarity 9 | 10 | def eval_pretrained_transformers(eval_file, lang, models, pooling_methods, extraction_layers): 11 | for model_name in models: 12 | for pooling_method in pooling_methods: 13 | for extraction_layer in extraction_layers: 14 | experiment_name = model_name 15 | log_to_mlflow = True 16 | params = {"pooling_method": pooling_method, 17 | "extraction_layer": extraction_layer} 18 | 19 | # load eval data 20 | df = pd.read_csv(eval_file) 21 | # predict similarity of samples (e.g. via embeddings + cosine similarity) 22 | # here: dummy preds for naive baseline 23 | y_true = df["similar"].values 24 | retriever = ElasticsearchRetriever(document_store=None, embedding_model=model_name, gpu=True) 25 | questions_1 = [{"text": v} for k, v in df["question_1"].to_dict().items()] 26 | questions_2 = [{"text": v} for k, v in df["question_2"].to_dict().items()] 27 | 28 | res1 = retriever.embedding_model.extract_vectors(dicts=questions_1, 29 | extraction_strategy=params["pooling_method"], 30 | extraction_layer=params["extraction_layer"]) 31 | 32 | res2 = retriever.embedding_model.extract_vectors(dicts=questions_2, 33 | extraction_strategy=params["pooling_method"], 34 | extraction_layer=params["extraction_layer"]) 35 | res1 = np.array([i["vec"] for i in res1]) 36 | res2 = np.array([i["vec"] for i in res2]) 37 | 38 | df["pred"] = np.diag(cosine_similarity(res1, res2)) 39 | 40 | # eval & track results 41 | eval_question_similarity(y_true=y_true, y_pred=df["pred"].values, lang=lang, model_name=model_name, 42 | params=params, user="malte", log_to_mlflow=log_to_mlflow, run_name=experiment_name) 43 | 44 | if __name__ == "__main__": 45 | eval_file = "../data/eval_question_similarity_en.csv" 46 | lang = "en" 47 | # models = ["deepset/sentence_bert","bert-base-uncased", "DeepPavlov/bert-base-multilingual-cased-sentence"] 48 | models = ["deepset/quora_dedup_bert_base"] 49 | pooling_methods = ["reduce_mean","cls_token","reduce_max"] 50 | extraction_layers = [-1, -2] 51 | eval_pretrained_transformers(eval_file, lang, models, pooling_methods, extraction_layers) 52 | -------------------------------------------------------------------------------- /datasources/scrapers/BVF_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | from scrapy.crawler import CrawlerProcess 5 | 6 | 7 | class CovidScraper(scrapy.Spider): 8 | name = "BVF_scraper" 9 | start_urls = [ 10 | "https://www.bvf.de/aktuelles/fachliche-meldungen/artikel/news/faq-fuer-schwangere-frauen-und-ihre-familien-zu-spezifischen-risiken-der-covid-19-virusinfektion/"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | QUESTIONS_SELECTOR = ".news-text-wrap h3::text" 28 | ANSWER_SELECTOR = " 
::text" 29 | ANSWER_HTML_SELECTOR = "*" 30 | 31 | for q in response.css(QUESTIONS_SELECTOR): 32 | question = q.get() 33 | answer = "" 34 | answer_html = "" 35 | for selector in response.xpath("//div/h3[contains(text(), '" + question + "')]/following-sibling::*"): 36 | if "h3" in selector.get(): 37 | break 38 | else: 39 | answer += " ".join(selector.css(ANSWER_SELECTOR).getall()).strip() + "\n" 40 | answer_html += " ".join(selector.css(ANSWER_HTML_SELECTOR).getall()).strip() 41 | 42 | columns['question'].append(question) 43 | columns['answer'].append(answer) 44 | columns['answer_html'].append(answer_html) 45 | 46 | today = date.today() 47 | 48 | columns["link"] = [ 49 | "https://www.bvf.de/aktuelles/fachliche-meldungen/artikel/news/faq-fuer-schwangere-frauen-und-ihre-familien-zu-spezifischen-risiken-der-covid-19-virusinfektion/"] * len( 50 | columns["question"]) 51 | columns["name"] = [ 52 | "FAQ für schwangere Frauen und ihre Familien zu spezifischen Risiken der COVID-19-Virusinfektion"] * len( 53 | columns["question"]) 54 | columns["source"] = ["Berufsverband der Frauenärzte (BvF)"] * len(columns["question"]) 55 | columns["category"] = [""] * len(columns["question"]) 56 | columns["country"] = ["DE"] * len(columns["question"]) 57 | columns["region"] = [""] * len(columns["question"]) 58 | columns["city"] = [""] * len(columns["question"]) 59 | columns["lang"] = ["de"] * len(columns["question"]) 60 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 61 | 62 | return columns 63 | 64 | 65 | if __name__ == "__main__": 66 | process = CrawlerProcess({ 67 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 68 | }) 69 | 70 | process.crawl(CovidScraper) 71 | process.start() 72 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/TelegramCallbackQuery.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class TelegramCallbackQuery( 7 | @SerializedName("callback_query") 8 | val callbackQuery: CallbackQuery, 9 | @SerializedName("update_id") 10 | val updateId: Int // 996097080 11 | ) { 12 | data class CallbackQuery( 13 | @SerializedName("chat_instance") 14 | val chatInstance: String, // -4027463488092007398 15 | @SerializedName("data") 16 | val `data`: String, // r123 17 | @SerializedName("from") 18 | val from: From, 19 | @SerializedName("id") 20 | val id: String, // 1034271309301426903 21 | @SerializedName("message") 22 | val message: Message 23 | ) { 24 | data class From( 25 | @SerializedName("first_name") 26 | val firstName: String, // theapache64 27 | @SerializedName("id") 28 | val id: Int, // 240810054 29 | @SerializedName("is_bot") 30 | val isBot: Boolean, // false 31 | @SerializedName("language_code") 32 | val languageCode: String, // en 33 | @SerializedName("username") 34 | val username: String // theapache64 35 | ) 36 | 37 | data class Message( 38 | @SerializedName("chat") 39 | val chat: Chat, 40 | @SerializedName("date") 41 | val date: Int, // 1584998447 42 | @SerializedName("from") 43 | val from: From, 44 | @SerializedName("message_id") 45 | val messageId: Long, // 61 46 | @SerializedName("reply_markup") 47 | val replyMarkup: ReplyMarkup, 48 | @SerializedName("text") 49 | val text: String // Was it helpful? 
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/TelegramCallbackQuery.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class TelegramCallbackQuery( 7 | @SerializedName("callback_query") 8 | val callbackQuery: CallbackQuery, 9 | @SerializedName("update_id") 10 | val updateId: Int // 996097080 11 | ) { 12 | data class CallbackQuery( 13 | @SerializedName("chat_instance") 14 | val chatInstance: String, // -4027463488092007398 15 | @SerializedName("data") 16 | val `data`: String, // r123 17 | @SerializedName("from") 18 | val from: From, 19 | @SerializedName("id") 20 | val id: String, // 1034271309301426903 21 | @SerializedName("message") 22 | val message: Message 23 | ) { 24 | data class From( 25 | @SerializedName("first_name") 26 | val firstName: String, // theapache64 27 | @SerializedName("id") 28 | val id: Int, // 240810054 29 | @SerializedName("is_bot") 30 | val isBot: Boolean, // false 31 | @SerializedName("language_code") 32 | val languageCode: String, // en 33 | @SerializedName("username") 34 | val username: String // theapache64 35 | ) 36 | 37 | data class Message( 38 | @SerializedName("chat") 39 | val chat: Chat, 40 | @SerializedName("date") 41 | val date: Int, // 1584998447 42 | @SerializedName("from") 43 | val from: From, 44 | @SerializedName("message_id") 45 | val messageId: Long, // 61 46 | @SerializedName("reply_markup") 47 | val replyMarkup: ReplyMarkup, 48 | @SerializedName("text") 49 | val text: String // Was it helpful? 😊 50 | ) { 51 | data class Chat( 52 | @SerializedName("first_name") 53 | val firstName: String, // theapache64 54 | @SerializedName("id") 55 | val id: Int, // 240810054 56 | @SerializedName("type") 57 | val type: String, // private 58 | @SerializedName("username") 59 | val username: String // theapache64 60 | ) 61 | 62 | data class From( 63 | @SerializedName("first_name") 64 | val firstName: String, // Corona Scholar - Dev 65 | @SerializedName("id") 66 | val id: Int, // 1119620721 67 | @SerializedName("is_bot") 68 | val isBot: Boolean, // true 69 | @SerializedName("username") 70 | val username: String // corona_scholar_dev_bot 71 | ) 72 | 73 | data class ReplyMarkup( 74 | @SerializedName("inline_keyboard") 75 | val inlineKeyboard: List<List<Any>> // rows of inline keyboard buttons; the type argument is an assumption, element type not modelled here 76 | ) 77 | } 78 | } 79 | } -------------------------------------------------------------------------------- /datasources/scrapers/Arbeitsagentur_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import scrapy 3 | import pandas as pd 4 | 5 | class CovidScraper(scrapy.Spider): 6 | name = "Arbeitsagentur_Scraper" 7 | start_urls = ["https://www.arbeitsagentur.de/corona-faq"] 8 | 9 | def parse(self, response): 10 | columns = { 11 | "question" : [], 12 | "answer" : [], 13 | "answer_html" : [], 14 | "link" : [], 15 | "name" : [], 16 | "source" : [], 17 | "category" : [], 18 | "country" : [], 19 | "region" : [], 20 | "city" : [], 21 | "lang" : [], 22 | "last_update" : [], 23 | } 24 | 25 | current_category = "" 26 | current_question = "" 27 | current_answer = "" 28 | current_answer_html = "" 29 | ba_content_article_count = 0 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | if node.attrib.get("class") == "ba-content-row": 34 | ba_content_article_count += 1 35 | # end of FAQ 36 | if ba_content_article_count == 4: 37 | break 38 | 39 | # in question 40 | if node.attrib.get("class") == "collapsed": 41 | # save previous question-answer pair 42 | if current_question: 43 | columns["question"].append(current_question) 44 | columns["answer"].append(current_answer) 45 | columns["answer_html"].append(current_answer_html) 46 | current_question = node.css("::text").get().strip() 47 | continue 48 | 49 | # in answer 50 | if node.attrib.get("class") == "ba-copytext": 51 | current_answer = node.css(" ::text").getall() 52 | current_answer = " ".join(current_answer).strip() 53 | current_answer_html = node.getall() 54 | current_answer_html = " ".join(current_answer_html).strip() 55 | continue 56 | 57 | 58 | 59 | columns["question"].append(current_question) 60 | columns["answer"].append(current_answer) 61 | columns["answer_html"].append(current_answer_html) 62 | 63 | today = date.today() 64 | 65 | columns["link"] = ["https://www.arbeitsagentur.de/corona-faq"] * len(columns["question"]) 66 | columns["name"] = ["FAQ: Corona-Virus"] * len(columns["question"]) 67 | columns["source"] = ["Bundesagentur für Arbeit"] * len(columns["question"]) 68 | columns["category"] = [""] * len(columns["question"]) 69 | columns["country"] = ["DE"] * len(columns["question"]) 70 | columns["region"] = [""] * len(columns["question"]) 71 | columns["city"] = [""] * len(columns["question"]) 72 | columns["lang"] = ["de"] * len(columns["question"]) 73 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 74 | 75 | return columns 76 |
-------------------------------------------------------------------------------- /datasources/scrapers_outdated/CDC_Children_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Children_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | import scrapy 5 | from scrapy.crawler import CrawlerProcess 6 | import pandas as pd 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "CDC_Children_Scraper" 10 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/children-faq.html"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | found_p = False 29 | found_question = False 30 | current_answer = "" 31 | 32 | categoryPaths = response.xpath('//div[@class="syndicate"]/div[@class="row "]') 33 | for catPath in categoryPaths: 34 | categoryName = catPath.xpath('.//h2/text()').getall() 35 | if len(categoryName) == 0: 36 | continue 37 | categoryName = categoryName[0] 38 | qnaPaths = catPath.xpath('.//div[@role="tablist"]//div[@class="card"]') 39 | for qnaPath in qnaPaths: 40 | question = qnaPath.xpath('.//span[@role="heading"]/text()').get() 41 | responseParagraphPaths = qnaPath.xpath('.//div[@class="card-body"]') 42 | answer = ""  # a fresh name, so the scrapy response object is not shadowed 43 | for respParaPath in responseParagraphPaths: 44 | answer += " ".join(respParaPath.xpath('.//text()').getall()) + "\n\n" 45 | answer = answer.strip() 46 | columns["question"].append(question) 47 | columns["answer"].append(answer) 48 | columns["answer_html"].append(" ".join(responseParagraphPaths.getall())) 49 | 50 | today = date.today() 51 | 52 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/children-faq.html"] * len( 53 | columns["question"]) 54 | columns["name"] = ["Coronavirus Disease-2019 (COVID-19) and Children"] * len(columns["question"]) 55 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 56 | columns["category"] = ["Children"] * len(columns["question"]) 57 | columns["country"] = ["USA"] * len(columns["question"]) 58 | columns["region"] = [""] * len(columns["question"]) 59 | columns["city"] = [""] * len(columns["question"]) 60 | columns["lang"] = ["en"] * len(columns["question"]) 61 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 62 | 63 | return columns 64 | 65 | if __name__ == "__main__": 66 | process = CrawlerProcess({ 67 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 68 | }) 69 | 70 | process.crawl(CovidScraper) 71 | process.start() 72 |
-------------------------------------------------------------------------------- /covid-frontend/src/store/sagas/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import { all, put, select, takeLatest } from 'redux-saga/effects'; 2 | import { message } from 'antd'; 3 | import * as api from 'store/sagas/api'; 4 | import * as globalSearchTypes from 'store/types/globalSearch'; 5 | import * as types from 'store/types/activeAnswers'; 6 | import * as actions from 'store/actions/activeAnswers'; 7 | const MODEL_ID = 1; 8 | 9 | export function* get() { 10 | const { selectedValue } = yield select(state => state.globalSearch); 11 | 12 | // reset active answers and return if no question is selected 13 | if (!selectedValue) { 14 | yield put(actions.set([])); 15 | 16 | return; 17 | } 18 | 19 | yield put(actions.setLoadingStatus(true)); 20 | try { 21 | const question = selectedValue; 22 | 23 | const query = { 24 | questions: [ question ], 25 | top_k_retriever: 5, 26 | }; 27 | 28 | const data = yield api.post(`/question/ask`, null, query); 29 | 30 | const answers = data.results[0].answers; 31 | yield put(actions.set(answers)); 32 | 33 | // reset the feedbackGiven on each search 34 | yield put(actions.clearFeedbackGiven()); 35 | 36 | } catch (error) { 37 | message.error(error.message); 38 | } 39 | yield put(actions.setLoadingStatus(false)); 40 | } 41 | 42 | export function* markAsCorrectAnswer({ question, answerDocumentId }) { 43 | if (!question.selectedValue || answerDocumentId <= 0) { 44 | // do nothing 45 | return; 46 | } 47 | const id = parseInt(answerDocumentId, 10); 48 | try { 49 | const requestbody = { 50 | question: question.selectedValue, 51 | answer: '', 52 | feedback: 'relevant', 53 | document_id: id 54 | }; 55 | yield api.post(`/models/${MODEL_ID}/feedback`, null, requestbody); 56 | } catch (error) { 57 | message.error(error.message); 58 | } 59 | 60 | yield put(actions.markAsFeedbackGiven({ [answerDocumentId]: 'relevant' })); 61 | message.success('Thanks for giving us feedback.'); 62 | } 63 | 64 | export function* markAsWrongAnswer({ question, answerDocumentId, feedback }) { 65 | if (!question.selectedValue || answerDocumentId <= 0) { 66 | // do nothing 67 | return; 68 | } 69 | try { 70 | const id = parseInt(answerDocumentId, 10); 71 | 72 | const requestbody = { 73 | question: question.selectedValue, 74 | answer: '', 75 | feedback, 76 | document_id: id 77 | }; 78 | yield api.post(`/models/${MODEL_ID}/feedback`, null, requestbody); 79 | 80 | } catch (error) { 81 | message.error(error.message); 82 | } 83 | 84 | yield put(actions.markAsFeedbackGiven({ [answerDocumentId]: feedback })); 85 | 86 | // the popup already said 'thank you' 87 | // message.success('Thanks for giving us feedback.') 88 | } 89 | 90 | export default function* () { 91 | yield all([ 92 | takeLatest([types.GET, globalSearchTypes.SET_SELECTED_VALUE], get), 93 | takeLatest([types.MARK_AS_CORRECT_ANSWER], ({ payload }) => markAsCorrectAnswer(payload)), 94 | takeLatest([types.MARK_AS_WRONG_ANSWER], ({ payload }) => markAsWrongAnswer(payload)), 95 | ]); 96 | } 97 |
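For reference, the feedback request the two sagas above send can be reproduced outside the frontend. A minimal sketch with Python requests (the host is an assumption; the path, model id 1 and the body shape come from the saga):

import requests

API = "http://localhost:8000"  # assumed backend host
payload = {
    "question": "How does COVID-19 spread?",
    "answer": "",
    "feedback": "relevant",  # markAsWrongAnswer sends its own feedback value here
    "document_id": 123,
}
resp = requests.post(f"{API}/models/1/feedback", json=payload, timeout=10)
resp.raise_for_status()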
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | #model_name = 'bert-base-nli-stsb-mean-tokens' 23 | model_name = "../saved_models" 24 | train_batch_size = 32 25 | num_epochs = 4 26 | model_save_path = 'output/quora_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 27 | sts_reader = STSDataReader('../data/quora', normalize_scores=True, s1_col_idx=4, s2_col_idx=5, score_col_idx=6, max_score=1) 28 | 29 | # Load a pre-trained sentence transformer model 30 | model = SentenceTransformer(model_name) 31 | 32 | # Convert the dataset to a DataLoader ready for training 33 | logging.info("Read Quora train dataset") 34 | train_data = SentencesDataset(sts_reader.get_examples('train.csv'), model) 35 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 36 | train_loss = losses.CosineSimilarityLoss(model=model) 37 | 38 | 39 | logging.info("Read Quora dev dataset") 40 | dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.csv'), model=model) 41 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 42 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 43 | 44 | 45 | # Configure the training. 
46 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) # 10% of the total training steps are used for warm-up 47 | logging.info("Warmup-steps: {}".format(warmup_steps)) 48 | 49 | 50 | # Train the model 51 | model.fit(train_objectives=[(train_dataloader, train_loss)], 52 | evaluator=evaluator, 53 | epochs=num_epochs, 54 | evaluation_steps=1000, 55 | warmup_steps=warmup_steps, 56 | output_path=model_save_path) 57 | 58 | 59 | ############################################################################## 60 | # 61 | # Load the stored model and evaluate its performance on STS benchmark dataset 62 | # 63 | ############################################################################## 64 | # 65 | # model = SentenceTransformer(model_save_path) 66 | # test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 67 | # test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 68 | # evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 69 | # model.evaluate(evaluator) 70 |
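The warm-up formula above equals 10% of the total optimizer steps (steps per epoch times epochs). A worked example with an assumed corpus size:

import math

train_examples = 100_000  # assumed size, for illustration only
num_epochs, batch_size = 4, 32

total_steps = math.ceil(train_examples / batch_size) * num_epochs        # 3125 * 4 = 12500
warmup_steps = math.ceil(train_examples * num_epochs / batch_size * 0.1) # 1250 = 10% of 12500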
-------------------------------------------------------------------------------- /datasources/scrapers/CDC_General_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_General_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "CDC_Scraper" 11 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/faq.html"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | current_category = "" 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # in category 34 | if node.attrib.get("class") == "onThisPageAnchor": 35 | current_category = node.attrib["title"] 36 | continue 37 | 38 | # within the current category 39 | if current_category: 40 | # in question 41 | if node.attrib.get("role") == "heading": 42 | current_question = node.css("::text").get() 43 | 44 | # in answer 45 | if node.attrib.get("class") == "card-body": 46 | current_answer = node.css(" ::text").getall() 47 | current_answer = " ".join(current_answer).strip() 48 | current_answer_html = node.getall() 49 | current_answer_html = " ".join(current_answer_html).strip() 50 | 51 | # add question-answer-pair to data dictionary 52 | columns["question"].append(current_question) 53 | columns["answer"].append(current_answer) 54 | columns["answer_html"].append(current_answer_html) 55 | columns["category"].append(current_category) 56 | 57 | # end of category 58 | if node.attrib.get("class") == "row": 59 | current_category = "" 60 | 61 | today = date.today() 62 | 63 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/faq.html"] * len(columns["question"]) 64 | columns["name"] = ["CDC General FAQ"] * len(columns["question"]) 65 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 66 | columns["country"] = ["USA"] * len(columns["question"]) 67 | columns["region"] = [""] * len(columns["question"]) 68 | columns["city"] = [""] * len(columns["question"]) 69 | columns["lang"] = ["en"] * len(columns["question"]) 70 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 71 | 72 | return columns 73 | 74 | 75 | 76 | if __name__ == "__main__": 77 | process = CrawlerProcess({ 78 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 79 | }) 80 | process.crawl(CovidScraper) 81 | process.start() 82 | -------------------------------------------------------------------------------- /datasources/scrapers/IHK_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import scrapy 3 | import pandas as pd 4 | 5 | class CovidScraper(scrapy.Spider): 6 | name = "IHK_Scraper" 7 | start_urls = ["https://www.dihk.de/de/aktuelles-und-presse/coronavirus/faq-19594"] 8 | 9 | def parse(self, response): 10 | columns = { 11 | "question" : [], 12 | "answer" : [], 13 | "answer_html" : [], 14 | "link" : [], 15 | "name" : [], 16 | "source" : [], 17 | "category" : [], 18 | "country" : [], 19 | "region" : [], 20 | "city" : [], 21 | "lang" : [], 22 | "last_update" : [], 23 | } 24 | 25 | current_category = "" 26 | current_question = "" 27 | current_answer = "" 28 | current_answer_html = "" 29 | question_answer_pair = False 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # save previous question-answer pair 34 | if question_answer_pair: 35 | columns["question"].append(current_question) 36 | columns["answer"].append(current_answer) 37 | columns["answer_html"].append(current_answer_html) 38 | columns["category"].append(current_category) 39 | question_answer_pair = False 40 | 41 | # in category 42 | if node.attrib.get("class") == "accordion__headline": 43 | current_category = node.css("::text").get() 44 | continue 45 | 46 | if current_category: 47 | # in question 48 | if node.attrib.get("class") == "accordion__btn-inner": 49 | current_question = node.css("::text").get() 50 | continue 51 | 52 | # in answer 53 | if current_question and (node.attrib.get("class") == "rte__content"): 54 | current_answer = node.css(" ::text").getall() 55 | current_answer = " ".join(current_answer).strip() 56 | current_answer_html = node.getall() 57 | current_answer_html = " ".join(current_answer_html).strip() 58 | question_answer_pair = True 59 | continue 60 | 61 | # end of FAQ 62 | if node.attrib.get("class") == "u-area is-area-cols-2 is-auto-height is-low-margin is-mobile-full": 63 | break 64 | 65 | today = date.today() 66 | 67 | columns["link"] = ["https://www.dihk.de/de/aktuelles-und-presse/coronavirus/faq-19594"] * len(columns["question"]) 68 | columns["name"] = ["FAQ zum Coronavirus"] * len(columns["question"])  # assumed page title for the DIHK FAQ scraped above 69 | columns["source"] = ["Deutscher Industrie- und Handelskammertag (DIHK)"] * len(columns["question"]) 70 | columns["country"] = ["DE"] * len(columns["question"]) 71 | columns["region"] = [""] * len(columns["question"]) 72 | columns["city"] = [""] * len(columns["question"]) 73 | columns["lang"] = ["de"] * len(columns["question"]) 74 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 75 | 76 | return columns 77 |
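Every scraper in this repo fills its metadata columns with value * len(columns["question"]), so the dict must stay rectangular: one missed append and the exported rows shift. A small self-check (a sketch, not part of the repo) catches that early:

def assert_rectangular(columns: dict) -> None:
    # every value is a list; all lists must end up the same length
    lengths = {key: len(value) for key, value in columns.items()}
    if len(set(lengths.values())) != 1:
        raise ValueError(f"ragged columns dict: {lengths}")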
-------------------------------------------------------------------------------- /datasources/scrapers/ECDC_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider ECDC_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "ECDCS_scraper" 11 | start_urls = ["https://www.ecdc.europa.eu/en/novel-coronavirus-china/questions-answers"] 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | 29 | # Scraper idea: find the question headings; every paragraph until the next heading belongs to that question's answer 30 | 31 | QUESTION_ELEMENT_SELECTOR = ".ct--view-30 .text-image h3" 32 | QUESTION_SELECTOR = "::text" 33 | 34 | questions = response.css(QUESTION_ELEMENT_SELECTOR) 35 | for question_elm in questions: 36 | question = question_elm.css(QUESTION_SELECTOR).getall() 37 | question = " ".join(question).replace('\xa0', ' ') 38 | # strip the leading enumeration such as '1.' (single-digit only; see the regex sketch below) 39 | question = question[2:] 40 | question = question.strip() 41 | 42 | # all paragraphs until the next question header are considered to be the answer 43 | following_siblings = question_elm.xpath('following-sibling::*') 44 | answer = [] 45 | answer_html = [] 46 | for elm in following_siblings: 47 | if elm.root.tag == 'h3': break  # stop at the next question heading instead of skipping it 48 | answer += elm.css("::text").getall() 49 | answer_html += [elm.get()] 50 | answer = "".join(answer).replace('\xa0', ' ').strip() 51 | answer_html = " ".join(answer_html).strip() 52 | 53 | # add question-answer pair to data dictionary 54 | columns["question"].append(question) 55 | columns["answer"].append(answer) 56 | columns["answer_html"].append(answer_html) 57 | 58 | today = date.today() 59 | 60 | columns["link"] = ["https://www.ecdc.europa.eu/en/novel-coronavirus-china/questions-answers"] * len( 61 | columns["question"]) 62 | columns["name"] = ["Q & A on COVID-19"] * len(columns["question"]) 63 | columns["source"] = ["European Centre for Disease Prevention and Control"] * len(columns["question"]) 64 | columns["category"] = [""] * len(columns["question"]) 65 | columns["country"] = [""] * len(columns["question"]) 66 | columns["region"] = [""] * len(columns["question"]) 67 | columns["city"] = [""] * len(columns["question"]) 68 | columns["lang"] = ["en"] * len(columns["question"]) 69 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 70 | 71 | 72 | return columns 73 | 74 | 75 | if __name__ == "__main__": 76 | process = CrawlerProcess({ 77 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 78 | }) 79 | 80 | process.crawl(CovidScraper) 81 | process.start() 82 |
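question[2:] above assumes a single-digit prefix such as '1.'; for '10.' it would leave a stray dot behind. A regex-based variant (a sketch) handles any width:

import re

def strip_enumeration(question: str) -> str:
    # drop a leading "1." / "10."-style counter and surrounding whitespace
    return re.sub(r"^\s*\d+\.\s*", "", question)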
-------------------------------------------------------------------------------- /telegram-bot/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/kotlin,intellij,gradle 3 | # Edit at https://www.gitignore.io/?templates=kotlin,intellij,gradle 4 | 5 | ### Intellij ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # Generated files 17 | .idea/**/contentModel.xml 18 | 19 | # Sensitive or high-churn files 20 | .idea/**/dataSources/ 21 | .idea/**/dataSources.ids 22 | .idea/**/dataSources.local.xml 23 | .idea/**/sqlDataSources.xml 24 | .idea/**/dynamic.xml 25 | .idea/**/uiDesigner.xml 26 | .idea/**/dbnavigator.xml 27 | 28 | # Gradle 29 | .idea/**/gradle.xml 30 | .idea/**/libraries 31 | 32 | # Gradle and Maven with auto-import 33 | # When using Gradle or Maven with auto-import, you should exclude module files, 34 | # since they will be recreated, and may cause churn. Uncomment if using 35 | # auto-import. 36 | # .idea/modules.xml 37 | # .idea/*.iml 38 | # .idea/modules 39 | # *.iml 40 | # *.ipr 41 | 42 | # CMake 43 | cmake-build-*/ 44 | 45 | # Mongo Explorer plugin 46 | .idea/**/mongoSettings.xml 47 | 48 | # File-based project format 49 | *.iws 50 | 51 | # IntelliJ 52 | out/ 53 | 54 | # mpeltonen/sbt-idea plugin 55 | .idea_modules/ 56 | 57 | # JIRA plugin 58 | atlassian-ide-plugin.xml 59 | 60 | # Cursive Clojure plugin 61 | .idea/replstate.xml 62 | 63 | # Crashlytics plugin (for Android Studio and IntelliJ) 64 | com_crashlytics_export_strings.xml 65 | crashlytics.properties 66 | crashlytics-build.properties 67 | fabric.properties 68 | 69 | # Editor-based Rest Client 70 | .idea/httpRequests 71 | 72 | # Android studio 3.1+ serialized cache file 73 | .idea/caches/build_file_checksums.ser 74 | 75 | ### Intellij Patch ### 76 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 77 | 78 | # *.iml 79 | # modules.xml 80 | # .idea/misc.xml 81 | # *.ipr 82 | 83 | # Sonarlint plugin 84 | .idea/**/sonarlint/ 85 | 86 | # SonarQube Plugin 87 | .idea/**/sonarIssues.xml 88 | 89 | # Markdown Navigator plugin 90 | .idea/**/markdown-navigator.xml 91 | .idea/**/markdown-navigator/ 92 | 93 | ### Kotlin ### 94 | # Compiled class file 95 | *.class 96 | 97 | # Log file 98 | *.log 99 | 100 | # BlueJ files 101 | *.ctxt 102 | 103 | # Mobile Tools for Java (J2ME) 104 | .mtj.tmp/ 105 | 106 | # Package Files # 107 | *.jar 108 | *.war 109 | *.nar 110 | *.ear 111 | *.zip 112 | *.tar.gz 113 | *.rar 114 | 115 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 116 | hs_err_pid* 117 | 118 | ### Gradle ### 119 | .gradle 120 | build/ 121 | 122 | # Ignore Gradle GUI config 123 | gradle-app.setting 124 | 125 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 126 | !gradle-wrapper.jar 127 | 128 | # Cache of project 129 | .gradletasknamecache 130 | 131 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 132 | # gradle/wrapper/gradle-wrapper.properties 133 | 134 | ### Gradle Patch ### 135 | **/build/ 136 | 137 | src/main/kotlin/com/theapache64/cs/core/SecretConstants.kt 138 | 139 | # End of https://www.gitignore.io/api/kotlin,intellij,gradle 140 | -------------------------------------------------------------------------------- /datasources/scrapers/FHM_EN_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider FHM_EN_scraper.py' to scrape data 2 | 3 | # Adds data in English from Folkhälsomyndigheten 4 | 5 | import scrapy 6 | from datetime import date 7 | from scrapy.crawler import CrawlerProcess 8 | 9 | 10 | class CovidScraper(scrapy.Spider): 11 | name = 'fhm_en_spyder' 12 | start_urls = ['https://www.folkhalsomyndigheten.se/the-public-health-agency-of-sweden/communicable-disease-control/covid-19/'] 13 | 14 | questionsOnly = True 15 | 16 | def transformContent(self, contentNode):  # currently unused helper 17 | responseParts = [] 18 | for responsePart in contentNode.xpath('.//text()').getall(): 19 | strippedPart = responsePart.strip() 20 | if len(strippedPart) > 0: 21 | responseParts.append(strippedPart) 22 | return ' '.join(responseParts) 23 | 24 | def parse(self, response): 25 | columns = { 26 | "question": [], 27 | "answer": [], 28 | "answer_html": [], 29 |
"link": [], 30 | "name": [], 31 | "source": [], 32 | "category": [], 33 | "country": [], 34 | "region": [], 35 | "city": [], 36 | "lang": [], 37 | "last_update": [], 38 | } 39 | 40 | 41 | categoryPaths = response.xpath('//div[@class="container"]') 42 | 43 | for catPath in categoryPaths: 44 | 45 | categoryName = catPath.xpath('./h2/text()').getall() 46 | #print(categoryName) 47 | if len(categoryName) == 0: 48 | continue 49 | 50 | 51 | qnaPaths = catPath.xpath('.//*[@class="accordion__item toggle"]') 52 | for qnaPath in qnaPaths: 53 | 54 | 55 | question = qnaPath.xpath('./strong/a/span/text()').getall() 56 | 57 | 58 | responseParagraphPaths = qnaPath.xpath('.//div[@class="textbody"]') 59 | 60 | 61 | response = "" 62 | for respParaPath in responseParagraphPaths: 63 | response += " ".join(respParaPath.xpath('.//text()').getall()) + "\n\n" 64 | 65 | response = response.strip() 66 | 67 | columns["question"].append(question[0]) 68 | columns["category"].append(categoryName[0]) 69 | columns["answer"].append(response) 70 | columns["answer_html"].append(" ".join(responseParagraphPaths.getall())) 71 | today = date.today() 72 | 73 | 74 | columns["link"] = ["https://www.folkhalsomyndigheten.se/the-public-health-agency-of-sweden/communicable-disease-control/covid-19/"] * len(columns["question"]) 75 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 76 | columns["source"] = ["FHM, Folkhälsomyndigheten"] * len(columns["question"]) 77 | columns["country"] = ["Sweden"] * len(columns["question"]) 78 | columns["region"] = [""] * len(columns["question"]) 79 | columns["city"] = [""] * len(columns["question"]) 80 | columns["lang"] = ["en"] * len(columns["question"]) 81 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 82 | 83 | return columns 84 | 85 | 86 | if __name__ == "__main__": 87 | process = CrawlerProcess({ 88 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 89 | }) 90 | 91 | process.crawl(CovidScraper) 92 | process.start() 93 | -------------------------------------------------------------------------------- /datasources/scrapers/WHO_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider WHO_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "WHO_scraper" 10 | start_urls = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses", 11 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-and-pregnancy-and-childbirth", 12 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-and-breastfeeding", 13 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-and-masks", 14 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-hiv-and-antiretrovirals", 15 | "https://www.who.int/news-room/q-a-detail/q-a-on-mass-gatherings-and-covid-19", 16 | "https://www.who.int/news-room/q-a-detail/q-a-on-infection-prevention-and-control-for-health-care-workers-caring-for-patients-with-suspected-or-confirmed-2019-ncov", 17 | "https://www.who.int/news-room/q-a-detail/be-active-during-covid-19", 18 | "https://www.who.int/news-room/q-a-detail/malaria-and-the-covid-19-pandemic", 19 | "https://www.who.int/news-room/q-a-detail/violence-against-women-during-covid-19", 20 | "https://www.who.int/news-room/q-a-detail/contraception-family-planning-and-covid-19"] 21 | 22 | def parse(self, response): 23 | columns = { 24 | "question": [], 25 | "answer": [], 26 | "answer_html": [], 
27 | "link": [], 28 | "name": [], 29 | "source": [], 30 | "category": [], 31 | "country": [], 32 | "region": [], 33 | "city": [], 34 | "lang": [], 35 | "last_update": [], 36 | } 37 | 38 | QUESTION_ANSWER_SELECTOR = ".sf-accordion__panel" 39 | QUESTION_SELECTOR = ".sf-accordion__link::text" 40 | ANSWER_SELECTOR = ".sf-accordion__content ::text" 41 | ANSWER_HTML_SELECTOR = ".sf-accordion__content" 42 | 43 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 44 | for question_answer in questions_answers: 45 | question = question_answer.css(QUESTION_SELECTOR).getall() 46 | question = " ".join(question).strip() 47 | answer = question_answer.css(ANSWER_SELECTOR).getall() 48 | answer = " ".join(answer).strip() 49 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 50 | answer_html = " ".join(answer_html).strip() 51 | 52 | # add question-answer pair to data dictionary 53 | columns["question"].append(question) 54 | columns["answer"].append(answer) 55 | columns["answer_html"].append(answer_html) 56 | 57 | today = date.today() 58 | 59 | columns["link"] = [response.url] * len(columns["question"]) 60 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 61 | columns["source"] = ["World Health Organization (WHO)"] * len(columns["question"]) 62 | columns["category"] = [""] * len(columns["question"]) 63 | columns["country"] = [""] * len(columns["question"]) 64 | columns["region"] = [""] * len(columns["question"]) 65 | columns["city"] = [""] * len(columns["question"]) 66 | columns["lang"] = ["en"] * len(columns["question"]) 67 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 68 | 69 | return columns 70 | -------------------------------------------------------------------------------- /datasources/scrapers/BMG_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = "BMG_scraper" 8 | start_urls = ["https://www.zusammengegencorona.de/informieren/basiswissen-coronavirus/", 9 | "https://www.zusammengegencorona.de/informieren/basiswissen-uebertragung/", 10 | "https://www.zusammengegencorona.de/informieren/informationen-zum-test/", 11 | "https://www.zusammengegencorona.de/informieren/symptome-erkennen/", 12 | "https://www.zusammengegencorona.de/informieren/praevention/", 13 | "https://www.zusammengegencorona.de/informieren/informationen-alltag/", 14 | "https://www.zusammengegencorona.de/informieren/informationen-aeltere-menschen/", 15 | "https://www.zusammengegencorona.de/informieren/medizinisches-personal/", 16 | "https://www.zusammengegencorona.de/informieren/arbeitsschutz/", 17 | "https://www.zusammengegencorona.de/informieren/wirtschaftliche-folgen/", 18 | # not real answers, only links... 
19 | "https://www.zusammengegencorona.de/informieren/weitere-informationen/", 20 | # very specific questions and answers as well as other links 21 | "https://www.zusammengegencorona.de/informieren/zuhause-bleiben/"] 22 | 23 | def parse(self, response): 24 | columns = { 25 | "question": [], 26 | "answer": [], 27 | "answer_html": [], 28 | "link": [], 29 | "name": [], 30 | "source": [], 31 | "category": [], 32 | "country": [], 33 | "region": [], 34 | "city": [], 35 | "lang": [], 36 | "last_update": [], 37 | } 38 | 39 | QUESTION_ANSWER_SELECTOR = ".accordion__item" 40 | QUESTION_SELECTOR = ".accordion__heading ::text" 41 | ANSWER_SELECTOR = ".panel-inner ::text" 42 | ANSWER_HTML_SELECTOR = ".panel-inner" 43 | 44 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 45 | for question_answer in questions_answers: 46 | question = question_answer.css(QUESTION_SELECTOR).getall() 47 | question = " ".join(question).strip() 48 | answer = question_answer.css(ANSWER_SELECTOR).getall() 49 | answer = " ".join(answer).strip() 50 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 51 | answer_html = " ".join(answer_html).strip() 52 | 53 | # add question-answer pair to data dictionary 54 | columns["question"].append(question) 55 | columns["answer"].append(answer) 56 | columns["answer_html"].append(answer_html) 57 | columns["link"].append(response.url) 58 | 59 | today = date.today() 60 | 61 | columns["name"] = ["Ihre Fragen - unsere Antworten zum neuartigen Coronavirus / COVID-19"] * len( 62 | columns["question"]) 63 | columns["source"] = ["Bundesministerium für Gesundheit (BMG)"] * len(columns["question"]) 64 | columns["category"] = [""] * len(columns["question"]) 65 | columns["country"] = ["DE"] * len(columns["question"]) 66 | columns["region"] = [""] * len(columns["question"]) 67 | columns["city"] = [""] * len(columns["question"]) 68 | columns["lang"] = ["de"] * len(columns["question"]) 69 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 70 | 71 | return columns 72 | -------------------------------------------------------------------------------- /datasources/scrapers/FHM_SV_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider FHM_SV_scraper.py' to scrape data 2 | 3 | #Add data in Swedish from Folkhälsomyndigheten 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'fhm_sv_spyder' 11 | start_urls = ['https://www.folkhalsomyndigheten.se/smittskydd-beredskap/utbrott/aktuella-utbrott/covid-19/fragor-och-svar/'] 12 | 13 | questionsOnly = True 14 | 15 | def transformContent(self, contentNode): 16 | responseParts = [] 17 | for responsePart in contentNode.xpath('.//text()').getall(): 18 | strippedPart = responsePart.strip() 19 | if len(strippedPart) > 0: 20 | responseParts.append(strippedPart) 21 | return ' '.join(responseParts) 22 | 23 | def parse(self, response): 24 | columns = { 25 | "question": [], 26 | "answer": [], 27 | "answer_html": [], 28 | "link": [], 29 | "name": [], 30 | "source": [], 31 | "category": [], 32 | "country": [], 33 | "region": [], 34 | "city": [], 35 | "lang": [], 36 | "last_update": [], 37 | } 38 | 39 | 40 | categoryPaths = response.xpath('//div[@class="faq-container"]') 41 | 42 | for catPath in categoryPaths: 43 | 44 | categoryName = catPath.xpath('./h2/span/text()').getall() 45 | #print(categoryName) 46 | if len(categoryName) == 0: 47 | continue 48 | 49 | 50 | qnaPaths = 
catPath.xpath('.//*[@class="accordion__item toggle"]') 51 | for qnaPath in qnaPaths: 52 | 53 | 54 | question = qnaPath.xpath('./strong/a/span/span/text()').getall() 55 | 56 | 57 | responseParagraphPaths = qnaPath.xpath('.//div[@class="textbody"]') 58 | 59 | 60 | response = "" 61 | for respParaPath in responseParagraphPaths: 62 | response += " ".join(respParaPath.xpath('.//text()').getall()) + "\n\n" 63 | 64 | # Clean up the text: each answer ends with a link and an 'Uppdaterad:' date line 65 | response = response.strip() 66 | splitted = response.split("\n") 67 | dater = splitted[-2].strip().replace("Uppdaterad: ", "").replace("-", "/").split(" ")[0] 68 | response = "\n".join(splitted[:-2])  # drop the trailing date and link lines 69 | 70 | columns["question"].append(question[0]) 71 | columns["category"].append(categoryName[0]) 72 | columns["answer"].append(response) 73 | columns["last_update"].append(dater) 74 | columns["answer_html"].append(" ".join(responseParagraphPaths.getall())) 75 | 76 | columns["link"] = ["https://www.folkhalsomyndigheten.se/smittskydd-beredskap/utbrott/aktuella-utbrott/covid-19/fragor-och-svar/"] * len(columns["question"]) 77 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 78 | columns["source"] = ["FHM, Folkhälsomyndigheten"] * len(columns["question"]) 79 | columns["country"] = ["Sweden"] * len(columns["question"]) 80 | columns["region"] = [""] * len(columns["question"]) 81 | columns["city"] = [""] * len(columns["question"]) 82 | columns["lang"] = ["sv"] * len(columns["question"]) 83 | 84 | return columns 85 | 86 | 87 | if __name__ == "__main__": 88 | process = CrawlerProcess({ 89 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 90 | }) 91 | 92 | process.crawl(CovidScraper) 93 | process.start() 94 | -------------------------------------------------------------------------------- /datasources/scrapers/BerlinerSenat_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import scrapy 3 | import pandas as pd 4 | 5 | class CovidScraper(scrapy.Spider): 6 | name = "Berliner_Senat_Scraper" 7 | start_urls = ["https://www.berlin.de/corona/faq/"] 8 | 9 | def parse(self, response): 10 | columns = { 11 | "question" : [], 12 | "answer" : [], 13 | "answer_html" : [], 14 | "link" : [], 15 | "name" : [], 16 | "source" : [], 17 | "category" : [], 18 | "country" : [], 19 | "region" : [], 20 | "city" : [], 21 | "lang" : [], 22 | "last_update" : [], 23 | } 24 | 25 | current_category = "" 26 | current_question = "" 27 | current_answer = "" 28 | current_answer_html = "" 29 | question_answer_pair = False 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # in category 34 | if (node.xpath("name()").get() == "h2") and (node.attrib.get("class") == "title"): 35 | current_category = node.css("::text").get() 36 | continue 37 | 38 | if current_category: 39 | # in question-answer pair 40 | if node.attrib.get("class") == "html5-section block module-faq land-toggler": 41 | # save previous question-answer pair 42 | if current_question: 43 | columns["question"].append(current_question) 44 | columns["answer"].append(current_answer) 45 | columns["answer_html"].append(current_answer_html) 46 | columns["category"].append(current_category) 47 | 48 | question_answer_pair = True 49 | continue 50 | 51 | # in question 52 | if question_answer_pair and (node.attrib.get("class") == "land-toggler-button collapsed"): 53 | current_question = node.css("::text").get() 54 | continue 55 | 56 | # in answer 57 | if
question_answer_pair and (node.attrib.get("class") == "textile"): 58 | current_answer = node.css(" ::text").getall() 59 | current_answer = " ".join(current_answer).strip() 60 | current_answer_html = node.getall() 61 | current_answer_html = " ".join(current_answer_html).strip() 62 | continue 63 | 64 | # end of FAQ 65 | if node.attrib.get("class") == "html5-section block modul-text_bild": 66 | break 67 | 68 | columns["question"].append(current_question) 69 | columns["answer"].append(current_answer) 70 | columns["answer_html"].append(current_answer_html) 71 | columns["category"].append(current_category) 72 | 73 | today = date.today() 74 | 75 | columns["link"] = ["https://www.berlin.de/corona/faq/"] * len(columns["question"]) 76 | columns["name"] = ["Corona-Prävention in Berlin – Fragen und Antworten"] * len(columns["question"]) 77 | columns["source"] = ["Berliner Senat"] * len(columns["question"]) 78 | columns["country"] = ["DE"] * len(columns["question"]) 79 | columns["region"] = ["Berlin"] * len(columns["question"]) 80 | columns["city"] = ["Berlin"] * len(columns["question"]) 81 | columns["lang"] = ["de"] * len(columns["question"]) 82 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 83 | 84 | return columns 85 | 86 | 87 | 88 | 89 | --------------------------------------------------------------------------------
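Usage note: each scraper's parse() returns one dict of parallel lists, which Scrapy treats as a single item, so 'scrapy runspider <scraper>.py -o out.json' already exports the data. A minimal driver that collects the item in-process and writes a CSV instead — a sketch, not part of the repo (the import path is an assumption; adjust it to wherever the scraper file lives):

import pandas as pd
from scrapy import signals
from scrapy.crawler import CrawlerProcess

from datasources.scrapers.WHO_scraper import CovidScraper  # assumed module path

results = []

def collect(item, response, spider):
    # fires once per returned columns dict
    results.append(item)

process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
crawler = process.create_crawler(CovidScraper)
crawler.signals.connect(collect, signal=signals.item_scraped)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes

df = pd.concat([pd.DataFrame(columns) for columns in results], ignore_index=True)
df.to_csv("faq.csv", index=False)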