├── backend ├── __init__.py ├── models │ └── __init__.py ├── .python-version ├── controller │ ├── __init__.py │ ├── errors │ │ └── http_error.py │ ├── router.py │ ├── feedback.py │ └── autocomplete.py ├── readme.md ├── config.py ├── api.py ├── data_ingestion.py └── .gitignore ├── covid_nlp ├── __init__.py ├── language │ ├── requirements.txt │ ├── ms_translate.py │ └── detect_language.py ├── modeling │ ├── tfidf │ │ ├── requirements.txt │ │ ├── README.md │ │ ├── tfidf_train.py │ │ ├── tfidf_client.py │ │ └── preprocess.py │ └── transformer │ │ ├── eval_pretrained_haystack.py │ │ └── train_quora_dedup_bert.py └── eval.py ├── datasources ├── __init__.py ├── automatic │ ├── DATASOURCE_INSTRUCTIONS.md │ ├── scraper.py │ └── testing_WHO_scraper.py ├── scrapers │ ├── RKI_scraper.py │ ├── GOV_pl_scraper.py │ ├── BMAS_scraper.py │ ├── CDC_Water_scraper.py │ ├── Bundesregierung_scraper.py │ ├── UNICEF_scraper.py │ ├── BAUA_scraper.py │ ├── CDC_Travel_scraper.py │ ├── Salute_IT_scraper.py │ ├── BMWI_scraper.py │ ├── BVF_scraper.py │ ├── Arbeitsagentur_scraper.py │ ├── CDC_General_scraper.py │ ├── IHK_scraper.py │ ├── ECDC_scraper.py │ ├── FHM_EN_scraper.py │ ├── WHO_scraper.py │ ├── BMG_scraper.py │ ├── FHM_SV_scraper.py │ └── BerlinerSenat_scraper.py ├── scrapers_unused │ └── ZEIT_scraper.py └── scrapers_outdated │ ├── BZgA_scraper.py │ ├── CDC_Individuals_scraper.py │ └── CDC_Children_scraper.py ├── covid-frontend ├── src │ ├── components │ │ ├── NotFound │ │ │ ├── styles.module.scss │ │ │ └── index.js │ │ ├── themes │ │ │ ├── common │ │ │ │ ├── index.js │ │ │ │ └── Header │ │ │ │ │ ├── index.js │ │ │ │ │ └── styles.module.scss │ │ │ ├── index.js │ │ │ └── MainTemplate │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ ├── common │ │ │ ├── index.js │ │ │ ├── Loader │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ │ ├── Tag │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ │ └── InputContainer │ │ │ │ ├── styles.module.scss │ │ │ │ └── index.js │ │ ├── App.js │ │ ├── Provider.js │ │ ├── UserFeedback │ │ │ ├── styles.module.scss │ │ │ └── index.js │ │ ├── Home │ │ │ ├── index.js │ │ │ └── styles.module.scss │ │ └── Answers │ │ │ └── styles.module.scss │ ├── core │ │ ├── constants │ │ │ └── env.js │ │ ├── utils │ │ │ └── string.js │ │ └── api │ │ │ └── index.js │ ├── assets │ │ ├── images │ │ │ ├── logo.png │ │ │ ├── logo.psd │ │ │ ├── pwc.png │ │ │ ├── powedby.png │ │ │ ├── powedby.psd │ │ │ └── deepset_logo_small.png │ │ ├── fonts │ │ │ ├── gothic-a1-v8-latin-500.eot │ │ │ ├── gothic-a1-v8-latin-500.ttf │ │ │ ├── gothic-a1-v8-latin-500.woff │ │ │ ├── gothic-a1-v8-latin-600.eot │ │ │ ├── gothic-a1-v8-latin-600.ttf │ │ │ ├── gothic-a1-v8-latin-600.woff │ │ │ ├── gothic-a1-v8-latin-700.eot │ │ │ ├── gothic-a1-v8-latin-700.ttf │ │ │ ├── gothic-a1-v8-latin-700.woff │ │ │ ├── gothic-a1-v8-latin-500.woff2 │ │ │ ├── gothic-a1-v8-latin-600.woff2 │ │ │ ├── gothic-a1-v8-latin-700.woff2 │ │ │ ├── gothic-a1-v8-latin-regular.eot │ │ │ ├── gothic-a1-v8-latin-regular.ttf │ │ │ ├── gothic-a1-v8-latin-regular.woff │ │ │ └── gothic-a1-v8-latin-regular.woff2 │ │ ├── styles │ │ │ ├── _mixins.scss │ │ │ ├── wirvsvirus │ │ │ │ └── wirvsvirus-theme.css │ │ │ ├── antd │ │ │ │ └── antd-theme.less │ │ │ ├── _reset.css │ │ │ └── _variables.scss │ │ └── icons │ │ │ └── brain.svg │ ├── routes │ │ ├── links.js │ │ └── index.js │ ├── history.js │ ├── store │ │ ├── reducers │ │ │ ├── index.js │ │ │ ├── activeAnswers.js │ │ │ └── globalSearch.js │ │ ├── sagas │ │ │ ├── index.js │ │ │ ├── api │ │ │ │ └── index.js │ 
│ │ ├── globalSearch.js │ │ │ └── activeAnswers.js │ │ ├── types │ │ │ ├── globalSearch.js │ │ │ └── activeAnswers.js │ │ └── actions │ │ │ ├── globalSearch.js │ │ │ └── activeAnswers.js │ ├── i18n.js │ └── index.js ├── public │ ├── robots.txt │ ├── favicon.ico │ ├── manifest.json │ ├── index.html │ └── locales │ │ ├── en │ │ └── translation.json │ │ └── de │ │ └── translation.json ├── jsconfig.json ├── .env.production ├── .env.staging ├── Dockerfile ├── README.md ├── .eslintrc ├── .gitignore ├── nginx.conf ├── package.json └── .gitlab-ci.yml ├── telegram-bot ├── gradle.properties ├── settings.gradle ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── src │ └── main │ │ ├── kotlin │ │ └── com │ │ │ └── theapache64 │ │ │ └── cs │ │ │ ├── utils │ │ │ ├── GsonUtil.kt │ │ │ ├── FeedbackParser.kt │ │ │ ├── TelegramAPI.kt │ │ │ └── RestClient.kt │ │ │ ├── models │ │ │ ├── Feedback.kt │ │ │ └── rest │ │ │ │ ├── telegram │ │ │ │ ├── AnswerCallbackRequest.kt │ │ │ │ ├── SendChatActionRequest.kt │ │ │ │ ├── SendMessageResponse.kt │ │ │ │ ├── SendMessageRequest.kt │ │ │ │ ├── TelegramUpdate.kt │ │ │ │ └── TelegramCallbackQuery.kt │ │ │ │ ├── CoronaQuestion.kt │ │ │ │ └── AddFeedbackRequest.kt │ │ │ ├── servlets │ │ │ └── TestServlet.kt │ │ │ └── core │ │ │ └── Scholar.kt │ │ └── webapp │ │ └── index.jsp ├── build.gradle ├── gradlew.bat └── .gitignore ├── .gitignore ├── docs └── img │ ├── covid-bert.png │ └── example-data-format.png ├── data ├── question-answering │ └── Handbook - Labelling Tool.pdf └── faqs │ └── deduplicate_with_sentenceBert.py ├── requirements.txt ├── Dockerfile ├── docker-compose.yml └── README.md /backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /covid_nlp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/.python-version: -------------------------------------------------------------------------------- 1 | covid 2 | -------------------------------------------------------------------------------- /backend/controller/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /covid-frontend/src/components/NotFound/styles.module.scss: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /telegram-bot/gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official -------------------------------------------------------------------------------- /covid_nlp/language/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | pycld2 3 | pycld3 4 | -------------------------------------------------------------------------------- 
/telegram-bot/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'corona-scholar' 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | .vscode/settings.json 4 | data 5 | __pycache__ -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece 2 | sklearn 3 | nltk 4 | -------------------------------------------------------------------------------- /covid-frontend/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | -------------------------------------------------------------------------------- /covid-frontend/src/core/constants/env.js: -------------------------------------------------------------------------------- 1 | export const baseUrl = process.env.REACT_APP_API; 2 | -------------------------------------------------------------------------------- /covid-frontend/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": "src" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/common/index.js: -------------------------------------------------------------------------------- 1 | export { default as Header } from './Header'; 2 | -------------------------------------------------------------------------------- /docs/img/covid-bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/docs/img/covid-bert.png -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/index.js: -------------------------------------------------------------------------------- 1 | export { default as MainTemplate } from './MainTemplate'; 2 | -------------------------------------------------------------------------------- /covid-frontend/.env.production: -------------------------------------------------------------------------------- 1 | REACT_APP_ENV=production 2 | REACT_APP_API=https://covid-backend.deepset.ai 3 | -------------------------------------------------------------------------------- /covid-frontend/.env.staging: -------------------------------------------------------------------------------- 1 | 2 | REACT_APP_ENV=staging 3 | REACT_APP_API=https://covid-backend.deepset.ai 4 | -------------------------------------------------------------------------------- /covid-frontend/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/public/favicon.ico -------------------------------------------------------------------------------- /docs/img/example-data-format.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/docs/img/example-data-format.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/logo.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/logo.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/logo.psd -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/pwc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/pwc.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/powedby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/powedby.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/powedby.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/powedby.psd -------------------------------------------------------------------------------- /covid-frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:stable 2 | COPY nginx.conf /etc/nginx/conf.d/default.conf 3 | COPY build/ /usr/share/nginx/html/ 4 | EXPOSE 80 -------------------------------------------------------------------------------- /covid-frontend/src/routes/links.js: -------------------------------------------------------------------------------- 1 | export default { 2 | home: '/home', 3 | answers: '/answers', 4 | // questions: '/questions', 5 | }; 6 | -------------------------------------------------------------------------------- /telegram-bot/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/telegram-bot/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /covid-frontend/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | React Js frontend for the covid tool 4 | 5 | npm i 6 | cp .env.staging .env 7 | npm start 8 | 9 | -------------------------------------------------------------------------------- /data/question-answering/Handbook - Labelling Tool.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/data/question-answering/Handbook - Labelling Tool.pdf -------------------------------------------------------------------------------- /covid-frontend/src/assets/images/deepset_logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/images/deepset_logo_small.png -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.eot: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff -------------------------------------------------------------------------------- /covid-frontend/.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "react-app", 3 | "rules": { 4 | "semi": [2, "always"], 5 | "indent": ["error", 2, { "SwitchCase": 1 }] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-500.woff2 -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-600.woff2 -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-700.woff2 -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.eot -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.ttf -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff -------------------------------------------------------------------------------- /covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/COVID-QA/HEAD/covid-frontend/src/assets/fonts/gothic-a1-v8-latin-regular.woff2 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | #crawler 2 | Scrapy==2.0.1 3 | # haystack 4 | -e git://github.com/deepset-ai/haystack.git@master#egg=farm-haystack 5 | langid===1.1.6 6 | elastic-apm 7 | pycld2 8 | -------------------------------------------------------------------------------- /covid-frontend/src/history.js: -------------------------------------------------------------------------------- 1 | import { createBrowserHistory } from 'history'; 2 | 3 | // configure, create, and export the project's history instance 4 | export default createBrowserHistory(); 5 | -------------------------------------------------------------------------------- /covid-frontend/src/components/common/index.js: -------------------------------------------------------------------------------- 1 | export { default as InputContainer } from './InputContainer'; 2 | export { default as Tag } from './Tag'; 3 | export { default as Loader } from './Loader'; 4 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/GsonUtil.kt: -------------------------------------------------------------------------------- 1 | package 
com.theapache64.cs.utils 2 | 3 | import com.google.gson.GsonBuilder 4 | 5 | object GsonUtil { 6 | val gson = GsonBuilder().create() 7 | } -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/Feedback.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models 2 | 3 | data class Feedback( 4 | val modelId:Int, 5 | val feedback: String, 6 | val question: String, 7 | val documentId: Long 8 | ) -------------------------------------------------------------------------------- /covid-frontend/src/store/reducers/index.js: -------------------------------------------------------------------------------- 1 | import { combineReducers } from 'redux'; 2 | import globalSearch from './globalSearch'; 3 | import activeAnswers from './activeAnswers'; 4 | 5 | export default combineReducers({ 6 | globalSearch, 7 | activeAnswers, 8 | }); 9 | -------------------------------------------------------------------------------- /covid-frontend/src/core/utils/string.js: -------------------------------------------------------------------------------- 1 | 2 | export const prefix = (value) => (string) => `${value}${string}`; 3 | 4 | // result example - 12345.67 --> "12,345.67" 5 | export const formatNumber = (value = 0, precision = 2) => 6 | value.toFixed(precision).replace(/\d(?=(\d{3})+\.)/g, '$&,'); 7 | -------------------------------------------------------------------------------- /telegram-bot/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Tue Mar 24 01:07:13 IST 2020 2 | distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-all.zip 3 | distributionBase=GRADLE_USER_HOME 4 | distributionPath=wrapper/dists 5 | zipStorePath=wrapper/dists 6 | zipStoreBase=GRADLE_USER_HOME 7 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/AnswerCallbackRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class AnswerCallbackRequest( 7 | @SerializedName("callback_query_id") 8 | val callbackQueryId: String // 123 9 | ) -------------------------------------------------------------------------------- /backend/controller/errors/http_error.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | from starlette.requests import Request 3 | from starlette.responses import JSONResponse 4 | 5 | 6 | async def http_error_handler(_: Request, exc: HTTPException) -> JSONResponse: 7 | return JSONResponse({"errors": [exc.detail]}, status_code=exc.status_code) 8 | -------------------------------------------------------------------------------- /backend/controller/router.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from backend.controller import autocomplete, model, feedback 4 | 5 | router = APIRouter() 6 | router.include_router(autocomplete.router, tags=["autocomplete"]) 7 | router.include_router(model.router, tags=["model"]) 8 | router.include_router(feedback.router, tags=["feedback"]) 9 | -------------------------------------------------------------------------------- 
/covid-frontend/src/store/sagas/index.js: -------------------------------------------------------------------------------- 1 | import { all, fork } from 'redux-saga/effects'; 2 | import activeAnswersSaga from './activeAnswers'; 3 | import globalSearchSaga from './globalSearch'; 4 | 5 | export default function* rootSaga() { 6 | yield all([ 7 | fork(activeAnswersSaga), 8 | fork(globalSearchSaga), 9 | ]); 10 | } 11 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/CoronaQuestion.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | class CoronaQuestion( 6 | @SerializedName("questions") 7 | val questions: Array, // How does corona spread? 8 | @SerializedName("top_k_retriever") 9 | val resultCount: Int = 1 10 | ) -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/MainTemplate/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | 3 | .wrapper { 4 | background-color: $light-grey; 5 | height: 100%; 6 | min-height: 100vh; 7 | position: relative; 8 | width: 100%; 9 | } 10 | 11 | .content { 12 | margin: 0 auto; 13 | max-width: 1000px; 14 | padding: 16px 0 32px; 15 | width: 100%; 16 | } 17 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/SendChatActionRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class SendChatActionRequest( 7 | @SerializedName("action") 8 | val action: String, // String 9 | @SerializedName("chat_id") 10 | val chatId: String // String 11 | ) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.4-stretch 2 | 3 | WORKDIR /home/user 4 | 5 | # install haystack and dependencies 6 | COPY requirements.txt /home/user/ 7 | RUN pip install -r requirements.txt 8 | 9 | COPY backend /home/user/backend 10 | COPY covid_nlp /home/user/covid_nlp 11 | 12 | EXPOSE 8000 13 | 14 | # cmd for running the API 15 | CMD ["uvicorn", "backend.api:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /telegram-bot/src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | <%-- 2 | Created by IntelliJ IDEA. 3 | User: theapache64 4 | Date: 24/03/20 5 | Time: 1:06 AM 6 | To change this template use File | Settings | File Templates. 
7 | --%> 8 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 9 | <html> 10 | <head> 11 | <title>$Title$</title> 12 | </head> 13 | <body> 14 | $END$ 15 | </body> 16 | </html> 17 |
-------------------------------------------------------------------------------- /covid-frontend/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "Corona scholar", 3 | "name": "Corona scholar - scientific corona knowledge", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | "start_url": ".", 12 | "display": "standalone", 13 | "theme_color": "#000000", 14 | "background_color": "#ffffff" 15 | } 16 |
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/AddFeedbackRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class AddFeedbackRequest( 7 | @SerializedName("feedback") 8 | val feedback: String, // relevant 9 | @SerializedName("question") 10 | val question: String, 11 | @SerializedName("document_id") 12 | val documentId: Long 13 | )
-------------------------------------------------------------------------------- /covid-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | .env 12 | 13 | # production 14 | /build 15 | 16 | # misc 17 | .DS_Store 18 | .env.local 19 | .env.development.local 20 | .env.test.local 21 | .env.production.local 22 | 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | .idea 27 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/App.js: -------------------------------------------------------------------------------- 1 | import React, { Component, Fragment } from 'react'; 2 | import { ConfigProvider } from 'antd'; 3 | import deDE from 'antd/es/locale/de_DE'; 4 | 5 | class App extends Component { 6 | 7 | render () { 8 | return ( 9 | <ConfigProvider locale={deDE}> 10 | <Fragment> 11 | { this.props.children } 12 | </Fragment> 13 | </ConfigProvider> 14 | ); 15 | } 16 | } 17 | 18 | export default App; 19 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/common/Loader/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | 3 | .selfContained { 4 | align-items: center; 5 | height: 140px; 6 | display: flex; 7 | justify-content: center; 8 | width: 100%; 9 | } 10 | .fullSized { 11 | align-items: center; 12 | display: flex; 13 | justify-content: center; 14 | height: 100vh; 15 | position: fixed; 16 | top: 0; 17 | left: 0; 18 | width: 100vw; 19 | z-index: 99; 20 | } 21 |
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/servlets/TestServlet.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.servlets 2 | 3 | import javax.servlet.annotation.WebServlet 4 | import javax.servlet.http.HttpServlet 5 | import javax.servlet.http.HttpServletRequest 6 | import javax.servlet.http.HttpServletResponse 7 | 8 |
@WebServlet(urlPatterns = ["/test"]) 9 | class TestServlet : HttpServlet() { 10 | 11 | override fun doGet(req: HttpServletRequest?, resp: HttpServletResponse?) { 12 | resp!!.writer.write("This is sample resp") 13 | } 14 | }
-------------------------------------------------------------------------------- /datasources/automatic/DATASOURCE_INSTRUCTIONS.md: -------------------------------------------------------------------------------- 1 | # Datasources 2 | 3 | ## How to add sources 4 | 1. Please add official datasources of any language to the sources.csv file 5 | 2. The CSV is sorted by link value; please insert additions in the right place (to avoid duplicates) 6 | 3. Please fill out as many columns as you can. It might make sense to have the scraper fill in some values individually, 7 | e.g. when the FAQ site contains text in several languages (use the automatic language detection 8 | from covid_nlp/language/detect_language.py) or when each FAQ entry has a dedicated category.
-------------------------------------------------------------------------------- /covid-frontend/src/components/themes/MainTemplate/index.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { Header } from '../common'; 3 | import styles from './styles.module.scss'; 4 | 5 | class MainTemplate extends Component { 6 | 7 | static propTypes = {} 8 | 9 | render() { 10 | return ( 11 | <div className={styles.wrapper}>
12 | <Header /> 13 | 14 | <div className={styles.content}> 15 | { this.props.children } 16 | </div> 17 | 18 | </div>
19 | 20 | ); 21 | } 22 | } 23 | 24 | export default MainTemplate; 25 |
-------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/README.md: -------------------------------------------------------------------------------- 1 | ## Train and Evaluate TF-IDF Model 2 | 3 | ### 1. Train sentencepiece model 4 | 5 | Preprocessing takes at most one argument (= the sentencepiece vocab size), which defaults to 24000 if not set. 6 | 7 | `cat my_large_text | python3 ./preprocess.py 16000` 8 | 9 | ### 2. Train TF-IDF Vectors 10 | 11 | TF-IDF vectors are trained on unigrams and bigrams, with otherwise default settings. 12 | 13 | `cat my_questions | python3 ./tfidf_train.py` 14 | 15 | ### 3. Score and submit 16 | 17 | Each pair in the eval set is scored with cosine similarity, and the results are then posted to MLflow. 18 | 19 | `python3 ./tfidf_client.py` 20 |
-------------------------------------------------------------------------------- /covid-frontend/nginx.conf: -------------------------------------------------------------------------------- 1 | # Expires map 2 | map $sent_http_content_type $expires { 3 | default off; 4 | text/html epoch; 5 | text/css 1y; 6 | application/javascript 1y; 7 | } 8 | 9 | server { 10 | listen 80; 11 | server_name localhost; 12 | 13 | expires $expires; 14 | 15 | location / { 16 | root /usr/share/nginx/html; 17 | index index.html index.htm; 18 | try_files $uri $uri/ /index.html; 19 | } 20 | 21 | error_page 500 502 503 504 /50x.html; 22 | location = /50x.html { 23 | root /usr/share/nginx/html; 24 | } 25 | }
-------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/_mixins.scss: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | @mixin text($size: regular) { 4 | $text-size: map-get($text, $size); 5 | 6 | font-size: map-get($text-size, font-size); 7 | line-height: map-get($text-size, line-height); 8 | } 9 | 10 | @mixin border($color: $border-grey, $radius: $base-radius) { 11 | border: 1px solid $color; 12 | border-radius: $radius; 13 | } 14 | 15 | @mixin outline($offset: 0, $width: 2px) { 16 | outline-color: #80c0d8; 17 | outline-offset: $offset; 18 | outline-width: $width; 19 | } 20 | 21 | @mixin text-overflow() { 22 | overflow: hidden; 23 | text-overflow: ellipsis; 24 | white-space: nowrap; 25 | } 26 |
-------------------------------------------------------------------------------- /covid-frontend/src/store/types/globalSearch.js: -------------------------------------------------------------------------------- 1 | import { prefix } from 'core/utils/string'; 2 | 3 | const searchPrefix = prefix('globalSearch/'); 4 | 5 | export const SET_SELECTED_VALUE = searchPrefix('SET_SELECTED_VALUE'); 6 | 7 | export const UPDATE_SEARCH_VALUE = searchPrefix('UPDATE_SEARCH_VALUE'); 8 | export const UPDATE_SEARCH_FILTERS = searchPrefix('UPDATE_SEARCH_FILTERS'); 9 | export const UPDATE_LAST_SEARCH_VALUE = searchPrefix('UPDATE_LAST_SEARCH_VALUE'); 10 | export const UPDATE_SEARCH_OPTIONS = searchPrefix('UPDATE_SEARCH_OPTIONS'); 11 | 12 | export const SET_LOADING_STATUS = searchPrefix('SET_LOADING_STATUS'); 13 | 14 | export const RESET = searchPrefix('RESET'); 15 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/common/Tag/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | @import
'../../../assets/styles/_mixins'; 3 | 4 | .tag { 5 | display: inline-block; 6 | flex-grow: 0; 7 | padding: 8px 12px; 8 | text-align: center; 9 | white-space: nowrap; 10 | 11 | // &.red { 12 | // background-color: $error-light2; 13 | // color: $error; 14 | // @include border($error-light, 4px); 15 | // } 16 | // &.orange { 17 | // background-color: $warning-light2; 18 | // color: $warning; 19 | // @include border($warning-light, 4px); 20 | // } 21 | // &.green { 22 | // background-color: $success-light2; 23 | // color: $success; 24 | // @include border($success-light, 4px); 25 | // } 26 | } 27 | -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/common/Header/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import { Link } from 'react-router-dom'; 3 | import links from 'routes/links'; 4 | import logo from 'assets/images/logo.png'; 5 | import styles from './styles.module.scss'; 6 | 7 | class Header extends PureComponent { 8 | 9 | render() { 10 | 11 | return ( 12 |
13 |
14 | 15 |
16 | corona-scholar logo 17 |
18 | 19 |
20 |
21 | ); 22 | } 23 | } 24 | 25 | 26 | export default Header; 27 | -------------------------------------------------------------------------------- /covid-frontend/src/components/themes/common/Header/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../../assets/styles/_variables'; 2 | @import '../../../../assets/styles/_mixins'; 3 | 4 | .wrapper { 5 | position: relative; 6 | width: 100%; 7 | z-index: 10; 8 | } 9 | 10 | .header { 11 | display: flex; 12 | justify-content: center; 13 | max-width: 1280px; 14 | margin-left: auto; 15 | margin-right: auto; 16 | padding: 28px 30px; 17 | width: 100%; 18 | } 19 | 20 | 21 | .homeLink { 22 | display: inline-block; 23 | vertical-align: top; 24 | white-space: nowrap; 25 | } 26 | 27 | 28 | .logo { 29 | display: inline-block; 30 | position: relative; 31 | width: 136px; 32 | 33 | img { 34 | display: block; 35 | height: 100%; 36 | object-fit: contain; 37 | width: 100%; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /covid-frontend/src/i18n.js: -------------------------------------------------------------------------------- 1 | import i18n from 'i18next'; 2 | import Backend from 'i18next-xhr-backend'; 3 | import LanguageDetector from 'i18next-browser-languagedetector'; 4 | import { initReactI18next } from 'react-i18next'; 5 | 6 | const fallbackLng = ['en']; 7 | const availableLanguages = ['de', 'en']; 8 | 9 | i18n 10 | .use(Backend) // load translation using xhr -> see /public/locales. We will add locales in the next step 11 | .use(LanguageDetector) // detect user language 12 | .use(initReactI18next) // pass the i18n instance to react-i18next. 13 | .init({ 14 | fallbackLng, // if user computer language is not on the list of available languages, than we will be using the fallback language specified earlier 15 | debug: true, 16 | whitelist: availableLanguages, 17 | 18 | interpolation: { 19 | escapeValue: false 20 | }, 21 | }); 22 | 23 | export default i18n; 24 | -------------------------------------------------------------------------------- /covid-frontend/src/store/actions/globalSearch.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/globalSearch'; 2 | 3 | export const setSelectedValue = (payload) => ({ 4 | type: types.SET_SELECTED_VALUE, 5 | payload 6 | }); 7 | 8 | export const updateSearchValue = (payload) => ({ 9 | type: types.UPDATE_SEARCH_VALUE, 10 | payload 11 | }); 12 | 13 | export const updateLastSearchValue = (payload) => ({ 14 | type: types.UPDATE_LAST_SEARCH_VALUE, 15 | payload 16 | }); 17 | 18 | export const updateSearchOptions = (payload) => ({ 19 | type: types.UPDATE_SEARCH_OPTIONS, 20 | payload 21 | }); 22 | export const updateSearchFilters = (payload) => ({ 23 | type: types.UPDATE_SEARCH_FILTERS, 24 | payload 25 | }); 26 | 27 | export const setLoadingStatus = (status) => ({ 28 | type: types.SET_LOADING_STATUS, 29 | status 30 | }); 31 | 32 | export const reset = () => ({ 33 | type: types.RESET, 34 | }); 35 | -------------------------------------------------------------------------------- /covid-frontend/src/store/types/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import { prefix } from 'core/utils/string'; 2 | 3 | const activeAnswersPrefix = prefix('activeAnswers/'); 4 | 5 | export const GET = activeAnswersPrefix('GET'); 6 | export const SET = activeAnswersPrefix('SET'); 7 | 8 | export const 
SET_LOADING_STATUS = activeAnswersPrefix('SET_LOADING_STATUS'); 9 | export const SHOW_USER_FEEDBACK_PANEL = activeAnswersPrefix('SHOW_USER_FEEDBACK_PANEL'); 10 | export const HIDE_USER_FEEDBACK_PANEL = activeAnswersPrefix('HIDE_USER_FEEDBACK_PANEL'); 11 | 12 | export const RESET = activeAnswersPrefix('RESET'); 13 | 14 | export const MARK_AS_CORRECT_ANSWER = activeAnswersPrefix('MARK_AS_CORRECT_ANSWER'); 15 | export const MARK_AS_WRONG_ANSWER = activeAnswersPrefix('MARK_AS_WRONG_ANSWER'); 16 | export const MARK_AS_FEEDBACK_GIVEN = activeAnswersPrefix('MARK_AS_FEEDBACK_GIVEN'); 17 | export const CLEAR_FEEDBACK_GIVEN = activeAnswersPrefix('CLEAR_FEEDBACK_GIVEN'); 18 | -------------------------------------------------------------------------------- /covid-frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Corona Scholar – Covid-19 Frage-und-Antwort Chat Bot basierend auf wissenschaftlicher Faktensammlung und Künstlicher Intelligenz 12 | Corona Scholar – Covid-19 FAQ chat bot based on scientific knowledge and AI 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /covid-frontend/src/components/NotFound/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { Result, Button } from 'antd'; 5 | import links from 'routes/links'; 6 | import { withTranslation } from 'react-i18next'; 7 | 8 | class NotFound extends PureComponent { 9 | 10 | static propTypes = { 11 | history: PropTypes.object 12 | } 13 | 14 | handleBackHome = () => { 15 | this.props.history.push(links.home); 16 | } 17 | 18 | render() { 19 | const { t } = this.props; 20 | 21 | return ( 22 | 28 | {t('404.button-text')} 29 | 30 | } 31 | /> 32 | ); 33 | } 34 | } 35 | 36 | export default connect()(withTranslation()(NotFound)); 37 | -------------------------------------------------------------------------------- /covid-frontend/src/components/common/Tag/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import cn from 'classnames'; 4 | import styles from './styles.module.scss'; 5 | 6 | class Tag extends PureComponent { 7 | static propTypes = { 8 | text: PropTypes.string, 9 | theme: PropTypes.oneOf(['red', 'green', 'orange']), 10 | className: PropTypes.string, 11 | } 12 | 13 | static defaultProps = { 14 | text: '', 15 | theme: 'green', 16 | } 17 | 18 | // Themes signify level of confidence in answer 19 | static themes = { 20 | RED: 'red', 21 | GREEN: 'green', 22 | ORANGE: 'orange' 23 | } 24 | 25 | render() { 26 | const { text, theme, className } = this.props; 27 | return ( 28 |
<div 29 | className={cn( 30 | styles.tag, styles[theme], className 31 | )} 32 | > 33 | {text} 34 | </div>
35 | ); 36 | } 37 | } 38 | 39 | export default Tag; 40 |
-------------------------------------------------------------------------------- /covid-frontend/src/assets/icons/brain.svg: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /covid-frontend/src/components/Provider.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent, Fragment } from 'react'; 2 | // import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | // import { bindActionCreators } from 'redux'; 5 | // import { Loader } from 'components/common/presentational'; 6 | 7 | class Provider extends PureComponent { 8 | 9 | static propTypes = { 10 | // companies: PropTypes.object, 11 | // userActions: PropTypes.object, 12 | // companiesActions: PropTypes.object 13 | } 14 | 15 | render () { 16 | 17 | // next will be removed later 18 | // if (!companies.isReady) { 19 | // return ; 20 | // } 21 | 22 | return ( 23 | <Fragment> 24 | { this.props.children } 25 | </Fragment> 26 | 27 | ); 28 | } 29 | } 30 | 31 | export default connect( 32 | // state => ({ 33 | // companies: state.companies 34 | // }), 35 | // dispatch => ({ 36 | // companiesActions: bindActionCreators(companiesActions, dispatch), 37 | // }) 38 | )(Provider); 39 |
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/FeedbackParser.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.utils 2 | 3 | import com.theapache64.cs.models.Feedback 4 | 5 | object FeedbackParser { 6 | private val feedbackRegEx = "(?<feedback>\\w)(?<modelId>\\d+)d(?<documentId>\\d+)(?<question>.+)".toRegex() 7 | fun parse(data: String): Feedback { 8 | val match = feedbackRegEx.find(data) 9 | val groups = match!!.groups 10 | return Feedback( 11 | groups["modelId"]!!.value.toInt(), 12 | getFeedbackString(groups["feedback"]!!.value[0]), 13 | groups["question"]!!.value, 14 | groups["documentId"]!!.value.toLong() 15 | ) 16 | } 17 | 18 | private fun getFeedbackString(feedback: Char): String { 19 | return when (feedback) { 20 | 'r' -> "relevant" 21 | 'f' -> "fake" 22 | 'o' -> "outdated" 23 | 'i' -> "irrelevant" 24 | else -> throw IllegalArgumentException("Undefined feedback char `$feedback`") 25 | } 26 | } 27 | }
-------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/wirvsvirus/wirvsvirus-theme.css: -------------------------------------------------------------------------------- 1 | /* Custom CSS styles for Hackathon WirVsVirus */ 2 | /* TODO: Evaluate removal of ant-design. At least get rid of those arbitrary CSS scoping hashes from class names :( */ 3 | 4 | .ant-row .ant-col span { 5 | color: #707070; 6 | } 7 | 8 | .ant-col-19 { 9 | /* TODO: Can layout be better controlled through ant-design col/row components?
*/ 10 | width: 100%; 11 | } 12 | 13 | .all-answers-wrapper .ant-col.ant-col-19 { 14 | padding: 2rem; 15 | } 16 | 17 | .top-answer-wrapper { 18 | background-color: #59A4B7; 19 | color: white; 20 | padding: 0 20px; 21 | margin: 0 0px !important; 22 | padding-bottom:30px; 23 | } 24 | 25 | .headline-faq-match { 26 | font-size: 1.25rem; 27 | } 28 | 29 | .answer-text { 30 | line-height: 1.5rem; 31 | max-width:800px; 32 | } 33 | 34 | .answer-meta-info.top-answer { 35 | margin-top: 0; 36 | } 37 | 38 | 39 | .other-answer-row .headline-faq-match-confidence { 40 | color: black; 41 | } 42 | 43 | .result-confidence-box { 44 | font-style: italic; 45 | } 46 | -------------------------------------------------------------------------------- /telegram-bot/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id 'org.jetbrains.kotlin.jvm' version '1.3.70' 4 | id 'war' 5 | } 6 | 7 | group 'com.theapache64' 8 | version 'v1.1.0-alpha02' 9 | 10 | sourceCompatibility = 1.8 11 | 12 | repositories { 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8" 18 | 19 | // https://mvnrepository.com/artifact/com.google.code.gson/gson 20 | implementation group: 'com.google.code.gson', name: 'gson', version: '2.8.6' 21 | 22 | // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp 23 | implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.4.1' 24 | 25 | // https://mvnrepository.com/artifact/javax.servlet/javax.servlet-api 26 | providedCompile group: 'javax.servlet', name: 'javax.servlet-api', version: '4.0.1' 27 | 28 | 29 | testImplementation group: 'junit', name: 'junit', version: '4.12' 30 | } 31 | 32 | compileKotlin { 33 | kotlinOptions.jvmTarget = "1.8" 34 | } 35 | compileTestKotlin { 36 | kotlinOptions.jvmTarget = "1.8" 37 | } -------------------------------------------------------------------------------- /covid-frontend/src/components/common/InputContainer/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../../assets/styles/_variables'; 2 | @import '../../../assets/styles/_mixins'; 3 | 4 | .container { 5 | display: inline-block; 6 | position: relative; 7 | 8 | & > label { 9 | color: $primary-grey; 10 | display: block; 11 | @include text(tiny); 12 | margin: 11px 0; 13 | letter-spacing: $secondary-spacing; 14 | text-transform: uppercase; 15 | } 16 | 17 | .error { 18 | @include text(small); 19 | color: $error; 20 | display: block; 21 | margin-top: 6px; 22 | } 23 | 24 | .info { 25 | @include text(small); 26 | color: $primary-grey; 27 | display: block; 28 | margin-top: 6px; 29 | } 30 | 31 | &.fluid { 32 | display: block; 33 | width: 100%; 34 | } 35 | 36 | &.withError { 37 | textarea, 38 | input { 39 | @include border($error); 40 | } 41 | :global(.ant-input-number), 42 | :global(.ant-select-selection) { 43 | @include border($error); 44 | input { 45 | border: none; 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/SendMessageResponse.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class SendMessageResponse( 7 | @SerializedName("ok") 8 | val ok: Boolean, // true 9 | @SerializedName("result") 10 | val result: 
Result 11 | ) { 12 | data class Result( 13 | @SerializedName("chat") 14 | val chat: Chat, 15 | @SerializedName("date") 16 | val date: Long, // 1584216383 17 | @SerializedName("message_id") 18 | val messageId: Long, // 146 19 | @SerializedName("text") 20 | val text: String // This is some text 21 | ) { 22 | data class Chat( 23 | @SerializedName("id") 24 | val id: Long, // -1001423106120 25 | @SerializedName("title") 26 | val title: String, // Movie Monk 27 | @SerializedName("type") 28 | val type: String, // channel 29 | @SerializedName("username") 30 | val username: String // movie_m0nk 31 | ) 32 | } 33 | } -------------------------------------------------------------------------------- /backend/controller/feedback.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, status 2 | from fastapi.responses import JSONResponse 3 | from pydantic import BaseModel 4 | from typing import Optional 5 | 6 | from backend import api 7 | from backend.config import DB_INDEX_FEEDBACK 8 | 9 | router = APIRouter() 10 | 11 | 12 | class Feedback(BaseModel): 13 | # Note: the question here is the user's question (=query) and not the matched one from our FAQs (=response) 14 | question: str 15 | answer: Optional[str] 16 | feedback: str 17 | document_id: int 18 | 19 | 20 | @router.post("/models/{model_id}/feedback") 21 | def feedback(model_id: int, request: Feedback): 22 | feedback_payload = request.__dict__ 23 | if feedback_payload["feedback"] not in ("relevant", "fake", "outdated", "irrelevant"): 24 | return JSONResponse( 25 | status_code=status.HTTP_400_BAD_REQUEST, 26 | content="Invalid 'feedback'. It must be one of relevant, fake, outdated or irrelevant", 27 | ) 28 | feedback_payload["model_id"] = model_id 29 | api.elasticsearch_client.index(index=DB_INDEX_FEEDBACK, body=feedback_payload) 30 | -------------------------------------------------------------------------------- /covid-frontend/src/store/actions/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/activeAnswers'; 2 | 3 | export const get = () => ({ 4 | type: types.GET 5 | }); 6 | 7 | export const set = (payload) => ({ 8 | type: types.SET, 9 | payload 10 | }); 11 | 12 | export const setLoadingStatus = (status) => ({ 13 | type: types.SET_LOADING_STATUS, 14 | status 15 | }); 16 | 17 | export const showUserFeedbackPanel = (payload) => ({ 18 | type: types.SHOW_USER_FEEDBACK_PANEL, 19 | payload 20 | }); 21 | 22 | export const hideUserFeedbackPanel = () => ({ 23 | type: types.HIDE_USER_FEEDBACK_PANEL 24 | }); 25 | 26 | export const reset = () => ({ 27 | type: types.RESET, 28 | }); 29 | 30 | export const markAsCorrectAnswer = (payload) => ({ 31 | type: types.MARK_AS_CORRECT_ANSWER, 32 | payload 33 | }); 34 | 35 | export const markAsWrongAnswer = (payload) => ({ 36 | type: types.MARK_AS_WRONG_ANSWER, 37 | payload 38 | }); 39 | 40 | export const markAsFeedbackGiven = (payload) => ({ 41 | type: types.MARK_AS_FEEDBACK_GIVEN, 42 | payload 43 | }); 44 | 45 | export const clearFeedbackGiven = () => ({ 46 | type: types.CLEAR_FEEDBACK_GIVEN 47 | }); -------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/antd/antd-theme.less: -------------------------------------------------------------------------------- 1 | @import "../../../../node_modules/antd/dist/antd.less"; 2 | 3 | // see list of variables in 4 | // 
https://github.com/ant-design/ant-design/blob/master/components/style/themes/default.less 5 | 6 | @primary-color: #70b2fc; 7 | 8 | // Base Scaffolding Variables 9 | @border-radius-base: 6px; 10 | @font-family: 'Gothic A1', sans-serif; 11 | @text-color: #333638; 12 | 13 | @btn-height-lg: 44px; 14 | @input-height-lg: 44px; 15 | @font-size-lg: 14px; 16 | 17 | // Border color 18 | @border-color-base: #d3dae0; 19 | 20 | // table 21 | @table-header-bg: transparent; 22 | @table-header-color: #a3a9ad; 23 | @table-row-hover-bg: #f9fafc; 24 | @table-padding-vertical: 14px; 25 | @table-padding-horizontal: 24px; 26 | 27 | 28 | 29 | // table customization 30 | .ant-table-thead > tr > th { 31 | font-size: 12px; 32 | line-height: 14px; 33 | padding: 11px 24px; 34 | } 35 | 36 | .ant-table-tbody > tr > td { 37 | border: none; 38 | line-height: 18px; 39 | height: 48px; // works as min-height 40 | vertical-align: middle; 41 | } 42 | 43 | .ant-table-placeholder { 44 | border-bottom-color: transparent; 45 | } 46 | -------------------------------------------------------------------------------- /covid-frontend/src/store/sagas/api/index.js: -------------------------------------------------------------------------------- 1 | import api from 'core/api'; 2 | 3 | export function * get (path, query = null) { 4 | return yield apiCall(path, 'GET', query); 5 | } 6 | 7 | export function * post (path, query = null, body = null) { 8 | return yield apiCall(path, 'POST', body ? query : null, body || query); 9 | } 10 | 11 | export function * put (url, query = null, body = null) { 12 | return yield apiCall(url, 'PUT', body ? query : null, body || query); 13 | } 14 | 15 | export function * patch (url, query = null, body = null) { 16 | return yield apiCall(url, 'PATCH', body ? query : null, body || query); 17 | } 18 | 19 | export function * del (url, query = null) { 20 | return yield apiCall(url, 'DELETE', query); 21 | } 22 | 23 | 24 | function * apiCall (path, method, query, body) { 25 | const apiInstance = api(); 26 | // const { token } = yield select(state => state.auth); 27 | 28 | // if (token) { 29 | // apiInstance.setAuthorization(`Bearer ${token}`); 30 | // } 31 | 32 | let result; 33 | try { 34 | result = yield apiInstance.call(path, method, query, body); 35 | } catch (error) { 36 | throw error; 37 | } 38 | 39 | return result; 40 | } 41 | -------------------------------------------------------------------------------- /covid-frontend/src/index.js: -------------------------------------------------------------------------------- 1 | import React, { Suspense } from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import { Provider } from 'react-redux'; 4 | import { Router } from 'react-router-dom'; 5 | import { composeWithDevTools } from 'redux-devtools-extension'; 6 | import createSagaMiddleware from 'redux-saga'; 7 | import { createStore, applyMiddleware } from 'redux'; 8 | 9 | 10 | import history from './history'; 11 | import Routes from './routes'; 12 | import rootReducer from './store/reducers'; 13 | import rootSaga from './store/sagas'; 14 | 15 | import './i18n'; 16 | import './assets/styles/global.scss'; 17 | 18 | 19 | console.log('%c env: ', 'color: #bada55', process.env.REACT_APP_ENV); 20 | 21 | 22 | const sagaMiddleware = createSagaMiddleware(); 23 | 24 | const store = createStore( 25 | rootReducer, 26 | composeWithDevTools( 27 | applyMiddleware(sagaMiddleware) 28 | ) 29 | ); 30 | 31 | sagaMiddleware.run(rootSaga); 32 | 33 | ReactDOM.render( 34 | 35 | 36 | }> 37 | 38 | 39 | 40 | , 41 | 
document.getElementById('root') 42 | ); 43 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | model-api: 4 | image: "deepset/covid-qa-haystack" 5 | ports: 6 | - "80:80" 7 | environment: 8 | # see backend/config.py for additional variables to configure 9 | - TEXT_FIELD_NAME=answer 10 | - SEARCH_FIELD_NAME=question 11 | - EXCLUDE_META_DATA_FIELDS=["question_emb"] 12 | # optional: use an embedding model instead of plain elasticsearch query 13 | # - EMBEDDING_FIELD_NAME=question_emb 14 | # - EMBEDDING_DIM=768 15 | # - EMBEDDING_MODEL_PATH=deepset/sentence-bert # MUST be the same as used for indexing the FAQs 16 | - USE_GPU=False 17 | command: /bin/sh -c "cd /home/user && sleep 20 && uvicorn backend.api:app --host 0.0.0.0 --port 80 --limit-concurrency 10 --workers 1" 18 | network_mode: host 19 | elastic: 20 | # use plain elasticsearch image if you want to ingest fresh data (via backend/data_ingestion.py) 21 | # use covid-qa-elastic only for dev to have some (old) docs preindexed. 22 | # image: "elasticsearch:7.5.1" 23 | image: "deepset/covid-qa-elastic" 24 | ports: 25 | - "9200:9200" 26 | - "9300:9300" 27 | environment: 28 | - discovery.type=single-node -------------------------------------------------------------------------------- /covid-frontend/src/components/UserFeedback/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../assets/styles/_variables'; 2 | @import '../../assets/styles/_mixins'; 3 | 4 | .wrapper { 5 | position: fixed; 6 | display: flex; 7 | justify-content: center; 8 | align-items: center; 9 | 10 | width: 100vw; 11 | height: 100vh; 12 | left: 0; 13 | top: 0; 14 | background-color: rgba(0, 0, 0, 0.7); 15 | z-index: 10; 16 | backdrop-filter: blur(2px); 17 | 18 | & > div { 19 | background-color: $white; 20 | @include border(); 21 | box-shadow: $base-shadow; 22 | padding: 24px 24px 24px 24px; 23 | 24 | position: absolute; 25 | width: auto; 26 | height: auto; 27 | max-height: 60vh; 28 | z-index: 2; 29 | } 30 | 31 | h2 { 32 | margin-bottom: 1em; 33 | } 34 | 35 | button { 36 | display: block; 37 | white-space: nowrap; 38 | margin: 0.5em 0; 39 | cursor: pointer; 40 | background: $warning-light2; 41 | border-radius: 12px; 42 | margin-left: 12px; 43 | outline: 1px; 44 | 45 | &:last-child { 46 | background: $accent-light; 47 | } 48 | 49 | & > i { 50 | margin-top: 3px; 51 | margin-right: 5px; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /backend/readme.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | We run two services in the backend: elasticsearch + the model API. 3 | The model API is configured via environment variables that can be passed into the docker container or set in backend/config.py 4 | 5 | # Run elasticsearch 6 | a) Fresh elasticsearch index: 7 | 8 | docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.5.1 9 | Then ingest data via `data_ingestion.py` 10 | 11 | b) Dev: 12 | 13 | docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" deepset/covid-qa-elastic 14 | 15 | This image has already some docs indexed, so you can skip `data_ingestion.py` 16 | 17 | 18 | 19 | # Run model API 20 | docker image build -t deepset/covid-qa-haystack . 
21 | docker run --net=host -e TEXT_FIELD_NAME=answer -e SEARCH_FIELD_NAME=question -e EXCLUDE_META_DATA_FIELDS='["question_emb"]' deepset/covid-qa-haystack:latest 22 | 23 | or without docker: 24 | 25 | pip install -r requirements.txt 26 | uvicorn backend.api:app 27 | 28 | # Alternative: Run both via docker-compose 29 | docker-compose up 30 | Edit `docker-compose.yml` if you want to configure the elasticsearch host, models, etc. 31 | 32 | -------------------------------------------------------------------------------- /covid-frontend/src/routes/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import { Route, Switch, Redirect, withRouter } from 'react-router-dom'; 3 | import App from 'components/App'; 4 | import Provider from 'components/Provider'; 5 | import links from 'routes/links'; 6 | import { MainTemplate } from 'components/themes'; 7 | import Home from 'components/Home'; 8 | import Answers from 'components/Answers'; 9 | import NotFound from 'components/NotFound'; 10 | 11 | class Routes extends PureComponent { 12 | render () { 13 | return ( 14 | {/* markup stripped in extraction: a Switch of Routes for Home, Answers and NotFound (plus a Redirect and one commented-out Route), wrapped in App, Provider and MainTemplate */} 37 | ); 38 | } 39 | } 40 | 41 | export default withRouter(Routes); 42 | -------------------------------------------------------------------------------- /covid-frontend/public/locales/en/translation.json: -------------------------------------------------------------------------------- 1 | { 2 | "inputs": { 3 | "question": { 4 | "label": "Your Question", 5 | "placeholder": "Ask a question about COVID-19 (corona virus)" 6 | } 7 | }, 8 | "loader": { 9 | "heading": "The BERT is working", 10 | "text": "Please Wait – Bitte warten..." 11 | }, 12 | "answer": { 13 | "meta": { 14 | "datelabel": "From", 15 | "source": "Source" 16 | }, 17 | "feedback": { 18 | "header": "Feedback" 19 | }, 20 | "no-answer": "Found no answers", 21 | "other-answers": "Other answers", 22 | "tags": { 23 | "probability": "Relevance" 24 | } 25 | }, 26 | "feedback": { 27 | "title": "Thank you for giving us feedback.", 28 | "text": "What was wrong with the answer?", 29 | "fake": "The stated facts were inaccurate or wrong.", 30 | "outdated": "The information was outdated.", 31 | "irrelevant": "The answer had nothing to do with my question.", 32 | "nothing": "Nothing."
33 | }, 34 | "languages": { 35 | "de": "German", 36 | "en": "English" 37 | }, 38 | "404": { 39 | "subtitle": "Sorry, the page you visited does not exist.", 40 | "button-text": "Back to Home" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/_reset.css: -------------------------------------------------------------------------------- 1 | /* http://meyerweb.com/eric/tools/css/reset/ 2 | v2.0 | 20110126 3 | License: none (public domain) 4 | */ 5 | 6 | html, body, div, span, applet, object, iframe, 7 | h1, h2, h3, h4, h5, h6, p, blockquote, pre, 8 | a, abbr, acronym, address, big, cite, code, 9 | del, dfn, em, img, ins, kbd, q, s, samp, 10 | small, strike, strong, sub, sup, tt, var, 11 | b, u, i, center, 12 | dl, dt, dd, ol, ul, li, 13 | fieldset, form, label, legend, 14 | table, caption, tbody, tfoot, thead, tr, th, td, 15 | article, aside, canvas, details, embed, 16 | figure, figcaption, footer, header, hgroup, 17 | menu, nav, output, ruby, section, summary, 18 | time, mark, audio, video { 19 | margin: 0; 20 | padding: 0; 21 | border: 0; 22 | font-size: 100%; 23 | font: inherit; 24 | vertical-align: baseline; 25 | } 26 | /* HTML5 display-role reset for older browsers */ 27 | article, aside, details, figcaption, figure, 28 | footer, header, hgroup, menu, nav, section { 29 | display: block; 30 | } 31 | body { 32 | line-height: 1; 33 | } 34 | ol, ul { 35 | list-style: none; 36 | } 37 | blockquote, q { 38 | quotes: none; 39 | } 40 | blockquote:before, blockquote:after, 41 | q:before, q:after { 42 | content: ''; 43 | content: none; 44 | } 45 | table { 46 | border-collapse: collapse; 47 | border-spacing: 0; 48 | } 49 | -------------------------------------------------------------------------------- /covid-frontend/public/locales/de/translation.json: -------------------------------------------------------------------------------- 1 | { 2 | "inputs": { 3 | "question": { 4 | "label": "Ihre Frage", 5 | "placeholder": "Stellen Sie eine Frage zu COVID-19 (Corona-Virus)" 6 | } 7 | }, 8 | "loader": { 9 | "heading": "The BERT is working", 10 | "text": "Please Wait – Bitte warten..." 11 | }, 12 | "answer": { 13 | "meta": { 14 | "datelabel": "Stand", 15 | "source": "Quelle" 16 | }, 17 | "feedback": { 18 | "header": "Feedback" 19 | }, 20 | "no-answer": "Keine Antworten gefunden", 21 | "other-answers": "Weitere Antworten", 22 | "tags": { 23 | "probability": "Relevanz" 24 | } 25 | }, 26 | "feedback": { 27 | "title": "Danke für Ihr Feedback!", 28 | "text": "Was war falsch mit der Antwort?", 29 | "fake": "Die Antwort war falsch oder ungenau.", 30 | "outdated": "Die Informationen waren veraltet.", 31 | "irrelevant": "Die Antwort hatte nichts mit meiner Frage zu tun.", 32 | "nothing": "Nichts." 
33 | }, 34 | "languages": { 35 | "de": "Deutsch", 36 | "en": "Englisch" 37 | }, 38 | "404": { 39 | "subtitle": "Tut uns leid, diese Seite gibt es nicht.", 40 | "button-text": "Zurück" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/core/Scholar.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.core 2 | 3 | import com.theapache64.cs.models.Feedback 4 | import com.theapache64.cs.models.rest.AddFeedbackRequest 5 | import com.theapache64.cs.models.rest.CoronaAnswer 6 | import com.theapache64.cs.models.rest.CoronaQuestion 7 | import com.theapache64.cs.utils.GsonUtil 8 | import com.theapache64.cs.utils.RestClient 9 | 10 | object Scholar { 11 | 12 | private const val BASE_URL = "https://covid-backend.deepset.ai" 13 | 14 | fun getAnswer(question: String): CoronaAnswer? { 15 | val jsonString = RestClient.post( 16 | "$BASE_URL/question/ask", 17 | null, 18 | CoronaQuestion( 19 | arrayOf(question) 20 | ) 21 | ).body!!.string() 22 | 23 | println(jsonString) 24 | 25 | return GsonUtil.gson.fromJson(jsonString, CoronaAnswer::class.java) 26 | } 27 | 28 | fun addFeedback(feedback: Feedback) { 29 | val jsonString = RestClient.post( 30 | "$BASE_URL/models/${feedback.modelId}/feedback", 31 | null, 32 | AddFeedbackRequest( 33 | feedback.feedback, 34 | feedback.question, 35 | feedback.documentId 36 | ) 37 | ).body!!.string() 38 | println("Feedback response : $jsonString") 39 | } 40 | 41 | 42 | } -------------------------------------------------------------------------------- /covid-frontend/src/components/common/InputContainer/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import cn from 'classnames'; 4 | import styles from './styles.module.scss'; 5 | 6 | class InputContainer extends PureComponent { 7 | static propTypes = { 8 | label: PropTypes.string, 9 | error: PropTypes.oneOfType([ 10 | PropTypes.bool, 11 | PropTypes.string 12 | ]), 13 | info: PropTypes.oneOfType([ 14 | PropTypes.bool, 15 | PropTypes.string 16 | ]), 17 | fluid: PropTypes.bool, 18 | className: PropTypes.string, 19 | } 20 | 21 | static defaultProps = { 22 | label: '', 23 | info: '', 24 | error: '', 25 | fluid: false, 26 | 27 | className: '' 28 | } 29 | 30 | render() { 31 | const { label, info, error, fluid, className, children } = this.props; 32 | 33 | const classes = cn( 34 | styles.container, 35 | { [styles.fluid]: fluid }, 36 | { [styles.withError]: error }, 37 | className 38 | ); 39 | 40 | return ( 41 |
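{/* markup stripped in extraction: a wrapping <div className={classes}> around the label, children, error and info elements below */}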
42 | { label && } 43 | { children } 44 | { (error && (typeof error === 'string')) && {error} } 45 | { info && {info} } 46 |
47 | ); 48 | } 49 | } 50 | 51 | export default InputContainer; 52 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/SendMessageRequest.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class SendMessageRequest( 7 | @SerializedName("chat_id") 8 | val chatId: String, // to 9 | @SerializedName("text") 10 | val text: String, // This is some message 11 | @SerializedName("disable_web_page_preview") 12 | val isDisableWebPagePreview: Boolean?, 13 | @SerializedName("parse_mode") 14 | val parseMode: String?, 15 | @SerializedName("reply_to_message_id") 16 | val replyMsgId: Long?, 17 | @SerializedName("reply_markup") 18 | val replyMarkup: ReplyMarkup? 19 | ) { 20 | data class ReplyMarkup( 21 | @SerializedName("inline_keyboard") 22 | val inlineKeyboard: List> 23 | ) 24 | 25 | data class InlineButton( 26 | @SerializedName("text") 27 | val text: String, // ✅ Relevant 28 | @SerializedName("callback_data") 29 | val callbackData: String // r123 30 | ) { 31 | class ByteOverflowException(message: String?) : Throwable(message) 32 | 33 | init { 34 | val byteSize = callbackData.toByteArray().size 35 | if (byteSize > 64) { 36 | throw ByteOverflowException( 37 | "Callback data exceeded" 38 | ) 39 | } 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /data/faqs/deduplicate_with_sentenceBert.py: -------------------------------------------------------------------------------- 1 | from haystack.retriever.elasticsearch import ElasticsearchRetriever 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | 6 | # loading questions and calculating similarities based of sentence bert embeddings 7 | df = pd.read_csv("200416_englishFAQ.csv",sep=",") 8 | if df.columns[0] != "question": 9 | df = df.iloc[:,1:] 10 | 11 | #df = pd.concat((df.loc[df.name == "CDC General FAQ"],df.loc[df.name != "CDC General FAQ"]),ignore_index=True) 12 | df = df.loc[df.name == "CDC General FAQ"] 13 | df = df.loc[df.category != "School Dismissals and Children"] 14 | 15 | df.reset_index(inplace=True,drop=True) 16 | 17 | 18 | questions = [{"text": v} for v in df.question.values] 19 | retriever = ElasticsearchRetriever(document_store=None, embedding_model="deepset/sentence_bert", gpu=False) 20 | res1 = retriever.embedding_model.extract_vectors( 21 | dicts=questions, 22 | extraction_strategy="reduce_mean", 23 | extraction_layer=-1) 24 | res1 = np.array([i["vec"] for i in res1]) 25 | sims = cosine_similarity(res1,res1) 26 | 27 | threshold = 0.85 28 | indices = [0] 29 | for i in range(1,len(questions)): 30 | if (sims[:i,i] < threshold).all(): 31 | indices.append(i) 32 | else: 33 | print(df.question[i]) 34 | idxs = np.nonzero(sims[:i,i] > threshold)[0] 35 | print(df.iloc[idxs,1]) 36 | print("newexample \n") 37 | 38 | 39 | newdf = df.iloc[indices,:] 40 | print(newdf.shape) 41 | print(df.shape) 42 | newdf.to_csv("200416_CDCGen_dedup.csv",index=True,sep=",") 43 | 44 | 45 | -------------------------------------------------------------------------------- /covid-frontend/src/components/common/Loader/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { Spin, Icon 
} from 'antd'; 4 | import cn from 'classnames'; 5 | 6 | import styles from './styles.module.scss'; 7 | 8 | class Loader extends PureComponent { 9 | static propTypes = { 10 | size: PropTypes.number, 11 | loading: PropTypes.bool, // works only when loader has children 12 | selfContained: PropTypes.bool, // works only when loader has no children 13 | fullSized: PropTypes.bool, // works only when loader has no children 14 | className: PropTypes.string, // works only when loader has no children 15 | } 16 | 17 | static defaultProps = { 18 | size: 24, 19 | selfContained: true, 20 | fullSized: false, 21 | className: '', 22 | loading: false 23 | } 24 | 25 | render() { 26 | const { children, loading, size, selfContained, fullSized, className } = this.props; 27 | 28 | const classNames = cn( 29 | { [styles.selfContained]: selfContained, 30 | [styles.fullSized]: fullSized }, 31 | className // passed as its own argument: inside the object literal, the shorthand would toggle the literal class "className" instead of appending the prop value 32 | ); 33 | 34 | if (children) { 35 | return ( 36 | {/* <Spin> markup stripped in extraction; its indicator is a loading icon sized via `size` */} 38 | spinning={loading} 39 | > 40 | { children } 41 | 42 | ); 43 | } 44 | 45 | return ( 46 | {/* markup stripped in extraction: the standalone spinner, a <div className={classNames}> around a <Spin /> */} 47 | 48 |
49 | ); 50 | } 51 | } 52 | 53 | export default Loader; 54 | -------------------------------------------------------------------------------- /covid-frontend/src/assets/styles/_variables.scss: -------------------------------------------------------------------------------- 1 | // colors 2 | $primary: #081741; 3 | $dark: #333638; 4 | 5 | $contrast: #005da9; 6 | 7 | $accent: #70b2fc; 8 | $accent-light: #e7f2ff; 9 | $accent-light2: #d2ebff; 10 | $accent-light3: #b2d8f8; 11 | 12 | $success: #009c10; 13 | $success-light: #99d7a0; 14 | $success-light2: rgba(0, 156, 16, 0.1); 15 | 16 | $warning: #ed9700; 17 | $warning-light: #ffd387; 18 | $warning-light2: #fff4e0; 19 | 20 | $strong-emotion: #be0000; 21 | $error: #be0000; 22 | $error-light: #ff9c9c; 23 | $error-light2: #ffdede; 24 | 25 | 26 | $primary-grey: #a3a9ad; 27 | $secondary-grey: #878b90; 28 | $border-grey: #d3dae0; 29 | $light-border-grey: #e4e9ed; 30 | $light-grey: #f7f8fa; 31 | 32 | $white: #ffffff; 33 | 34 | // shadows 35 | $base-shadow: 0 1px 3px 0 rgba(113, 118, 122, 0.35); 36 | $reversed-shadow: 0 -3px 5px 0 rgba(0, 0, 0, 0.1); 37 | 38 | // letter-spacing 39 | $base-spacing: 0.65px; 40 | $secondary-spacing: 1px; 41 | 42 | // radiuses 43 | $base-radius: 6px; 44 | 45 | // transitions 46 | $base-transition-speed: .3s; 47 | $fast-transition-speed: .15s; 48 | 49 | // text sizes 50 | $text: ( 51 | tiny: ( 52 | font-size: 11px, 53 | line-height: 1 54 | ), 55 | small: ( 56 | font-size: 12px, 57 | line-height: 1 58 | ), 59 | regular: ( 60 | font-size: 14px, 61 | line-height: 1 62 | ), 63 | semiBig: ( 64 | font-size: 16px, 65 | line-height: 1 66 | ), 67 | big: ( 68 | font-size: 20px, 69 | line-height: 1 70 | ), 71 | huge: ( 72 | font-size: 24px, 73 | line-height: 1 74 | ), 75 | jumbo: ( 76 | font-size: 32px, 77 | line-height: 1 78 | ) 79 | ); 80 | -------------------------------------------------------------------------------- /covid-frontend/src/store/reducers/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/activeAnswers'; 2 | 3 | const initialState = { 4 | entries: [], 5 | 6 | isLoading: false, 7 | 8 | // the state of the answer-popup 9 | userFeedbackPopup: { 10 | visible: false, 11 | answerDocumentId: null 12 | }, 13 | 14 | // a list of answers which the user has already given his feedback to 15 | feedbackGiven: {} 16 | }; 17 | 18 | export default (state = initialState, action) => { 19 | switch (action.type) { 20 | case types.SET: 21 | return { 22 | ...state, 23 | entries: action.payload 24 | }; 25 | 26 | case types.SET_LOADING_STATUS: 27 | return { 28 | ...state, 29 | isLoading: action.status 30 | }; 31 | 32 | case types.SHOW_USER_FEEDBACK_PANEL: 33 | return { 34 | ...state, 35 | userFeedbackPopup: { ...state.userFeedbackPopup, visible: true, answerDocumentId: action.payload } 36 | }; 37 | 38 | case types.HIDE_USER_FEEDBACK_PANEL: 39 | return { 40 | ...state, 41 | userFeedbackPopup: { ...initialState.userFeedbackPopup } 42 | }; 43 | 44 | case types.MARK_AS_FEEDBACK_GIVEN: 45 | return { 46 | ...state, 47 | feedbackGiven: { ...state.feedbackGiven, ...action.payload } 48 | }; 49 | 50 | case types.CLEAR_FEEDBACK_GIVEN: 51 | return { 52 | ...state, 53 | feedbackGiven: { ...initialState.feedbackGiven } 54 | }; 55 | 56 | case types.RESET: 57 | return { 58 | ...initialState 59 | }; 60 | 61 | default: 62 | return state; 63 | }; 64 | }; 65 | -------------------------------------------------------------------------------- 
/telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/TelegramUpdate.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class TelegramUpdate( 7 | @SerializedName("message") 8 | val message: Message, 9 | @SerializedName("update_id") 10 | val updateId: Int // 102073005 11 | ) { 12 | data class Message( 13 | @SerializedName("chat") 14 | val chat: Chat, 15 | @SerializedName("date") 16 | val date: Int, // 1584880886 17 | @SerializedName("from") 18 | val from: From, 19 | @SerializedName("message_id") 20 | val messageId: Long, // 8 21 | @SerializedName("text") 22 | val text: String // Dbrhrfjggkjgj nfgntnt t 23 | ) { 24 | data class Chat( 25 | @SerializedName("first_name") 26 | val firstName: String, // theapache64 27 | @SerializedName("id") 28 | val id: Int, // 240810054 29 | @SerializedName("type") 30 | val type: String, // private 31 | @SerializedName("username") 32 | val username: String // theapache64 33 | ) 34 | 35 | data class From( 36 | @SerializedName("first_name") 37 | val firstName: String, // theapache64 38 | @SerializedName("id") 39 | val id: Int, // 240810054 40 | @SerializedName("is_bot") 41 | val isBot: Boolean, // false 42 | @SerializedName("language_code") 43 | val languageCode: String, // en 44 | @SerializedName("username") 45 | val username: String // theapache64 46 | ) 47 | } 48 | } -------------------------------------------------------------------------------- /covid-frontend/src/store/reducers/globalSearch.js: -------------------------------------------------------------------------------- 1 | import * as types from 'store/types/globalSearch'; 2 | 3 | const initialState = { 4 | selectedValue: '', 5 | search: { 6 | currentString: '', 7 | lastString: '', 8 | options: [], 9 | }, 10 | isLoading: false 11 | }; 12 | 13 | export default (state = initialState, action) => { 14 | switch (action.type) { 15 | case types.SET_SELECTED_VALUE: 16 | return { 17 | ...state, 18 | selectedValue: action.payload, 19 | search: { 20 | ...state.search, 21 | currentString: action.payload 22 | } 23 | }; 24 | case types.UPDATE_SEARCH_VALUE: 25 | return { 26 | ...state, 27 | search: { 28 | ...state.search, 29 | currentString: action.payload 30 | } 31 | }; 32 | case types.UPDATE_LAST_SEARCH_VALUE: 33 | return { 34 | ...state, 35 | search: { 36 | ...state.search, 37 | lastString: action.payload 38 | } 39 | }; 40 | case types.UPDATE_SEARCH_FILTERS: 41 | return { 42 | ...state, 43 | search: { 44 | ...state.search, 45 | filters: action.payload 46 | } 47 | }; 48 | case types.UPDATE_SEARCH_OPTIONS: 49 | return { 50 | ...state, 51 | search: { 52 | ...state.search, 53 | options: action.payload 54 | } 55 | }; 56 | case types.SET_LOADING_STATUS: 57 | return { 58 | ...state, 59 | isLoading: action.status 60 | }; 61 | case types.RESET: 62 | return { 63 | ...initialState 64 | }; 65 | default: 66 | return state; 67 | }; 68 | }; 69 | -------------------------------------------------------------------------------- /covid_nlp/language/ms_translate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, requests, uuid, json 3 | import sys 4 | 5 | import pandas as pd 6 | 7 | 8 | class MSTranslator(): 9 | def __init__(self, key = None, endpoint = None, lang = None): 10 | if key: 11 | self.azure_key = key 12 | else: 13 | self.azure_key = os.environ['AZURE_TRANSLATE_KEY'] 14 | 
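# note: unlike `key`, `endpoint` has no environment-variable fallback; passing
# endpoint=None would silently build an invalid "None/translate?..." URL below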
self.azure_endpoint = endpoint 15 | self.lang = lang 16 | self.url = f"{self.azure_endpoint}/translate?api-version=3.0&to={self.lang}" 17 | self.headers = { 18 | 'Ocp-Apim-Subscription-Key': self.azure_key, 19 | 'Content-type': 'application/json', 20 | 'X-ClientTraceId': str(uuid.uuid4()) 21 | } 22 | 23 | def translate(self, text): 24 | body = [{'text': text.strip()}] 25 | request = requests.post(self.url, headers = self.headers, json = body) 26 | response = request.json() 27 | trans_text = "" 28 | if len(response) > 0: 29 | trans_text = response[0]['translations'][0]['text'] 30 | return trans_text 31 | 32 | 33 | def main(): 34 | lang = "ar" 35 | azure_endpoint = "https://api.cognitive.microsofttranslator.com/" 36 | ms_translator = MSTranslator(endpoint = azure_endpoint, lang = lang) 37 | 38 | faq_file = "../../data/faqs/faq_covidbert.csv" 39 | df = pd.read_csv(faq_file) 40 | df[f'question_{lang}'] = df.apply(lambda x: ms_translator.translate(x.question), axis=1) 41 | df[f'answer_{lang}'] = df.apply(lambda x: ms_translator.translate(x.answer), axis=1) 42 | 43 | faq_filename = os.path.basename(faq_file) 44 | df.to_csv(f"MT_{lang}_{faq_filename}") 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /covid-frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "irda", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@ant-design/icons": "latest", 7 | "antd": "^3.26.13", 8 | "chart.js": "^2.9.3", 9 | "classnames": "^2.2.6", 10 | "env-cmd": "^9.0.3", 11 | "history": "^4.9.0", 12 | "i18next": "^19.3.3", 13 | "i18next-browser-languagedetector": "^4.0.2", 14 | "i18next-xhr-backend": "^3.2.2", 15 | "less": "^2.7.2", 16 | "less-plugin-clean-css": "^1.5.1", 17 | "node-sass": "^4.13.1", 18 | "prop-types": "^15.7.2", 19 | "react": "^16.13.1", 20 | "react-chartjs-2": "^2.9.0", 21 | "react-custom-scrollbars": "^4.2.1", 22 | "react-dom": "^16.13.1", 23 | "react-i18next": "^11.3.4", 24 | "react-redux": "^6.0.0", 25 | "react-router": "^4.3.1", 26 | "react-router-dom": "^5.0.0", 27 | "react-scripts": "^3.4.1", 28 | "redux": "^4.0.5", 29 | "redux-devtools-extension": "^2.13.8", 30 | "redux-saga": "^1.1.3", 31 | "uuid": "^3.4.0" 32 | }, 33 | "scripts": { 34 | "start": "react-scripts start", 35 | "build": "react-scripts build", 36 | "build:staging": "env-cmd -f .env.staging npm run build", 37 | "build:production": "env-cmd -f .env.production npm run build", 38 | "test": "react-scripts test", 39 | "eject": "react-scripts eject", 40 | "antd-theme": "lessc --clean-css src/assets/styles/antd/antd-theme.less src/assets/styles/antd/antd.min.css" 41 | }, 42 | "eslintConfig": { 43 | "extends": "react-app" 44 | }, 45 | "browserslist": { 46 | "production": [ 47 | ">0.2%", 48 | "not dead", 49 | "not op_mini all" 50 | ], 51 | "development": [ 52 | "last 1 chrome version", 53 | "last 1 firefox version", 54 | "last 1 safari version" 55 | ] 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /backend/controller/autocomplete.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from fastapi import APIRouter 4 | from pydantic import BaseModel 5 | 6 | import langid 7 | langid.set_languages(['de', 'en']) # ISO 639-1 codes 8 | 9 | # 10 | # not a good idea to work with global variables like this. 
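# (`api.elasticsearch_client` is module-level state resolved only at request time,
# which is what keeps this import from failing; a cleaner, hypothetical alternative
# would inject the client per request, e.g. `es = Depends(lambda: api.elasticsearch_client)`)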
11 | # 12 | from backend import api 13 | 14 | DB_INDEX_AUTOCOMPLETE = "autocomplete" 15 | 16 | router = APIRouter() 17 | 18 | 19 | class Request(BaseModel): 20 | search: str 21 | 22 | 23 | def addQuestionToAutocomplete(question: str): 24 | # todo: if it already exists; we need to increment count; 25 | body = { 26 | 'phrase': question, 27 | 'count' : 1 28 | } 29 | res = api.elasticsearch_client.index(index=DB_INDEX_AUTOCOMPLETE,body=body) 30 | 31 | 32 | 33 | 34 | @router.get("/query/autocomplete") 35 | def ask(search: str): 36 | interim = api.elasticsearch_client.search(index=DB_INDEX_AUTOCOMPLETE, body= 37 | { 38 | '_source':['phrase'], 39 | 'query':{ 40 | "bool": { 41 | "must": [{ 42 | "match": { 43 | "phrase": search 44 | } 45 | }, 46 | { 47 | "exists": { 48 | "field": "count" 49 | } 50 | }] 51 | } 52 | }, 53 | 'size': 10, 54 | 'sort' :[ 55 | {'count' : {'order' : 'desc' }} 56 | ] 57 | }) 58 | 59 | resultCount = len(interim['hits']['hits']) 60 | result = [] 61 | for i in range(resultCount): 62 | result.append(interim['hits']['hits'][i]['_source']['phrase']) 63 | 64 | 65 | lang, score = langid.classify(search) 66 | 67 | return { 68 | "results":result, 69 | "language": lang 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /covid-frontend/src/store/sagas/globalSearch.js: -------------------------------------------------------------------------------- 1 | import { all, put, select, takeLatest, delay } from 'redux-saga/effects'; 2 | import { message } from 'antd'; 3 | import * as api from 'store/sagas/api'; 4 | import * as types from 'store/types/globalSearch'; 5 | import * as actions from 'store/actions/globalSearch'; 6 | 7 | 8 | export function* getOptions(value) { 9 | const { currentString, lastString } = yield select(state => state.globalSearch.search); 10 | 11 | // return and reset fields if string is empty 12 | if (!currentString.length) { 13 | yield put(actions.updateSearchOptions([])); 14 | yield put(actions.updateSearchFilters({})); 15 | yield put(actions.updateLastSearchValue('')); 16 | 17 | return; 18 | } 19 | 20 | // return if options for the string already exist 21 | if (currentString.length && lastString.startsWith(currentString) && currentString.length <= lastString.length) { 22 | return; 23 | } 24 | 25 | yield put(actions.setLoadingStatus(true)); 26 | 27 | try { 28 | yield put(actions.updateLastSearchValue(value)); 29 | yield delay(400); 30 | const data = yield api.get(`/query/autocomplete`, { search: currentString }); 31 | let i = 0; 32 | 33 | // filter duplicates 34 | let results = data.results; 35 | results = results.filter((v,i) => results.indexOf(v) === i) 36 | 37 | const searchResults = results.map(question =>{ 38 | return {question, id: i++ }; 39 | }); 40 | 41 | yield put(actions.updateSearchOptions(searchResults)); 42 | yield put(actions.updateSearchFilters({language:data.language})); 43 | 44 | } catch (error) { 45 | message.error(error.message); 46 | } 47 | yield put(actions.setLoadingStatus(false)); 48 | } 49 | 50 | export default function* () { 51 | yield all([ 52 | takeLatest(types.UPDATE_SEARCH_VALUE, ({ payload }) => getOptions(payload)), 53 | ]); 54 | } 55 | -------------------------------------------------------------------------------- /covid_nlp/eval.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.metrics import roc_auc_score, f1_score 5 | from farm.utils import MLFlowLogger 6 | 7 | 8 | def 
eval_question_similarity(y_true, y_pred, lang, model_name, params, user=None, log_to_mlflow=True, run_name="default"): 9 | # basic metrics 10 | mean_diff = np.mean(np.abs(y_true - y_pred)) 11 | roc_auc = roc_auc_score(y_true, y_pred) 12 | f1 = f1_score(y_true, y_pred.round(0)) 13 | metrics = {"roc_auc": roc_auc, "mean_abs_diff": mean_diff, "f1_score": f1} 14 | print(metrics) 15 | 16 | # log experiment results to MLFlow (visit https://public-mlflow.deepset.ai/) 17 | if log_to_mlflow: 18 | params["lang"] = lang 19 | params["model_name"] = model_name 20 | if user: 21 | params["user"] = user 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="COVID-question-sim", run_name=run_name) 25 | ml_logger.log_params(params) 26 | ml_logger.log_metrics(metrics, step=0) 27 | 28 | 29 | if __name__ == "__main__": 30 | # config 31 | eval_file = "../data/eval_question_similarity_en.csv" 32 | lang = "en" 33 | model_name = "naive_baseline" 34 | experiment_name = "naive_baseline_1" 35 | log_to_mlflow = True 36 | params = {"some_model_param": 0} 37 | 38 | # load eval data 39 | df = pd.read_csv(eval_file) 40 | 41 | # predict similarity of samples (e.g. via embeddings + cosine similarity) 42 | # here: dummy preds for naive baseline 43 | y_true = df["similar"].values 44 | y_pred = [0.5] * len(y_true) 45 | 46 | # eval & track results 47 | eval_question_similarity(y_true=y_true, y_pred=y_pred, lang=lang, model_name=model_name, 48 | params=params, user="malte", log_to_mlflow=log_to_mlflow, run_name=experiment_name) 49 | 50 | 51 | -------------------------------------------------------------------------------- /datasources/automatic/scraper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from scrapy.crawler import CrawlerProcess 4 | from datasources.automatic.testing_WHO_scraper import CovidScraper 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def scrape(url): 10 | # try to extract question and answer for each url 11 | questions, answers = "q","a" # do scraping here 12 | return questions, answers 13 | 14 | 15 | ########## TESTING CODE 16 | RESULTS = [] 17 | class Pipeline(object): 18 | def process_item(self, item, spider): 19 | df = pd.DataFrame.from_dict(item) 20 | RESULTS.append(df) 21 | 22 | def get_test_data(): 23 | # Code for getting the test set of questions and answers 24 | process = CrawlerProcess({ 25 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)', 26 | 'ITEM_PIPELINES': {'__main__.Pipeline': 1} 27 | }) 28 | process.crawl( 29 | CovidScraper) # uses the WHO manual scraper with version fixed through waybackmachine (see import above) 30 | process.start() 31 | dataframe = pd.concat(RESULTS) 32 | questions_truth = dataframe.question 33 | answers_truth = dataframe.answer 34 | return questions_truth,answers_truth 35 | ######### END TESTING CODE 36 | 37 | if __name__ == "__main__": 38 | logging.disable(logging.WARNING) 39 | questions_truth, answers_truth = get_test_data() 40 | print(questions_truth) 41 | 42 | # for the intelligent scraper, a fixed version of WHO website is used so results coming back from get_test_data can be fixed 43 | #urls = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 44 | urls = ["https://web.archive.org/web/20200331131108/https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 45 | questions_auto, answers_auto = scrape(urls) 46 | 47 | # check weather questions_truth is similar to 
questions_auto, 48 | # and answers_truth similar to answers_auto 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /covid-frontend/src/components/Home/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { bindActionCreators } from 'redux'; 5 | import { Row, Col } from 'antd'; 6 | import links from 'routes/links'; 7 | import * as actions from 'store/actions/globalSearch'; 8 | import { WrappedSearchForm as SearchForm } from './SearchForm'; 9 | import logo from 'assets/images/logo.png'; 10 | import styles from './styles.module.scss'; 11 | 12 | class Home extends PureComponent { 13 | 14 | static propTypes = { 15 | history: PropTypes.object, 16 | globalSearch: PropTypes.object, 17 | actions: PropTypes.object 18 | } 19 | 20 | handleSubmit = (value) => { 21 | this.props.actions.setSelectedValue(value); 22 | this.props.history.push(links.answers); 23 | } 24 | 25 | render() { 26 | const { currentString, options, filters } = this.props.globalSearch.search; 27 | return ( 28 |
29 | {/* markup stripped in extraction: the Home page layout -- an antd Row/Col grid with the logo image and the SearchForm wired to currentString, options, filters and handleSubmit */} 49 |
50 | ); 51 | } 52 | } 53 | 54 | export default connect( 55 | state => ({ 56 | globalSearch: state.globalSearch 57 | }), 58 | dispatch => ({ 59 | actions: bindActionCreators(actions, dispatch) 60 | }) 61 | )(Home); 62 | -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/tfidf_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import re 4 | import pickle 5 | import os 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 10 | from preprocess import Preprocessor 11 | 12 | class TfidfTrainer(): 13 | 14 | def __init__(self, instream = None): 15 | self.preprocessor = Preprocessor(instream = instream) 16 | self.feature_vectors = None 17 | self.vectorizer = None 18 | 19 | def preprocess_corpus(self, corpus = None): 20 | if corpus: 21 | pcorpus = self.preprocessor.preprocess_sp(corpus) 22 | else: 23 | pcorpus = self.preprocessor.sentencepiece_apply(self.preprocessor.corpus) 24 | return pcorpus 25 | 26 | def train_model(self, corpus): 27 | # creating vocabulary using uni-gram and bi-gram 28 | self.vectorizer = TfidfVectorizer(min_df=2, max_df=.95, ngram_range=(1, 2)) 29 | self.vectorizer.fit(corpus) # fit the vectorizer with the list of texts 30 | self.feature_vectors = self.vectorizer.transform(corpus) # list of tfidf vectors 31 | 32 | def save_model(self, prefix = "./tfidf_"): 33 | with open(f"{prefix}feature_vectors.pkl", 'wb') as outfile: 34 | pickle.dump(self.feature_vectors, outfile) 35 | 36 | with open(f"{prefix}vectorizer.pkl", 'wb') as outfile: 37 | pickle.dump(self.vectorizer, outfile) 38 | 39 | def load_model(self, prefix = "./tfidf_"): 40 | with open(f"{prefix}feature_vectors.pkl", 'rb') as infile: 41 | self.feature_vectors = pickle.load(infile) 42 | 43 | with open(f"{prefix}vectorizer.pkl", 'rb') as infile: 44 | self.vectorizer = pickle.load(infile) 45 | 46 | 47 | def main(): 48 | trainer = TfidfTrainer() 49 | corpus = trainer.preprocess_corpus() 50 | trainer.train_model(corpus) 51 | trainer.save_model() 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /backend/config.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | 4 | # Resources / Computation 5 | USE_GPU = os.getenv("USE_GPU", "True").lower() == "true" 6 | MAX_PROCESSES = int(os.getenv("MAX_PROCESSES", 4)) 7 | BATCHSIZE = int(os.getenv("BATCHSIZE", 50)) 8 | 9 | # Monitoring 10 | APM_SERVER = "http://localhost:8200" 11 | 12 | # Reader 13 | READER_MODEL_PATH = os.getenv("READER_MODEL_PATH", None) 14 | CONTEXT_WINDOW_SIZE = int(os.getenv("CONTEXT_WINDOW_SIZE", 500)) 15 | DEFAULT_TOP_K_READER = int(os.getenv("DEFAULT_TOP_K_READER", 5)) 16 | TOP_K_PER_CANDIDATE = int(os.getenv("TOP_K_PER_CANDIDATE", 3)) 17 | NO_ANS_BOOST = int(os.getenv("NO_ANS_BOOST", -10)) 18 | DOC_STRIDE = int(os.getenv("DOC_STRIDE", 128)) 19 | MAX_SEQ_LEN = int(os.getenv("MAX_SEQ_LEN", 256)) 20 | 21 | # Retriever 22 | DEFAULT_TOP_K_RETRIEVER = int(os.getenv("DEFAULT_TOP_K_RETRIEVER", 10)) 23 | EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "deepset/sentence_bert") 24 | EMBEDDING_POOLING_STRATEGY = os.getenv("EMBEDDING_POOLING_STRATEGY", "reduce_mean") 25 | EMBEDDING_EXTRACTION_LAYER = int(os.getenv("EMBEDDING_EXTRACTION_LAYER", -2)) 26 | 27 | # Database access 28 | DB_HOST = 
os.getenv("DB_HOST", "localhost") 29 | DB_USER = os.getenv("DB_USER", "") 30 | DB_PW = os.getenv("DB_PW", "") 31 | DB_INDEX = os.getenv("DB_INDEX", "document") 32 | DB_INDEX_FEEDBACK = os.getenv("DB_INDEX_FEEDBACK", "feedback") 33 | ES_CONN_SCHEME = os.getenv("ES_CONN_SCHEME", "http") 34 | TEXT_FIELD_NAME = os.getenv("TEXT_FIELD_NAME", "answer") 35 | SEARCH_FIELD_NAME = os.getenv("SEARCH_FIELD_NAME", "question") 36 | EMBEDDING_FIELD_NAME = os.getenv("EMBEDDING_FIELD_NAME", "question_emb") 37 | EMBEDDING_DIM = os.getenv("EMBEDDING_DIM", None) 38 | 39 | EXCLUDE_META_DATA_FIELDS = os.getenv("EXCLUDE_META_DATA_FIELDS", "['question_emb']") 40 | if EXCLUDE_META_DATA_FIELDS: 41 | EXCLUDE_META_DATA_FIELDS = ast.literal_eval(EXCLUDE_META_DATA_FIELDS) 42 | 43 | # SIL language detection API 44 | SIL_API_KEY=os.getenv("SIL_API_KEY", "") 45 | SIL_API_SECRET=os.getenv("SIL_API_SECRET", "") 46 | SIL_API_URL=os.getenv("SIL_API_URL", "https://langdetect.apis.sil.org/langdetect") 47 | -------------------------------------------------------------------------------- /covid-frontend/src/components/Home/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import '../../assets/styles/_variables'; 2 | @import '../../assets/styles/_mixins'; 3 | 4 | .content { 5 | align-content: center; 6 | flex-direction: column; 7 | display: flex; 8 | justify-content: center; 9 | padding: 0 0 120px; 10 | min-height: 100vh; 11 | } 12 | 13 | .form { 14 | :global(.ant-form-explain) { 15 | margin-top: 0; 16 | } 17 | 18 | :global(.ant-btn-lg) { 19 | padding: 10px 15px; 20 | } 21 | } 22 | 23 | .logoWrapper { 24 | align-items: center; 25 | justify-content: space-around; 26 | display: flex; 27 | margin: 0 16px 80px; 28 | } 29 | 30 | .logo { 31 | position: relative; 32 | width: 250px; 33 | 34 | img { 35 | display: block; 36 | height: 100%; 37 | object-fit: contain; 38 | width: 100%; 39 | } 40 | } 41 | 42 | .poweredBy { 43 | text-align: right; 44 | color: $secondary-grey; 45 | span { 46 | color:#d80808 !important; 47 | } 48 | a { 49 | text-decoration: none; 50 | 51 | &:hover { 52 | color: $strong-emotion; 53 | text-decoration: underline; 54 | } 55 | } 56 | img { 57 | width: 300px; 58 | } 59 | p { 60 | text-align: right; 61 | } 62 | div { 63 | text-align: center; 64 | margin-top: 2em; 65 | img { 66 | height: 70px; 67 | } 68 | } 69 | } 70 | 71 | 72 | .autocomplete { 73 | width: 100%; 74 | 75 | :global(.ant-select-selection) { 76 | background-color: $accent-light; 77 | border-color: $accent-light; 78 | 79 | &::-webkit-input-placeholder { /* Edge */ 80 | color: $accent; 81 | } 82 | 83 | &:-ms-input-placeholder { /* Internet Explorer 10-11 */ 84 | color: $accent; 85 | } 86 | 87 | &::placeholder { 88 | color: $accent; 89 | } 90 | } 91 | } 92 | 93 | .detectedLanguage { 94 | text-align: right; 95 | color: $primary-grey; 96 | } 97 | .projectLogo { 98 | padding-top:50px; 99 | width:100%; 100 | a,img { 101 | width:180px; 102 | margin:0 auto; 103 | display: block; 104 | } 105 | 106 | } 107 | -------------------------------------------------------------------------------- /backend/api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import uvicorn 4 | from elasticapm.contrib.starlette import make_apm_client, ElasticAPM 5 | from elasticsearch import Elasticsearch 6 | from fastapi import FastAPI, HTTPException 7 | from starlette.middleware.cors import CORSMiddleware 8 | 9 | from backend.config import DB_HOST, DB_USER, DB_PW, APM_SERVER 10 | 
from backend.controller.errors.http_error import http_error_handler 11 | from backend.controller.router import router as api_router 12 | # from backend.events.fastapi import create_start_app_handler, create_stop_app_handler 13 | 14 | logging.basicConfig(format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p") 15 | logger = logging.getLogger(__name__) 16 | logging.getLogger("elasticsearch").setLevel(logging.WARNING) 17 | 18 | elasticsearch_client = Elasticsearch( 19 | hosts=[{"host": DB_HOST}], http_auth=(DB_USER, DB_PW), scheme="http", ca_certs=False, verify_certs=False 20 | ) 21 | 22 | 23 | def get_application() -> FastAPI: 24 | application = FastAPI(title="Haystack API", debug=True, version="0.1") 25 | 26 | application.add_middleware( 27 | CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], 28 | ) 29 | apm_config = {"SERVICE_NAME": "covid-backend", "SERVER_URL": APM_SERVER, "CAPTURE_BODY": "all"} 30 | elasticapm = make_apm_client(apm_config) 31 | application.add_middleware(ElasticAPM, client=elasticapm) 32 | 33 | application.add_exception_handler(HTTPException, http_error_handler) 34 | # application.add_event_handler("startup", create_start_app_handler(application)) 35 | # application.add_event_handler("shutdown", create_stop_app_handler(application)) 36 | 37 | application.include_router(api_router) 38 | 39 | return application 40 | 41 | 42 | app = get_application() 43 | 44 | logger.info("Open http://127.0.0.1:8000/docs to see Swagger API Documentation.") 45 | logger.info( 46 | """ 47 | Or just try it out directly: curl --request POST --url 'http://127.0.0.1:8000/models/1/faq-qa' --data '{"questions": ["What are symptoms?"]}' 48 | """ 49 | ) 50 | 51 | if __name__ == "__main__": 52 | uvicorn.run(app, host="0.0.0.0", port=8000) 53 | -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/tfidf_client.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import pickle 4 | import os 5 | import json 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 10 | import pandas as pd 11 | 12 | from preprocess import Preprocessor 13 | from tfidf_train import TfidfTrainer 14 | 15 | sys.path.insert(0, "./../../") 16 | from eval import eval_question_similarity 17 | 18 | 19 | class TfidfEvaluator(): 20 | def __init__(self): 21 | self.model = TfidfTrainer(instream = "dummy") 22 | self.model.load_model() 23 | 24 | def process_string(self, mystring): 25 | corpus = self.model.preprocess_corpus([mystring]) 26 | corpus_vectors = self.model.vectorizer.transform([corpus[0]]) 27 | return corpus_vectors 28 | 29 | def find_best_matches(self, cos_list, top_n = 10): 30 | cos_list_enumerated = [ (i, cos_sim) for i, cos_sim in enumerate(cos_list) ] 31 | cos_list_enumerated.sort(key=lambda x:x[1], reverse=True) 32 | return cos_list_enumerated[:top_n] 33 | 34 | def score_string_pair(self, string1, string2): 35 | vec1 = self.process_string(string1) 36 | vec2 = self.process_string(string2) 37 | cos_sim = cosine_similarity(vec1, vec2) 38 | return cos_sim[0][0] 39 | 40 | def main(): 41 | evaluator = TfidfEvaluator() 42 | 43 | eval_file = "../../../data/eval_question_similarity_en.csv" 44 | df = pd.read_csv(eval_file) 45 | # predict similarity of samples (e.g. 
via embeddings + cosine similarity) 46 | df['pred'] = df.apply(lambda x: evaluator.score_string_pair(x.question_1, x.question_2), axis=1) 47 | y_true = df["similar"].values 48 | y_pred = df["pred"].values 49 | 50 | model_name = "tfidf_baseline" 51 | exp_name = "tfidf_cos_sim_2" 52 | params = {"sp_voc": 16000, "max_ngram": 2, "remove_stopwords": 1, 53 | "data_train": "eval, scraped", "data_sp": "eval, scraped, CORD-19.200k"} 54 | eval_question_similarity(y_true=y_true, y_pred=y_pred, lang="en", model_name=model_name, 55 | params=params, user="carmen", log_to_mlflow=True, run_name=exp_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/TelegramAPI.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.utils 2 | 3 | import com.theapache64.cs.models.rest.telegram.AnswerCallbackRequest 4 | import com.theapache64.cs.models.rest.telegram.SendChatActionRequest 5 | import com.theapache64.cs.models.rest.telegram.SendMessageRequest 6 | import com.theapache64.cs.models.rest.telegram.SendMessageResponse 7 | import java.io.IOException 8 | 9 | object TelegramAPI { 10 | 11 | private const val BASE_URL = "https://api.telegram.org" 12 | 13 | 14 | /** 15 | * To send a text with Markdown 16 | */ 17 | @Throws(IOException::class) 18 | fun sendHtmlMessage( 19 | from: String, 20 | to: String, 21 | message: String, 22 | replyMsgId: Long?, 23 | replayMarkup: SendMessageRequest.ReplyMarkup? 24 | ): SendMessageResponse { 25 | 26 | val url = "$BASE_URL/bot$from/sendMessage" 27 | 28 | val response = RestClient.post( 29 | url, 30 | null, 31 | SendMessageRequest( 32 | to, 33 | message, 34 | true, 35 | "HTML", 36 | replyMsgId, 37 | replayMarkup 38 | ) 39 | ) 40 | 41 | val respJsonString = response.body!!.string() 42 | if (response.code != 200) { 43 | throw IOException("Failed to send message '$message' -> $respJsonString") 44 | } 45 | return GsonUtil.gson.fromJson(respJsonString, SendMessageResponse::class.java) 46 | } 47 | 48 | fun answerCallbackQuery( 49 | from: String, 50 | id: String 51 | ) { 52 | val url = "$BASE_URL/bot$from/answerCallbackQuery" 53 | val resp = RestClient.post( 54 | url, 55 | null, 56 | AnswerCallbackRequest(id) 57 | ).body!!.string() 58 | 59 | } 60 | 61 | fun sendChatAction( 62 | from: String, 63 | chatId: String, 64 | action: String 65 | ) { 66 | val url = "$BASE_URL/bot$from/sendChatAction" 67 | val resp = RestClient.post( 68 | url, 69 | null, 70 | SendChatActionRequest( 71 | action, 72 | chatId 73 | ) 74 | ).body!!.string() 75 | 76 | } 77 | } -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/utils/RestClient.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.utils 2 | 3 | import okhttp3.OkHttpClient 4 | import okhttp3.Request 5 | import okhttp3.RequestBody.Companion.toRequestBody 6 | import okhttp3.Response 7 | import java.util.concurrent.TimeUnit 8 | 9 | object RestClient { 10 | 11 | fun get(url: String, headers: Map? 
= null): Response { 12 | return call("GET", url, headers, null) 13 | } 14 | 15 | private fun getNewOkHttpClient(): OkHttpClient { 16 | return OkHttpClient.Builder() 17 | .connectTimeout(30, TimeUnit.SECONDS) 18 | .readTimeout(30, TimeUnit.SECONDS) 19 | .writeTimeout(30, TimeUnit.SECONDS) 20 | .followRedirects(true) 21 | .followSslRedirects(true) 22 | .build() 23 | } 24 | 25 | private fun call(method: String, url: String, headers: Map?, body: Any?): Response { 26 | 27 | 28 | val request = Request.Builder() 29 | .url(url) 30 | 31 | if (body != null) { 32 | val json = GsonUtil.gson.toJson(body) 33 | 34 | println("$method : $url -> $json") 35 | 36 | request.addHeader("Content-Type", "application/json") 37 | request.method(method, json.toRequestBody()) 38 | } else { 39 | request.method(method, null) 40 | } 41 | 42 | if (headers != null) { 43 | for (header in headers) { 44 | request.addHeader(header.key, header.value) 45 | } 46 | } 47 | 48 | request.addHeader( 49 | "User-Agent", 50 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" 51 | ) 52 | 53 | return getNewOkHttpClient().newCall(request.build()).execute() 54 | } 55 | 56 | fun post(url: String, headers: Map?, body: Any): Response { 57 | return call( 58 | "POST", 59 | url, 60 | headers, 61 | body 62 | ) 63 | } 64 | 65 | fun put( 66 | url: String, 67 | headers: Map, 68 | body: Any 69 | ): Response { 70 | return call( 71 | "PUT", 72 | url, 73 | headers, 74 | body 75 | ) 76 | } 77 | } -------------------------------------------------------------------------------- /datasources/scrapers/RKI_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider RKI_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'rki_spyder' 11 | start_urls = ['https://www.rki.de/SharedDocs/FAQ/NCOV2019/FAQ_Liste.html'] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | for x in response.xpath('//div[@class="alt-accordion-box-box"]/@id').extract(): 30 | question_text = response.xpath(str('//*[@id="' + x + '"]/h2/text()')).extract()[0] 31 | answer_text = " ".join(response.xpath(str('//*[@id="' + x + '"]/div/p')).xpath('string()').extract()) 32 | answer_html = " ".join(response.xpath(str('//*[@id="' + x + '"]/div/p')).extract()) 33 | 34 | columns['question'].append(question_text) 35 | columns['answer'].append(answer_text) 36 | columns['answer_html'].append(answer_html) 37 | 38 | today = date.today() 39 | 40 | columns["link"] = ["https://www.rki.de/SharedDocs/FAQ/NCOV2019/FAQ_Liste.html"] * len(columns["question"]) 41 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 42 | columns["source"] = ["Robert Koch Institute (RKI)"] * len(columns["question"]) 43 | columns["category"] = [""] * len(columns["question"]) 44 | columns["country"] = ["DE"] * len(columns["question"]) 45 | columns["region"] = [""] * len(columns["question"]) 46 | columns["city"] = [""] * len(columns["question"]) 47 | columns["lang"] = ["de"] * len(columns["question"]) 48 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 49 | 50 | return columns 
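    # parse returns one dict of parallel lists (one entry per scraped FAQ question),
    # which scrapy accepts as a single item; a hypothetical way to consume it:
    #   pd.DataFrame(columns).to_csv("rki_faq.csv", index=False)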
51 | 52 | 53 | if __name__ == "__main__": 54 | process = CrawlerProcess({ 55 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 56 | }) 57 | process.crawl(CovidScraper) 58 | process.start() 59 | -------------------------------------------------------------------------------- /datasources/scrapers/GOV_pl_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider GOV_pl_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'polish_GOV_spyder' 11 | start_urls = ['https://www.gov.pl/web/koronawirus/pytania-i-odpowiedzi'] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | for x in range(0, len(response.xpath('//summary/text()').extract())): 30 | question_text = response.xpath('//summary/text()').extract()[x] 31 | answer_text = "".join(response.xpath( 32 | '//summary[text()="' + question_text + '"]/following-sibling::node()/descendant-or-self::text()').extract()) 33 | answer_html = "".join( 34 | response.xpath('//summary[text()="' + question_text + '"]/following-sibling::node()').extract()) 35 | 36 | columns['question'].append(question_text) 37 | columns['answer'].append(answer_text) 38 | columns['answer_html'].append(answer_html) 39 | 40 | today = date.today() 41 | 42 | columns["link"] = ["https://www.gov.pl/web/koronawirus/pytania-i-odpowiedzi"] * len(columns["question"]) 43 | columns["name"] = ["Pytania i odpowiedzi (COVID-19)"] * len(columns["question"]) 44 | columns["source"] = ["GOV Polska"] * len(columns["question"]) 45 | columns["category"] = [""] * len(columns["question"]) 46 | columns["country"] = ["PL"] * len(columns["question"]) 47 | columns["region"] = [""] * len(columns["question"]) 48 | columns["city"] = [""] * len(columns["question"]) 49 | columns["lang"] = ["pl"] * len(columns["question"]) 50 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 51 | 52 | return columns 53 | 54 | 55 | if __name__ == "__main__": 56 | process = CrawlerProcess({ 57 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 58 | }) 59 | process.crawl(CovidScraper) 60 | process.start() 61 | -------------------------------------------------------------------------------- /covid-frontend/src/core/api/index.js: -------------------------------------------------------------------------------- 1 | import { baseUrl } from 'core/constants/env'; 2 | 3 | class Api { 4 | token = null; 5 | headers = { 6 | 'Content-Type': 'application/json' 7 | } 8 | 9 | setAuthorization (token) { 10 | this.token = token; 11 | 12 | return this; 13 | } 14 | 15 | setHeaders (headers) { 16 | Object.getOwnPropertyNames(headers).forEach(key => { 17 | if (headers[key]) { 18 | this.headers[key] = headers[key]; 19 | } else { 20 | delete this.headers[key]; 21 | } 22 | }); 23 | 24 | return this; 25 | } 26 | 27 | get (url, query = null) { 28 | return this.call(url, 'GET', query); 29 | } 30 | 31 | post (url, query = null, body = null) { 32 | return this.call(url, 'POST', body ? query : null, body || query); 33 | } 34 | 35 | put (url, query = null, body = null) { 36 | return this.call(url, 'PUT', body ? 
query : null, body || query); 37 | } 38 | 39 | del (url, query = null) { 40 | return this.call(url, 'DELETE', query); 41 | } 42 | 43 | call (url, method, query = null, body = null) { 44 | const queryString = 45 | Object.keys(query || {}) 46 | .map(key => { 47 | let value = query[key]; 48 | 49 | if (typeof value === 'object' && value !== null) { 50 | value = JSON.stringify(value); 51 | } 52 | 53 | return `${key}=${encodeURIComponent(value)}`; 54 | }) 55 | .join('&'); 56 | 57 | let options = { 58 | method, 59 | headers: { 60 | ...this.headers, 61 | // 'Authorization': this.token 62 | } 63 | }; 64 | 65 | if (body) { 66 | options.body = body.constructor.name !== 'FormData' 67 | ? JSON.stringify(body) 68 | : body; 69 | } 70 | 71 | const urlString = `${baseUrl}${url}${queryString ? `?${queryString}` : ''}`; 72 | 73 | return fetch(urlString, options).then(response => { 74 | this.response = response; 75 | 76 | if (response.status >= 200 && response.status < 300) { 77 | return response.json(); 78 | } 79 | 80 | return response.json() 81 | .catch(() => { 82 | // if couldn't parse json 83 | throw new Error(`${response.status} - ${response.statusText}`); 84 | }) 85 | // if got a valid json response with error 86 | .then(error => { 87 | throw error; 88 | }); 89 | }); 90 | } 91 | } 92 | 93 | export default () => new Api(); 94 | -------------------------------------------------------------------------------- /covid_nlp/language/detect_language.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pycld2 as cld2 4 | import hmac 5 | from hashlib import sha1 6 | from time import time 7 | import requests 8 | 9 | class LanguageDetector(): 10 | def __init__(self, model = 'sil'): 11 | self.model = model 12 | 13 | def detect_lang_cld2(self, text): 14 | pred = cld2.detect(text)[2][0] 15 | return pred[1], float(pred[2]) 16 | 17 | def detect_lang_cld3(self, text): 18 | import cld3 # requires protobuf 19 | pred = cld3.get_language(text) 20 | return pred.language, 100*pred.probability 21 | 22 | def detect_lang_sil(self, text): 23 | algorithm = 'HMAC+SHA1' 24 | curr_time = str(int(time())) 25 | concat = curr_time+os.environ.get('SIL_API_KEY') 26 | concatB = (concat).encode('utf-8') 27 | secretB = os.environ.get('SIL_API_SECRET').encode('utf-8') 28 | h1 = hmac.new(secretB, concatB, sha1) 29 | api_sig = h1.hexdigest() 30 | params = {'api_key': os.environ.get('SIL_API_KEY'), 'api_sig': api_sig} 31 | headers = {'Content-Type': 'application/json'} 32 | r = requests.post(os.environ.get('SIL_API_URL'), json=[{"text": text}], 33 | headers=headers, params=params) 34 | return r.json()[0]['language'], 100*r.json()[0]['probability'] 35 | 36 | def detect_lang(self, text): 37 | if self.model == 'cld2': 38 | return self.detect_lang_cld2(text) 39 | if self.model == 'cld3': 40 | return self.detect_lang_cld3(text) 41 | if self.model == 'sil': 42 | return self.detect_lang_sil(text) 43 | 44 | def detect_freq_lang(self, text, n = 3): 45 | import cld3 # requires protobuf 46 | pred = cld3.get_frequent_languages(text, num_langs = n) 47 | pred_list = [ (p.language, 100*p.probability) for p in pred ] 48 | return pred_list 49 | 50 | 51 | def main(): 52 | my_text = "Was ist das Coronavirus?" 
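    # each detect_lang() variant returns a (language_code, confidence_in_percent)
    # tuple, so the German sample above should come back as roughly ('de', 99.0)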
53 | 54 | ld3 = LanguageDetector(model = 'cld3') 55 | ld3_result = ld3.detect_lang(my_text) 56 | print(f"cld3: {ld3_result}") 57 | ld3_top_results = ld3.detect_freq_lang(my_text, 4) 58 | print(f"cld3-freq: {ld3_top_results}") 59 | 60 | ld2 = LanguageDetector(model = 'cld2') 61 | ld2_result = ld2.detect_lang(my_text) 62 | print(f"cld2: {ld2_result}") 63 | 64 | ldsil = LanguageDetector(model = 'sil') 65 | ldsil_result = ldsil.detect_lang(my_text) 66 | print(f"sil: {ldsil_result}") 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /telegram-bot/gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS="-Xmx64m" 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /covid-frontend/src/components/UserFeedback/index.js: -------------------------------------------------------------------------------- 1 | import React, { PureComponent } from 'react'; 2 | import PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { bindActionCreators } from 'redux'; 5 | import { Icon } from 'antd'; 6 | import * as answersActions from 'store/actions/activeAnswers'; 7 | import styles from './styles.module.scss'; 8 | import { withTranslation } from 'react-i18next'; 9 | 10 | class UserFeedback extends PureComponent { 11 | 12 | static propTypes = { 13 | globalSearch: PropTypes.object, 14 | answersActions: PropTypes.object 15 | } 16 | 17 | closeHandler = () => { 18 | this.props.answersActions.hideUserFeedbackPanel(); 19 | } 20 | 21 | onFeedbackNegative = (feedback, event) => { 22 | event.preventDefault() 23 | 24 | this.props.answersActions.markAsWrongAnswer({ 25 | question: this.props.globalSearch, 26 | answerDocumentId: this.props.answers.userFeedbackPopup && this.props.answers.userFeedbackPopup.answerDocumentId, 27 | feedback 28 | }); 29 | 30 | this.props.answersActions.hideUserFeedbackPanel(); 31 | 32 | return false; 33 | } 34 | 35 | 36 | render() { 37 | const { t } = this.props; 38 | 39 | return ( 40 |
<div className={styles.wrapper}> {/* the JSX tags of this render block (lines 40-60) were stripped during extraction; reconstructed here, element and class names are guesses */}
41 | <Icon type="close" onClick={this.closeHandler} />
42 | <div className={styles.title}>{t('feedback.title')}</div>
43 | <div className={styles.text}>{t('feedback.text')}</div>
44 | {/* the feedback option elements originally on lines 44-59 are not recoverable; each one invoked this.onFeedbackNegative with its feedback label, e.g. <a href="#" onClick={(event) => this.onFeedbackNegative('outdated', event)}>{t('feedback.outdated')}</a> */}
60 | </div>
61 | ); 62 | } 63 | } 64 | 65 | export default connect( 66 | state => ({ 67 | globalSearch: state.globalSearch, 68 | answers: state.activeAnswers 69 | }), 70 | dispatch => ({ 71 | answersActions: bindActionCreators(answersActions, dispatch) 72 | }) 73 | )(withTranslation()(UserFeedback)); 74 | -------------------------------------------------------------------------------- /covid-frontend/src/components/Answers/styles.module.scss: -------------------------------------------------------------------------------- 1 | @import "../../assets/styles/_variables"; 2 | @import "../../assets/styles/_mixins"; 3 | 4 | .wrapper { 5 | background-color: $white; 6 | @include border(); 7 | box-shadow: $base-shadow; 8 | min-height: 680px; 9 | padding: 24px 24px 60px 24px; 10 | 11 | .autocomplete { 12 | width: 100%; 13 | 14 | :global(.ant-select-selection) { 15 | background-color: $accent-light; 16 | border-color: $accent-light; 17 | 18 | &::-webkit-input-placeholder { 19 | /* Edge */ 20 | color: $accent; 21 | } 22 | 23 | &:-ms-input-placeholder { 24 | /* Internet Explorer 10-11 */ 25 | color: $accent; 26 | } 27 | 28 | &::placeholder { 29 | color: $accent; 30 | } 31 | } 32 | } 33 | } 34 | 35 | .titleRow { 36 | margin-bottom: 48px; 37 | } 38 | 39 | .loaderContainer { 40 | padding: 60px 0; 41 | text-align: center; 42 | 43 | h2 { 44 | font-weight: 600; 45 | margin: 16px 0; 46 | } 47 | 48 | div { 49 | color: $primary-grey; 50 | font-weight: 600; 51 | @include text(semiBig); 52 | } 53 | } 54 | 55 | .topAnswerTitle { 56 | text-transform: uppercase; 57 | text-align: center; 58 | color: white; 59 | margin-bottom: 1em;; 60 | } 61 | 62 | .otherAnswersTitle { 63 | color: $primary-grey; 64 | margin: 56px 0 32px; 65 | text-transform: uppercase; 66 | letter-spacing: $secondary-spacing; 67 | @include text(tiny); 68 | } 69 | 70 | .answerTitle { 71 | font-weight: 700; 72 | margin-bottom: 16px; 73 | } 74 | 75 | .answerText { 76 | 77 | p { 78 | line-height: 1.3; 79 | max-width:800px; 80 | } 81 | 82 | span { 83 | color: $success; 84 | } 85 | } 86 | 87 | .answerMeta { 88 | padding:20px; 89 | float:right; 90 | background:#f8f8f8; 91 | margin: 8px 0; 92 | 93 | & > div { 94 | margin: 2px 0; 95 | } 96 | 97 | span { 98 | color: $primary-grey; 99 | display: inline-block; 100 | width: 80px; 101 | } 102 | } 103 | 104 | .answerDocLink { 105 | border: solid 1px; 106 | border-radius: 50%; 107 | display: inline-block; 108 | height: 22px; 109 | margin-left: 8px; 110 | text-align: center; 111 | vertical-align: middle; 112 | width: 22px; 113 | & > i { 114 | vertical-align: middle; 115 | } 116 | } 117 | 118 | .answerDocLinkPositive { 119 | @extend .answerDocLink; 120 | 121 | color: $success; 122 | } 123 | 124 | .answerDocLinkNegative { 125 | @extend .answerDocLink; 126 | 127 | color: $strong-emotion; 128 | } 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cover-photo 2 | 3 | This open source project serves two purposes. 4 | 1. Collection and evaluation of a Question Answering dataset to improve existing QA/search methods - **COVID-QA** 5 | 2. 
Question matching capabilities: Provide trustworthy answers to questions about COVID-19 via NLP - **outdated** 6 | 7 | # COVID-QA 8 | - Link to [COVID-QA Dataset](https://github.com/deepset-ai/COVID-QA/tree/master/data/question-answering/COVID-QA.json) 9 | - Accompanying paper on [OpenReview](https://openreview.net/forum?id=JENSKEEzsoU) 10 | - Annotation guidelines as [pdf](https://drive.google.com/file/d/1Wv3OIC0Z7ibHIzOm9Xw_r0gjTFmpl-33/view?usp=sharing) or [videos](https://www.youtube.com/playlist?list=PL0pJupneBHx4rkCtNmaXUs1q7SV7EjLED) 11 | - [deepset/roberta-base-squad2-covid](https://huggingface.co/deepset/roberta-base-squad2-covid), a QA model trained on COVID-QA 12 | 13 | **Update 14th April, 2020:** We are open sourcing the first batch of 14 | [SQuAD-style question answering annotations](https://github.com/deepset-ai/COVID-QA/tree/master/data/question-answering). 15 | Thanks to [Tony Reina](https://www.linkedin.com/in/skysurgery/) for managing the process and the 16 | many professional annotators who spent valuable time looking through Covid-related research papers. 17 | 18 | 19 | # FAQ matching 20 | **Update 17th June, 2020**: As the pandemic is thankfully slowing down and other information sources have caught up, we decided to take our hosted API and UI offline. We will keep the repository here as an inspiration for other projects and to share the COVID-QA dataset. 21 | 22 | ### :zap: Problem 23 | - People have many questions about COVID-19 24 | - Answers are scattered across different websites 25 | - Finding the right answers takes a lot of time 26 | - The trustworthiness of answers is hard to judge 27 | - Many answers become outdated quickly 28 | 29 | ### :bulb: Idea 30 | - Aggregate FAQs and texts from trustworthy data sources (WHO, CDC ...) 31 | - Provide a UI where people can ask questions 32 | - Use NLP to match incoming user questions with meaningful answers 33 | - Users can provide feedback about answers to improve the NLP model and flag outdated or wrong answers 34 | - Display the most common queries without good answers to guide data collection and model improvements 35 | 36 | ### :gear: Tech 37 | - Scrapers to collect data 38 | - Elasticsearch to store texts, FAQs, embeddings 39 | - NLP models implemented via [Haystack](https://github.com/deepset-ai/haystack/) to find answers by a) detecting similar questions in FAQs and b) detecting answers in free texts (extractive QA) 40 | - React Frontend 41 | 42 | -------------------------------------------------------------------------------- /datasources/scrapers/BMAS_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | from scrapy.crawler import CrawlerProcess 5 | 6 | 7 | class CovidScraper(scrapy.Spider): 8 | name = "BMAS_scraper" 9 | start_urls = ["https://www.bmas.de/DE/Presse/Meldungen/2020/corona-virus-arbeitsrechtliche-auswirkungen.html"] 10 | 11 | def parse(self, response): 12 | columns = { 13 | "question": [], 14 | "answer": [], 15 | "answer_html": [], 16 | "link": [], 17 | "name": [], 18 | "source": [], 19 | "category": [], 20 | "country": [], 21 | "region": [], 22 | "city": [], 23 | "lang": [], 24 | "last_update": [], 25 | } 26 | 27 | QUESTION_ANSWER_SELECTOR = ".panel" 28 | QUESTION_SELECTOR = ".collapsed ::text" 29 | ANSWER_SELECTOR = "./div[@id[starts-with(., 'collapse')]]" 30 | 31 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 32 | for question_answer in questions_answers: 33 | question = 
question_answer.css(QUESTION_SELECTOR).getall() 34 | question = " ".join(question).strip().replace("\xad", "") 35 | answer = question_answer.xpath(ANSWER_SELECTOR).css(" ::text").getall() 36 | answer = " ".join(answer).strip().replace("\xad", "") 37 | answer_html = question_answer.xpath(ANSWER_SELECTOR).get() 38 | 39 | # add question-answer pair to data dictionary 40 | columns["question"].append(question) 41 | columns["answer"].append(answer) 42 | columns["answer_html"].append(answer_html) 43 | 44 | today = date.today() 45 | 46 | columns["link"] = ["https://www.bmas.de/DE/Presse/Meldungen/2020/corona-virus-arbeitsrechtliche-auswirkungen.html"] * len(columns["question"]) 47 | columns["name"] = ["Arbeits- und arbeitsschutzrechtliche Fragen zum Coronavirus (SARS-CoV-2)"] * len(columns["question"]) 48 | columns["source"] = ["Bundesministerium für Arbeit und Soziales (BMAS)"] * len(columns["question"]) 49 | columns["category"] = [""] * len(columns["question"]) 50 | columns["country"] = ["DE"] * len(columns["question"]) 51 | columns["region"] = [""] * len(columns["question"]) 52 | columns["city"] = [""] * len(columns["question"]) 53 | columns["lang"] = ["de"] * len(columns["question"]) 54 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 55 | 56 | return columns 57 | 58 | 59 | if __name__ == "__main__": 60 | process = CrawlerProcess({ 61 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 62 | }) 63 | 64 | process.crawl(CovidScraper) 65 | process.start() 66 | -------------------------------------------------------------------------------- /datasources/automatic/testing_WHO_scraper.py: -------------------------------------------------------------------------------- 1 | ######### this scraper is exactly like scrapers/WHO_scraper.py, 2 | # but it uses a URL from the Wayback Machine, so the site doesn't change over time and crawling won't fail. 3 | # This is only for testing purposes. 4 | 5 | from datetime import date 6 | 7 | import scrapy 8 | 9 | 10 | class CovidScraper(scrapy.Spider): 11 | name = "WHO_scraper" 12 | #start_urls = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 13 | start_urls = ["https://web.archive.org/web/20200331131108/https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] 14 | 15 | def parse(self, response): 16 | columns = { 17 | "question": [], 18 | "answer": [], 19 | "answer_html": [], 20 | "link": [], 21 | "name": [], 22 | "source": [], 23 | "category": [], 24 | "country": [], 25 | "region": [], 26 | "city": [], 27 | "lang": [], 28 | "last_update": [], 29 | } 30 | 31 | QUESTION_ANSWER_SELECTOR = ".sf-accordion__panel" 32 | QUESTION_SELECTOR = ".sf-accordion__link::text" 33 | ANSWER_SELECTOR = ".sf-accordion__content ::text" 34 | ANSWER_HTML_SELECTOR = ".sf-accordion__content" 35 | 36 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 37 | for question_answer in questions_answers: 38 | question = question_answer.css(QUESTION_SELECTOR).getall() 39 | question = " ".join(question).strip() 40 | answer = question_answer.css(ANSWER_SELECTOR).getall() 41 | answer = " ".join(answer).strip() 42 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 43 | answer_html = " ".join(answer_html).strip() 44 | 45 | # add question-answer pair to data dictionary 46 | columns["question"].append(question) 47 | columns["answer"].append(answer) 48 | columns["answer_html"].append(answer_html) 49 | 50 | today = date.today() 51 | 52 | columns["link"] = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] * 
len(columns["question"]) 53 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 54 | columns["source"] = ["World Health Organization (WHO)"] * len(columns["question"]) 55 | columns["category"] = [""] * len(columns["question"]) 56 | columns["country"] = [""] * len(columns["question"]) 57 | columns["region"] = [""] * len(columns["question"]) 58 | columns["city"] = [""] * len(columns["question"]) 59 | columns["lang"] = ["en"] * len(columns["question"]) 60 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 61 | 62 | return columns 63 | -------------------------------------------------------------------------------- /datasources/scrapers_unused/ZEIT_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider ZEIT_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | import scrapy 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = "ZEIT_faq_scraper" 8 | start_urls = ["https://www.zeit.de/wissen/gesundheit/2020-02/coronavirus-sars-cov-2-risiko-symptome-schutz-rechte-faq"] 9 | 10 | custom_settings = { 11 | 'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 12 | } 13 | 14 | def parse(self, response): 15 | columns = { 16 | "question" : [], 17 | "answer" : [], 18 | "answer_html" : [], 19 | "link" : [], 20 | "name" : [], 21 | "source" : [], 22 | "category" : [], 23 | "country" : [], 24 | "region" : [], 25 | "city" : [], 26 | "lang" : [], 27 | "last_update" : [], 28 | } 29 | 30 | QUESTION_ANSWER_SELECTOR = "div.article-page div[itemscope]:not(div[itemprop='acceptedAnswer'])" 31 | QUESTION_SELECTOR = ".article__subheading::text" 32 | ANSWER_SELECTOR = "p.paragraph.article__item ::text" 33 | ANSWER_HTML_SELECTOR = "p.paragraph.article__item" 34 | QUESTION_LINK_ID_SELECTOR = ".article__subheading" 35 | 36 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 37 | for question_answer in questions_answers: 38 | question = question_answer.css(QUESTION_SELECTOR).getall() 39 | question = " ".join(question).strip() 40 | answer = question_answer.css(ANSWER_SELECTOR).getall() 41 | answer = " ".join(answer).replace('\n', '').replace('\xa0', '').strip() 42 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 43 | answer_html = " ".join(answer_html).strip() 44 | link_id = question_answer.css(QUESTION_LINK_ID_SELECTOR)[0].root.attrib['id'] 45 | 46 | # add question-answer pair to data dictionary 47 | columns["question"].append(question) 48 | columns["answer"].append(answer) 49 | columns["answer_html"].append(answer_html) 50 | columns["link"].append("https://www.zeit.de/wissen/gesundheit/2020-02/coronavirus-sars-cov-2-risiko-symptome-schutz-rechte-faq#" + link_id) 51 | 52 | today = date.today() 53 | 54 | columns["name"] = ["Coronavirus Sars-CoV-2: Die wichtigsten Antworten zum Corona-Ausbruch"] * len(columns["question"]) 55 | columns["source"] = ["ZEIT ONLINE GmbH"] * len(columns["question"]) 56 | columns["category"] = [""] * len(columns["question"]) 57 | columns["country"] = [""] * len(columns["question"]) 58 | columns["region"] = [""] * len(columns["question"]) 59 | columns["city"] = [""] * len(columns["question"]) 60 | columns["lang"] = ["de"] * len(columns["question"]) 61 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 62 | 63 | return columns 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /covid_nlp/modeling/tfidf/preprocess.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import os 4 | 5 | import nltk 6 | from nltk import word_tokenize 7 | from nltk.corpus import stopwords 8 | nltk.download('stopwords') 9 | nltk.download('punkt') 10 | import string 11 | 12 | import sentencepiece as spm 13 | 14 | class Preprocessor(): 15 | 16 | def __init__(self, language = 'english', instream = None): 17 | self.language = language 18 | if instream: 19 | self.corpus_orig = self.read_string(instream) 20 | else: 21 | self.corpus_orig = self.read_articles(sys.stdin) 22 | self.corpus = self.preprocess(self.corpus_orig) 23 | 24 | def preprocess(self, corpus_list): 25 | preproc_corpus_list = [] 26 | question_words = set(['how', 'what', 'which', 'when', 'where', 'who', 'why']) 27 | stopset = stopwords.words(self.language) + list(string.punctuation) 28 | stopset = list(set(stopset) - question_words) 29 | for corpus in corpus_list: 30 | corpus = corpus.lower() 31 | corpus = " ".join([ i for i in word_tokenize(corpus) if i not in stopset ]) 32 | preproc_corpus_list.append(corpus) 33 | return preproc_corpus_list 34 | 35 | def preprocess_sp(self, corpus_list): 36 | return self.sentencepiece_apply(self.preprocess(corpus_list)) 37 | 38 | def sentencepiece_train(self, corpus_list, vocab_size = 24000): 39 | fp_out = open("./sp_corpus.txt", 'w') 40 | for corpus in corpus_list: 41 | print(corpus, file=fp_out) 42 | fp_out.close() 43 | spm.SentencePieceTrainer.Train(f"--input=sp_corpus.txt --model_prefix=sp_model --vocab_size={vocab_size} --max_sentence_length=1000 --character_coverage=1.0 --num_threads=4 --hard_vocab_limit=false") 44 | return None 45 | 46 | def sentencepiece_apply(self, corpus_list): 47 | sent_corpus_list = [] 48 | sp = spm.SentencePieceProcessor() 49 | sp.Load("./sp_model.model") 50 | for corpus in corpus_list: 51 | sent_corpus_list.append(" ".join(sp.EncodeAsPieces(corpus))) 52 | return sent_corpus_list 53 | 54 | def read_articles(self, fp): 55 | articles = [] 56 | for line in fp: 57 | if line.strip() != "": 58 | articles.append(line.strip()) 59 | return articles 60 | 61 | def read_string(self, mystring): 62 | articles = [mystring] 63 | return articles 64 | 65 | 66 | def main(): 67 | vocab_size = 24000 68 | if len(sys.argv) > 1: 69 | vocab_size = sys.argv[1] 70 | print("Create Preprocessor") 71 | preprocessor = Preprocessor(language = 'english') 72 | print("Train spm") 73 | preprocessor.sentencepiece_train(preprocessor.corpus, vocab_size = vocab_size) 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /datasources/scrapers/CDC_Water_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Water_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "CDC_Travel_Scraper" 11 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/php/water.html"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | found_question = False 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # in question 34 | if 
node.attrib.get("role") == "heading": 35 | found_question = True 36 | current_question = node.css("::text").get() 37 | continue 38 | 39 | # in answer 40 | if found_question and (node.attrib.get("class") == "collapse "): 41 | current_answer = node.css(" ::text").getall() 42 | current_answer = " ".join(current_answer).strip() 43 | current_answer_html = node.getall() 44 | current_answer_html = " ".join(current_answer_html).strip() 45 | 46 | columns["question"].append(current_question) 47 | columns["answer"].append(current_answer) 48 | columns["answer_html"].append(current_answer_html) 49 | else: 50 | found_question = False 51 | 52 | today = date.today() 53 | 54 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/php/water.html"] * len(columns["question"]) 55 | columns["name"] = ["Water Transmission and COVID-19"] * len(columns["question"]) 56 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 57 | columns["category"] = [""] * len(columns["question"]) 58 | columns["country"] = ["USA"] * len(columns["question"]) 59 | columns["region"] = [""] * len(columns["question"]) 60 | columns["city"] = [""] * len(columns["question"]) 61 | columns["lang"] = ["en"] * len(columns["question"]) 62 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 63 | 64 | return columns 65 | 66 | if __name__ == "__main__": 67 | process = CrawlerProcess({ 68 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 69 | }) 70 | 71 | process.crawl(CovidScraper) 72 | process.start() 73 | -------------------------------------------------------------------------------- /backend/data_ingestion.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from haystack import Finder 3 | from haystack.database.elasticsearch import ElasticsearchDocumentStore 4 | from haystack.retriever.elasticsearch import ElasticsearchRetriever 5 | 6 | def index_new_docs(document_store, retriever): 7 | # Get dataframe with questions, answers and some metadata 8 | df = pd.read_csv("data/faqs/faq_covidbert.csv") 9 | df.fillna(value="", inplace=True) 10 | 11 | # Index to ES 12 | if document_store.get_document_count() == 0: 13 | docs_to_index = [] 14 | for idx, row in df.iterrows(): 15 | d = row.to_dict() 16 | d = {k: v.strip() for k, v in d.items()} 17 | d["document_id"] = idx 18 | # add embedding 19 | question_embedding = retriever.create_embedding(row["question"]) 20 | d["question_emb"] = question_embedding 21 | docs_to_index.append(d) 22 | print(idx) 23 | document_store.write_documents(docs_to_index) 24 | 25 | 26 | def update_embeddings(document_store, retriever): 27 | #TODO move this upstream into haystack 28 | body = { 29 | "size": 10000, 30 | "query": { 31 | "match_all": {} 32 | }, 33 | "_source": {"includes":["question"]} 34 | 35 | } 36 | results = document_store.client.search(index=document_store.index, body=body, )["hits"]["hits"] 37 | # update embedding field 38 | for r in results: 39 | question_embedding = retriever.create_embedding(r["_source"]["question"]) 40 | 41 | body = { 42 | "doc" : { 43 | "question_emb": question_embedding 44 | } 45 | } 46 | document_store.client.update(index=document_store.index, id=r["_id"], body=body) 47 | 48 | 49 | if __name__=="__main__": 50 | 51 | document_store = ElasticsearchDocumentStore( 52 | host="localhost", 53 | username="", 54 | password="", 55 | index="document", 56 | text_field="answer", 57 | embedding_field="question_emb", 58 | embedding_dim=768, 
59 | excluded_meta_data=["question_emb"], 60 | ) 61 | 62 | MODEL = "deepset/sentence_bert" 63 | GPU = False 64 | 65 | retriever = ElasticsearchRetriever(document_store=document_store, embedding_model=MODEL, gpu=GPU, 66 | emb_extraction_layer=-2, pooling_strategy="reduce_mean") 67 | 68 | # index new docs 69 | index_new_docs(document_store, retriever) 70 | 71 | # or just update embeddings 72 | # update_embeddings(document_store, retriever) 73 | 74 | # test with a query 75 | finder = Finder(reader=None, retriever=retriever) 76 | prediction = finder.get_answers_via_similar_questions(question="How high is mortality?", top_k_retriever=10) 77 | for p in prediction["answers"]: 78 | print(p["question"]) 79 | -------------------------------------------------------------------------------- /datasources/scrapers/Bundesregierung_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = "Bundesregierung_scraper" 8 | start_urls = ["https://www.bundesregierung.de/breg-de/themen/coronavirus/ausbreitung-coronavirus-1716188"] 9 | 10 | def parse(self, response): 11 | columns = { 12 | "question": [], 13 | "answer": [], 14 | "answer_html": [], 15 | "link": [], 16 | "name": [], 17 | "source": [], 18 | "category": [], 19 | "country": [], 20 | "region": [], 21 | "city": [], 22 | "lang": [], 23 | "last_update": [], 24 | } 25 | 26 | QUESTION_ELEMENT_SELECTOR = "h2.mt-3" 27 | QUESTION_SELECTOR = "::text" 28 | 29 | questions = response.css(QUESTION_ELEMENT_SELECTOR) 30 | for question_elm in questions: 31 | question = question_elm.css(QUESTION_SELECTOR).getall() 32 | question = " ".join(question).strip() 33 | 34 | # all paragraphs till the next question header are considert to be the answer 35 | following_siblings = question_elm.xpath('following-sibling::*') 36 | answer = [] 37 | answer_html = [] 38 | for elm in following_siblings: 39 | if elm.root.tag == 'p' and 'navToTop' not in elm.root.classes: 40 | answer += elm.css("::text").getall() 41 | answer_html += [elm.get()] 42 | else: 43 | break 44 | answer = "".join(answer).replace('\n', '').strip() 45 | answer_html = " ".join(answer_html).strip() 46 | 47 | # add question-answer pair to data dictionary 48 | columns["question"].append(question) 49 | columns["answer"].append(answer) 50 | columns["answer_html"].append(answer_html) 51 | 52 | today = date.today() 53 | 54 | columns["link"] = [ 55 | "https://www.bundesregierung.de/breg-de/themen/coronavirus/ausbreitung-coronavirus-1716188"] * len( 56 | columns["question"]) 57 | columns["name"] = ["Wichtige Fragen und Antworten zum Coronavirus"] * len(columns["question"]) 58 | columns["source"] = ["Presse- und Informationsamt der Bundesregierung"] * len(columns["question"]) 59 | columns["category"] = [""] * len(columns["question"]) 60 | columns["country"] = ["DE"] * len(columns["question"]) 61 | columns["region"] = [""] * len(columns["question"]) 62 | columns["city"] = [""] * len(columns["question"]) 63 | columns["lang"] = ["de"] * len(columns["question"]) 64 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 65 | 66 | return columns 67 | -------------------------------------------------------------------------------- /datasources/scrapers/UNICEF_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = 
"UNICEF_scraper" 8 | start_urls = ["https://www.unicef.org/stories/novel-coronavirus-outbreak-what-parents-should-know"] 9 | 10 | def parse(self, response): 11 | columns = { 12 | "question": [], 13 | "answer": [], 14 | "answer_html": [], 15 | "link": [], 16 | "name": [], 17 | "source": [], 18 | "category": [], 19 | "country": [], 20 | "region": [], 21 | "city": [], 22 | "lang": [], 23 | "last_update": [], 24 | } 25 | 26 | QUESTION_ANSWER_SELECTOR = ".field .field--name-field-component-text-content" 27 | QUESTION_SELECTOR = "h4::text" 28 | ANSWER_SELECTOR = "p:not(p:contains('< Back')) ::text" 29 | ANSWER_HTML_SELECTOR = "p:not(p:contains('< Back'))" 30 | 31 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 32 | for question_answer in questions_answers: 33 | question = question_answer.css(QUESTION_SELECTOR).getall() 34 | question = " ".join(question).strip() 35 | answer = question_answer.css(ANSWER_SELECTOR).getall() 36 | answer = " ".join(answer).strip() 37 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 38 | answer_html = " ".join(answer_html).strip() 39 | 40 | # if no question, answer belongs to last question. ("How can I avoid the risk of infection?") 41 | if (question == ''): 42 | columns["answer"][-1] += ' ' + answer 43 | columns["answer_html"][-1] += ' ' + answer_html 44 | continue 45 | 46 | # add question-answer pair to data dictionary 47 | columns["question"].append(question) 48 | columns["answer"].append(answer) 49 | columns["answer_html"].append(answer_html) 50 | 51 | today = date.today() 52 | 53 | columns["link"] = ["https://www.unicef.org/stories/novel-coronavirus-outbreak-what-parents-should-know"] * len( 54 | columns["question"]) 55 | columns["name"] = ["Coronavirus disease (COVID-19): What parents should know"] * len(columns["question"]) 56 | columns["source"] = ["UNICEF"] * len(columns["question"]) 57 | columns["category"] = [""] * len(columns["question"]) 58 | columns["country"] = [""] * len(columns["question"]) 59 | columns["region"] = [""] * len(columns["question"]) 60 | columns["city"] = [""] * len(columns["question"]) 61 | columns["lang"] = ["en"] * len(columns["question"]) 62 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 63 | 64 | return columns 65 | -------------------------------------------------------------------------------- /datasources/scrapers/BAUA_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | from scrapy.crawler import CrawlerProcess 5 | 6 | 7 | class CovidScraper(scrapy.Spider): 8 | name = "BAUA_scraper" 9 | start_urls = ["https://www.baua.de/DE/Themen/Arbeitsgestaltung-im-Betrieb/Biostoffe/FAQ/FAQ_node.html"] 10 | 11 | def parse(self, response): 12 | columns = { 13 | "question": [], 14 | "answer": [], 15 | "answer_html": [], 16 | "link": [], 17 | "name": [], 18 | "source": [], 19 | "category": [], 20 | "country": [], 21 | "region": [], 22 | "city": [], 23 | "lang": [], 24 | "last_update": [], 25 | } 26 | QUESTIONS_SELECTOR = "//div[@class='tabs-container']/h2[@class='heading']" 27 | QUESTION_SELECTOR = " ::text" 28 | ANSWERS_SELECTOR = "//div[@class='tabs-container']/div" 29 | ANSWER_SELECTOR = "*::text" 30 | ANSWER_HTML_SELECTOR = "*" 31 | 32 | for q in response.xpath(QUESTIONS_SELECTOR): 33 | question = q.css(QUESTION_SELECTOR).getall() 34 | question = " ".join(question).strip() 35 | 36 | columns["question"].append(question) 37 | 38 | for a in response.xpath(ANSWERS_SELECTOR): 39 | answer 
= a.css(ANSWER_SELECTOR).getall() 40 | answer = " ".join(answer).strip() 41 | answer_html = a.css(ANSWER_HTML_SELECTOR).getall() 42 | answer_html = " ".join(answer_html).strip() 43 | 44 | columns["answer"].append(answer) 45 | columns["answer_html"].append(answer_html) 46 | 47 | today = date.today() 48 | 49 | columns["link"] = [ 50 | "https://www.baua.de/DE/Themen/Arbeitsgestaltung-im-Betrieb/Biostoffe/FAQ/FAQ_node.html"] * len( 51 | columns["question"]) 52 | columns["name"] = ["Antworten auf häufig gestellte Fragen zu beruflichen Tätigkeiten mit SARS-CoV-2"] * len( 53 | columns["question"]) 54 | columns["source"] = ["Bundesanstalt für Arbeitsschutz und Arbeitsmedizin (BAuA)"] * len(columns["question"]) 55 | columns["category"] = [""] * len(columns["question"]) 56 | columns["country"] = ["DE"] * len(columns["question"]) 57 | columns["region"] = [""] * len(columns["question"]) 58 | columns["city"] = [""] * len(columns["question"]) 59 | columns["lang"] = ["de"] * len(columns["question"]) 60 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 61 | 62 | return columns 63 | 64 | 65 | if __name__ == "__main__": 66 | process = CrawlerProcess({ 67 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 68 | }) 69 | 70 | process.crawl(CovidScraper) 71 | process.start() 72 | -------------------------------------------------------------------------------- /datasources/scrapers_outdated/BZgA_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider WHO_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "BZgA_scraper" 11 | start_urls = ["https://www.infektionsschutz.de/coronavirus/faqs-coronaviruscovid-19.html"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | QUESTION_ANSWER_SELECTOR = ".c-accordion__item" 30 | QUESTION_SELECTOR = ".c-accordion__button::text" 31 | ANSWER_SELECTOR = ".c-accordion__section ::text" 32 | ANSWER_HTML_SELECTOR = ".c-text" 33 | 34 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 35 | for question_answer in questions_answers: 36 | question = question_answer.css(QUESTION_SELECTOR).getall() 37 | question = " ".join(question).strip() 38 | answer = question_answer.css(ANSWER_SELECTOR).getall() 39 | answer = "".join(answer).strip() 40 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 41 | answer_html = " ".join(answer_html).strip() 42 | 43 | # add question-answer pair to data dictionary 44 | columns["question"].append(question) 45 | columns["answer"].append(answer) 46 | columns["answer_html"].append(answer_html) 47 | 48 | today = date.today() 49 | 50 | columns["link"] = ["https://www.infektionsschutz.de/coronavirus/faqs-coronaviruscovid-19.html"] * len( 51 | columns["question"]) 52 | columns["name"] = ["FAQs Coronavirus/Covid-19"] * len(columns["question"]) 53 | columns["source"] = ["Bundeszentrale für gesundheitliche Aufklärung (BZgA)"] * len(columns["question"]) 54 | columns["category"] = [""] * len(columns["question"]) 55 | columns["country"] = ["DE"] * len(columns["question"]) 56 | columns["region"] = [""] * len(columns["question"]) 57 | columns["city"] = 
[""] * len(columns["question"]) 58 | columns["lang"] = ["de"] * len(columns["question"]) 59 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 60 | 61 | return columns 62 | 63 | 64 | if __name__ == "__main__": 65 | process = CrawlerProcess({ 66 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 67 | }) 68 | 69 | process.crawl(CovidScraper) 70 | process.start() 71 | -------------------------------------------------------------------------------- /backend/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # static files generated from Django application using `collectstatic` 142 | media 143 | static 144 | -------------------------------------------------------------------------------- /datasources/scrapers/CDC_Travel_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Travel_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "CDC_Travel_Scraper" 10 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/travelers/faqs.html"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | current_category = "" 29 | 30 | all_nodes = response.xpath("//*") 31 | for node in all_nodes: 32 | # in category 33 | if node.attrib.get("class") == "onThisPageAnchor": 34 | current_category = node.attrib.get("title") 35 | 36 | # in category 37 | if current_category: 38 | # in question 39 | if node.attrib.get("role") == "heading": 40 | current_question = node.css("::text").get() 41 | 42 | # in answer 43 | if node.attrib.get("class") == "card-body": 44 | current_answer = node.css(" ::text").getall() 45 | current_answer = " ".join(current_answer).strip() 46 | current_answer_html = node.getall() 47 | current_answer_html = " ".join(current_answer_html).strip() 48 | 49 | # add question-answer-pair to data dictionary 50 | columns["question"].append(current_question) 51 | columns["answer"].append(current_answer) 52 | columns["answer_html"].append(current_answer_html) 53 | columns["category"].append(current_category) 54 | 55 | # end of FAQ 56 | if node.attrib.get("class") == "text-right mb-2": 57 | current_category = "" 58 | 59 | today = date.today() 60 | 61 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/travelers/faqs.html"] * len(columns["question"]) 62 | columns["name"] = ["Travel: Frequently Asked Questions and Answers"] * len(columns["question"]) 63 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 64 | columns["country"] = ["USA"] * len(columns["question"]) 65 | columns["region"] = [""] * len(columns["question"]) 66 | columns["city"] = [""] * len(columns["question"]) 67 | columns["lang"] = ["en"] * len(columns["question"]) 68 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 69 | 70 | return columns 71 | -------------------------------------------------------------------------------- /datasources/scrapers/Salute_IT_scraper.py: -------------------------------------------------------------------------------- 1 
| # run 'scrapy runspider Salute_IT_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "Salute_IT_Scraper" 11 | start_urls = ["https://www.salute.gov.it/portale/nuovocoronavirus/dettaglioFaqNuovoCoronavirus.jsp?id=228"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | # extract topics 30 | for x in response.xpath('//dl'): 31 | # question is in second strong object in dt 32 | question_list = [q.strip() for q in x.xpath('./dt/strong[2]/text()').extract()] 33 | # answer is in dd 34 | answer_html_list = [] 35 | answer_list = [] 36 | for a in x.xpath('./dd')[:-1]: 37 | answer_html_list.append(' '.join([h.strip() for h in a.xpath('./descendant-or-self::*').extract()])) 38 | answer_list.append(' '.join([t.strip() for t in a.xpath('./descendant-or-self::*/text()').extract()])) 39 | if len(question_list) == len(answer_list): 40 | for question_text, answer_text, answer_html in zip(question_list, answer_list, answer_html_list): 41 | columns["question"].append(question_text) 42 | columns["answer"].append(answer_text) 43 | columns["answer_html"].append(answer_html) 44 | 45 | today = date.today() 46 | 47 | columns["link"] = [ 48 | "https://www.salute.gov.it/portale/nuovocoronavirus/dettaglioFaqNuovoCoronavirus.jsp?id=228"] * len( 49 | columns["question"]) 50 | columns["name"] = ["FAQ - Covid-19, domande e risposte"] * len(columns["question"]) 51 | columns["source"] = ["Ministero della Salute, IT"] * len(columns["question"]) 52 | columns["category"] = [""] * len(columns["question"]) 53 | columns["country"] = ["IT"] * len(columns["question"]) 54 | columns["region"] = [""] * len(columns["question"]) 55 | columns["city"] = [""] * len(columns["question"]) 56 | columns["lang"] = ["it"] * len(columns["question"]) 57 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 58 | 59 | return columns 60 | 61 | 62 | if __name__ == "__main__": 63 | process = CrawlerProcess({ 64 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 65 | }) 66 | 67 | process.crawl(CovidScraper) 68 | process.start() 69 | -------------------------------------------------------------------------------- /covid-frontend/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: node:lts 2 | services: 3 | - docker:18.09-dind 4 | 5 | cache: 6 | key: ${CI_COMMIT_REF_SLUG} 7 | paths: 8 | - node_modules/ 9 | 10 | stages: 11 | - build 12 | - nginx 13 | - staging 14 | - production 15 | 16 | variables: 17 | DOCKER_HOST: tcp://localhost:2375 18 | 19 | build_app:staging: 20 | stage: build 21 | script: 22 | - npm install 23 | - npm run build:staging 24 | artifacts: 25 | paths: 26 | - build/ 27 | expire_in: 1 day 28 | except: 29 | - master 30 | 31 | build_app:production: 32 | stage: build 33 | script: 34 | - npm install 35 | - npm run build:production 36 | artifacts: 37 | paths: 38 | - build/ 39 | expire_in: 1 day 40 | only: 41 | - master 42 | 43 | build_nginx:staging: 44 | stage: nginx 45 | image: docker:18.09-dind 46 | cache: {} 47 | script: 48 | - docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN $CI_REGISTRY 49 | - docker build --pull -t $CI_REGISTRY_IMAGE:$CI_BUILD_REF . 
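# the image is tagged with the commit ref, so the staging/production
# deploy jobs below can patch the Deployment to this exact build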
50 | - docker push $CI_REGISTRY_IMAGE:$CI_BUILD_REF 51 | dependencies: 52 | - build_app:staging 53 | only: 54 | - develop 55 | 56 | build_nginx:production: 57 | stage: nginx 58 | image: docker:18.09-dind 59 | cache: {} 60 | script: 61 | - docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN $CI_REGISTRY 62 | - docker build --pull -t $CI_REGISTRY_IMAGE:$CI_BUILD_REF . 63 | - docker push $CI_REGISTRY_IMAGE:$CI_BUILD_REF 64 | dependencies: 65 | - build_app:production 66 | only: 67 | - master 68 | 69 | .kubectl: &kubectl 70 | - echo "$CERTIFICATE_AUTHORITY_DATA" > ca-auth.pem 71 | - kubectl config set-cluster deepannotate-eks --server="$SERVER" 72 | - kubectl config set-cluster deepannotate-eks --embed-certs --certificate-authority=ca-auth.pem 73 | - kubectl config set-credentials gitlab --token="$USER_TOKEN" 74 | - kubectl config set-context default --cluster=deepannotate-eks --user=gitlab 75 | - kubectl config use-context default 76 | 77 | .staging_deploy: &staging_deploy 78 | image: dtzar/helm-kubectl 79 | cache: {} 80 | stage: staging 81 | before_script: *kubectl 82 | script: 83 | - kubectl patch deployment covid-frontend -n covid-staging -p '{"spec":{"template":{"spec":{"containers":[{"name":"'"covid-frontend"'","image":"'"$CI_REGISTRY_IMAGE:$CI_BUILD_REF"'"}]}}}}' 84 | 85 | deploy to staging: 86 | <<: *staging_deploy 87 | only: 88 | - develop 89 | 90 | .production_deploy: &production_deploy 91 | image: dtzar/helm-kubectl 92 | cache: {} 93 | stage: production 94 | before_script: *kubectl 95 | script: 96 | - kubectl patch deployment covid-frontend -n covid-production -p '{"spec":{"template":{"spec":{"containers":[{"name":"'"covid-frontend"'","image":"'"$CI_REGISTRY_IMAGE:$CI_BUILD_REF"'"}]}}}}' 97 | 98 | deploy to production: 99 | <<: *production_deploy 100 | only: 101 | - master 102 | -------------------------------------------------------------------------------- /datasources/scrapers/BMWI_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider BMWI_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | import re 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'bmwi_spyder' 11 | start_urls = ['https://www.bmwi.de/Redaktion/DE/FAQ/Coronavirus/faq-coronavirus.html'] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | categoryName = "" 30 | question = "" 31 | for elementPath in response.xpath('//div[@class="content"]/div/child::node()'): 32 | tagName = elementPath.xpath('name()').get() 33 | if tagName == 'h2': 34 | categoryName = ' '.join(elementPath.xpath('.//text()').getall()).strip() 35 | if len(categoryName) == 0: 36 | continue 37 | if tagName == 'div': 38 | question = ' '.join(elementPath.xpath('.//h2//text()').getall()).strip() 39 | response = '' 40 | responsePath = elementPath.xpath('.//div[@class="accordion-body collapse"]//div[@class="rich-text"]') 41 | for path in responsePath.xpath('.//p|.//ul/li'): 42 | response += '\n\n' + ' '.join(path.xpath('.//text()').getall()) 43 | response = re.sub('\(Stand[^)]*\)', '', response).strip() 44 | columns['category'].append(categoryName) 45 | columns['question'].append(question) 46 | columns['answer'].append(response) 47 | 
columns['answer_html'].append(responsePath.get()) 48 | 49 | today = date.today() 50 | 51 | columns["link"] = ["https://www.bmwi.de/Redaktion/DE/FAQ/Coronavirus/faq-coronavirus.html"] * len(columns["question"]) 52 | columns["name"] = ["Coronavirus: Antworten auf häufig gestellte Fragen"] * len(columns["question"]) 53 | columns["source"] = ["Bundesministerium für Wirtschaft und Energie"] * len(columns["question"]) 54 | columns["country"] = ["DE"] * len(columns["question"]) 55 | columns["region"] = [""] * len(columns["question"]) 56 | columns["city"] = [""] * len(columns["question"]) 57 | columns["lang"] = ["de"] * len(columns["question"]) 58 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 59 | 60 | return columns 61 | 62 | 63 | if __name__ == "__main__": 64 | process = CrawlerProcess({ 65 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 66 | }) 67 | process.crawl(CovidScraper) 68 | process.start() 69 | -------------------------------------------------------------------------------- /datasources/scrapers_outdated/CDC_Individuals_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Individuals_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "CDC_Individuals_Scraper" 10 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/faq.html"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | current_category = "" 29 | 30 | all_nodes = response.xpath("//*") 31 | for node in all_nodes: 32 | # in category 33 | if node.attrib.get("class") == "card-header h4 bg-amber-t": 34 | current_category = node.css("::text").get() 35 | continue 36 | 37 | # in category 38 | if current_category: 39 | # in question 40 | if node.attrib.get("role") == "heading": 41 | current_question = node.css("::text").get() 42 | 43 | # in answer 44 | if node.attrib.get("class") == "card-body bg-gray-l3": 45 | current_answer = node.css(" ::text").getall() 46 | current_answer = " ".join(current_answer).strip() 47 | current_answer_html = node.getall() 48 | current_answer_html = " ".join(current_answer_html).strip() 49 | 50 | # add question-answer-pair to data dictionary 51 | columns["question"].append(current_question) 52 | columns["answer"].append(current_answer) 53 | columns["answer_html"].append(current_answer_html) 54 | columns["category"].append(current_category) 55 | 56 | # end of category 57 | if node.attrib.get("class") == "row": 58 | current_category = "" 59 | 60 | today = date.today() 61 | 62 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/faq.html"] * len(columns["question"]) 63 | columns["name"] = ["FAQs for Individuals and Families"] * len(columns["question"]) 64 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 65 | columns["country"] = ["USA"] * len(columns["question"]) 66 | columns["region"] = [""] * len(columns["question"]) 67 | columns["city"] = [""] * len(columns["question"]) 68 | columns["lang"] = ["en"] * len(columns["question"]) 69 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 70 | 71 | return columns 72 | 
-------------------------------------------------------------------------------- /covid_nlp/modeling/transformer/eval_pretrained_haystack.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.metrics import roc_auc_score 5 | from farm.utils import MLFlowLogger 6 | from haystack.retriever.elasticsearch import ElasticsearchRetriever 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | from covid_nlp.eval import eval_question_similarity 9 | 10 | def eval_pretrained_transformers(eval_file, lang, models, pooling_methods, extraction_layers): 11 | for model_name in models: 12 | for pooling_method in pooling_methods: 13 | for extraction_layer in extraction_layers: 14 | experiment_name = model_name 15 | log_to_mlflow = True 16 | params = {"pooling_method": pooling_method, 17 | "extraction_layer": extraction_layer} 18 | 19 | # load eval data 20 | df = pd.read_csv(eval_file) 21 | # predict similarity of samples (e.g. via embeddings + cosine similarity) 22 | # here: dummy preds for naive baseline 23 | y_true = df["similar"].values 24 | retriever = ElasticsearchRetriever(document_store=None, embedding_model=model_name, gpu=True) 25 | questions_1 = [{"text": v} for k, v in df["question_1"].to_dict().items()] 26 | questions_2 = [{"text": v} for k, v in df["question_2"].to_dict().items()] 27 | 28 | res1 = retriever.embedding_model.extract_vectors(dicts=questions_1, 29 | extraction_strategy=params["pooling_method"], 30 | extraction_layer=params["extraction_layer"]) 31 | 32 | res2 = retriever.embedding_model.extract_vectors(dicts=questions_2, 33 | extraction_strategy=params["pooling_method"], 34 | extraction_layer=params["extraction_layer"]) 35 | res1 = np.array([i["vec"] for i in res1]) 36 | res2 = np.array([i["vec"] for i in res2]) 37 | 38 | df["pred"] = np.diag(cosine_similarity(res1, res2)) 39 | 40 | # eval & track results 41 | eval_question_similarity(y_true=y_true, y_pred=df["pred"].values, lang=lang, model_name=model_name, 42 | params=params, user="malte", log_to_mlflow=log_to_mlflow, run_name=experiment_name) 43 | 44 | if __name__ == "__main__": 45 | eval_file = "../data/eval_question_similarity_en.csv" 46 | lang = "en" 47 | # models = ["deepset/sentence_bert","bert-base-uncased", "DeepPavlov/bert-base-multilingual-cased-sentence"] 48 | models = ["deepset/quora_dedup_bert_base"] 49 | pooling_methods = ["reduce_mean","cls_token","reduce_max"] 50 | extraction_layers = [-1, -2] 51 | eval_pretrained_transformers(eval_file, lang, models, pooling_methods, extraction_layers) 52 | -------------------------------------------------------------------------------- /datasources/scrapers/BVF_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | from scrapy.crawler import CrawlerProcess 5 | 6 | 7 | class CovidScraper(scrapy.Spider): 8 | name = "BVF_scraper" 9 | start_urls = [ 10 | "https://www.bvf.de/aktuelles/fachliche-meldungen/artikel/news/faq-fuer-schwangere-frauen-und-ihre-familien-zu-spezifischen-risiken-der-covid-19-virusinfektion/"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | QUESTIONS_SELECTOR = ".news-text-wrap h3::text" 28 | ANSWER_SELECTOR = " 
::text" 29 | ANSWER_HTML_SELECTOR = "*" 30 | 31 | for q in response.css(QUESTIONS_SELECTOR): 32 | question = q.get() 33 | answer = "" 34 | answer_html = "" 35 | for selector in response.xpath("//div/h3[contains(text(), '" + question + "')]/following-sibling::*"): 36 | if "h3" in selector.get(): 37 | break 38 | else: 39 | answer += " ".join(selector.css(ANSWER_SELECTOR).getall()).strip() + "\n" 40 | answer_html += " ".join(selector.css(ANSWER_HTML_SELECTOR).getall()).strip() 41 | 42 | columns['question'].append(question) 43 | columns['answer'].append(answer) 44 | columns['answer_html'].append(answer_html) 45 | 46 | today = date.today() 47 | 48 | columns["link"] = [ 49 | "https://www.bvf.de/aktuelles/fachliche-meldungen/artikel/news/faq-fuer-schwangere-frauen-und-ihre-familien-zu-spezifischen-risiken-der-covid-19-virusinfektion/"] * len( 50 | columns["question"]) 51 | columns["name"] = [ 52 | "FAQ für schwangere Frauen und ihre Familien zu spezifischen Risiken der COVID-19-Virusinfektion"] * len( 53 | columns["question"]) 54 | columns["source"] = ["Berufsverband der Frauenärzte (BvF)"] * len(columns["question"]) 55 | columns["category"] = [""] * len(columns["question"]) 56 | columns["country"] = ["DE"] * len(columns["question"]) 57 | columns["region"] = [""] * len(columns["question"]) 58 | columns["city"] = [""] * len(columns["question"]) 59 | columns["lang"] = ["de"] * len(columns["question"]) 60 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 61 | 62 | return columns 63 | 64 | 65 | if __name__ == "__main__": 66 | process = CrawlerProcess({ 67 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 68 | }) 69 | 70 | process.crawl(CovidScraper) 71 | process.start() 72 | -------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/TelegramCallbackQuery.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class TelegramCallbackQuery( 7 | @SerializedName("callback_query") 8 | val callbackQuery: CallbackQuery, 9 | @SerializedName("update_id") 10 | val updateId: Int // 996097080 11 | ) { 12 | data class CallbackQuery( 13 | @SerializedName("chat_instance") 14 | val chatInstance: String, // -4027463488092007398 15 | @SerializedName("data") 16 | val `data`: String, // r123 17 | @SerializedName("from") 18 | val from: From, 19 | @SerializedName("id") 20 | val id: String, // 1034271309301426903 21 | @SerializedName("message") 22 | val message: Message 23 | ) { 24 | data class From( 25 | @SerializedName("first_name") 26 | val firstName: String, // theapache64 27 | @SerializedName("id") 28 | val id: Int, // 240810054 29 | @SerializedName("is_bot") 30 | val isBot: Boolean, // false 31 | @SerializedName("language_code") 32 | val languageCode: String, // en 33 | @SerializedName("username") 34 | val username: String // theapache64 35 | ) 36 | 37 | data class Message( 38 | @SerializedName("chat") 39 | val chat: Chat, 40 | @SerializedName("date") 41 | val date: Int, // 1584998447 42 | @SerializedName("from") 43 | val from: From, 44 | @SerializedName("message_id") 45 | val messageId: Long, // 61 46 | @SerializedName("reply_markup") 47 | val replyMarkup: ReplyMarkup, 48 | @SerializedName("text") 49 | val text: String // Was it helpful? 
-------------------------------------------------------------------------------- /telegram-bot/src/main/kotlin/com/theapache64/cs/models/rest/telegram/TelegramCallbackQuery.kt: -------------------------------------------------------------------------------- 1 | package com.theapache64.cs.models.rest.telegram 2 | 3 | import com.google.gson.annotations.SerializedName 4 | 5 | 6 | data class TelegramCallbackQuery( 7 | @SerializedName("callback_query") 8 | val callbackQuery: CallbackQuery, 9 | @SerializedName("update_id") 10 | val updateId: Int // 996097080 11 | ) { 12 | data class CallbackQuery( 13 | @SerializedName("chat_instance") 14 | val chatInstance: String, // -4027463488092007398 15 | @SerializedName("data") 16 | val `data`: String, // r123 17 | @SerializedName("from") 18 | val from: From, 19 | @SerializedName("id") 20 | val id: String, // 1034271309301426903 21 | @SerializedName("message") 22 | val message: Message 23 | ) { 24 | data class From( 25 | @SerializedName("first_name") 26 | val firstName: String, // theapache64 27 | @SerializedName("id") 28 | val id: Int, // 240810054 29 | @SerializedName("is_bot") 30 | val isBot: Boolean, // false 31 | @SerializedName("language_code") 32 | val languageCode: String, // en 33 | @SerializedName("username") 34 | val username: String // theapache64 35 | ) 36 | 37 | data class Message( 38 | @SerializedName("chat") 39 | val chat: Chat, 40 | @SerializedName("date") 41 | val date: Int, // 1584998447 42 | @SerializedName("from") 43 | val from: From, 44 | @SerializedName("message_id") 45 | val messageId: Long, // 61 46 | @SerializedName("reply_markup") 47 | val replyMarkup: ReplyMarkup, 48 | @SerializedName("text") 49 | val text: String // Was it helpful? 😊 50 | ) { 51 | data class Chat( 52 | @SerializedName("first_name") 53 | val firstName: String, // theapache64 54 | @SerializedName("id") 55 | val id: Int, // 240810054 56 | @SerializedName("type") 57 | val type: String, // private 58 | @SerializedName("username") 59 | val username: String // theapache64 60 | ) 61 | 62 | data class From( 63 | @SerializedName("first_name") 64 | val firstName: String, // Corona Scholar - Dev 65 | @SerializedName("id") 66 | val id: Int, // 1119620721 67 | @SerializedName("is_bot") 68 | val isBot: Boolean, // true 69 | @SerializedName("username") 70 | val username: String // corona_scholar_dev_bot 71 | ) 72 | 73 | data class ReplyMarkup( 74 | @SerializedName("inline_keyboard") 75 | val inlineKeyboard: List<List<Any>> // rows of inline keyboard buttons; the type argument is an assumption, element type not modelled here 76 | ) 77 | } 78 | } 79 | } -------------------------------------------------------------------------------- /datasources/scrapers/Arbeitsagentur_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import scrapy 3 | import pandas as pd 4 | 5 | class CovidScraper(scrapy.Spider): 6 | name = "Arbeitsagentur_Scraper" 7 | start_urls = ["https://www.arbeitsagentur.de/corona-faq"] 8 | 9 | def parse(self, response): 10 | columns = { 11 | "question" : [], 12 | "answer" : [], 13 | "answer_html" : [], 14 | "link" : [], 15 | "name" : [], 16 | "source" : [], 17 | "category" : [], 18 | "country" : [], 19 | "region" : [], 20 | "city" : [], 21 | "lang" : [], 22 | "last_update" : [], 23 | } 24 | 25 | current_category = "" 26 | current_question = "" 27 | current_answer = "" 28 | current_answer_html = "" 29 | ba_content_article_count = 0 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | if node.attrib.get("class") == "ba-content-row": 34 | ba_content_article_count += 1 35 | # end of FAQ 36 | if ba_content_article_count == 4: 37 | break 38 | 39 | # in question 40 | if node.attrib.get("class") == "collapsed": 41 | # save previous question-answer pair 42 | if current_question: 43 | columns["question"].append(current_question) 44 | columns["answer"].append(current_answer) 45 | columns["answer_html"].append(current_answer_html) 46 | current_question = node.css("::text").get().strip() 47 | continue 48 | 49 | # in answer 50 | if node.attrib.get("class") == "ba-copytext": 51 | current_answer = node.css(" ::text").getall() 52 | current_answer = " ".join(current_answer).strip() 53 | current_answer_html = node.getall() 54 | current_answer_html = " ".join(current_answer_html).strip() 55 | continue 56 | 57 | 58 | 59 | columns["question"].append(current_question) 60 | columns["answer"].append(current_answer) 61 | columns["answer_html"].append(current_answer_html) 62 | 63 | today = date.today() 64 | 65 | columns["link"] = ["https://www.arbeitsagentur.de/corona-faq"] * len(columns["question"]) 66 | columns["name"] = ["FAQ: Corona-Virus"] * len(columns["question"]) 67 | columns["source"] = ["Bundesagentur für Arbeit"] * len(columns["question"]) 68 | columns["category"] = [""] * len(columns["question"]) 69 | columns["country"] = ["DE"] * len(columns["question"]) 70 | columns["region"] = [""] * len(columns["question"]) 71 | columns["city"] = [""] * len(columns["question"]) 72 | columns["lang"] = ["de"] * len(columns["question"]) 73 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 74 | 75 | return columns 76 |
-------------------------------------------------------------------------------- /datasources/scrapers_outdated/CDC_Children_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_Children_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | import scrapy 5 | from scrapy.crawler import CrawlerProcess 6 | import pandas as pd 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "CDC_Children_Scraper" 10 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/children-faq.html"] 11 | 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | found_p = False 29 | found_question = False 30 | current_answer = "" 31 | 32 | categoryPaths = response.xpath('//div[@class="syndicate"]/div[@class="row "]') 33 | for catPath in categoryPaths: 34 | categoryName = catPath.xpath('.//h2/text()').getall() 35 | if len(categoryName) == 0: 36 | continue 37 | categoryName = categoryName[0] 38 | qnaPaths = catPath.xpath('.//div[@role="tablist"]//div[@class="card"]') 39 | for qnaPath in qnaPaths: 40 | question = qnaPath.xpath('.//span[@role="heading"]/text()').get() 41 | responseParagraphPaths = qnaPath.xpath('.//div[@class="card-body"]') 42 | answer = ""  # a fresh name, so the scrapy response object is not shadowed 43 | for respParaPath in responseParagraphPaths: 44 | answer += " ".join(respParaPath.xpath('.//text()').getall()) + "\n\n" 45 | answer = answer.strip() 46 | columns["question"].append(question) 47 | columns["answer"].append(answer) 48 | columns["answer_html"].append(" ".join(responseParagraphPaths.getall())) 49 | 50 | today = date.today() 51 | 52 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/prepare/children-faq.html"] * len( 53 | columns["question"]) 54 | columns["name"] = ["Coronavirus Disease-2019 (COVID-19) and Children"] * len(columns["question"]) 55 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 56 | columns["category"] = ["Children"] * len(columns["question"]) 57 | columns["country"] = ["USA"] * len(columns["question"]) 58 | columns["region"] = [""] * len(columns["question"]) 59 | columns["city"] = [""] * len(columns["question"]) 60 | columns["lang"] = ["en"] * len(columns["question"]) 61 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 62 | 63 | return columns 64 | 65 | if __name__ == "__main__": 66 | process = CrawlerProcess({ 67 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 68 | }) 69 | 70 | process.crawl(CovidScraper) 71 | process.start() 72 |
-------------------------------------------------------------------------------- /covid-frontend/src/store/sagas/activeAnswers.js: -------------------------------------------------------------------------------- 1 | import { all, put, select, takeLatest } from 'redux-saga/effects'; 2 | import { message } from 'antd'; 3 | import * as api from 'store/sagas/api'; 4 | import * as globalSearchTypes from 'store/types/globalSearch'; 5 | import * as types from 'store/types/activeAnswers'; 6 | import * as actions from 'store/actions/activeAnswers'; 7 | const MODEL_ID = 1; 8 | 9 | export function* get() { 10 | const { selectedValue } = yield select(state => state.globalSearch); 11 | 12 | // reset active answers and return if no question is selected 13 | if (!selectedValue) { 14 | yield put(actions.set([])); 15 | 16 | return; 17 | } 18 | 19 | yield put(actions.setLoadingStatus(true)); 20 | try { 21 | const question = selectedValue; 22 | 23 | const query = { 24 | questions: [ question ], 25 | top_k_retriever: 5, 26 | }; 27 | 28 | const data = yield api.post(`/question/ask`, null, query); 29 | 30 | const answers = data.results[0].answers; 31 | yield put(actions.set(answers)); 32 | 33 | // reset the feedbackGiven on each search 34 | yield put(actions.clearFeedbackGiven()); 35 | 36 | } catch (error) { 37 | message.error(error.message); 38 | } 39 | yield put(actions.setLoadingStatus(false)); 40 | } 41 | 42 | export function* markAsCorrectAnswer({ question, answerDocumentId }) { 43 | if (!question.selectedValue || answerDocumentId <= 0) { 44 | // do nothing 45 | return; 46 | } 47 | const id = parseInt(answerDocumentId, 10); 48 | try { 49 | const requestbody = { 50 | question: question.selectedValue, 51 | answer: '', 52 | feedback: 'relevant', 53 | document_id: id 54 | }; 55 | yield api.post(`/models/${MODEL_ID}/feedback`, null, requestbody); 56 | } catch (error) { 57 | message.error(error.message); 58 | } 59 | 60 | yield put(actions.markAsFeedbackGiven({ [answerDocumentId]: 'relevant' })); 61 | message.success('Thanks for giving us feedback.'); 62 | } 63 | 64 | export function* markAsWrongAnswer({ question, answerDocumentId, feedback }) { 65 | if (!question.selectedValue || answerDocumentId <= 0) { 66 | // do nothing 67 | return; 68 | } 69 | try { 70 | const id = parseInt(answerDocumentId, 10); 71 | 72 | const requestbody = { 73 | question: question.selectedValue, 74 | answer: '', 75 | feedback, 76 | document_id: id 77 | }; 78 | yield api.post(`/models/${MODEL_ID}/feedback`, null, requestbody); 79 | 80 | } catch (error) { 81 | message.error(error.message); 82 | } 83 | 84 | yield put(actions.markAsFeedbackGiven({ [answerDocumentId]: feedback })); 85 | 86 | // the popup already said 'thank you' 87 | // message.success('Thanks for giving us feedback.') 88 | } 89 | 90 | export default function* () { 91 | yield all([ 92 | takeLatest([types.GET, globalSearchTypes.SET_SELECTED_VALUE], get), 93 | takeLatest([types.MARK_AS_CORRECT_ANSWER], ({ payload }) => markAsCorrectAnswer(payload)), 94 | takeLatest([types.MARK_AS_WRONG_ANSWER], ({ payload }) => markAsWrongAnswer(payload)), 95 | ]); 96 | } 97 |
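For reference, the feedback request the two sagas above send can be reproduced outside the frontend. A minimal sketch with Python requests (the host is an assumption; the path, model id 1 and the body shape come from the saga):

import requests

API = "http://localhost:8000"  # assumed backend host
payload = {
    "question": "How does COVID-19 spread?",
    "answer": "",
    "feedback": "relevant",  # markAsWrongAnswer sends its own feedback value here
    "document_id": 123,
}
resp = requests.post(f"{API}/models/1/feedback", json=payload, timeout=10)
resp.raise_for_status()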
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | #model_name = 'bert-base-nli-stsb-mean-tokens' 23 | model_name = "../saved_models" 24 | train_batch_size = 32 25 | num_epochs = 4 26 | model_save_path = 'output/quora_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 27 | sts_reader = STSDataReader('../data/quora', normalize_scores=True, s1_col_idx=4, s2_col_idx=5, score_col_idx=6, max_score=1) 28 | 29 | # Load a pre-trained sentence transformer model 30 | model = SentenceTransformer(model_name) 31 | 32 | # Convert the dataset to a DataLoader ready for training 33 | logging.info("Read Quora train dataset") 34 | train_data = SentencesDataset(sts_reader.get_examples('train.csv'), model) 35 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 36 | train_loss = losses.CosineSimilarityLoss(model=model) 37 | 38 | 39 | logging.info("Read Quora dev dataset") 40 | dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.csv'), model=model) 41 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 42 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 43 | 44 | 45 | # Configure the training. 
46 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) # 10% of the total training steps are used for warm-up 47 | logging.info("Warmup-steps: {}".format(warmup_steps)) 48 | 49 | 50 | # Train the model 51 | model.fit(train_objectives=[(train_dataloader, train_loss)], 52 | evaluator=evaluator, 53 | epochs=num_epochs, 54 | evaluation_steps=1000, 55 | warmup_steps=warmup_steps, 56 | output_path=model_save_path) 57 | 58 | 59 | ############################################################################## 60 | # 61 | # Load the stored model and evaluate its performance on STS benchmark dataset 62 | # 63 | ############################################################################## 64 | # 65 | # model = SentenceTransformer(model_save_path) 66 | # test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 67 | # test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 68 | # evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 69 | # model.evaluate(evaluator) 70 |
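The warm-up formula above equals 10% of the total optimizer steps (steps per epoch times epochs). A worked example with an assumed corpus size:

import math

train_examples = 100_000  # assumed size, for illustration only
num_epochs, batch_size = 4, 32

total_steps = math.ceil(train_examples / batch_size) * num_epochs        # 3125 * 4 = 12500
warmup_steps = math.ceil(train_examples * num_epochs / batch_size * 0.1) # 1250 = 10% of 12500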
-------------------------------------------------------------------------------- /datasources/scrapers/CDC_General_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider CDC_General_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "CDC_Scraper" 11 | start_urls = ["https://www.cdc.gov/coronavirus/2019-ncov/faq.html"] 12 | 13 | def parse(self, response): 14 | columns = { 15 | "question": [], 16 | "answer": [], 17 | "answer_html": [], 18 | "link": [], 19 | "name": [], 20 | "source": [], 21 | "category": [], 22 | "country": [], 23 | "region": [], 24 | "city": [], 25 | "lang": [], 26 | "last_update": [], 27 | } 28 | 29 | current_category = "" 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # in category 34 | if node.attrib.get("class") == "onThisPageAnchor": 35 | current_category = node.attrib["title"] 36 | continue 37 | 38 | # within the current category 39 | if current_category: 40 | # in question 41 | if node.attrib.get("role") == "heading": 42 | current_question = node.css("::text").get() 43 | 44 | # in answer 45 | if node.attrib.get("class") == "card-body": 46 | current_answer = node.css(" ::text").getall() 47 | current_answer = " ".join(current_answer).strip() 48 | current_answer_html = node.getall() 49 | current_answer_html = " ".join(current_answer_html).strip() 50 | 51 | # add question-answer-pair to data dictionary 52 | columns["question"].append(current_question) 53 | columns["answer"].append(current_answer) 54 | columns["answer_html"].append(current_answer_html) 55 | columns["category"].append(current_category) 56 | 57 | # end of category 58 | if node.attrib.get("class") == "row": 59 | current_category = "" 60 | 61 | today = date.today() 62 | 63 | columns["link"] = ["https://www.cdc.gov/coronavirus/2019-ncov/faq.html"] * len(columns["question"]) 64 | columns["name"] = ["CDC General FAQ"] * len(columns["question"]) 65 | columns["source"] = ["Center for Disease Control and Prevention (CDC)"] * len(columns["question"]) 66 | columns["country"] = ["USA"] * len(columns["question"]) 67 | columns["region"] = [""] * len(columns["question"]) 68 | columns["city"] = [""] * len(columns["question"]) 69 | columns["lang"] = ["en"] * len(columns["question"]) 70 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 71 | 72 | return columns 73 | 74 | 75 | 76 | if __name__ == "__main__": 77 | process = CrawlerProcess({ 78 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 79 | }) 80 | process.crawl(CovidScraper) 81 | process.start() 82 | -------------------------------------------------------------------------------- /datasources/scrapers/IHK_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import scrapy 3 | import pandas as pd 4 | 5 | class CovidScraper(scrapy.Spider): 6 | name = "IHK_Scraper" 7 | start_urls = ["https://www.dihk.de/de/aktuelles-und-presse/coronavirus/faq-19594"] 8 | 9 | def parse(self, response): 10 | columns = { 11 | "question" : [], 12 | "answer" : [], 13 | "answer_html" : [], 14 | "link" : [], 15 | "name" : [], 16 | "source" : [], 17 | "category" : [], 18 | "country" : [], 19 | "region" : [], 20 | "city" : [], 21 | "lang" : [], 22 | "last_update" : [], 23 | } 24 | 25 | current_category = "" 26 | current_question = "" 27 | current_answer = "" 28 | current_answer_html = "" 29 | question_answer_pair = False 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # save previous question-answer pair 34 | if question_answer_pair: 35 | columns["question"].append(current_question) 36 | columns["answer"].append(current_answer) 37 | columns["answer_html"].append(current_answer_html) 38 | columns["category"].append(current_category) 39 | question_answer_pair = False 40 | 41 | # in category 42 | if node.attrib.get("class") == "accordion__headline": 43 | current_category = node.css("::text").get() 44 | continue 45 | 46 | if current_category: 47 | # in question 48 | if node.attrib.get("class") == "accordion__btn-inner": 49 | current_question = node.css("::text").get() 50 | continue 51 | 52 | # in answer 53 | if current_question and (node.attrib.get("class") == "rte__content"): 54 | current_answer = node.css(" ::text").getall() 55 | current_answer = " ".join(current_answer).strip() 56 | current_answer_html = node.getall() 57 | current_answer_html = " ".join(current_answer_html).strip() 58 | question_answer_pair = True 59 | continue 60 | 61 | # end of FAQ 62 | if node.attrib.get("class") == "u-area is-area-cols-2 is-auto-height is-low-margin is-mobile-full": 63 | break 64 | 65 | today = date.today() 66 | 67 | columns["link"] = ["https://www.dihk.de/de/aktuelles-und-presse/coronavirus/faq-19594"] * len(columns["question"]) 68 | columns["name"] = ["FAQ zum Coronavirus"] * len(columns["question"])  # assumed page title for the DIHK FAQ scraped above 69 | columns["source"] = ["Deutscher Industrie- und Handelskammertag (DIHK)"] * len(columns["question"]) 70 | columns["country"] = ["DE"] * len(columns["question"]) 71 | columns["region"] = [""] * len(columns["question"]) 72 | columns["city"] = [""] * len(columns["question"]) 73 | columns["lang"] = ["de"] * len(columns["question"]) 74 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 75 | 76 | return columns 77 |
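Every scraper in this repo fills its metadata columns with value * len(columns["question"]), so the dict must stay rectangular: one missed append and the exported rows shift. A small self-check (a sketch, not part of the repo) catches that early:

def assert_rectangular(columns: dict) -> None:
    # every value is a list; all lists must end up the same length
    lengths = {key: len(value) for key, value in columns.items()}
    if len(set(lengths.values())) != 1:
        raise ValueError(f"ragged columns dict: {lengths}")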
-------------------------------------------------------------------------------- /datasources/scrapers/ECDC_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider ECDC_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = "ECDCS_scraper" 11 | start_urls = ["https://www.ecdc.europa.eu/en/novel-coronavirus-china/questions-answers"] 12 | def parse(self, response): 13 | columns = { 14 | "question": [], 15 | "answer": [], 16 | "answer_html": [], 17 | "link": [], 18 | "name": [], 19 | "source": [], 20 | "category": [], 21 | "country": [], 22 | "region": [], 23 | "city": [], 24 | "lang": [], 25 | "last_update": [], 26 | } 27 | 28 | 29 | # Scraper idea: find the question headings; every paragraph until the next heading belongs to that question's answer 30 | 31 | QUESTION_ELEMENT_SELECTOR = ".ct--view-30 .text-image h3" 32 | QUESTION_SELECTOR = "::text" 33 | 34 | questions = response.css(QUESTION_ELEMENT_SELECTOR) 35 | for question_elm in questions: 36 | question = question_elm.css(QUESTION_SELECTOR).getall() 37 | question = " ".join(question).replace('\xa0', ' ') 38 | # strip the leading enumeration such as '1.' (single-digit only; see the regex sketch below) 39 | question = question[2:] 40 | question = question.strip() 41 | 42 | # all paragraphs until the next question header are considered to be the answer 43 | following_siblings = question_elm.xpath('following-sibling::*') 44 | answer = [] 45 | answer_html = [] 46 | for elm in following_siblings: 47 | if elm.root.tag == 'h3': break  # stop at the next question heading instead of skipping it 48 | answer += elm.css("::text").getall() 49 | answer_html += [elm.get()] 50 | answer = "".join(answer).replace('\xa0', ' ').strip() 51 | answer_html = " ".join(answer_html).strip() 52 | 53 | # add question-answer pair to data dictionary 54 | columns["question"].append(question) 55 | columns["answer"].append(answer) 56 | columns["answer_html"].append(answer_html) 57 | 58 | today = date.today() 59 | 60 | columns["link"] = ["https://www.ecdc.europa.eu/en/novel-coronavirus-china/questions-answers"] * len( 61 | columns["question"]) 62 | columns["name"] = ["Q & A on COVID-19"] * len(columns["question"]) 63 | columns["source"] = ["European Centre for Disease Prevention and Control"] * len(columns["question"]) 64 | columns["category"] = [""] * len(columns["question"]) 65 | columns["country"] = [""] * len(columns["question"]) 66 | columns["region"] = [""] * len(columns["question"]) 67 | columns["city"] = [""] * len(columns["question"]) 68 | columns["lang"] = ["en"] * len(columns["question"]) 69 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 70 | 71 | 72 | return columns 73 | 74 | 75 | if __name__ == "__main__": 76 | process = CrawlerProcess({ 77 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 78 | }) 79 | 80 | process.crawl(CovidScraper) 81 | process.start() 82 |
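question[2:] above assumes a single-digit prefix such as '1.'; for '10.' it would leave a stray dot behind. A regex-based variant (a sketch) handles any width:

import re

def strip_enumeration(question: str) -> str:
    # drop a leading "1." / "10."-style counter and surrounding whitespace
    return re.sub(r"^\s*\d+\.\s*", "", question)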
-------------------------------------------------------------------------------- /telegram-bot/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/kotlin,intellij,gradle 3 | # Edit at https://www.gitignore.io/?templates=kotlin,intellij,gradle 4 | 5 | ### Intellij ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # Generated files 17 | .idea/**/contentModel.xml 18 | 19 | # Sensitive or high-churn files 20 | .idea/**/dataSources/ 21 | .idea/**/dataSources.ids 22 | .idea/**/dataSources.local.xml 23 | .idea/**/sqlDataSources.xml 24 | .idea/**/dynamic.xml 25 | .idea/**/uiDesigner.xml 26 | .idea/**/dbnavigator.xml 27 | 28 | # Gradle 29 | .idea/**/gradle.xml 30 | .idea/**/libraries 31 | 32 | # Gradle and Maven with auto-import 33 | # When using Gradle or Maven with auto-import, you should exclude module files, 34 | # since they will be recreated, and may cause churn. Uncomment if using 35 | # auto-import. 36 | # .idea/modules.xml 37 | # .idea/*.iml 38 | # .idea/modules 39 | # *.iml 40 | # *.ipr 41 | 42 | # CMake 43 | cmake-build-*/ 44 | 45 | # Mongo Explorer plugin 46 | .idea/**/mongoSettings.xml 47 | 48 | # File-based project format 49 | *.iws 50 | 51 | # IntelliJ 52 | out/ 53 | 54 | # mpeltonen/sbt-idea plugin 55 | .idea_modules/ 56 | 57 | # JIRA plugin 58 | atlassian-ide-plugin.xml 59 | 60 | # Cursive Clojure plugin 61 | .idea/replstate.xml 62 | 63 | # Crashlytics plugin (for Android Studio and IntelliJ) 64 | com_crashlytics_export_strings.xml 65 | crashlytics.properties 66 | crashlytics-build.properties 67 | fabric.properties 68 | 69 | # Editor-based Rest Client 70 | .idea/httpRequests 71 | 72 | # Android studio 3.1+ serialized cache file 73 | .idea/caches/build_file_checksums.ser 74 | 75 | ### Intellij Patch ### 76 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 77 | 78 | # *.iml 79 | # modules.xml 80 | # .idea/misc.xml 81 | # *.ipr 82 | 83 | # Sonarlint plugin 84 | .idea/**/sonarlint/ 85 | 86 | # SonarQube Plugin 87 | .idea/**/sonarIssues.xml 88 | 89 | # Markdown Navigator plugin 90 | .idea/**/markdown-navigator.xml 91 | .idea/**/markdown-navigator/ 92 | 93 | ### Kotlin ### 94 | # Compiled class file 95 | *.class 96 | 97 | # Log file 98 | *.log 99 | 100 | # BlueJ files 101 | *.ctxt 102 | 103 | # Mobile Tools for Java (J2ME) 104 | .mtj.tmp/ 105 | 106 | # Package Files # 107 | *.jar 108 | *.war 109 | *.nar 110 | *.ear 111 | *.zip 112 | *.tar.gz 113 | *.rar 114 | 115 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 116 | hs_err_pid* 117 | 118 | ### Gradle ### 119 | .gradle 120 | build/ 121 | 122 | # Ignore Gradle GUI config 123 | gradle-app.setting 124 | 125 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 126 | !gradle-wrapper.jar 127 | 128 | # Cache of project 129 | .gradletasknamecache 130 | 131 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 132 | # gradle/wrapper/gradle-wrapper.properties 133 | 134 | ### Gradle Patch ### 135 | **/build/ 136 | 137 | src/main/kotlin/com/theapache64/cs/core/SecretConstants.kt 138 | 139 | # End of https://www.gitignore.io/api/kotlin,intellij,gradle 140 | -------------------------------------------------------------------------------- /datasources/scrapers/FHM_EN_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider FHM_EN_scraper.py' to scrape data 2 | 3 | # Adds data in English from Folkhälsomyndigheten 4 | 5 | import scrapy 6 | from datetime import date 7 | from scrapy.crawler import CrawlerProcess 8 | 9 | 10 | class CovidScraper(scrapy.Spider): 11 | name = 'fhm_en_spyder' 12 | start_urls = ['https://www.folkhalsomyndigheten.se/the-public-health-agency-of-sweden/communicable-disease-control/covid-19/'] 13 | 14 | questionsOnly = True 15 | 16 | def transformContent(self, contentNode):  # currently unused helper 17 | responseParts = [] 18 | for responsePart in contentNode.xpath('.//text()').getall(): 19 | strippedPart = responsePart.strip() 20 | if len(strippedPart) > 0: 21 | responseParts.append(strippedPart) 22 | return ' '.join(responseParts) 23 | 24 | def parse(self, response): 25 | columns = { 26 | "question": [], 27 | "answer": [], 28 | "answer_html": [], 29 |
"link": [], 30 | "name": [], 31 | "source": [], 32 | "category": [], 33 | "country": [], 34 | "region": [], 35 | "city": [], 36 | "lang": [], 37 | "last_update": [], 38 | } 39 | 40 | 41 | categoryPaths = response.xpath('//div[@class="container"]') 42 | 43 | for catPath in categoryPaths: 44 | 45 | categoryName = catPath.xpath('./h2/text()').getall() 46 | #print(categoryName) 47 | if len(categoryName) == 0: 48 | continue 49 | 50 | 51 | qnaPaths = catPath.xpath('.//*[@class="accordion__item toggle"]') 52 | for qnaPath in qnaPaths: 53 | 54 | 55 | question = qnaPath.xpath('./strong/a/span/text()').getall() 56 | 57 | 58 | responseParagraphPaths = qnaPath.xpath('.//div[@class="textbody"]') 59 | 60 | 61 | response = "" 62 | for respParaPath in responseParagraphPaths: 63 | response += " ".join(respParaPath.xpath('.//text()').getall()) + "\n\n" 64 | 65 | response = response.strip() 66 | 67 | columns["question"].append(question[0]) 68 | columns["category"].append(categoryName[0]) 69 | columns["answer"].append(response) 70 | columns["answer_html"].append(" ".join(responseParagraphPaths.getall())) 71 | today = date.today() 72 | 73 | 74 | columns["link"] = ["https://www.folkhalsomyndigheten.se/the-public-health-agency-of-sweden/communicable-disease-control/covid-19/"] * len(columns["question"]) 75 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 76 | columns["source"] = ["FHM, Folkhälsomyndigheten"] * len(columns["question"]) 77 | columns["country"] = ["Sweden"] * len(columns["question"]) 78 | columns["region"] = [""] * len(columns["question"]) 79 | columns["city"] = [""] * len(columns["question"]) 80 | columns["lang"] = ["en"] * len(columns["question"]) 81 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 82 | 83 | return columns 84 | 85 | 86 | if __name__ == "__main__": 87 | process = CrawlerProcess({ 88 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 89 | }) 90 | 91 | process.crawl(CovidScraper) 92 | process.start() 93 | -------------------------------------------------------------------------------- /datasources/scrapers/WHO_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider WHO_scraper.py' to scrape data 2 | 3 | from datetime import date 4 | 5 | import scrapy 6 | 7 | 8 | class CovidScraper(scrapy.Spider): 9 | name = "WHO_scraper" 10 | start_urls = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses", 11 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-and-pregnancy-and-childbirth", 12 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-and-breastfeeding", 13 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-and-masks", 14 | "https://www.who.int/news-room/q-a-detail/q-a-on-covid-19-hiv-and-antiretrovirals", 15 | "https://www.who.int/news-room/q-a-detail/q-a-on-mass-gatherings-and-covid-19", 16 | "https://www.who.int/news-room/q-a-detail/q-a-on-infection-prevention-and-control-for-health-care-workers-caring-for-patients-with-suspected-or-confirmed-2019-ncov", 17 | "https://www.who.int/news-room/q-a-detail/be-active-during-covid-19", 18 | "https://www.who.int/news-room/q-a-detail/malaria-and-the-covid-19-pandemic", 19 | "https://www.who.int/news-room/q-a-detail/violence-against-women-during-covid-19", 20 | "https://www.who.int/news-room/q-a-detail/contraception-family-planning-and-covid-19"] 21 | 22 | def parse(self, response): 23 | columns = { 24 | "question": [], 25 | "answer": [], 26 | "answer_html": [], 
27 | "link": [], 28 | "name": [], 29 | "source": [], 30 | "category": [], 31 | "country": [], 32 | "region": [], 33 | "city": [], 34 | "lang": [], 35 | "last_update": [], 36 | } 37 | 38 | QUESTION_ANSWER_SELECTOR = ".sf-accordion__panel" 39 | QUESTION_SELECTOR = ".sf-accordion__link::text" 40 | ANSWER_SELECTOR = ".sf-accordion__content ::text" 41 | ANSWER_HTML_SELECTOR = ".sf-accordion__content" 42 | 43 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 44 | for question_answer in questions_answers: 45 | question = question_answer.css(QUESTION_SELECTOR).getall() 46 | question = " ".join(question).strip() 47 | answer = question_answer.css(ANSWER_SELECTOR).getall() 48 | answer = " ".join(answer).strip() 49 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 50 | answer_html = " ".join(answer_html).strip() 51 | 52 | # add question-answer pair to data dictionary 53 | columns["question"].append(question) 54 | columns["answer"].append(answer) 55 | columns["answer_html"].append(answer_html) 56 | 57 | today = date.today() 58 | 59 | columns["link"] = [response.url] * len(columns["question"]) 60 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 61 | columns["source"] = ["World Health Organization (WHO)"] * len(columns["question"]) 62 | columns["category"] = [""] * len(columns["question"]) 63 | columns["country"] = [""] * len(columns["question"]) 64 | columns["region"] = [""] * len(columns["question"]) 65 | columns["city"] = [""] * len(columns["question"]) 66 | columns["lang"] = ["en"] * len(columns["question"]) 67 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 68 | 69 | return columns 70 | -------------------------------------------------------------------------------- /datasources/scrapers/BMG_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import scrapy 4 | 5 | 6 | class CovidScraper(scrapy.Spider): 7 | name = "BMG_scraper" 8 | start_urls = ["https://www.zusammengegencorona.de/informieren/basiswissen-coronavirus/", 9 | "https://www.zusammengegencorona.de/informieren/basiswissen-uebertragung/", 10 | "https://www.zusammengegencorona.de/informieren/informationen-zum-test/", 11 | "https://www.zusammengegencorona.de/informieren/symptome-erkennen/", 12 | "https://www.zusammengegencorona.de/informieren/praevention/", 13 | "https://www.zusammengegencorona.de/informieren/informationen-alltag/", 14 | "https://www.zusammengegencorona.de/informieren/informationen-aeltere-menschen/", 15 | "https://www.zusammengegencorona.de/informieren/medizinisches-personal/", 16 | "https://www.zusammengegencorona.de/informieren/arbeitsschutz/", 17 | "https://www.zusammengegencorona.de/informieren/wirtschaftliche-folgen/", 18 | # not real answers, only links... 
19 | "https://www.zusammengegencorona.de/informieren/weitere-informationen/", 20 | # very specific questions and answers as well as other links 21 | "https://www.zusammengegencorona.de/informieren/zuhause-bleiben/"] 22 | 23 | def parse(self, response): 24 | columns = { 25 | "question": [], 26 | "answer": [], 27 | "answer_html": [], 28 | "link": [], 29 | "name": [], 30 | "source": [], 31 | "category": [], 32 | "country": [], 33 | "region": [], 34 | "city": [], 35 | "lang": [], 36 | "last_update": [], 37 | } 38 | 39 | QUESTION_ANSWER_SELECTOR = ".accordion__item" 40 | QUESTION_SELECTOR = ".accordion__heading ::text" 41 | ANSWER_SELECTOR = ".panel-inner ::text" 42 | ANSWER_HTML_SELECTOR = ".panel-inner" 43 | 44 | questions_answers = response.css(QUESTION_ANSWER_SELECTOR) 45 | for question_answer in questions_answers: 46 | question = question_answer.css(QUESTION_SELECTOR).getall() 47 | question = " ".join(question).strip() 48 | answer = question_answer.css(ANSWER_SELECTOR).getall() 49 | answer = " ".join(answer).strip() 50 | answer_html = question_answer.css(ANSWER_HTML_SELECTOR).getall() 51 | answer_html = " ".join(answer_html).strip() 52 | 53 | # add question-answer pair to data dictionary 54 | columns["question"].append(question) 55 | columns["answer"].append(answer) 56 | columns["answer_html"].append(answer_html) 57 | columns["link"].append(response.url) 58 | 59 | today = date.today() 60 | 61 | columns["name"] = ["Ihre Fragen - unsere Antworten zum neuartigen Coronavirus / COVID-19"] * len( 62 | columns["question"]) 63 | columns["source"] = ["Bundesministerium für Gesundheit (BMG)"] * len(columns["question"]) 64 | columns["category"] = [""] * len(columns["question"]) 65 | columns["country"] = ["DE"] * len(columns["question"]) 66 | columns["region"] = [""] * len(columns["question"]) 67 | columns["city"] = [""] * len(columns["question"]) 68 | columns["lang"] = ["de"] * len(columns["question"]) 69 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 70 | 71 | return columns 72 | -------------------------------------------------------------------------------- /datasources/scrapers/FHM_SV_scraper.py: -------------------------------------------------------------------------------- 1 | # run 'scrapy runspider FHM_SV_scraper.py' to scrape data 2 | 3 | #Add data in Swedish from Folkhälsomyndigheten 4 | 5 | import scrapy 6 | from scrapy.crawler import CrawlerProcess 7 | 8 | 9 | class CovidScraper(scrapy.Spider): 10 | name = 'fhm_sv_spyder' 11 | start_urls = ['https://www.folkhalsomyndigheten.se/smittskydd-beredskap/utbrott/aktuella-utbrott/covid-19/fragor-och-svar/'] 12 | 13 | questionsOnly = True 14 | 15 | def transformContent(self, contentNode): 16 | responseParts = [] 17 | for responsePart in contentNode.xpath('.//text()').getall(): 18 | strippedPart = responsePart.strip() 19 | if len(strippedPart) > 0: 20 | responseParts.append(strippedPart) 21 | return ' '.join(responseParts) 22 | 23 | def parse(self, response): 24 | columns = { 25 | "question": [], 26 | "answer": [], 27 | "answer_html": [], 28 | "link": [], 29 | "name": [], 30 | "source": [], 31 | "category": [], 32 | "country": [], 33 | "region": [], 34 | "city": [], 35 | "lang": [], 36 | "last_update": [], 37 | } 38 | 39 | 40 | categoryPaths = response.xpath('//div[@class="faq-container"]') 41 | 42 | for catPath in categoryPaths: 43 | 44 | categoryName = catPath.xpath('./h2/span/text()').getall() 45 | #print(categoryName) 46 | if len(categoryName) == 0: 47 | continue 48 | 49 | 50 | qnaPaths = 
catPath.xpath('.//*[@class="accordion__item toggle"]') 51 | for qnaPath in qnaPaths: 52 | 53 | 54 | question = qnaPath.xpath('./strong/a/span/span/text()').getall() 55 | 56 | 57 | responseParagraphPaths = qnaPath.xpath('.//div[@class="textbody"]') 58 | 59 | 60 | response = "" 61 | for respParaPath in responseParagraphPaths: 62 | response += " ".join(respParaPath.xpath('.//text()').getall()) + "\n\n" 63 | 64 | # Clean up the text: each answer ends with a link and an 'Uppdaterad:' date line 65 | response = response.strip() 66 | splitted = response.split("\n") 67 | dater = splitted[-2].strip().replace("Uppdaterad: ", "").replace("-", "/").split(" ")[0] 68 | response = "\n".join(splitted[:-2])  # drop the trailing date and link lines 69 | 70 | columns["question"].append(question[0]) 71 | columns["category"].append(categoryName[0]) 72 | columns["answer"].append(response) 73 | columns["last_update"].append(dater) 74 | columns["answer_html"].append(" ".join(responseParagraphPaths.getall())) 75 | 76 | columns["link"] = ["https://www.folkhalsomyndigheten.se/smittskydd-beredskap/utbrott/aktuella-utbrott/covid-19/fragor-och-svar/"] * len(columns["question"]) 77 | columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"]) 78 | columns["source"] = ["FHM, Folkhälsomyndigheten"] * len(columns["question"]) 79 | columns["country"] = ["Sweden"] * len(columns["question"]) 80 | columns["region"] = [""] * len(columns["question"]) 81 | columns["city"] = [""] * len(columns["question"]) 82 | columns["lang"] = ["sv"] * len(columns["question"]) 83 | 84 | return columns 85 | 86 | 87 | if __name__ == "__main__": 88 | process = CrawlerProcess({ 89 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 90 | }) 91 | 92 | process.crawl(CovidScraper) 93 | process.start() 94 | -------------------------------------------------------------------------------- /datasources/scrapers/BerlinerSenat_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import scrapy 3 | import pandas as pd 4 | 5 | class CovidScraper(scrapy.Spider): 6 | name = "Berliner_Senat_Scraper" 7 | start_urls = ["https://www.berlin.de/corona/faq/"] 8 | 9 | def parse(self, response): 10 | columns = { 11 | "question" : [], 12 | "answer" : [], 13 | "answer_html" : [], 14 | "link" : [], 15 | "name" : [], 16 | "source" : [], 17 | "category" : [], 18 | "country" : [], 19 | "region" : [], 20 | "city" : [], 21 | "lang" : [], 22 | "last_update" : [], 23 | } 24 | 25 | current_category = "" 26 | current_question = "" 27 | current_answer = "" 28 | current_answer_html = "" 29 | question_answer_pair = False 30 | 31 | all_nodes = response.xpath("//*") 32 | for node in all_nodes: 33 | # in category 34 | if (node.xpath("name()").get() == "h2") and (node.attrib.get("class") == "title"): 35 | current_category = node.css("::text").get() 36 | continue 37 | 38 | if current_category: 39 | # in question-answer pair 40 | if node.attrib.get("class") == "html5-section block module-faq land-toggler": 41 | # save previous question-answer pair 42 | if current_question: 43 | columns["question"].append(current_question) 44 | columns["answer"].append(current_answer) 45 | columns["answer_html"].append(current_answer_html) 46 | columns["category"].append(current_category) 47 | 48 | question_answer_pair = True 49 | continue 50 | 51 | # in question 52 | if question_answer_pair and (node.attrib.get("class") == "land-toggler-button collapsed"): 53 | current_question = node.css("::text").get() 54 | continue 55 | 56 | # in answer 57 | if
question_answer_pair and (node.attrib.get("class") == "textile"): 58 | current_answer = node.css(" ::text").getall() 59 | current_answer = " ".join(current_answer).strip() 60 | current_answer_html = node.getall() 61 | current_answer_html = " ".join(current_answer_html).strip() 62 | continue 63 | 64 | # end of FAQ 65 | if node.attrib.get("class") == "html5-section block modul-text_bild": 66 | break 67 | 68 | columns["question"].append(current_question) 69 | columns["answer"].append(current_answer) 70 | columns["answer_html"].append(current_answer_html) 71 | columns["category"].append(current_category) 72 | 73 | today = date.today() 74 | 75 | columns["link"] = ["https://www.berlin.de/corona/faq/"] * len(columns["question"]) 76 | columns["name"] = ["Corona-Prävention in Berlin – Fragen und Antworten"] * len(columns["question"]) 77 | columns["source"] = ["Berliner Senat"] * len(columns["question"]) 78 | columns["country"] = ["DE"] * len(columns["question"]) 79 | columns["region"] = ["Berlin"] * len(columns["question"]) 80 | columns["city"] = ["Berlin"] * len(columns["question"]) 81 | columns["lang"] = ["de"] * len(columns["question"]) 82 | columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"]) 83 | 84 | return columns 85 | 86 | 87 | 88 | 89 | --------------------------------------------------------------------------------
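Usage note: each scraper's parse() returns one dict of parallel lists, which Scrapy treats as a single item, so 'scrapy runspider <scraper>.py -o out.json' already exports the data. A minimal driver that collects the item in-process and writes a CSV instead — a sketch, not part of the repo (the import path is an assumption; adjust it to wherever the scraper file lives):

import pandas as pd
from scrapy import signals
from scrapy.crawler import CrawlerProcess

from datasources.scrapers.WHO_scraper import CovidScraper  # assumed module path

results = []

def collect(item, response, spider):
    # fires once per returned columns dict
    results.append(item)

process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
crawler = process.create_crawler(CovidScraper)
crawler.signals.connect(collect, signal=signals.item_scraped)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes

df = pd.concat([pd.DataFrame(columns) for columns in results], ignore_index=True)
df.to_csv("faq.csv", index=False)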